diff --git a/sft_pretrain/Full_smoe_sigmoidgating/added_tokens.json b/sft_pretrain/Full_smoe_sigmoidgating/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/added_tokens.json b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/config.json b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/config.json new file mode 100644 index 0000000000000000000000000000000000000000..4477091c8e5e4d06ea14a8a918edb0ae2310c298 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_sigmoidgating", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/generation_config.json b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/global_step1040/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/global_step1040/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b2a190e72beeda78697edb9261ebebbd9eb2ada8 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/global_step1040/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffbac22ab0b60e8b76c0f9651f88baa86b1867382c42568d7c2c64eafe84c261 +size 396582032 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/global_step1040/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/global_step1040/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aca01e470204cdcd3167f1d3de47b5396a2daa87 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/global_step1040/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a160d964bfa7d3b876c773fceb1027e5f943d5ae880b002509c5384a10ab408 +size 396582032 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/global_step1040/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/global_step1040/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5c6a09eee681ee73f2c414b29acd6a4e23469592 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/global_step1040/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:269d6dc672e95da123ab0fbbf51b1c9378ae0430bea34a4de779267c54d37b7c +size 396582032 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/global_step1040/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/global_step1040/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4e12a7820469600060c29ee643922976411d175d --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/global_step1040/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:384bf9205b35756cd6f0ba0b0e79514d7865dda09ae8dd1fd6f766c7aae71b28 +size 396582032 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/global_step1040/zero_pp_rank_0_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/global_step1040/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe28dcf40366c9ab1124f27bde7c96f766f603ee --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/global_step1040/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5073f55924055feb668c324b125b457d7a5471b6608314d06f900b8d6e5386c +size 2117321544 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/global_step1040/zero_pp_rank_1_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/global_step1040/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a228592777b91f0c1d5982c490ff91e08bd5246e --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/global_step1040/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c07d25735bf278595131c20046ee6be8ed90e25611386b6e4eaac2a852a84558 +size 2117321544 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/global_step1040/zero_pp_rank_2_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/global_step1040/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..88ab8e1c3adbc6a2ba70b638ee62645d30d1a36c --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/global_step1040/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f43f8595a502c8bbbcc5fff6dc9bfdca19606057353b6e46403a7dd42ee5ec4 +size 2117321544 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/global_step1040/zero_pp_rank_3_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/global_step1040/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c20a810e78b87f1d8692165e939fa86fc243b65b --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/global_step1040/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cf1e2d3e681530dc87e91294be511ca175dcd28865eec16910bfa55e8332ab1 +size 2117321544 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/latest b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/latest new file mode 100644 index 0000000000000000000000000000000000000000..f37da78e3c7eee26ebe5f06b54d6621716edb6b9 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/latest @@ -0,0 +1 @@ +global_step1040 \ No newline at end of file diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/model-00001-of-00002.safetensors b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/model-00002-of-00002.safetensors b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..04c3e5d232da406d9cfd18a66c6c21962afd7464 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ee8dbec27bb56cb7988f25c16252d05a29682c9b7e036213d744cf6aa7b5b79 +size 3759025152 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/model.safetensors.index.json b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..2b3448fcaafe26e098595b9e2e5bd9e68d63ee24 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/model.safetensors.index.json @@ -0,0 +1,672 @@ +{ + "metadata": { + "total_size": 8731424736 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/rng_state_0.pth b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef4849062bcdc8ffd2246c07673ba196a8d61a6d --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fae2114fffe9b1eea30e28bbdb4ce59046b0079ea5b8dc4682079f609d49d787 +size 14960 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/rng_state_1.pth b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fcb2b640bc236c26aa841680d34a91240247970 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ff5f3a53530ac868291e2667c8f824bfa1f4fa1ce880df8223a7165ef38e11 +size 14960 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/rng_state_2.pth b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..00c3f989de00e6d58ca7345ae6f65fee0afcbdcd --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f80a7779b0034e70106ba6cb0e3e686052334c20ce54453ee3977cc0219d15 +size 14960 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/rng_state_3.pth b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f289913854ee3fa52a86e282421da07d85b8a4c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece3bc0d0e16c43ef245cc787cbd0d63d08d460f489c4cd52adf6501b9281a18 +size 14960 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/special_tokens_map.json b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/tokenizer.model b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/tokenizer_config.json b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/trainer_state.json b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f095fe4f7ac15020d853fb54485fde2a9c506fe9 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/trainer_state.json @@ -0,0 +1,15633 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2000769526741054, + "eval_steps": 500, + "global_step": 1040, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03935784, + "balance_loss_mlp": 2.84935808, + "epoch": 0.00019238168526356292, + "flos": 470575609344.0, + "grad_norm": 13.498251331228948, + "language_loss": 2.81572914, + "learning_rate": 0.0, + "loss": 1.90346789, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 10.859375, + "step": 1, + "time_per_iteration": 24.30480647087097 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0351246, + "balance_loss_mlp": 2.65644169, + "epoch": 0.00038476337052712584, + "flos": 504556065792.0, + "grad_norm": 27.482987886380492, + "language_loss": 8.76816368, + "learning_rate": 0.00013726078121135892, + "loss": 8.80328846, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 8.578125, + "step": 2, + "time_per_iteration": 2.6929261684417725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03513305, + "balance_loss_mlp": 2.65728736, + "epoch": 0.0005771450557906887, + "flos": 599161245696.0, + "grad_norm": 28.576563245741852, + "language_loss": 9.00053596, + "learning_rate": 0.00021755319103969496, + "loss": 9.03566933, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 8.578125, + "step": 3, + "time_per_iteration": 2.7945075035095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03667009, + "balance_loss_mlp": 2.78657675, + "epoch": 0.0007695267410542517, + "flos": 580405326336.0, + "grad_norm": 15.694146018083416, + "language_loss": 2.74122858, + "learning_rate": 0.00027452156242271784, + "loss": 2.77789879, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 8.828125, + "step": 4, + "time_per_iteration": 2.7773804664611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03933422, + "balance_loss_mlp": 3.01102829, + "epoch": 0.0009619084263178145, + "flos": 486116204544.0, + "grad_norm": 3.505338851882968, + "language_loss": 1.83478093, + "learning_rate": 0.0003187096642208417, + "loss": 1.87411511, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 9.2109375, + "step": 5, + "time_per_iteration": 2.651094675064087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04005588, + "balance_loss_mlp": 3.05420256, + "epoch": 0.0011542901115813775, + "flos": 560028349440.0, + "grad_norm": 3.050600048840319, + "language_loss": 1.61776543, + "learning_rate": 0.0003548139722510539, + "loss": 1.65782118, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 9.4921875, + "step": 6, + "time_per_iteration": 2.697614908218384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03708502, + "balance_loss_mlp": 2.7708497, + "epoch": 0.0013466717968449403, + "flos": 533966307840.0, + "grad_norm": 0.7974788691124679, + "language_loss": 1.32417345, + "learning_rate": 0.00038533972973918044, + "loss": 1.36125851, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 9.3515625, + "step": 7, + "time_per_iteration": 2.6407949924468994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0332405, + "balance_loss_mlp": 2.38868618, + "epoch": 0.0015390534821085034, + "flos": 492295739904.0, + "grad_norm": 0.7144720842381633, + "language_loss": 1.25956392, + "learning_rate": 0.0004117823436340768, + "loss": 1.29280448, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 9.3359375, + "step": 8, + "time_per_iteration": 2.6287930011749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02785454, + "balance_loss_mlp": 1.8508532, + "epoch": 0.0017314351673720662, + "flos": 564676033536.0, + "grad_norm": 0.3140255221758466, + "language_loss": 1.29993415, + "learning_rate": 0.00043510638207938993, + "loss": 1.32778871, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 9.3203125, + "step": 9, + "time_per_iteration": 2.8048858642578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0244685, + "balance_loss_mlp": 1.50004196, + "epoch": 0.001923816852635629, + "flos": 593406798336.0, + "grad_norm": 0.19799802642524775, + "language_loss": 1.19032216, + "learning_rate": 0.00045597044543220066, + "loss": 1.2147907, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 9.4453125, + "step": 10, + "time_per_iteration": 2.7669434547424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02310187, + "balance_loss_mlp": 1.35117221, + "epoch": 0.002116198537899192, + "flos": 609625046016.0, + "grad_norm": 0.14485632700798082, + "language_loss": 1.18421102, + "learning_rate": 0.00047484428652143135, + "loss": 1.20731282, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 9.5703125, + "step": 11, + "time_per_iteration": 2.9067423343658447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02309394, + "balance_loss_mlp": 1.33740926, + "epoch": 0.002308580223162755, + "flos": 545129409024.0, + "grad_norm": 0.1366980934684776, + "language_loss": 1.24379897, + "learning_rate": 0.0004920747534624128, + "loss": 1.26689291, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 9.703125, + "step": 12, + "time_per_iteration": 2.612813949584961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.022984, + "balance_loss_mlp": 1.32565212, + "epoch": 0.002500961908426318, + "flos": 644750461440.0, + "grad_norm": 0.11957957623458634, + "language_loss": 1.26615512, + "learning_rate": 0.0005079252465375872, + "loss": 1.28913903, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 9.7109375, + "step": 13, + "time_per_iteration": 2.879688262939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02311662, + "balance_loss_mlp": 1.34730673, + "epoch": 0.0026933435936898806, + "flos": 487853259264.0, + "grad_norm": 0.10749127497061137, + "language_loss": 1.14448667, + "learning_rate": 0.0005226005109505393, + "loss": 1.16760325, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 9.625, + "step": 14, + "time_per_iteration": 2.568699836730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02285502, + "balance_loss_mlp": 1.3615818, + "epoch": 0.0028857252789534437, + "flos": 434599644672.0, + "grad_norm": 0.11405493545380829, + "language_loss": 1.20514369, + "learning_rate": 0.0005362628552605367, + "loss": 1.22799873, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 9.21875, + "step": 15, + "time_per_iteration": 2.6814210414886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02243131, + "balance_loss_mlp": 1.36117291, + "epoch": 0.0030781069642170067, + "flos": 596739944448.0, + "grad_norm": 0.10465613456634369, + "language_loss": 1.24307358, + "learning_rate": 0.0005490431248454357, + "loss": 1.26550484, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 8.84375, + "step": 16, + "time_per_iteration": 2.681443929672241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02323403, + "balance_loss_mlp": 1.52994621, + "epoch": 0.0032704886494805694, + "flos": 1538188102656.0, + "grad_norm": 0.2929644268686402, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.78028512, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 7.90625, + "step": 17, + "time_per_iteration": 6.376815319061279 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02154669, + "balance_loss_mlp": 1.37418151, + "epoch": 0.0034628703347441324, + "flos": 473969677824.0, + "grad_norm": 0.15081794947454089, + "language_loss": 1.11159086, + "learning_rate": 0.0005723671632907488, + "loss": 1.13313746, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 7.80078125, + "step": 18, + "time_per_iteration": 2.721731424331665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02067628, + "balance_loss_mlp": 1.35466075, + "epoch": 0.0036552520200076955, + "flos": 448537554432.0, + "grad_norm": 0.11430094844987627, + "language_loss": 1.15730095, + "learning_rate": 0.0005830738490244919, + "loss": 1.1779772, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 7.12890625, + "step": 19, + "time_per_iteration": 2.691012382507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01966178, + "balance_loss_mlp": 1.31958628, + "epoch": 0.003847633705271258, + "flos": 636174217728.0, + "grad_norm": 0.10166759343386816, + "language_loss": 1.17760253, + "learning_rate": 0.0005932312266435596, + "loss": 1.19726431, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 6.46484375, + "step": 20, + "time_per_iteration": 2.779218912124634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01836812, + "balance_loss_mlp": 1.26727819, + "epoch": 0.004040015390534821, + "flos": 589495491072.0, + "grad_norm": 0.12846528828878043, + "language_loss": 1.12106359, + "learning_rate": 0.0006028929207788754, + "loss": 1.13943172, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 5.70703125, + "step": 21, + "time_per_iteration": 2.716970443725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01720951, + "balance_loss_mlp": 1.21970022, + "epoch": 0.004232397075798384, + "flos": 756574940160.0, + "grad_norm": 0.09445288880840001, + "language_loss": 1.16516471, + "learning_rate": 0.0006121050677327902, + "loss": 1.18237424, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 5.0078125, + "step": 22, + "time_per_iteration": 2.92696475982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01630624, + "balance_loss_mlp": 1.19193399, + "epoch": 0.004424778761061947, + "flos": 526692119040.0, + "grad_norm": 0.11621712848760359, + "language_loss": 1.06380248, + "learning_rate": 0.0006209076479463684, + "loss": 1.08010876, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 4.3984375, + "step": 23, + "time_per_iteration": 2.666133165359497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01572853, + "balance_loss_mlp": 1.18394423, + "epoch": 0.00461716044632551, + "flos": 548168518656.0, + "grad_norm": 0.10970997088624258, + "language_loss": 1.16519284, + "learning_rate": 0.0006293355346737718, + "loss": 1.18092132, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 3.88476562, + "step": 24, + "time_per_iteration": 2.727487802505493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0152954, + "balance_loss_mlp": 1.18755198, + "epoch": 0.004809542131589073, + "flos": 567584091648.0, + "grad_norm": 0.09735665571869598, + "language_loss": 1.12784922, + "learning_rate": 0.0006374193284416834, + "loss": 1.14314473, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 3.42382812, + "step": 25, + "time_per_iteration": 2.7919249534606934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0148827, + "balance_loss_mlp": 1.19282198, + "epoch": 0.005001923816852636, + "flos": 470629191168.0, + "grad_norm": 0.09233879954989622, + "language_loss": 1.11062908, + "learning_rate": 0.0006451860277489461, + "loss": 1.12551177, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 2.953125, + "step": 26, + "time_per_iteration": 2.581066131591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462989, + "balance_loss_mlp": 1.20988345, + "epoch": 0.005194305502116198, + "flos": 415502701056.0, + "grad_norm": 0.12330238493557526, + "language_loss": 1.19441557, + "learning_rate": 0.0006526595731190848, + "loss": 1.20904553, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 2.52929688, + "step": 27, + "time_per_iteration": 2.49725604057312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01423898, + "balance_loss_mlp": 1.20874906, + "epoch": 0.005386687187379761, + "flos": 628771548672.0, + "grad_norm": 0.09841719698503415, + "language_loss": 1.12322927, + "learning_rate": 0.0006598612921618983, + "loss": 1.13746822, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 2.15625, + "step": 28, + "time_per_iteration": 2.822068929672241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01399446, + "balance_loss_mlp": 1.21443295, + "epoch": 0.005579068872643324, + "flos": 886483201536.0, + "grad_norm": 0.2589331093265968, + "language_loss": 1.06232262, + "learning_rate": 0.0006668102665011454, + "loss": 1.07631707, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 1.84765625, + "step": 29, + "time_per_iteration": 3.2402820587158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01444994, + "balance_loss_mlp": 1.28353739, + "epoch": 0.005771450557906887, + "flos": 547560622080.0, + "grad_norm": 0.1317361033328709, + "language_loss": 1.14859319, + "learning_rate": 0.0006735236364718957, + "loss": 1.16304302, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 1.61425781, + "step": 30, + "time_per_iteration": 2.6861231327056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01333301, + "balance_loss_mlp": 1.20445967, + "epoch": 0.00596383224317045, + "flos": 532026620928.0, + "grad_norm": 0.07039345614882069, + "language_loss": 1.13512135, + "learning_rate": 0.0006800168558381346, + "loss": 1.14845431, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 1.28808594, + "step": 31, + "time_per_iteration": 2.6444640159606934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01254242, + "balance_loss_mlp": 1.153772, + "epoch": 0.0061562139284340135, + "flos": 589082886144.0, + "grad_norm": 0.07602265872136475, + "language_loss": 1.1720531, + "learning_rate": 0.0006863039060567947, + "loss": 1.18459558, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 1.00439453, + "step": 32, + "time_per_iteration": 2.7225399017333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117915, + "balance_loss_mlp": 1.10071015, + "epoch": 0.006348595613697576, + "flos": 618231025152.0, + "grad_norm": 0.062098451262649575, + "language_loss": 1.09530759, + "learning_rate": 0.0006923974775611263, + "loss": 1.10709918, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 0.78417969, + "step": 33, + "time_per_iteration": 2.795565366744995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155392, + "balance_loss_mlp": 1.09416604, + "epoch": 0.006540977298961139, + "flos": 777910376448.0, + "grad_norm": 0.0750568617782567, + "language_loss": 1.06307364, + "learning_rate": 0.0006983091239737814, + "loss": 1.0746274, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 0.61132812, + "step": 34, + "time_per_iteration": 3.0703423023223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138238, + "balance_loss_mlp": 1.0903163, + "epoch": 0.006733358984224702, + "flos": 667143475200.0, + "grad_norm": 0.057198892540160154, + "language_loss": 1.05094206, + "learning_rate": 0.0007040493939600222, + "loss": 1.06232452, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 0.47949219, + "step": 35, + "time_per_iteration": 2.8476996421813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136821, + "balance_loss_mlp": 1.09926963, + "epoch": 0.006925740669488265, + "flos": 564372085248.0, + "grad_norm": 0.07105443011946577, + "language_loss": 1.05056715, + "learning_rate": 0.0007096279445021078, + "loss": 1.06193542, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 0.37548828, + "step": 36, + "time_per_iteration": 2.8306472301483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115252, + "balance_loss_mlp": 1.12274194, + "epoch": 0.007118122354751828, + "flos": 549887947776.0, + "grad_norm": 0.09366404592926651, + "language_loss": 1.11846077, + "learning_rate": 0.0007150536386503726, + "loss": 1.12998605, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 0.29736328, + "step": 37, + "time_per_iteration": 2.875190258026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150569, + "balance_loss_mlp": 1.12677491, + "epoch": 0.007310504040015391, + "flos": 702490973184.0, + "grad_norm": 0.0928332145488954, + "language_loss": 1.04548562, + "learning_rate": 0.0007203346302358509, + "loss": 1.05699134, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 0.23791504, + "step": 38, + "time_per_iteration": 3.0075292587280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128748, + "balance_loss_mlp": 1.10757613, + "epoch": 0.007502885725278953, + "flos": 599316890112.0, + "grad_norm": 0.056043607360260886, + "language_loss": 1.09224963, + "learning_rate": 0.000725478437577282, + "loss": 1.10353708, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 0.21179199, + "step": 39, + "time_per_iteration": 2.78564715385437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111769, + "balance_loss_mlp": 1.09953475, + "epoch": 0.007695267410542516, + "flos": 560285309952.0, + "grad_norm": 0.2122838817863008, + "language_loss": 1.04638147, + "learning_rate": 0.0007304920078549186, + "loss": 1.0575583, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 0.18151855, + "step": 40, + "time_per_iteration": 2.745100975036621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133734, + "balance_loss_mlp": 1.11621058, + "epoch": 0.007887649095806078, + "flos": 508170765312.0, + "grad_norm": 0.14528393981530327, + "language_loss": 1.06509054, + "learning_rate": 0.0007353817735343603, + "loss": 1.07642794, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 0.17529297, + "step": 41, + "time_per_iteration": 2.7425575256347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119894, + "balance_loss_mlp": 1.10357416, + "epoch": 0.008080030781069641, + "flos": 503893840896.0, + "grad_norm": 0.06769616325508275, + "language_loss": 1.0188365, + "learning_rate": 0.0007401537019902344, + "loss": 1.03003538, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 0.16308594, + "step": 42, + "time_per_iteration": 2.6797902584075928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118919, + "balance_loss_mlp": 1.10271883, + "epoch": 0.008272412466333205, + "flos": 518031811584.0, + "grad_norm": 0.14916902722339276, + "language_loss": 1.05194306, + "learning_rate": 0.0007448133392900729, + "loss": 1.06313229, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 0.1619873, + "step": 43, + "time_per_iteration": 2.779276132583618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116153, + "balance_loss_mlp": 1.09945166, + "epoch": 0.008464794151596768, + "flos": 607974626304.0, + "grad_norm": 0.052417895665492535, + "language_loss": 1.00651026, + "learning_rate": 0.0007493658489441491, + "loss": 1.0176717, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 0.16711426, + "step": 44, + "time_per_iteration": 2.965435028076172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108906, + "balance_loss_mlp": 1.09195447, + "epoch": 0.00865717583686033, + "flos": 537929372160.0, + "grad_norm": 0.04248825884697869, + "language_loss": 1.04600978, + "learning_rate": 0.0007538160463002316, + "loss": 1.05709875, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 0.16967773, + "step": 45, + "time_per_iteration": 2.7024173736572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105316, + "balance_loss_mlp": 1.08735132, + "epoch": 0.008849557522123894, + "flos": 508007780352.0, + "grad_norm": 0.08538228051147774, + "language_loss": 1.08093452, + "learning_rate": 0.0007581684291577274, + "loss": 1.09198785, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 0.17980957, + "step": 46, + "time_per_iteration": 2.6020169258117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105368, + "balance_loss_mlp": 1.08635402, + "epoch": 0.009041939207387457, + "flos": 625339657728.0, + "grad_norm": 0.04723509056908367, + "language_loss": 1.10695386, + "learning_rate": 0.0007624272050891776, + "loss": 1.11800754, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 0.19006348, + "step": 47, + "time_per_iteration": 2.8620407581329346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103189, + "balance_loss_mlp": 1.08244705, + "epoch": 0.00923432089265102, + "flos": 549421014528.0, + "grad_norm": 0.07235265954126073, + "language_loss": 1.00601125, + "learning_rate": 0.0007665963158851307, + "loss": 1.01704311, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 0.20751953, + "step": 48, + "time_per_iteration": 2.8312995433807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114938, + "balance_loss_mlp": 1.09308696, + "epoch": 0.009426702577914583, + "flos": 562496638464.0, + "grad_norm": 0.10505304652404167, + "language_loss": 1.09914839, + "learning_rate": 0.0007706794594783609, + "loss": 1.1102978, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 0.21850586, + "step": 49, + "time_per_iteration": 2.779561758041382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110011, + "balance_loss_mlp": 1.0874207, + "epoch": 0.009619084263178146, + "flos": 616773325824.0, + "grad_norm": 0.04709564792407722, + "language_loss": 1.08694363, + "learning_rate": 0.0007746801096530423, + "loss": 1.09804368, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 0.22583008, + "step": 50, + "time_per_iteration": 2.785332441329956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111628, + "balance_loss_mlp": 1.09285581, + "epoch": 0.009811465948441709, + "flos": 541437986304.0, + "grad_norm": 0.09574874491356838, + "language_loss": 1.13402438, + "learning_rate": 0.0007786015338021173, + "loss": 1.14518726, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 0.23425293, + "step": 51, + "time_per_iteration": 2.676326274871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118129, + "balance_loss_mlp": 1.09500206, + "epoch": 0.010003847633705272, + "flos": 535881028608.0, + "grad_norm": 0.12325193255180054, + "language_loss": 1.06019998, + "learning_rate": 0.0007824468089603051, + "loss": 1.07138121, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 0.23144531, + "step": 52, + "time_per_iteration": 2.688828945159912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113509, + "balance_loss_mlp": 1.11038983, + "epoch": 0.010196229318968833, + "flos": 909254315520.0, + "grad_norm": 0.07208467676878935, + "language_loss": 1.05329835, + "learning_rate": 0.0007862188363098669, + "loss": 1.06464922, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 0.24707031, + "step": 53, + "time_per_iteration": 3.3342933654785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126914, + "balance_loss_mlp": 1.10158229, + "epoch": 0.010388611004232396, + "flos": 585868308480.0, + "grad_norm": 0.09794855088059086, + "language_loss": 1.06043434, + "learning_rate": 0.0007899203543304438, + "loss": 1.07170355, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 0.25354004, + "step": 54, + "time_per_iteration": 2.933236837387085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145083, + "balance_loss_mlp": 1.12053776, + "epoch": 0.01058099268949596, + "flos": 502480558080.0, + "grad_norm": 0.1404118977896248, + "language_loss": 1.20000231, + "learning_rate": 0.0007935539507422731, + "loss": 1.2114532, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 0.24536133, + "step": 55, + "time_per_iteration": 2.8257975578308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153246, + "balance_loss_mlp": 1.12969017, + "epoch": 0.010773374374759523, + "flos": 544447360512.0, + "grad_norm": 0.05382700946372506, + "language_loss": 1.10560298, + "learning_rate": 0.0007971220733732573, + "loss": 1.11713552, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 0.2355957, + "step": 56, + "time_per_iteration": 2.749382495880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154989, + "balance_loss_mlp": 1.13151693, + "epoch": 0.010965756060023086, + "flos": 526155803136.0, + "grad_norm": 0.17392462927294325, + "language_loss": 1.05995011, + "learning_rate": 0.0008006270400641869, + "loss": 1.07150006, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 0.23474121, + "step": 57, + "time_per_iteration": 2.743929147720337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125904, + "balance_loss_mlp": 1.10234821, + "epoch": 0.011158137745286649, + "flos": 576941128704.0, + "grad_norm": 0.10169017538987117, + "language_loss": 1.06833839, + "learning_rate": 0.0008040710477125043, + "loss": 1.07959747, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 0.23547363, + "step": 58, + "time_per_iteration": 2.7300469875335693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111797, + "balance_loss_mlp": 1.08861065, + "epoch": 0.011350519430550212, + "flos": 529281547776.0, + "grad_norm": 0.059941584643697095, + "language_loss": 1.07409072, + "learning_rate": 0.0008074561805429771, + "loss": 1.08520865, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 0.23181152, + "step": 59, + "time_per_iteration": 2.6550745964050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123772, + "balance_loss_mlp": 1.09970331, + "epoch": 0.011542901115813775, + "flos": 555879905280.0, + "grad_norm": 0.06438674129900752, + "language_loss": 1.04891515, + "learning_rate": 0.0008107844176832545, + "loss": 1.06015277, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 0.24072266, + "step": 60, + "time_per_iteration": 2.7009053230285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139619, + "balance_loss_mlp": 1.11569333, + "epoch": 0.011735282801077338, + "flos": 572095954944.0, + "grad_norm": 0.09833112160800331, + "language_loss": 1.0671711, + "learning_rate": 0.0008140576401132568, + "loss": 1.07856739, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 0.23913574, + "step": 61, + "time_per_iteration": 2.678501844406128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114169, + "balance_loss_mlp": 1.11887348, + "epoch": 0.0119276644863409, + "flos": 615589839360.0, + "grad_norm": 0.11014501355567002, + "language_loss": 1.07748628, + "learning_rate": 0.0008172776370494935, + "loss": 1.08890319, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 0.22814941, + "step": 62, + "time_per_iteration": 2.7718141078948975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116479, + "balance_loss_mlp": 1.09356666, + "epoch": 0.012120046171604464, + "flos": 501084527616.0, + "grad_norm": 0.06441650429015075, + "language_loss": 1.15269816, + "learning_rate": 0.0008204461118185703, + "loss": 1.16386294, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 0.22912598, + "step": 63, + "time_per_iteration": 2.5839178562164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117138, + "balance_loss_mlp": 1.09543014, + "epoch": 0.012312427856868027, + "flos": 473347100160.0, + "grad_norm": 0.06608006175674933, + "language_loss": 1.04523873, + "learning_rate": 0.0008235646872681536, + "loss": 1.05641007, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 0.21728516, + "step": 64, + "time_per_iteration": 2.5611703395843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127906, + "balance_loss_mlp": 1.10659182, + "epoch": 0.012504809542131588, + "flos": 538369141248.0, + "grad_norm": 0.07834673611922068, + "language_loss": 1.04319417, + "learning_rate": 0.0008266349107584288, + "loss": 1.05447328, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 0.21313477, + "step": 65, + "time_per_iteration": 2.727666139602661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141841, + "balance_loss_mlp": 1.1207881, + "epoch": 0.012697191227395151, + "flos": 608730826752.0, + "grad_norm": 0.06003338375813584, + "language_loss": 1.07126927, + "learning_rate": 0.0008296582587724851, + "loss": 1.08268762, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 0.21057129, + "step": 66, + "time_per_iteration": 2.716701030731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127113, + "balance_loss_mlp": 1.10609627, + "epoch": 0.012889572912658714, + "flos": 768079065600.0, + "grad_norm": 0.04807876202194694, + "language_loss": 1.04662776, + "learning_rate": 0.0008326361411800136, + "loss": 1.05789876, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 0.21008301, + "step": 67, + "time_per_iteration": 2.9571592807769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114337, + "balance_loss_mlp": 1.09446514, + "epoch": 0.013081954597922277, + "flos": 533887013376.0, + "grad_norm": 0.05551510449528945, + "language_loss": 1.05008268, + "learning_rate": 0.0008355699051851403, + "loss": 1.06122601, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 0.1986084, + "step": 68, + "time_per_iteration": 2.725504159927368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143621, + "balance_loss_mlp": 1.1242373, + "epoch": 0.01327433628318584, + "flos": 573096632832.0, + "grad_norm": 0.0697970629442659, + "language_loss": 1.12296045, + "learning_rate": 0.0008384608389860635, + "loss": 1.13439655, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 0.19372559, + "step": 69, + "time_per_iteration": 2.685215711593628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141098, + "balance_loss_mlp": 1.122311, + "epoch": 0.013466717968449404, + "flos": 497274536448.0, + "grad_norm": 0.08511613263061502, + "language_loss": 1.02745342, + "learning_rate": 0.000841310175171381, + "loss": 1.03886437, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 0.18774414, + "step": 70, + "time_per_iteration": 2.649937868118286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142561, + "balance_loss_mlp": 1.12464356, + "epoch": 0.013659099653712967, + "flos": 565511155200.0, + "grad_norm": 0.055787325190813475, + "language_loss": 1.0065217, + "learning_rate": 0.000844119093875517, + "loss": 1.0179472, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 0.17944336, + "step": 71, + "time_per_iteration": 2.753220319747925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152267, + "balance_loss_mlp": 1.13508892, + "epoch": 0.01385148133897653, + "flos": 573820526592.0, + "grad_norm": 0.08668312915327946, + "language_loss": 1.05463254, + "learning_rate": 0.0008468887257134666, + "loss": 1.0661552, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 0.17199707, + "step": 72, + "time_per_iteration": 2.7056305408477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117134, + "balance_loss_mlp": 1.15478206, + "epoch": 0.014043863024240093, + "flos": 576822560256.0, + "grad_norm": 0.07356095482564125, + "language_loss": 1.08388793, + "learning_rate": 0.0008496201545131264, + "loss": 1.09560132, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 0.16564941, + "step": 73, + "time_per_iteration": 2.7202537059783936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152437, + "balance_loss_mlp": 1.13545001, + "epoch": 0.014236244709503656, + "flos": 938681809920.0, + "grad_norm": 0.06787935984484554, + "language_loss": 1.06090975, + "learning_rate": 0.0008523144198617317, + "loss": 1.07243395, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 0.16992188, + "step": 74, + "time_per_iteration": 3.2090003490448 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139286, + "balance_loss_mlp": 1.1223346, + "epoch": 0.014428626394767219, + "flos": 528483502080.0, + "grad_norm": 0.04825332815792917, + "language_loss": 1.053195, + "learning_rate": 0.0008549725194813783, + "loss": 1.06458783, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 0.16967773, + "step": 75, + "time_per_iteration": 2.654343605041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112208, + "balance_loss_mlp": 1.10599899, + "epoch": 0.014621008080030782, + "flos": 803752533504.0, + "grad_norm": 0.03887402020767282, + "language_loss": 1.04797029, + "learning_rate": 0.0008575954114472099, + "loss": 1.05919111, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 0.1607666, + "step": 76, + "time_per_iteration": 3.119884967803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134479, + "balance_loss_mlp": 1.1187191, + "epoch": 0.014813389765294343, + "flos": 696941356032.0, + "grad_norm": 0.056937643991546806, + "language_loss": 1.02038705, + "learning_rate": 0.0008601840162606118, + "loss": 1.03173184, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 0.1574707, + "step": 77, + "time_per_iteration": 3.025688886642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146725, + "balance_loss_mlp": 1.13034582, + "epoch": 0.015005771450557906, + "flos": 596994333696.0, + "grad_norm": 0.04989291514363055, + "language_loss": 1.08127129, + "learning_rate": 0.000862739218788641, + "loss": 1.09273863, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 0.16381836, + "step": 78, + "time_per_iteration": 2.7922520637512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149681, + "balance_loss_mlp": 1.13339734, + "epoch": 0.01519815313582147, + "flos": 549416245248.0, + "grad_norm": 0.06709094188277621, + "language_loss": 1.06189477, + "learning_rate": 0.0008652618700799138, + "loss": 1.07339156, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 0.1628418, + "step": 79, + "time_per_iteration": 2.6902618408203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153255, + "balance_loss_mlp": 1.1367681, + "epoch": 0.015390534821085032, + "flos": 430532692992.0, + "grad_norm": 0.062162504049989416, + "language_loss": 1.05161238, + "learning_rate": 0.0008677527890662774, + "loss": 1.06314492, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 0.16491699, + "step": 80, + "time_per_iteration": 2.475771188735962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147193, + "balance_loss_mlp": 1.13076603, + "epoch": 0.015582916506348595, + "flos": 524119942656.0, + "grad_norm": 0.04934081686369646, + "language_loss": 1.06529951, + "learning_rate": 0.0008702127641587799, + "loss": 1.0767715, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 0.16430664, + "step": 81, + "time_per_iteration": 2.634038209915161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142434, + "balance_loss_mlp": 1.12558985, + "epoch": 0.015775298191612157, + "flos": 575443782144.0, + "grad_norm": 0.08879987127008451, + "language_loss": 1.0221808, + "learning_rate": 0.0008726425547457192, + "loss": 1.0336051, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 0.16845703, + "step": 82, + "time_per_iteration": 2.74308705329895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147568, + "balance_loss_mlp": 1.13108134, + "epoch": 0.01596767987687572, + "flos": 610319577600.0, + "grad_norm": 0.06313420095488197, + "language_loss": 1.01906681, + "learning_rate": 0.0008750428925998964, + "loss": 1.03054249, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 0.16491699, + "step": 83, + "time_per_iteration": 2.777132511138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146759, + "balance_loss_mlp": 1.13009322, + "epoch": 0.016160061562139283, + "flos": 567136982016.0, + "grad_norm": 0.11663644047392754, + "language_loss": 1.07169831, + "learning_rate": 0.0008774144832015932, + "loss": 1.08316588, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 0.16674805, + "step": 84, + "time_per_iteration": 2.733287811279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01524523, + "balance_loss_mlp": 1.51412809, + "epoch": 0.016352443247402846, + "flos": 1411343543808.0, + "grad_norm": 0.22860236459315994, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.76298833, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 0.10400391, + "step": 85, + "time_per_iteration": 4.57580041885376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166169, + "balance_loss_mlp": 1.1501826, + "epoch": 0.01654482493266641, + "flos": 730497844224.0, + "grad_norm": 0.05249425037579876, + "language_loss": 1.01959693, + "learning_rate": 0.0008820741205014318, + "loss": 1.03125858, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 0.15979004, + "step": 86, + "time_per_iteration": 2.8773865699768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223619, + "balance_loss_mlp": 1.20703709, + "epoch": 0.016737206617929972, + "flos": 536293633536.0, + "grad_norm": 0.10761462625124436, + "language_loss": 1.03955913, + "learning_rate": 0.0008843634575408404, + "loss": 1.05179524, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 0.16577148, + "step": 87, + "time_per_iteration": 2.6694159507751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228231, + "balance_loss_mlp": 1.21267366, + "epoch": 0.016929588303193535, + "flos": 536990363136.0, + "grad_norm": 0.10737104518045529, + "language_loss": 1.05078888, + "learning_rate": 0.0008866266301555082, + "loss": 1.06307125, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 0.15551758, + "step": 88, + "time_per_iteration": 2.7686069011688232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212138, + "balance_loss_mlp": 1.19609249, + "epoch": 0.017121969988457098, + "flos": 526756359168.0, + "grad_norm": 0.1616084590878673, + "language_loss": 1.0609467, + "learning_rate": 0.0008888642296509615, + "loss": 1.07306814, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 0.16040039, + "step": 89, + "time_per_iteration": 2.625988721847534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199649, + "balance_loss_mlp": 1.18316197, + "epoch": 0.01731435167372066, + "flos": 625596618240.0, + "grad_norm": 0.07585409016808545, + "language_loss": 1.1065979, + "learning_rate": 0.0008910768275115906, + "loss": 1.11859453, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 0.16491699, + "step": 90, + "time_per_iteration": 2.793017864227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173524, + "balance_loss_mlp": 1.15697813, + "epoch": 0.017506733358984224, + "flos": 496402338816.0, + "grad_norm": 0.07277460951060387, + "language_loss": 1.06493175, + "learning_rate": 0.0008932649762767675, + "loss": 1.07666695, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 0.16552734, + "step": 91, + "time_per_iteration": 2.5919723510742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169355, + "balance_loss_mlp": 1.15323818, + "epoch": 0.017699115044247787, + "flos": 745933100544.0, + "grad_norm": 0.10172519854243242, + "language_loss": 1.09112859, + "learning_rate": 0.0008954292103690864, + "loss": 1.10282218, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 0.16113281, + "step": 92, + "time_per_iteration": 2.9366836547851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174542, + "balance_loss_mlp": 1.15828145, + "epoch": 0.01789149672951135, + "flos": 515509194240.0, + "grad_norm": 0.07803491111319032, + "language_loss": 1.10981905, + "learning_rate": 0.0008975700468778296, + "loss": 1.12156439, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 0.16259766, + "step": 93, + "time_per_iteration": 2.592458963394165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156862, + "balance_loss_mlp": 1.14067388, + "epoch": 0.018083878414774913, + "flos": 586125268992.0, + "grad_norm": 0.09102852745954727, + "language_loss": 1.04703569, + "learning_rate": 0.0008996879863005366, + "loss": 1.05860424, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 0.16186523, + "step": 94, + "time_per_iteration": 2.71566104888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148536, + "balance_loss_mlp": 1.13235974, + "epoch": 0.018276260100038477, + "flos": 497356028928.0, + "grad_norm": 0.03859462796979438, + "language_loss": 1.04768109, + "learning_rate": 0.0009017835132453337, + "loss": 1.05916631, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 0.16174316, + "step": 95, + "time_per_iteration": 2.664511203765869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137887, + "balance_loss_mlp": 1.121889, + "epoch": 0.01846864178530204, + "flos": 640058360832.0, + "grad_norm": 0.060963703759419355, + "language_loss": 1.04675508, + "learning_rate": 0.0009038570970964896, + "loss": 1.05813384, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 0.15991211, + "step": 96, + "time_per_iteration": 2.7669789791107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112506, + "balance_loss_mlp": 1.10899043, + "epoch": 0.018661023470565603, + "flos": 511662127104.0, + "grad_norm": 0.0943042692373462, + "language_loss": 1.02071011, + "learning_rate": 0.0009059091926454854, + "loss": 1.03196073, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 0.16064453, + "step": 97, + "time_per_iteration": 2.6028668880462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126665, + "balance_loss_mlp": 1.11052442, + "epoch": 0.018853405155829166, + "flos": 931106244096.0, + "grad_norm": 0.06745462513624549, + "language_loss": 1.0144124, + "learning_rate": 0.0009079402406897198, + "loss": 1.02567911, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 0.16137695, + "step": 98, + "time_per_iteration": 3.2679431438446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127975, + "balance_loss_mlp": 1.11166739, + "epoch": 0.01904578684109273, + "flos": 576484107264.0, + "grad_norm": 0.10523687850003575, + "language_loss": 1.03251696, + "learning_rate": 0.0009099506686008212, + "loss": 1.04379678, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 0.16308594, + "step": 99, + "time_per_iteration": 2.8251914978027344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116643, + "balance_loss_mlp": 1.10100293, + "epoch": 0.019238168526356292, + "flos": 558442169856.0, + "grad_norm": 0.08495157768411668, + "language_loss": 1.0609076, + "learning_rate": 0.0009119408908644013, + "loss": 1.07207406, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 0.15625, + "step": 100, + "time_per_iteration": 2.6573309898376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113727, + "balance_loss_mlp": 1.12211871, + "epoch": 0.019430550211619855, + "flos": 723851375616.0, + "grad_norm": 0.09022378013673595, + "language_loss": 1.11755276, + "learning_rate": 0.0009139113095929519, + "loss": 1.12892556, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 0.15124512, + "step": 101, + "time_per_iteration": 2.844698429107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159661, + "balance_loss_mlp": 1.14373517, + "epoch": 0.019622931896883418, + "flos": 499478524416.0, + "grad_norm": 0.0892612752622512, + "language_loss": 1.05698013, + "learning_rate": 0.0009158623150134762, + "loss": 1.06857681, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 0.15917969, + "step": 102, + "time_per_iteration": 2.589857339859009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137313, + "balance_loss_mlp": 1.12158906, + "epoch": 0.01981531358214698, + "flos": 509188695552.0, + "grad_norm": 0.06508497546963277, + "language_loss": 1.05496848, + "learning_rate": 0.000917794285931332, + "loss": 1.06634164, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 0.15710449, + "step": 103, + "time_per_iteration": 2.6433918476104736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117696, + "balance_loss_mlp": 1.1019367, + "epoch": 0.020007695267410544, + "flos": 521347705344.0, + "grad_norm": 0.07675487095909958, + "language_loss": 0.97610366, + "learning_rate": 0.0009197075901716639, + "loss": 0.98728061, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 0.1574707, + "step": 104, + "time_per_iteration": 2.709157943725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137693, + "balance_loss_mlp": 1.12159956, + "epoch": 0.020200076952674107, + "flos": 533298940416.0, + "grad_norm": 0.05257934075389246, + "language_loss": 1.0758431, + "learning_rate": 0.0009216025849997171, + "loss": 1.08722019, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 0.16088867, + "step": 105, + "time_per_iteration": 2.7638583183288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111903, + "balance_loss_mlp": 1.09596467, + "epoch": 0.020392458637937667, + "flos": 684760324608.0, + "grad_norm": 0.07457888312135433, + "language_loss": 1.02261579, + "learning_rate": 0.0009234796175212258, + "loss": 1.03373492, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 0.15930176, + "step": 106, + "time_per_iteration": 2.9391980171203613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117989, + "balance_loss_mlp": 1.10228872, + "epoch": 0.02058484032320123, + "flos": 702115444224.0, + "grad_norm": 0.06024423434996524, + "language_loss": 1.05948544, + "learning_rate": 0.000925339025064007, + "loss": 1.07066536, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 0.15686035, + "step": 107, + "time_per_iteration": 2.975294828414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118819, + "balance_loss_mlp": 1.10334611, + "epoch": 0.020777222008464793, + "flos": 639082275840.0, + "grad_norm": 0.07105297051955457, + "language_loss": 0.99294066, + "learning_rate": 0.0009271811355418027, + "loss": 1.00412893, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 0.15454102, + "step": 108, + "time_per_iteration": 2.8750014305114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125269, + "balance_loss_mlp": 1.10940242, + "epoch": 0.020969603693728356, + "flos": 682091974656.0, + "grad_norm": 0.09212378946406244, + "language_loss": 1.05636311, + "learning_rate": 0.0009290062678013548, + "loss": 1.06761575, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 0.15856934, + "step": 109, + "time_per_iteration": 2.8552017211914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119898, + "balance_loss_mlp": 1.10393572, + "epoch": 0.02116198537899192, + "flos": 533395487232.0, + "grad_norm": 0.059465971869905314, + "language_loss": 1.04477715, + "learning_rate": 0.0009308147319536321, + "loss": 1.05597615, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 0.1595459, + "step": 110, + "time_per_iteration": 2.6493232250213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129754, + "balance_loss_mlp": 1.11385095, + "epoch": 0.021354367064255482, + "flos": 717479119872.0, + "grad_norm": 0.08324280754141193, + "language_loss": 1.10257316, + "learning_rate": 0.0009326068296900676, + "loss": 1.11387074, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 0.15893555, + "step": 111, + "time_per_iteration": 2.8384125232696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112769, + "balance_loss_mlp": 1.11171615, + "epoch": 0.021546748749519045, + "flos": 519556322304.0, + "grad_norm": 0.06941460102767082, + "language_loss": 1.01355243, + "learning_rate": 0.0009343828545846161, + "loss": 1.02482939, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 0.15966797, + "step": 112, + "time_per_iteration": 2.7743477821350098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114893, + "balance_loss_mlp": 1.13326573, + "epoch": 0.021739130434782608, + "flos": 505161391104.0, + "grad_norm": 0.047977415311889204, + "language_loss": 1.05199587, + "learning_rate": 0.0009361430923823841, + "loss": 1.06348515, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 0.15649414, + "step": 113, + "time_per_iteration": 2.6022982597351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118143, + "balance_loss_mlp": 1.10308659, + "epoch": 0.02193151212004617, + "flos": 463486053888.0, + "grad_norm": 0.080001842017843, + "language_loss": 1.09258401, + "learning_rate": 0.0009378878212755459, + "loss": 1.10376549, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 0.15039062, + "step": 114, + "time_per_iteration": 2.491594076156616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115419, + "balance_loss_mlp": 1.09967113, + "epoch": 0.022123893805309734, + "flos": 552272546304.0, + "grad_norm": 0.05036418666557463, + "language_loss": 0.9906168, + "learning_rate": 0.0009396173121672103, + "loss": 1.00177097, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 0.15734863, + "step": 115, + "time_per_iteration": 2.668848991394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112479, + "balance_loss_mlp": 1.10945916, + "epoch": 0.022316275490573297, + "flos": 636211293696.0, + "grad_norm": 0.05918191636932359, + "language_loss": 1.04414749, + "learning_rate": 0.0009413318289238633, + "loss": 1.05539548, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 0.15307617, + "step": 116, + "time_per_iteration": 2.7496132850646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106823, + "balance_loss_mlp": 1.09139705, + "epoch": 0.02250865717583686, + "flos": 798890107392.0, + "grad_norm": 0.1124204963758038, + "language_loss": 0.96924931, + "learning_rate": 0.0009430316286169771, + "loss": 0.98031747, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 0.15405273, + "step": 117, + "time_per_iteration": 3.026118278503418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135302, + "balance_loss_mlp": 1.11998308, + "epoch": 0.022701038861100423, + "flos": 456093296640.0, + "grad_norm": 0.03693994945601898, + "language_loss": 1.02417183, + "learning_rate": 0.0009447169617543361, + "loss": 1.03552485, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 0.15307617, + "step": 118, + "time_per_iteration": 2.575666666030884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156925, + "balance_loss_mlp": 1.14185703, + "epoch": 0.022893420546363986, + "flos": 583086159360.0, + "grad_norm": 0.10959367855453626, + "language_loss": 1.09001684, + "learning_rate": 0.0009463880725016029, + "loss": 1.1015861, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 0.15039062, + "step": 119, + "time_per_iteration": 2.6811347007751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115422, + "balance_loss_mlp": 1.10052109, + "epoch": 0.02308580223162755, + "flos": 561303240192.0, + "grad_norm": 0.05068852434870314, + "language_loss": 1.03909945, + "learning_rate": 0.0009480451988946134, + "loss": 1.05025363, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 0.14880371, + "step": 120, + "time_per_iteration": 2.801814079284668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106982, + "balance_loss_mlp": 1.09179425, + "epoch": 0.023278183916891113, + "flos": 771300983808.0, + "grad_norm": 0.05688398470992871, + "language_loss": 1.05377555, + "learning_rate": 0.0009496885730428627, + "loss": 1.06484532, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 0.1517334, + "step": 121, + "time_per_iteration": 3.04720139503479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121469, + "balance_loss_mlp": 1.10574555, + "epoch": 0.023470565602154676, + "flos": 553374540288.0, + "grad_norm": 0.08369646841136469, + "language_loss": 1.03908122, + "learning_rate": 0.0009513184213246156, + "loss": 1.05029583, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 0.15710449, + "step": 122, + "time_per_iteration": 2.61790132522583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129626, + "balance_loss_mlp": 1.11406958, + "epoch": 0.02366294728741824, + "flos": 560028349440.0, + "grad_norm": 0.05522871343558165, + "language_loss": 1.07008672, + "learning_rate": 0.0009529349645740552, + "loss": 1.08138299, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 0.15539551, + "step": 123, + "time_per_iteration": 2.69759464263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129797, + "balance_loss_mlp": 1.11481285, + "epoch": 0.0238553289726818, + "flos": 468553683456.0, + "grad_norm": 0.053769267634074955, + "language_loss": 1.05687594, + "learning_rate": 0.0009545384182608524, + "loss": 1.06817389, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 0.1496582, + "step": 124, + "time_per_iteration": 2.550584316253662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126, + "balance_loss_mlp": 1.11114669, + "epoch": 0.024047710657945365, + "flos": 560030920704.0, + "grad_norm": 0.08700167249890467, + "language_loss": 1.02945745, + "learning_rate": 0.0009561289926625252, + "loss": 1.04071736, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 0.14831543, + "step": 125, + "time_per_iteration": 2.6619794368743896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123522, + "balance_loss_mlp": 1.10831082, + "epoch": 0.024240092343208928, + "flos": 504775950336.0, + "grad_norm": 0.07114777459455598, + "language_loss": 1.07932711, + "learning_rate": 0.0009577068930299292, + "loss": 1.09056234, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 0.15209961, + "step": 126, + "time_per_iteration": 2.553642749786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125226, + "balance_loss_mlp": 1.11038458, + "epoch": 0.02443247402847249, + "flos": 435763307520.0, + "grad_norm": 0.08279894264625885, + "language_loss": 1.03556633, + "learning_rate": 0.0009592723197462087, + "loss": 1.04681861, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 0.14819336, + "step": 127, + "time_per_iteration": 2.7255966663360596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124545, + "balance_loss_mlp": 1.10936916, + "epoch": 0.024624855713736054, + "flos": 683769558528.0, + "grad_norm": 0.07600858050716931, + "language_loss": 0.99905002, + "learning_rate": 0.0009608254684795125, + "loss": 1.01029539, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 0.15148926, + "step": 128, + "time_per_iteration": 2.9839587211608887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113247, + "balance_loss_mlp": 1.11718702, + "epoch": 0.024817237398999614, + "flos": 524999480832.0, + "grad_norm": 0.08573045125619827, + "language_loss": 1.02976727, + "learning_rate": 0.0009623665303297678, + "loss": 1.04109192, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 0.15258789, + "step": 129, + "time_per_iteration": 2.7344865798950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119398, + "balance_loss_mlp": 1.10497391, + "epoch": 0.025009619084263177, + "flos": 655656602112.0, + "grad_norm": 0.07510500588649292, + "language_loss": 1.07057762, + "learning_rate": 0.0009638956919697878, + "loss": 1.08177161, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 0.14416504, + "step": 130, + "time_per_iteration": 2.864952802658081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104135, + "balance_loss_mlp": 1.08930528, + "epoch": 0.02520200076952674, + "flos": 454423053312.0, + "grad_norm": 0.0567118244953117, + "language_loss": 0.99135083, + "learning_rate": 0.0009654131357809714, + "loss": 1.00239229, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 0.14819336, + "step": 131, + "time_per_iteration": 2.6095099449157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123713, + "balance_loss_mlp": 1.1081202, + "epoch": 0.025394382454790303, + "flos": 839794563072.0, + "grad_norm": 0.05892082702998288, + "language_loss": 1.08188879, + "learning_rate": 0.0009669190399838441, + "loss": 1.09312594, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 0.15576172, + "step": 132, + "time_per_iteration": 3.096733331680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100803, + "balance_loss_mlp": 1.08531809, + "epoch": 0.025586764140053866, + "flos": 581025332736.0, + "grad_norm": 0.09564892115109941, + "language_loss": 1.01233923, + "learning_rate": 0.0009684135787636724, + "loss": 1.02334726, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 0.15478516, + "step": 133, + "time_per_iteration": 2.8120856285095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111325, + "balance_loss_mlp": 1.09529161, + "epoch": 0.02577914582531743, + "flos": 790249623552.0, + "grad_norm": 0.04870542745948935, + "language_loss": 1.05797207, + "learning_rate": 0.0009698969223913726, + "loss": 1.06908536, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 0.16027832, + "step": 134, + "time_per_iteration": 3.0269176959991455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123378, + "balance_loss_mlp": 1.10735679, + "epoch": 0.025971527510580992, + "flos": 594958473216.0, + "grad_norm": 0.04083122637660085, + "language_loss": 1.08225274, + "learning_rate": 0.0009713692373399265, + "loss": 1.09348655, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 0.16015625, + "step": 135, + "time_per_iteration": 2.690932273864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01798361, + "balance_loss_mlp": 1.75773478, + "epoch": 0.026163909195844555, + "flos": 1577629716480.0, + "grad_norm": 0.2058674005568875, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.8125459, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 0.40625, + "step": 136, + "time_per_iteration": 5.460411548614502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01507549, + "balance_loss_mlp": 1.47512448, + "epoch": 0.026356290881108118, + "flos": 1502074865664.0, + "grad_norm": 0.12866590611947104, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.79318589, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 0.32421875, + "step": 137, + "time_per_iteration": 4.989046335220337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146765, + "balance_loss_mlp": 1.13081443, + "epoch": 0.02654867256637168, + "flos": 597140066304.0, + "grad_norm": 0.04917093034878699, + "language_loss": 1.00934815, + "learning_rate": 0.0009757216201974225, + "loss": 1.02081585, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 0.1595459, + "step": 138, + "time_per_iteration": 2.9566736221313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162305, + "balance_loss_mlp": 1.1448524, + "epoch": 0.026741054251635244, + "flos": 545035433472.0, + "grad_norm": 0.06281235859244827, + "language_loss": 1.0596863, + "learning_rate": 0.0009771514130396581, + "loss": 1.07130933, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 0.17468262, + "step": 139, + "time_per_iteration": 2.683931350708008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150087, + "balance_loss_mlp": 1.1330874, + "epoch": 0.026933435936898807, + "flos": 506841546240.0, + "grad_norm": 0.09254080332591261, + "language_loss": 1.06202602, + "learning_rate": 0.00097857095638274, + "loss": 1.07352686, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 0.17016602, + "step": 140, + "time_per_iteration": 2.558708906173706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149417, + "balance_loss_mlp": 1.13241768, + "epoch": 0.02712581762216237, + "flos": 740860328448.0, + "grad_norm": 0.03864103733020509, + "language_loss": 0.97399604, + "learning_rate": 0.0009799803961288726, + "loss": 0.9854902, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 0.17016602, + "step": 141, + "time_per_iteration": 2.992034673690796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112347, + "balance_loss_mlp": 1.10685217, + "epoch": 0.027318199307425933, + "flos": 848373378048.0, + "grad_norm": 0.06378420241673269, + "language_loss": 1.03629804, + "learning_rate": 0.000981379875086876, + "loss": 1.0475328, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 0.16625977, + "step": 142, + "time_per_iteration": 3.063534736633301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121821, + "balance_loss_mlp": 1.10560894, + "epoch": 0.027510580992689496, + "flos": 575557581312.0, + "grad_norm": 0.046520134554953796, + "language_loss": 0.98784387, + "learning_rate": 0.0009827695330590185, + "loss": 0.99906206, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 0.1619873, + "step": 143, + "time_per_iteration": 2.6495330333709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124398, + "balance_loss_mlp": 1.1078757, + "epoch": 0.02770296267795306, + "flos": 772420230144.0, + "grad_norm": 0.05485832849515215, + "language_loss": 0.98036379, + "learning_rate": 0.0009841495069248256, + "loss": 0.99160779, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 0.1652832, + "step": 144, + "time_per_iteration": 2.9577834606170654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145392, + "balance_loss_mlp": 1.12901306, + "epoch": 0.027895344363216622, + "flos": 569387957760.0, + "grad_norm": 0.09798795242100523, + "language_loss": 0.97478735, + "learning_rate": 0.0009855199307219871, + "loss": 0.98624128, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 0.16381836, + "step": 145, + "time_per_iteration": 2.6759142875671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148365, + "balance_loss_mlp": 1.13168764, + "epoch": 0.028087726048480186, + "flos": 547360561152.0, + "grad_norm": 0.1254453322996171, + "language_loss": 0.99733889, + "learning_rate": 0.0009868809357244854, + "loss": 1.00882256, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 0.16687012, + "step": 146, + "time_per_iteration": 2.66375994682312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113683, + "balance_loss_mlp": 1.11978364, + "epoch": 0.02828010773374375, + "flos": 524789508096.0, + "grad_norm": 0.08248071954181796, + "language_loss": 1.03600287, + "learning_rate": 0.0009882326505180556, + "loss": 1.04737115, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 0.1706543, + "step": 147, + "time_per_iteration": 2.719353437423706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151222, + "balance_loss_mlp": 1.13280392, + "epoch": 0.02847248941900731, + "flos": 772440053760.0, + "grad_norm": 0.12761243433758393, + "language_loss": 1.02101135, + "learning_rate": 0.0009895752010730906, + "loss": 1.03252351, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 0.1842041, + "step": 148, + "time_per_iteration": 2.9704201221466064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141454, + "balance_loss_mlp": 1.12377512, + "epoch": 0.028664871104270875, + "flos": 534413417472.0, + "grad_norm": 0.07962775403881484, + "language_loss": 1.0825479, + "learning_rate": 0.0009909087108150867, + "loss": 1.09396255, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 0.17687988, + "step": 149, + "time_per_iteration": 2.7516071796417236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151554, + "balance_loss_mlp": 1.13330352, + "epoch": 0.028857252789534438, + "flos": 367766396928.0, + "grad_norm": 0.10196194967952074, + "language_loss": 1.09083438, + "learning_rate": 0.0009922333006927371, + "loss": 1.10235, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 0.18249512, + "step": 150, + "time_per_iteration": 2.4685099124908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170515, + "balance_loss_mlp": 1.15218103, + "epoch": 0.029049634474798, + "flos": 515482030080.0, + "grad_norm": 0.13259475383105176, + "language_loss": 1.020684, + "learning_rate": 0.0009935490892437632, + "loss": 1.03238916, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 0.18322754, + "step": 151, + "time_per_iteration": 2.5665087699890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166904, + "balance_loss_mlp": 1.14880824, + "epoch": 0.029242016160061564, + "flos": 588141305856.0, + "grad_norm": 0.10481585745820837, + "language_loss": 1.00390673, + "learning_rate": 0.0009948561926585687, + "loss": 1.01557577, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 0.18103027, + "step": 152, + "time_per_iteration": 2.7641003131866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139325, + "balance_loss_mlp": 1.122576, + "epoch": 0.029434397845325123, + "flos": 552079825920.0, + "grad_norm": 0.09697971136145118, + "language_loss": 1.05073512, + "learning_rate": 0.0009961547248418122, + "loss": 1.06212831, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 0.16760254, + "step": 153, + "time_per_iteration": 2.631476402282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123418, + "balance_loss_mlp": 1.10662186, + "epoch": 0.029626779530588686, + "flos": 603497640960.0, + "grad_norm": 0.05437877185758658, + "language_loss": 1.01441622, + "learning_rate": 0.0009974447974719707, + "loss": 1.0256505, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 0.16809082, + "step": 154, + "time_per_iteration": 2.709644317626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129323, + "balance_loss_mlp": 1.11151338, + "epoch": 0.02981916121585225, + "flos": 621089897472.0, + "grad_norm": 0.09703401576709127, + "language_loss": 1.03478801, + "learning_rate": 0.0009987265200589763, + "loss": 1.0460813, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 0.17810059, + "step": 155, + "time_per_iteration": 2.77809739112854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140894, + "balance_loss_mlp": 1.12376344, + "epoch": 0.030011542901115813, + "flos": 661633505280.0, + "grad_norm": 0.08300490544518559, + "language_loss": 1.02959824, + "learning_rate": 0.001, + "loss": 1.04100728, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 0.17150879, + "step": 156, + "time_per_iteration": 2.845790386199951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144802, + "balance_loss_mlp": 1.12720668, + "epoch": 0.030203924586379376, + "flos": 651569826816.0, + "grad_norm": 0.07590676388764007, + "language_loss": 1.00599122, + "learning_rate": 0.0009999999029413921, + "loss": 1.01743913, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 0.17614746, + "step": 157, + "time_per_iteration": 2.833735227584839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142594, + "balance_loss_mlp": 1.12554669, + "epoch": 0.03039630627164294, + "flos": 531354484224.0, + "grad_norm": 0.06607639809804342, + "language_loss": 1.01453137, + "learning_rate": 0.0009999996117656068, + "loss": 1.02595735, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 0.1706543, + "step": 158, + "time_per_iteration": 2.803636074066162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011301, + "balance_loss_mlp": 1.11345792, + "epoch": 0.030588687956906502, + "flos": 586189509120.0, + "grad_norm": 0.08769352458743468, + "language_loss": 0.94982773, + "learning_rate": 0.0009999991264727564, + "loss": 0.96112871, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 0.16638184, + "step": 159, + "time_per_iteration": 2.7776851654052734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135859, + "balance_loss_mlp": 1.11870432, + "epoch": 0.030781069642170065, + "flos": 513278042112.0, + "grad_norm": 0.05788098803643346, + "language_loss": 1.06247735, + "learning_rate": 0.0009999984470630296, + "loss": 1.07383585, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 0.17163086, + "step": 160, + "time_per_iteration": 2.6311371326446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125321, + "balance_loss_mlp": 1.10836911, + "epoch": 0.030973451327433628, + "flos": 718123719168.0, + "grad_norm": 0.05159431076001957, + "language_loss": 0.94850963, + "learning_rate": 0.0009999975735366902, + "loss": 0.95976287, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 0.16955566, + "step": 161, + "time_per_iteration": 3.0904829502105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148114, + "balance_loss_mlp": 1.13099504, + "epoch": 0.03116583301269719, + "flos": 1109771311104.0, + "grad_norm": 0.0692270455282635, + "language_loss": 0.96706492, + "learning_rate": 0.0009999965058940775, + "loss": 0.97854608, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 0.17138672, + "step": 162, + "time_per_iteration": 3.490063428878784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150632, + "balance_loss_mlp": 1.13323975, + "epoch": 0.031358214697960754, + "flos": 450907098624.0, + "grad_norm": 0.08572766411177644, + "language_loss": 1.03267431, + "learning_rate": 0.0009999952441356057, + "loss": 1.04418063, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 0.17382812, + "step": 163, + "time_per_iteration": 2.497690439224243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130614, + "balance_loss_mlp": 1.11405563, + "epoch": 0.031550596383224314, + "flos": 1255176870912.0, + "grad_norm": 0.05784293330097489, + "language_loss": 1.03805065, + "learning_rate": 0.000999993788261765, + "loss": 1.0493567, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 0.16564941, + "step": 164, + "time_per_iteration": 3.6041390895843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132475, + "balance_loss_mlp": 1.1152972, + "epoch": 0.03174297806848788, + "flos": 668136812544.0, + "grad_norm": 0.05766532368121917, + "language_loss": 1.05311596, + "learning_rate": 0.00099999213827312, + "loss": 1.06444073, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 0.171875, + "step": 165, + "time_per_iteration": 2.806014060974121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142431, + "balance_loss_mlp": 1.12589669, + "epoch": 0.03193535975375144, + "flos": 551299032576.0, + "grad_norm": 0.05992608893057494, + "language_loss": 1.00112009, + "learning_rate": 0.000999990294170312, + "loss": 1.01254439, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 0.16540527, + "step": 166, + "time_per_iteration": 2.6405951976776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113035, + "balance_loss_mlp": 1.11351717, + "epoch": 0.032127741439015006, + "flos": 543649314816.0, + "grad_norm": 0.05363857392651908, + "language_loss": 1.03767109, + "learning_rate": 0.0009999882559540566, + "loss": 1.04897451, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 0.16845703, + "step": 167, + "time_per_iteration": 2.69801664352417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127606, + "balance_loss_mlp": 1.11079764, + "epoch": 0.032320123124278566, + "flos": 548385831936.0, + "grad_norm": 0.03971308084427602, + "language_loss": 1.00767386, + "learning_rate": 0.000999986023625145, + "loss": 1.01894999, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 0.16821289, + "step": 168, + "time_per_iteration": 2.710706949234009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04227602, + "balance_loss_mlp": 3.93005633, + "epoch": 0.03251250480954213, + "flos": 1305886551552.0, + "grad_norm": 0.49669676383753814, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.8315202, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 2.96875, + "step": 169, + "time_per_iteration": 4.921034574508667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178384, + "balance_loss_mlp": 1.15987098, + "epoch": 0.03270488649480569, + "flos": 561132914688.0, + "grad_norm": 0.11256254520903143, + "language_loss": 1.01289928, + "learning_rate": 0.0009999809766328958, + "loss": 1.02468312, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 0.18518066, + "step": 170, + "time_per_iteration": 2.6784250736236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236994, + "balance_loss_mlp": 1.21676469, + "epoch": 0.03289726818006926, + "flos": 482363112960.0, + "grad_norm": 0.13219145589868983, + "language_loss": 1.0357101, + "learning_rate": 0.0009999781619715177, + "loss": 1.04807997, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 0.20227051, + "step": 171, + "time_per_iteration": 2.5412755012512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234758, + "balance_loss_mlp": 1.21518433, + "epoch": 0.03308964986533282, + "flos": 674647460352.0, + "grad_norm": 0.05193788120122226, + "language_loss": 1.03408492, + "learning_rate": 0.000999975153201402, + "loss": 1.0464325, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 0.19567871, + "step": 172, + "time_per_iteration": 2.864586353302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236351, + "balance_loss_mlp": 1.21688426, + "epoch": 0.033282031550596385, + "flos": 609217583616.0, + "grad_norm": 0.0814546252210238, + "language_loss": 1.01345742, + "learning_rate": 0.0009999719503237174, + "loss": 1.02582097, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 0.19470215, + "step": 173, + "time_per_iteration": 2.765923261642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266228, + "balance_loss_mlp": 1.24583161, + "epoch": 0.033474413235859944, + "flos": 468039762432.0, + "grad_norm": 0.11494520888694326, + "language_loss": 1.10141742, + "learning_rate": 0.0009999685533397073, + "loss": 1.11407971, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 0.20410156, + "step": 174, + "time_per_iteration": 2.5439114570617676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264706, + "balance_loss_mlp": 1.24525094, + "epoch": 0.03366679492112351, + "flos": 579634444800.0, + "grad_norm": 0.12313705571337571, + "language_loss": 1.01947784, + "learning_rate": 0.00099996496225069, + "loss": 1.03212488, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 0.19445801, + "step": 175, + "time_per_iteration": 2.6815552711486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01257561, + "balance_loss_mlp": 1.23677111, + "epoch": 0.03385917660638707, + "flos": 637678904832.0, + "grad_norm": 0.07888015485072913, + "language_loss": 1.04929149, + "learning_rate": 0.0009999611770580604, + "loss": 1.06186724, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 0.20788574, + "step": 176, + "time_per_iteration": 2.841484785079956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258013, + "balance_loss_mlp": 1.23668683, + "epoch": 0.03405155829165064, + "flos": 441816933888.0, + "grad_norm": 0.1202186920466195, + "language_loss": 1.03394961, + "learning_rate": 0.0009999571977632876, + "loss": 1.04652977, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 0.21350098, + "step": 177, + "time_per_iteration": 2.567788600921631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01271496, + "balance_loss_mlp": 1.25026441, + "epoch": 0.034243939976914196, + "flos": 466332443136.0, + "grad_norm": 0.09201820914192435, + "language_loss": 1.05765235, + "learning_rate": 0.0009999530243679166, + "loss": 1.07036722, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 0.21240234, + "step": 178, + "time_per_iteration": 2.5753743648529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258548, + "balance_loss_mlp": 1.23935485, + "epoch": 0.03443632166217776, + "flos": 779276671488.0, + "grad_norm": 0.06529189645852858, + "language_loss": 1.00495052, + "learning_rate": 0.0009999486568735675, + "loss": 1.01753592, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 0.19177246, + "step": 179, + "time_per_iteration": 3.0607473850250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251876, + "balance_loss_mlp": 1.23275518, + "epoch": 0.03462870334744132, + "flos": 1263777707520.0, + "grad_norm": 0.07628849485304477, + "language_loss": 1.00889277, + "learning_rate": 0.0009999440952819362, + "loss": 1.02141166, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 0.19116211, + "step": 180, + "time_per_iteration": 3.6515376567840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01248658, + "balance_loss_mlp": 1.22853494, + "epoch": 0.03482108503270489, + "flos": 607179151872.0, + "grad_norm": 0.05983966318213213, + "language_loss": 1.0115366, + "learning_rate": 0.0009999393395947935, + "loss": 1.02402306, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 0.2010498, + "step": 181, + "time_per_iteration": 2.799633502960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01253433, + "balance_loss_mlp": 1.23378766, + "epoch": 0.03501346671796845, + "flos": 538270396416.0, + "grad_norm": 0.0770350968764605, + "language_loss": 1.04747987, + "learning_rate": 0.0009999343898139858, + "loss": 1.06001413, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 0.19641113, + "step": 182, + "time_per_iteration": 2.627434253692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258891, + "balance_loss_mlp": 1.23675334, + "epoch": 0.035205848403232015, + "flos": 518484063744.0, + "grad_norm": 0.06485795323962908, + "language_loss": 1.03381288, + "learning_rate": 0.0009999292459414348, + "loss": 1.04640174, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 0.22131348, + "step": 183, + "time_per_iteration": 2.5552356243133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227697, + "balance_loss_mlp": 1.20765769, + "epoch": 0.035398230088495575, + "flos": 472373586432.0, + "grad_norm": 0.06837915158031915, + "language_loss": 1.07873201, + "learning_rate": 0.0009999239079791374, + "loss": 1.0910089, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 0.20031738, + "step": 184, + "time_per_iteration": 2.5553643703460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225953, + "balance_loss_mlp": 1.20453107, + "epoch": 0.03559061177375914, + "flos": 512074732032.0, + "grad_norm": 0.05538225102727573, + "language_loss": 1.00595856, + "learning_rate": 0.0009999183759291659, + "loss": 1.01821804, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 0.21435547, + "step": 185, + "time_per_iteration": 2.6955769062042236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199054, + "balance_loss_mlp": 1.17938447, + "epoch": 0.0357829934590227, + "flos": 477386887680.0, + "grad_norm": 0.052094207769016576, + "language_loss": 1.02581143, + "learning_rate": 0.0009999126497936682, + "loss": 1.03780198, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 0.1965332, + "step": 186, + "time_per_iteration": 2.5304598808288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198293, + "balance_loss_mlp": 1.1770494, + "epoch": 0.03597537514428627, + "flos": 644656485888.0, + "grad_norm": 0.057723222775786294, + "language_loss": 1.05774581, + "learning_rate": 0.0009999067295748676, + "loss": 1.06972873, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 0.21252441, + "step": 187, + "time_per_iteration": 2.797293186187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225876, + "balance_loss_mlp": 1.20496714, + "epoch": 0.03616775682954983, + "flos": 581186119680.0, + "grad_norm": 0.0756096280824464, + "language_loss": 1.03738201, + "learning_rate": 0.000999900615275062, + "loss": 1.04964077, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 0.20922852, + "step": 188, + "time_per_iteration": 2.677471399307251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211466, + "balance_loss_mlp": 1.18979406, + "epoch": 0.03636013851481339, + "flos": 382420859904.0, + "grad_norm": 0.0898221855427691, + "language_loss": 1.09605587, + "learning_rate": 0.0009998943068966256, + "loss": 1.10817051, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 0.21679688, + "step": 189, + "time_per_iteration": 2.4233202934265137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217638, + "balance_loss_mlp": 1.19651425, + "epoch": 0.03655252020007695, + "flos": 583224551424.0, + "grad_norm": 0.10338446511893212, + "language_loss": 1.03747463, + "learning_rate": 0.0009998878044420072, + "loss": 1.04965115, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 0.21130371, + "step": 190, + "time_per_iteration": 2.6978025436401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177731, + "balance_loss_mlp": 1.15573716, + "epoch": 0.03674490188534051, + "flos": 471619957248.0, + "grad_norm": 0.06881722524262912, + "language_loss": 0.99768066, + "learning_rate": 0.0009998811079137318, + "loss": 1.00945807, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 0.22009277, + "step": 191, + "time_per_iteration": 2.5934321880340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114348, + "balance_loss_mlp": 1.12218916, + "epoch": 0.03693728357060408, + "flos": 528372274176.0, + "grad_norm": 0.0852793637050772, + "language_loss": 1.0086391, + "learning_rate": 0.0009998742173143987, + "loss": 1.02007401, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 0.2130127, + "step": 192, + "time_per_iteration": 2.6706249713897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139307, + "balance_loss_mlp": 1.1180048, + "epoch": 0.03712966525586764, + "flos": 798993994752.0, + "grad_norm": 0.07456835679934387, + "language_loss": 1.01398337, + "learning_rate": 0.0009998671326466833, + "loss": 1.02537644, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 0.21313477, + "step": 193, + "time_per_iteration": 2.992595672607422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126772, + "balance_loss_mlp": 1.10519516, + "epoch": 0.037322046941131205, + "flos": 829973164032.0, + "grad_norm": 0.08171257283174432, + "language_loss": 1.02813613, + "learning_rate": 0.0009998598539133362, + "loss": 1.03940392, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 0.21594238, + "step": 194, + "time_per_iteration": 3.0081543922424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113199, + "balance_loss_mlp": 1.11179638, + "epoch": 0.037514428626394765, + "flos": 437685742080.0, + "grad_norm": 0.05573112518601677, + "language_loss": 1.02892375, + "learning_rate": 0.0009998523811171828, + "loss": 1.04024363, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 0.2019043, + "step": 195, + "time_per_iteration": 2.507708787918091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149122, + "balance_loss_mlp": 1.12843966, + "epoch": 0.03770681031165833, + "flos": 511625051136.0, + "grad_norm": 0.0935188115694547, + "language_loss": 1.0387187, + "learning_rate": 0.0009998447142611248, + "loss": 1.05020976, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 0.20690918, + "step": 196, + "time_per_iteration": 2.6388566493988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160139, + "balance_loss_mlp": 1.13986123, + "epoch": 0.03789919199692189, + "flos": 807449098752.0, + "grad_norm": 0.047444937864230444, + "language_loss": 0.96302813, + "learning_rate": 0.0009998368533481387, + "loss": 0.97462952, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 0.20275879, + "step": 197, + "time_per_iteration": 3.033572196960449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132809, + "balance_loss_mlp": 1.11254394, + "epoch": 0.03809157368218546, + "flos": 690576814080.0, + "grad_norm": 0.08710369828361038, + "language_loss": 0.9995833, + "learning_rate": 0.0009998287983812762, + "loss": 1.01091146, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 0.20263672, + "step": 198, + "time_per_iteration": 2.8421950340270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155397, + "balance_loss_mlp": 1.13373709, + "epoch": 0.03828395536744902, + "flos": 517940407296.0, + "grad_norm": 0.10277508525357126, + "language_loss": 1.05776644, + "learning_rate": 0.0009998205493636646, + "loss": 1.06932044, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 0.2166748, + "step": 199, + "time_per_iteration": 2.6924569606781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141939, + "balance_loss_mlp": 1.12035084, + "epoch": 0.038476337052712584, + "flos": 581662964736.0, + "grad_norm": 0.09429923895154278, + "language_loss": 0.98451054, + "learning_rate": 0.0009998121062985063, + "loss": 0.99592984, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 0.21594238, + "step": 200, + "time_per_iteration": 2.6926732063293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171328, + "balance_loss_mlp": 1.15014482, + "epoch": 0.03866871873797614, + "flos": 577086861312.0, + "grad_norm": 0.08332681767957313, + "language_loss": 1.00419915, + "learning_rate": 0.0009998034691890794, + "loss": 1.01591253, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 0.21203613, + "step": 201, + "time_per_iteration": 2.7643332481384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165409, + "balance_loss_mlp": 1.14516699, + "epoch": 0.03886110042323971, + "flos": 540731344896.0, + "grad_norm": 0.11326578301102472, + "language_loss": 1.05536067, + "learning_rate": 0.0009997946380387369, + "loss": 1.06701469, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 0.20251465, + "step": 202, + "time_per_iteration": 2.630284070968628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157571, + "balance_loss_mlp": 1.13723421, + "epoch": 0.03905348210850327, + "flos": 718002952704.0, + "grad_norm": 0.09790094078320352, + "language_loss": 1.07388449, + "learning_rate": 0.0009997856128509076, + "loss": 1.08546019, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 0.20336914, + "step": 203, + "time_per_iteration": 2.8435540199279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144349, + "balance_loss_mlp": 1.12458408, + "epoch": 0.039245863793766836, + "flos": 427493583360.0, + "grad_norm": 0.1356659453961297, + "language_loss": 1.02559984, + "learning_rate": 0.0009997763936290952, + "loss": 1.03704333, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 0.19750977, + "step": 204, + "time_per_iteration": 2.503309965133667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138207, + "balance_loss_mlp": 1.11642766, + "epoch": 0.039438245479030395, + "flos": 663096347136.0, + "grad_norm": 0.053010676996176516, + "language_loss": 1.07603145, + "learning_rate": 0.0009997669803768789, + "loss": 1.08741355, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 0.21789551, + "step": 205, + "time_per_iteration": 2.7773749828338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011146, + "balance_loss_mlp": 1.09366679, + "epoch": 0.03963062716429396, + "flos": 635349007872.0, + "grad_norm": 0.07785432610828748, + "language_loss": 1.0289582, + "learning_rate": 0.0009997573730979134, + "loss": 1.04010415, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 0.20947266, + "step": 206, + "time_per_iteration": 2.7241222858428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04720912, + "balance_loss_mlp": 3.71993518, + "epoch": 0.03982300884955752, + "flos": 1418565975552.0, + "grad_norm": 0.31672297251450016, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.83914113, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 10.0, + "step": 207, + "time_per_iteration": 4.65311074256897 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160001, + "balance_loss_mlp": 1.13651657, + "epoch": 0.04001539053482109, + "flos": 689118741504.0, + "grad_norm": 0.09244016287770654, + "language_loss": 1.01599813, + "learning_rate": 0.0009997375764747294, + "loss": 1.02759814, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 0.23449707, + "step": 208, + "time_per_iteration": 2.999249219894409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144547, + "balance_loss_mlp": 1.12159967, + "epoch": 0.04020777222008465, + "flos": 533639964672.0, + "grad_norm": 0.10768555369795524, + "language_loss": 0.98886019, + "learning_rate": 0.0009997273871381967, + "loss": 1.00030565, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 0.22949219, + "step": 209, + "time_per_iteration": 2.740895986557007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154635, + "balance_loss_mlp": 1.13075733, + "epoch": 0.040400153905348214, + "flos": 567927687168.0, + "grad_norm": 0.0670178022721504, + "language_loss": 1.03911638, + "learning_rate": 0.0009997170037902862, + "loss": 1.05066276, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 0.23876953, + "step": 210, + "time_per_iteration": 2.7199809551239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161677, + "balance_loss_mlp": 1.13826418, + "epoch": 0.040592535590611774, + "flos": 713439332352.0, + "grad_norm": 0.062356382061819024, + "language_loss": 1.06535935, + "learning_rate": 0.0009997064264350292, + "loss": 1.07697606, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 0.23413086, + "step": 211, + "time_per_iteration": 2.85477614402771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164794, + "balance_loss_mlp": 1.14111865, + "epoch": 0.04078491727587533, + "flos": 578100022272.0, + "grad_norm": 0.11782714892356931, + "language_loss": 1.00570273, + "learning_rate": 0.0009996956550765317, + "loss": 1.01735067, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 0.23657227, + "step": 212, + "time_per_iteration": 2.683258295059204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178964, + "balance_loss_mlp": 1.15452623, + "epoch": 0.0409772989611389, + "flos": 552299710464.0, + "grad_norm": 0.07352585681220185, + "language_loss": 0.95357072, + "learning_rate": 0.0009996846897189762, + "loss": 0.9653604, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 0.24438477, + "step": 213, + "time_per_iteration": 2.64486026763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171157, + "balance_loss_mlp": 1.14665973, + "epoch": 0.04116968064640246, + "flos": 555630285312.0, + "grad_norm": 0.06101080420793073, + "language_loss": 1.01569629, + "learning_rate": 0.0009996735303666193, + "loss": 1.02740788, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 0.24499512, + "step": 214, + "time_per_iteration": 2.719754934310913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189275, + "balance_loss_mlp": 1.16434813, + "epoch": 0.041362062331666026, + "flos": 578492803584.0, + "grad_norm": 0.09805160088916984, + "language_loss": 1.03784573, + "learning_rate": 0.0009996621770237937, + "loss": 1.04973853, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 0.24938965, + "step": 215, + "time_per_iteration": 2.7283520698547363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202725, + "balance_loss_mlp": 1.17728579, + "epoch": 0.041554444016929586, + "flos": 611443593216.0, + "grad_norm": 0.05858333324383458, + "language_loss": 0.99328029, + "learning_rate": 0.0009996506296949073, + "loss": 1.00530756, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 0.25463867, + "step": 216, + "time_per_iteration": 2.8774044513702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175124, + "balance_loss_mlp": 1.14957714, + "epoch": 0.04174682570219315, + "flos": 528115313664.0, + "grad_norm": 0.09898600739692984, + "language_loss": 0.99386859, + "learning_rate": 0.0009996388883844428, + "loss": 1.00561976, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 0.25561523, + "step": 217, + "time_per_iteration": 2.5985324382781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155134, + "balance_loss_mlp": 1.13007665, + "epoch": 0.04193920738745671, + "flos": 511506482688.0, + "grad_norm": 0.06208913439552352, + "language_loss": 1.03500867, + "learning_rate": 0.0009996269530969588, + "loss": 1.04656017, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 0.25048828, + "step": 218, + "time_per_iteration": 2.591993808746338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152332, + "balance_loss_mlp": 1.12778735, + "epoch": 0.04213158907272028, + "flos": 571490629632.0, + "grad_norm": 0.08789931910276294, + "language_loss": 1.02762055, + "learning_rate": 0.0009996148238370888, + "loss": 1.0391438, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 0.24536133, + "step": 219, + "time_per_iteration": 2.7247660160064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146753, + "balance_loss_mlp": 1.12125421, + "epoch": 0.04232397075798384, + "flos": 964222589952.0, + "grad_norm": 0.059765696203788965, + "language_loss": 0.98427057, + "learning_rate": 0.0009996025006095421, + "loss": 0.99573809, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 0.25524902, + "step": 220, + "time_per_iteration": 3.314250946044922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04012538, + "balance_loss_mlp": 3.61886096, + "epoch": 0.042516352443247404, + "flos": 1469595778560.0, + "grad_norm": 0.18322335632445477, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.81795681, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 3.921875, + "step": 221, + "time_per_iteration": 5.397853851318359 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142792, + "balance_loss_mlp": 1.11779404, + "epoch": 0.042708734128510964, + "flos": 654712823808.0, + "grad_norm": 0.10045289138425088, + "language_loss": 0.98726314, + "learning_rate": 0.0009995772722706307, + "loss": 0.99869102, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 0.25, + "step": 222, + "time_per_iteration": 2.8346786499023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168149, + "balance_loss_mlp": 1.14130318, + "epoch": 0.04290111581377453, + "flos": 431827407360.0, + "grad_norm": 0.07395583213906755, + "language_loss": 1.12709904, + "learning_rate": 0.0009995643671690604, + "loss": 1.13878047, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 0.26879883, + "step": 223, + "time_per_iteration": 2.4760169982910156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157966, + "balance_loss_mlp": 1.1317513, + "epoch": 0.04309349749903809, + "flos": 644676309504.0, + "grad_norm": 0.08239055528326475, + "language_loss": 1.00208497, + "learning_rate": 0.0009995512681194023, + "loss": 1.01366448, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 0.26257324, + "step": 224, + "time_per_iteration": 2.833751916885376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151672, + "balance_loss_mlp": 1.12492132, + "epoch": 0.04328587918430166, + "flos": 831267505152.0, + "grad_norm": 0.058356102807926864, + "language_loss": 0.97854793, + "learning_rate": 0.0009995379751267417, + "loss": 0.99006462, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 0.2677002, + "step": 225, + "time_per_iteration": 3.295761823654175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182559, + "balance_loss_mlp": 1.1551652, + "epoch": 0.043478260869565216, + "flos": 525066292224.0, + "grad_norm": 0.09032086206875983, + "language_loss": 0.99067688, + "learning_rate": 0.0009995244881962398, + "loss": 1.00250244, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 0.27416992, + "step": 226, + "time_per_iteration": 2.6147754192352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162924, + "balance_loss_mlp": 1.1352675, + "epoch": 0.04367064255482878, + "flos": 439484465664.0, + "grad_norm": 0.05273235380658081, + "language_loss": 1.00220668, + "learning_rate": 0.0009995108073331323, + "loss": 1.01383591, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 0.27661133, + "step": 227, + "time_per_iteration": 2.575477361679077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165107, + "balance_loss_mlp": 1.13835633, + "epoch": 0.04386302424009234, + "flos": 507380060160.0, + "grad_norm": 0.07222661628022838, + "language_loss": 1.03328192, + "learning_rate": 0.0009994969325427309, + "loss": 1.04493296, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 0.26733398, + "step": 228, + "time_per_iteration": 2.7351901531219482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159475, + "balance_loss_mlp": 1.13215184, + "epoch": 0.04405540592535591, + "flos": 540694268928.0, + "grad_norm": 0.05690950477809338, + "language_loss": 0.99788582, + "learning_rate": 0.0009994828638304218, + "loss": 1.0094806, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 0.2734375, + "step": 229, + "time_per_iteration": 2.6617660522460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160216, + "balance_loss_mlp": 1.13327467, + "epoch": 0.04424778761061947, + "flos": 446370642432.0, + "grad_norm": 0.0671245201901001, + "language_loss": 1.05080867, + "learning_rate": 0.0009994686012016675, + "loss": 1.06241083, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 0.26953125, + "step": 230, + "time_per_iteration": 2.5507686138153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200075, + "balance_loss_mlp": 1.17368245, + "epoch": 0.044440169295883035, + "flos": 700702161408.0, + "grad_norm": 0.08083200993131012, + "language_loss": 1.04836714, + "learning_rate": 0.000999454144662005, + "loss": 1.06036782, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 0.26416016, + "step": 231, + "time_per_iteration": 2.872386932373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177085, + "balance_loss_mlp": 1.15090632, + "epoch": 0.044632550981146595, + "flos": 588329256960.0, + "grad_norm": 0.06521500069668446, + "language_loss": 0.98697901, + "learning_rate": 0.0009994394942170468, + "loss": 0.99874985, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 0.26208496, + "step": 232, + "time_per_iteration": 2.6734542846679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160129, + "balance_loss_mlp": 1.13452244, + "epoch": 0.04482493266641016, + "flos": 554797734912.0, + "grad_norm": 0.06848368332912834, + "language_loss": 0.96340638, + "learning_rate": 0.0009994246498724808, + "loss": 0.97500765, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 0.25598145, + "step": 233, + "time_per_iteration": 2.735145330429077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169344, + "balance_loss_mlp": 1.14341569, + "epoch": 0.04501731435167372, + "flos": 722813621760.0, + "grad_norm": 0.09664881582101635, + "language_loss": 0.99309772, + "learning_rate": 0.00099940961163407, + "loss": 1.00479114, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 0.25964355, + "step": 234, + "time_per_iteration": 2.8988683223724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142226, + "balance_loss_mlp": 1.11722803, + "epoch": 0.04520969603693728, + "flos": 511790607360.0, + "grad_norm": 0.06003753756121682, + "language_loss": 1.01686716, + "learning_rate": 0.0009993943795076528, + "loss": 1.02828944, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 0.25012207, + "step": 235, + "time_per_iteration": 2.6333067417144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132836, + "balance_loss_mlp": 1.10618043, + "epoch": 0.04540207772220085, + "flos": 365058399744.0, + "grad_norm": 0.08170413586498586, + "language_loss": 1.0374043, + "learning_rate": 0.0009993789534991427, + "loss": 1.04873264, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 0.26708984, + "step": 236, + "time_per_iteration": 2.4350106716156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112254, + "balance_loss_mlp": 1.0960753, + "epoch": 0.045594459407464406, + "flos": 522669583872.0, + "grad_norm": 0.0440176634981383, + "language_loss": 0.99063611, + "learning_rate": 0.0009993633336145287, + "loss": 1.00186157, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 0.26513672, + "step": 237, + "time_per_iteration": 2.6414294242858887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134799, + "balance_loss_mlp": 1.10904956, + "epoch": 0.04578684109272797, + "flos": 671776104960.0, + "grad_norm": 0.04213473561248219, + "language_loss": 1.02718055, + "learning_rate": 0.0009993475198598752, + "loss": 1.03852856, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 0.25756836, + "step": 238, + "time_per_iteration": 2.9781904220581055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152995, + "balance_loss_mlp": 1.12614954, + "epoch": 0.04597922277799153, + "flos": 541633277952.0, + "grad_norm": 0.08613106589232603, + "language_loss": 1.00055635, + "learning_rate": 0.0009993315122413212, + "loss": 1.01208627, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 0.26879883, + "step": 239, + "time_per_iteration": 2.6395275592803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162384, + "balance_loss_mlp": 1.13594294, + "epoch": 0.0461716044632551, + "flos": 458984102400.0, + "grad_norm": 0.06839694959482054, + "language_loss": 0.99973977, + "learning_rate": 0.0009993153107650818, + "loss": 1.01136363, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 0.2644043, + "step": 240, + "time_per_iteration": 2.563133716583252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160824, + "balance_loss_mlp": 1.13391829, + "epoch": 0.04636398614851866, + "flos": 455240922624.0, + "grad_norm": 0.06471449859153773, + "language_loss": 0.98970807, + "learning_rate": 0.0009992989154374468, + "loss": 1.00131631, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 0.26928711, + "step": 241, + "time_per_iteration": 2.5339503288269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145463, + "balance_loss_mlp": 1.11914206, + "epoch": 0.046556367833782225, + "flos": 556826254848.0, + "grad_norm": 0.06957696695924716, + "language_loss": 1.05868769, + "learning_rate": 0.0009992823262647817, + "loss": 1.07014227, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 0.26342773, + "step": 242, + "time_per_iteration": 2.6841883659362793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111302, + "balance_loss_mlp": 1.08692503, + "epoch": 0.046748749519045785, + "flos": 592917470208.0, + "grad_norm": 0.0649477492764712, + "language_loss": 0.99848783, + "learning_rate": 0.0009992655432535264, + "loss": 1.00961804, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 0.2611084, + "step": 243, + "time_per_iteration": 2.7613234519958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107198, + "balance_loss_mlp": 1.08162785, + "epoch": 0.04694113120430935, + "flos": 569864802816.0, + "grad_norm": 0.05612685480258275, + "language_loss": 1.00329947, + "learning_rate": 0.0009992485664101973, + "loss": 1.01437151, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 0.25598145, + "step": 244, + "time_per_iteration": 2.717280387878418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122363, + "balance_loss_mlp": 1.09556472, + "epoch": 0.04713351288957291, + "flos": 863768987136.0, + "grad_norm": 0.10316769075352135, + "language_loss": 1.02662849, + "learning_rate": 0.000999231395741385, + "loss": 1.03785205, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 0.26831055, + "step": 245, + "time_per_iteration": 3.095249891281128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144012, + "balance_loss_mlp": 1.11837006, + "epoch": 0.04732589457483648, + "flos": 537215390208.0, + "grad_norm": 0.09647975042234339, + "language_loss": 1.01015186, + "learning_rate": 0.0009992140312537557, + "loss": 1.02159202, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 0.25671387, + "step": 246, + "time_per_iteration": 2.633258819580078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123665, + "balance_loss_mlp": 1.09845233, + "epoch": 0.04751827626010004, + "flos": 761906870784.0, + "grad_norm": 0.09798218580430706, + "language_loss": 0.95550418, + "learning_rate": 0.000999196472954051, + "loss": 0.96674085, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 0.25231934, + "step": 247, + "time_per_iteration": 3.024939775466919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02466762, + "balance_loss_mlp": 2.43700695, + "epoch": 0.0477106579453636, + "flos": 1579791859200.0, + "grad_norm": 0.2831653982047738, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.81891614, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 0.296875, + "step": 248, + "time_per_iteration": 5.486468076705933 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162703, + "balance_loss_mlp": 1.13626289, + "epoch": 0.04790303963062716, + "flos": 457766111232.0, + "grad_norm": 0.12969478117477343, + "language_loss": 1.03178453, + "learning_rate": 0.0009991607749457578, + "loss": 1.04341149, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 0.26464844, + "step": 249, + "time_per_iteration": 2.5253713130950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119774, + "balance_loss_mlp": 1.16941571, + "epoch": 0.04809542131589073, + "flos": 782419668480.0, + "grad_norm": 0.09425507858465235, + "language_loss": 1.01008546, + "learning_rate": 0.0009991426352510286, + "loss": 1.0220629, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 0.28295898, + "step": 250, + "time_per_iteration": 3.0042202472686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204128, + "balance_loss_mlp": 1.174016, + "epoch": 0.04828780300115429, + "flos": 559260039168.0, + "grad_norm": 0.07677732337183582, + "language_loss": 1.0282234, + "learning_rate": 0.0009991243017719422, + "loss": 1.04026473, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 0.30126953, + "step": 251, + "time_per_iteration": 2.709934711456299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206766, + "balance_loss_mlp": 1.17522311, + "epoch": 0.048480184686417856, + "flos": 501929561088.0, + "grad_norm": 0.1103729500964747, + "language_loss": 0.97436613, + "learning_rate": 0.0009991057745156165, + "loss": 0.9864338, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 0.31518555, + "step": 252, + "time_per_iteration": 2.5961716175079346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03348202, + "balance_loss_mlp": 3.30471396, + "epoch": 0.048672566371681415, + "flos": 1536360016896.0, + "grad_norm": 0.3811060337507454, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.85259187, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 0.43554688, + "step": 253, + "time_per_iteration": 5.0377867221832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195312, + "balance_loss_mlp": 1.1623621, + "epoch": 0.04886494805694498, + "flos": 537922031616.0, + "grad_norm": 0.07473951959737497, + "language_loss": 1.05491519, + "learning_rate": 0.0009990681387000943, + "loss": 1.06686831, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 0.3293457, + "step": 254, + "time_per_iteration": 2.7937283515930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121698, + "balance_loss_mlp": 1.18345821, + "epoch": 0.04905732974220854, + "flos": 680169540096.0, + "grad_norm": 0.06898181212790383, + "language_loss": 1.01063621, + "learning_rate": 0.0009990490301555093, + "loss": 1.02280605, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 0.33544922, + "step": 255, + "time_per_iteration": 2.9615726470947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.05252755, + "balance_loss_mlp": 5.12458086, + "epoch": 0.04924971142747211, + "flos": 1421179997184.0, + "grad_norm": 0.5609302024280507, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.84467912, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 1.28125, + "step": 256, + "time_per_iteration": 4.8413920402526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03162439, + "balance_loss_mlp": 3.09758925, + "epoch": 0.04944209311273567, + "flos": 1558006742016.0, + "grad_norm": 0.1723793408951341, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.8240518, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 0.6484375, + "step": 257, + "time_per_iteration": 4.985513687133789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03630928, + "balance_loss_mlp": 3.55844903, + "epoch": 0.04963447479799923, + "flos": 1570820262912.0, + "grad_norm": 0.4079591987734508, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.73606813, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 0.7265625, + "step": 258, + "time_per_iteration": 4.858096361160278 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01403117, + "balance_loss_mlp": 1.35569584, + "epoch": 0.049826856483262794, + "flos": 625349569536.0, + "grad_norm": 0.11330256318865821, + "language_loss": 0.95339322, + "learning_rate": 0.0009989706585723202, + "loss": 0.96742439, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 0.47436523, + "step": 259, + "time_per_iteration": 2.794419765472412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01437412, + "balance_loss_mlp": 1.38651013, + "epoch": 0.05001923816852635, + "flos": 504160713216.0, + "grad_norm": 0.10381773722922016, + "language_loss": 1.0219605, + "learning_rate": 0.0009989505813633442, + "loss": 1.03633475, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 0.50927734, + "step": 260, + "time_per_iteration": 2.6660099029541016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145174, + "balance_loss_mlp": 1.39776254, + "epoch": 0.05021161985378992, + "flos": 587345831424.0, + "grad_norm": 0.12909552841436595, + "language_loss": 1.02080631, + "learning_rate": 0.000998930310444573, + "loss": 1.03532374, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 0.5402832, + "step": 261, + "time_per_iteration": 2.7547266483306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01429363, + "balance_loss_mlp": 1.37698281, + "epoch": 0.05040400153905348, + "flos": 633303235584.0, + "grad_norm": 0.08616818959721087, + "language_loss": 0.99936116, + "learning_rate": 0.0009989098458238765, + "loss": 1.01365471, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 0.52441406, + "step": 262, + "time_per_iteration": 2.804656982421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01431577, + "balance_loss_mlp": 1.38310647, + "epoch": 0.050596383224317046, + "flos": 553636270080.0, + "grad_norm": 0.10103635045761167, + "language_loss": 0.99213421, + "learning_rate": 0.0009988891875091998, + "loss": 1.00644994, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 0.48486328, + "step": 263, + "time_per_iteration": 2.780696392059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01359367, + "balance_loss_mlp": 1.31771505, + "epoch": 0.050788764909580605, + "flos": 549663293952.0, + "grad_norm": 0.09437475228894394, + "language_loss": 0.93793595, + "learning_rate": 0.0009988683355085636, + "loss": 0.95152962, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 0.41625977, + "step": 264, + "time_per_iteration": 2.758275032043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01314446, + "balance_loss_mlp": 1.27684712, + "epoch": 0.05098114659484417, + "flos": 605118325248.0, + "grad_norm": 0.09784246378207673, + "language_loss": 1.02612829, + "learning_rate": 0.000998847289830063, + "loss": 1.03927279, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 0.37524414, + "step": 265, + "time_per_iteration": 2.8752288818359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01289086, + "balance_loss_mlp": 1.25468266, + "epoch": 0.05117352828010773, + "flos": 438548027904.0, + "grad_norm": 0.06973466471853282, + "language_loss": 0.95293748, + "learning_rate": 0.0009988260504818682, + "loss": 0.9658283, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 0.34423828, + "step": 266, + "time_per_iteration": 2.5960564613342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01290407, + "balance_loss_mlp": 1.2563374, + "epoch": 0.0513659099653713, + "flos": 505032910848.0, + "grad_norm": 0.0971565340820806, + "language_loss": 1.02148294, + "learning_rate": 0.000998804617472226, + "loss": 1.03438699, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 0.34082031, + "step": 267, + "time_per_iteration": 2.658709764480591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275377, + "balance_loss_mlp": 1.24085402, + "epoch": 0.05155829165063486, + "flos": 695488799232.0, + "grad_norm": 0.10761719469623075, + "language_loss": 0.96939588, + "learning_rate": 0.0009987829908094568, + "loss": 0.98214972, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 0.34545898, + "step": 268, + "time_per_iteration": 2.8270740509033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01271333, + "balance_loss_mlp": 1.23785877, + "epoch": 0.051750673335898424, + "flos": 1348260111360.0, + "grad_norm": 0.1226169977774822, + "language_loss": 1.04002702, + "learning_rate": 0.0009987611705019569, + "loss": 1.05274034, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 0.33496094, + "step": 269, + "time_per_iteration": 4.483954429626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01277218, + "balance_loss_mlp": 1.24267149, + "epoch": 0.051943055021161984, + "flos": 489607566336.0, + "grad_norm": 0.07374197309260985, + "language_loss": 1.02401245, + "learning_rate": 0.0009987391565581978, + "loss": 1.03678453, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 0.34594727, + "step": 270, + "time_per_iteration": 2.627356767654419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01304636, + "balance_loss_mlp": 1.26977956, + "epoch": 0.05213543670642555, + "flos": 545779150848.0, + "grad_norm": 0.06923057034816653, + "language_loss": 0.94496262, + "learning_rate": 0.000998716948986726, + "loss": 0.95800889, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 0.34887695, + "step": 271, + "time_per_iteration": 2.804185628890991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01322736, + "balance_loss_mlp": 1.28718746, + "epoch": 0.05232781839168911, + "flos": 603561881088.0, + "grad_norm": 0.1173780328671846, + "language_loss": 0.97372609, + "learning_rate": 0.0009986945477961633, + "loss": 0.9869535, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 0.35571289, + "step": 272, + "time_per_iteration": 2.739595890045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01297409, + "balance_loss_mlp": 1.2620039, + "epoch": 0.052520200076952676, + "flos": 538504962048.0, + "grad_norm": 0.07261359465506025, + "language_loss": 1.02136993, + "learning_rate": 0.0009986719529952066, + "loss": 1.03434396, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 0.35424805, + "step": 273, + "time_per_iteration": 2.8717877864837646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239184, + "balance_loss_mlp": 1.20389819, + "epoch": 0.052712581762216236, + "flos": 463384737792.0, + "grad_norm": 0.13624684616705834, + "language_loss": 1.01736569, + "learning_rate": 0.000998649164592628, + "loss": 1.0297575, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 0.35327148, + "step": 274, + "time_per_iteration": 2.590993642807007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206885, + "balance_loss_mlp": 1.16945291, + "epoch": 0.0529049634474798, + "flos": 548020214784.0, + "grad_norm": 0.061304815826305474, + "language_loss": 0.99439085, + "learning_rate": 0.0009986261825972748, + "loss": 1.00645971, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 0.37426758, + "step": 275, + "time_per_iteration": 2.702202081680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183431, + "balance_loss_mlp": 1.14466429, + "epoch": 0.05309734513274336, + "flos": 618021052416.0, + "grad_norm": 0.10486338408500256, + "language_loss": 1.01433325, + "learning_rate": 0.000998603007018069, + "loss": 1.02616751, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 0.38745117, + "step": 276, + "time_per_iteration": 2.876267671585083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190326, + "balance_loss_mlp": 1.15055728, + "epoch": 0.05328972681800693, + "flos": 605498996736.0, + "grad_norm": 0.08719890934761923, + "language_loss": 0.99445826, + "learning_rate": 0.0009985796378640089, + "loss": 1.00636148, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 0.39746094, + "step": 277, + "time_per_iteration": 2.74886155128479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165278, + "balance_loss_mlp": 1.12720275, + "epoch": 0.05348210850327049, + "flos": 604503088128.0, + "grad_norm": 0.06292174667602014, + "language_loss": 0.99806106, + "learning_rate": 0.0009985560751441665, + "loss": 1.00971389, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 0.38061523, + "step": 278, + "time_per_iteration": 2.8894753456115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175743, + "balance_loss_mlp": 1.13790607, + "epoch": 0.053674490188534055, + "flos": 630782816256.0, + "grad_norm": 0.06329003141341145, + "language_loss": 1.01538157, + "learning_rate": 0.00099853231886769, + "loss": 1.02713895, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 0.37792969, + "step": 279, + "time_per_iteration": 2.783085823059082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183406, + "balance_loss_mlp": 1.14633179, + "epoch": 0.053866871873797614, + "flos": 479185611264.0, + "grad_norm": 0.06545769746199957, + "language_loss": 1.01316965, + "learning_rate": 0.0009985083690438024, + "loss": 1.02500367, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 0.37084961, + "step": 280, + "time_per_iteration": 2.707329511642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147788, + "balance_loss_mlp": 1.11245418, + "epoch": 0.054059253559061174, + "flos": 788035723776.0, + "grad_norm": 0.05305898567294309, + "language_loss": 0.9175781, + "learning_rate": 0.0009984842256818016, + "loss": 0.92905599, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 0.35400391, + "step": 281, + "time_per_iteration": 3.1014201641082764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165943, + "balance_loss_mlp": 1.13106215, + "epoch": 0.05425163524432474, + "flos": 628361515008.0, + "grad_norm": 0.05782684737590577, + "language_loss": 1.02446878, + "learning_rate": 0.0009984598887910613, + "loss": 1.03612816, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 0.34912109, + "step": 282, + "time_per_iteration": 2.75343656539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180579, + "balance_loss_mlp": 1.14555514, + "epoch": 0.0544440169295883, + "flos": 615760164864.0, + "grad_norm": 0.0631633618899466, + "language_loss": 0.98333299, + "learning_rate": 0.0009984353583810297, + "loss": 0.99513876, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 0.3503418, + "step": 283, + "time_per_iteration": 2.8092565536499023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186173, + "balance_loss_mlp": 1.15350997, + "epoch": 0.05463639861485187, + "flos": 647762406912.0, + "grad_norm": 0.0821933313576245, + "language_loss": 1.00416183, + "learning_rate": 0.0009984106344612302, + "loss": 1.01602352, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 0.32666016, + "step": 284, + "time_per_iteration": 2.7632908821105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163515, + "balance_loss_mlp": 1.1310904, + "epoch": 0.054828780300115426, + "flos": 797192699904.0, + "grad_norm": 0.06349155766627652, + "language_loss": 0.95740765, + "learning_rate": 0.0009983857170412615, + "loss": 0.96904278, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 0.32421875, + "step": 285, + "time_per_iteration": 2.9946134090423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130334, + "balance_loss_mlp": 1.09912539, + "epoch": 0.05502116198537899, + "flos": 549690458112.0, + "grad_norm": 0.0487694941790178, + "language_loss": 0.95326382, + "learning_rate": 0.000998360606130798, + "loss": 0.96456718, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 0.31176758, + "step": 286, + "time_per_iteration": 2.8205370903015137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.09512836, + "balance_loss_mlp": 7.26674223, + "epoch": 0.05521354367064255, + "flos": 1407753437184.0, + "grad_norm": 0.42812971022266805, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.78585953, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 22.5, + "step": 287, + "time_per_iteration": 4.986966848373413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173658, + "balance_loss_mlp": 1.14278328, + "epoch": 0.05540592535590612, + "flos": 645420026880.0, + "grad_norm": 0.08917023960137904, + "language_loss": 1.01027536, + "learning_rate": 0.0009983098038774552, + "loss": 1.02201188, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 0.30834961, + "step": 288, + "time_per_iteration": 2.8100168704986572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.06110836, + "balance_loss_mlp": 5.25634384, + "epoch": 0.05559830704116968, + "flos": 1511095647744.0, + "grad_norm": 0.4031517895181362, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.84281063, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 8.5625, + "step": 289, + "time_per_iteration": 4.790200233459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126699, + "balance_loss_mlp": 1.23435044, + "epoch": 0.055790688726433245, + "flos": 508328980992.0, + "grad_norm": 0.18275347501036113, + "language_loss": 0.9955281, + "learning_rate": 0.0009982582277800948, + "loss": 1.00819802, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 0.32641602, + "step": 290, + "time_per_iteration": 2.5976333618164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281728, + "balance_loss_mlp": 1.24694288, + "epoch": 0.055983070411696804, + "flos": 657870501888.0, + "grad_norm": 0.14603269886404707, + "language_loss": 1.06751418, + "learning_rate": 0.0009982321495648908, + "loss": 1.08033144, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 0.34838867, + "step": 291, + "time_per_iteration": 2.8513312339782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250537, + "balance_loss_mlp": 1.21348643, + "epoch": 0.05617545209696037, + "flos": 587335919616.0, + "grad_norm": 0.09283742859778188, + "language_loss": 0.97403693, + "learning_rate": 0.0009982058779188115, + "loss": 0.98654234, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 0.37011719, + "step": 292, + "time_per_iteration": 2.728203773498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230786, + "balance_loss_mlp": 1.19170928, + "epoch": 0.05636783378222393, + "flos": 611621632512.0, + "grad_norm": 0.08826519450204054, + "language_loss": 1.05705655, + "learning_rate": 0.0009981794128520567, + "loss": 1.06936455, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 0.39038086, + "step": 293, + "time_per_iteration": 2.79616379737854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01253904, + "balance_loss_mlp": 1.21258569, + "epoch": 0.0565602154674875, + "flos": 668161405440.0, + "grad_norm": 0.08065602932127632, + "language_loss": 1.01724029, + "learning_rate": 0.000998152754374901, + "loss": 1.02977943, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 0.41333008, + "step": 294, + "time_per_iteration": 2.9352946281433105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232141, + "balance_loss_mlp": 1.19132411, + "epoch": 0.05675259715275106, + "flos": 617242830336.0, + "grad_norm": 0.07309017642696977, + "language_loss": 0.9826439, + "learning_rate": 0.0009981259024976943, + "loss": 0.99496531, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 0.40820312, + "step": 295, + "time_per_iteration": 2.7376105785369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244019, + "balance_loss_mlp": 1.20112753, + "epoch": 0.05694497883801462, + "flos": 751769040384.0, + "grad_norm": 0.07769478500482971, + "language_loss": 0.96765345, + "learning_rate": 0.0009980988572308612, + "loss": 0.9800936, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 0.42871094, + "step": 296, + "time_per_iteration": 3.001779556274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226011, + "balance_loss_mlp": 1.18197489, + "epoch": 0.05713736052327818, + "flos": 712010995200.0, + "grad_norm": 0.0588150430335769, + "language_loss": 0.99343681, + "learning_rate": 0.0009980716185849015, + "loss": 1.00569689, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 0.44067383, + "step": 297, + "time_per_iteration": 2.9817121028900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223805, + "balance_loss_mlp": 1.18153381, + "epoch": 0.05732974220854175, + "flos": 468976200192.0, + "grad_norm": 0.06400414638033543, + "language_loss": 0.95616293, + "learning_rate": 0.0009980441865703904, + "loss": 0.96840101, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 0.4230957, + "step": 298, + "time_per_iteration": 2.615875244140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122669, + "balance_loss_mlp": 1.18513405, + "epoch": 0.05752212389380531, + "flos": 601422133248.0, + "grad_norm": 0.09089975305964836, + "language_loss": 1.03662193, + "learning_rate": 0.000998016561197978, + "loss": 1.04888892, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 0.41577148, + "step": 299, + "time_per_iteration": 2.765833854675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219698, + "balance_loss_mlp": 1.17835617, + "epoch": 0.057714505579068875, + "flos": 678664852992.0, + "grad_norm": 0.05662219614280908, + "language_loss": 0.94978034, + "learning_rate": 0.0009979887424783895, + "loss": 0.96197736, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 0.41357422, + "step": 300, + "time_per_iteration": 2.8931760787963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122099, + "balance_loss_mlp": 1.17850339, + "epoch": 0.057906887264332435, + "flos": 595884999168.0, + "grad_norm": 0.05388706690809858, + "language_loss": 0.94851983, + "learning_rate": 0.0009979607304224248, + "loss": 0.96072972, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 0.42504883, + "step": 301, + "time_per_iteration": 2.719282388687134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213648, + "balance_loss_mlp": 1.16951644, + "epoch": 0.058099268949596, + "flos": 552116901888.0, + "grad_norm": 0.0564182452216587, + "language_loss": 1.02312028, + "learning_rate": 0.000997932525040959, + "loss": 1.03525686, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 0.44140625, + "step": 302, + "time_per_iteration": 2.7084572315216064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185834, + "balance_loss_mlp": 1.14165473, + "epoch": 0.05829165063485956, + "flos": 508170765312.0, + "grad_norm": 0.07525794393376325, + "language_loss": 1.04335976, + "learning_rate": 0.000997904126344943, + "loss": 1.05521822, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 0.44165039, + "step": 303, + "time_per_iteration": 2.6271631717681885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121438, + "balance_loss_mlp": 1.17055893, + "epoch": 0.05848403232012313, + "flos": 615231562752.0, + "grad_norm": 0.0664075129682053, + "language_loss": 1.00263453, + "learning_rate": 0.0009978755343454018, + "loss": 1.01477838, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 0.43823242, + "step": 304, + "time_per_iteration": 2.791146993637085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182664, + "balance_loss_mlp": 1.13869941, + "epoch": 0.05867641400538669, + "flos": 500083849728.0, + "grad_norm": 0.07350056034493838, + "language_loss": 1.01461756, + "learning_rate": 0.0009978467490534355, + "loss": 1.0264442, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 0.43969727, + "step": 305, + "time_per_iteration": 2.614455461502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186922, + "balance_loss_mlp": 1.14424467, + "epoch": 0.05886879569065025, + "flos": 531290244096.0, + "grad_norm": 0.056638515612222363, + "language_loss": 0.97774673, + "learning_rate": 0.00099781777048022, + "loss": 0.98961592, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 0.42700195, + "step": 306, + "time_per_iteration": 2.717700481414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011718, + "balance_loss_mlp": 1.12855101, + "epoch": 0.05906117737591381, + "flos": 489056569344.0, + "grad_norm": 0.056560878082468485, + "language_loss": 0.99827361, + "learning_rate": 0.0009977885986370057, + "loss": 1.00999165, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 0.43310547, + "step": 307, + "time_per_iteration": 2.557203531265259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164879, + "balance_loss_mlp": 1.12263095, + "epoch": 0.05925355906117737, + "flos": 591511527936.0, + "grad_norm": 0.05991229640473007, + "language_loss": 0.9525907, + "learning_rate": 0.000997759233535118, + "loss": 0.9642396, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 0.42285156, + "step": 308, + "time_per_iteration": 2.8033511638641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174986, + "balance_loss_mlp": 1.1345737, + "epoch": 0.05944594074644094, + "flos": 563655532032.0, + "grad_norm": 0.06710738832596337, + "language_loss": 1.01122141, + "learning_rate": 0.0009977296751859576, + "loss": 1.02297115, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 0.40405273, + "step": 309, + "time_per_iteration": 2.8259334564208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164837, + "balance_loss_mlp": 1.12487829, + "epoch": 0.0596383224317045, + "flos": 538747241472.0, + "grad_norm": 0.05223481097130428, + "language_loss": 1.03482628, + "learning_rate": 0.0009976999236009998, + "loss": 1.0464747, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 0.39941406, + "step": 310, + "time_per_iteration": 2.769092321395874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164403, + "balance_loss_mlp": 1.1263994, + "epoch": 0.059830704116968066, + "flos": 560957446656.0, + "grad_norm": 0.05685909644716586, + "language_loss": 1.04877043, + "learning_rate": 0.0009976699787917955, + "loss": 1.06041443, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 0.37963867, + "step": 311, + "time_per_iteration": 2.6526851654052734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08775091, + "balance_loss_mlp": 7.79852915, + "epoch": 0.060023085802231625, + "flos": 1570615059456.0, + "grad_norm": 0.2725707199289832, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.82218087, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 9.75, + "step": 312, + "time_per_iteration": 5.006884813308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161181, + "balance_loss_mlp": 1.12172294, + "epoch": 0.06021546748749519, + "flos": 482657149440.0, + "grad_norm": 0.06726838636277511, + "language_loss": 0.96427834, + "learning_rate": 0.0009976095095472243, + "loss": 0.97589004, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 0.39428711, + "step": 313, + "time_per_iteration": 2.5785915851593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166252, + "balance_loss_mlp": 1.12738967, + "epoch": 0.06040784917275875, + "flos": 620195304960.0, + "grad_norm": 0.0761643630364548, + "language_loss": 0.97957367, + "learning_rate": 0.0009975789851353334, + "loss": 0.99123621, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 0.38818359, + "step": 314, + "time_per_iteration": 2.814901828765869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173499, + "balance_loss_mlp": 1.13191843, + "epoch": 0.06060023085802232, + "flos": 483553939968.0, + "grad_norm": 0.07475166161853689, + "language_loss": 1.00319684, + "learning_rate": 0.0009975482675461487, + "loss": 1.0149318, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 0.41601562, + "step": 315, + "time_per_iteration": 2.65468692779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159286, + "balance_loss_mlp": 1.11591756, + "epoch": 0.06079261254328588, + "flos": 581892761088.0, + "grad_norm": 0.08252555003670439, + "language_loss": 0.98425788, + "learning_rate": 0.0009975173567915952, + "loss": 0.99585068, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 0.43383789, + "step": 316, + "time_per_iteration": 2.6916940212249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173408, + "balance_loss_mlp": 1.12767935, + "epoch": 0.060984994228549444, + "flos": 687794664960.0, + "grad_norm": 0.0640207679679256, + "language_loss": 0.91960573, + "learning_rate": 0.000997486252883674, + "loss": 0.93133986, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 0.45727539, + "step": 317, + "time_per_iteration": 2.8535635471343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188261, + "balance_loss_mlp": 1.13979006, + "epoch": 0.061177375913813004, + "flos": 1314775577088.0, + "grad_norm": 0.0671416603225842, + "language_loss": 0.97457695, + "learning_rate": 0.0009974549558344602, + "loss": 0.98645949, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 0.484375, + "step": 318, + "time_per_iteration": 3.6911113262176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189393, + "balance_loss_mlp": 1.14037383, + "epoch": 0.06136975759907657, + "flos": 574337018880.0, + "grad_norm": 0.09268216800999254, + "language_loss": 1.06808639, + "learning_rate": 0.000997423465656105, + "loss": 1.07998025, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 0.49023438, + "step": 319, + "time_per_iteration": 2.727130651473999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147465, + "balance_loss_mlp": 1.096205, + "epoch": 0.06156213928434013, + "flos": 527537152512.0, + "grad_norm": 0.06029287427116143, + "language_loss": 1.04509127, + "learning_rate": 0.0009973917823608335, + "loss": 1.05656588, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 0.51318359, + "step": 320, + "time_per_iteration": 2.654794454574585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148103, + "balance_loss_mlp": 1.09605646, + "epoch": 0.061754520969603696, + "flos": 495507746304.0, + "grad_norm": 0.03213952729051003, + "language_loss": 0.98612553, + "learning_rate": 0.0009973599059609462, + "loss": 0.99760658, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 0.52075195, + "step": 321, + "time_per_iteration": 2.7024786472320557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142475, + "balance_loss_mlp": 1.09133446, + "epoch": 0.061946902654867256, + "flos": 440079879168.0, + "grad_norm": 0.04984356389382333, + "language_loss": 0.97161096, + "learning_rate": 0.000997327836468819, + "loss": 0.9830358, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 0.51147461, + "step": 322, + "time_per_iteration": 2.6242218017578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142913, + "balance_loss_mlp": 1.0917964, + "epoch": 0.06213928434013082, + "flos": 598800397824.0, + "grad_norm": 0.06671524152363617, + "language_loss": 0.99795449, + "learning_rate": 0.000997295573896902, + "loss": 1.00938356, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 0.51171875, + "step": 323, + "time_per_iteration": 2.834237813949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03299168, + "balance_loss_mlp": 3.12445545, + "epoch": 0.06233166602539438, + "flos": 1450135789056.0, + "grad_norm": 0.43556355854402456, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.84495211, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 1.75, + "step": 324, + "time_per_iteration": 4.770992040634155 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02151431, + "balance_loss_mlp": 1.9545927, + "epoch": 0.06252404771065795, + "flos": 1463327036928.0, + "grad_norm": 0.14082611715048204, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.80723369, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 1.9609375, + "step": 325, + "time_per_iteration": 4.8816118240356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196226, + "balance_loss_mlp": 1.14768362, + "epoch": 0.06271642939592151, + "flos": 464294011392.0, + "grad_norm": 0.08367806581965369, + "language_loss": 0.93651855, + "learning_rate": 0.000997197627828043, + "loss": 0.94848073, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 0.4855957, + "step": 326, + "time_per_iteration": 2.5508148670196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215208, + "balance_loss_mlp": 1.16862106, + "epoch": 0.06290881108118507, + "flos": 532374985728.0, + "grad_norm": 0.06635735350324974, + "language_loss": 0.89348811, + "learning_rate": 0.0009971645930629716, + "loss": 0.90564024, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 0.46533203, + "step": 327, + "time_per_iteration": 2.711386203765869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125047, + "balance_loss_mlp": 1.20192814, + "epoch": 0.06310119276644863, + "flos": 673562718720.0, + "grad_norm": 0.08863859510008423, + "language_loss": 1.03147936, + "learning_rate": 0.0009971313652814872, + "loss": 1.04398406, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 0.48486328, + "step": 328, + "time_per_iteration": 2.8484854698181152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225004, + "balance_loss_mlp": 1.17553234, + "epoch": 0.0632935744517122, + "flos": 770732734464.0, + "grad_norm": 0.08503417282278386, + "language_loss": 1.0059731, + "learning_rate": 0.0009970979444964903, + "loss": 1.01822317, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 0.49487305, + "step": 329, + "time_per_iteration": 2.957482099533081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197604, + "balance_loss_mlp": 1.14846587, + "epoch": 0.06348595613697576, + "flos": 561913708032.0, + "grad_norm": 0.06790724972181753, + "language_loss": 1.01849604, + "learning_rate": 0.0009970643307209556, + "loss": 1.03047216, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 0.49121094, + "step": 330, + "time_per_iteration": 2.8220374584198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170349, + "balance_loss_mlp": 1.1215446, + "epoch": 0.06367833782223932, + "flos": 676189223424.0, + "grad_norm": 0.06721894230078661, + "language_loss": 0.98097444, + "learning_rate": 0.0009970305239679334, + "loss": 0.99267793, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 0.48803711, + "step": 331, + "time_per_iteration": 2.8813369274139404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176679, + "balance_loss_mlp": 1.12754059, + "epoch": 0.06387071950750288, + "flos": 495297773568.0, + "grad_norm": 0.056286161373139375, + "language_loss": 1.03013992, + "learning_rate": 0.0009969965242505483, + "loss": 1.04190671, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 0.4909668, + "step": 332, + "time_per_iteration": 2.6662604808807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168774, + "balance_loss_mlp": 1.11932611, + "epoch": 0.06406310119276645, + "flos": 533447244288.0, + "grad_norm": 0.06031850484613652, + "language_loss": 0.99096131, + "learning_rate": 0.0009969623315820007, + "loss": 1.00264907, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 0.49487305, + "step": 333, + "time_per_iteration": 2.6671581268310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153261, + "balance_loss_mlp": 1.10619712, + "epoch": 0.06425548287803001, + "flos": 456184700928.0, + "grad_norm": 0.06229524640691676, + "language_loss": 0.99215055, + "learning_rate": 0.000996927945975565, + "loss": 1.00368309, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 0.47070312, + "step": 334, + "time_per_iteration": 2.568838357925415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115937, + "balance_loss_mlp": 1.1125921, + "epoch": 0.06444786456329357, + "flos": 560077908480.0, + "grad_norm": 0.05620099657237302, + "language_loss": 0.95852566, + "learning_rate": 0.0009968933674445906, + "loss": 0.97011936, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 0.46728516, + "step": 335, + "time_per_iteration": 2.6725666522979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160514, + "balance_loss_mlp": 1.1122818, + "epoch": 0.06464024624855713, + "flos": 666085897728.0, + "grad_norm": 0.05589062806096766, + "language_loss": 0.97974062, + "learning_rate": 0.0009968585960025028, + "loss": 0.99134576, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 0.48217773, + "step": 336, + "time_per_iteration": 2.945194959640503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0396516, + "balance_loss_mlp": 3.85834861, + "epoch": 0.0648326279338207, + "flos": 1521371870208.0, + "grad_norm": 0.42886267506062575, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.81618351, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 1.0703125, + "step": 337, + "time_per_iteration": 4.802944183349609 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215082, + "balance_loss_mlp": 1.16968668, + "epoch": 0.06502500961908426, + "flos": 1143339909120.0, + "grad_norm": 0.09324534870618859, + "language_loss": 0.96021777, + "learning_rate": 0.0009967884744390583, + "loss": 0.9723686, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 0.45361328, + "step": 338, + "time_per_iteration": 3.5247950553894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251582, + "balance_loss_mlp": 1.2060678, + "epoch": 0.06521739130434782, + "flos": 582609314304.0, + "grad_norm": 0.09123718626917265, + "language_loss": 0.97373873, + "learning_rate": 0.0009967531243449256, + "loss": 0.98625457, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 0.45507812, + "step": 339, + "time_per_iteration": 2.681973695755005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211309, + "balance_loss_mlp": 1.163077, + "epoch": 0.06540977298961138, + "flos": 497650065408.0, + "grad_norm": 0.06030156589334856, + "language_loss": 1.04525125, + "learning_rate": 0.000996717581394126, + "loss": 1.05736434, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 0.48242188, + "step": 340, + "time_per_iteration": 2.6031126976013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205107, + "balance_loss_mlp": 1.15630233, + "epoch": 0.06560215467487496, + "flos": 542871092736.0, + "grad_norm": 0.06934362388274598, + "language_loss": 1.05133414, + "learning_rate": 0.000996681845600459, + "loss": 1.06338525, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 0.48803711, + "step": 341, + "time_per_iteration": 2.6689491271972656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190009, + "balance_loss_mlp": 1.1402986, + "epoch": 0.06579453636013852, + "flos": 413454357504.0, + "grad_norm": 0.07929020766121274, + "language_loss": 0.97276402, + "learning_rate": 0.0009966459169777982, + "loss": 0.98466408, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 0.49731445, + "step": 342, + "time_per_iteration": 2.5235347747802734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183772, + "balance_loss_mlp": 1.13444376, + "epoch": 0.06598691804540208, + "flos": 560618993664.0, + "grad_norm": 0.06503113555429127, + "language_loss": 1.05431008, + "learning_rate": 0.0009966097955400924, + "loss": 1.0661478, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 0.4934082, + "step": 343, + "time_per_iteration": 2.6987814903259277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195626, + "balance_loss_mlp": 1.14772749, + "epoch": 0.06617929973066564, + "flos": 572090812416.0, + "grad_norm": 0.05810753199069879, + "language_loss": 0.99792945, + "learning_rate": 0.0009965734813013652, + "loss": 1.00988579, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 0.47924805, + "step": 344, + "time_per_iteration": 2.8092823028564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211149, + "balance_loss_mlp": 1.16191518, + "epoch": 0.06637168141592921, + "flos": 490479763968.0, + "grad_norm": 0.08606224500635251, + "language_loss": 1.02011895, + "learning_rate": 0.0009965369742757151, + "loss": 1.03223062, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 0.49243164, + "step": 345, + "time_per_iteration": 2.5981764793395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193116, + "balance_loss_mlp": 1.14435959, + "epoch": 0.06656406310119277, + "flos": 1079194834944.0, + "grad_norm": 0.0619511290056959, + "language_loss": 0.98293203, + "learning_rate": 0.0009965002744773152, + "loss": 0.99486327, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 0.48730469, + "step": 346, + "time_per_iteration": 3.4968950748443604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178364, + "balance_loss_mlp": 1.13115668, + "epoch": 0.06675644478645633, + "flos": 513680735232.0, + "grad_norm": 0.04856723246232052, + "language_loss": 0.95658922, + "learning_rate": 0.0009964633819204139, + "loss": 0.96837282, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 0.47167969, + "step": 347, + "time_per_iteration": 2.6705336570739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04576048, + "balance_loss_mlp": 4.3029151, + "epoch": 0.06694882647171989, + "flos": 1447192479744.0, + "grad_norm": 0.32603271390487504, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.86377156, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 2.734375, + "step": 348, + "time_per_iteration": 4.961863994598389 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03789769, + "balance_loss_mlp": 3.60590124, + "epoch": 0.06714120815698346, + "flos": 1552061772288.0, + "grad_norm": 0.16497869204612428, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.78943658, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 1.8359375, + "step": 349, + "time_per_iteration": 4.876751184463501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181375, + "balance_loss_mlp": 1.13578987, + "epoch": 0.06733358984224702, + "flos": 880073869824.0, + "grad_norm": 0.07770510755269132, + "language_loss": 0.96067584, + "learning_rate": 0.000996351547842304, + "loss": 0.9724896, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 0.45581055, + "step": 350, + "time_per_iteration": 3.166680097579956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217287, + "balance_loss_mlp": 1.16969919, + "epoch": 0.06752597152751058, + "flos": 518906580480.0, + "grad_norm": 0.06167835917893234, + "language_loss": 0.94333142, + "learning_rate": 0.0009963138843953744, + "loss": 0.9555043, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 0.47558594, + "step": 351, + "time_per_iteration": 2.5784904956817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122803, + "balance_loss_mlp": 1.18005991, + "epoch": 0.06771835321277414, + "flos": 539668624896.0, + "grad_norm": 0.06188972934791396, + "language_loss": 0.98543227, + "learning_rate": 0.000996276028262306, + "loss": 0.99771261, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 0.47924805, + "step": 352, + "time_per_iteration": 2.7985076904296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216963, + "balance_loss_mlp": 1.16760993, + "epoch": 0.0679107348980377, + "flos": 460666828800.0, + "grad_norm": 0.0659402302829914, + "language_loss": 1.04801619, + "learning_rate": 0.0009962379794577964, + "loss": 1.06018579, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 0.49365234, + "step": 353, + "time_per_iteration": 2.608032703399658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123128, + "balance_loss_mlp": 1.18266606, + "epoch": 0.06810311658330127, + "flos": 635922026496.0, + "grad_norm": 0.051231802586423875, + "language_loss": 0.94352609, + "learning_rate": 0.000996199737996617, + "loss": 0.95583886, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 0.48657227, + "step": 354, + "time_per_iteration": 2.903005599975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227436, + "balance_loss_mlp": 1.17770219, + "epoch": 0.06829549826856483, + "flos": 464679452160.0, + "grad_norm": 0.05676190931504088, + "language_loss": 1.03759205, + "learning_rate": 0.0009961613038936149, + "loss": 1.04986644, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 0.49755859, + "step": 355, + "time_per_iteration": 2.617859125137329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216553, + "balance_loss_mlp": 1.16572189, + "epoch": 0.06848787995382839, + "flos": 634647135744.0, + "grad_norm": 0.04878484453506707, + "language_loss": 0.95482612, + "learning_rate": 0.000996122677163711, + "loss": 0.96699166, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 0.50878906, + "step": 356, + "time_per_iteration": 2.8171308040618896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230039, + "balance_loss_mlp": 1.18037653, + "epoch": 0.06868026163909195, + "flos": 806374268928.0, + "grad_norm": 0.06504242786199886, + "language_loss": 1.01527905, + "learning_rate": 0.000996083857821902, + "loss": 1.02757955, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 0.49682617, + "step": 357, + "time_per_iteration": 3.0562636852264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221322, + "balance_loss_mlp": 1.17237508, + "epoch": 0.06887264332435553, + "flos": 439227505152.0, + "grad_norm": 0.043415107047687695, + "language_loss": 0.99947309, + "learning_rate": 0.0009960448458832588, + "loss": 1.01168633, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 0.48925781, + "step": 358, + "time_per_iteration": 2.6778266429901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224961, + "balance_loss_mlp": 1.17675292, + "epoch": 0.06906502500961909, + "flos": 484767161856.0, + "grad_norm": 0.061398357107108094, + "language_loss": 0.99686754, + "learning_rate": 0.000996005641362927, + "loss": 1.00911713, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 0.48193359, + "step": 359, + "time_per_iteration": 2.5839953422546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218039, + "balance_loss_mlp": 1.16792321, + "epoch": 0.06925740669488265, + "flos": 733611105792.0, + "grad_norm": 0.045504813624839685, + "language_loss": 1.02907789, + "learning_rate": 0.0009959662442761274, + "loss": 1.04125834, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 0.5012207, + "step": 360, + "time_per_iteration": 2.9012227058410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225991, + "balance_loss_mlp": 1.17504108, + "epoch": 0.0694497883801462, + "flos": 552415707648.0, + "grad_norm": 0.05242893208235044, + "language_loss": 0.96392268, + "learning_rate": 0.000995926654638155, + "loss": 0.97618258, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 0.50976562, + "step": 361, + "time_per_iteration": 2.7972850799560547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120421, + "balance_loss_mlp": 1.15323579, + "epoch": 0.06964217006540978, + "flos": 678015111168.0, + "grad_norm": 0.0452718414118582, + "language_loss": 0.98678619, + "learning_rate": 0.00099588687246438, + "loss": 0.99882829, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 0.51025391, + "step": 362, + "time_per_iteration": 2.845742702484131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011953, + "balance_loss_mlp": 1.14241886, + "epoch": 0.06983455175067334, + "flos": 524241082368.0, + "grad_norm": 0.06654716127982052, + "language_loss": 1.06146324, + "learning_rate": 0.0009958468977702471, + "loss": 1.07341623, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 0.52978516, + "step": 363, + "time_per_iteration": 2.5876591205596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.05386722, + "balance_loss_mlp": 5.09527922, + "epoch": 0.0700269334359369, + "flos": 1576787254272.0, + "grad_norm": 0.35536528906135745, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.85121429, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 2.921875, + "step": 364, + "time_per_iteration": 4.7958595752716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183221, + "balance_loss_mlp": 1.12800324, + "epoch": 0.07021931512120046, + "flos": 1013248839168.0, + "grad_norm": 0.06493728064972926, + "language_loss": 0.94085538, + "learning_rate": 0.0009957663708830612, + "loss": 0.95268762, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 0.55273438, + "step": 365, + "time_per_iteration": 3.238919258117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188034, + "balance_loss_mlp": 1.13048029, + "epoch": 0.07041169680646403, + "flos": 822983099904.0, + "grad_norm": 0.06418297657416602, + "language_loss": 0.98210049, + "learning_rate": 0.0009957258187212714, + "loss": 0.99398077, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 0.57470703, + "step": 366, + "time_per_iteration": 3.0337131023406982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0292345, + "balance_loss_mlp": 2.78612089, + "epoch": 0.07060407849172759, + "flos": 1414392938496.0, + "grad_norm": 0.09868001986151984, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.82118309, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 1.375, + "step": 367, + "time_per_iteration": 4.825684070587158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118916, + "balance_loss_mlp": 1.12988925, + "epoch": 0.07079646017699115, + "flos": 512909853696.0, + "grad_norm": 0.06345017711900697, + "language_loss": 0.94456601, + "learning_rate": 0.0009956441370400167, + "loss": 0.95645761, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 0.59179688, + "step": 368, + "time_per_iteration": 2.6685595512390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203671, + "balance_loss_mlp": 1.14411354, + "epoch": 0.07098884186225471, + "flos": 540501548544.0, + "grad_norm": 0.07550644934377632, + "language_loss": 1.00098681, + "learning_rate": 0.0009956030075522636, + "loss": 1.0130235, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 0.59472656, + "step": 369, + "time_per_iteration": 2.7824065685272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185298, + "balance_loss_mlp": 1.12555027, + "epoch": 0.07118122354751828, + "flos": 548682439680.0, + "grad_norm": 0.0634963537383221, + "language_loss": 1.00245738, + "learning_rate": 0.0009955616856543587, + "loss": 1.01431036, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 0.59667969, + "step": 370, + "time_per_iteration": 2.6869115829467773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117739, + "balance_loss_mlp": 1.11649847, + "epoch": 0.07137360523278184, + "flos": 620904517632.0, + "grad_norm": 0.04749901473855408, + "language_loss": 0.92605507, + "learning_rate": 0.0009955201713623448, + "loss": 0.93782902, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 0.60791016, + "step": 371, + "time_per_iteration": 2.7894065380096436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03553003, + "balance_loss_mlp": 3.34700894, + "epoch": 0.0715659869180454, + "flos": 1502672477184.0, + "grad_norm": 0.1539254818196356, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.80225718, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 2.0625, + "step": 372, + "time_per_iteration": 5.025646924972534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188763, + "balance_loss_mlp": 1.12739396, + "epoch": 0.07175836860330896, + "flos": 495493065216.0, + "grad_norm": 0.05697389015463885, + "language_loss": 1.05361807, + "learning_rate": 0.0009954365656605333, + "loss": 1.06550562, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 0.61328125, + "step": 373, + "time_per_iteration": 2.5767741203308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203971, + "balance_loss_mlp": 1.13878703, + "epoch": 0.07195075028857253, + "flos": 785725650432.0, + "grad_norm": 0.0561234241567743, + "language_loss": 0.98981488, + "learning_rate": 0.0009953944742831947, + "loss": 1.00185454, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 0.65185547, + "step": 374, + "time_per_iteration": 3.0126912593841553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209318, + "balance_loss_mlp": 1.14351439, + "epoch": 0.0721431319738361, + "flos": 593107619328.0, + "grad_norm": 0.05197007853134015, + "language_loss": 1.02623391, + "learning_rate": 0.0009953521905766642, + "loss": 1.0383271, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 0.65820312, + "step": 375, + "time_per_iteration": 2.9678027629852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207965, + "balance_loss_mlp": 1.14464104, + "epoch": 0.07233551365909965, + "flos": 548250011136.0, + "grad_norm": 0.05250799377029981, + "language_loss": 1.01212132, + "learning_rate": 0.0009953097145573577, + "loss": 1.02420104, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 0.6328125, + "step": 376, + "time_per_iteration": 2.7048561573028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121329, + "balance_loss_mlp": 1.1502521, + "epoch": 0.07252789534436321, + "flos": 957568780800.0, + "grad_norm": 0.050651846587156886, + "language_loss": 0.98499894, + "learning_rate": 0.000995267046241766, + "loss": 0.99713182, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 0.62988281, + "step": 377, + "time_per_iteration": 3.287705421447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225924, + "balance_loss_mlp": 1.16341114, + "epoch": 0.07272027702962677, + "flos": 507649503744.0, + "grad_norm": 0.05776369312695448, + "language_loss": 0.98701203, + "learning_rate": 0.0009952241856464547, + "loss": 0.99927127, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 0.62451172, + "step": 378, + "time_per_iteration": 2.5897629261016846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220563, + "balance_loss_mlp": 1.16010034, + "epoch": 0.07291265871489035, + "flos": 612412337664.0, + "grad_norm": 0.05450855675542614, + "language_loss": 1.05642247, + "learning_rate": 0.0009951811327880632, + "loss": 1.06862807, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 0.60351562, + "step": 379, + "time_per_iteration": 2.7320594787597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220943, + "balance_loss_mlp": 1.15924072, + "epoch": 0.0731050404001539, + "flos": 495750025728.0, + "grad_norm": 0.04947645913164449, + "language_loss": 0.99005401, + "learning_rate": 0.0009951378876833063, + "loss": 1.00226343, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 0.61669922, + "step": 380, + "time_per_iteration": 2.595810651779175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196634, + "balance_loss_mlp": 1.13798296, + "epoch": 0.07329742208541747, + "flos": 639966956544.0, + "grad_norm": 0.058807068798268386, + "language_loss": 1.05567527, + "learning_rate": 0.0009950944503489736, + "loss": 1.06764162, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 0.5859375, + "step": 381, + "time_per_iteration": 2.733560562133789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197608, + "balance_loss_mlp": 1.13914812, + "epoch": 0.07348980377068103, + "flos": 816346543104.0, + "grad_norm": 0.06747680453051412, + "language_loss": 0.99337935, + "learning_rate": 0.0009950508208019285, + "loss": 1.00535548, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 0.58398438, + "step": 382, + "time_per_iteration": 2.9895970821380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176507, + "balance_loss_mlp": 1.12062192, + "epoch": 0.0736821854559446, + "flos": 508640269824.0, + "grad_norm": 0.05827239016363537, + "language_loss": 1.03707182, + "learning_rate": 0.0009950069990591096, + "loss": 1.04883695, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 0.55908203, + "step": 383, + "time_per_iteration": 2.6856980323791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.05393736, + "balance_loss_mlp": 5.19079447, + "epoch": 0.07387456714120816, + "flos": 1554648629760.0, + "grad_norm": 0.38241300139143997, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.81795102, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 2.03125, + "step": 384, + "time_per_iteration": 4.860661268234253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128436, + "balance_loss_mlp": 1.07369518, + "epoch": 0.07406694882647172, + "flos": 525503490048.0, + "grad_norm": 0.06005395599718801, + "language_loss": 0.96679938, + "learning_rate": 0.0009949187790542777, + "loss": 0.97808379, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 0.54760742, + "step": 385, + "time_per_iteration": 2.7245922088623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146737, + "balance_loss_mlp": 1.09042215, + "epoch": 0.07425933051173528, + "flos": 497738898432.0, + "grad_norm": 0.06780842756482337, + "language_loss": 0.9270733, + "learning_rate": 0.0009948743808265148, + "loss": 0.93854064, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 0.56298828, + "step": 386, + "time_per_iteration": 2.6745331287384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187036, + "balance_loss_mlp": 1.13334417, + "epoch": 0.07445171219699885, + "flos": 505003175424.0, + "grad_norm": 0.04295711334598506, + "language_loss": 1.02854586, + "learning_rate": 0.0009948297904714782, + "loss": 1.04041624, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 0.53759766, + "step": 387, + "time_per_iteration": 2.681718111038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202671, + "balance_loss_mlp": 1.15167296, + "epoch": 0.07464409388226241, + "flos": 553977294336.0, + "grad_norm": 0.05564614333293379, + "language_loss": 0.94366896, + "learning_rate": 0.0009947850080064796, + "loss": 0.95569569, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 0.51049805, + "step": 388, + "time_per_iteration": 2.788663148880005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216483, + "balance_loss_mlp": 1.16817975, + "epoch": 0.07483647556752597, + "flos": 776862710784.0, + "grad_norm": 0.07112384111458, + "language_loss": 0.99713415, + "learning_rate": 0.0009947400334489047, + "loss": 1.00929892, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 0.48291016, + "step": 389, + "time_per_iteration": 2.9905049800872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227498, + "balance_loss_mlp": 1.17926562, + "epoch": 0.07502885725278953, + "flos": 612540817920.0, + "grad_norm": 0.06900212518032732, + "language_loss": 0.91264081, + "learning_rate": 0.0009946948668162145, + "loss": 0.92491579, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 0.48168945, + "step": 390, + "time_per_iteration": 2.767531394958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012247, + "balance_loss_mlp": 1.17277205, + "epoch": 0.0752212389380531, + "flos": 688629786624.0, + "grad_norm": 0.052104168644034804, + "language_loss": 0.95126128, + "learning_rate": 0.0009946495081259441, + "loss": 0.96350825, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 0.52001953, + "step": 391, + "time_per_iteration": 2.816908597946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192311, + "balance_loss_mlp": 1.14057434, + "epoch": 0.07541362062331666, + "flos": 765699609600.0, + "grad_norm": 0.051504782312047234, + "language_loss": 0.99421549, + "learning_rate": 0.0009946039573957035, + "loss": 1.00613856, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 0.51782227, + "step": 392, + "time_per_iteration": 2.9265222549438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116666, + "balance_loss_mlp": 1.11478019, + "epoch": 0.07560600230858022, + "flos": 588749202432.0, + "grad_norm": 0.055053573084277836, + "language_loss": 0.95799196, + "learning_rate": 0.000994558214643177, + "loss": 0.96965855, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 0.51928711, + "step": 393, + "time_per_iteration": 2.766477584838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165121, + "balance_loss_mlp": 1.11352682, + "epoch": 0.07579838399384378, + "flos": 749834496000.0, + "grad_norm": 0.05925711706254076, + "language_loss": 0.97585773, + "learning_rate": 0.000994512279886123, + "loss": 0.98750889, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 0.51660156, + "step": 394, + "time_per_iteration": 3.0709142684936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143606, + "balance_loss_mlp": 1.09191656, + "epoch": 0.07599076567910736, + "flos": 523457717760.0, + "grad_norm": 0.04191079383555719, + "language_loss": 0.97239089, + "learning_rate": 0.0009944661531423758, + "loss": 0.98382699, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 0.51757812, + "step": 395, + "time_per_iteration": 2.7044599056243896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134219, + "balance_loss_mlp": 1.08338809, + "epoch": 0.07618314736437092, + "flos": 551086488576.0, + "grad_norm": 0.05545815376917658, + "language_loss": 0.96390671, + "learning_rate": 0.000994419834429843, + "loss": 0.97524893, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 0.50854492, + "step": 396, + "time_per_iteration": 2.6767609119415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135922, + "balance_loss_mlp": 1.08525789, + "epoch": 0.07637552904963447, + "flos": 698206708224.0, + "grad_norm": 0.05307630449121137, + "language_loss": 1.01208472, + "learning_rate": 0.0009943733237665069, + "loss": 1.02344394, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 0.50683594, + "step": 397, + "time_per_iteration": 2.819148302078247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124426, + "balance_loss_mlp": 1.07502615, + "epoch": 0.07656791073489803, + "flos": 579379682304.0, + "grad_norm": 0.049844903289807924, + "language_loss": 0.99488425, + "learning_rate": 0.0009943266211704248, + "loss": 1.00612843, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 0.49389648, + "step": 398, + "time_per_iteration": 2.9555482864379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125466, + "balance_loss_mlp": 1.07675719, + "epoch": 0.0767602924201616, + "flos": 417145780224.0, + "grad_norm": 0.05620775813161816, + "language_loss": 1.01430082, + "learning_rate": 0.000994279726659728, + "loss": 1.02555549, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 0.48706055, + "step": 399, + "time_per_iteration": 2.5138003826141357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127137, + "balance_loss_mlp": 1.07761765, + "epoch": 0.07695267410542517, + "flos": 482914109952.0, + "grad_norm": 0.05674792404596756, + "language_loss": 0.99883693, + "learning_rate": 0.0009942326402526231, + "loss": 1.01010823, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 0.49511719, + "step": 400, + "time_per_iteration": 2.5245604515075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127721, + "balance_loss_mlp": 1.07793891, + "epoch": 0.07714505579068873, + "flos": 530998778880.0, + "grad_norm": 0.036646942736225624, + "language_loss": 0.9767518, + "learning_rate": 0.0009941853619673902, + "loss": 0.98802906, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 0.49804688, + "step": 401, + "time_per_iteration": 2.644771099090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123434, + "balance_loss_mlp": 1.07451057, + "epoch": 0.07733743747595229, + "flos": 804995490816.0, + "grad_norm": 0.057554732491620374, + "language_loss": 1.01884329, + "learning_rate": 0.0009941378918223844, + "loss": 1.0300777, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 0.48876953, + "step": 402, + "time_per_iteration": 3.051617383956909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112416, + "balance_loss_mlp": 1.07618988, + "epoch": 0.07752981916121585, + "flos": 622476016128.0, + "grad_norm": 0.04510164642433069, + "language_loss": 0.94372368, + "learning_rate": 0.0009940902298360354, + "loss": 0.95496523, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 0.47924805, + "step": 403, + "time_per_iteration": 2.7302582263946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118279, + "balance_loss_mlp": 1.0687592, + "epoch": 0.07772220084647942, + "flos": 728276603904.0, + "grad_norm": 0.062376946911402976, + "language_loss": 1.04687834, + "learning_rate": 0.0009940423760268473, + "loss": 1.05806112, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 0.49536133, + "step": 404, + "time_per_iteration": 2.856938600540161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118682, + "balance_loss_mlp": 1.07009196, + "epoch": 0.07791458253174298, + "flos": 555412972032.0, + "grad_norm": 0.046838991637930295, + "language_loss": 0.97888398, + "learning_rate": 0.0009939943304133982, + "loss": 0.99007082, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 0.48608398, + "step": 405, + "time_per_iteration": 2.6161091327667236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115161, + "balance_loss_mlp": 1.06881261, + "epoch": 0.07810696421700654, + "flos": 553181819904.0, + "grad_norm": 0.04496148345425058, + "language_loss": 1.04081011, + "learning_rate": 0.0009939460930143416, + "loss": 1.0519619, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 0.46337891, + "step": 406, + "time_per_iteration": 2.6310677528381348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119218, + "balance_loss_mlp": 1.07332289, + "epoch": 0.0782993459022701, + "flos": 650633389056.0, + "grad_norm": 0.037201804651944344, + "language_loss": 0.98071587, + "learning_rate": 0.0009938976638484043, + "loss": 0.99190807, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 0.45874023, + "step": 407, + "time_per_iteration": 2.8977036476135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112887, + "balance_loss_mlp": 1.06844616, + "epoch": 0.07849172758753367, + "flos": 496172542464.0, + "grad_norm": 0.04629061554837057, + "language_loss": 0.97991359, + "learning_rate": 0.0009938490429343887, + "loss": 0.99104249, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 0.44458008, + "step": 408, + "time_per_iteration": 2.562168836593628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111855, + "balance_loss_mlp": 1.07315516, + "epoch": 0.07868410927279723, + "flos": 577971542016.0, + "grad_norm": 0.04004461216150975, + "language_loss": 0.97974342, + "learning_rate": 0.0009938002302911709, + "loss": 0.99092889, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 0.4543457, + "step": 409, + "time_per_iteration": 2.738518238067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123547, + "balance_loss_mlp": 1.07915401, + "epoch": 0.07887649095806079, + "flos": 522970960896.0, + "grad_norm": 0.07048914756312923, + "language_loss": 1.00401747, + "learning_rate": 0.0009937512259377015, + "loss": 1.01525307, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 0.44384766, + "step": 410, + "time_per_iteration": 2.670149564743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110678, + "balance_loss_mlp": 1.0668565, + "epoch": 0.07906887264332435, + "flos": 557253540864.0, + "grad_norm": 0.049646402233970426, + "language_loss": 0.99659574, + "learning_rate": 0.000993702029893006, + "loss": 1.00770259, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 0.4387207, + "step": 411, + "time_per_iteration": 2.7853777408599854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118473, + "balance_loss_mlp": 1.07200527, + "epoch": 0.07926125432858792, + "flos": 821984993280.0, + "grad_norm": 0.04880092350488667, + "language_loss": 0.98862529, + "learning_rate": 0.0009936526421761838, + "loss": 0.99981004, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 0.46435547, + "step": 412, + "time_per_iteration": 3.030674457550049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114951, + "balance_loss_mlp": 1.07043815, + "epoch": 0.07945363601385148, + "flos": 562336224768.0, + "grad_norm": 0.04383720282943398, + "language_loss": 1.01490402, + "learning_rate": 0.000993603062806409, + "loss": 1.02605367, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 0.4453125, + "step": 413, + "time_per_iteration": 2.7101500034332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109682, + "balance_loss_mlp": 1.0637151, + "epoch": 0.07964601769911504, + "flos": 517868826624.0, + "grad_norm": 0.046157231925668944, + "language_loss": 1.04664707, + "learning_rate": 0.0009935532918029298, + "loss": 1.05774391, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 0.45947266, + "step": 414, + "time_per_iteration": 2.593390941619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118947, + "balance_loss_mlp": 1.07278943, + "epoch": 0.0798383993843786, + "flos": 539224086528.0, + "grad_norm": 0.058468816323775735, + "language_loss": 0.97956645, + "learning_rate": 0.0009935033291850694, + "loss": 0.99075592, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 0.46166992, + "step": 415, + "time_per_iteration": 2.6693851947784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111466, + "balance_loss_mlp": 1.0654031, + "epoch": 0.08003078106964218, + "flos": 485145262080.0, + "grad_norm": 0.061030352209764355, + "language_loss": 1.00225627, + "learning_rate": 0.0009934531749722247, + "loss": 1.01337099, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 0.46044922, + "step": 416, + "time_per_iteration": 2.578746795654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119366, + "balance_loss_mlp": 1.07337523, + "epoch": 0.08022316275490574, + "flos": 518254267392.0, + "grad_norm": 0.05071064772829009, + "language_loss": 0.98778659, + "learning_rate": 0.0009934028291838672, + "loss": 0.99898028, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 0.45996094, + "step": 417, + "time_per_iteration": 2.7096333503723145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106202, + "balance_loss_mlp": 1.06166553, + "epoch": 0.0804155444401693, + "flos": 494012971008.0, + "grad_norm": 0.045680808340910005, + "language_loss": 0.94326293, + "learning_rate": 0.0009933522918395433, + "loss": 0.95432496, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 0.44555664, + "step": 418, + "time_per_iteration": 2.644414186477661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04959176, + "balance_loss_mlp": 4.71808767, + "epoch": 0.08060792612543285, + "flos": 1581422455296.0, + "grad_norm": 0.3214703434406663, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.83210278, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 2.40625, + "step": 419, + "time_per_iteration": 4.868964195251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115626, + "balance_loss_mlp": 1.07108891, + "epoch": 0.08080030781069643, + "flos": 525343076352.0, + "grad_norm": 0.08060687528614664, + "language_loss": 1.13036489, + "learning_rate": 0.000993250642561551, + "loss": 1.14152122, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 0.4453125, + "step": 420, + "time_per_iteration": 2.632162094116211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121548, + "balance_loss_mlp": 1.07538986, + "epoch": 0.08099268949595999, + "flos": 546718159872.0, + "grad_norm": 0.08633853635548816, + "language_loss": 0.9784801, + "learning_rate": 0.0009931995306673466, + "loss": 0.98969555, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 0.46118164, + "step": 421, + "time_per_iteration": 2.7046737670898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134412, + "balance_loss_mlp": 1.08815861, + "epoch": 0.08118507118122355, + "flos": 510367412736.0, + "grad_norm": 0.038770411105538145, + "language_loss": 1.03907061, + "learning_rate": 0.000993148227296103, + "loss": 1.05041468, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 0.4621582, + "step": 422, + "time_per_iteration": 2.669496536254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133253, + "balance_loss_mlp": 1.08707166, + "epoch": 0.08137745286648711, + "flos": 720671302656.0, + "grad_norm": 0.053095831055692516, + "language_loss": 0.9112367, + "learning_rate": 0.000993096732467738, + "loss": 0.92256927, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 0.46166992, + "step": 423, + "time_per_iteration": 2.961660861968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150855, + "balance_loss_mlp": 1.10498345, + "epoch": 0.08156983455175067, + "flos": 679613773824.0, + "grad_norm": 0.08137036582560589, + "language_loss": 0.99760056, + "learning_rate": 0.0009930450462022435, + "loss": 1.00910902, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 0.45874023, + "step": 424, + "time_per_iteration": 2.7952311038970947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03600409, + "balance_loss_mlp": 3.48901963, + "epoch": 0.08176221623701424, + "flos": 1453377157632.0, + "grad_norm": 0.18349806711668631, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.82790214, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 1.1171875, + "step": 425, + "time_per_iteration": 4.8854875564575195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159177, + "balance_loss_mlp": 1.11344862, + "epoch": 0.0819545979222778, + "flos": 1556602292736.0, + "grad_norm": 0.06491953183218531, + "language_loss": 0.9776966, + "learning_rate": 0.0009929410994402065, + "loss": 0.98928833, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 0.45703125, + "step": 426, + "time_per_iteration": 4.275091886520386 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169515, + "balance_loss_mlp": 1.12223697, + "epoch": 0.08214697960754136, + "flos": 512724473856.0, + "grad_norm": 0.07437504582125473, + "language_loss": 1.02033544, + "learning_rate": 0.0009928888389840196, + "loss": 1.03203058, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 0.47241211, + "step": 427, + "time_per_iteration": 2.7036454677581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145234, + "balance_loss_mlp": 1.09941018, + "epoch": 0.08233936129280492, + "flos": 595124029440.0, + "grad_norm": 0.05964472172349544, + "language_loss": 1.03706717, + "learning_rate": 0.0009928363871714147, + "loss": 1.04851961, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 0.45849609, + "step": 428, + "time_per_iteration": 2.6669116020202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115004, + "balance_loss_mlp": 1.10254741, + "epoch": 0.08253174297806849, + "flos": 572039055360.0, + "grad_norm": 0.07530468467255677, + "language_loss": 0.97491598, + "learning_rate": 0.0009927837440227556, + "loss": 0.98641634, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 0.47485352, + "step": 429, + "time_per_iteration": 2.8463807106018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120703, + "balance_loss_mlp": 1.07588065, + "epoch": 0.08272412466333205, + "flos": 623380147200.0, + "grad_norm": 0.04140843961960757, + "language_loss": 0.92054397, + "learning_rate": 0.0009927309095584798, + "loss": 0.93175101, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 0.44824219, + "step": 430, + "time_per_iteration": 2.9767606258392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116415, + "balance_loss_mlp": 1.07278419, + "epoch": 0.08291650634859561, + "flos": 513994595328.0, + "grad_norm": 0.04726827868993605, + "language_loss": 1.04780793, + "learning_rate": 0.0009926778837991, + "loss": 1.05897212, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 0.43652344, + "step": 431, + "time_per_iteration": 2.5883395671844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112082, + "balance_loss_mlp": 1.06749809, + "epoch": 0.08310888803385917, + "flos": 667365931008.0, + "grad_norm": 0.049074519776006666, + "language_loss": 1.0243988, + "learning_rate": 0.000992624666765202, + "loss": 1.0355196, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 0.44604492, + "step": 432, + "time_per_iteration": 2.7943906784057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115829, + "balance_loss_mlp": 1.07200766, + "epoch": 0.08330126971912274, + "flos": 583293560832.0, + "grad_norm": 0.04417562175093811, + "language_loss": 1.00109053, + "learning_rate": 0.000992571258477447, + "loss": 1.01224887, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 0.43823242, + "step": 433, + "time_per_iteration": 2.836127758026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116804, + "balance_loss_mlp": 1.07260084, + "epoch": 0.0834936514043863, + "flos": 561350227968.0, + "grad_norm": 0.04319706549365549, + "language_loss": 0.93695247, + "learning_rate": 0.0009925176589565695, + "loss": 0.94812053, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 0.44213867, + "step": 434, + "time_per_iteration": 2.8157734870910645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131219, + "balance_loss_mlp": 1.08756483, + "epoch": 0.08368603308964986, + "flos": 494519551488.0, + "grad_norm": 0.04172416189060796, + "language_loss": 1.04242814, + "learning_rate": 0.0009924638682233791, + "loss": 1.05374026, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 0.43652344, + "step": 435, + "time_per_iteration": 2.5577316284179688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02503783, + "balance_loss_mlp": 2.3527205, + "epoch": 0.08387841477491342, + "flos": 1389017714688.0, + "grad_norm": 0.06968128915635463, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.82068378, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 1.5078125, + "step": 436, + "time_per_iteration": 4.594938516616821 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129118, + "balance_loss_mlp": 1.08348453, + "epoch": 0.084070796460177, + "flos": 798984082944.0, + "grad_norm": 0.0610737753852808, + "language_loss": 0.94037408, + "learning_rate": 0.0009923557132036668, + "loss": 0.95166528, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 0.45629883, + "step": 437, + "time_per_iteration": 3.0716845989227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118795, + "balance_loss_mlp": 1.07430601, + "epoch": 0.08426317814544056, + "flos": 558963431424.0, + "grad_norm": 0.04662895628051273, + "language_loss": 0.97730738, + "learning_rate": 0.0009923013489591345, + "loss": 0.98849535, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 0.4453125, + "step": 438, + "time_per_iteration": 2.726792812347412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110576, + "balance_loss_mlp": 1.06685066, + "epoch": 0.08445555983070412, + "flos": 810421396992.0, + "grad_norm": 0.04626496214247174, + "language_loss": 0.96079296, + "learning_rate": 0.0009922467935862681, + "loss": 0.97189873, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 0.4375, + "step": 439, + "time_per_iteration": 3.0908052921295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119416, + "balance_loss_mlp": 1.07273376, + "epoch": 0.08464794151596768, + "flos": 510184604160.0, + "grad_norm": 0.048922855388473234, + "language_loss": 0.99432743, + "learning_rate": 0.0009921920471062478, + "loss": 1.00552154, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 0.46655273, + "step": 440, + "time_per_iteration": 2.622451066970825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117818, + "balance_loss_mlp": 1.07342434, + "epoch": 0.08484032320123125, + "flos": 556413649920.0, + "grad_norm": 0.07502031783190574, + "language_loss": 0.9797709, + "learning_rate": 0.0009921371095403281, + "loss": 0.99094903, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 0.44433594, + "step": 441, + "time_per_iteration": 2.705152750015259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011116, + "balance_loss_mlp": 1.06863689, + "epoch": 0.08503270488649481, + "flos": 527354343936.0, + "grad_norm": 0.04941418140969711, + "language_loss": 1.00754833, + "learning_rate": 0.0009920819809098379, + "loss": 1.01866436, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 0.42993164, + "step": 442, + "time_per_iteration": 2.5887317657470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119689, + "balance_loss_mlp": 1.07715499, + "epoch": 0.08522508657175837, + "flos": 614267960832.0, + "grad_norm": 0.06964486535702215, + "language_loss": 0.96275294, + "learning_rate": 0.0009920266612361798, + "loss": 0.97394979, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 0.42578125, + "step": 443, + "time_per_iteration": 2.745222330093384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107696, + "balance_loss_mlp": 1.06587708, + "epoch": 0.08541746825702193, + "flos": 619792611840.0, + "grad_norm": 0.05163049083883061, + "language_loss": 0.96866751, + "learning_rate": 0.0009919711505408308, + "loss": 0.97974443, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 0.41821289, + "step": 444, + "time_per_iteration": 2.780095100402832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106314, + "balance_loss_mlp": 1.0654248, + "epoch": 0.08560984994228549, + "flos": 482914109952.0, + "grad_norm": 0.054748359311131624, + "language_loss": 0.94535226, + "learning_rate": 0.000991915448845342, + "loss": 0.95641541, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 0.40893555, + "step": 445, + "time_per_iteration": 2.5229337215423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110283, + "balance_loss_mlp": 1.06279922, + "epoch": 0.08580223162754906, + "flos": 517152273408.0, + "grad_norm": 0.0575820537988498, + "language_loss": 1.03181779, + "learning_rate": 0.000991859556171339, + "loss": 1.04284596, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 0.40039062, + "step": 446, + "time_per_iteration": 2.5957653522491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105649, + "balance_loss_mlp": 1.06497526, + "epoch": 0.08599461331281262, + "flos": 531475623936.0, + "grad_norm": 0.04289742759235468, + "language_loss": 1.05262291, + "learning_rate": 0.000991803472540521, + "loss": 1.06367946, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 0.40673828, + "step": 447, + "time_per_iteration": 2.6220486164093018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105752, + "balance_loss_mlp": 1.06550729, + "epoch": 0.08618699499807618, + "flos": 790299182592.0, + "grad_norm": 0.04330621576945977, + "language_loss": 1.00096428, + "learning_rate": 0.0009917471979746615, + "loss": 1.01202178, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 0.40234375, + "step": 448, + "time_per_iteration": 2.9767467975616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114799, + "balance_loss_mlp": 1.07379115, + "epoch": 0.08637937668333974, + "flos": 565988000256.0, + "grad_norm": 0.03609686036920932, + "language_loss": 0.98485255, + "learning_rate": 0.0009916907324956086, + "loss": 0.99600053, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 0.41015625, + "step": 449, + "time_per_iteration": 2.701143980026245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117003, + "balance_loss_mlp": 1.07480288, + "epoch": 0.08657175836860331, + "flos": 445167332352.0, + "grad_norm": 0.04834207301210501, + "language_loss": 0.95441091, + "learning_rate": 0.0009916340761252837, + "loss": 0.965581, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 0.42211914, + "step": 450, + "time_per_iteration": 2.6036393642425537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129901, + "balance_loss_mlp": 1.08910751, + "epoch": 0.08676414005386687, + "flos": 844148210688.0, + "grad_norm": 0.07269963588094165, + "language_loss": 0.9243114, + "learning_rate": 0.0009915772288856832, + "loss": 0.93561041, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 0.40820312, + "step": 451, + "time_per_iteration": 3.05719256401062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125865, + "balance_loss_mlp": 1.08359361, + "epoch": 0.08695652173913043, + "flos": 603292437504.0, + "grad_norm": 0.05954656443346509, + "language_loss": 0.93746579, + "learning_rate": 0.000991520190798877, + "loss": 0.94872439, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 0.42285156, + "step": 452, + "time_per_iteration": 2.804128885269165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120027, + "balance_loss_mlp": 1.07723105, + "epoch": 0.08714890342439399, + "flos": 730737552384.0, + "grad_norm": 0.05604676795867647, + "language_loss": 1.04000187, + "learning_rate": 0.0009914629618870089, + "loss": 1.05120206, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 0.42797852, + "step": 453, + "time_per_iteration": 2.8959083557128906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02032313, + "balance_loss_mlp": 1.86675501, + "epoch": 0.08734128510965757, + "flos": 1482303214080.0, + "grad_norm": 0.06678910630402063, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.80708182, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 1.65625, + "step": 454, + "time_per_iteration": 4.753306865692139 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01974747, + "balance_loss_mlp": 1.80537415, + "epoch": 0.08753366679492113, + "flos": 1523022289920.0, + "grad_norm": 0.06350102966569023, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.83402705, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 1.6953125, + "step": 455, + "time_per_iteration": 4.909627914428711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100081, + "balance_loss_mlp": 1.05778539, + "epoch": 0.08772604848018468, + "flos": 721252035072.0, + "grad_norm": 0.07384563339861851, + "language_loss": 0.95938599, + "learning_rate": 0.0009912901304235883, + "loss": 0.97038674, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 0.42333984, + "step": 456, + "time_per_iteration": 3.0303096771240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093114, + "balance_loss_mlp": 1.05112898, + "epoch": 0.08791843016544824, + "flos": 708233310720.0, + "grad_norm": 0.061767025741825826, + "language_loss": 0.93898749, + "learning_rate": 0.000991232138434397, + "loss": 0.94991863, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 0.41992188, + "step": 457, + "time_per_iteration": 2.834221601486206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089137, + "balance_loss_mlp": 1.04824805, + "epoch": 0.08811081185071182, + "flos": 473043151872.0, + "grad_norm": 0.05183647995223567, + "language_loss": 1.00765896, + "learning_rate": 0.000991173955731976, + "loss": 1.0185504, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 0.40869141, + "step": 458, + "time_per_iteration": 2.628783702850342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098492, + "balance_loss_mlp": 1.05569601, + "epoch": 0.08830319353597538, + "flos": 684980209152.0, + "grad_norm": 0.052575936673692925, + "language_loss": 1.04489028, + "learning_rate": 0.0009911155823389137, + "loss": 1.0558753, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 0.42797852, + "step": 459, + "time_per_iteration": 2.964416742324829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106983, + "balance_loss_mlp": 1.06523609, + "epoch": 0.08849557522123894, + "flos": 573509237760.0, + "grad_norm": 0.05270293395412616, + "language_loss": 1.00385904, + "learning_rate": 0.000991057018277873, + "loss": 1.01492882, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 0.41748047, + "step": 460, + "time_per_iteration": 2.6944808959960938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104651, + "balance_loss_mlp": 1.06245136, + "epoch": 0.0886879569065025, + "flos": 564567376896.0, + "grad_norm": 0.04953210926048159, + "language_loss": 1.01399374, + "learning_rate": 0.0009909982635715898, + "loss": 1.02504039, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 0.42236328, + "step": 461, + "time_per_iteration": 2.6137924194335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096015, + "balance_loss_mlp": 1.05374336, + "epoch": 0.08888033859176607, + "flos": 563877987840.0, + "grad_norm": 0.050729417377465176, + "language_loss": 1.00123549, + "learning_rate": 0.0009909393182428751, + "loss": 1.01219559, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 0.42285156, + "step": 462, + "time_per_iteration": 2.6657960414886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109539, + "balance_loss_mlp": 1.06891286, + "epoch": 0.08907272027702963, + "flos": 465761622528.0, + "grad_norm": 0.043715633324142876, + "language_loss": 0.94138575, + "learning_rate": 0.000990880182314614, + "loss": 0.95248115, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 0.40625, + "step": 463, + "time_per_iteration": 2.733408212661743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101315, + "balance_loss_mlp": 1.06121325, + "epoch": 0.08926510196229319, + "flos": 681528494592.0, + "grad_norm": 0.051961844945365605, + "language_loss": 0.94176865, + "learning_rate": 0.0009908208558097643, + "loss": 0.9527818, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 0.40087891, + "step": 464, + "time_per_iteration": 2.9006474018096924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105621, + "balance_loss_mlp": 1.06508923, + "epoch": 0.08945748364755675, + "flos": 596692956672.0, + "grad_norm": 0.04470923680131565, + "language_loss": 0.9716863, + "learning_rate": 0.000990761338751359, + "loss": 0.98274255, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 0.40527344, + "step": 465, + "time_per_iteration": 2.775830030441284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01410893, + "balance_loss_mlp": 1.25296497, + "epoch": 0.08964986533282032, + "flos": 1585931747328.0, + "grad_norm": 0.0425617539044403, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.75070524, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 1.578125, + "step": 466, + "time_per_iteration": 5.023500919342041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100083, + "balance_loss_mlp": 1.05869305, + "epoch": 0.08984224701808388, + "flos": 533523967488.0, + "grad_norm": 0.04007163966797277, + "language_loss": 0.9983623, + "learning_rate": 0.0009906417330663815, + "loss": 1.00936306, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 0.41381836, + "step": 467, + "time_per_iteration": 2.6194305419921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099405, + "balance_loss_mlp": 1.05889773, + "epoch": 0.09003462870334744, + "flos": 478931222016.0, + "grad_norm": 0.03985353179312445, + "language_loss": 0.96447593, + "learning_rate": 0.0009905816444862442, + "loss": 0.97546995, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 0.4050293, + "step": 468, + "time_per_iteration": 2.623267889022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107145, + "balance_loss_mlp": 1.06568456, + "epoch": 0.090227010388611, + "flos": 653625510912.0, + "grad_norm": 0.038840192804800056, + "language_loss": 0.93513083, + "learning_rate": 0.0009905213654454216, + "loss": 0.94620228, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 0.41455078, + "step": 469, + "time_per_iteration": 2.9024641513824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105337, + "balance_loss_mlp": 1.06466317, + "epoch": 0.09041939207387456, + "flos": 618186608640.0, + "grad_norm": 0.04985478927164425, + "language_loss": 1.01848495, + "learning_rate": 0.0009904608959673158, + "loss": 1.02953827, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 0.40649414, + "step": 470, + "time_per_iteration": 2.7711682319641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097659, + "balance_loss_mlp": 1.0588448, + "epoch": 0.09061177375913813, + "flos": 454368724992.0, + "grad_norm": 0.04989175862356038, + "language_loss": 1.02851224, + "learning_rate": 0.000990400236075403, + "loss": 1.03948903, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 0.38793945, + "step": 471, + "time_per_iteration": 2.536189317703247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109095, + "balance_loss_mlp": 1.05113411, + "epoch": 0.0908041554444017, + "flos": 544247299584.0, + "grad_norm": 0.03738902964718639, + "language_loss": 0.98994756, + "learning_rate": 0.0009903393857932338, + "loss": 1.000857, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 0.39794922, + "step": 472, + "time_per_iteration": 2.6588857173919678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097802, + "balance_loss_mlp": 1.05908275, + "epoch": 0.09099653712966525, + "flos": 564335009280.0, + "grad_norm": 0.045733529486957185, + "language_loss": 0.97091877, + "learning_rate": 0.0009902783451444317, + "loss": 0.98189688, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 0.38720703, + "step": 473, + "time_per_iteration": 2.6981122493743896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091239, + "balance_loss_mlp": 1.05406976, + "epoch": 0.09118891881492881, + "flos": 474540498432.0, + "grad_norm": 0.04942472768420212, + "language_loss": 1.00819659, + "learning_rate": 0.0009902171141526956, + "loss": 1.01910901, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 0.37158203, + "step": 474, + "time_per_iteration": 2.527256727218628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099497, + "balance_loss_mlp": 1.06225586, + "epoch": 0.09138130050019239, + "flos": 545860643328.0, + "grad_norm": 0.04275448033987936, + "language_loss": 0.88210893, + "learning_rate": 0.000990155692841797, + "loss": 0.8931039, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 0.37231445, + "step": 475, + "time_per_iteration": 2.989063262939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097088, + "balance_loss_mlp": 1.06084871, + "epoch": 0.09157368218545595, + "flos": 732711744000.0, + "grad_norm": 0.04412440376655801, + "language_loss": 1.00229144, + "learning_rate": 0.0009900940812355818, + "loss": 1.01326227, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 0.36254883, + "step": 476, + "time_per_iteration": 2.8778445720672607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105736, + "balance_loss_mlp": 1.07011676, + "epoch": 0.0917660638707195, + "flos": 610981802496.0, + "grad_norm": 0.06417087981964828, + "language_loss": 0.97168529, + "learning_rate": 0.00099003227935797, + "loss": 0.98274267, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 0.35620117, + "step": 477, + "time_per_iteration": 2.708608627319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101416, + "balance_loss_mlp": 1.06369829, + "epoch": 0.09195844555598306, + "flos": 655851893760.0, + "grad_norm": 0.06707216335576115, + "language_loss": 1.01291215, + "learning_rate": 0.000989970287232955, + "loss": 1.02392626, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 0.37695312, + "step": 478, + "time_per_iteration": 2.783325672149658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090795, + "balance_loss_mlp": 1.05431736, + "epoch": 0.09215082724124664, + "flos": 476578930176.0, + "grad_norm": 0.05564878549890474, + "language_loss": 0.9726451, + "learning_rate": 0.0009899081048846043, + "loss": 0.98355305, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 0.36474609, + "step": 479, + "time_per_iteration": 2.6017916202545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097049, + "balance_loss_mlp": 1.05964088, + "epoch": 0.0923432089265102, + "flos": 524305322496.0, + "grad_norm": 0.06044394784495309, + "language_loss": 1.03484094, + "learning_rate": 0.0009898457323370593, + "loss": 1.04581141, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 0.37402344, + "step": 480, + "time_per_iteration": 2.575676918029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091173, + "balance_loss_mlp": 1.0533123, + "epoch": 0.09253559061177376, + "flos": 545569178112.0, + "grad_norm": 0.05778783373137127, + "language_loss": 0.99753714, + "learning_rate": 0.000989783169614535, + "loss": 1.00844884, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 0.37817383, + "step": 481, + "time_per_iteration": 2.646942615509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283887, + "balance_loss_mlp": 1.15876544, + "epoch": 0.09272797229703732, + "flos": 1538042370048.0, + "grad_norm": 0.01956789957612316, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.80036646, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 1.25, + "step": 482, + "time_per_iteration": 4.860741376876831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098828, + "balance_loss_mlp": 1.06158745, + "epoch": 0.09292035398230089, + "flos": 689813273088.0, + "grad_norm": 0.06801501049369231, + "language_loss": 0.97102278, + "learning_rate": 0.000989657473741779, + "loss": 0.98201108, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 0.37231445, + "step": 483, + "time_per_iteration": 2.819138526916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095911, + "balance_loss_mlp": 1.05979109, + "epoch": 0.09311273566756445, + "flos": 509749604352.0, + "grad_norm": 0.038333848574242754, + "language_loss": 0.98462784, + "learning_rate": 0.0009895943406403465, + "loss": 0.99558693, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 0.36132812, + "step": 484, + "time_per_iteration": 2.7088170051574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103881, + "balance_loss_mlp": 1.06854701, + "epoch": 0.09330511735282801, + "flos": 659404924416.0, + "grad_norm": 0.05828015098596693, + "language_loss": 0.92231822, + "learning_rate": 0.0009895310174615338, + "loss": 0.933357, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 0.35351562, + "step": 485, + "time_per_iteration": 2.760511636734009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265799, + "balance_loss_mlp": 1.14983261, + "epoch": 0.09349749903809157, + "flos": 1452845984256.0, + "grad_norm": 0.018538812380254305, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.76984316, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 1.15625, + "step": 486, + "time_per_iteration": 4.656491994857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105382, + "balance_loss_mlp": 1.0699296, + "epoch": 0.09368988072335514, + "flos": 520870860288.0, + "grad_norm": 0.04721263549483299, + "language_loss": 0.95839012, + "learning_rate": 0.0009894038009701782, + "loss": 0.96944392, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 0.35498047, + "step": 487, + "time_per_iteration": 2.6169629096984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103683, + "balance_loss_mlp": 1.06868315, + "epoch": 0.0938822624086187, + "flos": 497751381504.0, + "grad_norm": 0.05102581257360949, + "language_loss": 0.98848963, + "learning_rate": 0.0009893399077070253, + "loss": 0.99952644, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 0.3503418, + "step": 488, + "time_per_iteration": 2.5845744609832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107076, + "balance_loss_mlp": 1.07193291, + "epoch": 0.09407464409388226, + "flos": 533202766848.0, + "grad_norm": 0.05918319403016569, + "language_loss": 0.92944884, + "learning_rate": 0.0009892758244652718, + "loss": 0.94051951, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 0.35180664, + "step": 489, + "time_per_iteration": 2.660200357437134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091731, + "balance_loss_mlp": 1.05801892, + "epoch": 0.09426702577914582, + "flos": 586006700544.0, + "grad_norm": 0.041386989889926534, + "language_loss": 1.00010514, + "learning_rate": 0.0009892115512697968, + "loss": 1.01102245, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 0.33740234, + "step": 490, + "time_per_iteration": 2.6571907997131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108998, + "balance_loss_mlp": 1.05631554, + "epoch": 0.0944594074644094, + "flos": 503357524992.0, + "grad_norm": 0.04182034264497562, + "language_loss": 1.00108159, + "learning_rate": 0.0009891470881455537, + "loss": 1.01198137, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 0.33666992, + "step": 491, + "time_per_iteration": 2.746169328689575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108736, + "balance_loss_mlp": 1.05319476, + "epoch": 0.09465178914967295, + "flos": 571021125120.0, + "grad_norm": 0.0458284589248403, + "language_loss": 0.98654628, + "learning_rate": 0.0009890824351175692, + "loss": 0.99741989, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 0.34204102, + "step": 492, + "time_per_iteration": 2.665170431137085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090612, + "balance_loss_mlp": 1.05654192, + "epoch": 0.09484417083493651, + "flos": 549361916928.0, + "grad_norm": 0.041327442652051224, + "language_loss": 1.0219661, + "learning_rate": 0.0009890175922109435, + "loss": 1.0328722, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 0.34082031, + "step": 493, + "time_per_iteration": 2.6482973098754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010971, + "balance_loss_mlp": 1.06086028, + "epoch": 0.09503655252020007, + "flos": 823894944768.0, + "grad_norm": 0.06926989533772566, + "language_loss": 1.01090789, + "learning_rate": 0.0009889525594508513, + "loss": 1.02187896, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 0.36254883, + "step": 494, + "time_per_iteration": 3.0095505714416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092013, + "balance_loss_mlp": 1.05596447, + "epoch": 0.09522893420546363, + "flos": 404621153280.0, + "grad_norm": 0.04986765426945594, + "language_loss": 0.94310975, + "learning_rate": 0.0009888873368625404, + "loss": 0.95402986, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 0.3605957, + "step": 495, + "time_per_iteration": 2.5451042652130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089381, + "balance_loss_mlp": 1.05426204, + "epoch": 0.0954213158907272, + "flos": 691016583168.0, + "grad_norm": 0.05650320770937666, + "language_loss": 0.98877072, + "learning_rate": 0.0009888219244713326, + "loss": 0.99966443, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 0.3515625, + "step": 496, + "time_per_iteration": 2.8157310485839844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086342, + "balance_loss_mlp": 1.05100799, + "epoch": 0.09561369757599077, + "flos": 519005325312.0, + "grad_norm": 0.05039739829653265, + "language_loss": 0.99588835, + "learning_rate": 0.0009887563223026229, + "loss": 1.00675178, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 0.35375977, + "step": 497, + "time_per_iteration": 2.6563401222229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244906, + "balance_loss_mlp": 1.14648652, + "epoch": 0.09580607926125433, + "flos": 1385614812672.0, + "grad_norm": 0.01649790273231252, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.80313075, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 0.98046875, + "step": 498, + "time_per_iteration": 4.8689799308776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098776, + "balance_loss_mlp": 1.0630604, + "epoch": 0.09599846094651789, + "flos": 717436901376.0, + "grad_norm": 0.06260101269903841, + "language_loss": 0.97272921, + "learning_rate": 0.0009886245487346482, + "loss": 0.98371696, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 0.35742188, + "step": 499, + "time_per_iteration": 3.0292818546295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117577, + "balance_loss_mlp": 1.08159947, + "epoch": 0.09619084263178146, + "flos": 386038130688.0, + "grad_norm": 0.055723050712230264, + "language_loss": 1.00704551, + "learning_rate": 0.0009885583773865422, + "loss": 1.01822114, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 0.35986328, + "step": 500, + "time_per_iteration": 2.395846366882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117756, + "balance_loss_mlp": 1.08137345, + "epoch": 0.09638322431704502, + "flos": 534129292800.0, + "grad_norm": 0.06268683986847115, + "language_loss": 0.9714855, + "learning_rate": 0.0009884920163632524, + "loss": 0.98266304, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 0.36352539, + "step": 501, + "time_per_iteration": 2.666341781616211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111747, + "balance_loss_mlp": 1.07638931, + "epoch": 0.09657560600230858, + "flos": 500671922688.0, + "grad_norm": 0.04553274405873497, + "language_loss": 1.01245189, + "learning_rate": 0.000988425465690543, + "loss": 1.02356935, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 0.35375977, + "step": 502, + "time_per_iteration": 2.55082106590271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103127, + "balance_loss_mlp": 1.06867552, + "epoch": 0.09676798768757214, + "flos": 529261724160.0, + "grad_norm": 0.04373339165225573, + "language_loss": 0.99427342, + "learning_rate": 0.0009883587253942505, + "loss": 1.00530469, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 0.34472656, + "step": 503, + "time_per_iteration": 2.7674455642700195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108284, + "balance_loss_mlp": 1.07378531, + "epoch": 0.09696036937283571, + "flos": 463614534144.0, + "grad_norm": 0.051161986083573203, + "language_loss": 1.04393589, + "learning_rate": 0.0009882917955002862, + "loss": 1.05501866, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 0.3449707, + "step": 504, + "time_per_iteration": 2.549203872680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105617, + "balance_loss_mlp": 1.07116556, + "epoch": 0.09715275105809927, + "flos": 534974326272.0, + "grad_norm": 0.04840022534917253, + "language_loss": 0.95342839, + "learning_rate": 0.0009882246760346343, + "loss": 0.96448457, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 0.3449707, + "step": 505, + "time_per_iteration": 2.653627872467041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115925, + "balance_loss_mlp": 1.08128262, + "epoch": 0.09734513274336283, + "flos": 454946886144.0, + "grad_norm": 0.08271599518488834, + "language_loss": 1.02799106, + "learning_rate": 0.0009881573670233533, + "loss": 1.03915036, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 0.34692383, + "step": 506, + "time_per_iteration": 2.5279319286346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104761, + "balance_loss_mlp": 1.07061946, + "epoch": 0.09753751442862639, + "flos": 508805826048.0, + "grad_norm": 0.05291653517072512, + "language_loss": 0.96169406, + "learning_rate": 0.0009880898684925747, + "loss": 0.97274166, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 0.34179688, + "step": 507, + "time_per_iteration": 2.648574113845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095827, + "balance_loss_mlp": 1.06039834, + "epoch": 0.09772989611388996, + "flos": 484273064448.0, + "grad_norm": 0.053809005456099755, + "language_loss": 0.94680405, + "learning_rate": 0.0009880221804685037, + "loss": 0.95776224, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 0.35424805, + "step": 508, + "time_per_iteration": 2.529299736022949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245061, + "balance_loss_mlp": 1.15503371, + "epoch": 0.09792227779915352, + "flos": 1566106140672.0, + "grad_norm": 0.024665830319341657, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.80589479, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 0.8984375, + "step": 509, + "time_per_iteration": 4.705655574798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094606, + "balance_loss_mlp": 1.05932045, + "epoch": 0.09811465948441708, + "flos": 587805424128.0, + "grad_norm": 0.06644626598388864, + "language_loss": 1.02131915, + "learning_rate": 0.0009878862360456733, + "loss": 1.03226519, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 0.35327148, + "step": 510, + "time_per_iteration": 2.682035446166992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097961, + "balance_loss_mlp": 1.06336641, + "epoch": 0.09830704116968064, + "flos": 613000410624.0, + "grad_norm": 0.06543943311749917, + "language_loss": 0.9266718, + "learning_rate": 0.0009878179796996922, + "loss": 0.9376514, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 0.34619141, + "step": 511, + "time_per_iteration": 2.6972057819366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105256, + "balance_loss_mlp": 1.07030368, + "epoch": 0.09849942285494422, + "flos": 538808910336.0, + "grad_norm": 0.054213046356477584, + "language_loss": 0.96428764, + "learning_rate": 0.0009877495339659754, + "loss": 0.97534013, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 0.34985352, + "step": 512, + "time_per_iteration": 2.746337413787842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105714, + "balance_loss_mlp": 1.07190621, + "epoch": 0.09869180454020778, + "flos": 620474660352.0, + "grad_norm": 0.0573170093193853, + "language_loss": 0.91841626, + "learning_rate": 0.000987680898871096, + "loss": 0.9294734, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 0.33837891, + "step": 513, + "time_per_iteration": 2.7060482501983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110186, + "balance_loss_mlp": 1.07675993, + "epoch": 0.09888418622547133, + "flos": 811711342080.0, + "grad_norm": 0.0786420176645203, + "language_loss": 0.95400196, + "learning_rate": 0.0009876120744417, + "loss": 0.96510386, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 0.33447266, + "step": 514, + "time_per_iteration": 2.9473536014556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105767, + "balance_loss_mlp": 1.07071972, + "epoch": 0.0990765679107349, + "flos": 535809447936.0, + "grad_norm": 0.04861145683213968, + "language_loss": 1.01586378, + "learning_rate": 0.0009875430607045078, + "loss": 1.02692139, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 0.35058594, + "step": 515, + "time_per_iteration": 2.6745734214782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095325, + "balance_loss_mlp": 1.06044412, + "epoch": 0.09926894959599845, + "flos": 587879576064.0, + "grad_norm": 0.061184004848699555, + "language_loss": 0.96467805, + "learning_rate": 0.000987473857686313, + "loss": 0.97563124, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 0.34887695, + "step": 516, + "time_per_iteration": 2.70771861076355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103454, + "balance_loss_mlp": 1.06909752, + "epoch": 0.09946133128126203, + "flos": 641234506752.0, + "grad_norm": 0.06268031252544905, + "language_loss": 1.01795554, + "learning_rate": 0.0009874044654139824, + "loss": 1.02899015, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 0.34399414, + "step": 517, + "time_per_iteration": 2.7501027584075928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104488, + "balance_loss_mlp": 1.07020378, + "epoch": 0.09965371296652559, + "flos": 465781446144.0, + "grad_norm": 0.05802057466070587, + "language_loss": 1.01047516, + "learning_rate": 0.0009873348839144563, + "loss": 1.02152014, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 0.34301758, + "step": 518, + "time_per_iteration": 2.5247762203216553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125408, + "balance_loss_mlp": 1.09100425, + "epoch": 0.09984609465178915, + "flos": 483603499008.0, + "grad_norm": 0.057276560313135924, + "language_loss": 1.0153054, + "learning_rate": 0.000987265113214749, + "loss": 1.02655947, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 0.34448242, + "step": 519, + "time_per_iteration": 2.569776773452759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151248, + "balance_loss_mlp": 1.11705852, + "epoch": 0.1000384763370527, + "flos": 569029681152.0, + "grad_norm": 0.06886779278024428, + "language_loss": 1.05486548, + "learning_rate": 0.0009871951533419476, + "loss": 1.066378, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 0.34204102, + "step": 520, + "time_per_iteration": 2.646489381790161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155904, + "balance_loss_mlp": 1.12085652, + "epoch": 0.10023085802231628, + "flos": 545796403200.0, + "grad_norm": 0.06947260655531057, + "language_loss": 0.93715644, + "learning_rate": 0.0009871250043232132, + "loss": 0.94871557, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 0.35058594, + "step": 521, + "time_per_iteration": 2.729825258255005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145676, + "balance_loss_mlp": 1.11196363, + "epoch": 0.10042323970757984, + "flos": 503454071808.0, + "grad_norm": 0.05700460680955029, + "language_loss": 0.94319808, + "learning_rate": 0.0009870546661857797, + "loss": 0.95465487, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 0.33740234, + "step": 522, + "time_per_iteration": 2.589205026626587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113939, + "balance_loss_mlp": 1.10572577, + "epoch": 0.1006156213928434, + "flos": 770411533824.0, + "grad_norm": 0.0627280587118585, + "language_loss": 1.04607201, + "learning_rate": 0.0009869841389569553, + "loss": 1.05746591, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 0.33666992, + "step": 523, + "time_per_iteration": 3.007927656173706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112546, + "balance_loss_mlp": 1.07816648, + "epoch": 0.10080800307810696, + "flos": 490030083072.0, + "grad_norm": 0.07025860249961899, + "language_loss": 0.94709289, + "learning_rate": 0.0009869134226641206, + "loss": 0.95821834, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 0.34399414, + "step": 524, + "time_per_iteration": 2.5647661685943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096367, + "balance_loss_mlp": 1.06134343, + "epoch": 0.10100038476337053, + "flos": 454724430336.0, + "grad_norm": 0.0754869647085307, + "language_loss": 0.96719551, + "learning_rate": 0.0009868425173347303, + "loss": 0.97815919, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 0.3503418, + "step": 525, + "time_per_iteration": 2.675762414932251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081637, + "balance_loss_mlp": 1.04816294, + "epoch": 0.10119276644863409, + "flos": 556438242816.0, + "grad_norm": 0.04461045481777941, + "language_loss": 1.01427031, + "learning_rate": 0.0009867714229963125, + "loss": 1.02508664, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 0.3347168, + "step": 526, + "time_per_iteration": 2.7551424503326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101413, + "balance_loss_mlp": 1.06672287, + "epoch": 0.10138514813389765, + "flos": 516235659264.0, + "grad_norm": 0.06519670287778681, + "language_loss": 0.99495387, + "learning_rate": 0.000986700139676468, + "loss": 1.00596797, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 0.34716797, + "step": 527, + "time_per_iteration": 2.5689845085144043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111806, + "balance_loss_mlp": 1.08317983, + "epoch": 0.10157752981916121, + "flos": 500570606592.0, + "grad_norm": 0.055001529425537175, + "language_loss": 0.97175169, + "learning_rate": 0.0009866286674028717, + "loss": 0.98293233, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 0.34936523, + "step": 528, + "time_per_iteration": 2.6308236122131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118149, + "balance_loss_mlp": 1.08307743, + "epoch": 0.10176991150442478, + "flos": 656773277184.0, + "grad_norm": 0.06791274268555884, + "language_loss": 0.93964088, + "learning_rate": 0.0009865570062032717, + "loss": 0.95082229, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 0.35083008, + "step": 529, + "time_per_iteration": 2.931939125061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117806, + "balance_loss_mlp": 1.08104193, + "epoch": 0.10196229318968834, + "flos": 573259617792.0, + "grad_norm": 0.05469252484924326, + "language_loss": 0.97321147, + "learning_rate": 0.0009864851561054893, + "loss": 0.98438954, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 0.36743164, + "step": 530, + "time_per_iteration": 2.75875186920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091703, + "balance_loss_mlp": 1.0567745, + "epoch": 0.1021546748749519, + "flos": 518207279616.0, + "grad_norm": 0.053032092698093954, + "language_loss": 0.97237867, + "learning_rate": 0.0009864131171374191, + "loss": 0.9832958, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 0.34936523, + "step": 531, + "time_per_iteration": 2.671963930130005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109157, + "balance_loss_mlp": 1.05704737, + "epoch": 0.10234705656021546, + "flos": 609766009344.0, + "grad_norm": 0.037042660663456926, + "language_loss": 0.97530323, + "learning_rate": 0.0009863408893270292, + "loss": 0.98621887, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 0.34521484, + "step": 532, + "time_per_iteration": 2.8692965507507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080567, + "balance_loss_mlp": 1.0459249, + "epoch": 0.10253943824547904, + "flos": 601760586240.0, + "grad_norm": 0.045189468397627275, + "language_loss": 0.93818736, + "learning_rate": 0.0009862684727023605, + "loss": 0.94899297, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 0.34692383, + "step": 533, + "time_per_iteration": 2.768873691558838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084569, + "balance_loss_mlp": 1.04978406, + "epoch": 0.1027318199307426, + "flos": 662948043264.0, + "grad_norm": 0.041807858593286534, + "language_loss": 0.94846106, + "learning_rate": 0.0009861958672915283, + "loss": 0.95930672, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 0.34814453, + "step": 534, + "time_per_iteration": 2.7894833087921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088029, + "balance_loss_mlp": 1.05348206, + "epoch": 0.10292420161600616, + "flos": 683275461120.0, + "grad_norm": 0.04113334704287127, + "language_loss": 0.93477535, + "learning_rate": 0.0009861230731227201, + "loss": 0.94565558, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 0.34570312, + "step": 535, + "time_per_iteration": 2.8369100093841553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101084, + "balance_loss_mlp": 1.06589389, + "epoch": 0.10311658330126972, + "flos": 490287043584.0, + "grad_norm": 0.06472741174466715, + "language_loss": 0.9716177, + "learning_rate": 0.0009860500902241973, + "loss": 0.98262858, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 0.35205078, + "step": 536, + "time_per_iteration": 2.6308608055114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100072, + "balance_loss_mlp": 1.06559658, + "epoch": 0.10330896498653329, + "flos": 431743343616.0, + "grad_norm": 0.06015330648509861, + "language_loss": 1.02488375, + "learning_rate": 0.0009859769186242942, + "loss": 1.0358845, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 0.34521484, + "step": 537, + "time_per_iteration": 2.4846572875976562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094076, + "balance_loss_mlp": 1.06188989, + "epoch": 0.10350134667179685, + "flos": 549591713280.0, + "grad_norm": 0.04182272700248836, + "language_loss": 0.96166039, + "learning_rate": 0.0009859035583514187, + "loss": 0.97260106, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 0.32177734, + "step": 538, + "time_per_iteration": 2.665483236312866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107546, + "balance_loss_mlp": 1.07497787, + "epoch": 0.10369372835706041, + "flos": 640626610176.0, + "grad_norm": 0.03728554890083732, + "language_loss": 0.9932602, + "learning_rate": 0.0009858300094340517, + "loss": 1.00433564, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 0.32568359, + "step": 539, + "time_per_iteration": 2.772207021713257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110889, + "balance_loss_mlp": 1.07908368, + "epoch": 0.10388611004232397, + "flos": 521752969728.0, + "grad_norm": 0.05284254114338104, + "language_loss": 0.91679931, + "learning_rate": 0.0009857562719007473, + "loss": 0.92790818, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 0.31787109, + "step": 540, + "time_per_iteration": 2.633002519607544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110116, + "balance_loss_mlp": 1.06964111, + "epoch": 0.10407849172758753, + "flos": 702436644864.0, + "grad_norm": 0.07454941449424961, + "language_loss": 0.93962657, + "learning_rate": 0.0009856823457801331, + "loss": 0.95063812, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 0.31494141, + "step": 541, + "time_per_iteration": 2.888354539871216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098965, + "balance_loss_mlp": 1.06682634, + "epoch": 0.1042708734128511, + "flos": 502910415360.0, + "grad_norm": 0.06016078646373104, + "language_loss": 1.01014686, + "learning_rate": 0.00098560823110091, + "loss": 1.02113652, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 0.32128906, + "step": 542, + "time_per_iteration": 2.612365484237671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088807, + "balance_loss_mlp": 1.05664408, + "epoch": 0.10446325509811466, + "flos": 485592371712.0, + "grad_norm": 0.07331709746631812, + "language_loss": 0.99634022, + "learning_rate": 0.000985533927891851, + "loss": 1.00722837, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 0.3215332, + "step": 543, + "time_per_iteration": 2.6642584800720215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087349, + "balance_loss_mlp": 1.05406535, + "epoch": 0.10465563678337822, + "flos": 568634328576.0, + "grad_norm": 0.07406485241554656, + "language_loss": 0.99318308, + "learning_rate": 0.0009854594361818044, + "loss": 1.00405657, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 0.33300781, + "step": 544, + "time_per_iteration": 2.650541067123413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087044, + "balance_loss_mlp": 1.05357027, + "epoch": 0.10484801846864178, + "flos": 626093286912.0, + "grad_norm": 0.05515562757052397, + "language_loss": 0.98072803, + "learning_rate": 0.0009853847559996897, + "loss": 0.99159849, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 0.3347168, + "step": 545, + "time_per_iteration": 2.7268693447113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098973, + "balance_loss_mlp": 1.0640682, + "epoch": 0.10504040015390535, + "flos": 743412681216.0, + "grad_norm": 0.05014767442192859, + "language_loss": 0.9781934, + "learning_rate": 0.0009853098873745, + "loss": 0.98918307, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 0.34936523, + "step": 546, + "time_per_iteration": 3.001844644546509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094885, + "balance_loss_mlp": 1.06010008, + "epoch": 0.10523278183916891, + "flos": 586673694720.0, + "grad_norm": 0.06665960072991474, + "language_loss": 0.96499509, + "learning_rate": 0.0009852348303353027, + "loss": 0.97594392, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 0.34814453, + "step": 547, + "time_per_iteration": 2.7768120765686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109085, + "balance_loss_mlp": 1.05692363, + "epoch": 0.10542516352443247, + "flos": 869644574208.0, + "grad_norm": 0.04477171592325676, + "language_loss": 0.89746928, + "learning_rate": 0.000985159584911237, + "loss": 0.90837783, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 0.33959961, + "step": 548, + "time_per_iteration": 3.1397063732147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109064, + "balance_loss_mlp": 1.0567131, + "epoch": 0.10561754520969603, + "flos": 505428263424.0, + "grad_norm": 0.057455808878804256, + "language_loss": 0.97617745, + "learning_rate": 0.0009850841511315162, + "loss": 0.98708391, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 0.33959961, + "step": 549, + "time_per_iteration": 2.6143858432769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090343, + "balance_loss_mlp": 1.05660701, + "epoch": 0.1058099268949596, + "flos": 559981361664.0, + "grad_norm": 0.04134640300819554, + "language_loss": 0.97230792, + "learning_rate": 0.0009850085290254256, + "loss": 0.98321134, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 0.33740234, + "step": 550, + "time_per_iteration": 2.784057855606079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108804, + "balance_loss_mlp": 1.05478084, + "epoch": 0.10600230858022316, + "flos": 562049528832.0, + "grad_norm": 0.041486348142279396, + "language_loss": 0.9340632, + "learning_rate": 0.0009849327186223246, + "loss": 0.94494367, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 0.33276367, + "step": 551, + "time_per_iteration": 2.822755813598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086641, + "balance_loss_mlp": 1.0536921, + "epoch": 0.10619469026548672, + "flos": 494326831104.0, + "grad_norm": 0.044652358506572586, + "language_loss": 1.00453854, + "learning_rate": 0.000984856719951646, + "loss": 1.01540482, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 0.32958984, + "step": 552, + "time_per_iteration": 2.561384439468384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088507, + "balance_loss_mlp": 1.05577254, + "epoch": 0.10638707195075028, + "flos": 676166828544.0, + "grad_norm": 0.05595352831954139, + "language_loss": 0.98322356, + "learning_rate": 0.0009847805330428943, + "loss": 0.99410868, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 0.32739258, + "step": 553, + "time_per_iteration": 2.8988356590270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082477, + "balance_loss_mlp": 1.04940784, + "epoch": 0.10657945363601386, + "flos": 488055891456.0, + "grad_norm": 0.05618387686115577, + "language_loss": 1.02895415, + "learning_rate": 0.0009847041579256481, + "loss": 1.03977895, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 0.33081055, + "step": 554, + "time_per_iteration": 2.567885398864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088152, + "balance_loss_mlp": 1.05548859, + "epoch": 0.10677183532127742, + "flos": 482958526464.0, + "grad_norm": 0.04459262579832553, + "language_loss": 0.99802542, + "learning_rate": 0.0009846275946295592, + "loss": 1.00890684, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 0.32641602, + "step": 555, + "time_per_iteration": 2.6283833980560303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108533, + "balance_loss_mlp": 1.05347764, + "epoch": 0.10696421700654098, + "flos": 656249444352.0, + "grad_norm": 0.04108965909817336, + "language_loss": 0.92502242, + "learning_rate": 0.0009845508431843518, + "loss": 0.93587577, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 0.31835938, + "step": 556, + "time_per_iteration": 3.0189473628997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087957, + "balance_loss_mlp": 1.05612838, + "epoch": 0.10715659869180454, + "flos": 567744878592.0, + "grad_norm": 0.05029379164990677, + "language_loss": 0.95060432, + "learning_rate": 0.0009844739036198233, + "loss": 0.96148396, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 0.31811523, + "step": 557, + "time_per_iteration": 2.6461007595062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096142, + "balance_loss_mlp": 1.06340766, + "epoch": 0.10734898037706811, + "flos": 540694268928.0, + "grad_norm": 0.047100661757994676, + "language_loss": 1.0152961, + "learning_rate": 0.0009843967759658448, + "loss": 1.02625763, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 0.32739258, + "step": 558, + "time_per_iteration": 2.6677682399749756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264894, + "balance_loss_mlp": 1.19775486, + "epoch": 0.10754136206233167, + "flos": 1476640171008.0, + "grad_norm": 0.03689581784010691, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.74032652, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 0.671875, + "step": 559, + "time_per_iteration": 4.873044013977051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105652, + "balance_loss_mlp": 1.07234466, + "epoch": 0.10773374374759523, + "flos": 512405844480.0, + "grad_norm": 0.06480790167761245, + "language_loss": 1.01098323, + "learning_rate": 0.000984241956509384, + "loss": 1.02203977, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 0.33325195, + "step": 560, + "time_per_iteration": 2.655430555343628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095265, + "balance_loss_mlp": 1.0617907, + "epoch": 0.10792612543285879, + "flos": 496503654912.0, + "grad_norm": 0.05361377514900226, + "language_loss": 1.00074768, + "learning_rate": 0.0009841642647670078, + "loss": 1.01170027, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 0.33496094, + "step": 561, + "time_per_iteration": 2.5627329349517822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089424, + "balance_loss_mlp": 1.05633116, + "epoch": 0.10811850711812235, + "flos": 735471498240.0, + "grad_norm": 0.04993888185520414, + "language_loss": 0.93071151, + "learning_rate": 0.0009840863850553944, + "loss": 0.94160575, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 0.33105469, + "step": 562, + "time_per_iteration": 3.0020592212677 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108807, + "balance_loss_mlp": 1.05686092, + "epoch": 0.10831088880338592, + "flos": 611540140032.0, + "grad_norm": 0.046287089248472475, + "language_loss": 0.97956204, + "learning_rate": 0.0009840083174047782, + "loss": 0.99044275, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 0.31176758, + "step": 563, + "time_per_iteration": 2.7123258113861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093988, + "balance_loss_mlp": 1.06275535, + "epoch": 0.10850327048864948, + "flos": 556317103104.0, + "grad_norm": 0.036863902598139514, + "language_loss": 0.91394317, + "learning_rate": 0.0009839300618454685, + "loss": 0.92488301, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 0.31176758, + "step": 564, + "time_per_iteration": 2.855482578277588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086433, + "balance_loss_mlp": 1.05386496, + "epoch": 0.10869565217391304, + "flos": 603208373760.0, + "grad_norm": 0.0447892393855046, + "language_loss": 0.97269231, + "learning_rate": 0.0009838516184078466, + "loss": 0.98355657, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 0.32568359, + "step": 565, + "time_per_iteration": 2.8027093410491943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090742, + "balance_loss_mlp": 1.05881739, + "epoch": 0.1088880338591766, + "flos": 526178198016.0, + "grad_norm": 0.039430635834492286, + "language_loss": 0.95326865, + "learning_rate": 0.0009837729871223669, + "loss": 0.964176, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 0.3190918, + "step": 566, + "time_per_iteration": 2.621044158935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097443, + "balance_loss_mlp": 1.06473231, + "epoch": 0.10908041554444017, + "flos": 620272028160.0, + "grad_norm": 0.03524126234366562, + "language_loss": 0.96988255, + "learning_rate": 0.0009836941680195568, + "loss": 0.98085701, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 0.32714844, + "step": 567, + "time_per_iteration": 2.8241846561431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095182, + "balance_loss_mlp": 1.06359148, + "epoch": 0.10927279722970373, + "flos": 898125719040.0, + "grad_norm": 0.05940738915226433, + "language_loss": 0.94011569, + "learning_rate": 0.0009836151611300166, + "loss": 0.95106757, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 0.31567383, + "step": 568, + "time_per_iteration": 3.2259325981140137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093362, + "balance_loss_mlp": 1.06327355, + "epoch": 0.10946517891496729, + "flos": 528666310656.0, + "grad_norm": 0.04952949609465528, + "language_loss": 1.01886261, + "learning_rate": 0.0009835359664844194, + "loss": 1.02979624, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 0.30029297, + "step": 569, + "time_per_iteration": 2.61936616897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235986, + "balance_loss_mlp": 1.17113578, + "epoch": 0.10965756060023085, + "flos": 1560751815168.0, + "grad_norm": 0.02580255803672051, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.82272792, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 0.6484375, + "step": 570, + "time_per_iteration": 4.946800470352173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102328, + "balance_loss_mlp": 1.06947398, + "epoch": 0.10984994228549443, + "flos": 513075409920.0, + "grad_norm": 0.04088785760268294, + "language_loss": 0.98121774, + "learning_rate": 0.0009833770140481118, + "loss": 0.99224108, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 0.32861328, + "step": 571, + "time_per_iteration": 2.6676580905914307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103589, + "balance_loss_mlp": 1.07113993, + "epoch": 0.11004232397075799, + "flos": 954705139200.0, + "grad_norm": 0.04146527084622454, + "language_loss": 0.88084227, + "learning_rate": 0.000983297256319112, + "loss": 0.89187813, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 0.32446289, + "step": 572, + "time_per_iteration": 3.1977450847625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098726, + "balance_loss_mlp": 1.06503749, + "epoch": 0.11023470565602154, + "flos": 488181800448.0, + "grad_norm": 0.11112801331440751, + "language_loss": 0.93675387, + "learning_rate": 0.000983217310957477, + "loss": 0.94774115, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 0.33691406, + "step": 573, + "time_per_iteration": 2.771477222442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118998, + "balance_loss_mlp": 1.08530974, + "epoch": 0.1104270873412851, + "flos": 655814817792.0, + "grad_norm": 0.046936313049011164, + "language_loss": 0.98079342, + "learning_rate": 0.000983137177994244, + "loss": 0.99198341, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 0.3371582, + "step": 574, + "time_per_iteration": 2.842641830444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127826, + "balance_loss_mlp": 1.0945909, + "epoch": 0.11061946902654868, + "flos": 723426287616.0, + "grad_norm": 0.047970587572460185, + "language_loss": 0.91368234, + "learning_rate": 0.0009830568574605235, + "loss": 0.92496061, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 0.33227539, + "step": 575, + "time_per_iteration": 2.9841148853302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136053, + "balance_loss_mlp": 1.10260296, + "epoch": 0.11081185071181224, + "flos": 835463310336.0, + "grad_norm": 0.06212944390612344, + "language_loss": 0.95608473, + "learning_rate": 0.0009829763493874992, + "loss": 0.96744525, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 0.3347168, + "step": 576, + "time_per_iteration": 3.094599485397339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122949, + "balance_loss_mlp": 1.08918953, + "epoch": 0.1110042323970758, + "flos": 609076620288.0, + "grad_norm": 0.040009357062280086, + "language_loss": 1.0022918, + "learning_rate": 0.0009828956538064264, + "loss": 1.01352131, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 0.33764648, + "step": 577, + "time_per_iteration": 2.7913765907287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128001, + "balance_loss_mlp": 1.09428823, + "epoch": 0.11119661408233936, + "flos": 595922075136.0, + "grad_norm": 0.07834189266391174, + "language_loss": 0.97103804, + "learning_rate": 0.0009828147707486344, + "loss": 0.98231804, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 0.33740234, + "step": 578, + "time_per_iteration": 2.6967506408691406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099492, + "balance_loss_mlp": 1.0659467, + "epoch": 0.11138899576760293, + "flos": 555835488768.0, + "grad_norm": 0.066476002167881, + "language_loss": 0.94244707, + "learning_rate": 0.0009827337002455245, + "loss": 0.95344198, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 0.33544922, + "step": 579, + "time_per_iteration": 2.6212143898010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010909, + "balance_loss_mlp": 1.05940461, + "epoch": 0.11158137745286649, + "flos": 689746461696.0, + "grad_norm": 0.0598380025645264, + "language_loss": 0.93403691, + "learning_rate": 0.0009826524423285712, + "loss": 0.94494587, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 0.31469727, + "step": 580, + "time_per_iteration": 2.916363000869751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086732, + "balance_loss_mlp": 1.05466461, + "epoch": 0.11177375913813005, + "flos": 763011436032.0, + "grad_norm": 0.051352596452175936, + "language_loss": 0.95457065, + "learning_rate": 0.0009825709970293218, + "loss": 0.96543789, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 0.32055664, + "step": 581, + "time_per_iteration": 2.975459575653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094975, + "balance_loss_mlp": 1.06414759, + "epoch": 0.11196614082339361, + "flos": 806574329856.0, + "grad_norm": 0.06330579048660655, + "language_loss": 1.01360774, + "learning_rate": 0.0009824893643793956, + "loss": 1.02455735, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 0.30810547, + "step": 582, + "time_per_iteration": 3.0850436687469482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109989, + "balance_loss_mlp": 1.06772757, + "epoch": 0.11215852250865718, + "flos": 558624978432.0, + "grad_norm": 0.05517621871728721, + "language_loss": 0.96568394, + "learning_rate": 0.0009824075444104857, + "loss": 0.9766829, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 0.3215332, + "step": 583, + "time_per_iteration": 2.7017738819122314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104353, + "balance_loss_mlp": 1.07214284, + "epoch": 0.11235090419392074, + "flos": 513572078592.0, + "grad_norm": 0.05273776870459213, + "language_loss": 1.00669086, + "learning_rate": 0.000982325537154357, + "loss": 1.01773441, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 0.32202148, + "step": 584, + "time_per_iteration": 2.566066265106201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109419, + "balance_loss_mlp": 1.07768583, + "epoch": 0.1125432858791843, + "flos": 491453277696.0, + "grad_norm": 0.05755454669423396, + "language_loss": 1.01869726, + "learning_rate": 0.0009822433426428484, + "loss": 1.02979159, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 0.31713867, + "step": 585, + "time_per_iteration": 2.611968994140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122157, + "balance_loss_mlp": 1.08987498, + "epoch": 0.11273566756444786, + "flos": 510725689344.0, + "grad_norm": 0.06034275506000564, + "language_loss": 0.93750811, + "learning_rate": 0.0009821609609078697, + "loss": 0.94872963, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 0.32275391, + "step": 586, + "time_per_iteration": 2.584847927093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104971, + "balance_loss_mlp": 1.0726887, + "epoch": 0.11292804924971142, + "flos": 622446280704.0, + "grad_norm": 0.06416707827025614, + "language_loss": 0.95279968, + "learning_rate": 0.0009820783919814045, + "loss": 0.96384937, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 0.32275391, + "step": 587, + "time_per_iteration": 2.7885184288024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096142, + "balance_loss_mlp": 1.06359744, + "epoch": 0.113120430934975, + "flos": 478056453120.0, + "grad_norm": 0.049104346633589514, + "language_loss": 0.92135406, + "learning_rate": 0.0009819956358955095, + "loss": 0.93231547, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 0.32543945, + "step": 588, + "time_per_iteration": 2.560117483139038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086794, + "balance_loss_mlp": 1.05427432, + "epoch": 0.11331281262023855, + "flos": 467039084544.0, + "grad_norm": 0.05114307144868452, + "language_loss": 0.93675017, + "learning_rate": 0.0009819126926823127, + "loss": 0.94761813, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 0.32519531, + "step": 589, + "time_per_iteration": 2.517035722732544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093229, + "balance_loss_mlp": 1.05966008, + "epoch": 0.11350519430550211, + "flos": 650453151744.0, + "grad_norm": 0.04613241529975588, + "language_loss": 0.94437975, + "learning_rate": 0.000981829562374016, + "loss": 0.95531201, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 0.33569336, + "step": 590, + "time_per_iteration": 2.8174262046813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093153, + "balance_loss_mlp": 1.05913091, + "epoch": 0.11369757599076567, + "flos": 557809680384.0, + "grad_norm": 0.05348492004263644, + "language_loss": 1.04949331, + "learning_rate": 0.0009817462450028933, + "loss": 1.0604248, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 0.34057617, + "step": 591, + "time_per_iteration": 2.6302859783172607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101242, + "balance_loss_mlp": 1.0668143, + "epoch": 0.11388995767602925, + "flos": 571080222720.0, + "grad_norm": 0.2030818500746725, + "language_loss": 0.92329478, + "learning_rate": 0.0009816627406012916, + "loss": 0.93430716, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 0.34472656, + "step": 592, + "time_per_iteration": 2.8384313583374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135985, + "balance_loss_mlp": 1.09943521, + "epoch": 0.1140823393612928, + "flos": 740403307008.0, + "grad_norm": 0.0774704650100976, + "language_loss": 0.91851664, + "learning_rate": 0.0009815790492016295, + "loss": 0.92987645, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 0.36523438, + "step": 593, + "time_per_iteration": 2.9409682750701904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136834, + "balance_loss_mlp": 1.10192943, + "epoch": 0.11427472104655637, + "flos": 699004753920.0, + "grad_norm": 0.09332707993556091, + "language_loss": 0.94690275, + "learning_rate": 0.0009814951708363993, + "loss": 0.95827115, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 0.34912109, + "step": 594, + "time_per_iteration": 2.8599631786346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221657, + "balance_loss_mlp": 1.16023993, + "epoch": 0.11446710273181993, + "flos": 1477178684928.0, + "grad_norm": 0.030934197408724044, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79212642, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 0.61328125, + "step": 595, + "time_per_iteration": 4.801583766937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137315, + "balance_loss_mlp": 1.10138512, + "epoch": 0.1146594844170835, + "flos": 494895080448.0, + "grad_norm": 0.0746127254366864, + "language_loss": 0.94972038, + "learning_rate": 0.0009813268533395648, + "loss": 0.96109354, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 0.359375, + "step": 596, + "time_per_iteration": 2.6236753463745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123077, + "balance_loss_mlp": 1.0882678, + "epoch": 0.11485186610234706, + "flos": 474834534912.0, + "grad_norm": 0.061536990211155544, + "language_loss": 0.95371294, + "learning_rate": 0.0009812424142733073, + "loss": 0.96494377, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 0.34765625, + "step": 597, + "time_per_iteration": 2.5663998126983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108592, + "balance_loss_mlp": 1.07387781, + "epoch": 0.11504424778761062, + "flos": 731209254912.0, + "grad_norm": 0.04795398370622496, + "language_loss": 0.91199464, + "learning_rate": 0.000981157788372175, + "loss": 0.92308056, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 0.34716797, + "step": 598, + "time_per_iteration": 3.004436492919922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110274, + "balance_loss_mlp": 1.06864619, + "epoch": 0.11523662947287418, + "flos": 545823567360.0, + "grad_norm": 0.04762632796488997, + "language_loss": 0.94997883, + "learning_rate": 0.0009810729756690223, + "loss": 0.96100628, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 0.34106445, + "step": 599, + "time_per_iteration": 2.704676628112793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104593, + "balance_loss_mlp": 1.06947374, + "epoch": 0.11542901115813775, + "flos": 775066558464.0, + "grad_norm": 0.06699944809564747, + "language_loss": 0.98224139, + "learning_rate": 0.0009809879761967766, + "loss": 0.99328732, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 0.35107422, + "step": 600, + "time_per_iteration": 2.953348159790039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113963, + "balance_loss_mlp": 1.07922578, + "epoch": 0.11562139284340131, + "flos": 730910449152.0, + "grad_norm": 0.06801646297960097, + "language_loss": 0.96874714, + "learning_rate": 0.0009809027899884378, + "loss": 0.97988677, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 0.34765625, + "step": 601, + "time_per_iteration": 2.896559953689575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104267, + "balance_loss_mlp": 1.07014918, + "epoch": 0.11581377452866487, + "flos": 535878457344.0, + "grad_norm": 0.062436318450634756, + "language_loss": 0.9484992, + "learning_rate": 0.0009808174170770779, + "loss": 0.95954192, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 0.34130859, + "step": 602, + "time_per_iteration": 2.814558982849121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220455, + "balance_loss_mlp": 1.16704941, + "epoch": 0.11600615621392843, + "flos": 1555814863872.0, + "grad_norm": 0.025680107820064087, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.86118698, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 0.53515625, + "step": 603, + "time_per_iteration": 4.897503614425659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118739, + "balance_loss_mlp": 1.08566999, + "epoch": 0.116198537899192, + "flos": 537435274752.0, + "grad_norm": 0.05533944227900463, + "language_loss": 1.0028702, + "learning_rate": 0.0009806461112779462, + "loss": 1.01405764, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 0.33081055, + "step": 604, + "time_per_iteration": 2.6172194480895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115094, + "balance_loss_mlp": 1.08281231, + "epoch": 0.11639091958445556, + "flos": 454203168768.0, + "grad_norm": 0.07231087595972972, + "language_loss": 0.97971618, + "learning_rate": 0.0009805601784566814, + "loss": 0.99086702, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 0.32250977, + "step": 605, + "time_per_iteration": 2.4791650772094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125941, + "balance_loss_mlp": 1.09208584, + "epoch": 0.11658330126971912, + "flos": 555081859584.0, + "grad_norm": 0.06015253149930396, + "language_loss": 1.02430916, + "learning_rate": 0.0009804740590654089, + "loss": 1.03556848, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 0.33862305, + "step": 606, + "time_per_iteration": 2.614476442337036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124787, + "balance_loss_mlp": 1.09229016, + "epoch": 0.11677568295498268, + "flos": 716340049920.0, + "grad_norm": 0.08034134565527169, + "language_loss": 0.97153747, + "learning_rate": 0.0009803877531375635, + "loss": 0.9827854, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 0.32495117, + "step": 607, + "time_per_iteration": 2.851011276245117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128193, + "balance_loss_mlp": 1.09228706, + "epoch": 0.11696806464024626, + "flos": 609758668800.0, + "grad_norm": 0.05400582488055185, + "language_loss": 0.97512484, + "learning_rate": 0.0009803012607066523, + "loss": 0.9864068, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 0.35913086, + "step": 608, + "time_per_iteration": 2.700596570968628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128537, + "balance_loss_mlp": 1.09294093, + "epoch": 0.11716044632550981, + "flos": 520384103424.0, + "grad_norm": 0.15792902837654846, + "language_loss": 0.95375645, + "learning_rate": 0.0009802145818062543, + "loss": 0.96504182, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 0.35620117, + "step": 609, + "time_per_iteration": 2.693417549133301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123637, + "balance_loss_mlp": 1.08742094, + "epoch": 0.11735282801077337, + "flos": 507493859328.0, + "grad_norm": 0.06851059455565046, + "language_loss": 0.99132365, + "learning_rate": 0.0009801277164700212, + "loss": 1.00256002, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 0.36254883, + "step": 610, + "time_per_iteration": 2.5825185775756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131797, + "balance_loss_mlp": 1.09541452, + "epoch": 0.11754520969603693, + "flos": 686638342656.0, + "grad_norm": 0.1113382534985323, + "language_loss": 0.96033651, + "learning_rate": 0.0009800406647316776, + "loss": 0.97165447, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 0.36376953, + "step": 611, + "time_per_iteration": 2.8625166416168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231096, + "balance_loss_mlp": 1.18112373, + "epoch": 0.1177375913813005, + "flos": 1542487421952.0, + "grad_norm": 0.03346184177846584, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.78145558, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 0.49804688, + "step": 612, + "time_per_iteration": 4.748431444168091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137214, + "balance_loss_mlp": 1.09880471, + "epoch": 0.11792997306656407, + "flos": 520522495488.0, + "grad_norm": 0.07612220197102978, + "language_loss": 0.95326376, + "learning_rate": 0.000979866002183916, + "loss": 0.96463591, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 0.38378906, + "step": 613, + "time_per_iteration": 2.6311473846435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155666, + "balance_loss_mlp": 1.11482501, + "epoch": 0.11812235475182763, + "flos": 666281189376.0, + "grad_norm": 0.0832714106614858, + "language_loss": 0.96221644, + "learning_rate": 0.0009797783914423082, + "loss": 0.97377312, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 0.40844727, + "step": 614, + "time_per_iteration": 2.8568782806396484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126933, + "balance_loss_mlp": 1.08721232, + "epoch": 0.11831473643709119, + "flos": 621317122560.0, + "grad_norm": 0.08355321383380138, + "language_loss": 0.91733479, + "learning_rate": 0.0009796905944342094, + "loss": 0.92860413, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 0.3972168, + "step": 615, + "time_per_iteration": 2.8348331451416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113873, + "balance_loss_mlp": 1.07517743, + "epoch": 0.11850711812235475, + "flos": 456688710144.0, + "grad_norm": 0.05175964705030883, + "language_loss": 0.94486296, + "learning_rate": 0.0009796026111937057, + "loss": 0.9560017, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 0.38671875, + "step": 616, + "time_per_iteration": 2.609276056289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111065, + "balance_loss_mlp": 1.07393384, + "epoch": 0.11869949980761832, + "flos": 513863543808.0, + "grad_norm": 0.1779679576065946, + "language_loss": 0.94108498, + "learning_rate": 0.0009795144417549552, + "loss": 0.95219147, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 0.3671875, + "step": 617, + "time_per_iteration": 2.7469558715820312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114581, + "balance_loss_mlp": 1.07760203, + "epoch": 0.11889188149288188, + "flos": 535016171520.0, + "grad_norm": 0.0639893702788804, + "language_loss": 0.95137906, + "learning_rate": 0.0009794260861521883, + "loss": 0.96252483, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 0.36987305, + "step": 618, + "time_per_iteration": 2.779780387878418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125632, + "balance_loss_mlp": 1.08908224, + "epoch": 0.11908426317814544, + "flos": 498603755520.0, + "grad_norm": 0.062080445707157726, + "language_loss": 0.94238096, + "learning_rate": 0.0009793375444197075, + "loss": 0.95363724, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 0.3659668, + "step": 619, + "time_per_iteration": 2.6269500255584717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159762, + "balance_loss_mlp": 1.12132859, + "epoch": 0.119276644863409, + "flos": 659891681280.0, + "grad_norm": 0.05728911446624217, + "language_loss": 0.93181753, + "learning_rate": 0.000979248816591888, + "loss": 0.94341516, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 0.38452148, + "step": 620, + "time_per_iteration": 2.7879464626312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155629, + "balance_loss_mlp": 1.11600351, + "epoch": 0.11946902654867257, + "flos": 758746621440.0, + "grad_norm": 0.05539388103354017, + "language_loss": 0.93241715, + "learning_rate": 0.0009791599027031766, + "loss": 0.94397342, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 0.39624023, + "step": 621, + "time_per_iteration": 3.058497667312622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152711, + "balance_loss_mlp": 1.11439681, + "epoch": 0.11966140823393613, + "flos": 680999892480.0, + "grad_norm": 0.05959109763307043, + "language_loss": 0.93889141, + "learning_rate": 0.0009790708027880932, + "loss": 0.95041847, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 0.38330078, + "step": 622, + "time_per_iteration": 2.857905864715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217773, + "balance_loss_mlp": 1.17447615, + "epoch": 0.11985378991919969, + "flos": 1451071853568.0, + "grad_norm": 0.033264976771994935, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.78645062, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 0.43359375, + "step": 623, + "time_per_iteration": 4.817517518997192 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130228, + "balance_loss_mlp": 1.09372652, + "epoch": 0.12004617160446325, + "flos": 527848441344.0, + "grad_norm": 0.07130736684785184, + "language_loss": 0.99442542, + "learning_rate": 0.0009788920450172487, + "loss": 1.00572777, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 0.36499023, + "step": 624, + "time_per_iteration": 2.6089231967926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134217, + "balance_loss_mlp": 1.0987401, + "epoch": 0.12023855328972682, + "flos": 474219297792.0, + "grad_norm": 0.053387747347518576, + "language_loss": 0.97139525, + "learning_rate": 0.0009788023872308875, + "loss": 0.98273742, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 0.35522461, + "step": 625, + "time_per_iteration": 2.5482659339904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171918, + "balance_loss_mlp": 1.12614214, + "epoch": 0.12043093497499038, + "flos": 1531771430400.0, + "grad_norm": 0.016755812295179123, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.76600921, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 0.45703125, + "step": 626, + "time_per_iteration": 4.767898797988892 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142548, + "balance_loss_mlp": 1.10609388, + "epoch": 0.12062331666025394, + "flos": 539839323648.0, + "grad_norm": 0.053046953839951706, + "language_loss": 0.99526918, + "learning_rate": 0.0009786225140303285, + "loss": 1.00669467, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 0.36425781, + "step": 627, + "time_per_iteration": 2.666975975036621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145866, + "balance_loss_mlp": 1.10974586, + "epoch": 0.1208156983455175, + "flos": 511906604544.0, + "grad_norm": 0.06539343990980159, + "language_loss": 0.97403502, + "learning_rate": 0.0009785322986859634, + "loss": 0.98549366, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 0.36132812, + "step": 628, + "time_per_iteration": 2.6613006591796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116443, + "balance_loss_mlp": 1.12830925, + "epoch": 0.12100808003078108, + "flos": 596473072128.0, + "grad_norm": 0.05337423256033143, + "language_loss": 0.99038112, + "learning_rate": 0.0009784418975588838, + "loss": 1.00202537, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 0.36108398, + "step": 629, + "time_per_iteration": 2.7266693115234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148011, + "balance_loss_mlp": 1.11248696, + "epoch": 0.12120046171604464, + "flos": 522970960896.0, + "grad_norm": 0.06598420413892771, + "language_loss": 0.97636682, + "learning_rate": 0.0009783513106841862, + "loss": 0.98784697, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 0.35522461, + "step": 630, + "time_per_iteration": 2.7734336853027344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122661, + "balance_loss_mlp": 1.17663717, + "epoch": 0.1213928434013082, + "flos": 1554463249920.0, + "grad_norm": 0.0364602282496576, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.77959311, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 0.5, + "step": 631, + "time_per_iteration": 4.955650091171265 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118689, + "balance_loss_mlp": 1.08283055, + "epoch": 0.12158522508657175, + "flos": 495391749120.0, + "grad_norm": 0.061523486228641615, + "language_loss": 0.94419873, + "learning_rate": 0.0009781695798326854, + "loss": 0.95538557, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 0.35888672, + "step": 632, + "time_per_iteration": 2.6072514057159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111815, + "balance_loss_mlp": 1.08319819, + "epoch": 0.12177760677183531, + "flos": 475585592832.0, + "grad_norm": 0.05761126083629287, + "language_loss": 0.93996418, + "learning_rate": 0.0009780784359264365, + "loss": 0.95114571, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 0.34985352, + "step": 633, + "time_per_iteration": 2.6186299324035645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201074, + "balance_loss_mlp": 1.15548825, + "epoch": 0.12196998845709889, + "flos": 1468458906624.0, + "grad_norm": 0.024414945484573326, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.75389773, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 0.45507812, + "step": 634, + "time_per_iteration": 4.757866144180298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090802, + "balance_loss_mlp": 1.05732846, + "epoch": 0.12216237014236245, + "flos": 586572378624.0, + "grad_norm": 0.05071444395915749, + "language_loss": 0.91919303, + "learning_rate": 0.000977895591329867, + "loss": 0.93010104, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 0.3347168, + "step": 635, + "time_per_iteration": 2.7802233695983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094425, + "balance_loss_mlp": 1.06006885, + "epoch": 0.12235475182762601, + "flos": 597997582848.0, + "grad_norm": 0.05652682698430024, + "language_loss": 0.93613631, + "learning_rate": 0.000977803890710533, + "loss": 0.94708061, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 0.34399414, + "step": 636, + "time_per_iteration": 2.719989538192749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109182, + "balance_loss_mlp": 1.0546267, + "epoch": 0.12254713351288957, + "flos": 497741469696.0, + "grad_norm": 0.05019916823038997, + "language_loss": 0.97873759, + "learning_rate": 0.0009777120045912774, + "loss": 0.98965579, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 0.37231445, + "step": 637, + "time_per_iteration": 2.5960683822631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099212, + "balance_loss_mlp": 1.06139851, + "epoch": 0.12273951519815314, + "flos": 605847361536.0, + "grad_norm": 0.05186361253186237, + "language_loss": 0.97095829, + "learning_rate": 0.0009776199330077736, + "loss": 0.9819504, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 0.37841797, + "step": 638, + "time_per_iteration": 2.7152581214904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088192, + "balance_loss_mlp": 1.05121303, + "epoch": 0.1229318968834167, + "flos": 597859190784.0, + "grad_norm": 0.05467339203371928, + "language_loss": 0.99154645, + "learning_rate": 0.0009775276759957667, + "loss": 1.00242841, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 0.37011719, + "step": 639, + "time_per_iteration": 2.6985981464385986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090176, + "balance_loss_mlp": 1.05465198, + "epoch": 0.12312427856868026, + "flos": 678383299584.0, + "grad_norm": 0.06600893718108056, + "language_loss": 0.97933781, + "learning_rate": 0.0009774352335910745, + "loss": 0.99023956, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 0.35546875, + "step": 640, + "time_per_iteration": 2.813744306564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086602, + "balance_loss_mlp": 1.05298471, + "epoch": 0.12331666025394382, + "flos": 608933458944.0, + "grad_norm": 0.05927901471916764, + "language_loss": 0.99468219, + "learning_rate": 0.000977342605829586, + "loss": 1.00554824, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 0.33642578, + "step": 641, + "time_per_iteration": 2.73280668258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110577, + "balance_loss_mlp": 1.07240582, + "epoch": 0.12350904193920739, + "flos": 762504855552.0, + "grad_norm": 0.07046674646118828, + "language_loss": 0.92099506, + "learning_rate": 0.0009772497927472623, + "loss": 0.93210077, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 0.38183594, + "step": 642, + "time_per_iteration": 3.1258397102355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134817, + "balance_loss_mlp": 1.09514427, + "epoch": 0.12370142362447095, + "flos": 540968481792.0, + "grad_norm": 0.07438352262018386, + "language_loss": 0.93366879, + "learning_rate": 0.0009771567943801368, + "loss": 0.94501698, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 0.3972168, + "step": 643, + "time_per_iteration": 2.6720776557922363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149366, + "balance_loss_mlp": 1.10912085, + "epoch": 0.12389380530973451, + "flos": 548128871424.0, + "grad_norm": 0.055730629552303436, + "language_loss": 0.96261084, + "learning_rate": 0.0009770636107643152, + "loss": 0.97410446, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 0.40234375, + "step": 644, + "time_per_iteration": 2.7093722820281982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144915, + "balance_loss_mlp": 1.10734022, + "epoch": 0.12408618699499807, + "flos": 540308828160.0, + "grad_norm": 0.05250459899213186, + "language_loss": 0.92937833, + "learning_rate": 0.0009769702419359738, + "loss": 0.94082749, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 0.37597656, + "step": 645, + "time_per_iteration": 2.661512613296509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173428, + "balance_loss_mlp": 1.13146591, + "epoch": 0.12427856868026164, + "flos": 745792137216.0, + "grad_norm": 0.052890865129340166, + "language_loss": 0.94770992, + "learning_rate": 0.000976876687931362, + "loss": 0.95944417, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 0.41943359, + "step": 646, + "time_per_iteration": 2.972522258758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164317, + "balance_loss_mlp": 1.12555003, + "epoch": 0.1244709503655252, + "flos": 533716687872.0, + "grad_norm": 0.07033761546633982, + "language_loss": 0.91270661, + "learning_rate": 0.0009767829487868005, + "loss": 0.92434984, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 0.38769531, + "step": 647, + "time_per_iteration": 2.6150805950164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164183, + "balance_loss_mlp": 1.12281775, + "epoch": 0.12466333205078876, + "flos": 508099184640.0, + "grad_norm": 0.07269814667774141, + "language_loss": 0.95938772, + "learning_rate": 0.000976689024538682, + "loss": 0.97102952, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 0.41381836, + "step": 648, + "time_per_iteration": 2.6567764282226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154104, + "balance_loss_mlp": 1.11497951, + "epoch": 0.12485571373605232, + "flos": 681345686016.0, + "grad_norm": 0.06659282576896536, + "language_loss": 0.94783676, + "learning_rate": 0.0009765949152234716, + "loss": 0.95937783, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 0.39135742, + "step": 649, + "time_per_iteration": 2.9032628536224365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118823, + "balance_loss_mlp": 1.15084565, + "epoch": 0.1250480954213159, + "flos": 1330159781376.0, + "grad_norm": 0.027365485913225348, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.79874313, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 0.37304688, + "step": 650, + "time_per_iteration": 4.6781816482543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145487, + "balance_loss_mlp": 1.10395491, + "epoch": 0.12524047710657946, + "flos": 938550758400.0, + "grad_norm": 0.07758701561639549, + "language_loss": 0.88880539, + "learning_rate": 0.0009764061415379919, + "loss": 0.90026021, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 0.41552734, + "step": 651, + "time_per_iteration": 3.2588987350463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137454, + "balance_loss_mlp": 1.09766221, + "epoch": 0.12543285879184302, + "flos": 513893279232.0, + "grad_norm": 0.08409279007421946, + "language_loss": 0.94380724, + "learning_rate": 0.0009763114772410109, + "loss": 0.95518184, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 0.39794922, + "step": 652, + "time_per_iteration": 2.5698702335357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112103, + "balance_loss_mlp": 1.08359814, + "epoch": 0.12562524047710658, + "flos": 718328922624.0, + "grad_norm": 0.056536251661147445, + "language_loss": 0.92061114, + "learning_rate": 0.0009762166280235146, + "loss": 0.93182147, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 0.37451172, + "step": 653, + "time_per_iteration": 2.938668966293335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117513, + "balance_loss_mlp": 1.08191729, + "epoch": 0.12581762216237014, + "flos": 563712431616.0, + "grad_norm": 0.0771848817407848, + "language_loss": 0.94092464, + "learning_rate": 0.0009761215939223267, + "loss": 0.95209974, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 0.35644531, + "step": 654, + "time_per_iteration": 2.7028610706329346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102912, + "balance_loss_mlp": 1.06834149, + "epoch": 0.1260100038476337, + "flos": 481893608448.0, + "grad_norm": 0.07424845664771389, + "language_loss": 0.9475044, + "learning_rate": 0.0009760263749743428, + "loss": 0.95853353, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 0.34570312, + "step": 655, + "time_per_iteration": 2.5710902214050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101838, + "balance_loss_mlp": 1.06771994, + "epoch": 0.12620238553289725, + "flos": 575555010048.0, + "grad_norm": 0.053259035011575195, + "language_loss": 0.94285154, + "learning_rate": 0.0009759309712165299, + "loss": 0.95386994, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 0.34130859, + "step": 656, + "time_per_iteration": 2.70626163482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101868, + "balance_loss_mlp": 1.06858444, + "epoch": 0.12639476721816084, + "flos": 531164335104.0, + "grad_norm": 0.0693418830287988, + "language_loss": 0.9812479, + "learning_rate": 0.0009758353826859272, + "loss": 0.99226654, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 0.33300781, + "step": 657, + "time_per_iteration": 2.566787004470825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110312, + "balance_loss_mlp": 1.0663563, + "epoch": 0.1265871489034244, + "flos": 689968917504.0, + "grad_norm": 0.06782991509763603, + "language_loss": 0.96008623, + "learning_rate": 0.0009757396094196456, + "loss": 0.97111744, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 0.36791992, + "step": 658, + "time_per_iteration": 2.8277065753936768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115911, + "balance_loss_mlp": 1.07926583, + "epoch": 0.12677953058868796, + "flos": 537138667008.0, + "grad_norm": 0.053606842709613675, + "language_loss": 0.89398581, + "learning_rate": 0.0009756436514548673, + "loss": 0.90514493, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 0.36645508, + "step": 659, + "time_per_iteration": 2.796175718307495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120986, + "balance_loss_mlp": 1.0811224, + "epoch": 0.12697191227395152, + "flos": 519022577664.0, + "grad_norm": 0.060525818769901533, + "language_loss": 0.92384607, + "learning_rate": 0.0009755475088288466, + "loss": 0.93505597, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 0.39916992, + "step": 660, + "time_per_iteration": 2.678682804107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133341, + "balance_loss_mlp": 1.09271395, + "epoch": 0.12716429395921508, + "flos": 566605808640.0, + "grad_norm": 0.08191197530717065, + "language_loss": 0.958794, + "learning_rate": 0.0009754511815789095, + "loss": 0.97012746, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 0.40600586, + "step": 661, + "time_per_iteration": 2.7371177673339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130898, + "balance_loss_mlp": 1.09093928, + "epoch": 0.12735667564447864, + "flos": 514103251968.0, + "grad_norm": 0.08687138171908054, + "language_loss": 0.92166948, + "learning_rate": 0.0009753546697424533, + "loss": 0.93297845, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 0.39941406, + "step": 662, + "time_per_iteration": 2.704432249069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125889, + "balance_loss_mlp": 1.08700323, + "epoch": 0.1275490573297422, + "flos": 541282341888.0, + "grad_norm": 0.06194581367760624, + "language_loss": 0.95628935, + "learning_rate": 0.0009752579733569475, + "loss": 0.96754825, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 0.38891602, + "step": 663, + "time_per_iteration": 2.682892084121704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165087, + "balance_loss_mlp": 1.1326623, + "epoch": 0.12774143901500576, + "flos": 1558700900352.0, + "grad_norm": 0.0245621431528993, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.76046479, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 0.32421875, + "step": 664, + "time_per_iteration": 4.981603622436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146598, + "balance_loss_mlp": 1.1060189, + "epoch": 0.12793382070026935, + "flos": 613744128000.0, + "grad_norm": 0.07818489478946229, + "language_loss": 0.96962506, + "learning_rate": 0.0009750640270890217, + "loss": 0.98109102, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 0.40576172, + "step": 665, + "time_per_iteration": 2.7139556407928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139516, + "balance_loss_mlp": 1.10115409, + "epoch": 0.1281262023855329, + "flos": 707731499520.0, + "grad_norm": 0.10418725554084544, + "language_loss": 1.02824736, + "learning_rate": 0.0009749667772818983, + "loss": 1.03964257, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 0.38354492, + "step": 666, + "time_per_iteration": 3.000227689743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148195, + "balance_loss_mlp": 1.11481678, + "epoch": 0.12831858407079647, + "flos": 1425034404864.0, + "grad_norm": 0.027847994605201966, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.78084135, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 0.33398438, + "step": 667, + "time_per_iteration": 4.858838319778442 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161858, + "balance_loss_mlp": 1.1255703, + "epoch": 0.12851096575606002, + "flos": 449098463232.0, + "grad_norm": 0.0747922247275706, + "language_loss": 1.00932169, + "learning_rate": 0.0009747717245101093, + "loss": 1.0209403, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 0.36303711, + "step": 668, + "time_per_iteration": 2.4917514324188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172854, + "balance_loss_mlp": 1.13518405, + "epoch": 0.12870334744132358, + "flos": 479939240448.0, + "grad_norm": 0.0795363237311063, + "language_loss": 0.91087645, + "learning_rate": 0.00097467392162117, + "loss": 0.92260504, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 0.37719727, + "step": 669, + "time_per_iteration": 2.601151466369629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196603, + "balance_loss_mlp": 1.15540457, + "epoch": 0.12889572912658714, + "flos": 638936543232.0, + "grad_norm": 0.0744221392925499, + "language_loss": 0.95630497, + "learning_rate": 0.0009745759344474708, + "loss": 0.96827102, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 0.41162109, + "step": 670, + "time_per_iteration": 2.878068447113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200159, + "balance_loss_mlp": 1.16012812, + "epoch": 0.1290881108118507, + "flos": 509944896000.0, + "grad_norm": 0.07162427386273244, + "language_loss": 0.95158428, + "learning_rate": 0.0009744777630270536, + "loss": 0.96358585, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 0.40063477, + "step": 671, + "time_per_iteration": 2.5778517723083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220294, + "balance_loss_mlp": 1.17752171, + "epoch": 0.12928049249711426, + "flos": 671054782464.0, + "grad_norm": 0.07459259564874297, + "language_loss": 0.99775112, + "learning_rate": 0.000974379407398032, + "loss": 1.00995398, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 0.42797852, + "step": 672, + "time_per_iteration": 2.862168073654175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191716, + "balance_loss_mlp": 1.15175724, + "epoch": 0.12947287418237785, + "flos": 793525870080.0, + "grad_norm": 0.05795101219152752, + "language_loss": 0.86696863, + "learning_rate": 0.0009742808675985913, + "loss": 0.87888587, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 0.3996582, + "step": 673, + "time_per_iteration": 3.0987160205841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011832, + "balance_loss_mlp": 1.14142871, + "epoch": 0.1296652558676414, + "flos": 485466462720.0, + "grad_norm": 0.06292984682523013, + "language_loss": 0.96893597, + "learning_rate": 0.0009741821436669876, + "loss": 0.98076797, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 0.41772461, + "step": 674, + "time_per_iteration": 2.565317153930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160814, + "balance_loss_mlp": 1.12123656, + "epoch": 0.12985763755290497, + "flos": 453459451392.0, + "grad_norm": 0.07127578315040689, + "language_loss": 0.99621803, + "learning_rate": 0.0009740832356415492, + "loss": 1.00782621, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 0.39550781, + "step": 675, + "time_per_iteration": 2.4777724742889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144275, + "balance_loss_mlp": 1.10538852, + "epoch": 0.13005001923816853, + "flos": 825061178880.0, + "grad_norm": 0.07563598794059366, + "language_loss": 0.94837546, + "learning_rate": 0.0009739841435606756, + "loss": 0.95981824, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 0.38867188, + "step": 676, + "time_per_iteration": 2.9838767051696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131112, + "balance_loss_mlp": 1.09186864, + "epoch": 0.1302424009234321, + "flos": 531381648384.0, + "grad_norm": 0.06693149578557214, + "language_loss": 0.94293654, + "learning_rate": 0.0009738848674628377, + "loss": 0.95424765, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 0.39233398, + "step": 677, + "time_per_iteration": 2.7052054405212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130656, + "balance_loss_mlp": 1.0923903, + "epoch": 0.13043478260869565, + "flos": 525884161536.0, + "grad_norm": 0.05501746541124835, + "language_loss": 0.94784498, + "learning_rate": 0.000973785407386578, + "loss": 0.95915151, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 0.38232422, + "step": 678, + "time_per_iteration": 2.7535152435302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137257, + "balance_loss_mlp": 1.09727383, + "epoch": 0.1306271642939592, + "flos": 626172208128.0, + "grad_norm": 0.05430769504454563, + "language_loss": 0.91185606, + "learning_rate": 0.0009736857633705103, + "loss": 0.92322862, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 0.3996582, + "step": 679, + "time_per_iteration": 2.8686013221740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135266, + "balance_loss_mlp": 1.09575987, + "epoch": 0.13081954597922277, + "flos": 550718300160.0, + "grad_norm": 0.06387426976514826, + "language_loss": 0.97335434, + "learning_rate": 0.0009735859354533196, + "loss": 0.984707, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 0.39501953, + "step": 680, + "time_per_iteration": 2.6952273845672607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136459, + "balance_loss_mlp": 1.09626174, + "epoch": 0.13101192766448633, + "flos": 536911441920.0, + "grad_norm": 0.07637025474680663, + "language_loss": 0.97434723, + "learning_rate": 0.0009734859236737628, + "loss": 0.98571181, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 0.40185547, + "step": 681, + "time_per_iteration": 2.607431173324585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136827, + "balance_loss_mlp": 1.09720194, + "epoch": 0.13120430934974991, + "flos": 503508400128.0, + "grad_norm": 0.06515090437153119, + "language_loss": 0.9831785, + "learning_rate": 0.0009733857280706678, + "loss": 0.99454683, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 0.39599609, + "step": 682, + "time_per_iteration": 2.5730957984924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140739, + "balance_loss_mlp": 1.1007328, + "epoch": 0.13139669103501347, + "flos": 614295124992.0, + "grad_norm": 0.08408851923922504, + "language_loss": 0.89817083, + "learning_rate": 0.000973285348682934, + "loss": 0.90957826, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 0.39990234, + "step": 683, + "time_per_iteration": 2.7041609287261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120077, + "balance_loss_mlp": 1.08460057, + "epoch": 0.13158907272027703, + "flos": 1484971564032.0, + "grad_norm": 0.021197399820989362, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.7901845, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 0.35546875, + "step": 684, + "time_per_iteration": 4.7803051471710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145399, + "balance_loss_mlp": 1.10579789, + "epoch": 0.1317814544055406, + "flos": 985461852672.0, + "grad_norm": 0.06796914093678033, + "language_loss": 0.90116858, + "learning_rate": 0.0009730840387095046, + "loss": 0.91262257, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 0.39575195, + "step": 685, + "time_per_iteration": 3.289513111114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154301, + "balance_loss_mlp": 1.11412716, + "epoch": 0.13197383609080415, + "flos": 611456076288.0, + "grad_norm": 0.0690044047280534, + "language_loss": 0.95956922, + "learning_rate": 0.0009729831082019642, + "loss": 0.97111225, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 0.40185547, + "step": 686, + "time_per_iteration": 2.8214356899261475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131343, + "balance_loss_mlp": 1.09383941, + "epoch": 0.1321662177760677, + "flos": 494403181056.0, + "grad_norm": 0.08080780289155233, + "language_loss": 0.93596351, + "learning_rate": 0.0009728819940660958, + "loss": 0.94727689, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 0.375, + "step": 687, + "time_per_iteration": 2.749385118484497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011246, + "balance_loss_mlp": 1.08542764, + "epoch": 0.13235859946133127, + "flos": 495841430016.0, + "grad_norm": 0.08853955851107219, + "language_loss": 0.91695315, + "learning_rate": 0.0009727806963411557, + "loss": 0.92819917, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 0.39135742, + "step": 688, + "time_per_iteration": 2.592099666595459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128329, + "balance_loss_mlp": 1.08777368, + "epoch": 0.13255098114659483, + "flos": 511686720000.0, + "grad_norm": 0.06370494383790047, + "language_loss": 0.92130053, + "learning_rate": 0.000972679215066471, + "loss": 0.93258381, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 0.40551758, + "step": 689, + "time_per_iteration": 2.7344043254852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114145, + "balance_loss_mlp": 1.10246885, + "epoch": 0.13274336283185842, + "flos": 547370472960.0, + "grad_norm": 0.08478699193898473, + "language_loss": 1.04583168, + "learning_rate": 0.0009725775502814401, + "loss": 1.05724621, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 0.3894043, + "step": 690, + "time_per_iteration": 2.5881311893463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155383, + "balance_loss_mlp": 1.1147325, + "epoch": 0.13293574451712198, + "flos": 640772342784.0, + "grad_norm": 0.07994389842197654, + "language_loss": 0.90077579, + "learning_rate": 0.0009724757020255327, + "loss": 0.91232961, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 0.40649414, + "step": 691, + "time_per_iteration": 2.8452539443969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164889, + "balance_loss_mlp": 1.12566948, + "epoch": 0.13312812620238554, + "flos": 491480441856.0, + "grad_norm": 0.09039906445052394, + "language_loss": 0.91914684, + "learning_rate": 0.0009723736703382902, + "loss": 0.93079573, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 0.39208984, + "step": 692, + "time_per_iteration": 2.5472824573516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198661, + "balance_loss_mlp": 1.15557849, + "epoch": 0.1333205078876491, + "flos": 508944218112.0, + "grad_norm": 0.07689546631051256, + "language_loss": 0.86461794, + "learning_rate": 0.0009722714552593244, + "loss": 0.87660456, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 0.4309082, + "step": 693, + "time_per_iteration": 2.6273465156555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199876, + "balance_loss_mlp": 1.15560198, + "epoch": 0.13351288957291266, + "flos": 418697455104.0, + "grad_norm": 0.08142665414192346, + "language_loss": 1.00438499, + "learning_rate": 0.000972169056828319, + "loss": 1.01638389, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 0.44262695, + "step": 694, + "time_per_iteration": 2.477491617202759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221806, + "balance_loss_mlp": 1.17741275, + "epoch": 0.13370527125817622, + "flos": 615901128192.0, + "grad_norm": 0.07001491486919184, + "language_loss": 0.90590984, + "learning_rate": 0.0009720664750850283, + "loss": 0.91812789, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 0.4440918, + "step": 695, + "time_per_iteration": 2.7817704677581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209285, + "balance_loss_mlp": 1.16870594, + "epoch": 0.13389765294343978, + "flos": 626038958592.0, + "grad_norm": 0.07077521288835904, + "language_loss": 0.97240067, + "learning_rate": 0.0009719637100692784, + "loss": 0.98449349, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 0.40625, + "step": 696, + "time_per_iteration": 2.7099833488464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214589, + "balance_loss_mlp": 1.17069626, + "epoch": 0.13409003462870334, + "flos": 609691857408.0, + "grad_norm": 0.06395797985697109, + "language_loss": 0.87399805, + "learning_rate": 0.0009718607618209661, + "loss": 0.88614392, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 0.43896484, + "step": 697, + "time_per_iteration": 2.8280160427093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226846, + "balance_loss_mlp": 1.18445516, + "epoch": 0.13428241631396692, + "flos": 683816546304.0, + "grad_norm": 0.08853583224950028, + "language_loss": 0.91527486, + "learning_rate": 0.0009717576303800595, + "loss": 0.92754334, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 0.42382812, + "step": 698, + "time_per_iteration": 3.0102553367614746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206724, + "balance_loss_mlp": 1.16385674, + "epoch": 0.13447479799923048, + "flos": 508815737856.0, + "grad_norm": 0.07140979809376953, + "language_loss": 0.90443981, + "learning_rate": 0.0009716543157865975, + "loss": 0.91650712, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 0.4284668, + "step": 699, + "time_per_iteration": 2.713191509246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192988, + "balance_loss_mlp": 1.15047789, + "epoch": 0.13466717968449404, + "flos": 897510481920.0, + "grad_norm": 0.0971528894423257, + "language_loss": 0.87731719, + "learning_rate": 0.0009715508180806907, + "loss": 0.88924706, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 0.42504883, + "step": 700, + "time_per_iteration": 3.183608055114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164502, + "balance_loss_mlp": 1.12189686, + "epoch": 0.1348595613697576, + "flos": 989938838016.0, + "grad_norm": 0.07253928509691168, + "language_loss": 0.94940412, + "learning_rate": 0.0009714471373025202, + "loss": 0.96104908, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 0.42578125, + "step": 701, + "time_per_iteration": 3.4071736335754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154491, + "balance_loss_mlp": 1.10978746, + "epoch": 0.13505194305502116, + "flos": 487826095104.0, + "grad_norm": 0.07349692890686976, + "language_loss": 0.93387866, + "learning_rate": 0.0009713432734923386, + "loss": 0.94542348, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 0.44702148, + "step": 702, + "time_per_iteration": 2.61545467376709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149917, + "balance_loss_mlp": 1.10523736, + "epoch": 0.13524432474028472, + "flos": 613385851392.0, + "grad_norm": 0.07475145021416552, + "language_loss": 0.90919894, + "learning_rate": 0.0009712392266904696, + "loss": 0.92069811, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 0.44702148, + "step": 703, + "time_per_iteration": 2.739295482635498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156115, + "balance_loss_mlp": 1.11219811, + "epoch": 0.13543670642554828, + "flos": 904794582528.0, + "grad_norm": 0.09690331363255131, + "language_loss": 0.90325272, + "learning_rate": 0.0009711349969373076, + "loss": 0.91481388, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 0.43945312, + "step": 704, + "time_per_iteration": 3.1653053760528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175158, + "balance_loss_mlp": 1.12780786, + "epoch": 0.13562908811081184, + "flos": 550616984064.0, + "grad_norm": 0.09111648779989767, + "language_loss": 0.84997714, + "learning_rate": 0.0009710305842733178, + "loss": 0.86172873, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 0.47314453, + "step": 705, + "time_per_iteration": 2.7402727603912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117117, + "balance_loss_mlp": 1.12737262, + "epoch": 0.1358214697960754, + "flos": 508044856320.0, + "grad_norm": 0.10189351673448747, + "language_loss": 0.9379847, + "learning_rate": 0.0009709259887390373, + "loss": 0.94969636, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 0.43774414, + "step": 706, + "time_per_iteration": 2.5640039443969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147043, + "balance_loss_mlp": 1.10467625, + "epoch": 0.136013851481339, + "flos": 528896107008.0, + "grad_norm": 0.07946562356881365, + "language_loss": 0.95178437, + "learning_rate": 0.0009708212103750737, + "loss": 0.96325481, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 0.42382812, + "step": 707, + "time_per_iteration": 2.6138036251068115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153312, + "balance_loss_mlp": 1.1095618, + "epoch": 0.13620623316660255, + "flos": 659081152512.0, + "grad_norm": 0.07708082078191984, + "language_loss": 0.91549516, + "learning_rate": 0.0009707162492221051, + "loss": 0.9270283, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 0.43725586, + "step": 708, + "time_per_iteration": 2.879612684249878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143626, + "balance_loss_mlp": 1.10121179, + "epoch": 0.1363986148518661, + "flos": 671882563584.0, + "grad_norm": 0.08764140181907645, + "language_loss": 0.92509496, + "learning_rate": 0.0009706111053208815, + "loss": 0.93653119, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 0.42431641, + "step": 709, + "time_per_iteration": 2.804469347000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156089, + "balance_loss_mlp": 1.10947847, + "epoch": 0.13659099653712967, + "flos": 473062975488.0, + "grad_norm": 0.07097269092186763, + "language_loss": 0.89579999, + "learning_rate": 0.0009705057787122232, + "loss": 0.90736091, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 0.46630859, + "step": 710, + "time_per_iteration": 2.568406105041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174212, + "balance_loss_mlp": 1.12874603, + "epoch": 0.13678337822239323, + "flos": 452715734016.0, + "grad_norm": 0.06463299548184855, + "language_loss": 0.94250202, + "learning_rate": 0.0009704002694370216, + "loss": 0.9542442, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 0.45410156, + "step": 711, + "time_per_iteration": 2.525240659713745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116324, + "balance_loss_mlp": 1.11820245, + "epoch": 0.13697575990765679, + "flos": 519623133696.0, + "grad_norm": 0.06677275778781674, + "language_loss": 0.90675253, + "learning_rate": 0.0009702945775362388, + "loss": 0.91838491, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 0.45043945, + "step": 712, + "time_per_iteration": 2.572566270828247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171995, + "balance_loss_mlp": 1.12478852, + "epoch": 0.13716814159292035, + "flos": 480388921344.0, + "grad_norm": 0.06549167744569931, + "language_loss": 0.91151595, + "learning_rate": 0.0009701887030509086, + "loss": 0.92323589, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 0.47167969, + "step": 713, + "time_per_iteration": 2.645202875137329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156874, + "balance_loss_mlp": 1.11450684, + "epoch": 0.1373605232781839, + "flos": 545650670592.0, + "grad_norm": 0.07696267649297317, + "language_loss": 0.95333648, + "learning_rate": 0.0009700826460221346, + "loss": 0.96490526, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 0.42382812, + "step": 714, + "time_per_iteration": 2.649831771850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187773, + "balance_loss_mlp": 1.13980293, + "epoch": 0.1375529049634475, + "flos": 708791648256.0, + "grad_norm": 0.08597126409557068, + "language_loss": 0.96336859, + "learning_rate": 0.0009699764064910921, + "loss": 0.97524625, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 0.47998047, + "step": 715, + "time_per_iteration": 2.8645238876342773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178216, + "balance_loss_mlp": 1.1317718, + "epoch": 0.13774528664871105, + "flos": 486696936960.0, + "grad_norm": 0.08366808602410432, + "language_loss": 0.90892398, + "learning_rate": 0.0009698699844990268, + "loss": 0.92070615, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 0.46435547, + "step": 716, + "time_per_iteration": 2.635460376739502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171183, + "balance_loss_mlp": 1.12731409, + "epoch": 0.1379376683339746, + "flos": 680199275520.0, + "grad_norm": 0.051528021496160425, + "language_loss": 0.91132116, + "learning_rate": 0.0009697633800872555, + "loss": 0.923033, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 0.4387207, + "step": 717, + "time_per_iteration": 2.887854814529419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189757, + "balance_loss_mlp": 1.1432178, + "epoch": 0.13813005001923817, + "flos": 610946924544.0, + "grad_norm": 0.07388540586481528, + "language_loss": 0.94422555, + "learning_rate": 0.0009696565932971655, + "loss": 0.95612311, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 0.46557617, + "step": 718, + "time_per_iteration": 2.8565313816070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171127, + "balance_loss_mlp": 1.12580407, + "epoch": 0.13832243170450173, + "flos": 588729378816.0, + "grad_norm": 0.06166568969162735, + "language_loss": 0.92794299, + "learning_rate": 0.0009695496241702153, + "loss": 0.93965423, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 0.45361328, + "step": 719, + "time_per_iteration": 2.827193021774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178611, + "balance_loss_mlp": 1.13152349, + "epoch": 0.1385148133897653, + "flos": 700002860544.0, + "grad_norm": 0.07046673128739296, + "language_loss": 0.8903814, + "learning_rate": 0.0009694424727479339, + "loss": 0.9021675, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 0.47094727, + "step": 720, + "time_per_iteration": 2.958855628967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011684, + "balance_loss_mlp": 1.12150323, + "epoch": 0.13870719507502885, + "flos": 598254543360.0, + "grad_norm": 0.07332050167219753, + "language_loss": 0.91946507, + "learning_rate": 0.0009693351390719213, + "loss": 0.93114913, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 0.46899414, + "step": 721, + "time_per_iteration": 2.6910197734832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012083, + "balance_loss_mlp": 1.15742183, + "epoch": 0.1388995767602924, + "flos": 586572378624.0, + "grad_norm": 0.06188248769550966, + "language_loss": 0.93531096, + "learning_rate": 0.000969227623183848, + "loss": 0.94739395, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 0.50830078, + "step": 722, + "time_per_iteration": 2.791097640991211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119483, + "balance_loss_mlp": 1.14776587, + "epoch": 0.139091958445556, + "flos": 651120145920.0, + "grad_norm": 0.06666345220966835, + "language_loss": 0.93550557, + "learning_rate": 0.0009691199251254554, + "loss": 0.94745386, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 0.47045898, + "step": 723, + "time_per_iteration": 2.8282151222229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173107, + "balance_loss_mlp": 1.13059711, + "epoch": 0.13928434013081956, + "flos": 575737818624.0, + "grad_norm": 0.07191970231420823, + "language_loss": 0.88703346, + "learning_rate": 0.0009690120449385555, + "loss": 0.89876461, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 0.42504883, + "step": 724, + "time_per_iteration": 2.775456190109253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158197, + "balance_loss_mlp": 1.11332655, + "epoch": 0.13947672181608312, + "flos": 563225674752.0, + "grad_norm": 0.06680700276551169, + "language_loss": 0.95181078, + "learning_rate": 0.0009689039826650312, + "loss": 0.96339279, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 0.44824219, + "step": 725, + "time_per_iteration": 2.7623417377471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164951, + "balance_loss_mlp": 1.12756717, + "epoch": 0.13966910350134668, + "flos": 1521546964992.0, + "grad_norm": 0.03995326528410751, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.77688015, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 0.37304688, + "step": 726, + "time_per_iteration": 4.914167642593384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146003, + "balance_loss_mlp": 1.09567261, + "epoch": 0.13986148518661023, + "flos": 499854053376.0, + "grad_norm": 0.07822541163530779, + "language_loss": 0.90488958, + "learning_rate": 0.0009686873120259941, + "loss": 0.91634959, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 0.50341797, + "step": 727, + "time_per_iteration": 2.563333749771118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132914, + "balance_loss_mlp": 1.09092879, + "epoch": 0.1400538668718738, + "flos": 598674488832.0, + "grad_norm": 0.0725242002086287, + "language_loss": 0.89161742, + "learning_rate": 0.0009685787037446004, + "loss": 0.90294659, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 0.41992188, + "step": 728, + "time_per_iteration": 2.7803192138671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137226, + "balance_loss_mlp": 1.09192598, + "epoch": 0.14024624855713735, + "flos": 594039287808.0, + "grad_norm": 0.10183800223701604, + "language_loss": 0.9064362, + "learning_rate": 0.0009684699135448201, + "loss": 0.91780847, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 0.453125, + "step": 729, + "time_per_iteration": 2.750023603439331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142476, + "balance_loss_mlp": 1.0995841, + "epoch": 0.1404386302424009, + "flos": 506584585728.0, + "grad_norm": 0.06503689668024501, + "language_loss": 0.94054115, + "learning_rate": 0.0009683609414688895, + "loss": 0.95196593, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 0.42895508, + "step": 730, + "time_per_iteration": 2.708470344543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116576, + "balance_loss_mlp": 1.11652613, + "epoch": 0.14063101192766447, + "flos": 573407921664.0, + "grad_norm": 0.07277464462784268, + "language_loss": 0.89072424, + "learning_rate": 0.0009682517875591154, + "loss": 0.9023819, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 0.49243164, + "step": 731, + "time_per_iteration": 2.734145402908325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173563, + "balance_loss_mlp": 1.12640429, + "epoch": 0.14082339361292806, + "flos": 564619133952.0, + "grad_norm": 0.08810260071203486, + "language_loss": 0.88790858, + "learning_rate": 0.0009681424518578749, + "loss": 0.8996442, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 0.47192383, + "step": 732, + "time_per_iteration": 2.7139203548431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166119, + "balance_loss_mlp": 1.11900759, + "epoch": 0.14101577529819162, + "flos": 463584798720.0, + "grad_norm": 0.07053265121681873, + "language_loss": 0.9010576, + "learning_rate": 0.000968032934407616, + "loss": 0.91271877, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 0.47143555, + "step": 733, + "time_per_iteration": 2.625128746032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161975, + "balance_loss_mlp": 1.11514974, + "epoch": 0.14120815698345518, + "flos": 596085060096.0, + "grad_norm": 0.08143861058365946, + "language_loss": 0.84579933, + "learning_rate": 0.0009679232352508571, + "loss": 0.85741913, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 0.46850586, + "step": 734, + "time_per_iteration": 2.7461798191070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145124, + "balance_loss_mlp": 1.10046864, + "epoch": 0.14140053866871874, + "flos": 535137311232.0, + "grad_norm": 0.0788084271092868, + "language_loss": 0.83272535, + "learning_rate": 0.0009678133544301871, + "loss": 0.84417665, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 0.44677734, + "step": 735, + "time_per_iteration": 2.68129301071167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130971, + "balance_loss_mlp": 1.08731616, + "epoch": 0.1415929203539823, + "flos": 520265534976.0, + "grad_norm": 0.05044431767963513, + "language_loss": 0.93706036, + "learning_rate": 0.0009677032919882658, + "loss": 0.94837004, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 0.43652344, + "step": 736, + "time_per_iteration": 2.663874387741089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141167, + "balance_loss_mlp": 1.0970124, + "epoch": 0.14178530203924586, + "flos": 482335948800.0, + "grad_norm": 0.07155994363363784, + "language_loss": 0.94151366, + "learning_rate": 0.000967593047967823, + "loss": 0.95292532, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 0.44116211, + "step": 737, + "time_per_iteration": 2.512871265411377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150426, + "balance_loss_mlp": 1.10376751, + "epoch": 0.14197768372450942, + "flos": 676638904320.0, + "grad_norm": 0.07145762863961741, + "language_loss": 0.89657855, + "learning_rate": 0.0009674826224116593, + "loss": 0.90808284, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 0.46655273, + "step": 738, + "time_per_iteration": 2.797337293624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145365, + "balance_loss_mlp": 1.09865868, + "epoch": 0.14217006540977298, + "flos": 446039529984.0, + "grad_norm": 0.07589062836694223, + "language_loss": 0.89765012, + "learning_rate": 0.0009673720153626455, + "loss": 0.90910375, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 0.46728516, + "step": 739, + "time_per_iteration": 2.5743062496185303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113843, + "balance_loss_mlp": 1.09274864, + "epoch": 0.14236244709503657, + "flos": 496503654912.0, + "grad_norm": 0.07239717331604524, + "language_loss": 0.89863205, + "learning_rate": 0.0009672612268637235, + "loss": 0.9100163, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 0.45678711, + "step": 740, + "time_per_iteration": 2.6074059009552 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125723, + "balance_loss_mlp": 1.08125818, + "epoch": 0.14255482878030012, + "flos": 648313403904.0, + "grad_norm": 0.08552249660547784, + "language_loss": 0.8725301, + "learning_rate": 0.0009671502569579048, + "loss": 0.88378727, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 0.44458008, + "step": 741, + "time_per_iteration": 2.729733467102051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116563, + "balance_loss_mlp": 1.07338512, + "epoch": 0.14274721046556368, + "flos": 536165153280.0, + "grad_norm": 0.05753110737252733, + "language_loss": 0.92330521, + "learning_rate": 0.0009670391056882719, + "loss": 0.93447083, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 0.43188477, + "step": 742, + "time_per_iteration": 2.69399356842041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115871, + "balance_loss_mlp": 1.07367063, + "epoch": 0.14293959215082724, + "flos": 957057431040.0, + "grad_norm": 0.06711892894426404, + "language_loss": 0.91615599, + "learning_rate": 0.0009669277730979776, + "loss": 0.92731464, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 0.421875, + "step": 743, + "time_per_iteration": 3.1732802391052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123454, + "balance_loss_mlp": 1.079561, + "epoch": 0.1431319738360908, + "flos": 693089519616.0, + "grad_norm": 0.07488288596065623, + "language_loss": 0.88249421, + "learning_rate": 0.0009668162592302449, + "loss": 0.89372879, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 0.43896484, + "step": 744, + "time_per_iteration": 2.88962459564209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114179, + "balance_loss_mlp": 1.09551311, + "epoch": 0.14332435552135436, + "flos": 565439574528.0, + "grad_norm": 0.08170086657731683, + "language_loss": 0.8873378, + "learning_rate": 0.0009667045641283676, + "loss": 0.89875567, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 0.46289062, + "step": 745, + "time_per_iteration": 2.6374380588531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136601, + "balance_loss_mlp": 1.09158731, + "epoch": 0.14351673720661792, + "flos": 738374787072.0, + "grad_norm": 0.07376324969806651, + "language_loss": 0.9752661, + "learning_rate": 0.0009665926878357092, + "loss": 0.98663211, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 0.44995117, + "step": 746, + "time_per_iteration": 2.908377170562744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138589, + "balance_loss_mlp": 1.09283662, + "epoch": 0.14370911889188148, + "flos": 549230865408.0, + "grad_norm": 0.055840413500964095, + "language_loss": 0.93229979, + "learning_rate": 0.0009664806303957043, + "loss": 0.94368571, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 0.45751953, + "step": 747, + "time_per_iteration": 2.6940197944641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116452, + "balance_loss_mlp": 1.11397541, + "epoch": 0.14390150057714507, + "flos": 590295734784.0, + "grad_norm": 0.07422855656653271, + "language_loss": 0.89923358, + "learning_rate": 0.0009663683918518571, + "loss": 0.91087878, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 0.50463867, + "step": 748, + "time_per_iteration": 2.8905599117279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162632, + "balance_loss_mlp": 1.10977423, + "epoch": 0.14409388226240863, + "flos": 591047165952.0, + "grad_norm": 0.06951396400432043, + "language_loss": 0.88074797, + "learning_rate": 0.0009662559722477428, + "loss": 0.89237428, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 0.52880859, + "step": 749, + "time_per_iteration": 2.6684882640838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111749, + "balance_loss_mlp": 1.09059644, + "epoch": 0.1442862639476722, + "flos": 1511263401984.0, + "grad_norm": 0.031134761916572575, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77280462, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 0.26953125, + "step": 750, + "time_per_iteration": 4.978729009628296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141518, + "balance_loss_mlp": 1.09359622, + "epoch": 0.14447864563293575, + "flos": 496765384704.0, + "grad_norm": 0.06451546089111031, + "language_loss": 0.9124738, + "learning_rate": 0.0009660305900333632, + "loss": 0.92388898, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 0.47973633, + "step": 751, + "time_per_iteration": 2.6556403636932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145582, + "balance_loss_mlp": 1.09849465, + "epoch": 0.1446710273181993, + "flos": 589678299648.0, + "grad_norm": 0.08083819383046088, + "language_loss": 0.8480792, + "learning_rate": 0.0009659176275105992, + "loss": 0.85953498, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 0.47070312, + "step": 752, + "time_per_iteration": 2.6868016719818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154886, + "balance_loss_mlp": 1.10667825, + "epoch": 0.14486340900346287, + "flos": 585818749440.0, + "grad_norm": 0.0601727082776222, + "language_loss": 0.87400204, + "learning_rate": 0.0009658044841025701, + "loss": 0.88555086, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 0.48217773, + "step": 753, + "time_per_iteration": 2.7701456546783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189813, + "balance_loss_mlp": 1.136765, + "epoch": 0.14505579068872643, + "flos": 504672062976.0, + "grad_norm": 0.0800468655776831, + "language_loss": 0.83957088, + "learning_rate": 0.0009656911598532021, + "loss": 0.85146904, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 0.53051758, + "step": 754, + "time_per_iteration": 2.630211353302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192149, + "balance_loss_mlp": 1.13943434, + "epoch": 0.14524817237399, + "flos": 486815505408.0, + "grad_norm": 0.0631545589319864, + "language_loss": 0.9278729, + "learning_rate": 0.0009655776548064917, + "loss": 0.93979442, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 0.52758789, + "step": 755, + "time_per_iteration": 2.6447510719299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176758, + "balance_loss_mlp": 1.12506902, + "epoch": 0.14544055405925355, + "flos": 728175287808.0, + "grad_norm": 0.06497808848967317, + "language_loss": 0.90460694, + "learning_rate": 0.0009654639690065054, + "loss": 0.91637456, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 0.51708984, + "step": 756, + "time_per_iteration": 2.910578727722168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116602, + "balance_loss_mlp": 1.11785972, + "epoch": 0.14563293574451713, + "flos": 593643935232.0, + "grad_norm": 0.0580393303136577, + "language_loss": 0.90340179, + "learning_rate": 0.00096535010249738, + "loss": 0.91506201, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 0.48120117, + "step": 757, + "time_per_iteration": 2.7232277393341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149847, + "balance_loss_mlp": 1.10092402, + "epoch": 0.1458253174297807, + "flos": 560478030336.0, + "grad_norm": 0.07370663524734816, + "language_loss": 0.8531146, + "learning_rate": 0.0009652360553233224, + "loss": 0.86461306, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 0.48901367, + "step": 758, + "time_per_iteration": 2.7501397132873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064081, + "balance_loss_mlp": 1.03528047, + "epoch": 0.14601769911504425, + "flos": 1557855866880.0, + "grad_norm": 0.02263224740377231, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.74837828, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 0.28710938, + "step": 759, + "time_per_iteration": 4.953639268875122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150341, + "balance_loss_mlp": 1.1019187, + "epoch": 0.1462100808003078, + "flos": 866301516288.0, + "grad_norm": 0.05750780582661247, + "language_loss": 0.83513778, + "learning_rate": 0.0009650074191575883, + "loss": 0.84664118, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 0.48388672, + "step": 760, + "time_per_iteration": 3.202252149581909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152626, + "balance_loss_mlp": 1.10179496, + "epoch": 0.14640246248557137, + "flos": 522943796736.0, + "grad_norm": 0.05303129095981597, + "language_loss": 0.88240772, + "learning_rate": 0.0009648928302546766, + "loss": 0.89393395, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 0.50878906, + "step": 761, + "time_per_iteration": 2.65380859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147286, + "balance_loss_mlp": 1.09960222, + "epoch": 0.14659484417083493, + "flos": 1030544487936.0, + "grad_norm": 0.06114398209353547, + "language_loss": 0.87573165, + "learning_rate": 0.0009647780608643613, + "loss": 0.88720453, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 0.47705078, + "step": 762, + "time_per_iteration": 3.3394339084625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153999, + "balance_loss_mlp": 1.10831833, + "epoch": 0.1467872258560985, + "flos": 500671922688.0, + "grad_norm": 0.09093438426480749, + "language_loss": 0.90765309, + "learning_rate": 0.0009646631110312001, + "loss": 0.91919315, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 0.45678711, + "step": 763, + "time_per_iteration": 2.622671604156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157806, + "balance_loss_mlp": 1.11200595, + "epoch": 0.14697960754136205, + "flos": 547797758976.0, + "grad_norm": 0.047784585244551814, + "language_loss": 0.90468627, + "learning_rate": 0.0009645479807998203, + "loss": 0.91626436, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 0.45751953, + "step": 764, + "time_per_iteration": 2.7322580814361572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156125, + "balance_loss_mlp": 1.11487842, + "epoch": 0.14717198922662564, + "flos": 517849003008.0, + "grad_norm": 0.06523928090243644, + "language_loss": 0.94106412, + "learning_rate": 0.0009644326702149196, + "loss": 0.95262539, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 0.41235352, + "step": 765, + "time_per_iteration": 2.7013158798217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174535, + "balance_loss_mlp": 1.12761474, + "epoch": 0.1473643709118892, + "flos": 732024552960.0, + "grad_norm": 0.08055574364553787, + "language_loss": 0.86730242, + "learning_rate": 0.0009643171793212653, + "loss": 0.87904775, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 0.46923828, + "step": 766, + "time_per_iteration": 3.083709478378296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162662, + "balance_loss_mlp": 1.11473966, + "epoch": 0.14755675259715276, + "flos": 620538900480.0, + "grad_norm": 0.07722330054572468, + "language_loss": 0.92188174, + "learning_rate": 0.0009642015081636952, + "loss": 0.93350834, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 0.47949219, + "step": 767, + "time_per_iteration": 2.6836585998535156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161372, + "balance_loss_mlp": 1.1132586, + "epoch": 0.14774913428241632, + "flos": 452219065344.0, + "grad_norm": 0.07123168873353844, + "language_loss": 0.90995437, + "learning_rate": 0.0009640856567871166, + "loss": 0.9215681, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 0.48168945, + "step": 768, + "time_per_iteration": 2.543670177459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156907, + "balance_loss_mlp": 1.10626745, + "epoch": 0.14794151596767988, + "flos": 837234869760.0, + "grad_norm": 0.07039727350928661, + "language_loss": 0.9123286, + "learning_rate": 0.0009639696252365072, + "loss": 0.92389768, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 0.50634766, + "step": 769, + "time_per_iteration": 3.027188539505005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146634, + "balance_loss_mlp": 1.10326576, + "epoch": 0.14813389765294344, + "flos": 686092114944.0, + "grad_norm": 0.06094559984807647, + "language_loss": 0.83659029, + "learning_rate": 0.0009638534135569144, + "loss": 0.84805667, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 0.43359375, + "step": 770, + "time_per_iteration": 2.9126267433166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140788, + "balance_loss_mlp": 1.09489226, + "epoch": 0.148326279338207, + "flos": 509887996416.0, + "grad_norm": 0.06702358278695762, + "language_loss": 0.92293191, + "learning_rate": 0.0009637370217934554, + "loss": 0.93433982, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 0.45922852, + "step": 771, + "time_per_iteration": 2.6426541805267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129633, + "balance_loss_mlp": 1.08600211, + "epoch": 0.14851866102347056, + "flos": 588161129472.0, + "grad_norm": 0.04968709901212579, + "language_loss": 0.84857935, + "learning_rate": 0.0009636204499913175, + "loss": 0.85987568, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 0.43603516, + "step": 772, + "time_per_iteration": 2.830029010772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122236, + "balance_loss_mlp": 1.08478057, + "epoch": 0.14871104270873411, + "flos": 691026494976.0, + "grad_norm": 0.06444605868824185, + "language_loss": 0.90028566, + "learning_rate": 0.0009635036981957581, + "loss": 0.91150796, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 0.37451172, + "step": 773, + "time_per_iteration": 2.850893259048462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128047, + "balance_loss_mlp": 1.08546507, + "epoch": 0.1489034243939977, + "flos": 655098264576.0, + "grad_norm": 0.07558916443605426, + "language_loss": 0.92137265, + "learning_rate": 0.0009633867664521043, + "loss": 0.93265319, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 0.42553711, + "step": 774, + "time_per_iteration": 2.8405416011810303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154281, + "balance_loss_mlp": 1.10614467, + "epoch": 0.14909580607926126, + "flos": 475835212800.0, + "grad_norm": 0.07793461844194936, + "language_loss": 0.8938297, + "learning_rate": 0.0009632696548057527, + "loss": 0.9053725, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 0.48168945, + "step": 775, + "time_per_iteration": 2.5543088912963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158921, + "balance_loss_mlp": 1.11419404, + "epoch": 0.14928818776452482, + "flos": 611087887872.0, + "grad_norm": 0.07948352168051111, + "language_loss": 0.86982578, + "learning_rate": 0.0009631523633021704, + "loss": 0.88141501, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 0.44702148, + "step": 776, + "time_per_iteration": 2.8373982906341553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151944, + "balance_loss_mlp": 1.10726452, + "epoch": 0.14948056944978838, + "flos": 561772744704.0, + "grad_norm": 0.07613081492567164, + "language_loss": 0.90593684, + "learning_rate": 0.0009630348919868936, + "loss": 0.91745627, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 0.4465332, + "step": 777, + "time_per_iteration": 2.688340187072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164011, + "balance_loss_mlp": 1.1162796, + "epoch": 0.14967295113505194, + "flos": 449199779328.0, + "grad_norm": 0.07284380806791231, + "language_loss": 0.83743048, + "learning_rate": 0.0009629172409055293, + "loss": 0.84907055, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 0.47753906, + "step": 778, + "time_per_iteration": 2.496121406555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173184, + "balance_loss_mlp": 1.13260555, + "epoch": 0.1498653328203155, + "flos": 571285426176.0, + "grad_norm": 0.0582041699055768, + "language_loss": 0.89173234, + "learning_rate": 0.0009627994101037531, + "loss": 0.9034642, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 0.40576172, + "step": 779, + "time_per_iteration": 2.7287445068359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116917, + "balance_loss_mlp": 1.12670779, + "epoch": 0.15005771450557906, + "flos": 631215244800.0, + "grad_norm": 0.06429714570378213, + "language_loss": 0.91374522, + "learning_rate": 0.0009626813996273114, + "loss": 0.92543697, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 0.42431641, + "step": 780, + "time_per_iteration": 2.8357532024383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174237, + "balance_loss_mlp": 1.13258517, + "epoch": 0.15025009619084262, + "flos": 577939235328.0, + "grad_norm": 0.07735356487079731, + "language_loss": 0.90820873, + "learning_rate": 0.0009625632095220198, + "loss": 0.91995108, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 0.41625977, + "step": 781, + "time_per_iteration": 2.8360986709594727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165333, + "balance_loss_mlp": 1.12408686, + "epoch": 0.1504424778761062, + "flos": 483887623680.0, + "grad_norm": 0.07591811383481707, + "language_loss": 0.88784671, + "learning_rate": 0.0009624448398337637, + "loss": 0.89950007, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 0.41259766, + "step": 782, + "time_per_iteration": 2.550873041152954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138762, + "balance_loss_mlp": 1.09920812, + "epoch": 0.15063485956136977, + "flos": 762512196096.0, + "grad_norm": 0.06500535683801296, + "language_loss": 0.90907973, + "learning_rate": 0.0009623262906084984, + "loss": 0.92046738, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 0.39550781, + "step": 783, + "time_per_iteration": 3.002237319946289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127947, + "balance_loss_mlp": 1.08622408, + "epoch": 0.15082724124663333, + "flos": 497630241792.0, + "grad_norm": 0.06722303964642193, + "language_loss": 0.92323947, + "learning_rate": 0.0009622075618922486, + "loss": 0.93451893, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 0.41699219, + "step": 784, + "time_per_iteration": 2.669541120529175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117088, + "balance_loss_mlp": 1.07636571, + "epoch": 0.15101962293189689, + "flos": 509725011456.0, + "grad_norm": 0.06286377137641418, + "language_loss": 0.88948303, + "learning_rate": 0.0009620886537311091, + "loss": 0.90065384, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 0.40722656, + "step": 785, + "time_per_iteration": 2.6505391597747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132411, + "balance_loss_mlp": 1.08563375, + "epoch": 0.15121200461716044, + "flos": 457756199424.0, + "grad_norm": 0.06858268632652799, + "language_loss": 0.87318397, + "learning_rate": 0.000961969566171244, + "loss": 0.88450807, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 0.46777344, + "step": 786, + "time_per_iteration": 2.5492002964019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143812, + "balance_loss_mlp": 1.10037243, + "epoch": 0.151404386302424, + "flos": 537986271744.0, + "grad_norm": 0.06762455123923776, + "language_loss": 0.9226557, + "learning_rate": 0.0009618502992588873, + "loss": 0.93409383, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 0.43481445, + "step": 787, + "time_per_iteration": 2.6596381664276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153043, + "balance_loss_mlp": 1.10714722, + "epoch": 0.15159676798768756, + "flos": 688209467904.0, + "grad_norm": 0.07210135364095939, + "language_loss": 0.90213263, + "learning_rate": 0.0009617308530403424, + "loss": 0.91366303, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 0.45922852, + "step": 788, + "time_per_iteration": 2.9965012073516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133874, + "balance_loss_mlp": 1.09358144, + "epoch": 0.15178914967295112, + "flos": 545319558144.0, + "grad_norm": 0.0646084728999688, + "language_loss": 0.89177096, + "learning_rate": 0.0009616112275619825, + "loss": 0.90310967, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 0.40283203, + "step": 789, + "time_per_iteration": 2.702927350997925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128515, + "balance_loss_mlp": 1.08760214, + "epoch": 0.1519815313582147, + "flos": 511770783744.0, + "grad_norm": 0.04914514873585108, + "language_loss": 0.85434246, + "learning_rate": 0.0009614914228702503, + "loss": 0.86562753, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 0.40917969, + "step": 790, + "time_per_iteration": 2.734309196472168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120439, + "balance_loss_mlp": 1.08031344, + "epoch": 0.15217391304347827, + "flos": 684088187904.0, + "grad_norm": 0.0510031662309952, + "language_loss": 0.90581405, + "learning_rate": 0.0009613714390116581, + "loss": 0.91701841, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 0.40112305, + "step": 791, + "time_per_iteration": 2.9846036434173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119178, + "balance_loss_mlp": 1.07890868, + "epoch": 0.15236629472874183, + "flos": 644186981376.0, + "grad_norm": 0.06466161117660295, + "language_loss": 0.87842512, + "learning_rate": 0.0009612512760327879, + "loss": 0.88961697, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 0.40283203, + "step": 792, + "time_per_iteration": 2.879507303237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112132, + "balance_loss_mlp": 1.0749234, + "epoch": 0.1525586764140054, + "flos": 412876196352.0, + "grad_norm": 0.06761791569724282, + "language_loss": 0.86834276, + "learning_rate": 0.0009611309339802909, + "loss": 0.87955594, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 0.46435547, + "step": 793, + "time_per_iteration": 2.4628419876098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125819, + "balance_loss_mlp": 1.08180666, + "epoch": 0.15275105809926895, + "flos": 802801414656.0, + "grad_norm": 0.06955338926819006, + "language_loss": 0.85776877, + "learning_rate": 0.0009610104129008881, + "loss": 0.86902696, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 0.43994141, + "step": 794, + "time_per_iteration": 3.1157610416412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112048, + "balance_loss_mlp": 1.07751703, + "epoch": 0.1529434397845325, + "flos": 612422249472.0, + "grad_norm": 0.0812849574801687, + "language_loss": 0.89832217, + "learning_rate": 0.0009608897128413701, + "loss": 0.90952694, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 0.4296875, + "step": 795, + "time_per_iteration": 2.7580387592315674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121281, + "balance_loss_mlp": 1.08070254, + "epoch": 0.15313582146979607, + "flos": 615246243840.0, + "grad_norm": 0.07320179377966478, + "language_loss": 0.87414771, + "learning_rate": 0.0009607688338485965, + "loss": 0.88536048, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 0.40576172, + "step": 796, + "time_per_iteration": 2.8428006172180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112803, + "balance_loss_mlp": 1.08358848, + "epoch": 0.15332820315505963, + "flos": 793602593280.0, + "grad_norm": 0.08676784428227541, + "language_loss": 0.92063487, + "learning_rate": 0.0009606477759694969, + "loss": 0.93191516, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 0.4440918, + "step": 797, + "time_per_iteration": 3.0136139392852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129989, + "balance_loss_mlp": 1.08547592, + "epoch": 0.1535205848403232, + "flos": 550206950400.0, + "grad_norm": 0.07379760567815713, + "language_loss": 0.89430279, + "learning_rate": 0.0009605265392510703, + "loss": 0.90560269, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 0.44555664, + "step": 798, + "time_per_iteration": 2.604297161102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114561, + "balance_loss_mlp": 1.10169339, + "epoch": 0.15371296652558677, + "flos": 535947840000.0, + "grad_norm": 0.06797963908333281, + "language_loss": 0.93481082, + "learning_rate": 0.0009604051237403846, + "loss": 0.94626689, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 0.43896484, + "step": 799, + "time_per_iteration": 2.613255262374878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167168, + "balance_loss_mlp": 1.1217972, + "epoch": 0.15390534821085033, + "flos": 395219699712.0, + "grad_norm": 0.06891264186704958, + "language_loss": 0.88271165, + "learning_rate": 0.0009602835294845776, + "loss": 0.89438331, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 0.45361328, + "step": 800, + "time_per_iteration": 2.4739739894866943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011684, + "balance_loss_mlp": 1.12188447, + "epoch": 0.1540977298961139, + "flos": 535846523904.0, + "grad_norm": 0.06820302888180714, + "language_loss": 0.91848779, + "learning_rate": 0.0009601617565308565, + "loss": 0.93017173, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 0.46557617, + "step": 801, + "time_per_iteration": 2.599102020263672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196072, + "balance_loss_mlp": 1.14941311, + "epoch": 0.15429011158137745, + "flos": 723727664640.0, + "grad_norm": 0.08155438121007776, + "language_loss": 0.88506758, + "learning_rate": 0.0009600398049264977, + "loss": 0.89702827, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 0.46679688, + "step": 802, + "time_per_iteration": 2.9645981788635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193217, + "balance_loss_mlp": 1.14574742, + "epoch": 0.154482493266641, + "flos": 620516505600.0, + "grad_norm": 0.10468166660144326, + "language_loss": 0.93512642, + "learning_rate": 0.0009599176747188469, + "loss": 0.94705856, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 0.47485352, + "step": 803, + "time_per_iteration": 2.7997000217437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160191, + "balance_loss_mlp": 1.11856318, + "epoch": 0.15467487495190457, + "flos": 525624629760.0, + "grad_norm": 0.07174757520021151, + "language_loss": 0.84728193, + "learning_rate": 0.0009597953659553196, + "loss": 0.85888386, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 0.41625977, + "step": 804, + "time_per_iteration": 2.700530529022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133685, + "balance_loss_mlp": 1.09408379, + "epoch": 0.15486725663716813, + "flos": 527729872896.0, + "grad_norm": 0.4143347029392257, + "language_loss": 0.9033978, + "learning_rate": 0.0009596728786833997, + "loss": 0.91473466, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 0.39575195, + "step": 805, + "time_per_iteration": 2.6122889518737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150784, + "balance_loss_mlp": 1.10772574, + "epoch": 0.1550596383224317, + "flos": 1048549349376.0, + "grad_norm": 0.061887733402931855, + "language_loss": 0.91321814, + "learning_rate": 0.0009595502129506415, + "loss": 0.92472601, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 0.43066406, + "step": 806, + "time_per_iteration": 3.336061716079712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180508, + "balance_loss_mlp": 1.13694847, + "epoch": 0.15525202000769528, + "flos": 613716963840.0, + "grad_norm": 0.06807019640067784, + "language_loss": 0.84292483, + "learning_rate": 0.0009594273688046678, + "loss": 0.85472989, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 0.43579102, + "step": 807, + "time_per_iteration": 2.709182024002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210101, + "balance_loss_mlp": 1.15960383, + "epoch": 0.15544440169295884, + "flos": 533064374784.0, + "grad_norm": 0.0856522073787927, + "language_loss": 0.8780278, + "learning_rate": 0.000959304346293171, + "loss": 0.89012885, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 0.50512695, + "step": 808, + "time_per_iteration": 2.6307153701782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236008, + "balance_loss_mlp": 1.18305564, + "epoch": 0.1556367833782224, + "flos": 644723297280.0, + "grad_norm": 0.09531038088821206, + "language_loss": 0.90107393, + "learning_rate": 0.0009591811454639125, + "loss": 0.91343403, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 0.52954102, + "step": 809, + "time_per_iteration": 2.742725372314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197955, + "balance_loss_mlp": 1.15184498, + "epoch": 0.15582916506348596, + "flos": 543822211584.0, + "grad_norm": 0.06212883071305714, + "language_loss": 0.902493, + "learning_rate": 0.0009590577663647234, + "loss": 0.91447246, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 0.4609375, + "step": 810, + "time_per_iteration": 2.711411237716675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187108, + "balance_loss_mlp": 1.13837492, + "epoch": 0.15602154674874952, + "flos": 580034566656.0, + "grad_norm": 0.06321996034865444, + "language_loss": 0.88015836, + "learning_rate": 0.0009589342090435036, + "loss": 0.8920294, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 0.48779297, + "step": 811, + "time_per_iteration": 2.763784170150757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173697, + "balance_loss_mlp": 1.12610841, + "epoch": 0.15621392843401308, + "flos": 535248539136.0, + "grad_norm": 0.07315119709604147, + "language_loss": 0.89953744, + "learning_rate": 0.0009588104735482223, + "loss": 0.91127443, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 0.47631836, + "step": 812, + "time_per_iteration": 2.645106077194214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169234, + "balance_loss_mlp": 1.12019134, + "epoch": 0.15640631011927664, + "flos": 550903680000.0, + "grad_norm": 0.06895714089970095, + "language_loss": 0.86002952, + "learning_rate": 0.0009586865599269177, + "loss": 0.87172186, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 0.49047852, + "step": 813, + "time_per_iteration": 2.6313953399658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144128, + "balance_loss_mlp": 1.09851837, + "epoch": 0.1565986918045402, + "flos": 637478843904.0, + "grad_norm": 0.06467027207336487, + "language_loss": 0.90443802, + "learning_rate": 0.0009585624682276977, + "loss": 0.91587937, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 0.45605469, + "step": 814, + "time_per_iteration": 2.7377047538757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144046, + "balance_loss_mlp": 1.09705353, + "epoch": 0.15679107348980378, + "flos": 490810876416.0, + "grad_norm": 0.06824176290368998, + "language_loss": 0.89156437, + "learning_rate": 0.0009584381984987386, + "loss": 0.90300483, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 0.47021484, + "step": 815, + "time_per_iteration": 2.5524120330810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134862, + "balance_loss_mlp": 1.09225655, + "epoch": 0.15698345517506734, + "flos": 529951113216.0, + "grad_norm": 0.061358262400161866, + "language_loss": 0.92449033, + "learning_rate": 0.0009583137507882864, + "loss": 0.93583906, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 0.42626953, + "step": 816, + "time_per_iteration": 2.699207305908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134768, + "balance_loss_mlp": 1.08698916, + "epoch": 0.1571758368603309, + "flos": 546038682624.0, + "grad_norm": 0.06309616730716378, + "language_loss": 0.82620019, + "learning_rate": 0.000958189125144656, + "loss": 0.8375479, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 0.47851562, + "step": 817, + "time_per_iteration": 2.6626293659210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142354, + "balance_loss_mlp": 1.09493256, + "epoch": 0.15736821854559446, + "flos": 565649547264.0, + "grad_norm": 0.08013787804574789, + "language_loss": 0.90297949, + "learning_rate": 0.0009580643216162313, + "loss": 0.91440302, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 0.47436523, + "step": 818, + "time_per_iteration": 2.6708288192749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143086, + "balance_loss_mlp": 1.09368527, + "epoch": 0.15756060023085802, + "flos": 500956047360.0, + "grad_norm": 0.06582812199168771, + "language_loss": 0.82167578, + "learning_rate": 0.0009579393402514652, + "loss": 0.83310658, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 0.49389648, + "step": 819, + "time_per_iteration": 2.577592611312866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142904, + "balance_loss_mlp": 1.09898734, + "epoch": 0.15775298191612158, + "flos": 519264857088.0, + "grad_norm": 0.07647809261390527, + "language_loss": 0.92362559, + "learning_rate": 0.0009578141810988801, + "loss": 0.93505466, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 0.43920898, + "step": 820, + "time_per_iteration": 2.5464515686035156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152369, + "balance_loss_mlp": 1.10678363, + "epoch": 0.15794536360138514, + "flos": 466129810944.0, + "grad_norm": 0.07136182637629812, + "language_loss": 0.92042351, + "learning_rate": 0.0009576888442070668, + "loss": 0.93194717, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 0.45556641, + "step": 821, + "time_per_iteration": 2.5755786895751953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114609, + "balance_loss_mlp": 1.10288835, + "epoch": 0.1581377452866487, + "flos": 517162185216.0, + "grad_norm": 0.08295395391365894, + "language_loss": 0.94583452, + "learning_rate": 0.0009575633296246854, + "loss": 0.95729542, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 0.43212891, + "step": 822, + "time_per_iteration": 2.5701425075531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162298, + "balance_loss_mlp": 1.11821485, + "epoch": 0.15833012697191226, + "flos": 549784433664.0, + "grad_norm": 0.06548151577025092, + "language_loss": 0.85385978, + "learning_rate": 0.0009574376374004652, + "loss": 0.86548281, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 0.44116211, + "step": 823, + "time_per_iteration": 2.622905731201172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169738, + "balance_loss_mlp": 1.12019491, + "epoch": 0.15852250865717585, + "flos": 487457906688.0, + "grad_norm": 0.1009087476503521, + "language_loss": 0.82624936, + "learning_rate": 0.000957311767583204, + "loss": 0.83794677, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 0.49536133, + "step": 824, + "time_per_iteration": 2.5683999061584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196161, + "balance_loss_mlp": 1.1752758, + "epoch": 0.1587148903424394, + "flos": 1309770694656.0, + "grad_norm": 0.05150472419389455, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.83267754, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 0.20898438, + "step": 825, + "time_per_iteration": 4.722898960113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176776, + "balance_loss_mlp": 1.12170124, + "epoch": 0.15890727202770297, + "flos": 466873528320.0, + "grad_norm": 0.10062471557735768, + "language_loss": 0.94017303, + "learning_rate": 0.0009570594953650961, + "loss": 0.95194077, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 0.55029297, + "step": 826, + "time_per_iteration": 2.5394840240478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173437, + "balance_loss_mlp": 1.12091362, + "epoch": 0.15909965371296653, + "flos": 777107188224.0, + "grad_norm": 0.0719939675894647, + "language_loss": 0.8219676, + "learning_rate": 0.00095693309306219, + "loss": 0.83370197, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 0.52612305, + "step": 827, + "time_per_iteration": 3.0926811695098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178421, + "balance_loss_mlp": 1.12434745, + "epoch": 0.1592920353982301, + "flos": 1078273451520.0, + "grad_norm": 0.06038838021195225, + "language_loss": 0.90083122, + "learning_rate": 0.0009568065133621244, + "loss": 0.91261542, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 0.54077148, + "step": 828, + "time_per_iteration": 3.315122604370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164888, + "balance_loss_mlp": 1.12013662, + "epoch": 0.15948441708349365, + "flos": 725622935040.0, + "grad_norm": 0.07025990147709567, + "language_loss": 0.87178355, + "learning_rate": 0.0009566797563140422, + "loss": 0.88343245, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 0.44775391, + "step": 829, + "time_per_iteration": 2.8680243492126465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116659, + "balance_loss_mlp": 1.11912107, + "epoch": 0.1596767987687572, + "flos": 578771785728.0, + "grad_norm": 0.061296828426512996, + "language_loss": 0.89984798, + "learning_rate": 0.0009565528219671547, + "loss": 0.91151381, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 0.47460938, + "step": 830, + "time_per_iteration": 2.9325318336486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160899, + "balance_loss_mlp": 1.1076839, + "epoch": 0.15986918045402077, + "flos": 528987511296.0, + "grad_norm": 0.07652275644998038, + "language_loss": 0.86699682, + "learning_rate": 0.0009564257103707418, + "loss": 0.87860584, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 0.53198242, + "step": 831, + "time_per_iteration": 2.598191976547241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184474, + "balance_loss_mlp": 1.12973261, + "epoch": 0.16006156213928435, + "flos": 574584067584.0, + "grad_norm": 0.08337472663089728, + "language_loss": 0.92543364, + "learning_rate": 0.0009562984215741533, + "loss": 0.93727839, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 0.54736328, + "step": 832, + "time_per_iteration": 2.676666736602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160796, + "balance_loss_mlp": 1.11177731, + "epoch": 0.1602539438245479, + "flos": 515541127680.0, + "grad_norm": 0.05762908483075192, + "language_loss": 0.8408711, + "learning_rate": 0.0009561709556268065, + "loss": 0.85247904, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 0.49047852, + "step": 833, + "time_per_iteration": 2.7075538635253906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162528, + "balance_loss_mlp": 1.11141133, + "epoch": 0.16044632550981147, + "flos": 621015745536.0, + "grad_norm": 0.06044842900072245, + "language_loss": 0.96042889, + "learning_rate": 0.0009560433125781884, + "loss": 0.97205412, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 0.51171875, + "step": 834, + "time_per_iteration": 2.7619521617889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144978, + "balance_loss_mlp": 1.09130979, + "epoch": 0.16063870719507503, + "flos": 561078586368.0, + "grad_norm": 0.06441579465763399, + "language_loss": 0.94159138, + "learning_rate": 0.0009559154924778544, + "loss": 0.95304114, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 0.53686523, + "step": 835, + "time_per_iteration": 2.7467222213745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112951, + "balance_loss_mlp": 1.08218372, + "epoch": 0.1608310888803386, + "flos": 805133882880.0, + "grad_norm": 0.07312538570388089, + "language_loss": 0.86469144, + "learning_rate": 0.0009557874953754284, + "loss": 0.87598646, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 0.47314453, + "step": 836, + "time_per_iteration": 3.0907793045043945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126281, + "balance_loss_mlp": 1.07618928, + "epoch": 0.16102347056560215, + "flos": 600587011584.0, + "grad_norm": 0.08101808751207061, + "language_loss": 0.85894346, + "learning_rate": 0.0009556593213206038, + "loss": 0.87020624, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 0.5012207, + "step": 837, + "time_per_iteration": 2.7060487270355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122479, + "balance_loss_mlp": 1.07765627, + "epoch": 0.1612158522508657, + "flos": 553510361088.0, + "grad_norm": 0.060960398488271, + "language_loss": 0.89031309, + "learning_rate": 0.0009555309703631414, + "loss": 0.9015379, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 0.44848633, + "step": 838, + "time_per_iteration": 2.6838622093200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131245, + "balance_loss_mlp": 1.07853079, + "epoch": 0.16140823393612927, + "flos": 555963969024.0, + "grad_norm": 0.0637381399971671, + "language_loss": 0.88547724, + "learning_rate": 0.0009554024425528722, + "loss": 0.89678967, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 0.52685547, + "step": 839, + "time_per_iteration": 2.7301504611968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124486, + "balance_loss_mlp": 1.07978272, + "epoch": 0.16160061562139286, + "flos": 543871770624.0, + "grad_norm": 0.0692663948027758, + "language_loss": 0.90811443, + "learning_rate": 0.0009552737379396948, + "loss": 0.91935933, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 0.44726562, + "step": 840, + "time_per_iteration": 2.6181893348693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129368, + "balance_loss_mlp": 1.08208978, + "epoch": 0.16179299730665642, + "flos": 603873169920.0, + "grad_norm": 0.06449676765287365, + "language_loss": 0.89640445, + "learning_rate": 0.0009551448565735767, + "loss": 0.90769809, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 0.47265625, + "step": 841, + "time_per_iteration": 2.731602907180786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135502, + "balance_loss_mlp": 1.08555281, + "epoch": 0.16198537899191998, + "flos": 787166097408.0, + "grad_norm": 0.07291825437583387, + "language_loss": 0.86443651, + "learning_rate": 0.0009550157985045543, + "loss": 0.87579155, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 0.49926758, + "step": 842, + "time_per_iteration": 3.0523600578308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113388, + "balance_loss_mlp": 1.08724499, + "epoch": 0.16217776067718354, + "flos": 519805942272.0, + "grad_norm": 0.06222432903322319, + "language_loss": 0.90556312, + "learning_rate": 0.0009548865637827321, + "loss": 0.91690183, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 0.46630859, + "step": 843, + "time_per_iteration": 2.6370396614074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113271, + "balance_loss_mlp": 1.08757734, + "epoch": 0.1623701423624471, + "flos": 505262707200.0, + "grad_norm": 0.07459586377830821, + "language_loss": 0.91347718, + "learning_rate": 0.0009547571524582838, + "loss": 0.92480427, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 0.45141602, + "step": 844, + "time_per_iteration": 2.5717859268188477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142931, + "balance_loss_mlp": 1.09460354, + "epoch": 0.16256252404771065, + "flos": 497183132160.0, + "grad_norm": 0.08463351541898638, + "language_loss": 0.94371468, + "learning_rate": 0.0009546275645814512, + "loss": 0.95514405, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 0.48339844, + "step": 845, + "time_per_iteration": 2.632861375808716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117157, + "balance_loss_mlp": 1.12107265, + "epoch": 0.16275490573297421, + "flos": 502344737280.0, + "grad_norm": 0.08033911629378378, + "language_loss": 0.92129737, + "learning_rate": 0.0009544978002025446, + "loss": 0.93301302, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 0.50561523, + "step": 846, + "time_per_iteration": 2.7044737339019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193795, + "balance_loss_mlp": 1.14096177, + "epoch": 0.16294728741823777, + "flos": 507231756288.0, + "grad_norm": 0.052695226385161484, + "language_loss": 0.88037688, + "learning_rate": 0.0009543678593719434, + "loss": 0.89231491, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 0.52880859, + "step": 847, + "time_per_iteration": 2.798231601715088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208721, + "balance_loss_mlp": 1.15734136, + "epoch": 0.16313966910350133, + "flos": 509685364224.0, + "grad_norm": 0.056853368929671785, + "language_loss": 0.88963962, + "learning_rate": 0.0009542377421400945, + "loss": 0.90172684, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 0.51391602, + "step": 848, + "time_per_iteration": 2.7955727577209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122402, + "balance_loss_mlp": 1.16584587, + "epoch": 0.16333205078876492, + "flos": 543980427264.0, + "grad_norm": 0.06352967983147602, + "language_loss": 0.85259467, + "learning_rate": 0.0009541074485575145, + "loss": 0.86483485, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 0.58154297, + "step": 849, + "time_per_iteration": 2.703871488571167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225002, + "balance_loss_mlp": 1.17088127, + "epoch": 0.16352443247402848, + "flos": 507723655680.0, + "grad_norm": 0.07774946886845908, + "language_loss": 0.93468195, + "learning_rate": 0.0009539769786747874, + "loss": 0.94693196, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 0.54125977, + "step": 850, + "time_per_iteration": 2.6687557697296143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012154, + "balance_loss_mlp": 1.16130245, + "epoch": 0.16371681415929204, + "flos": 542124804096.0, + "grad_norm": 0.057605035940766894, + "language_loss": 0.82393861, + "learning_rate": 0.0009538463325425665, + "loss": 0.83609259, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 0.54101562, + "step": 851, + "time_per_iteration": 2.751335382461548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199288, + "balance_loss_mlp": 1.1491015, + "epoch": 0.1639091958445556, + "flos": 520752291840.0, + "grad_norm": 0.06621147850271279, + "language_loss": 0.87526274, + "learning_rate": 0.0009537155102115728, + "loss": 0.88725561, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 0.50170898, + "step": 852, + "time_per_iteration": 2.568573474884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168884, + "balance_loss_mlp": 1.12236834, + "epoch": 0.16410157752981916, + "flos": 547414889472.0, + "grad_norm": 0.07419725806034035, + "language_loss": 0.85374665, + "learning_rate": 0.0009535845117325961, + "loss": 0.86543554, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 0.46533203, + "step": 853, + "time_per_iteration": 2.628973960876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137862, + "balance_loss_mlp": 1.09511375, + "epoch": 0.16429395921508272, + "flos": 582853791744.0, + "grad_norm": 0.05551255594321189, + "language_loss": 0.94495642, + "learning_rate": 0.0009534533371564946, + "loss": 0.95633507, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 0.42724609, + "step": 854, + "time_per_iteration": 2.780510902404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133546, + "balance_loss_mlp": 1.09003448, + "epoch": 0.16448634090034628, + "flos": 530934538752.0, + "grad_norm": 0.08632067881035285, + "language_loss": 0.90547508, + "learning_rate": 0.0009533219865341949, + "loss": 0.91681051, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 0.43530273, + "step": 855, + "time_per_iteration": 2.583874464035034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116377, + "balance_loss_mlp": 1.07188785, + "epoch": 0.16467872258560984, + "flos": 491890475520.0, + "grad_norm": 0.06082853882497287, + "language_loss": 0.88071746, + "learning_rate": 0.0009531904599166916, + "loss": 0.89188123, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 0.44482422, + "step": 856, + "time_per_iteration": 2.626354217529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107396, + "balance_loss_mlp": 1.06231081, + "epoch": 0.16487110427087343, + "flos": 506263385088.0, + "grad_norm": 0.0709999882269981, + "language_loss": 0.86807954, + "learning_rate": 0.0009530587573550478, + "loss": 0.87915355, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 0.45068359, + "step": 857, + "time_per_iteration": 2.5761454105377197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142125, + "balance_loss_mlp": 1.11237001, + "epoch": 0.16506348595613698, + "flos": 1432824712704.0, + "grad_norm": 0.04095057850479287, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75461513, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 0.296875, + "step": 858, + "time_per_iteration": 5.055138349533081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101038, + "balance_loss_mlp": 1.06165087, + "epoch": 0.16525586764140054, + "flos": 477129927168.0, + "grad_norm": 0.08838989258306214, + "language_loss": 0.91845137, + "learning_rate": 0.0009527948246039337, + "loss": 0.92946172, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 0.39379883, + "step": 859, + "time_per_iteration": 2.582608461380005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111338, + "balance_loss_mlp": 1.0715934, + "epoch": 0.1654482493266641, + "flos": 881096942592.0, + "grad_norm": 0.06489567580347368, + "language_loss": 0.89263308, + "learning_rate": 0.000952662594516931, + "loss": 0.90374649, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 0.39746094, + "step": 860, + "time_per_iteration": 3.067707061767578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110521, + "balance_loss_mlp": 1.07018054, + "epoch": 0.16564063101192766, + "flos": 626841773568.0, + "grad_norm": 0.055059247831062384, + "language_loss": 0.88479781, + "learning_rate": 0.0009525301886907234, + "loss": 0.89590299, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 0.40307617, + "step": 861, + "time_per_iteration": 2.8873865604400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112184, + "balance_loss_mlp": 1.07758975, + "epoch": 0.16583301269719122, + "flos": 561518355456.0, + "grad_norm": 0.06995538812096423, + "language_loss": 0.89499515, + "learning_rate": 0.0009523976071767155, + "loss": 0.90621358, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 0.44262695, + "step": 862, + "time_per_iteration": 2.6588613986968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124987, + "balance_loss_mlp": 1.08183372, + "epoch": 0.16602539438245478, + "flos": 567803976192.0, + "grad_norm": 0.06313062043432274, + "language_loss": 0.89038265, + "learning_rate": 0.00095226485002638, + "loss": 0.90163255, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 0.43115234, + "step": 863, + "time_per_iteration": 2.797896146774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113265, + "balance_loss_mlp": 1.07232881, + "epoch": 0.16621777606771834, + "flos": 574875532800.0, + "grad_norm": 0.054774526957085325, + "language_loss": 0.90381318, + "learning_rate": 0.0009521319172912576, + "loss": 0.91494584, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 0.40917969, + "step": 864, + "time_per_iteration": 2.7238612174987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126771, + "balance_loss_mlp": 1.08132839, + "epoch": 0.16641015775298193, + "flos": 514552932864.0, + "grad_norm": 0.05854649520245602, + "language_loss": 0.96491337, + "learning_rate": 0.0009519988090229579, + "loss": 0.97618109, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 0.4543457, + "step": 865, + "time_per_iteration": 2.683509111404419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123132, + "balance_loss_mlp": 1.07907248, + "epoch": 0.1666025394382455, + "flos": 621685310976.0, + "grad_norm": 0.05699467986566688, + "language_loss": 0.89545953, + "learning_rate": 0.0009518655252731576, + "loss": 0.90669084, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 0.44067383, + "step": 866, + "time_per_iteration": 2.729865550994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131126, + "balance_loss_mlp": 1.08456326, + "epoch": 0.16679492112350905, + "flos": 548808348672.0, + "grad_norm": 0.06482393342324422, + "language_loss": 0.9171015, + "learning_rate": 0.0009517320660936022, + "loss": 0.9284128, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 0.46557617, + "step": 867, + "time_per_iteration": 2.732815742492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133142, + "balance_loss_mlp": 1.08843839, + "epoch": 0.1669873028087726, + "flos": 665675864064.0, + "grad_norm": 0.06614373571764609, + "language_loss": 0.84472704, + "learning_rate": 0.0009515984315361051, + "loss": 0.85605848, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 0.44702148, + "step": 868, + "time_per_iteration": 2.796868085861206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121806, + "balance_loss_mlp": 1.07657838, + "epoch": 0.16717968449403617, + "flos": 538564432896.0, + "grad_norm": 0.08270078218547869, + "language_loss": 0.88773656, + "learning_rate": 0.000951464621652548, + "loss": 0.89895463, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 0.45239258, + "step": 869, + "time_per_iteration": 2.666438341140747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141117, + "balance_loss_mlp": 1.09751046, + "epoch": 0.16737206617929973, + "flos": 530121438720.0, + "grad_norm": 0.06072661062765564, + "language_loss": 0.80103016, + "learning_rate": 0.0009513306364948804, + "loss": 0.81244129, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 0.43579102, + "step": 870, + "time_per_iteration": 2.799009084701538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148868, + "balance_loss_mlp": 1.10373545, + "epoch": 0.1675644478645633, + "flos": 480774362112.0, + "grad_norm": 0.09261319168225486, + "language_loss": 0.90277344, + "learning_rate": 0.0009511964761151197, + "loss": 0.91426206, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 0.45117188, + "step": 871, + "time_per_iteration": 2.5934712886810303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158743, + "balance_loss_mlp": 1.1145407, + "epoch": 0.16775682954982685, + "flos": 494556627456.0, + "grad_norm": 0.06739805293344515, + "language_loss": 0.91524243, + "learning_rate": 0.0009510621405653521, + "loss": 0.92682987, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 0.44213867, + "step": 872, + "time_per_iteration": 2.5557620525360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156856, + "balance_loss_mlp": 1.11627746, + "epoch": 0.1679492112350904, + "flos": 752035912704.0, + "grad_norm": 0.06267535529199315, + "language_loss": 0.85553813, + "learning_rate": 0.0009509276298977309, + "loss": 0.86710668, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 0.40576172, + "step": 873, + "time_per_iteration": 2.9965007305145264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187981, + "balance_loss_mlp": 1.13760364, + "epoch": 0.168141592920354, + "flos": 1135875571200.0, + "grad_norm": 0.07409010972210926, + "language_loss": 0.82916558, + "learning_rate": 0.0009507929441644778, + "loss": 0.84104538, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 0.50415039, + "step": 874, + "time_per_iteration": 3.5573699474334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118449, + "balance_loss_mlp": 1.14097893, + "epoch": 0.16833397460561755, + "flos": 632401302528.0, + "grad_norm": 0.07388150752212762, + "language_loss": 0.8737148, + "learning_rate": 0.0009506580834178826, + "loss": 0.88555974, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 0.43530273, + "step": 875, + "time_per_iteration": 2.7659120559692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215839, + "balance_loss_mlp": 1.16841793, + "epoch": 0.1685263562908811, + "flos": 541445326848.0, + "grad_norm": 0.06935842584614806, + "language_loss": 0.92793226, + "learning_rate": 0.0009505230477103028, + "loss": 0.94009066, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 0.47436523, + "step": 876, + "time_per_iteration": 2.7306137084960938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226641, + "balance_loss_mlp": 1.18224776, + "epoch": 0.16871873797614467, + "flos": 619325678592.0, + "grad_norm": 0.10053146783154573, + "language_loss": 0.82997662, + "learning_rate": 0.0009503878370941641, + "loss": 0.84224302, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 0.44433594, + "step": 877, + "time_per_iteration": 2.7356183528900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211684, + "balance_loss_mlp": 1.16793382, + "epoch": 0.16891111966140823, + "flos": 606344030208.0, + "grad_norm": 0.10508781605450683, + "language_loss": 0.9020679, + "learning_rate": 0.0009502524516219595, + "loss": 0.91418481, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 0.43798828, + "step": 878, + "time_per_iteration": 2.7525370121002197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185028, + "balance_loss_mlp": 1.14232683, + "epoch": 0.1691035013466718, + "flos": 552326874624.0, + "grad_norm": 0.07887273759437702, + "language_loss": 0.91364408, + "learning_rate": 0.0009501168913462506, + "loss": 0.92549431, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 0.42724609, + "step": 879, + "time_per_iteration": 2.7009639739990234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115086, + "balance_loss_mlp": 1.11919844, + "epoch": 0.16929588303193535, + "flos": 1476294377472.0, + "grad_norm": 0.04902821320434346, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.80272782, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 0.31640625, + "step": 880, + "time_per_iteration": 4.812703609466553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116281, + "balance_loss_mlp": 1.11748707, + "epoch": 0.1694882647171989, + "flos": 926248587264.0, + "grad_norm": 0.06555145426806878, + "language_loss": 0.86756283, + "learning_rate": 0.0009498452465949042, + "loss": 0.87919092, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 0.453125, + "step": 881, + "time_per_iteration": 3.230407476425171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159747, + "balance_loss_mlp": 1.1133033, + "epoch": 0.1696806464024625, + "flos": 546093010944.0, + "grad_norm": 0.0753185527775994, + "language_loss": 0.92756218, + "learning_rate": 0.0009497091622247285, + "loss": 0.93915963, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 0.46459961, + "step": 882, + "time_per_iteration": 2.7412030696868896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141259, + "balance_loss_mlp": 1.09734213, + "epoch": 0.16987302808772606, + "flos": 529234560000.0, + "grad_norm": 0.07197762243887564, + "language_loss": 0.94941783, + "learning_rate": 0.0009495729032619723, + "loss": 0.96083045, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 0.43945312, + "step": 883, + "time_per_iteration": 2.6705245971679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141279, + "balance_loss_mlp": 1.09724283, + "epoch": 0.17006540977298962, + "flos": 755178909696.0, + "grad_norm": 0.07033792867334165, + "language_loss": 0.85310471, + "learning_rate": 0.0009494364697595354, + "loss": 0.86451751, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 0.44018555, + "step": 884, + "time_per_iteration": 2.9024457931518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115677, + "balance_loss_mlp": 1.10977769, + "epoch": 0.17025779145825318, + "flos": 558800446464.0, + "grad_norm": 0.0673266035955572, + "language_loss": 0.90739167, + "learning_rate": 0.0009492998617703867, + "loss": 0.91895938, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 0.47045898, + "step": 885, + "time_per_iteration": 2.6497459411621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151127, + "balance_loss_mlp": 1.10813999, + "epoch": 0.17045017314351674, + "flos": 512213124096.0, + "grad_norm": 0.0863252086663651, + "language_loss": 0.89101255, + "learning_rate": 0.0009491630793475619, + "loss": 0.90252388, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 0.42993164, + "step": 886, + "time_per_iteration": 2.6258063316345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159756, + "balance_loss_mlp": 1.11231089, + "epoch": 0.1706425548287803, + "flos": 508941646848.0, + "grad_norm": 0.0686214928272948, + "language_loss": 0.85993534, + "learning_rate": 0.0009490261225441643, + "loss": 0.87153292, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 0.47412109, + "step": 887, + "time_per_iteration": 2.9036519527435303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168173, + "balance_loss_mlp": 1.12370825, + "epoch": 0.17083493651404386, + "flos": 717355408896.0, + "grad_norm": 0.07914830411429463, + "language_loss": 0.91452426, + "learning_rate": 0.0009488889914133656, + "loss": 0.92620599, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 0.44458008, + "step": 888, + "time_per_iteration": 3.0038132667541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155268, + "balance_loss_mlp": 1.10706019, + "epoch": 0.17102731819930742, + "flos": 559121647104.0, + "grad_norm": 0.07300075385020723, + "language_loss": 0.90558064, + "learning_rate": 0.0009487516860084047, + "loss": 0.91713333, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 0.48193359, + "step": 889, + "time_per_iteration": 2.7158679962158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147984, + "balance_loss_mlp": 1.0996089, + "epoch": 0.17121969988457098, + "flos": 494786423808.0, + "grad_norm": 0.09172908653222724, + "language_loss": 0.90068781, + "learning_rate": 0.0009486142063825884, + "loss": 0.91216767, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 0.48364258, + "step": 890, + "time_per_iteration": 2.5330443382263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084927, + "balance_loss_mlp": 1.06175303, + "epoch": 0.17141208156983456, + "flos": 1548889413120.0, + "grad_norm": 0.031797672969882694, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.73511147, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 0.23144531, + "step": 891, + "time_per_iteration": 4.953175783157349 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167611, + "balance_loss_mlp": 1.11835372, + "epoch": 0.17160446325509812, + "flos": 619565386752.0, + "grad_norm": 0.06989736404119995, + "language_loss": 0.91231126, + "learning_rate": 0.0009483387246819542, + "loss": 0.92398739, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 0.49243164, + "step": 892, + "time_per_iteration": 2.7500009536743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010653, + "balance_loss_mlp": 1.0426023, + "epoch": 0.17179684494036168, + "flos": 1381758206976.0, + "grad_norm": 0.022698270048783192, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.83350885, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 0.2265625, + "step": 893, + "time_per_iteration": 4.662828683853149 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166562, + "balance_loss_mlp": 1.12312233, + "epoch": 0.17198922662562524, + "flos": 492636764160.0, + "grad_norm": 0.06047387129149895, + "language_loss": 0.90360647, + "learning_rate": 0.0009480625467392688, + "loss": 0.91527206, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 0.43481445, + "step": 894, + "time_per_iteration": 2.615447521209717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046079, + "balance_loss_mlp": 1.02433491, + "epoch": 0.1721816083108888, + "flos": 1458318878208.0, + "grad_norm": 0.017910617622931155, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79040754, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 0.21777344, + "step": 895, + "time_per_iteration": 4.802469968795776 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196327, + "balance_loss_mlp": 1.15264833, + "epoch": 0.17237398999615236, + "flos": 528122654208.0, + "grad_norm": 0.0591778940977726, + "language_loss": 0.88960874, + "learning_rate": 0.0009477856729834196, + "loss": 0.90157199, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 0.43652344, + "step": 896, + "time_per_iteration": 2.743036985397339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214543, + "balance_loss_mlp": 1.17217648, + "epoch": 0.17256637168141592, + "flos": 603920157696.0, + "grad_norm": 0.09709817551063968, + "language_loss": 0.91585428, + "learning_rate": 0.0009476469753098809, + "loss": 0.92799973, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 0.42358398, + "step": 897, + "time_per_iteration": 2.688457489013672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206917, + "balance_loss_mlp": 1.16080689, + "epoch": 0.17275875336667948, + "flos": 509687935488.0, + "grad_norm": 0.08785360527314089, + "language_loss": 0.87616539, + "learning_rate": 0.0009475081038443738, + "loss": 0.88823456, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 0.46118164, + "step": 898, + "time_per_iteration": 2.5958664417266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178905, + "balance_loss_mlp": 1.13436794, + "epoch": 0.17295113505194307, + "flos": 665260687872.0, + "grad_norm": 0.08099470404026293, + "language_loss": 0.87109447, + "learning_rate": 0.0009473690586408124, + "loss": 0.88288355, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 0.44482422, + "step": 899, + "time_per_iteration": 2.885279417037964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176953, + "balance_loss_mlp": 1.13184392, + "epoch": 0.17314351673720663, + "flos": 555385807872.0, + "grad_norm": 0.060075693842180825, + "language_loss": 0.87349975, + "learning_rate": 0.0009472298397531792, + "loss": 0.88526928, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 0.45141602, + "step": 900, + "time_per_iteration": 2.6987335681915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117213, + "balance_loss_mlp": 1.12244344, + "epoch": 0.17333589842247019, + "flos": 503609716224.0, + "grad_norm": 0.06597136758704356, + "language_loss": 0.87749296, + "learning_rate": 0.0009470904472355235, + "loss": 0.88921428, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 0.49707031, + "step": 901, + "time_per_iteration": 2.6920526027679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133522, + "balance_loss_mlp": 1.08898544, + "epoch": 0.17352828010773375, + "flos": 556208446464.0, + "grad_norm": 0.06929151708835651, + "language_loss": 0.8084361, + "learning_rate": 0.0009469508811419626, + "loss": 0.81977129, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 0.44555664, + "step": 902, + "time_per_iteration": 2.7087764739990234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037611, + "balance_loss_mlp": 1.01825094, + "epoch": 0.1737206617929973, + "flos": 1554525292032.0, + "grad_norm": 0.018918236495105482, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.7265144, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 0.19335938, + "step": 903, + "time_per_iteration": 4.831868648529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130376, + "balance_loss_mlp": 1.08429003, + "epoch": 0.17391304347826086, + "flos": 516662945280.0, + "grad_norm": 0.06904883588321564, + "language_loss": 0.84871197, + "learning_rate": 0.0009466712284439292, + "loss": 0.86001575, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 0.46118164, + "step": 904, + "time_per_iteration": 2.727154493331909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135856, + "balance_loss_mlp": 1.08867335, + "epoch": 0.17410542516352442, + "flos": 541049974272.0, + "grad_norm": 0.0797697294198037, + "language_loss": 0.90077758, + "learning_rate": 0.0009465311419480276, + "loss": 0.9121362, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 0.47216797, + "step": 905, + "time_per_iteration": 2.659696340560913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130539, + "balance_loss_mlp": 1.0859549, + "epoch": 0.17429780684878798, + "flos": 623849651712.0, + "grad_norm": 0.0780460064240459, + "language_loss": 0.89685637, + "learning_rate": 0.0009463908820933622, + "loss": 0.90816176, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 0.44604492, + "step": 906, + "time_per_iteration": 2.845508337020874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153159, + "balance_loss_mlp": 1.10657179, + "epoch": 0.17449018853405157, + "flos": 575663666688.0, + "grad_norm": 0.06621529993663824, + "language_loss": 0.83420271, + "learning_rate": 0.0009462504489343868, + "loss": 0.84573436, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 0.46582031, + "step": 907, + "time_per_iteration": 2.7415342330932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152101, + "balance_loss_mlp": 1.10246193, + "epoch": 0.17468257021931513, + "flos": 533753763840.0, + "grad_norm": 0.0823987818854668, + "language_loss": 0.9018122, + "learning_rate": 0.0009461098425256222, + "loss": 0.91333324, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 0.49633789, + "step": 908, + "time_per_iteration": 2.5904529094696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160396, + "balance_loss_mlp": 1.11457169, + "epoch": 0.1748749519045787, + "flos": 540758509056.0, + "grad_norm": 0.0762262609163865, + "language_loss": 0.87090451, + "learning_rate": 0.0009459690629216567, + "loss": 0.88250846, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 0.45874023, + "step": 909, + "time_per_iteration": 2.61710524559021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155202, + "balance_loss_mlp": 1.10921121, + "epoch": 0.17506733358984225, + "flos": 498623579136.0, + "grad_norm": 0.06657664395828655, + "language_loss": 0.88943893, + "learning_rate": 0.0009458281101771457, + "loss": 0.90099096, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 0.46020508, + "step": 910, + "time_per_iteration": 2.6421282291412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176316, + "balance_loss_mlp": 1.12810779, + "epoch": 0.1752597152751058, + "flos": 622923125760.0, + "grad_norm": 0.08799417436837091, + "language_loss": 0.8354404, + "learning_rate": 0.0009456869843468122, + "loss": 0.84720349, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 0.48217773, + "step": 911, + "time_per_iteration": 2.8633837699890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178078, + "balance_loss_mlp": 1.12688971, + "epoch": 0.17545209696036937, + "flos": 520972176384.0, + "grad_norm": 0.08410877580390771, + "language_loss": 0.79552639, + "learning_rate": 0.0009455456854854459, + "loss": 0.80730712, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 0.51220703, + "step": 912, + "time_per_iteration": 2.661038875579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180916, + "balance_loss_mlp": 1.13564038, + "epoch": 0.17564447864563293, + "flos": 461988707328.0, + "grad_norm": 0.17307911593328887, + "language_loss": 0.85480136, + "learning_rate": 0.0009454042136479039, + "loss": 0.86661053, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 0.45263672, + "step": 913, + "time_per_iteration": 2.561790943145752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198041, + "balance_loss_mlp": 1.15183568, + "epoch": 0.1758368603308965, + "flos": 480655793664.0, + "grad_norm": 0.06959724621682493, + "language_loss": 0.8438077, + "learning_rate": 0.0009452625688891103, + "loss": 0.85578811, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 0.4621582, + "step": 914, + "time_per_iteration": 2.5396227836608887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092507, + "balance_loss_mlp": 1.07600832, + "epoch": 0.17602924201616005, + "flos": 1478942903808.0, + "grad_norm": 0.034614734916794516, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79827243, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 0.16503906, + "step": 915, + "time_per_iteration": 4.550157308578491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264875, + "balance_loss_mlp": 1.21347213, + "epoch": 0.17622162370142364, + "flos": 602301671424.0, + "grad_norm": 0.08235911171958209, + "language_loss": 0.94223297, + "learning_rate": 0.0009449787608278015, + "loss": 0.95488179, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 0.51489258, + "step": 916, + "time_per_iteration": 2.8292665481567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243722, + "balance_loss_mlp": 1.19525158, + "epoch": 0.1764140053866872, + "flos": 442699043328.0, + "grad_norm": 0.08361954447634375, + "language_loss": 0.9338274, + "learning_rate": 0.0009448365976354704, + "loss": 0.94626462, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 0.48461914, + "step": 917, + "time_per_iteration": 2.543883800506592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216482, + "balance_loss_mlp": 1.16622329, + "epoch": 0.17660638707195075, + "flos": 500607682560.0, + "grad_norm": 0.08482517786251102, + "language_loss": 0.91736883, + "learning_rate": 0.0009446942617422558, + "loss": 0.9295336, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 0.50317383, + "step": 918, + "time_per_iteration": 2.6130669116973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118235, + "balance_loss_mlp": 1.13740778, + "epoch": 0.17679876875721431, + "flos": 538892974080.0, + "grad_norm": 0.07957198864097685, + "language_loss": 0.8648746, + "learning_rate": 0.0009445517532034176, + "loss": 0.87669808, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 0.44970703, + "step": 919, + "time_per_iteration": 2.7341010570526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116033, + "balance_loss_mlp": 1.11002386, + "epoch": 0.17699115044247787, + "flos": 497724217344.0, + "grad_norm": 0.08371374964142012, + "language_loss": 0.9020586, + "learning_rate": 0.0009444090720742824, + "loss": 0.9136619, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 0.50341797, + "step": 920, + "time_per_iteration": 2.628169298171997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158892, + "balance_loss_mlp": 1.1083951, + "epoch": 0.17718353212774143, + "flos": 662738070528.0, + "grad_norm": 0.07483188289837522, + "language_loss": 0.89025688, + "learning_rate": 0.0009442662184102439, + "loss": 0.90184581, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 0.50512695, + "step": 921, + "time_per_iteration": 2.7538435459136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154828, + "balance_loss_mlp": 1.11210358, + "epoch": 0.177375913813005, + "flos": 582641247744.0, + "grad_norm": 0.05276545299780942, + "language_loss": 0.88537991, + "learning_rate": 0.000944123192266763, + "loss": 0.89692819, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 0.42724609, + "step": 922, + "time_per_iteration": 2.788759469985962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190709, + "balance_loss_mlp": 1.13887644, + "epoch": 0.17756829549826855, + "flos": 552564011520.0, + "grad_norm": 0.07681776188261369, + "language_loss": 0.84657156, + "learning_rate": 0.0009439799936993671, + "loss": 0.85847867, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 0.51904297, + "step": 923, + "time_per_iteration": 2.7123734951019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196866, + "balance_loss_mlp": 1.14787149, + "epoch": 0.17776067718353214, + "flos": 556322245632.0, + "grad_norm": 0.09732559260361714, + "language_loss": 0.89131558, + "learning_rate": 0.0009438366227636511, + "loss": 0.90328419, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 0.49047852, + "step": 924, + "time_per_iteration": 2.6907341480255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171193, + "balance_loss_mlp": 1.12396216, + "epoch": 0.1779530588687957, + "flos": 658458574848.0, + "grad_norm": 0.07379366042998667, + "language_loss": 0.86971134, + "learning_rate": 0.0009436930795152763, + "loss": 0.88142323, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 0.47241211, + "step": 925, + "time_per_iteration": 2.865673065185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168215, + "balance_loss_mlp": 1.12174773, + "epoch": 0.17814544055405926, + "flos": 644483589120.0, + "grad_norm": 0.07469970420174622, + "language_loss": 0.8767308, + "learning_rate": 0.0009435493640099713, + "loss": 0.88841295, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 0.46411133, + "step": 926, + "time_per_iteration": 2.779188394546509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154616, + "balance_loss_mlp": 1.10388088, + "epoch": 0.17833782223932282, + "flos": 460913877504.0, + "grad_norm": 0.06972760602295516, + "language_loss": 0.85458124, + "learning_rate": 0.0009434054763035314, + "loss": 0.86612737, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 0.50756836, + "step": 927, + "time_per_iteration": 2.5972957611083984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147995, + "balance_loss_mlp": 1.09983397, + "epoch": 0.17853020392458638, + "flos": 759539897856.0, + "grad_norm": 0.05666425765353489, + "language_loss": 0.86302543, + "learning_rate": 0.0009432614164518185, + "loss": 0.8745054, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 0.48168945, + "step": 928, + "time_per_iteration": 3.0064406394958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150828, + "balance_loss_mlp": 1.09780383, + "epoch": 0.17872258560984994, + "flos": 782666717184.0, + "grad_norm": 0.07484249942420804, + "language_loss": 0.85464913, + "learning_rate": 0.000943117184510762, + "loss": 0.86615741, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 0.53027344, + "step": 929, + "time_per_iteration": 2.9855945110321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124448, + "balance_loss_mlp": 1.10556555, + "epoch": 0.1789149672951135, + "flos": 1459880464896.0, + "grad_norm": 0.03465095249088487, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.79914415, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 0.18847656, + "step": 930, + "time_per_iteration": 5.016055583953857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148447, + "balance_loss_mlp": 1.09997642, + "epoch": 0.17910734898037706, + "flos": 503864105472.0, + "grad_norm": 0.07304481613225793, + "language_loss": 0.89790976, + "learning_rate": 0.0009428282045846674, + "loss": 0.90939426, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 0.48461914, + "step": 931, + "time_per_iteration": 2.787473678588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134009, + "balance_loss_mlp": 1.08797026, + "epoch": 0.17929973066564064, + "flos": 746249158656.0, + "grad_norm": 0.05043968313129053, + "language_loss": 0.90432143, + "learning_rate": 0.0009426834567118214, + "loss": 0.91566151, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 0.46044922, + "step": 932, + "time_per_iteration": 3.1106340885162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149699, + "balance_loss_mlp": 1.10091829, + "epoch": 0.1794921123509042, + "flos": 713214305280.0, + "grad_norm": 0.0884624873286247, + "language_loss": 0.81563932, + "learning_rate": 0.0009425385369740155, + "loss": 0.82713628, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 0.48779297, + "step": 933, + "time_per_iteration": 3.056328296661377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164469, + "balance_loss_mlp": 1.1138767, + "epoch": 0.17968449403616776, + "flos": 633142448640.0, + "grad_norm": 0.0672899912264689, + "language_loss": 0.88411558, + "learning_rate": 0.0009423934454275125, + "loss": 0.8957603, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 0.50561523, + "step": 934, + "time_per_iteration": 2.827507495880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162381, + "balance_loss_mlp": 1.11333871, + "epoch": 0.17987687572143132, + "flos": 536323368960.0, + "grad_norm": 0.07880287247644589, + "language_loss": 0.92845738, + "learning_rate": 0.0009422481821286418, + "loss": 0.94008112, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 0.49072266, + "step": 935, + "time_per_iteration": 2.7188265323638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164456, + "balance_loss_mlp": 1.11918044, + "epoch": 0.18006925740669488, + "flos": 538077676032.0, + "grad_norm": 0.07978340192275198, + "language_loss": 0.88968349, + "learning_rate": 0.0009421027471337998, + "loss": 0.90132797, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 0.45239258, + "step": 936, + "time_per_iteration": 2.6140947341918945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176728, + "balance_loss_mlp": 1.1271131, + "epoch": 0.18026163909195844, + "flos": 539510782464.0, + "grad_norm": 0.07049523693926517, + "language_loss": 0.83782339, + "learning_rate": 0.0009419571404994493, + "loss": 0.84959066, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 0.49584961, + "step": 937, + "time_per_iteration": 2.641847610473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162302, + "balance_loss_mlp": 1.11354589, + "epoch": 0.180454020777222, + "flos": 500642187264.0, + "grad_norm": 0.06745021535989586, + "language_loss": 0.91665328, + "learning_rate": 0.00094181136228212, + "loss": 0.92827624, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 0.48803711, + "step": 938, + "time_per_iteration": 2.622314453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146811, + "balance_loss_mlp": 1.10334706, + "epoch": 0.18064640246248556, + "flos": 498952120320.0, + "grad_norm": 0.06209482952821168, + "language_loss": 0.87085009, + "learning_rate": 0.0009416654125384077, + "loss": 0.88231826, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 0.43432617, + "step": 939, + "time_per_iteration": 2.735565423965454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167753, + "balance_loss_mlp": 1.15230346, + "epoch": 0.18083878414774912, + "flos": 1519313988096.0, + "grad_norm": 0.039552666267989665, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.80940127, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 0.15429688, + "step": 940, + "time_per_iteration": 4.9464662075042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147706, + "balance_loss_mlp": 1.10293126, + "epoch": 0.1810311658330127, + "flos": 727337594880.0, + "grad_norm": 0.06405620484007693, + "language_loss": 0.85002685, + "learning_rate": 0.000941372998698552, + "loss": 0.86150396, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 0.44750977, + "step": 941, + "time_per_iteration": 2.9421255588531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152914, + "balance_loss_mlp": 1.10344219, + "epoch": 0.18122354751827627, + "flos": 564923082240.0, + "grad_norm": 0.07883971857950696, + "language_loss": 0.82437575, + "learning_rate": 0.0009412265347159336, + "loss": 0.8359049, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 0.49487305, + "step": 942, + "time_per_iteration": 2.727071762084961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135445, + "balance_loss_mlp": 1.09083664, + "epoch": 0.18141592920353983, + "flos": 519282109440.0, + "grad_norm": 0.10057326993772005, + "language_loss": 0.85614288, + "learning_rate": 0.0009410798994339829, + "loss": 0.86749732, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 0.44604492, + "step": 943, + "time_per_iteration": 2.6305696964263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134993, + "balance_loss_mlp": 1.09248304, + "epoch": 0.1816083108888034, + "flos": 512470084608.0, + "grad_norm": 0.05478952043416941, + "language_loss": 0.88907182, + "learning_rate": 0.000940933092909628, + "loss": 0.90042174, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 0.42529297, + "step": 944, + "time_per_iteration": 2.631101369857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149384, + "balance_loss_mlp": 1.10530019, + "epoch": 0.18180069257406695, + "flos": 492389715456.0, + "grad_norm": 0.06051663433249254, + "language_loss": 0.84961444, + "learning_rate": 0.0009407861151998649, + "loss": 0.8611083, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 0.44067383, + "step": 945, + "time_per_iteration": 2.5717978477478027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116837, + "balance_loss_mlp": 1.12040067, + "epoch": 0.1819930742593305, + "flos": 570158839296.0, + "grad_norm": 0.06666982795430461, + "language_loss": 0.87044382, + "learning_rate": 0.0009406389663617552, + "loss": 0.88212758, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 0.47998047, + "step": 946, + "time_per_iteration": 2.6768407821655273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170796, + "balance_loss_mlp": 1.12757087, + "epoch": 0.18218545594459407, + "flos": 605975841792.0, + "grad_norm": 0.0759743739596538, + "language_loss": 0.87192827, + "learning_rate": 0.000940491646452427, + "loss": 0.88363624, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 0.43212891, + "step": 947, + "time_per_iteration": 2.7174758911132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174804, + "balance_loss_mlp": 1.1271199, + "epoch": 0.18237783762985763, + "flos": 548682439680.0, + "grad_norm": 0.06285362616764655, + "language_loss": 0.91503757, + "learning_rate": 0.000940344155529075, + "loss": 0.92678559, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 0.47680664, + "step": 948, + "time_per_iteration": 2.6130924224853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175811, + "balance_loss_mlp": 1.12643504, + "epoch": 0.1825702193151212, + "flos": 450741542400.0, + "grad_norm": 0.07182633578445446, + "language_loss": 0.88395435, + "learning_rate": 0.0009401964936489605, + "loss": 0.89571244, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 0.4934082, + "step": 949, + "time_per_iteration": 2.518735885620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154077, + "balance_loss_mlp": 1.11173368, + "epoch": 0.18276260100038477, + "flos": 589245871104.0, + "grad_norm": 0.08616214546245322, + "language_loss": 0.86381257, + "learning_rate": 0.0009400486608694108, + "loss": 0.87535334, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 0.42358398, + "step": 950, + "time_per_iteration": 2.7356269359588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147825, + "balance_loss_mlp": 1.10071373, + "epoch": 0.18295498268564833, + "flos": 787331653632.0, + "grad_norm": 0.05684050086710682, + "language_loss": 0.88146299, + "learning_rate": 0.0009399006572478195, + "loss": 0.89294124, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 0.47119141, + "step": 951, + "time_per_iteration": 3.0829784870147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113898, + "balance_loss_mlp": 1.09449124, + "epoch": 0.1831473643709119, + "flos": 578147010048.0, + "grad_norm": 0.06809630737889293, + "language_loss": 0.91594249, + "learning_rate": 0.0009397524828416468, + "loss": 0.92733228, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 0.44482422, + "step": 952, + "time_per_iteration": 2.710500478744507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141316, + "balance_loss_mlp": 1.09339356, + "epoch": 0.18333974605617545, + "flos": 566889933312.0, + "grad_norm": 0.06814185159234107, + "language_loss": 0.97457635, + "learning_rate": 0.0009396041377084192, + "loss": 0.98598951, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 0.47949219, + "step": 953, + "time_per_iteration": 2.6530585289001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011339, + "balance_loss_mlp": 1.08716977, + "epoch": 0.183532127741439, + "flos": 526993496064.0, + "grad_norm": 0.06688505748067412, + "language_loss": 0.88496006, + "learning_rate": 0.0009394556219057295, + "loss": 0.896299, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 0.46704102, + "step": 954, + "time_per_iteration": 2.662543773651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135904, + "balance_loss_mlp": 1.08948374, + "epoch": 0.18372450942670257, + "flos": 594535956480.0, + "grad_norm": 0.08148035498798997, + "language_loss": 0.84775722, + "learning_rate": 0.0009393069354912362, + "loss": 0.85911626, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 0.46386719, + "step": 955, + "time_per_iteration": 2.7262632846832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139168, + "balance_loss_mlp": 1.0954181, + "epoch": 0.18391689111196613, + "flos": 645032014848.0, + "grad_norm": 0.07343823471440349, + "language_loss": 0.83466816, + "learning_rate": 0.0009391580785226649, + "loss": 0.8460598, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 0.43774414, + "step": 956, + "time_per_iteration": 2.8661141395568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066727, + "balance_loss_mlp": 1.04708123, + "epoch": 0.18410927279722972, + "flos": 1457073349632.0, + "grad_norm": 0.029557521366383285, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.80407178, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 0.19628906, + "step": 957, + "time_per_iteration": 4.751030921936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134727, + "balance_loss_mlp": 1.08978534, + "epoch": 0.18430165448249328, + "flos": 658750040064.0, + "grad_norm": 0.06490118531587029, + "language_loss": 0.87677503, + "learning_rate": 0.0009388598531545196, + "loss": 0.88812232, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 0.44946289, + "step": 958, + "time_per_iteration": 2.8378970623016357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143042, + "balance_loss_mlp": 1.09702718, + "epoch": 0.18449403616775684, + "flos": 517933066752.0, + "grad_norm": 0.07391212127287443, + "language_loss": 0.86896807, + "learning_rate": 0.000938710484870727, + "loss": 0.88039851, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 0.46044922, + "step": 959, + "time_per_iteration": 4.31168794631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128823, + "balance_loss_mlp": 1.08416748, + "epoch": 0.1846864178530204, + "flos": 552749391360.0, + "grad_norm": 0.0638837232249089, + "language_loss": 0.86957002, + "learning_rate": 0.0009385609462644189, + "loss": 0.88085824, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 0.44702148, + "step": 960, + "time_per_iteration": 2.6793572902679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118991, + "balance_loss_mlp": 1.07233214, + "epoch": 0.18487879953828396, + "flos": 466166886912.0, + "grad_norm": 0.07248975394705585, + "language_loss": 0.86711299, + "learning_rate": 0.0009384112373936514, + "loss": 0.87830293, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 0.46679688, + "step": 961, + "time_per_iteration": 2.6220860481262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119858, + "balance_loss_mlp": 1.07334304, + "epoch": 0.18507118122354752, + "flos": 648496212480.0, + "grad_norm": 0.06813544125014795, + "language_loss": 0.92053163, + "learning_rate": 0.0009382613583165467, + "loss": 0.93173021, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 0.46533203, + "step": 962, + "time_per_iteration": 2.8032093048095703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108588, + "balance_loss_mlp": 1.06142831, + "epoch": 0.18526356290881107, + "flos": 626772764160.0, + "grad_norm": 0.07296294799157402, + "language_loss": 0.9064188, + "learning_rate": 0.0009381113090912928, + "loss": 0.91750467, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 0.47167969, + "step": 963, + "time_per_iteration": 2.7358789443969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109929, + "balance_loss_mlp": 1.06741881, + "epoch": 0.18545594459407463, + "flos": 432726769152.0, + "grad_norm": 0.07962159601741099, + "language_loss": 0.90353996, + "learning_rate": 0.000937961089776144, + "loss": 0.91463923, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 0.42480469, + "step": 964, + "time_per_iteration": 2.5761237144470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128672, + "balance_loss_mlp": 1.07924736, + "epoch": 0.1856483262793382, + "flos": 749061043200.0, + "grad_norm": 0.09082243760489998, + "language_loss": 0.83673573, + "learning_rate": 0.0009378107004294208, + "loss": 0.84802246, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 0.49438477, + "step": 965, + "time_per_iteration": 2.9681291580200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132442, + "balance_loss_mlp": 1.08542585, + "epoch": 0.18584070796460178, + "flos": 530326642176.0, + "grad_norm": 0.08405098410424734, + "language_loss": 0.92054594, + "learning_rate": 0.0009376601411095096, + "loss": 0.93187034, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 0.4699707, + "step": 966, + "time_per_iteration": 2.696122407913208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138698, + "balance_loss_mlp": 1.09773731, + "epoch": 0.18603308964986534, + "flos": 483106830336.0, + "grad_norm": 0.07104128547690361, + "language_loss": 0.87554526, + "learning_rate": 0.0009375094118748622, + "loss": 0.88693225, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 0.40991211, + "step": 967, + "time_per_iteration": 2.6025850772857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179223, + "balance_loss_mlp": 1.13373268, + "epoch": 0.1862254713351289, + "flos": 801316551168.0, + "grad_norm": 0.0728928893981835, + "language_loss": 0.91626799, + "learning_rate": 0.0009373585127839976, + "loss": 0.92806023, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 0.45507812, + "step": 968, + "time_per_iteration": 2.9854021072387695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212732, + "balance_loss_mlp": 1.16905367, + "epoch": 0.18641785302039246, + "flos": 478323325440.0, + "grad_norm": 0.08777237711590531, + "language_loss": 0.91368866, + "learning_rate": 0.0009372074438954994, + "loss": 0.92581606, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 0.43652344, + "step": 969, + "time_per_iteration": 2.5014536380767822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211792, + "balance_loss_mlp": 1.16539574, + "epoch": 0.18661023470565602, + "flos": 388911684096.0, + "grad_norm": 0.0704882552763471, + "language_loss": 0.92436379, + "learning_rate": 0.0009370562052680181, + "loss": 0.93648171, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 0.46411133, + "step": 970, + "time_per_iteration": 2.453458070755005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120766, + "balance_loss_mlp": 1.16183591, + "epoch": 0.18680261639091958, + "flos": 564676033536.0, + "grad_norm": 0.07372597108689087, + "language_loss": 0.89988613, + "learning_rate": 0.0009369047969602695, + "loss": 0.91196281, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 0.45825195, + "step": 971, + "time_per_iteration": 2.703948497772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192702, + "balance_loss_mlp": 1.14396954, + "epoch": 0.18699499807618314, + "flos": 479259763200.0, + "grad_norm": 0.08557962606734577, + "language_loss": 0.8750906, + "learning_rate": 0.0009367532190310357, + "loss": 0.88701761, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 0.48657227, + "step": 972, + "time_per_iteration": 4.1564977169036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148896, + "balance_loss_mlp": 1.1052649, + "epoch": 0.1871873797614467, + "flos": 553283136000.0, + "grad_norm": 0.06811184838385763, + "language_loss": 0.89467651, + "learning_rate": 0.0009366014715391644, + "loss": 0.90616548, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 0.43603516, + "step": 973, + "time_per_iteration": 2.695730209350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134701, + "balance_loss_mlp": 1.09307301, + "epoch": 0.18737976144671029, + "flos": 552811060224.0, + "grad_norm": 0.054567817192194557, + "language_loss": 0.84347546, + "learning_rate": 0.0009364495545435693, + "loss": 0.85482252, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 0.41625977, + "step": 974, + "time_per_iteration": 2.828831672668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146218, + "balance_loss_mlp": 1.09970224, + "epoch": 0.18757214313197385, + "flos": 502250761728.0, + "grad_norm": 0.08256927623824414, + "language_loss": 0.89333141, + "learning_rate": 0.0009362974681032297, + "loss": 0.90479362, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 0.46484375, + "step": 975, + "time_per_iteration": 2.5982418060302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143654, + "balance_loss_mlp": 1.09909391, + "epoch": 0.1877645248172374, + "flos": 675010506240.0, + "grad_norm": 0.07754570301250979, + "language_loss": 0.89447427, + "learning_rate": 0.0009361452122771907, + "loss": 0.90591079, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 0.44555664, + "step": 976, + "time_per_iteration": 2.881242275238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133689, + "balance_loss_mlp": 1.08834195, + "epoch": 0.18795690650250096, + "flos": 404989341696.0, + "grad_norm": 0.0965092241218366, + "language_loss": 0.84541976, + "learning_rate": 0.0009359927871245635, + "loss": 0.85675669, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 0.45361328, + "step": 977, + "time_per_iteration": 2.4720265865325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113596, + "balance_loss_mlp": 1.09039843, + "epoch": 0.18814928818776452, + "flos": 637891448832.0, + "grad_norm": 0.09227923665031239, + "language_loss": 0.87538362, + "learning_rate": 0.0009358401927045246, + "loss": 0.88674331, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 0.45581055, + "step": 978, + "time_per_iteration": 2.8225297927856445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140746, + "balance_loss_mlp": 1.0945406, + "epoch": 0.18834166987302808, + "flos": 1138282191360.0, + "grad_norm": 0.05953389716062443, + "language_loss": 0.88990903, + "learning_rate": 0.0009356874290763166, + "loss": 0.90131652, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 0.46264648, + "step": 979, + "time_per_iteration": 3.4754927158355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140535, + "balance_loss_mlp": 1.09494936, + "epoch": 0.18853405155829164, + "flos": 504793202688.0, + "grad_norm": 0.06969100284100371, + "language_loss": 0.89955008, + "learning_rate": 0.0009355344962992474, + "loss": 0.91095543, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 0.45581055, + "step": 980, + "time_per_iteration": 2.6008429527282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138568, + "balance_loss_mlp": 1.09291101, + "epoch": 0.1887264332435552, + "flos": 608177258496.0, + "grad_norm": 0.07021551702573088, + "language_loss": 0.88888156, + "learning_rate": 0.0009353813944326908, + "loss": 0.90026724, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 0.45654297, + "step": 981, + "time_per_iteration": 2.9102253913879395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141833, + "balance_loss_mlp": 1.09352899, + "epoch": 0.1889188149288188, + "flos": 552529506816.0, + "grad_norm": 0.0640154196439605, + "language_loss": 0.83560127, + "learning_rate": 0.0009352281235360863, + "loss": 0.84701967, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 0.4831543, + "step": 982, + "time_per_iteration": 2.690695285797119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149234, + "balance_loss_mlp": 1.10627127, + "epoch": 0.18911119661408235, + "flos": 418559063040.0, + "grad_norm": 0.06254433649037737, + "language_loss": 0.85791624, + "learning_rate": 0.0009350746836689389, + "loss": 0.86940861, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 0.4296875, + "step": 983, + "time_per_iteration": 2.524491548538208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104727, + "balance_loss_mlp": 1.02905524, + "epoch": 0.1893035782993459, + "flos": 1481974299648.0, + "grad_norm": 0.024687708549402564, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82486492, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.18261719, + "step": 984, + "time_per_iteration": 5.200335741043091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156154, + "balance_loss_mlp": 1.1069684, + "epoch": 0.18949595998460947, + "flos": 508467373056.0, + "grad_norm": 0.08202626484000469, + "language_loss": 0.84151661, + "learning_rate": 0.0009347672972613634, + "loss": 0.85307819, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.49145508, + "step": 985, + "time_per_iteration": 2.6939473152160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011517, + "balance_loss_mlp": 1.10756862, + "epoch": 0.18968834166987303, + "flos": 531087611904.0, + "grad_norm": 0.061889675774481866, + "language_loss": 0.8651796, + "learning_rate": 0.0009346133508402735, + "loss": 0.87669659, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.44140625, + "step": 986, + "time_per_iteration": 2.695004463195801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146205, + "balance_loss_mlp": 1.1000948, + "epoch": 0.1898807233551366, + "flos": 499762649088.0, + "grad_norm": 0.07730871241699967, + "language_loss": 0.84821075, + "learning_rate": 0.0009344592356873166, + "loss": 0.85967278, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.46118164, + "step": 987, + "time_per_iteration": 2.635143518447876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143677, + "balance_loss_mlp": 1.0975666, + "epoch": 0.19007310504040015, + "flos": 602220178944.0, + "grad_norm": 0.058246004489727894, + "language_loss": 0.79289091, + "learning_rate": 0.0009343049518623255, + "loss": 0.80432773, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.46142578, + "step": 988, + "time_per_iteration": 2.7257165908813477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126709, + "balance_loss_mlp": 1.08503366, + "epoch": 0.1902654867256637, + "flos": 601651929600.0, + "grad_norm": 0.06464318177286693, + "language_loss": 0.83752143, + "learning_rate": 0.0009341504994251985, + "loss": 0.84878862, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.41674805, + "step": 989, + "time_per_iteration": 2.8336057662963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052089, + "balance_loss_mlp": 1.03692603, + "epoch": 0.19045786841092727, + "flos": 1575784005120.0, + "grad_norm": 0.01962059038868396, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74572587, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.15136719, + "step": 990, + "time_per_iteration": 4.980287551879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118455, + "balance_loss_mlp": 1.07682681, + "epoch": 0.19065025009619085, + "flos": 681634579968.0, + "grad_norm": 0.06360467015426281, + "language_loss": 0.82411575, + "learning_rate": 0.0009338410889544574, + "loss": 0.83530033, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 0.41601562, + "step": 991, + "time_per_iteration": 3.0192768573760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123102, + "balance_loss_mlp": 1.0790422, + "epoch": 0.1908426317814544, + "flos": 602264595456.0, + "grad_norm": 0.06107834506241764, + "language_loss": 0.88440853, + "learning_rate": 0.000933686131040967, + "loss": 0.89563954, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 0.44067383, + "step": 992, + "time_per_iteration": 2.795952796936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118187, + "balance_loss_mlp": 1.07479525, + "epoch": 0.19103501346671797, + "flos": 586308077568.0, + "grad_norm": 0.08075044213119366, + "language_loss": 0.91145802, + "learning_rate": 0.0009335310047555883, + "loss": 0.92263985, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 0.43383789, + "step": 993, + "time_per_iteration": 2.6966800689697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144036, + "balance_loss_mlp": 1.10052443, + "epoch": 0.19122739515198153, + "flos": 545761898496.0, + "grad_norm": 0.06789475617385991, + "language_loss": 0.89048505, + "learning_rate": 0.0009333757101585467, + "loss": 0.90192544, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 0.43554688, + "step": 994, + "time_per_iteration": 2.659120559692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159671, + "balance_loss_mlp": 1.11687493, + "epoch": 0.1914197768372451, + "flos": 521446450176.0, + "grad_norm": 0.05475551086737561, + "language_loss": 0.94071913, + "learning_rate": 0.0009332202473101329, + "loss": 0.95231587, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 0.42822266, + "step": 995, + "time_per_iteration": 2.672307014465332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153956, + "balance_loss_mlp": 1.11011088, + "epoch": 0.19161215852250865, + "flos": 611246103552.0, + "grad_norm": 0.060816834986447306, + "language_loss": 0.8370983, + "learning_rate": 0.0009330646162707028, + "loss": 0.84863788, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 0.4387207, + "step": 996, + "time_per_iteration": 2.7483248710632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155376, + "balance_loss_mlp": 1.11274719, + "epoch": 0.1918045402077722, + "flos": 846660916224.0, + "grad_norm": 0.05013127115514869, + "language_loss": 0.85195571, + "learning_rate": 0.0009329088171006779, + "loss": 0.86350954, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.42626953, + "step": 997, + "time_per_iteration": 3.1445202827453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163134, + "balance_loss_mlp": 1.1197654, + "epoch": 0.19199692189303577, + "flos": 465937090560.0, + "grad_norm": 0.07353815647154911, + "language_loss": 0.86074895, + "learning_rate": 0.0009327528498605446, + "loss": 0.87238026, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 0.43383789, + "step": 998, + "time_per_iteration": 2.536146402359009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159094, + "balance_loss_mlp": 1.11844337, + "epoch": 0.19218930357829936, + "flos": 531576940032.0, + "grad_norm": 0.06861677349241169, + "language_loss": 0.9080506, + "learning_rate": 0.0009325967146108548, + "loss": 0.91964149, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 0.40649414, + "step": 999, + "time_per_iteration": 2.634549617767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151834, + "balance_loss_mlp": 1.11049271, + "epoch": 0.19238168526356292, + "flos": 601624765440.0, + "grad_norm": 0.0672850368289366, + "language_loss": 0.88138115, + "learning_rate": 0.0009324404114122258, + "loss": 0.89289951, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 0.41357422, + "step": 1000, + "time_per_iteration": 2.677651882171631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164653, + "balance_loss_mlp": 1.12221444, + "epoch": 0.19257406694882648, + "flos": 571982155776.0, + "grad_norm": 0.06402741154285656, + "language_loss": 0.8710497, + "learning_rate": 0.0009322839403253397, + "loss": 0.88269627, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 0.42431641, + "step": 1001, + "time_per_iteration": 2.7528679370880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169088, + "balance_loss_mlp": 1.12440836, + "epoch": 0.19276644863409004, + "flos": 801813219840.0, + "grad_norm": 0.07104878229054386, + "language_loss": 0.84949791, + "learning_rate": 0.0009321273014109439, + "loss": 0.86118877, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.44702148, + "step": 1002, + "time_per_iteration": 2.9990484714508057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114889, + "balance_loss_mlp": 1.10523582, + "epoch": 0.1929588303193536, + "flos": 563314507776.0, + "grad_norm": 0.0673469195429183, + "language_loss": 0.85240018, + "learning_rate": 0.0009319704947298513, + "loss": 0.8638891, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.43676758, + "step": 1003, + "time_per_iteration": 2.8755459785461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141118, + "balance_loss_mlp": 1.10127831, + "epoch": 0.19315121200461716, + "flos": 626837004288.0, + "grad_norm": 0.0925310675323854, + "language_loss": 0.89122581, + "learning_rate": 0.0009318135203429393, + "loss": 0.902637, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.3984375, + "step": 1004, + "time_per_iteration": 2.771192789077759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127979, + "balance_loss_mlp": 1.0866611, + "epoch": 0.19334359368988072, + "flos": 517451079168.0, + "grad_norm": 0.05779097302789, + "language_loss": 0.88602638, + "learning_rate": 0.0009316563783111511, + "loss": 0.8973062, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.41308594, + "step": 1005, + "time_per_iteration": 2.7739861011505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113092, + "balance_loss_mlp": 1.08638334, + "epoch": 0.19353597537514428, + "flos": 694080285696.0, + "grad_norm": 0.06006842888316194, + "language_loss": 0.83199531, + "learning_rate": 0.0009314990686954943, + "loss": 0.84330451, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.44506836, + "step": 1006, + "time_per_iteration": 2.935081720352173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140843, + "balance_loss_mlp": 1.09561515, + "epoch": 0.19372835706040784, + "flos": 1210170585600.0, + "grad_norm": 0.0666735983489841, + "language_loss": 0.81657201, + "learning_rate": 0.000931341591557042, + "loss": 0.82798046, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.45263672, + "step": 1007, + "time_per_iteration": 3.7212610244750977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155404, + "balance_loss_mlp": 1.1041683, + "epoch": 0.19392073874567142, + "flos": 520631152128.0, + "grad_norm": 0.08115294197805281, + "language_loss": 0.87899536, + "learning_rate": 0.0009311839469569325, + "loss": 0.89054936, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.51171875, + "step": 1008, + "time_per_iteration": 2.6384472846984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150065, + "balance_loss_mlp": 1.10030699, + "epoch": 0.19411312043093498, + "flos": 588816013824.0, + "grad_norm": 0.07776470075981182, + "language_loss": 0.88065994, + "learning_rate": 0.0009310261349563687, + "loss": 0.89216053, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.49804688, + "step": 1009, + "time_per_iteration": 2.703058958053589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157544, + "balance_loss_mlp": 1.11160064, + "epoch": 0.19430550211619854, + "flos": 579382253568.0, + "grad_norm": 0.05519618089274153, + "language_loss": 0.86250293, + "learning_rate": 0.0009308681556166186, + "loss": 0.87407839, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.45922852, + "step": 1010, + "time_per_iteration": 2.8404791355133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177928, + "balance_loss_mlp": 1.12480855, + "epoch": 0.1944978838014621, + "flos": 621126973440.0, + "grad_norm": 0.10323239067467582, + "language_loss": 0.8870275, + "learning_rate": 0.0009307100089990152, + "loss": 0.89880681, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.53100586, + "step": 1011, + "time_per_iteration": 2.7103512287139893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185201, + "balance_loss_mlp": 1.13530004, + "epoch": 0.19469026548672566, + "flos": 598714136064.0, + "grad_norm": 0.08766026563197518, + "language_loss": 0.84582877, + "learning_rate": 0.0009305516951649568, + "loss": 0.8576808, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.49902344, + "step": 1012, + "time_per_iteration": 2.6905276775360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175674, + "balance_loss_mlp": 1.12818122, + "epoch": 0.19488264717198922, + "flos": 552161318400.0, + "grad_norm": 0.07259628373080033, + "language_loss": 0.87723738, + "learning_rate": 0.0009303932141759057, + "loss": 0.8889941, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.47485352, + "step": 1013, + "time_per_iteration": 2.7738490104675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161359, + "balance_loss_mlp": 1.11200666, + "epoch": 0.19507502885725278, + "flos": 666135456768.0, + "grad_norm": 0.07589756885314788, + "language_loss": 0.84698361, + "learning_rate": 0.0009302345660933902, + "loss": 0.85859716, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.49291992, + "step": 1014, + "time_per_iteration": 2.7809414863586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152692, + "balance_loss_mlp": 1.10579538, + "epoch": 0.19526741054251634, + "flos": 671081946624.0, + "grad_norm": 0.06636914889533592, + "language_loss": 0.85938931, + "learning_rate": 0.0009300757509790026, + "loss": 0.87091625, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.46875, + "step": 1015, + "time_per_iteration": 2.886200189590454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151123, + "balance_loss_mlp": 1.10324848, + "epoch": 0.19545979222777993, + "flos": 447215675904.0, + "grad_norm": 0.08384883211824797, + "language_loss": 0.91210115, + "learning_rate": 0.0009299167688944005, + "loss": 0.92361236, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.47827148, + "step": 1016, + "time_per_iteration": 2.5308799743652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135215, + "balance_loss_mlp": 1.09036839, + "epoch": 0.1956521739130435, + "flos": 569084009472.0, + "grad_norm": 0.07612639660839114, + "language_loss": 0.86733758, + "learning_rate": 0.0009297576199013063, + "loss": 0.87868977, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.44873047, + "step": 1017, + "time_per_iteration": 2.699352264404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156475, + "balance_loss_mlp": 1.14159799, + "epoch": 0.19584455559830705, + "flos": 1455749273088.0, + "grad_norm": 0.04987694814110311, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.74158609, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.1484375, + "step": 1018, + "time_per_iteration": 4.927512168884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099249, + "balance_loss_mlp": 1.08494341, + "epoch": 0.1960369372835706, + "flos": 1591150252032.0, + "grad_norm": 0.032347612483235935, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.80525547, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.14257812, + "step": 1019, + "time_per_iteration": 5.494646787643433 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128896, + "balance_loss_mlp": 1.08855522, + "epoch": 0.19622931896883417, + "flos": 616017125376.0, + "grad_norm": 0.06601293097738069, + "language_loss": 0.87223667, + "learning_rate": 0.0009292791720892659, + "loss": 0.88352561, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.40332031, + "step": 1020, + "time_per_iteration": 2.8718464374542236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133682, + "balance_loss_mlp": 1.08823943, + "epoch": 0.19642170065409773, + "flos": 466201391616.0, + "grad_norm": 0.07136038826441608, + "language_loss": 0.89387941, + "learning_rate": 0.0009291193560807218, + "loss": 0.90521628, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.45483398, + "step": 1021, + "time_per_iteration": 2.588604211807251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132851, + "balance_loss_mlp": 1.09141409, + "epoch": 0.19661408233936128, + "flos": 515289309696.0, + "grad_norm": 0.06738480994857221, + "language_loss": 0.87651652, + "learning_rate": 0.0009289593734732688, + "loss": 0.88784504, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.41430664, + "step": 1022, + "time_per_iteration": 2.5915818214416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129633, + "balance_loss_mlp": 1.09036541, + "epoch": 0.19680646402462484, + "flos": 392640182784.0, + "grad_norm": 0.06942729809827348, + "language_loss": 0.94984972, + "learning_rate": 0.0009287992243290175, + "loss": 0.96114612, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.39282227, + "step": 1023, + "time_per_iteration": 2.4477546215057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142342, + "balance_loss_mlp": 1.09880638, + "epoch": 0.19699884570988843, + "flos": 626421828096.0, + "grad_norm": 0.1017247644504036, + "language_loss": 0.91891634, + "learning_rate": 0.0009286389087101435, + "loss": 0.93033981, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.43554688, + "step": 1024, + "time_per_iteration": 2.765334129333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142412, + "balance_loss_mlp": 1.09942544, + "epoch": 0.197191227395152, + "flos": 557982577152.0, + "grad_norm": 0.07195718640229302, + "language_loss": 0.8893857, + "learning_rate": 0.0009284784266788864, + "loss": 0.90080982, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.42993164, + "step": 1025, + "time_per_iteration": 2.7323853969573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141995, + "balance_loss_mlp": 1.10327554, + "epoch": 0.19738360908041555, + "flos": 664993815552.0, + "grad_norm": 0.069193395974369, + "language_loss": 0.93259764, + "learning_rate": 0.0009283177782975512, + "loss": 0.94401753, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.38696289, + "step": 1026, + "time_per_iteration": 2.9729068279266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114551, + "balance_loss_mlp": 1.10142589, + "epoch": 0.1975759907656791, + "flos": 522496687104.0, + "grad_norm": 0.08755988500201482, + "language_loss": 0.88955659, + "learning_rate": 0.000928156963628507, + "loss": 0.90101171, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.44067383, + "step": 1027, + "time_per_iteration": 2.594200849533081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138947, + "balance_loss_mlp": 1.09855926, + "epoch": 0.19776837245094267, + "flos": 462482804736.0, + "grad_norm": 0.07316483198701504, + "language_loss": 0.89277303, + "learning_rate": 0.0009279959827341877, + "loss": 0.90416259, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.40405273, + "step": 1028, + "time_per_iteration": 2.7378368377685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140451, + "balance_loss_mlp": 1.09727335, + "epoch": 0.19796075413620623, + "flos": 503058719232.0, + "grad_norm": 0.059550544329949856, + "language_loss": 0.88526183, + "learning_rate": 0.0009278348356770915, + "loss": 0.89666629, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.43188477, + "step": 1029, + "time_per_iteration": 2.5737922191619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133825, + "balance_loss_mlp": 1.0914098, + "epoch": 0.1981531358214698, + "flos": 507538275840.0, + "grad_norm": 0.06393748023743129, + "language_loss": 0.8587814, + "learning_rate": 0.0009276735225197814, + "loss": 0.87011963, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.42431641, + "step": 1030, + "time_per_iteration": 2.648477077484131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146668, + "balance_loss_mlp": 1.10170269, + "epoch": 0.19834551750673335, + "flos": 531547204608.0, + "grad_norm": 0.06069855374703422, + "language_loss": 0.86812896, + "learning_rate": 0.0009275120433248847, + "loss": 0.87959564, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.44946289, + "step": 1031, + "time_per_iteration": 2.6862802505493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148453, + "balance_loss_mlp": 1.10327268, + "epoch": 0.1985378991919969, + "flos": 775511096832.0, + "grad_norm": 0.06482797348212818, + "language_loss": 0.87033594, + "learning_rate": 0.0009273503981550931, + "loss": 0.8818205, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.45166016, + "step": 1032, + "time_per_iteration": 3.0549416542053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157268, + "balance_loss_mlp": 1.11235023, + "epoch": 0.1987302808772605, + "flos": 434288355840.0, + "grad_norm": 0.07571303407420105, + "language_loss": 0.87661642, + "learning_rate": 0.0009271885870731626, + "loss": 0.88818914, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.44946289, + "step": 1033, + "time_per_iteration": 2.4938008785247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172373, + "balance_loss_mlp": 1.12495148, + "epoch": 0.19892266256252406, + "flos": 553604336640.0, + "grad_norm": 0.07801561202279184, + "language_loss": 0.89466584, + "learning_rate": 0.0009270266101419143, + "loss": 0.90638959, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.47460938, + "step": 1034, + "time_per_iteration": 2.61181378364563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169875, + "balance_loss_mlp": 1.12681675, + "epoch": 0.19911504424778761, + "flos": 549865926144.0, + "grad_norm": 0.07487269237991181, + "language_loss": 0.85762119, + "learning_rate": 0.0009268644674242328, + "loss": 0.86931992, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.43066406, + "step": 1035, + "time_per_iteration": 2.6761085987091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163027, + "balance_loss_mlp": 1.1147716, + "epoch": 0.19930742593305117, + "flos": 518281431552.0, + "grad_norm": 0.06997084642295975, + "language_loss": 0.81697071, + "learning_rate": 0.0009267021589830678, + "loss": 0.828601, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.4831543, + "step": 1036, + "time_per_iteration": 2.6166343688964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162737, + "balance_loss_mlp": 1.14547551, + "epoch": 0.19949980761831473, + "flos": 1509338769408.0, + "grad_norm": 0.04224955266067769, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.78789818, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 0.17285156, + "step": 1037, + "time_per_iteration": 4.932336330413818 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124804, + "balance_loss_mlp": 1.08224678, + "epoch": 0.1996921893035783, + "flos": 698129985024.0, + "grad_norm": 0.07370646472771722, + "language_loss": 0.9354341, + "learning_rate": 0.000926377045182406, + "loss": 0.94668216, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.42553711, + "step": 1038, + "time_per_iteration": 2.89486026763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122228, + "balance_loss_mlp": 1.07704759, + "epoch": 0.19988457098884185, + "flos": 727023734784.0, + "grad_norm": 0.06351485696264159, + "language_loss": 0.88915765, + "learning_rate": 0.0009262142399491296, + "loss": 0.9003799, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.4519043, + "step": 1039, + "time_per_iteration": 3.0843544006347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132553, + "balance_loss_mlp": 1.08784938, + "epoch": 0.2000769526741054, + "flos": 560544841728.0, + "grad_norm": 0.06429886269356283, + "language_loss": 0.89007306, + "learning_rate": 0.0009260512692448105, + "loss": 0.9013986, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.44677734, + "step": 1040, + "time_per_iteration": 2.7221181392669678 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 86214480, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2343431372144640.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/training_args.bin b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3b28f0633932ff84d8e0fde7beb2f9c59f0d04be --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b54b92ce31f27a60f5f91da41c22febbdc5fe6a9ac82c4d361c2b9dbc9096639 +size 7992 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/zero_to_fp32.py b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-1040/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/added_tokens.json b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/config.json b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/config.json new file mode 100644 index 0000000000000000000000000000000000000000..4477091c8e5e4d06ea14a8a918edb0ae2310c298 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_sigmoidgating", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/generation_config.json b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/global_step2080/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/global_step2080/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a28a3b3416cdede7f5dea7dec3534bd04615b27c --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/global_step2080/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa5d89c54cf5ed6c5b0d2da9e0b36998ad8f2c893892dce65e5fa599a5fa85bf +size 396582032 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/global_step2080/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/global_step2080/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ad462ef000310e799feeef0abd13b5228a296f3 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/global_step2080/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:459c0e0d1c529e4dbeb05dc0d0fda9104621a14a6f6b2cb0ea3a810d189b3ad5 +size 396582032 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/global_step2080/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/global_step2080/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a36b939c49cc7d556c5428a07f9d45c9bb819aa --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/global_step2080/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a5aae66db5a966c6ee0a0a98d491e3084b0674e65a5b77cc2fed727a3fcbe76 +size 396582032 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/global_step2080/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/global_step2080/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c699676d6c9d2477cfd82658711c620519ac0ac7 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/global_step2080/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ec6331fef1380ead70a7aa7aa1e6ca227cdfdd955413c7763c12bbef162c074 +size 396582032 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/global_step2080/zero_pp_rank_0_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/global_step2080/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b541fb483acb31767d3af6c8a86ce9ec407d340d --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/global_step2080/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48cf43f659100088bd00cc73cededfd61ff8b89ef3710025d20e2f7c37e8d474 +size 2117321544 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/global_step2080/zero_pp_rank_1_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/global_step2080/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..af41de876b85ee413f0a426cd0b8aa82c37808c7 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/global_step2080/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec86425c0eb48a83445cd111d801edcce15144bf6a9f1442b107b4962308ab74 +size 2117321544 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/global_step2080/zero_pp_rank_2_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/global_step2080/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..11f777a08ba4bc35ff13fa9d0f45f735b9e6a5c8 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/global_step2080/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba7ba9c0284c5325ac12d803ada342d4e4759eb6f35d5b59f24e30292bdbb87f +size 2117321544 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/global_step2080/zero_pp_rank_3_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/global_step2080/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a5c13200e494afbd9dad72a1cb1380a22ac4a3fb --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/global_step2080/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32d1ca1d05176a0b1d29b01276fd6af50e5d17336ab6261b3ad74f5c570b779b +size 2117321544 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/latest b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/latest new file mode 100644 index 0000000000000000000000000000000000000000..306b989cc55bbad3d1661dff0bcd6923a752cb0a --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/latest @@ -0,0 +1 @@ +global_step2080 \ No newline at end of file diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/model-00001-of-00002.safetensors b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/model-00002-of-00002.safetensors b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f0e19b8c388cf88fef93ef52ca6be9db572c902b --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b14925fe39eeaa9764d3b66d3a9cf62346c7ef526370fa53f34d27a04bf06ad +size 3759025152 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/model.safetensors.index.json b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..2b3448fcaafe26e098595b9e2e5bd9e68d63ee24 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/model.safetensors.index.json @@ -0,0 +1,672 @@ +{ + "metadata": { + "total_size": 8731424736 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/rng_state_0.pth b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef4849062bcdc8ffd2246c07673ba196a8d61a6d --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fae2114fffe9b1eea30e28bbdb4ce59046b0079ea5b8dc4682079f609d49d787 +size 14960 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/rng_state_1.pth b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fcb2b640bc236c26aa841680d34a91240247970 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ff5f3a53530ac868291e2667c8f824bfa1f4fa1ce880df8223a7165ef38e11 +size 14960 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/rng_state_2.pth b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..00c3f989de00e6d58ca7345ae6f65fee0afcbdcd --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f80a7779b0034e70106ba6cb0e3e686052334c20ce54453ee3977cc0219d15 +size 14960 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/rng_state_3.pth b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f289913854ee3fa52a86e282421da07d85b8a4c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece3bc0d0e16c43ef245cc787cbd0d63d08d460f489c4cd52adf6501b9281a18 +size 14960 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/special_tokens_map.json b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/tokenizer.model b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/tokenizer_config.json b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/trainer_state.json b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bb061a5eb9879ac640d21bdc0c8ffadf8fb02ea9 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/trainer_state.json @@ -0,0 +1,31233 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4001539053482108, + "eval_steps": 500, + "global_step": 2080, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03935784, + "balance_loss_mlp": 2.84935808, + "epoch": 0.00019238168526356292, + "flos": 470575609344.0, + "grad_norm": 13.498251331228948, + "language_loss": 2.81572914, + "learning_rate": 0.0, + "loss": 1.90346789, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 10.859375, + "step": 1, + "time_per_iteration": 24.30480647087097 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0351246, + "balance_loss_mlp": 2.65644169, + "epoch": 0.00038476337052712584, + "flos": 504556065792.0, + "grad_norm": 27.482987886380492, + "language_loss": 8.76816368, + "learning_rate": 0.00013726078121135892, + "loss": 8.80328846, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 8.578125, + "step": 2, + "time_per_iteration": 2.6929261684417725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03513305, + "balance_loss_mlp": 2.65728736, + "epoch": 0.0005771450557906887, + "flos": 599161245696.0, + "grad_norm": 28.576563245741852, + "language_loss": 9.00053596, + "learning_rate": 0.00021755319103969496, + "loss": 9.03566933, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 8.578125, + "step": 3, + "time_per_iteration": 2.7945075035095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03667009, + "balance_loss_mlp": 2.78657675, + "epoch": 0.0007695267410542517, + "flos": 580405326336.0, + "grad_norm": 15.694146018083416, + "language_loss": 2.74122858, + "learning_rate": 0.00027452156242271784, + "loss": 2.77789879, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 8.828125, + "step": 4, + "time_per_iteration": 2.7773804664611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03933422, + "balance_loss_mlp": 3.01102829, + "epoch": 0.0009619084263178145, + "flos": 486116204544.0, + "grad_norm": 3.505338851882968, + "language_loss": 1.83478093, + "learning_rate": 0.0003187096642208417, + "loss": 1.87411511, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 9.2109375, + "step": 5, + "time_per_iteration": 2.651094675064087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04005588, + "balance_loss_mlp": 3.05420256, + "epoch": 0.0011542901115813775, + "flos": 560028349440.0, + "grad_norm": 3.050600048840319, + "language_loss": 1.61776543, + "learning_rate": 0.0003548139722510539, + "loss": 1.65782118, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 9.4921875, + "step": 6, + "time_per_iteration": 2.697614908218384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03708502, + "balance_loss_mlp": 2.7708497, + "epoch": 0.0013466717968449403, + "flos": 533966307840.0, + "grad_norm": 0.7974788691124679, + "language_loss": 1.32417345, + "learning_rate": 0.00038533972973918044, + "loss": 1.36125851, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 9.3515625, + "step": 7, + "time_per_iteration": 2.6407949924468994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0332405, + "balance_loss_mlp": 2.38868618, + "epoch": 0.0015390534821085034, + "flos": 492295739904.0, + "grad_norm": 0.7144720842381633, + "language_loss": 1.25956392, + "learning_rate": 0.0004117823436340768, + "loss": 1.29280448, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 9.3359375, + "step": 8, + "time_per_iteration": 2.6287930011749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02785454, + "balance_loss_mlp": 1.8508532, + "epoch": 0.0017314351673720662, + "flos": 564676033536.0, + "grad_norm": 0.3140255221758466, + "language_loss": 1.29993415, + "learning_rate": 0.00043510638207938993, + "loss": 1.32778871, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 9.3203125, + "step": 9, + "time_per_iteration": 2.8048858642578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0244685, + "balance_loss_mlp": 1.50004196, + "epoch": 0.001923816852635629, + "flos": 593406798336.0, + "grad_norm": 0.19799802642524775, + "language_loss": 1.19032216, + "learning_rate": 0.00045597044543220066, + "loss": 1.2147907, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 9.4453125, + "step": 10, + "time_per_iteration": 2.7669434547424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02310187, + "balance_loss_mlp": 1.35117221, + "epoch": 0.002116198537899192, + "flos": 609625046016.0, + "grad_norm": 0.14485632700798082, + "language_loss": 1.18421102, + "learning_rate": 0.00047484428652143135, + "loss": 1.20731282, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 9.5703125, + "step": 11, + "time_per_iteration": 2.9067423343658447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02309394, + "balance_loss_mlp": 1.33740926, + "epoch": 0.002308580223162755, + "flos": 545129409024.0, + "grad_norm": 0.1366980934684776, + "language_loss": 1.24379897, + "learning_rate": 0.0004920747534624128, + "loss": 1.26689291, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 9.703125, + "step": 12, + "time_per_iteration": 2.612813949584961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.022984, + "balance_loss_mlp": 1.32565212, + "epoch": 0.002500961908426318, + "flos": 644750461440.0, + "grad_norm": 0.11957957623458634, + "language_loss": 1.26615512, + "learning_rate": 0.0005079252465375872, + "loss": 1.28913903, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 9.7109375, + "step": 13, + "time_per_iteration": 2.879688262939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02311662, + "balance_loss_mlp": 1.34730673, + "epoch": 0.0026933435936898806, + "flos": 487853259264.0, + "grad_norm": 0.10749127497061137, + "language_loss": 1.14448667, + "learning_rate": 0.0005226005109505393, + "loss": 1.16760325, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 9.625, + "step": 14, + "time_per_iteration": 2.568699836730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02285502, + "balance_loss_mlp": 1.3615818, + "epoch": 0.0028857252789534437, + "flos": 434599644672.0, + "grad_norm": 0.11405493545380829, + "language_loss": 1.20514369, + "learning_rate": 0.0005362628552605367, + "loss": 1.22799873, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 9.21875, + "step": 15, + "time_per_iteration": 2.6814210414886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02243131, + "balance_loss_mlp": 1.36117291, + "epoch": 0.0030781069642170067, + "flos": 596739944448.0, + "grad_norm": 0.10465613456634369, + "language_loss": 1.24307358, + "learning_rate": 0.0005490431248454357, + "loss": 1.26550484, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 8.84375, + "step": 16, + "time_per_iteration": 2.681443929672241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02323403, + "balance_loss_mlp": 1.52994621, + "epoch": 0.0032704886494805694, + "flos": 1538188102656.0, + "grad_norm": 0.2929644268686402, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.78028512, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 7.90625, + "step": 17, + "time_per_iteration": 6.376815319061279 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02154669, + "balance_loss_mlp": 1.37418151, + "epoch": 0.0034628703347441324, + "flos": 473969677824.0, + "grad_norm": 0.15081794947454089, + "language_loss": 1.11159086, + "learning_rate": 0.0005723671632907488, + "loss": 1.13313746, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 7.80078125, + "step": 18, + "time_per_iteration": 2.721731424331665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02067628, + "balance_loss_mlp": 1.35466075, + "epoch": 0.0036552520200076955, + "flos": 448537554432.0, + "grad_norm": 0.11430094844987627, + "language_loss": 1.15730095, + "learning_rate": 0.0005830738490244919, + "loss": 1.1779772, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 7.12890625, + "step": 19, + "time_per_iteration": 2.691012382507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01966178, + "balance_loss_mlp": 1.31958628, + "epoch": 0.003847633705271258, + "flos": 636174217728.0, + "grad_norm": 0.10166759343386816, + "language_loss": 1.17760253, + "learning_rate": 0.0005932312266435596, + "loss": 1.19726431, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 6.46484375, + "step": 20, + "time_per_iteration": 2.779218912124634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01836812, + "balance_loss_mlp": 1.26727819, + "epoch": 0.004040015390534821, + "flos": 589495491072.0, + "grad_norm": 0.12846528828878043, + "language_loss": 1.12106359, + "learning_rate": 0.0006028929207788754, + "loss": 1.13943172, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 5.70703125, + "step": 21, + "time_per_iteration": 2.716970443725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01720951, + "balance_loss_mlp": 1.21970022, + "epoch": 0.004232397075798384, + "flos": 756574940160.0, + "grad_norm": 0.09445288880840001, + "language_loss": 1.16516471, + "learning_rate": 0.0006121050677327902, + "loss": 1.18237424, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 5.0078125, + "step": 22, + "time_per_iteration": 2.92696475982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01630624, + "balance_loss_mlp": 1.19193399, + "epoch": 0.004424778761061947, + "flos": 526692119040.0, + "grad_norm": 0.11621712848760359, + "language_loss": 1.06380248, + "learning_rate": 0.0006209076479463684, + "loss": 1.08010876, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 4.3984375, + "step": 23, + "time_per_iteration": 2.666133165359497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01572853, + "balance_loss_mlp": 1.18394423, + "epoch": 0.00461716044632551, + "flos": 548168518656.0, + "grad_norm": 0.10970997088624258, + "language_loss": 1.16519284, + "learning_rate": 0.0006293355346737718, + "loss": 1.18092132, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 3.88476562, + "step": 24, + "time_per_iteration": 2.727487802505493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0152954, + "balance_loss_mlp": 1.18755198, + "epoch": 0.004809542131589073, + "flos": 567584091648.0, + "grad_norm": 0.09735665571869598, + "language_loss": 1.12784922, + "learning_rate": 0.0006374193284416834, + "loss": 1.14314473, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 3.42382812, + "step": 25, + "time_per_iteration": 2.7919249534606934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0148827, + "balance_loss_mlp": 1.19282198, + "epoch": 0.005001923816852636, + "flos": 470629191168.0, + "grad_norm": 0.09233879954989622, + "language_loss": 1.11062908, + "learning_rate": 0.0006451860277489461, + "loss": 1.12551177, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 2.953125, + "step": 26, + "time_per_iteration": 2.581066131591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462989, + "balance_loss_mlp": 1.20988345, + "epoch": 0.005194305502116198, + "flos": 415502701056.0, + "grad_norm": 0.12330238493557526, + "language_loss": 1.19441557, + "learning_rate": 0.0006526595731190848, + "loss": 1.20904553, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 2.52929688, + "step": 27, + "time_per_iteration": 2.49725604057312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01423898, + "balance_loss_mlp": 1.20874906, + "epoch": 0.005386687187379761, + "flos": 628771548672.0, + "grad_norm": 0.09841719698503415, + "language_loss": 1.12322927, + "learning_rate": 0.0006598612921618983, + "loss": 1.13746822, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 2.15625, + "step": 28, + "time_per_iteration": 2.822068929672241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01399446, + "balance_loss_mlp": 1.21443295, + "epoch": 0.005579068872643324, + "flos": 886483201536.0, + "grad_norm": 0.2589331093265968, + "language_loss": 1.06232262, + "learning_rate": 0.0006668102665011454, + "loss": 1.07631707, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 1.84765625, + "step": 29, + "time_per_iteration": 3.2402820587158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01444994, + "balance_loss_mlp": 1.28353739, + "epoch": 0.005771450557906887, + "flos": 547560622080.0, + "grad_norm": 0.1317361033328709, + "language_loss": 1.14859319, + "learning_rate": 0.0006735236364718957, + "loss": 1.16304302, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 1.61425781, + "step": 30, + "time_per_iteration": 2.6861231327056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01333301, + "balance_loss_mlp": 1.20445967, + "epoch": 0.00596383224317045, + "flos": 532026620928.0, + "grad_norm": 0.07039345614882069, + "language_loss": 1.13512135, + "learning_rate": 0.0006800168558381346, + "loss": 1.14845431, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 1.28808594, + "step": 31, + "time_per_iteration": 2.6444640159606934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01254242, + "balance_loss_mlp": 1.153772, + "epoch": 0.0061562139284340135, + "flos": 589082886144.0, + "grad_norm": 0.07602265872136475, + "language_loss": 1.1720531, + "learning_rate": 0.0006863039060567947, + "loss": 1.18459558, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 1.00439453, + "step": 32, + "time_per_iteration": 2.7225399017333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117915, + "balance_loss_mlp": 1.10071015, + "epoch": 0.006348595613697576, + "flos": 618231025152.0, + "grad_norm": 0.062098451262649575, + "language_loss": 1.09530759, + "learning_rate": 0.0006923974775611263, + "loss": 1.10709918, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 0.78417969, + "step": 33, + "time_per_iteration": 2.795565366744995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155392, + "balance_loss_mlp": 1.09416604, + "epoch": 0.006540977298961139, + "flos": 777910376448.0, + "grad_norm": 0.0750568617782567, + "language_loss": 1.06307364, + "learning_rate": 0.0006983091239737814, + "loss": 1.0746274, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 0.61132812, + "step": 34, + "time_per_iteration": 3.0703423023223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138238, + "balance_loss_mlp": 1.0903163, + "epoch": 0.006733358984224702, + "flos": 667143475200.0, + "grad_norm": 0.057198892540160154, + "language_loss": 1.05094206, + "learning_rate": 0.0007040493939600222, + "loss": 1.06232452, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 0.47949219, + "step": 35, + "time_per_iteration": 2.8476996421813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136821, + "balance_loss_mlp": 1.09926963, + "epoch": 0.006925740669488265, + "flos": 564372085248.0, + "grad_norm": 0.07105443011946577, + "language_loss": 1.05056715, + "learning_rate": 0.0007096279445021078, + "loss": 1.06193542, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 0.37548828, + "step": 36, + "time_per_iteration": 2.8306472301483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115252, + "balance_loss_mlp": 1.12274194, + "epoch": 0.007118122354751828, + "flos": 549887947776.0, + "grad_norm": 0.09366404592926651, + "language_loss": 1.11846077, + "learning_rate": 0.0007150536386503726, + "loss": 1.12998605, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 0.29736328, + "step": 37, + "time_per_iteration": 2.875190258026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150569, + "balance_loss_mlp": 1.12677491, + "epoch": 0.007310504040015391, + "flos": 702490973184.0, + "grad_norm": 0.0928332145488954, + "language_loss": 1.04548562, + "learning_rate": 0.0007203346302358509, + "loss": 1.05699134, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 0.23791504, + "step": 38, + "time_per_iteration": 3.0075292587280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128748, + "balance_loss_mlp": 1.10757613, + "epoch": 0.007502885725278953, + "flos": 599316890112.0, + "grad_norm": 0.056043607360260886, + "language_loss": 1.09224963, + "learning_rate": 0.000725478437577282, + "loss": 1.10353708, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 0.21179199, + "step": 39, + "time_per_iteration": 2.78564715385437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111769, + "balance_loss_mlp": 1.09953475, + "epoch": 0.007695267410542516, + "flos": 560285309952.0, + "grad_norm": 0.2122838817863008, + "language_loss": 1.04638147, + "learning_rate": 0.0007304920078549186, + "loss": 1.0575583, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 0.18151855, + "step": 40, + "time_per_iteration": 2.745100975036621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133734, + "balance_loss_mlp": 1.11621058, + "epoch": 0.007887649095806078, + "flos": 508170765312.0, + "grad_norm": 0.14528393981530327, + "language_loss": 1.06509054, + "learning_rate": 0.0007353817735343603, + "loss": 1.07642794, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 0.17529297, + "step": 41, + "time_per_iteration": 2.7425575256347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119894, + "balance_loss_mlp": 1.10357416, + "epoch": 0.008080030781069641, + "flos": 503893840896.0, + "grad_norm": 0.06769616325508275, + "language_loss": 1.0188365, + "learning_rate": 0.0007401537019902344, + "loss": 1.03003538, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 0.16308594, + "step": 42, + "time_per_iteration": 2.6797902584075928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118919, + "balance_loss_mlp": 1.10271883, + "epoch": 0.008272412466333205, + "flos": 518031811584.0, + "grad_norm": 0.14916902722339276, + "language_loss": 1.05194306, + "learning_rate": 0.0007448133392900729, + "loss": 1.06313229, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 0.1619873, + "step": 43, + "time_per_iteration": 2.779276132583618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116153, + "balance_loss_mlp": 1.09945166, + "epoch": 0.008464794151596768, + "flos": 607974626304.0, + "grad_norm": 0.052417895665492535, + "language_loss": 1.00651026, + "learning_rate": 0.0007493658489441491, + "loss": 1.0176717, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 0.16711426, + "step": 44, + "time_per_iteration": 2.965435028076172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108906, + "balance_loss_mlp": 1.09195447, + "epoch": 0.00865717583686033, + "flos": 537929372160.0, + "grad_norm": 0.04248825884697869, + "language_loss": 1.04600978, + "learning_rate": 0.0007538160463002316, + "loss": 1.05709875, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 0.16967773, + "step": 45, + "time_per_iteration": 2.7024173736572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105316, + "balance_loss_mlp": 1.08735132, + "epoch": 0.008849557522123894, + "flos": 508007780352.0, + "grad_norm": 0.08538228051147774, + "language_loss": 1.08093452, + "learning_rate": 0.0007581684291577274, + "loss": 1.09198785, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 0.17980957, + "step": 46, + "time_per_iteration": 2.6020169258117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105368, + "balance_loss_mlp": 1.08635402, + "epoch": 0.009041939207387457, + "flos": 625339657728.0, + "grad_norm": 0.04723509056908367, + "language_loss": 1.10695386, + "learning_rate": 0.0007624272050891776, + "loss": 1.11800754, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 0.19006348, + "step": 47, + "time_per_iteration": 2.8620407581329346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103189, + "balance_loss_mlp": 1.08244705, + "epoch": 0.00923432089265102, + "flos": 549421014528.0, + "grad_norm": 0.07235265954126073, + "language_loss": 1.00601125, + "learning_rate": 0.0007665963158851307, + "loss": 1.01704311, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 0.20751953, + "step": 48, + "time_per_iteration": 2.8312995433807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114938, + "balance_loss_mlp": 1.09308696, + "epoch": 0.009426702577914583, + "flos": 562496638464.0, + "grad_norm": 0.10505304652404167, + "language_loss": 1.09914839, + "learning_rate": 0.0007706794594783609, + "loss": 1.1102978, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 0.21850586, + "step": 49, + "time_per_iteration": 2.779561758041382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110011, + "balance_loss_mlp": 1.0874207, + "epoch": 0.009619084263178146, + "flos": 616773325824.0, + "grad_norm": 0.04709564792407722, + "language_loss": 1.08694363, + "learning_rate": 0.0007746801096530423, + "loss": 1.09804368, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 0.22583008, + "step": 50, + "time_per_iteration": 2.785332441329956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111628, + "balance_loss_mlp": 1.09285581, + "epoch": 0.009811465948441709, + "flos": 541437986304.0, + "grad_norm": 0.09574874491356838, + "language_loss": 1.13402438, + "learning_rate": 0.0007786015338021173, + "loss": 1.14518726, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 0.23425293, + "step": 51, + "time_per_iteration": 2.676326274871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118129, + "balance_loss_mlp": 1.09500206, + "epoch": 0.010003847633705272, + "flos": 535881028608.0, + "grad_norm": 0.12325193255180054, + "language_loss": 1.06019998, + "learning_rate": 0.0007824468089603051, + "loss": 1.07138121, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 0.23144531, + "step": 52, + "time_per_iteration": 2.688828945159912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113509, + "balance_loss_mlp": 1.11038983, + "epoch": 0.010196229318968833, + "flos": 909254315520.0, + "grad_norm": 0.07208467676878935, + "language_loss": 1.05329835, + "learning_rate": 0.0007862188363098669, + "loss": 1.06464922, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 0.24707031, + "step": 53, + "time_per_iteration": 3.3342933654785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126914, + "balance_loss_mlp": 1.10158229, + "epoch": 0.010388611004232396, + "flos": 585868308480.0, + "grad_norm": 0.09794855088059086, + "language_loss": 1.06043434, + "learning_rate": 0.0007899203543304438, + "loss": 1.07170355, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 0.25354004, + "step": 54, + "time_per_iteration": 2.933236837387085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145083, + "balance_loss_mlp": 1.12053776, + "epoch": 0.01058099268949596, + "flos": 502480558080.0, + "grad_norm": 0.1404118977896248, + "language_loss": 1.20000231, + "learning_rate": 0.0007935539507422731, + "loss": 1.2114532, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 0.24536133, + "step": 55, + "time_per_iteration": 2.8257975578308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153246, + "balance_loss_mlp": 1.12969017, + "epoch": 0.010773374374759523, + "flos": 544447360512.0, + "grad_norm": 0.05382700946372506, + "language_loss": 1.10560298, + "learning_rate": 0.0007971220733732573, + "loss": 1.11713552, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 0.2355957, + "step": 56, + "time_per_iteration": 2.749382495880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154989, + "balance_loss_mlp": 1.13151693, + "epoch": 0.010965756060023086, + "flos": 526155803136.0, + "grad_norm": 0.17392462927294325, + "language_loss": 1.05995011, + "learning_rate": 0.0008006270400641869, + "loss": 1.07150006, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 0.23474121, + "step": 57, + "time_per_iteration": 2.743929147720337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125904, + "balance_loss_mlp": 1.10234821, + "epoch": 0.011158137745286649, + "flos": 576941128704.0, + "grad_norm": 0.10169017538987117, + "language_loss": 1.06833839, + "learning_rate": 0.0008040710477125043, + "loss": 1.07959747, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 0.23547363, + "step": 58, + "time_per_iteration": 2.7300469875335693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111797, + "balance_loss_mlp": 1.08861065, + "epoch": 0.011350519430550212, + "flos": 529281547776.0, + "grad_norm": 0.059941584643697095, + "language_loss": 1.07409072, + "learning_rate": 0.0008074561805429771, + "loss": 1.08520865, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 0.23181152, + "step": 59, + "time_per_iteration": 2.6550745964050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123772, + "balance_loss_mlp": 1.09970331, + "epoch": 0.011542901115813775, + "flos": 555879905280.0, + "grad_norm": 0.06438674129900752, + "language_loss": 1.04891515, + "learning_rate": 0.0008107844176832545, + "loss": 1.06015277, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 0.24072266, + "step": 60, + "time_per_iteration": 2.7009053230285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139619, + "balance_loss_mlp": 1.11569333, + "epoch": 0.011735282801077338, + "flos": 572095954944.0, + "grad_norm": 0.09833112160800331, + "language_loss": 1.0671711, + "learning_rate": 0.0008140576401132568, + "loss": 1.07856739, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 0.23913574, + "step": 61, + "time_per_iteration": 2.678501844406128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114169, + "balance_loss_mlp": 1.11887348, + "epoch": 0.0119276644863409, + "flos": 615589839360.0, + "grad_norm": 0.11014501355567002, + "language_loss": 1.07748628, + "learning_rate": 0.0008172776370494935, + "loss": 1.08890319, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 0.22814941, + "step": 62, + "time_per_iteration": 2.7718141078948975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116479, + "balance_loss_mlp": 1.09356666, + "epoch": 0.012120046171604464, + "flos": 501084527616.0, + "grad_norm": 0.06441650429015075, + "language_loss": 1.15269816, + "learning_rate": 0.0008204461118185703, + "loss": 1.16386294, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 0.22912598, + "step": 63, + "time_per_iteration": 2.5839178562164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117138, + "balance_loss_mlp": 1.09543014, + "epoch": 0.012312427856868027, + "flos": 473347100160.0, + "grad_norm": 0.06608006175674933, + "language_loss": 1.04523873, + "learning_rate": 0.0008235646872681536, + "loss": 1.05641007, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 0.21728516, + "step": 64, + "time_per_iteration": 2.5611703395843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127906, + "balance_loss_mlp": 1.10659182, + "epoch": 0.012504809542131588, + "flos": 538369141248.0, + "grad_norm": 0.07834673611922068, + "language_loss": 1.04319417, + "learning_rate": 0.0008266349107584288, + "loss": 1.05447328, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 0.21313477, + "step": 65, + "time_per_iteration": 2.727666139602661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141841, + "balance_loss_mlp": 1.1207881, + "epoch": 0.012697191227395151, + "flos": 608730826752.0, + "grad_norm": 0.06003338375813584, + "language_loss": 1.07126927, + "learning_rate": 0.0008296582587724851, + "loss": 1.08268762, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 0.21057129, + "step": 66, + "time_per_iteration": 2.716701030731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127113, + "balance_loss_mlp": 1.10609627, + "epoch": 0.012889572912658714, + "flos": 768079065600.0, + "grad_norm": 0.04807876202194694, + "language_loss": 1.04662776, + "learning_rate": 0.0008326361411800136, + "loss": 1.05789876, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 0.21008301, + "step": 67, + "time_per_iteration": 2.9571592807769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114337, + "balance_loss_mlp": 1.09446514, + "epoch": 0.013081954597922277, + "flos": 533887013376.0, + "grad_norm": 0.05551510449528945, + "language_loss": 1.05008268, + "learning_rate": 0.0008355699051851403, + "loss": 1.06122601, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 0.1986084, + "step": 68, + "time_per_iteration": 2.725504159927368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143621, + "balance_loss_mlp": 1.1242373, + "epoch": 0.01327433628318584, + "flos": 573096632832.0, + "grad_norm": 0.0697970629442659, + "language_loss": 1.12296045, + "learning_rate": 0.0008384608389860635, + "loss": 1.13439655, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 0.19372559, + "step": 69, + "time_per_iteration": 2.685215711593628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141098, + "balance_loss_mlp": 1.122311, + "epoch": 0.013466717968449404, + "flos": 497274536448.0, + "grad_norm": 0.08511613263061502, + "language_loss": 1.02745342, + "learning_rate": 0.000841310175171381, + "loss": 1.03886437, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 0.18774414, + "step": 70, + "time_per_iteration": 2.649937868118286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142561, + "balance_loss_mlp": 1.12464356, + "epoch": 0.013659099653712967, + "flos": 565511155200.0, + "grad_norm": 0.055787325190813475, + "language_loss": 1.0065217, + "learning_rate": 0.000844119093875517, + "loss": 1.0179472, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 0.17944336, + "step": 71, + "time_per_iteration": 2.753220319747925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152267, + "balance_loss_mlp": 1.13508892, + "epoch": 0.01385148133897653, + "flos": 573820526592.0, + "grad_norm": 0.08668312915327946, + "language_loss": 1.05463254, + "learning_rate": 0.0008468887257134666, + "loss": 1.0661552, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 0.17199707, + "step": 72, + "time_per_iteration": 2.7056305408477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117134, + "balance_loss_mlp": 1.15478206, + "epoch": 0.014043863024240093, + "flos": 576822560256.0, + "grad_norm": 0.07356095482564125, + "language_loss": 1.08388793, + "learning_rate": 0.0008496201545131264, + "loss": 1.09560132, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 0.16564941, + "step": 73, + "time_per_iteration": 2.7202537059783936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152437, + "balance_loss_mlp": 1.13545001, + "epoch": 0.014236244709503656, + "flos": 938681809920.0, + "grad_norm": 0.06787935984484554, + "language_loss": 1.06090975, + "learning_rate": 0.0008523144198617317, + "loss": 1.07243395, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 0.16992188, + "step": 74, + "time_per_iteration": 3.2090003490448 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139286, + "balance_loss_mlp": 1.1223346, + "epoch": 0.014428626394767219, + "flos": 528483502080.0, + "grad_norm": 0.04825332815792917, + "language_loss": 1.053195, + "learning_rate": 0.0008549725194813783, + "loss": 1.06458783, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 0.16967773, + "step": 75, + "time_per_iteration": 2.654343605041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112208, + "balance_loss_mlp": 1.10599899, + "epoch": 0.014621008080030782, + "flos": 803752533504.0, + "grad_norm": 0.03887402020767282, + "language_loss": 1.04797029, + "learning_rate": 0.0008575954114472099, + "loss": 1.05919111, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 0.1607666, + "step": 76, + "time_per_iteration": 3.119884967803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134479, + "balance_loss_mlp": 1.1187191, + "epoch": 0.014813389765294343, + "flos": 696941356032.0, + "grad_norm": 0.056937643991546806, + "language_loss": 1.02038705, + "learning_rate": 0.0008601840162606118, + "loss": 1.03173184, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 0.1574707, + "step": 77, + "time_per_iteration": 3.025688886642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146725, + "balance_loss_mlp": 1.13034582, + "epoch": 0.015005771450557906, + "flos": 596994333696.0, + "grad_norm": 0.04989291514363055, + "language_loss": 1.08127129, + "learning_rate": 0.000862739218788641, + "loss": 1.09273863, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 0.16381836, + "step": 78, + "time_per_iteration": 2.7922520637512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149681, + "balance_loss_mlp": 1.13339734, + "epoch": 0.01519815313582147, + "flos": 549416245248.0, + "grad_norm": 0.06709094188277621, + "language_loss": 1.06189477, + "learning_rate": 0.0008652618700799138, + "loss": 1.07339156, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 0.1628418, + "step": 79, + "time_per_iteration": 2.6902618408203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153255, + "balance_loss_mlp": 1.1367681, + "epoch": 0.015390534821085032, + "flos": 430532692992.0, + "grad_norm": 0.062162504049989416, + "language_loss": 1.05161238, + "learning_rate": 0.0008677527890662774, + "loss": 1.06314492, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 0.16491699, + "step": 80, + "time_per_iteration": 2.475771188735962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147193, + "balance_loss_mlp": 1.13076603, + "epoch": 0.015582916506348595, + "flos": 524119942656.0, + "grad_norm": 0.04934081686369646, + "language_loss": 1.06529951, + "learning_rate": 0.0008702127641587799, + "loss": 1.0767715, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 0.16430664, + "step": 81, + "time_per_iteration": 2.634038209915161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142434, + "balance_loss_mlp": 1.12558985, + "epoch": 0.015775298191612157, + "flos": 575443782144.0, + "grad_norm": 0.08879987127008451, + "language_loss": 1.0221808, + "learning_rate": 0.0008726425547457192, + "loss": 1.0336051, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 0.16845703, + "step": 82, + "time_per_iteration": 2.74308705329895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147568, + "balance_loss_mlp": 1.13108134, + "epoch": 0.01596767987687572, + "flos": 610319577600.0, + "grad_norm": 0.06313420095488197, + "language_loss": 1.01906681, + "learning_rate": 0.0008750428925998964, + "loss": 1.03054249, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 0.16491699, + "step": 83, + "time_per_iteration": 2.777132511138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146759, + "balance_loss_mlp": 1.13009322, + "epoch": 0.016160061562139283, + "flos": 567136982016.0, + "grad_norm": 0.11663644047392754, + "language_loss": 1.07169831, + "learning_rate": 0.0008774144832015932, + "loss": 1.08316588, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 0.16674805, + "step": 84, + "time_per_iteration": 2.733287811279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01524523, + "balance_loss_mlp": 1.51412809, + "epoch": 0.016352443247402846, + "flos": 1411343543808.0, + "grad_norm": 0.22860236459315994, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.76298833, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 0.10400391, + "step": 85, + "time_per_iteration": 4.57580041885376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166169, + "balance_loss_mlp": 1.1501826, + "epoch": 0.01654482493266641, + "flos": 730497844224.0, + "grad_norm": 0.05249425037579876, + "language_loss": 1.01959693, + "learning_rate": 0.0008820741205014318, + "loss": 1.03125858, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 0.15979004, + "step": 86, + "time_per_iteration": 2.8773865699768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223619, + "balance_loss_mlp": 1.20703709, + "epoch": 0.016737206617929972, + "flos": 536293633536.0, + "grad_norm": 0.10761462625124436, + "language_loss": 1.03955913, + "learning_rate": 0.0008843634575408404, + "loss": 1.05179524, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 0.16577148, + "step": 87, + "time_per_iteration": 2.6694159507751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228231, + "balance_loss_mlp": 1.21267366, + "epoch": 0.016929588303193535, + "flos": 536990363136.0, + "grad_norm": 0.10737104518045529, + "language_loss": 1.05078888, + "learning_rate": 0.0008866266301555082, + "loss": 1.06307125, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 0.15551758, + "step": 88, + "time_per_iteration": 2.7686069011688232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212138, + "balance_loss_mlp": 1.19609249, + "epoch": 0.017121969988457098, + "flos": 526756359168.0, + "grad_norm": 0.1616084590878673, + "language_loss": 1.0609467, + "learning_rate": 0.0008888642296509615, + "loss": 1.07306814, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 0.16040039, + "step": 89, + "time_per_iteration": 2.625988721847534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199649, + "balance_loss_mlp": 1.18316197, + "epoch": 0.01731435167372066, + "flos": 625596618240.0, + "grad_norm": 0.07585409016808545, + "language_loss": 1.1065979, + "learning_rate": 0.0008910768275115906, + "loss": 1.11859453, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 0.16491699, + "step": 90, + "time_per_iteration": 2.793017864227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173524, + "balance_loss_mlp": 1.15697813, + "epoch": 0.017506733358984224, + "flos": 496402338816.0, + "grad_norm": 0.07277460951060387, + "language_loss": 1.06493175, + "learning_rate": 0.0008932649762767675, + "loss": 1.07666695, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 0.16552734, + "step": 91, + "time_per_iteration": 2.5919723510742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169355, + "balance_loss_mlp": 1.15323818, + "epoch": 0.017699115044247787, + "flos": 745933100544.0, + "grad_norm": 0.10172519854243242, + "language_loss": 1.09112859, + "learning_rate": 0.0008954292103690864, + "loss": 1.10282218, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 0.16113281, + "step": 92, + "time_per_iteration": 2.9366836547851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174542, + "balance_loss_mlp": 1.15828145, + "epoch": 0.01789149672951135, + "flos": 515509194240.0, + "grad_norm": 0.07803491111319032, + "language_loss": 1.10981905, + "learning_rate": 0.0008975700468778296, + "loss": 1.12156439, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 0.16259766, + "step": 93, + "time_per_iteration": 2.592458963394165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156862, + "balance_loss_mlp": 1.14067388, + "epoch": 0.018083878414774913, + "flos": 586125268992.0, + "grad_norm": 0.09102852745954727, + "language_loss": 1.04703569, + "learning_rate": 0.0008996879863005366, + "loss": 1.05860424, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 0.16186523, + "step": 94, + "time_per_iteration": 2.71566104888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148536, + "balance_loss_mlp": 1.13235974, + "epoch": 0.018276260100038477, + "flos": 497356028928.0, + "grad_norm": 0.03859462796979438, + "language_loss": 1.04768109, + "learning_rate": 0.0009017835132453337, + "loss": 1.05916631, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 0.16174316, + "step": 95, + "time_per_iteration": 2.664511203765869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137887, + "balance_loss_mlp": 1.121889, + "epoch": 0.01846864178530204, + "flos": 640058360832.0, + "grad_norm": 0.060963703759419355, + "language_loss": 1.04675508, + "learning_rate": 0.0009038570970964896, + "loss": 1.05813384, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 0.15991211, + "step": 96, + "time_per_iteration": 2.7669789791107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112506, + "balance_loss_mlp": 1.10899043, + "epoch": 0.018661023470565603, + "flos": 511662127104.0, + "grad_norm": 0.0943042692373462, + "language_loss": 1.02071011, + "learning_rate": 0.0009059091926454854, + "loss": 1.03196073, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 0.16064453, + "step": 97, + "time_per_iteration": 2.6028668880462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126665, + "balance_loss_mlp": 1.11052442, + "epoch": 0.018853405155829166, + "flos": 931106244096.0, + "grad_norm": 0.06745462513624549, + "language_loss": 1.0144124, + "learning_rate": 0.0009079402406897198, + "loss": 1.02567911, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 0.16137695, + "step": 98, + "time_per_iteration": 3.2679431438446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127975, + "balance_loss_mlp": 1.11166739, + "epoch": 0.01904578684109273, + "flos": 576484107264.0, + "grad_norm": 0.10523687850003575, + "language_loss": 1.03251696, + "learning_rate": 0.0009099506686008212, + "loss": 1.04379678, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 0.16308594, + "step": 99, + "time_per_iteration": 2.8251914978027344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116643, + "balance_loss_mlp": 1.10100293, + "epoch": 0.019238168526356292, + "flos": 558442169856.0, + "grad_norm": 0.08495157768411668, + "language_loss": 1.0609076, + "learning_rate": 0.0009119408908644013, + "loss": 1.07207406, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 0.15625, + "step": 100, + "time_per_iteration": 2.6573309898376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113727, + "balance_loss_mlp": 1.12211871, + "epoch": 0.019430550211619855, + "flos": 723851375616.0, + "grad_norm": 0.09022378013673595, + "language_loss": 1.11755276, + "learning_rate": 0.0009139113095929519, + "loss": 1.12892556, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 0.15124512, + "step": 101, + "time_per_iteration": 2.844698429107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159661, + "balance_loss_mlp": 1.14373517, + "epoch": 0.019622931896883418, + "flos": 499478524416.0, + "grad_norm": 0.0892612752622512, + "language_loss": 1.05698013, + "learning_rate": 0.0009158623150134762, + "loss": 1.06857681, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 0.15917969, + "step": 102, + "time_per_iteration": 2.589857339859009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137313, + "balance_loss_mlp": 1.12158906, + "epoch": 0.01981531358214698, + "flos": 509188695552.0, + "grad_norm": 0.06508497546963277, + "language_loss": 1.05496848, + "learning_rate": 0.000917794285931332, + "loss": 1.06634164, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 0.15710449, + "step": 103, + "time_per_iteration": 2.6433918476104736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117696, + "balance_loss_mlp": 1.1019367, + "epoch": 0.020007695267410544, + "flos": 521347705344.0, + "grad_norm": 0.07675487095909958, + "language_loss": 0.97610366, + "learning_rate": 0.0009197075901716639, + "loss": 0.98728061, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 0.1574707, + "step": 104, + "time_per_iteration": 2.709157943725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137693, + "balance_loss_mlp": 1.12159956, + "epoch": 0.020200076952674107, + "flos": 533298940416.0, + "grad_norm": 0.05257934075389246, + "language_loss": 1.0758431, + "learning_rate": 0.0009216025849997171, + "loss": 1.08722019, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 0.16088867, + "step": 105, + "time_per_iteration": 2.7638583183288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111903, + "balance_loss_mlp": 1.09596467, + "epoch": 0.020392458637937667, + "flos": 684760324608.0, + "grad_norm": 0.07457888312135433, + "language_loss": 1.02261579, + "learning_rate": 0.0009234796175212258, + "loss": 1.03373492, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 0.15930176, + "step": 106, + "time_per_iteration": 2.9391980171203613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117989, + "balance_loss_mlp": 1.10228872, + "epoch": 0.02058484032320123, + "flos": 702115444224.0, + "grad_norm": 0.06024423434996524, + "language_loss": 1.05948544, + "learning_rate": 0.000925339025064007, + "loss": 1.07066536, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 0.15686035, + "step": 107, + "time_per_iteration": 2.975294828414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118819, + "balance_loss_mlp": 1.10334611, + "epoch": 0.020777222008464793, + "flos": 639082275840.0, + "grad_norm": 0.07105297051955457, + "language_loss": 0.99294066, + "learning_rate": 0.0009271811355418027, + "loss": 1.00412893, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 0.15454102, + "step": 108, + "time_per_iteration": 2.8750014305114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125269, + "balance_loss_mlp": 1.10940242, + "epoch": 0.020969603693728356, + "flos": 682091974656.0, + "grad_norm": 0.09212378946406244, + "language_loss": 1.05636311, + "learning_rate": 0.0009290062678013548, + "loss": 1.06761575, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 0.15856934, + "step": 109, + "time_per_iteration": 2.8552017211914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119898, + "balance_loss_mlp": 1.10393572, + "epoch": 0.02116198537899192, + "flos": 533395487232.0, + "grad_norm": 0.059465971869905314, + "language_loss": 1.04477715, + "learning_rate": 0.0009308147319536321, + "loss": 1.05597615, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 0.1595459, + "step": 110, + "time_per_iteration": 2.6493232250213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129754, + "balance_loss_mlp": 1.11385095, + "epoch": 0.021354367064255482, + "flos": 717479119872.0, + "grad_norm": 0.08324280754141193, + "language_loss": 1.10257316, + "learning_rate": 0.0009326068296900676, + "loss": 1.11387074, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 0.15893555, + "step": 111, + "time_per_iteration": 2.8384125232696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112769, + "balance_loss_mlp": 1.11171615, + "epoch": 0.021546748749519045, + "flos": 519556322304.0, + "grad_norm": 0.06941460102767082, + "language_loss": 1.01355243, + "learning_rate": 0.0009343828545846161, + "loss": 1.02482939, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 0.15966797, + "step": 112, + "time_per_iteration": 2.7743477821350098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114893, + "balance_loss_mlp": 1.13326573, + "epoch": 0.021739130434782608, + "flos": 505161391104.0, + "grad_norm": 0.047977415311889204, + "language_loss": 1.05199587, + "learning_rate": 0.0009361430923823841, + "loss": 1.06348515, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 0.15649414, + "step": 113, + "time_per_iteration": 2.6022982597351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118143, + "balance_loss_mlp": 1.10308659, + "epoch": 0.02193151212004617, + "flos": 463486053888.0, + "grad_norm": 0.080001842017843, + "language_loss": 1.09258401, + "learning_rate": 0.0009378878212755459, + "loss": 1.10376549, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 0.15039062, + "step": 114, + "time_per_iteration": 2.491594076156616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115419, + "balance_loss_mlp": 1.09967113, + "epoch": 0.022123893805309734, + "flos": 552272546304.0, + "grad_norm": 0.05036418666557463, + "language_loss": 0.9906168, + "learning_rate": 0.0009396173121672103, + "loss": 1.00177097, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 0.15734863, + "step": 115, + "time_per_iteration": 2.668848991394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112479, + "balance_loss_mlp": 1.10945916, + "epoch": 0.022316275490573297, + "flos": 636211293696.0, + "grad_norm": 0.05918191636932359, + "language_loss": 1.04414749, + "learning_rate": 0.0009413318289238633, + "loss": 1.05539548, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 0.15307617, + "step": 116, + "time_per_iteration": 2.7496132850646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106823, + "balance_loss_mlp": 1.09139705, + "epoch": 0.02250865717583686, + "flos": 798890107392.0, + "grad_norm": 0.1124204963758038, + "language_loss": 0.96924931, + "learning_rate": 0.0009430316286169771, + "loss": 0.98031747, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 0.15405273, + "step": 117, + "time_per_iteration": 3.026118278503418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135302, + "balance_loss_mlp": 1.11998308, + "epoch": 0.022701038861100423, + "flos": 456093296640.0, + "grad_norm": 0.03693994945601898, + "language_loss": 1.02417183, + "learning_rate": 0.0009447169617543361, + "loss": 1.03552485, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 0.15307617, + "step": 118, + "time_per_iteration": 2.575666666030884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156925, + "balance_loss_mlp": 1.14185703, + "epoch": 0.022893420546363986, + "flos": 583086159360.0, + "grad_norm": 0.10959367855453626, + "language_loss": 1.09001684, + "learning_rate": 0.0009463880725016029, + "loss": 1.1015861, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 0.15039062, + "step": 119, + "time_per_iteration": 2.6811347007751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115422, + "balance_loss_mlp": 1.10052109, + "epoch": 0.02308580223162755, + "flos": 561303240192.0, + "grad_norm": 0.05068852434870314, + "language_loss": 1.03909945, + "learning_rate": 0.0009480451988946134, + "loss": 1.05025363, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 0.14880371, + "step": 120, + "time_per_iteration": 2.801814079284668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106982, + "balance_loss_mlp": 1.09179425, + "epoch": 0.023278183916891113, + "flos": 771300983808.0, + "grad_norm": 0.05688398470992871, + "language_loss": 1.05377555, + "learning_rate": 0.0009496885730428627, + "loss": 1.06484532, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 0.1517334, + "step": 121, + "time_per_iteration": 3.04720139503479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121469, + "balance_loss_mlp": 1.10574555, + "epoch": 0.023470565602154676, + "flos": 553374540288.0, + "grad_norm": 0.08369646841136469, + "language_loss": 1.03908122, + "learning_rate": 0.0009513184213246156, + "loss": 1.05029583, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 0.15710449, + "step": 122, + "time_per_iteration": 2.61790132522583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129626, + "balance_loss_mlp": 1.11406958, + "epoch": 0.02366294728741824, + "flos": 560028349440.0, + "grad_norm": 0.05522871343558165, + "language_loss": 1.07008672, + "learning_rate": 0.0009529349645740552, + "loss": 1.08138299, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 0.15539551, + "step": 123, + "time_per_iteration": 2.69759464263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129797, + "balance_loss_mlp": 1.11481285, + "epoch": 0.0238553289726818, + "flos": 468553683456.0, + "grad_norm": 0.053769267634074955, + "language_loss": 1.05687594, + "learning_rate": 0.0009545384182608524, + "loss": 1.06817389, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 0.1496582, + "step": 124, + "time_per_iteration": 2.550584316253662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126, + "balance_loss_mlp": 1.11114669, + "epoch": 0.024047710657945365, + "flos": 560030920704.0, + "grad_norm": 0.08700167249890467, + "language_loss": 1.02945745, + "learning_rate": 0.0009561289926625252, + "loss": 1.04071736, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 0.14831543, + "step": 125, + "time_per_iteration": 2.6619794368743896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123522, + "balance_loss_mlp": 1.10831082, + "epoch": 0.024240092343208928, + "flos": 504775950336.0, + "grad_norm": 0.07114777459455598, + "language_loss": 1.07932711, + "learning_rate": 0.0009577068930299292, + "loss": 1.09056234, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 0.15209961, + "step": 126, + "time_per_iteration": 2.553642749786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125226, + "balance_loss_mlp": 1.11038458, + "epoch": 0.02443247402847249, + "flos": 435763307520.0, + "grad_norm": 0.08279894264625885, + "language_loss": 1.03556633, + "learning_rate": 0.0009592723197462087, + "loss": 1.04681861, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 0.14819336, + "step": 127, + "time_per_iteration": 2.7255966663360596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124545, + "balance_loss_mlp": 1.10936916, + "epoch": 0.024624855713736054, + "flos": 683769558528.0, + "grad_norm": 0.07600858050716931, + "language_loss": 0.99905002, + "learning_rate": 0.0009608254684795125, + "loss": 1.01029539, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 0.15148926, + "step": 128, + "time_per_iteration": 2.9839587211608887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113247, + "balance_loss_mlp": 1.11718702, + "epoch": 0.024817237398999614, + "flos": 524999480832.0, + "grad_norm": 0.08573045125619827, + "language_loss": 1.02976727, + "learning_rate": 0.0009623665303297678, + "loss": 1.04109192, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 0.15258789, + "step": 129, + "time_per_iteration": 2.7344865798950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119398, + "balance_loss_mlp": 1.10497391, + "epoch": 0.025009619084263177, + "flos": 655656602112.0, + "grad_norm": 0.07510500588649292, + "language_loss": 1.07057762, + "learning_rate": 0.0009638956919697878, + "loss": 1.08177161, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 0.14416504, + "step": 130, + "time_per_iteration": 2.864952802658081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104135, + "balance_loss_mlp": 1.08930528, + "epoch": 0.02520200076952674, + "flos": 454423053312.0, + "grad_norm": 0.0567118244953117, + "language_loss": 0.99135083, + "learning_rate": 0.0009654131357809714, + "loss": 1.00239229, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 0.14819336, + "step": 131, + "time_per_iteration": 2.6095099449157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123713, + "balance_loss_mlp": 1.1081202, + "epoch": 0.025394382454790303, + "flos": 839794563072.0, + "grad_norm": 0.05892082702998288, + "language_loss": 1.08188879, + "learning_rate": 0.0009669190399838441, + "loss": 1.09312594, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 0.15576172, + "step": 132, + "time_per_iteration": 3.096733331680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100803, + "balance_loss_mlp": 1.08531809, + "epoch": 0.025586764140053866, + "flos": 581025332736.0, + "grad_norm": 0.09564892115109941, + "language_loss": 1.01233923, + "learning_rate": 0.0009684135787636724, + "loss": 1.02334726, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 0.15478516, + "step": 133, + "time_per_iteration": 2.8120856285095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111325, + "balance_loss_mlp": 1.09529161, + "epoch": 0.02577914582531743, + "flos": 790249623552.0, + "grad_norm": 0.04870542745948935, + "language_loss": 1.05797207, + "learning_rate": 0.0009698969223913726, + "loss": 1.06908536, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 0.16027832, + "step": 134, + "time_per_iteration": 3.0269176959991455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123378, + "balance_loss_mlp": 1.10735679, + "epoch": 0.025971527510580992, + "flos": 594958473216.0, + "grad_norm": 0.04083122637660085, + "language_loss": 1.08225274, + "learning_rate": 0.0009713692373399265, + "loss": 1.09348655, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 0.16015625, + "step": 135, + "time_per_iteration": 2.690932273864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01798361, + "balance_loss_mlp": 1.75773478, + "epoch": 0.026163909195844555, + "flos": 1577629716480.0, + "grad_norm": 0.2058674005568875, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.8125459, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 0.40625, + "step": 136, + "time_per_iteration": 5.460411548614502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01507549, + "balance_loss_mlp": 1.47512448, + "epoch": 0.026356290881108118, + "flos": 1502074865664.0, + "grad_norm": 0.12866590611947104, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.79318589, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 0.32421875, + "step": 137, + "time_per_iteration": 4.989046335220337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146765, + "balance_loss_mlp": 1.13081443, + "epoch": 0.02654867256637168, + "flos": 597140066304.0, + "grad_norm": 0.04917093034878699, + "language_loss": 1.00934815, + "learning_rate": 0.0009757216201974225, + "loss": 1.02081585, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 0.1595459, + "step": 138, + "time_per_iteration": 2.9566736221313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162305, + "balance_loss_mlp": 1.1448524, + "epoch": 0.026741054251635244, + "flos": 545035433472.0, + "grad_norm": 0.06281235859244827, + "language_loss": 1.0596863, + "learning_rate": 0.0009771514130396581, + "loss": 1.07130933, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 0.17468262, + "step": 139, + "time_per_iteration": 2.683931350708008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150087, + "balance_loss_mlp": 1.1330874, + "epoch": 0.026933435936898807, + "flos": 506841546240.0, + "grad_norm": 0.09254080332591261, + "language_loss": 1.06202602, + "learning_rate": 0.00097857095638274, + "loss": 1.07352686, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 0.17016602, + "step": 140, + "time_per_iteration": 2.558708906173706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149417, + "balance_loss_mlp": 1.13241768, + "epoch": 0.02712581762216237, + "flos": 740860328448.0, + "grad_norm": 0.03864103733020509, + "language_loss": 0.97399604, + "learning_rate": 0.0009799803961288726, + "loss": 0.9854902, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 0.17016602, + "step": 141, + "time_per_iteration": 2.992034673690796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112347, + "balance_loss_mlp": 1.10685217, + "epoch": 0.027318199307425933, + "flos": 848373378048.0, + "grad_norm": 0.06378420241673269, + "language_loss": 1.03629804, + "learning_rate": 0.000981379875086876, + "loss": 1.0475328, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 0.16625977, + "step": 142, + "time_per_iteration": 3.063534736633301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121821, + "balance_loss_mlp": 1.10560894, + "epoch": 0.027510580992689496, + "flos": 575557581312.0, + "grad_norm": 0.046520134554953796, + "language_loss": 0.98784387, + "learning_rate": 0.0009827695330590185, + "loss": 0.99906206, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 0.1619873, + "step": 143, + "time_per_iteration": 2.6495330333709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124398, + "balance_loss_mlp": 1.1078757, + "epoch": 0.02770296267795306, + "flos": 772420230144.0, + "grad_norm": 0.05485832849515215, + "language_loss": 0.98036379, + "learning_rate": 0.0009841495069248256, + "loss": 0.99160779, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 0.1652832, + "step": 144, + "time_per_iteration": 2.9577834606170654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145392, + "balance_loss_mlp": 1.12901306, + "epoch": 0.027895344363216622, + "flos": 569387957760.0, + "grad_norm": 0.09798795242100523, + "language_loss": 0.97478735, + "learning_rate": 0.0009855199307219871, + "loss": 0.98624128, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 0.16381836, + "step": 145, + "time_per_iteration": 2.6759142875671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148365, + "balance_loss_mlp": 1.13168764, + "epoch": 0.028087726048480186, + "flos": 547360561152.0, + "grad_norm": 0.1254453322996171, + "language_loss": 0.99733889, + "learning_rate": 0.0009868809357244854, + "loss": 1.00882256, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 0.16687012, + "step": 146, + "time_per_iteration": 2.66375994682312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113683, + "balance_loss_mlp": 1.11978364, + "epoch": 0.02828010773374375, + "flos": 524789508096.0, + "grad_norm": 0.08248071954181796, + "language_loss": 1.03600287, + "learning_rate": 0.0009882326505180556, + "loss": 1.04737115, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 0.1706543, + "step": 147, + "time_per_iteration": 2.719353437423706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151222, + "balance_loss_mlp": 1.13280392, + "epoch": 0.02847248941900731, + "flos": 772440053760.0, + "grad_norm": 0.12761243433758393, + "language_loss": 1.02101135, + "learning_rate": 0.0009895752010730906, + "loss": 1.03252351, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 0.1842041, + "step": 148, + "time_per_iteration": 2.9704201221466064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141454, + "balance_loss_mlp": 1.12377512, + "epoch": 0.028664871104270875, + "flos": 534413417472.0, + "grad_norm": 0.07962775403881484, + "language_loss": 1.0825479, + "learning_rate": 0.0009909087108150867, + "loss": 1.09396255, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 0.17687988, + "step": 149, + "time_per_iteration": 2.7516071796417236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151554, + "balance_loss_mlp": 1.13330352, + "epoch": 0.028857252789534438, + "flos": 367766396928.0, + "grad_norm": 0.10196194967952074, + "language_loss": 1.09083438, + "learning_rate": 0.0009922333006927371, + "loss": 1.10235, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 0.18249512, + "step": 150, + "time_per_iteration": 2.4685099124908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170515, + "balance_loss_mlp": 1.15218103, + "epoch": 0.029049634474798, + "flos": 515482030080.0, + "grad_norm": 0.13259475383105176, + "language_loss": 1.020684, + "learning_rate": 0.0009935490892437632, + "loss": 1.03238916, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 0.18322754, + "step": 151, + "time_per_iteration": 2.5665087699890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166904, + "balance_loss_mlp": 1.14880824, + "epoch": 0.029242016160061564, + "flos": 588141305856.0, + "grad_norm": 0.10481585745820837, + "language_loss": 1.00390673, + "learning_rate": 0.0009948561926585687, + "loss": 1.01557577, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 0.18103027, + "step": 152, + "time_per_iteration": 2.7641003131866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139325, + "balance_loss_mlp": 1.122576, + "epoch": 0.029434397845325123, + "flos": 552079825920.0, + "grad_norm": 0.09697971136145118, + "language_loss": 1.05073512, + "learning_rate": 0.0009961547248418122, + "loss": 1.06212831, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 0.16760254, + "step": 153, + "time_per_iteration": 2.631476402282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123418, + "balance_loss_mlp": 1.10662186, + "epoch": 0.029626779530588686, + "flos": 603497640960.0, + "grad_norm": 0.05437877185758658, + "language_loss": 1.01441622, + "learning_rate": 0.0009974447974719707, + "loss": 1.0256505, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 0.16809082, + "step": 154, + "time_per_iteration": 2.709644317626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129323, + "balance_loss_mlp": 1.11151338, + "epoch": 0.02981916121585225, + "flos": 621089897472.0, + "grad_norm": 0.09703401576709127, + "language_loss": 1.03478801, + "learning_rate": 0.0009987265200589763, + "loss": 1.0460813, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 0.17810059, + "step": 155, + "time_per_iteration": 2.77809739112854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140894, + "balance_loss_mlp": 1.12376344, + "epoch": 0.030011542901115813, + "flos": 661633505280.0, + "grad_norm": 0.08300490544518559, + "language_loss": 1.02959824, + "learning_rate": 0.001, + "loss": 1.04100728, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 0.17150879, + "step": 156, + "time_per_iteration": 2.845790386199951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144802, + "balance_loss_mlp": 1.12720668, + "epoch": 0.030203924586379376, + "flos": 651569826816.0, + "grad_norm": 0.07590676388764007, + "language_loss": 1.00599122, + "learning_rate": 0.0009999999029413921, + "loss": 1.01743913, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 0.17614746, + "step": 157, + "time_per_iteration": 2.833735227584839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142594, + "balance_loss_mlp": 1.12554669, + "epoch": 0.03039630627164294, + "flos": 531354484224.0, + "grad_norm": 0.06607639809804342, + "language_loss": 1.01453137, + "learning_rate": 0.0009999996117656068, + "loss": 1.02595735, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 0.1706543, + "step": 158, + "time_per_iteration": 2.803636074066162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011301, + "balance_loss_mlp": 1.11345792, + "epoch": 0.030588687956906502, + "flos": 586189509120.0, + "grad_norm": 0.08769352458743468, + "language_loss": 0.94982773, + "learning_rate": 0.0009999991264727564, + "loss": 0.96112871, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 0.16638184, + "step": 159, + "time_per_iteration": 2.7776851654052734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135859, + "balance_loss_mlp": 1.11870432, + "epoch": 0.030781069642170065, + "flos": 513278042112.0, + "grad_norm": 0.05788098803643346, + "language_loss": 1.06247735, + "learning_rate": 0.0009999984470630296, + "loss": 1.07383585, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 0.17163086, + "step": 160, + "time_per_iteration": 2.6311371326446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125321, + "balance_loss_mlp": 1.10836911, + "epoch": 0.030973451327433628, + "flos": 718123719168.0, + "grad_norm": 0.05159431076001957, + "language_loss": 0.94850963, + "learning_rate": 0.0009999975735366902, + "loss": 0.95976287, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 0.16955566, + "step": 161, + "time_per_iteration": 3.0904829502105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148114, + "balance_loss_mlp": 1.13099504, + "epoch": 0.03116583301269719, + "flos": 1109771311104.0, + "grad_norm": 0.0692270455282635, + "language_loss": 0.96706492, + "learning_rate": 0.0009999965058940775, + "loss": 0.97854608, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 0.17138672, + "step": 162, + "time_per_iteration": 3.490063428878784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150632, + "balance_loss_mlp": 1.13323975, + "epoch": 0.031358214697960754, + "flos": 450907098624.0, + "grad_norm": 0.08572766411177644, + "language_loss": 1.03267431, + "learning_rate": 0.0009999952441356057, + "loss": 1.04418063, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 0.17382812, + "step": 163, + "time_per_iteration": 2.497690439224243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130614, + "balance_loss_mlp": 1.11405563, + "epoch": 0.031550596383224314, + "flos": 1255176870912.0, + "grad_norm": 0.05784293330097489, + "language_loss": 1.03805065, + "learning_rate": 0.000999993788261765, + "loss": 1.0493567, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 0.16564941, + "step": 164, + "time_per_iteration": 3.6041390895843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132475, + "balance_loss_mlp": 1.1152972, + "epoch": 0.03174297806848788, + "flos": 668136812544.0, + "grad_norm": 0.05766532368121917, + "language_loss": 1.05311596, + "learning_rate": 0.00099999213827312, + "loss": 1.06444073, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 0.171875, + "step": 165, + "time_per_iteration": 2.806014060974121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142431, + "balance_loss_mlp": 1.12589669, + "epoch": 0.03193535975375144, + "flos": 551299032576.0, + "grad_norm": 0.05992608893057494, + "language_loss": 1.00112009, + "learning_rate": 0.000999990294170312, + "loss": 1.01254439, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 0.16540527, + "step": 166, + "time_per_iteration": 2.6405951976776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113035, + "balance_loss_mlp": 1.11351717, + "epoch": 0.032127741439015006, + "flos": 543649314816.0, + "grad_norm": 0.05363857392651908, + "language_loss": 1.03767109, + "learning_rate": 0.0009999882559540566, + "loss": 1.04897451, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 0.16845703, + "step": 167, + "time_per_iteration": 2.69801664352417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127606, + "balance_loss_mlp": 1.11079764, + "epoch": 0.032320123124278566, + "flos": 548385831936.0, + "grad_norm": 0.03971308084427602, + "language_loss": 1.00767386, + "learning_rate": 0.000999986023625145, + "loss": 1.01894999, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 0.16821289, + "step": 168, + "time_per_iteration": 2.710706949234009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04227602, + "balance_loss_mlp": 3.93005633, + "epoch": 0.03251250480954213, + "flos": 1305886551552.0, + "grad_norm": 0.49669676383753814, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.8315202, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 2.96875, + "step": 169, + "time_per_iteration": 4.921034574508667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178384, + "balance_loss_mlp": 1.15987098, + "epoch": 0.03270488649480569, + "flos": 561132914688.0, + "grad_norm": 0.11256254520903143, + "language_loss": 1.01289928, + "learning_rate": 0.0009999809766328958, + "loss": 1.02468312, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 0.18518066, + "step": 170, + "time_per_iteration": 2.6784250736236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236994, + "balance_loss_mlp": 1.21676469, + "epoch": 0.03289726818006926, + "flos": 482363112960.0, + "grad_norm": 0.13219145589868983, + "language_loss": 1.0357101, + "learning_rate": 0.0009999781619715177, + "loss": 1.04807997, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 0.20227051, + "step": 171, + "time_per_iteration": 2.5412755012512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234758, + "balance_loss_mlp": 1.21518433, + "epoch": 0.03308964986533282, + "flos": 674647460352.0, + "grad_norm": 0.05193788120122226, + "language_loss": 1.03408492, + "learning_rate": 0.000999975153201402, + "loss": 1.0464325, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 0.19567871, + "step": 172, + "time_per_iteration": 2.864586353302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236351, + "balance_loss_mlp": 1.21688426, + "epoch": 0.033282031550596385, + "flos": 609217583616.0, + "grad_norm": 0.0814546252210238, + "language_loss": 1.01345742, + "learning_rate": 0.0009999719503237174, + "loss": 1.02582097, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 0.19470215, + "step": 173, + "time_per_iteration": 2.765923261642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266228, + "balance_loss_mlp": 1.24583161, + "epoch": 0.033474413235859944, + "flos": 468039762432.0, + "grad_norm": 0.11494520888694326, + "language_loss": 1.10141742, + "learning_rate": 0.0009999685533397073, + "loss": 1.11407971, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 0.20410156, + "step": 174, + "time_per_iteration": 2.5439114570617676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264706, + "balance_loss_mlp": 1.24525094, + "epoch": 0.03366679492112351, + "flos": 579634444800.0, + "grad_norm": 0.12313705571337571, + "language_loss": 1.01947784, + "learning_rate": 0.00099996496225069, + "loss": 1.03212488, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 0.19445801, + "step": 175, + "time_per_iteration": 2.6815552711486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01257561, + "balance_loss_mlp": 1.23677111, + "epoch": 0.03385917660638707, + "flos": 637678904832.0, + "grad_norm": 0.07888015485072913, + "language_loss": 1.04929149, + "learning_rate": 0.0009999611770580604, + "loss": 1.06186724, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 0.20788574, + "step": 176, + "time_per_iteration": 2.841484785079956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258013, + "balance_loss_mlp": 1.23668683, + "epoch": 0.03405155829165064, + "flos": 441816933888.0, + "grad_norm": 0.1202186920466195, + "language_loss": 1.03394961, + "learning_rate": 0.0009999571977632876, + "loss": 1.04652977, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 0.21350098, + "step": 177, + "time_per_iteration": 2.567788600921631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01271496, + "balance_loss_mlp": 1.25026441, + "epoch": 0.034243939976914196, + "flos": 466332443136.0, + "grad_norm": 0.09201820914192435, + "language_loss": 1.05765235, + "learning_rate": 0.0009999530243679166, + "loss": 1.07036722, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 0.21240234, + "step": 178, + "time_per_iteration": 2.5753743648529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258548, + "balance_loss_mlp": 1.23935485, + "epoch": 0.03443632166217776, + "flos": 779276671488.0, + "grad_norm": 0.06529189645852858, + "language_loss": 1.00495052, + "learning_rate": 0.0009999486568735675, + "loss": 1.01753592, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 0.19177246, + "step": 179, + "time_per_iteration": 3.0607473850250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251876, + "balance_loss_mlp": 1.23275518, + "epoch": 0.03462870334744132, + "flos": 1263777707520.0, + "grad_norm": 0.07628849485304477, + "language_loss": 1.00889277, + "learning_rate": 0.0009999440952819362, + "loss": 1.02141166, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 0.19116211, + "step": 180, + "time_per_iteration": 3.6515376567840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01248658, + "balance_loss_mlp": 1.22853494, + "epoch": 0.03482108503270489, + "flos": 607179151872.0, + "grad_norm": 0.05983966318213213, + "language_loss": 1.0115366, + "learning_rate": 0.0009999393395947935, + "loss": 1.02402306, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 0.2010498, + "step": 181, + "time_per_iteration": 2.799633502960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01253433, + "balance_loss_mlp": 1.23378766, + "epoch": 0.03501346671796845, + "flos": 538270396416.0, + "grad_norm": 0.0770350968764605, + "language_loss": 1.04747987, + "learning_rate": 0.0009999343898139858, + "loss": 1.06001413, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 0.19641113, + "step": 182, + "time_per_iteration": 2.627434253692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258891, + "balance_loss_mlp": 1.23675334, + "epoch": 0.035205848403232015, + "flos": 518484063744.0, + "grad_norm": 0.06485795323962908, + "language_loss": 1.03381288, + "learning_rate": 0.0009999292459414348, + "loss": 1.04640174, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 0.22131348, + "step": 183, + "time_per_iteration": 2.5552356243133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227697, + "balance_loss_mlp": 1.20765769, + "epoch": 0.035398230088495575, + "flos": 472373586432.0, + "grad_norm": 0.06837915158031915, + "language_loss": 1.07873201, + "learning_rate": 0.0009999239079791374, + "loss": 1.0910089, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 0.20031738, + "step": 184, + "time_per_iteration": 2.5553643703460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225953, + "balance_loss_mlp": 1.20453107, + "epoch": 0.03559061177375914, + "flos": 512074732032.0, + "grad_norm": 0.05538225102727573, + "language_loss": 1.00595856, + "learning_rate": 0.0009999183759291659, + "loss": 1.01821804, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 0.21435547, + "step": 185, + "time_per_iteration": 2.6955769062042236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199054, + "balance_loss_mlp": 1.17938447, + "epoch": 0.0357829934590227, + "flos": 477386887680.0, + "grad_norm": 0.052094207769016576, + "language_loss": 1.02581143, + "learning_rate": 0.0009999126497936682, + "loss": 1.03780198, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 0.1965332, + "step": 186, + "time_per_iteration": 2.5304598808288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198293, + "balance_loss_mlp": 1.1770494, + "epoch": 0.03597537514428627, + "flos": 644656485888.0, + "grad_norm": 0.057723222775786294, + "language_loss": 1.05774581, + "learning_rate": 0.0009999067295748676, + "loss": 1.06972873, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 0.21252441, + "step": 187, + "time_per_iteration": 2.797293186187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225876, + "balance_loss_mlp": 1.20496714, + "epoch": 0.03616775682954983, + "flos": 581186119680.0, + "grad_norm": 0.0756096280824464, + "language_loss": 1.03738201, + "learning_rate": 0.000999900615275062, + "loss": 1.04964077, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 0.20922852, + "step": 188, + "time_per_iteration": 2.677471399307251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211466, + "balance_loss_mlp": 1.18979406, + "epoch": 0.03636013851481339, + "flos": 382420859904.0, + "grad_norm": 0.0898221855427691, + "language_loss": 1.09605587, + "learning_rate": 0.0009998943068966256, + "loss": 1.10817051, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 0.21679688, + "step": 189, + "time_per_iteration": 2.4233202934265137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217638, + "balance_loss_mlp": 1.19651425, + "epoch": 0.03655252020007695, + "flos": 583224551424.0, + "grad_norm": 0.10338446511893212, + "language_loss": 1.03747463, + "learning_rate": 0.0009998878044420072, + "loss": 1.04965115, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 0.21130371, + "step": 190, + "time_per_iteration": 2.6978025436401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177731, + "balance_loss_mlp": 1.15573716, + "epoch": 0.03674490188534051, + "flos": 471619957248.0, + "grad_norm": 0.06881722524262912, + "language_loss": 0.99768066, + "learning_rate": 0.0009998811079137318, + "loss": 1.00945807, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 0.22009277, + "step": 191, + "time_per_iteration": 2.5934321880340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114348, + "balance_loss_mlp": 1.12218916, + "epoch": 0.03693728357060408, + "flos": 528372274176.0, + "grad_norm": 0.0852793637050772, + "language_loss": 1.0086391, + "learning_rate": 0.0009998742173143987, + "loss": 1.02007401, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 0.2130127, + "step": 192, + "time_per_iteration": 2.6706249713897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139307, + "balance_loss_mlp": 1.1180048, + "epoch": 0.03712966525586764, + "flos": 798993994752.0, + "grad_norm": 0.07456835679934387, + "language_loss": 1.01398337, + "learning_rate": 0.0009998671326466833, + "loss": 1.02537644, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 0.21313477, + "step": 193, + "time_per_iteration": 2.992595672607422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126772, + "balance_loss_mlp": 1.10519516, + "epoch": 0.037322046941131205, + "flos": 829973164032.0, + "grad_norm": 0.08171257283174432, + "language_loss": 1.02813613, + "learning_rate": 0.0009998598539133362, + "loss": 1.03940392, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 0.21594238, + "step": 194, + "time_per_iteration": 3.0081543922424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113199, + "balance_loss_mlp": 1.11179638, + "epoch": 0.037514428626394765, + "flos": 437685742080.0, + "grad_norm": 0.05573112518601677, + "language_loss": 1.02892375, + "learning_rate": 0.0009998523811171828, + "loss": 1.04024363, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 0.2019043, + "step": 195, + "time_per_iteration": 2.507708787918091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149122, + "balance_loss_mlp": 1.12843966, + "epoch": 0.03770681031165833, + "flos": 511625051136.0, + "grad_norm": 0.0935188115694547, + "language_loss": 1.0387187, + "learning_rate": 0.0009998447142611248, + "loss": 1.05020976, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 0.20690918, + "step": 196, + "time_per_iteration": 2.6388566493988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160139, + "balance_loss_mlp": 1.13986123, + "epoch": 0.03789919199692189, + "flos": 807449098752.0, + "grad_norm": 0.047444937864230444, + "language_loss": 0.96302813, + "learning_rate": 0.0009998368533481387, + "loss": 0.97462952, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 0.20275879, + "step": 197, + "time_per_iteration": 3.033572196960449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132809, + "balance_loss_mlp": 1.11254394, + "epoch": 0.03809157368218546, + "flos": 690576814080.0, + "grad_norm": 0.08710369828361038, + "language_loss": 0.9995833, + "learning_rate": 0.0009998287983812762, + "loss": 1.01091146, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 0.20263672, + "step": 198, + "time_per_iteration": 2.8421950340270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155397, + "balance_loss_mlp": 1.13373709, + "epoch": 0.03828395536744902, + "flos": 517940407296.0, + "grad_norm": 0.10277508525357126, + "language_loss": 1.05776644, + "learning_rate": 0.0009998205493636646, + "loss": 1.06932044, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 0.2166748, + "step": 199, + "time_per_iteration": 2.6924569606781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141939, + "balance_loss_mlp": 1.12035084, + "epoch": 0.038476337052712584, + "flos": 581662964736.0, + "grad_norm": 0.09429923895154278, + "language_loss": 0.98451054, + "learning_rate": 0.0009998121062985063, + "loss": 0.99592984, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 0.21594238, + "step": 200, + "time_per_iteration": 2.6926732063293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171328, + "balance_loss_mlp": 1.15014482, + "epoch": 0.03866871873797614, + "flos": 577086861312.0, + "grad_norm": 0.08332681767957313, + "language_loss": 1.00419915, + "learning_rate": 0.0009998034691890794, + "loss": 1.01591253, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 0.21203613, + "step": 201, + "time_per_iteration": 2.7643332481384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165409, + "balance_loss_mlp": 1.14516699, + "epoch": 0.03886110042323971, + "flos": 540731344896.0, + "grad_norm": 0.11326578301102472, + "language_loss": 1.05536067, + "learning_rate": 0.0009997946380387369, + "loss": 1.06701469, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 0.20251465, + "step": 202, + "time_per_iteration": 2.630284070968628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157571, + "balance_loss_mlp": 1.13723421, + "epoch": 0.03905348210850327, + "flos": 718002952704.0, + "grad_norm": 0.09790094078320352, + "language_loss": 1.07388449, + "learning_rate": 0.0009997856128509076, + "loss": 1.08546019, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 0.20336914, + "step": 203, + "time_per_iteration": 2.8435540199279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144349, + "balance_loss_mlp": 1.12458408, + "epoch": 0.039245863793766836, + "flos": 427493583360.0, + "grad_norm": 0.1356659453961297, + "language_loss": 1.02559984, + "learning_rate": 0.0009997763936290952, + "loss": 1.03704333, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 0.19750977, + "step": 204, + "time_per_iteration": 2.503309965133667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138207, + "balance_loss_mlp": 1.11642766, + "epoch": 0.039438245479030395, + "flos": 663096347136.0, + "grad_norm": 0.053010676996176516, + "language_loss": 1.07603145, + "learning_rate": 0.0009997669803768789, + "loss": 1.08741355, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 0.21789551, + "step": 205, + "time_per_iteration": 2.7773749828338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011146, + "balance_loss_mlp": 1.09366679, + "epoch": 0.03963062716429396, + "flos": 635349007872.0, + "grad_norm": 0.07785432610828748, + "language_loss": 1.0289582, + "learning_rate": 0.0009997573730979134, + "loss": 1.04010415, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 0.20947266, + "step": 206, + "time_per_iteration": 2.7241222858428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04720912, + "balance_loss_mlp": 3.71993518, + "epoch": 0.03982300884955752, + "flos": 1418565975552.0, + "grad_norm": 0.31672297251450016, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.83914113, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 10.0, + "step": 207, + "time_per_iteration": 4.65311074256897 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160001, + "balance_loss_mlp": 1.13651657, + "epoch": 0.04001539053482109, + "flos": 689118741504.0, + "grad_norm": 0.09244016287770654, + "language_loss": 1.01599813, + "learning_rate": 0.0009997375764747294, + "loss": 1.02759814, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 0.23449707, + "step": 208, + "time_per_iteration": 2.999249219894409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144547, + "balance_loss_mlp": 1.12159967, + "epoch": 0.04020777222008465, + "flos": 533639964672.0, + "grad_norm": 0.10768555369795524, + "language_loss": 0.98886019, + "learning_rate": 0.0009997273871381967, + "loss": 1.00030565, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 0.22949219, + "step": 209, + "time_per_iteration": 2.740895986557007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154635, + "balance_loss_mlp": 1.13075733, + "epoch": 0.040400153905348214, + "flos": 567927687168.0, + "grad_norm": 0.0670178022721504, + "language_loss": 1.03911638, + "learning_rate": 0.0009997170037902862, + "loss": 1.05066276, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 0.23876953, + "step": 210, + "time_per_iteration": 2.7199809551239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161677, + "balance_loss_mlp": 1.13826418, + "epoch": 0.040592535590611774, + "flos": 713439332352.0, + "grad_norm": 0.062356382061819024, + "language_loss": 1.06535935, + "learning_rate": 0.0009997064264350292, + "loss": 1.07697606, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 0.23413086, + "step": 211, + "time_per_iteration": 2.85477614402771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164794, + "balance_loss_mlp": 1.14111865, + "epoch": 0.04078491727587533, + "flos": 578100022272.0, + "grad_norm": 0.11782714892356931, + "language_loss": 1.00570273, + "learning_rate": 0.0009996956550765317, + "loss": 1.01735067, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 0.23657227, + "step": 212, + "time_per_iteration": 2.683258295059204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178964, + "balance_loss_mlp": 1.15452623, + "epoch": 0.0409772989611389, + "flos": 552299710464.0, + "grad_norm": 0.07352585681220185, + "language_loss": 0.95357072, + "learning_rate": 0.0009996846897189762, + "loss": 0.9653604, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 0.24438477, + "step": 213, + "time_per_iteration": 2.64486026763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171157, + "balance_loss_mlp": 1.14665973, + "epoch": 0.04116968064640246, + "flos": 555630285312.0, + "grad_norm": 0.06101080420793073, + "language_loss": 1.01569629, + "learning_rate": 0.0009996735303666193, + "loss": 1.02740788, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 0.24499512, + "step": 214, + "time_per_iteration": 2.719754934310913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189275, + "balance_loss_mlp": 1.16434813, + "epoch": 0.041362062331666026, + "flos": 578492803584.0, + "grad_norm": 0.09805160088916984, + "language_loss": 1.03784573, + "learning_rate": 0.0009996621770237937, + "loss": 1.04973853, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 0.24938965, + "step": 215, + "time_per_iteration": 2.7283520698547363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202725, + "balance_loss_mlp": 1.17728579, + "epoch": 0.041554444016929586, + "flos": 611443593216.0, + "grad_norm": 0.05858333324383458, + "language_loss": 0.99328029, + "learning_rate": 0.0009996506296949073, + "loss": 1.00530756, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 0.25463867, + "step": 216, + "time_per_iteration": 2.8774044513702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175124, + "balance_loss_mlp": 1.14957714, + "epoch": 0.04174682570219315, + "flos": 528115313664.0, + "grad_norm": 0.09898600739692984, + "language_loss": 0.99386859, + "learning_rate": 0.0009996388883844428, + "loss": 1.00561976, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 0.25561523, + "step": 217, + "time_per_iteration": 2.5985324382781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155134, + "balance_loss_mlp": 1.13007665, + "epoch": 0.04193920738745671, + "flos": 511506482688.0, + "grad_norm": 0.06208913439552352, + "language_loss": 1.03500867, + "learning_rate": 0.0009996269530969588, + "loss": 1.04656017, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 0.25048828, + "step": 218, + "time_per_iteration": 2.591993808746338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152332, + "balance_loss_mlp": 1.12778735, + "epoch": 0.04213158907272028, + "flos": 571490629632.0, + "grad_norm": 0.08789931910276294, + "language_loss": 1.02762055, + "learning_rate": 0.0009996148238370888, + "loss": 1.0391438, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 0.24536133, + "step": 219, + "time_per_iteration": 2.7247660160064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146753, + "balance_loss_mlp": 1.12125421, + "epoch": 0.04232397075798384, + "flos": 964222589952.0, + "grad_norm": 0.059765696203788965, + "language_loss": 0.98427057, + "learning_rate": 0.0009996025006095421, + "loss": 0.99573809, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 0.25524902, + "step": 220, + "time_per_iteration": 3.314250946044922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04012538, + "balance_loss_mlp": 3.61886096, + "epoch": 0.042516352443247404, + "flos": 1469595778560.0, + "grad_norm": 0.18322335632445477, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.81795681, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 3.921875, + "step": 221, + "time_per_iteration": 5.397853851318359 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142792, + "balance_loss_mlp": 1.11779404, + "epoch": 0.042708734128510964, + "flos": 654712823808.0, + "grad_norm": 0.10045289138425088, + "language_loss": 0.98726314, + "learning_rate": 0.0009995772722706307, + "loss": 0.99869102, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 0.25, + "step": 222, + "time_per_iteration": 2.8346786499023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168149, + "balance_loss_mlp": 1.14130318, + "epoch": 0.04290111581377453, + "flos": 431827407360.0, + "grad_norm": 0.07395583213906755, + "language_loss": 1.12709904, + "learning_rate": 0.0009995643671690604, + "loss": 1.13878047, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 0.26879883, + "step": 223, + "time_per_iteration": 2.4760169982910156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157966, + "balance_loss_mlp": 1.1317513, + "epoch": 0.04309349749903809, + "flos": 644676309504.0, + "grad_norm": 0.08239055528326475, + "language_loss": 1.00208497, + "learning_rate": 0.0009995512681194023, + "loss": 1.01366448, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 0.26257324, + "step": 224, + "time_per_iteration": 2.833751916885376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151672, + "balance_loss_mlp": 1.12492132, + "epoch": 0.04328587918430166, + "flos": 831267505152.0, + "grad_norm": 0.058356102807926864, + "language_loss": 0.97854793, + "learning_rate": 0.0009995379751267417, + "loss": 0.99006462, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 0.2677002, + "step": 225, + "time_per_iteration": 3.295761823654175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182559, + "balance_loss_mlp": 1.1551652, + "epoch": 0.043478260869565216, + "flos": 525066292224.0, + "grad_norm": 0.09032086206875983, + "language_loss": 0.99067688, + "learning_rate": 0.0009995244881962398, + "loss": 1.00250244, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 0.27416992, + "step": 226, + "time_per_iteration": 2.6147754192352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162924, + "balance_loss_mlp": 1.1352675, + "epoch": 0.04367064255482878, + "flos": 439484465664.0, + "grad_norm": 0.05273235380658081, + "language_loss": 1.00220668, + "learning_rate": 0.0009995108073331323, + "loss": 1.01383591, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 0.27661133, + "step": 227, + "time_per_iteration": 2.575477361679077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165107, + "balance_loss_mlp": 1.13835633, + "epoch": 0.04386302424009234, + "flos": 507380060160.0, + "grad_norm": 0.07222661628022838, + "language_loss": 1.03328192, + "learning_rate": 0.0009994969325427309, + "loss": 1.04493296, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 0.26733398, + "step": 228, + "time_per_iteration": 2.7351901531219482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159475, + "balance_loss_mlp": 1.13215184, + "epoch": 0.04405540592535591, + "flos": 540694268928.0, + "grad_norm": 0.05690950477809338, + "language_loss": 0.99788582, + "learning_rate": 0.0009994828638304218, + "loss": 1.0094806, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 0.2734375, + "step": 229, + "time_per_iteration": 2.6617660522460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160216, + "balance_loss_mlp": 1.13327467, + "epoch": 0.04424778761061947, + "flos": 446370642432.0, + "grad_norm": 0.0671245201901001, + "language_loss": 1.05080867, + "learning_rate": 0.0009994686012016675, + "loss": 1.06241083, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 0.26953125, + "step": 230, + "time_per_iteration": 2.5507686138153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200075, + "balance_loss_mlp": 1.17368245, + "epoch": 0.044440169295883035, + "flos": 700702161408.0, + "grad_norm": 0.08083200993131012, + "language_loss": 1.04836714, + "learning_rate": 0.000999454144662005, + "loss": 1.06036782, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 0.26416016, + "step": 231, + "time_per_iteration": 2.872386932373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177085, + "balance_loss_mlp": 1.15090632, + "epoch": 0.044632550981146595, + "flos": 588329256960.0, + "grad_norm": 0.06521500069668446, + "language_loss": 0.98697901, + "learning_rate": 0.0009994394942170468, + "loss": 0.99874985, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 0.26208496, + "step": 232, + "time_per_iteration": 2.6734542846679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160129, + "balance_loss_mlp": 1.13452244, + "epoch": 0.04482493266641016, + "flos": 554797734912.0, + "grad_norm": 0.06848368332912834, + "language_loss": 0.96340638, + "learning_rate": 0.0009994246498724808, + "loss": 0.97500765, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 0.25598145, + "step": 233, + "time_per_iteration": 2.735145330429077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169344, + "balance_loss_mlp": 1.14341569, + "epoch": 0.04501731435167372, + "flos": 722813621760.0, + "grad_norm": 0.09664881582101635, + "language_loss": 0.99309772, + "learning_rate": 0.00099940961163407, + "loss": 1.00479114, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 0.25964355, + "step": 234, + "time_per_iteration": 2.8988683223724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142226, + "balance_loss_mlp": 1.11722803, + "epoch": 0.04520969603693728, + "flos": 511790607360.0, + "grad_norm": 0.06003753756121682, + "language_loss": 1.01686716, + "learning_rate": 0.0009993943795076528, + "loss": 1.02828944, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 0.25012207, + "step": 235, + "time_per_iteration": 2.6333067417144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132836, + "balance_loss_mlp": 1.10618043, + "epoch": 0.04540207772220085, + "flos": 365058399744.0, + "grad_norm": 0.08170413586498586, + "language_loss": 1.0374043, + "learning_rate": 0.0009993789534991427, + "loss": 1.04873264, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 0.26708984, + "step": 236, + "time_per_iteration": 2.4350106716156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112254, + "balance_loss_mlp": 1.0960753, + "epoch": 0.045594459407464406, + "flos": 522669583872.0, + "grad_norm": 0.0440176634981383, + "language_loss": 0.99063611, + "learning_rate": 0.0009993633336145287, + "loss": 1.00186157, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 0.26513672, + "step": 237, + "time_per_iteration": 2.6414294242858887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134799, + "balance_loss_mlp": 1.10904956, + "epoch": 0.04578684109272797, + "flos": 671776104960.0, + "grad_norm": 0.04213473561248219, + "language_loss": 1.02718055, + "learning_rate": 0.0009993475198598752, + "loss": 1.03852856, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 0.25756836, + "step": 238, + "time_per_iteration": 2.9781904220581055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152995, + "balance_loss_mlp": 1.12614954, + "epoch": 0.04597922277799153, + "flos": 541633277952.0, + "grad_norm": 0.08613106589232603, + "language_loss": 1.00055635, + "learning_rate": 0.0009993315122413212, + "loss": 1.01208627, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 0.26879883, + "step": 239, + "time_per_iteration": 2.6395275592803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162384, + "balance_loss_mlp": 1.13594294, + "epoch": 0.0461716044632551, + "flos": 458984102400.0, + "grad_norm": 0.06839694959482054, + "language_loss": 0.99973977, + "learning_rate": 0.0009993153107650818, + "loss": 1.01136363, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 0.2644043, + "step": 240, + "time_per_iteration": 2.563133716583252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160824, + "balance_loss_mlp": 1.13391829, + "epoch": 0.04636398614851866, + "flos": 455240922624.0, + "grad_norm": 0.06471449859153773, + "language_loss": 0.98970807, + "learning_rate": 0.0009992989154374468, + "loss": 1.00131631, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 0.26928711, + "step": 241, + "time_per_iteration": 2.5339503288269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145463, + "balance_loss_mlp": 1.11914206, + "epoch": 0.046556367833782225, + "flos": 556826254848.0, + "grad_norm": 0.06957696695924716, + "language_loss": 1.05868769, + "learning_rate": 0.0009992823262647817, + "loss": 1.07014227, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 0.26342773, + "step": 242, + "time_per_iteration": 2.6841883659362793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111302, + "balance_loss_mlp": 1.08692503, + "epoch": 0.046748749519045785, + "flos": 592917470208.0, + "grad_norm": 0.0649477492764712, + "language_loss": 0.99848783, + "learning_rate": 0.0009992655432535264, + "loss": 1.00961804, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 0.2611084, + "step": 243, + "time_per_iteration": 2.7613234519958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107198, + "balance_loss_mlp": 1.08162785, + "epoch": 0.04694113120430935, + "flos": 569864802816.0, + "grad_norm": 0.05612685480258275, + "language_loss": 1.00329947, + "learning_rate": 0.0009992485664101973, + "loss": 1.01437151, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 0.25598145, + "step": 244, + "time_per_iteration": 2.717280387878418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122363, + "balance_loss_mlp": 1.09556472, + "epoch": 0.04713351288957291, + "flos": 863768987136.0, + "grad_norm": 0.10316769075352135, + "language_loss": 1.02662849, + "learning_rate": 0.000999231395741385, + "loss": 1.03785205, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 0.26831055, + "step": 245, + "time_per_iteration": 3.095249891281128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144012, + "balance_loss_mlp": 1.11837006, + "epoch": 0.04732589457483648, + "flos": 537215390208.0, + "grad_norm": 0.09647975042234339, + "language_loss": 1.01015186, + "learning_rate": 0.0009992140312537557, + "loss": 1.02159202, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 0.25671387, + "step": 246, + "time_per_iteration": 2.633258819580078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123665, + "balance_loss_mlp": 1.09845233, + "epoch": 0.04751827626010004, + "flos": 761906870784.0, + "grad_norm": 0.09798218580430706, + "language_loss": 0.95550418, + "learning_rate": 0.000999196472954051, + "loss": 0.96674085, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 0.25231934, + "step": 247, + "time_per_iteration": 3.024939775466919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02466762, + "balance_loss_mlp": 2.43700695, + "epoch": 0.0477106579453636, + "flos": 1579791859200.0, + "grad_norm": 0.2831653982047738, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.81891614, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 0.296875, + "step": 248, + "time_per_iteration": 5.486468076705933 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162703, + "balance_loss_mlp": 1.13626289, + "epoch": 0.04790303963062716, + "flos": 457766111232.0, + "grad_norm": 0.12969478117477343, + "language_loss": 1.03178453, + "learning_rate": 0.0009991607749457578, + "loss": 1.04341149, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 0.26464844, + "step": 249, + "time_per_iteration": 2.5253713130950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119774, + "balance_loss_mlp": 1.16941571, + "epoch": 0.04809542131589073, + "flos": 782419668480.0, + "grad_norm": 0.09425507858465235, + "language_loss": 1.01008546, + "learning_rate": 0.0009991426352510286, + "loss": 1.0220629, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 0.28295898, + "step": 250, + "time_per_iteration": 3.0042202472686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204128, + "balance_loss_mlp": 1.174016, + "epoch": 0.04828780300115429, + "flos": 559260039168.0, + "grad_norm": 0.07677732337183582, + "language_loss": 1.0282234, + "learning_rate": 0.0009991243017719422, + "loss": 1.04026473, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 0.30126953, + "step": 251, + "time_per_iteration": 2.709934711456299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206766, + "balance_loss_mlp": 1.17522311, + "epoch": 0.048480184686417856, + "flos": 501929561088.0, + "grad_norm": 0.1103729500964747, + "language_loss": 0.97436613, + "learning_rate": 0.0009991057745156165, + "loss": 0.9864338, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 0.31518555, + "step": 252, + "time_per_iteration": 2.5961716175079346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03348202, + "balance_loss_mlp": 3.30471396, + "epoch": 0.048672566371681415, + "flos": 1536360016896.0, + "grad_norm": 0.3811060337507454, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.85259187, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 0.43554688, + "step": 253, + "time_per_iteration": 5.0377867221832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195312, + "balance_loss_mlp": 1.1623621, + "epoch": 0.04886494805694498, + "flos": 537922031616.0, + "grad_norm": 0.07473951959737497, + "language_loss": 1.05491519, + "learning_rate": 0.0009990681387000943, + "loss": 1.06686831, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 0.3293457, + "step": 254, + "time_per_iteration": 2.7937283515930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121698, + "balance_loss_mlp": 1.18345821, + "epoch": 0.04905732974220854, + "flos": 680169540096.0, + "grad_norm": 0.06898181212790383, + "language_loss": 1.01063621, + "learning_rate": 0.0009990490301555093, + "loss": 1.02280605, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 0.33544922, + "step": 255, + "time_per_iteration": 2.9615726470947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.05252755, + "balance_loss_mlp": 5.12458086, + "epoch": 0.04924971142747211, + "flos": 1421179997184.0, + "grad_norm": 0.5609302024280507, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.84467912, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 1.28125, + "step": 256, + "time_per_iteration": 4.8413920402526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03162439, + "balance_loss_mlp": 3.09758925, + "epoch": 0.04944209311273567, + "flos": 1558006742016.0, + "grad_norm": 0.1723793408951341, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.8240518, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 0.6484375, + "step": 257, + "time_per_iteration": 4.985513687133789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03630928, + "balance_loss_mlp": 3.55844903, + "epoch": 0.04963447479799923, + "flos": 1570820262912.0, + "grad_norm": 0.4079591987734508, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.73606813, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 0.7265625, + "step": 258, + "time_per_iteration": 4.858096361160278 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01403117, + "balance_loss_mlp": 1.35569584, + "epoch": 0.049826856483262794, + "flos": 625349569536.0, + "grad_norm": 0.11330256318865821, + "language_loss": 0.95339322, + "learning_rate": 0.0009989706585723202, + "loss": 0.96742439, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 0.47436523, + "step": 259, + "time_per_iteration": 2.794419765472412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01437412, + "balance_loss_mlp": 1.38651013, + "epoch": 0.05001923816852635, + "flos": 504160713216.0, + "grad_norm": 0.10381773722922016, + "language_loss": 1.0219605, + "learning_rate": 0.0009989505813633442, + "loss": 1.03633475, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 0.50927734, + "step": 260, + "time_per_iteration": 2.6660099029541016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145174, + "balance_loss_mlp": 1.39776254, + "epoch": 0.05021161985378992, + "flos": 587345831424.0, + "grad_norm": 0.12909552841436595, + "language_loss": 1.02080631, + "learning_rate": 0.000998930310444573, + "loss": 1.03532374, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 0.5402832, + "step": 261, + "time_per_iteration": 2.7547266483306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01429363, + "balance_loss_mlp": 1.37698281, + "epoch": 0.05040400153905348, + "flos": 633303235584.0, + "grad_norm": 0.08616818959721087, + "language_loss": 0.99936116, + "learning_rate": 0.0009989098458238765, + "loss": 1.01365471, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 0.52441406, + "step": 262, + "time_per_iteration": 2.804656982421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01431577, + "balance_loss_mlp": 1.38310647, + "epoch": 0.050596383224317046, + "flos": 553636270080.0, + "grad_norm": 0.10103635045761167, + "language_loss": 0.99213421, + "learning_rate": 0.0009988891875091998, + "loss": 1.00644994, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 0.48486328, + "step": 263, + "time_per_iteration": 2.780696392059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01359367, + "balance_loss_mlp": 1.31771505, + "epoch": 0.050788764909580605, + "flos": 549663293952.0, + "grad_norm": 0.09437475228894394, + "language_loss": 0.93793595, + "learning_rate": 0.0009988683355085636, + "loss": 0.95152962, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 0.41625977, + "step": 264, + "time_per_iteration": 2.758275032043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01314446, + "balance_loss_mlp": 1.27684712, + "epoch": 0.05098114659484417, + "flos": 605118325248.0, + "grad_norm": 0.09784246378207673, + "language_loss": 1.02612829, + "learning_rate": 0.000998847289830063, + "loss": 1.03927279, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 0.37524414, + "step": 265, + "time_per_iteration": 2.8752288818359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01289086, + "balance_loss_mlp": 1.25468266, + "epoch": 0.05117352828010773, + "flos": 438548027904.0, + "grad_norm": 0.06973466471853282, + "language_loss": 0.95293748, + "learning_rate": 0.0009988260504818682, + "loss": 0.9658283, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 0.34423828, + "step": 266, + "time_per_iteration": 2.5960564613342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01290407, + "balance_loss_mlp": 1.2563374, + "epoch": 0.0513659099653713, + "flos": 505032910848.0, + "grad_norm": 0.0971565340820806, + "language_loss": 1.02148294, + "learning_rate": 0.000998804617472226, + "loss": 1.03438699, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 0.34082031, + "step": 267, + "time_per_iteration": 2.658709764480591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275377, + "balance_loss_mlp": 1.24085402, + "epoch": 0.05155829165063486, + "flos": 695488799232.0, + "grad_norm": 0.10761719469623075, + "language_loss": 0.96939588, + "learning_rate": 0.0009987829908094568, + "loss": 0.98214972, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 0.34545898, + "step": 268, + "time_per_iteration": 2.8270740509033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01271333, + "balance_loss_mlp": 1.23785877, + "epoch": 0.051750673335898424, + "flos": 1348260111360.0, + "grad_norm": 0.1226169977774822, + "language_loss": 1.04002702, + "learning_rate": 0.0009987611705019569, + "loss": 1.05274034, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 0.33496094, + "step": 269, + "time_per_iteration": 4.483954429626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01277218, + "balance_loss_mlp": 1.24267149, + "epoch": 0.051943055021161984, + "flos": 489607566336.0, + "grad_norm": 0.07374197309260985, + "language_loss": 1.02401245, + "learning_rate": 0.0009987391565581978, + "loss": 1.03678453, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 0.34594727, + "step": 270, + "time_per_iteration": 2.627356767654419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01304636, + "balance_loss_mlp": 1.26977956, + "epoch": 0.05213543670642555, + "flos": 545779150848.0, + "grad_norm": 0.06923057034816653, + "language_loss": 0.94496262, + "learning_rate": 0.000998716948986726, + "loss": 0.95800889, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 0.34887695, + "step": 271, + "time_per_iteration": 2.804185628890991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01322736, + "balance_loss_mlp": 1.28718746, + "epoch": 0.05232781839168911, + "flos": 603561881088.0, + "grad_norm": 0.1173780328671846, + "language_loss": 0.97372609, + "learning_rate": 0.0009986945477961633, + "loss": 0.9869535, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 0.35571289, + "step": 272, + "time_per_iteration": 2.739595890045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01297409, + "balance_loss_mlp": 1.2620039, + "epoch": 0.052520200076952676, + "flos": 538504962048.0, + "grad_norm": 0.07261359465506025, + "language_loss": 1.02136993, + "learning_rate": 0.0009986719529952066, + "loss": 1.03434396, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 0.35424805, + "step": 273, + "time_per_iteration": 2.8717877864837646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239184, + "balance_loss_mlp": 1.20389819, + "epoch": 0.052712581762216236, + "flos": 463384737792.0, + "grad_norm": 0.13624684616705834, + "language_loss": 1.01736569, + "learning_rate": 0.000998649164592628, + "loss": 1.0297575, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 0.35327148, + "step": 274, + "time_per_iteration": 2.590993642807007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206885, + "balance_loss_mlp": 1.16945291, + "epoch": 0.0529049634474798, + "flos": 548020214784.0, + "grad_norm": 0.061304815826305474, + "language_loss": 0.99439085, + "learning_rate": 0.0009986261825972748, + "loss": 1.00645971, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 0.37426758, + "step": 275, + "time_per_iteration": 2.702202081680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183431, + "balance_loss_mlp": 1.14466429, + "epoch": 0.05309734513274336, + "flos": 618021052416.0, + "grad_norm": 0.10486338408500256, + "language_loss": 1.01433325, + "learning_rate": 0.000998603007018069, + "loss": 1.02616751, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 0.38745117, + "step": 276, + "time_per_iteration": 2.876267671585083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190326, + "balance_loss_mlp": 1.15055728, + "epoch": 0.05328972681800693, + "flos": 605498996736.0, + "grad_norm": 0.08719890934761923, + "language_loss": 0.99445826, + "learning_rate": 0.0009985796378640089, + "loss": 1.00636148, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 0.39746094, + "step": 277, + "time_per_iteration": 2.74886155128479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165278, + "balance_loss_mlp": 1.12720275, + "epoch": 0.05348210850327049, + "flos": 604503088128.0, + "grad_norm": 0.06292174667602014, + "language_loss": 0.99806106, + "learning_rate": 0.0009985560751441665, + "loss": 1.00971389, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 0.38061523, + "step": 278, + "time_per_iteration": 2.8894753456115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175743, + "balance_loss_mlp": 1.13790607, + "epoch": 0.053674490188534055, + "flos": 630782816256.0, + "grad_norm": 0.06329003141341145, + "language_loss": 1.01538157, + "learning_rate": 0.00099853231886769, + "loss": 1.02713895, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 0.37792969, + "step": 279, + "time_per_iteration": 2.783085823059082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183406, + "balance_loss_mlp": 1.14633179, + "epoch": 0.053866871873797614, + "flos": 479185611264.0, + "grad_norm": 0.06545769746199957, + "language_loss": 1.01316965, + "learning_rate": 0.0009985083690438024, + "loss": 1.02500367, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 0.37084961, + "step": 280, + "time_per_iteration": 2.707329511642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147788, + "balance_loss_mlp": 1.11245418, + "epoch": 0.054059253559061174, + "flos": 788035723776.0, + "grad_norm": 0.05305898567294309, + "language_loss": 0.9175781, + "learning_rate": 0.0009984842256818016, + "loss": 0.92905599, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 0.35400391, + "step": 281, + "time_per_iteration": 3.1014201641082764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165943, + "balance_loss_mlp": 1.13106215, + "epoch": 0.05425163524432474, + "flos": 628361515008.0, + "grad_norm": 0.05782684737590577, + "language_loss": 1.02446878, + "learning_rate": 0.0009984598887910613, + "loss": 1.03612816, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 0.34912109, + "step": 282, + "time_per_iteration": 2.75343656539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180579, + "balance_loss_mlp": 1.14555514, + "epoch": 0.0544440169295883, + "flos": 615760164864.0, + "grad_norm": 0.0631633618899466, + "language_loss": 0.98333299, + "learning_rate": 0.0009984353583810297, + "loss": 0.99513876, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 0.3503418, + "step": 283, + "time_per_iteration": 2.8092565536499023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186173, + "balance_loss_mlp": 1.15350997, + "epoch": 0.05463639861485187, + "flos": 647762406912.0, + "grad_norm": 0.0821933313576245, + "language_loss": 1.00416183, + "learning_rate": 0.0009984106344612302, + "loss": 1.01602352, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 0.32666016, + "step": 284, + "time_per_iteration": 2.7632908821105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163515, + "balance_loss_mlp": 1.1310904, + "epoch": 0.054828780300115426, + "flos": 797192699904.0, + "grad_norm": 0.06349155766627652, + "language_loss": 0.95740765, + "learning_rate": 0.0009983857170412615, + "loss": 0.96904278, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 0.32421875, + "step": 285, + "time_per_iteration": 2.9946134090423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130334, + "balance_loss_mlp": 1.09912539, + "epoch": 0.05502116198537899, + "flos": 549690458112.0, + "grad_norm": 0.0487694941790178, + "language_loss": 0.95326382, + "learning_rate": 0.000998360606130798, + "loss": 0.96456718, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 0.31176758, + "step": 286, + "time_per_iteration": 2.8205370903015137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.09512836, + "balance_loss_mlp": 7.26674223, + "epoch": 0.05521354367064255, + "flos": 1407753437184.0, + "grad_norm": 0.42812971022266805, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.78585953, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 22.5, + "step": 287, + "time_per_iteration": 4.986966848373413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173658, + "balance_loss_mlp": 1.14278328, + "epoch": 0.05540592535590612, + "flos": 645420026880.0, + "grad_norm": 0.08917023960137904, + "language_loss": 1.01027536, + "learning_rate": 0.0009983098038774552, + "loss": 1.02201188, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 0.30834961, + "step": 288, + "time_per_iteration": 2.8100168704986572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.06110836, + "balance_loss_mlp": 5.25634384, + "epoch": 0.05559830704116968, + "flos": 1511095647744.0, + "grad_norm": 0.4031517895181362, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.84281063, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 8.5625, + "step": 289, + "time_per_iteration": 4.790200233459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126699, + "balance_loss_mlp": 1.23435044, + "epoch": 0.055790688726433245, + "flos": 508328980992.0, + "grad_norm": 0.18275347501036113, + "language_loss": 0.9955281, + "learning_rate": 0.0009982582277800948, + "loss": 1.00819802, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 0.32641602, + "step": 290, + "time_per_iteration": 2.5976333618164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281728, + "balance_loss_mlp": 1.24694288, + "epoch": 0.055983070411696804, + "flos": 657870501888.0, + "grad_norm": 0.14603269886404707, + "language_loss": 1.06751418, + "learning_rate": 0.0009982321495648908, + "loss": 1.08033144, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 0.34838867, + "step": 291, + "time_per_iteration": 2.8513312339782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250537, + "balance_loss_mlp": 1.21348643, + "epoch": 0.05617545209696037, + "flos": 587335919616.0, + "grad_norm": 0.09283742859778188, + "language_loss": 0.97403693, + "learning_rate": 0.0009982058779188115, + "loss": 0.98654234, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 0.37011719, + "step": 292, + "time_per_iteration": 2.728203773498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230786, + "balance_loss_mlp": 1.19170928, + "epoch": 0.05636783378222393, + "flos": 611621632512.0, + "grad_norm": 0.08826519450204054, + "language_loss": 1.05705655, + "learning_rate": 0.0009981794128520567, + "loss": 1.06936455, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 0.39038086, + "step": 293, + "time_per_iteration": 2.79616379737854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01253904, + "balance_loss_mlp": 1.21258569, + "epoch": 0.0565602154674875, + "flos": 668161405440.0, + "grad_norm": 0.08065602932127632, + "language_loss": 1.01724029, + "learning_rate": 0.000998152754374901, + "loss": 1.02977943, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 0.41333008, + "step": 294, + "time_per_iteration": 2.9352946281433105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232141, + "balance_loss_mlp": 1.19132411, + "epoch": 0.05675259715275106, + "flos": 617242830336.0, + "grad_norm": 0.07309017642696977, + "language_loss": 0.9826439, + "learning_rate": 0.0009981259024976943, + "loss": 0.99496531, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 0.40820312, + "step": 295, + "time_per_iteration": 2.7376105785369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244019, + "balance_loss_mlp": 1.20112753, + "epoch": 0.05694497883801462, + "flos": 751769040384.0, + "grad_norm": 0.07769478500482971, + "language_loss": 0.96765345, + "learning_rate": 0.0009980988572308612, + "loss": 0.9800936, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 0.42871094, + "step": 296, + "time_per_iteration": 3.001779556274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226011, + "balance_loss_mlp": 1.18197489, + "epoch": 0.05713736052327818, + "flos": 712010995200.0, + "grad_norm": 0.0588150430335769, + "language_loss": 0.99343681, + "learning_rate": 0.0009980716185849015, + "loss": 1.00569689, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 0.44067383, + "step": 297, + "time_per_iteration": 2.9817121028900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223805, + "balance_loss_mlp": 1.18153381, + "epoch": 0.05732974220854175, + "flos": 468976200192.0, + "grad_norm": 0.06400414638033543, + "language_loss": 0.95616293, + "learning_rate": 0.0009980441865703904, + "loss": 0.96840101, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 0.4230957, + "step": 298, + "time_per_iteration": 2.615875244140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122669, + "balance_loss_mlp": 1.18513405, + "epoch": 0.05752212389380531, + "flos": 601422133248.0, + "grad_norm": 0.09089975305964836, + "language_loss": 1.03662193, + "learning_rate": 0.000998016561197978, + "loss": 1.04888892, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 0.41577148, + "step": 299, + "time_per_iteration": 2.765833854675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219698, + "balance_loss_mlp": 1.17835617, + "epoch": 0.057714505579068875, + "flos": 678664852992.0, + "grad_norm": 0.05662219614280908, + "language_loss": 0.94978034, + "learning_rate": 0.0009979887424783895, + "loss": 0.96197736, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 0.41357422, + "step": 300, + "time_per_iteration": 2.8931760787963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122099, + "balance_loss_mlp": 1.17850339, + "epoch": 0.057906887264332435, + "flos": 595884999168.0, + "grad_norm": 0.05388706690809858, + "language_loss": 0.94851983, + "learning_rate": 0.0009979607304224248, + "loss": 0.96072972, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 0.42504883, + "step": 301, + "time_per_iteration": 2.719282388687134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213648, + "balance_loss_mlp": 1.16951644, + "epoch": 0.058099268949596, + "flos": 552116901888.0, + "grad_norm": 0.0564182452216587, + "language_loss": 1.02312028, + "learning_rate": 0.000997932525040959, + "loss": 1.03525686, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 0.44140625, + "step": 302, + "time_per_iteration": 2.7084572315216064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185834, + "balance_loss_mlp": 1.14165473, + "epoch": 0.05829165063485956, + "flos": 508170765312.0, + "grad_norm": 0.07525794393376325, + "language_loss": 1.04335976, + "learning_rate": 0.000997904126344943, + "loss": 1.05521822, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 0.44165039, + "step": 303, + "time_per_iteration": 2.6271631717681885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121438, + "balance_loss_mlp": 1.17055893, + "epoch": 0.05848403232012313, + "flos": 615231562752.0, + "grad_norm": 0.0664075129682053, + "language_loss": 1.00263453, + "learning_rate": 0.0009978755343454018, + "loss": 1.01477838, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 0.43823242, + "step": 304, + "time_per_iteration": 2.791146993637085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182664, + "balance_loss_mlp": 1.13869941, + "epoch": 0.05867641400538669, + "flos": 500083849728.0, + "grad_norm": 0.07350056034493838, + "language_loss": 1.01461756, + "learning_rate": 0.0009978467490534355, + "loss": 1.0264442, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 0.43969727, + "step": 305, + "time_per_iteration": 2.614455461502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186922, + "balance_loss_mlp": 1.14424467, + "epoch": 0.05886879569065025, + "flos": 531290244096.0, + "grad_norm": 0.056638515612222363, + "language_loss": 0.97774673, + "learning_rate": 0.00099781777048022, + "loss": 0.98961592, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 0.42700195, + "step": 306, + "time_per_iteration": 2.717700481414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011718, + "balance_loss_mlp": 1.12855101, + "epoch": 0.05906117737591381, + "flos": 489056569344.0, + "grad_norm": 0.056560878082468485, + "language_loss": 0.99827361, + "learning_rate": 0.0009977885986370057, + "loss": 1.00999165, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 0.43310547, + "step": 307, + "time_per_iteration": 2.557203531265259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164879, + "balance_loss_mlp": 1.12263095, + "epoch": 0.05925355906117737, + "flos": 591511527936.0, + "grad_norm": 0.05991229640473007, + "language_loss": 0.9525907, + "learning_rate": 0.000997759233535118, + "loss": 0.9642396, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 0.42285156, + "step": 308, + "time_per_iteration": 2.8033511638641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174986, + "balance_loss_mlp": 1.1345737, + "epoch": 0.05944594074644094, + "flos": 563655532032.0, + "grad_norm": 0.06710738832596337, + "language_loss": 1.01122141, + "learning_rate": 0.0009977296751859576, + "loss": 1.02297115, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 0.40405273, + "step": 309, + "time_per_iteration": 2.8259334564208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164837, + "balance_loss_mlp": 1.12487829, + "epoch": 0.0596383224317045, + "flos": 538747241472.0, + "grad_norm": 0.05223481097130428, + "language_loss": 1.03482628, + "learning_rate": 0.0009976999236009998, + "loss": 1.0464747, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 0.39941406, + "step": 310, + "time_per_iteration": 2.769092321395874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164403, + "balance_loss_mlp": 1.1263994, + "epoch": 0.059830704116968066, + "flos": 560957446656.0, + "grad_norm": 0.05685909644716586, + "language_loss": 1.04877043, + "learning_rate": 0.0009976699787917955, + "loss": 1.06041443, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 0.37963867, + "step": 311, + "time_per_iteration": 2.6526851654052734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08775091, + "balance_loss_mlp": 7.79852915, + "epoch": 0.060023085802231625, + "flos": 1570615059456.0, + "grad_norm": 0.2725707199289832, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.82218087, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 9.75, + "step": 312, + "time_per_iteration": 5.006884813308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161181, + "balance_loss_mlp": 1.12172294, + "epoch": 0.06021546748749519, + "flos": 482657149440.0, + "grad_norm": 0.06726838636277511, + "language_loss": 0.96427834, + "learning_rate": 0.0009976095095472243, + "loss": 0.97589004, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 0.39428711, + "step": 313, + "time_per_iteration": 2.5785915851593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166252, + "balance_loss_mlp": 1.12738967, + "epoch": 0.06040784917275875, + "flos": 620195304960.0, + "grad_norm": 0.0761643630364548, + "language_loss": 0.97957367, + "learning_rate": 0.0009975789851353334, + "loss": 0.99123621, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 0.38818359, + "step": 314, + "time_per_iteration": 2.814901828765869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173499, + "balance_loss_mlp": 1.13191843, + "epoch": 0.06060023085802232, + "flos": 483553939968.0, + "grad_norm": 0.07475166161853689, + "language_loss": 1.00319684, + "learning_rate": 0.0009975482675461487, + "loss": 1.0149318, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 0.41601562, + "step": 315, + "time_per_iteration": 2.65468692779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159286, + "balance_loss_mlp": 1.11591756, + "epoch": 0.06079261254328588, + "flos": 581892761088.0, + "grad_norm": 0.08252555003670439, + "language_loss": 0.98425788, + "learning_rate": 0.0009975173567915952, + "loss": 0.99585068, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 0.43383789, + "step": 316, + "time_per_iteration": 2.6916940212249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173408, + "balance_loss_mlp": 1.12767935, + "epoch": 0.060984994228549444, + "flos": 687794664960.0, + "grad_norm": 0.0640207679679256, + "language_loss": 0.91960573, + "learning_rate": 0.000997486252883674, + "loss": 0.93133986, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 0.45727539, + "step": 317, + "time_per_iteration": 2.8535635471343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188261, + "balance_loss_mlp": 1.13979006, + "epoch": 0.061177375913813004, + "flos": 1314775577088.0, + "grad_norm": 0.0671416603225842, + "language_loss": 0.97457695, + "learning_rate": 0.0009974549558344602, + "loss": 0.98645949, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 0.484375, + "step": 318, + "time_per_iteration": 3.6911113262176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189393, + "balance_loss_mlp": 1.14037383, + "epoch": 0.06136975759907657, + "flos": 574337018880.0, + "grad_norm": 0.09268216800999254, + "language_loss": 1.06808639, + "learning_rate": 0.000997423465656105, + "loss": 1.07998025, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 0.49023438, + "step": 319, + "time_per_iteration": 2.727130651473999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147465, + "balance_loss_mlp": 1.096205, + "epoch": 0.06156213928434013, + "flos": 527537152512.0, + "grad_norm": 0.06029287427116143, + "language_loss": 1.04509127, + "learning_rate": 0.0009973917823608335, + "loss": 1.05656588, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 0.51318359, + "step": 320, + "time_per_iteration": 2.654794454574585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148103, + "balance_loss_mlp": 1.09605646, + "epoch": 0.061754520969603696, + "flos": 495507746304.0, + "grad_norm": 0.03213952729051003, + "language_loss": 0.98612553, + "learning_rate": 0.0009973599059609462, + "loss": 0.99760658, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 0.52075195, + "step": 321, + "time_per_iteration": 2.7024786472320557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142475, + "balance_loss_mlp": 1.09133446, + "epoch": 0.061946902654867256, + "flos": 440079879168.0, + "grad_norm": 0.04984356389382333, + "language_loss": 0.97161096, + "learning_rate": 0.000997327836468819, + "loss": 0.9830358, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 0.51147461, + "step": 322, + "time_per_iteration": 2.6242218017578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142913, + "balance_loss_mlp": 1.0917964, + "epoch": 0.06213928434013082, + "flos": 598800397824.0, + "grad_norm": 0.06671524152363617, + "language_loss": 0.99795449, + "learning_rate": 0.000997295573896902, + "loss": 1.00938356, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 0.51171875, + "step": 323, + "time_per_iteration": 2.834237813949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03299168, + "balance_loss_mlp": 3.12445545, + "epoch": 0.06233166602539438, + "flos": 1450135789056.0, + "grad_norm": 0.43556355854402456, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.84495211, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 1.75, + "step": 324, + "time_per_iteration": 4.770992040634155 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02151431, + "balance_loss_mlp": 1.9545927, + "epoch": 0.06252404771065795, + "flos": 1463327036928.0, + "grad_norm": 0.14082611715048204, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.80723369, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 1.9609375, + "step": 325, + "time_per_iteration": 4.8816118240356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196226, + "balance_loss_mlp": 1.14768362, + "epoch": 0.06271642939592151, + "flos": 464294011392.0, + "grad_norm": 0.08367806581965369, + "language_loss": 0.93651855, + "learning_rate": 0.000997197627828043, + "loss": 0.94848073, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 0.4855957, + "step": 326, + "time_per_iteration": 2.5508148670196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215208, + "balance_loss_mlp": 1.16862106, + "epoch": 0.06290881108118507, + "flos": 532374985728.0, + "grad_norm": 0.06635735350324974, + "language_loss": 0.89348811, + "learning_rate": 0.0009971645930629716, + "loss": 0.90564024, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 0.46533203, + "step": 327, + "time_per_iteration": 2.711386203765869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125047, + "balance_loss_mlp": 1.20192814, + "epoch": 0.06310119276644863, + "flos": 673562718720.0, + "grad_norm": 0.08863859510008423, + "language_loss": 1.03147936, + "learning_rate": 0.0009971313652814872, + "loss": 1.04398406, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 0.48486328, + "step": 328, + "time_per_iteration": 2.8484854698181152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225004, + "balance_loss_mlp": 1.17553234, + "epoch": 0.0632935744517122, + "flos": 770732734464.0, + "grad_norm": 0.08503417282278386, + "language_loss": 1.0059731, + "learning_rate": 0.0009970979444964903, + "loss": 1.01822317, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 0.49487305, + "step": 329, + "time_per_iteration": 2.957482099533081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197604, + "balance_loss_mlp": 1.14846587, + "epoch": 0.06348595613697576, + "flos": 561913708032.0, + "grad_norm": 0.06790724972181753, + "language_loss": 1.01849604, + "learning_rate": 0.0009970643307209556, + "loss": 1.03047216, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 0.49121094, + "step": 330, + "time_per_iteration": 2.8220374584198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170349, + "balance_loss_mlp": 1.1215446, + "epoch": 0.06367833782223932, + "flos": 676189223424.0, + "grad_norm": 0.06721894230078661, + "language_loss": 0.98097444, + "learning_rate": 0.0009970305239679334, + "loss": 0.99267793, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 0.48803711, + "step": 331, + "time_per_iteration": 2.8813369274139404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176679, + "balance_loss_mlp": 1.12754059, + "epoch": 0.06387071950750288, + "flos": 495297773568.0, + "grad_norm": 0.056286161373139375, + "language_loss": 1.03013992, + "learning_rate": 0.0009969965242505483, + "loss": 1.04190671, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 0.4909668, + "step": 332, + "time_per_iteration": 2.6662604808807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168774, + "balance_loss_mlp": 1.11932611, + "epoch": 0.06406310119276645, + "flos": 533447244288.0, + "grad_norm": 0.06031850484613652, + "language_loss": 0.99096131, + "learning_rate": 0.0009969623315820007, + "loss": 1.00264907, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 0.49487305, + "step": 333, + "time_per_iteration": 2.6671581268310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153261, + "balance_loss_mlp": 1.10619712, + "epoch": 0.06425548287803001, + "flos": 456184700928.0, + "grad_norm": 0.06229524640691676, + "language_loss": 0.99215055, + "learning_rate": 0.000996927945975565, + "loss": 1.00368309, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 0.47070312, + "step": 334, + "time_per_iteration": 2.568838357925415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115937, + "balance_loss_mlp": 1.1125921, + "epoch": 0.06444786456329357, + "flos": 560077908480.0, + "grad_norm": 0.05620099657237302, + "language_loss": 0.95852566, + "learning_rate": 0.0009968933674445906, + "loss": 0.97011936, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 0.46728516, + "step": 335, + "time_per_iteration": 2.6725666522979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160514, + "balance_loss_mlp": 1.1122818, + "epoch": 0.06464024624855713, + "flos": 666085897728.0, + "grad_norm": 0.05589062806096766, + "language_loss": 0.97974062, + "learning_rate": 0.0009968585960025028, + "loss": 0.99134576, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 0.48217773, + "step": 336, + "time_per_iteration": 2.945194959640503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0396516, + "balance_loss_mlp": 3.85834861, + "epoch": 0.0648326279338207, + "flos": 1521371870208.0, + "grad_norm": 0.42886267506062575, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.81618351, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 1.0703125, + "step": 337, + "time_per_iteration": 4.802944183349609 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215082, + "balance_loss_mlp": 1.16968668, + "epoch": 0.06502500961908426, + "flos": 1143339909120.0, + "grad_norm": 0.09324534870618859, + "language_loss": 0.96021777, + "learning_rate": 0.0009967884744390583, + "loss": 0.9723686, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 0.45361328, + "step": 338, + "time_per_iteration": 3.5247950553894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251582, + "balance_loss_mlp": 1.2060678, + "epoch": 0.06521739130434782, + "flos": 582609314304.0, + "grad_norm": 0.09123718626917265, + "language_loss": 0.97373873, + "learning_rate": 0.0009967531243449256, + "loss": 0.98625457, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 0.45507812, + "step": 339, + "time_per_iteration": 2.681973695755005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211309, + "balance_loss_mlp": 1.163077, + "epoch": 0.06540977298961138, + "flos": 497650065408.0, + "grad_norm": 0.06030156589334856, + "language_loss": 1.04525125, + "learning_rate": 0.000996717581394126, + "loss": 1.05736434, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 0.48242188, + "step": 340, + "time_per_iteration": 2.6031126976013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205107, + "balance_loss_mlp": 1.15630233, + "epoch": 0.06560215467487496, + "flos": 542871092736.0, + "grad_norm": 0.06934362388274598, + "language_loss": 1.05133414, + "learning_rate": 0.000996681845600459, + "loss": 1.06338525, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 0.48803711, + "step": 341, + "time_per_iteration": 2.6689491271972656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190009, + "balance_loss_mlp": 1.1402986, + "epoch": 0.06579453636013852, + "flos": 413454357504.0, + "grad_norm": 0.07929020766121274, + "language_loss": 0.97276402, + "learning_rate": 0.0009966459169777982, + "loss": 0.98466408, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 0.49731445, + "step": 342, + "time_per_iteration": 2.5235347747802734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183772, + "balance_loss_mlp": 1.13444376, + "epoch": 0.06598691804540208, + "flos": 560618993664.0, + "grad_norm": 0.06503113555429127, + "language_loss": 1.05431008, + "learning_rate": 0.0009966097955400924, + "loss": 1.0661478, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 0.4934082, + "step": 343, + "time_per_iteration": 2.6987814903259277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195626, + "balance_loss_mlp": 1.14772749, + "epoch": 0.06617929973066564, + "flos": 572090812416.0, + "grad_norm": 0.05810753199069879, + "language_loss": 0.99792945, + "learning_rate": 0.0009965734813013652, + "loss": 1.00988579, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 0.47924805, + "step": 344, + "time_per_iteration": 2.8092823028564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211149, + "balance_loss_mlp": 1.16191518, + "epoch": 0.06637168141592921, + "flos": 490479763968.0, + "grad_norm": 0.08606224500635251, + "language_loss": 1.02011895, + "learning_rate": 0.0009965369742757151, + "loss": 1.03223062, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 0.49243164, + "step": 345, + "time_per_iteration": 2.5981764793395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193116, + "balance_loss_mlp": 1.14435959, + "epoch": 0.06656406310119277, + "flos": 1079194834944.0, + "grad_norm": 0.0619511290056959, + "language_loss": 0.98293203, + "learning_rate": 0.0009965002744773152, + "loss": 0.99486327, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 0.48730469, + "step": 346, + "time_per_iteration": 3.4968950748443604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178364, + "balance_loss_mlp": 1.13115668, + "epoch": 0.06675644478645633, + "flos": 513680735232.0, + "grad_norm": 0.04856723246232052, + "language_loss": 0.95658922, + "learning_rate": 0.0009964633819204139, + "loss": 0.96837282, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 0.47167969, + "step": 347, + "time_per_iteration": 2.6705336570739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04576048, + "balance_loss_mlp": 4.3029151, + "epoch": 0.06694882647171989, + "flos": 1447192479744.0, + "grad_norm": 0.32603271390487504, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.86377156, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 2.734375, + "step": 348, + "time_per_iteration": 4.961863994598389 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03789769, + "balance_loss_mlp": 3.60590124, + "epoch": 0.06714120815698346, + "flos": 1552061772288.0, + "grad_norm": 0.16497869204612428, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.78943658, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 1.8359375, + "step": 349, + "time_per_iteration": 4.876751184463501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181375, + "balance_loss_mlp": 1.13578987, + "epoch": 0.06733358984224702, + "flos": 880073869824.0, + "grad_norm": 0.07770510755269132, + "language_loss": 0.96067584, + "learning_rate": 0.000996351547842304, + "loss": 0.9724896, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 0.45581055, + "step": 350, + "time_per_iteration": 3.166680097579956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217287, + "balance_loss_mlp": 1.16969919, + "epoch": 0.06752597152751058, + "flos": 518906580480.0, + "grad_norm": 0.06167835917893234, + "language_loss": 0.94333142, + "learning_rate": 0.0009963138843953744, + "loss": 0.9555043, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 0.47558594, + "step": 351, + "time_per_iteration": 2.5784904956817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122803, + "balance_loss_mlp": 1.18005991, + "epoch": 0.06771835321277414, + "flos": 539668624896.0, + "grad_norm": 0.06188972934791396, + "language_loss": 0.98543227, + "learning_rate": 0.000996276028262306, + "loss": 0.99771261, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 0.47924805, + "step": 352, + "time_per_iteration": 2.7985076904296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216963, + "balance_loss_mlp": 1.16760993, + "epoch": 0.0679107348980377, + "flos": 460666828800.0, + "grad_norm": 0.0659402302829914, + "language_loss": 1.04801619, + "learning_rate": 0.0009962379794577964, + "loss": 1.06018579, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 0.49365234, + "step": 353, + "time_per_iteration": 2.608032703399658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123128, + "balance_loss_mlp": 1.18266606, + "epoch": 0.06810311658330127, + "flos": 635922026496.0, + "grad_norm": 0.051231802586423875, + "language_loss": 0.94352609, + "learning_rate": 0.000996199737996617, + "loss": 0.95583886, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 0.48657227, + "step": 354, + "time_per_iteration": 2.903005599975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227436, + "balance_loss_mlp": 1.17770219, + "epoch": 0.06829549826856483, + "flos": 464679452160.0, + "grad_norm": 0.05676190931504088, + "language_loss": 1.03759205, + "learning_rate": 0.0009961613038936149, + "loss": 1.04986644, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 0.49755859, + "step": 355, + "time_per_iteration": 2.617859125137329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216553, + "balance_loss_mlp": 1.16572189, + "epoch": 0.06848787995382839, + "flos": 634647135744.0, + "grad_norm": 0.04878484453506707, + "language_loss": 0.95482612, + "learning_rate": 0.000996122677163711, + "loss": 0.96699166, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 0.50878906, + "step": 356, + "time_per_iteration": 2.8171308040618896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230039, + "balance_loss_mlp": 1.18037653, + "epoch": 0.06868026163909195, + "flos": 806374268928.0, + "grad_norm": 0.06504242786199886, + "language_loss": 1.01527905, + "learning_rate": 0.000996083857821902, + "loss": 1.02757955, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 0.49682617, + "step": 357, + "time_per_iteration": 3.0562636852264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221322, + "balance_loss_mlp": 1.17237508, + "epoch": 0.06887264332435553, + "flos": 439227505152.0, + "grad_norm": 0.043415107047687695, + "language_loss": 0.99947309, + "learning_rate": 0.0009960448458832588, + "loss": 1.01168633, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 0.48925781, + "step": 358, + "time_per_iteration": 2.6778266429901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224961, + "balance_loss_mlp": 1.17675292, + "epoch": 0.06906502500961909, + "flos": 484767161856.0, + "grad_norm": 0.061398357107108094, + "language_loss": 0.99686754, + "learning_rate": 0.000996005641362927, + "loss": 1.00911713, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 0.48193359, + "step": 359, + "time_per_iteration": 2.5839953422546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218039, + "balance_loss_mlp": 1.16792321, + "epoch": 0.06925740669488265, + "flos": 733611105792.0, + "grad_norm": 0.045504813624839685, + "language_loss": 1.02907789, + "learning_rate": 0.0009959662442761274, + "loss": 1.04125834, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 0.5012207, + "step": 360, + "time_per_iteration": 2.9012227058410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225991, + "balance_loss_mlp": 1.17504108, + "epoch": 0.0694497883801462, + "flos": 552415707648.0, + "grad_norm": 0.05242893208235044, + "language_loss": 0.96392268, + "learning_rate": 0.000995926654638155, + "loss": 0.97618258, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 0.50976562, + "step": 361, + "time_per_iteration": 2.7972850799560547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120421, + "balance_loss_mlp": 1.15323579, + "epoch": 0.06964217006540978, + "flos": 678015111168.0, + "grad_norm": 0.0452718414118582, + "language_loss": 0.98678619, + "learning_rate": 0.00099588687246438, + "loss": 0.99882829, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 0.51025391, + "step": 362, + "time_per_iteration": 2.845742702484131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011953, + "balance_loss_mlp": 1.14241886, + "epoch": 0.06983455175067334, + "flos": 524241082368.0, + "grad_norm": 0.06654716127982052, + "language_loss": 1.06146324, + "learning_rate": 0.0009958468977702471, + "loss": 1.07341623, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 0.52978516, + "step": 363, + "time_per_iteration": 2.5876591205596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.05386722, + "balance_loss_mlp": 5.09527922, + "epoch": 0.0700269334359369, + "flos": 1576787254272.0, + "grad_norm": 0.35536528906135745, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.85121429, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 2.921875, + "step": 364, + "time_per_iteration": 4.7958595752716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183221, + "balance_loss_mlp": 1.12800324, + "epoch": 0.07021931512120046, + "flos": 1013248839168.0, + "grad_norm": 0.06493728064972926, + "language_loss": 0.94085538, + "learning_rate": 0.0009957663708830612, + "loss": 0.95268762, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 0.55273438, + "step": 365, + "time_per_iteration": 3.238919258117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188034, + "balance_loss_mlp": 1.13048029, + "epoch": 0.07041169680646403, + "flos": 822983099904.0, + "grad_norm": 0.06418297657416602, + "language_loss": 0.98210049, + "learning_rate": 0.0009957258187212714, + "loss": 0.99398077, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 0.57470703, + "step": 366, + "time_per_iteration": 3.0337131023406982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0292345, + "balance_loss_mlp": 2.78612089, + "epoch": 0.07060407849172759, + "flos": 1414392938496.0, + "grad_norm": 0.09868001986151984, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.82118309, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 1.375, + "step": 367, + "time_per_iteration": 4.825684070587158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118916, + "balance_loss_mlp": 1.12988925, + "epoch": 0.07079646017699115, + "flos": 512909853696.0, + "grad_norm": 0.06345017711900697, + "language_loss": 0.94456601, + "learning_rate": 0.0009956441370400167, + "loss": 0.95645761, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 0.59179688, + "step": 368, + "time_per_iteration": 2.6685595512390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203671, + "balance_loss_mlp": 1.14411354, + "epoch": 0.07098884186225471, + "flos": 540501548544.0, + "grad_norm": 0.07550644934377632, + "language_loss": 1.00098681, + "learning_rate": 0.0009956030075522636, + "loss": 1.0130235, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 0.59472656, + "step": 369, + "time_per_iteration": 2.7824065685272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185298, + "balance_loss_mlp": 1.12555027, + "epoch": 0.07118122354751828, + "flos": 548682439680.0, + "grad_norm": 0.0634963537383221, + "language_loss": 1.00245738, + "learning_rate": 0.0009955616856543587, + "loss": 1.01431036, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 0.59667969, + "step": 370, + "time_per_iteration": 2.6869115829467773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117739, + "balance_loss_mlp": 1.11649847, + "epoch": 0.07137360523278184, + "flos": 620904517632.0, + "grad_norm": 0.04749901473855408, + "language_loss": 0.92605507, + "learning_rate": 0.0009955201713623448, + "loss": 0.93782902, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 0.60791016, + "step": 371, + "time_per_iteration": 2.7894065380096436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03553003, + "balance_loss_mlp": 3.34700894, + "epoch": 0.0715659869180454, + "flos": 1502672477184.0, + "grad_norm": 0.1539254818196356, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.80225718, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 2.0625, + "step": 372, + "time_per_iteration": 5.025646924972534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188763, + "balance_loss_mlp": 1.12739396, + "epoch": 0.07175836860330896, + "flos": 495493065216.0, + "grad_norm": 0.05697389015463885, + "language_loss": 1.05361807, + "learning_rate": 0.0009954365656605333, + "loss": 1.06550562, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 0.61328125, + "step": 373, + "time_per_iteration": 2.5767741203308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203971, + "balance_loss_mlp": 1.13878703, + "epoch": 0.07195075028857253, + "flos": 785725650432.0, + "grad_norm": 0.0561234241567743, + "language_loss": 0.98981488, + "learning_rate": 0.0009953944742831947, + "loss": 1.00185454, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 0.65185547, + "step": 374, + "time_per_iteration": 3.0126912593841553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209318, + "balance_loss_mlp": 1.14351439, + "epoch": 0.0721431319738361, + "flos": 593107619328.0, + "grad_norm": 0.05197007853134015, + "language_loss": 1.02623391, + "learning_rate": 0.0009953521905766642, + "loss": 1.0383271, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 0.65820312, + "step": 375, + "time_per_iteration": 2.9678027629852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207965, + "balance_loss_mlp": 1.14464104, + "epoch": 0.07233551365909965, + "flos": 548250011136.0, + "grad_norm": 0.05250799377029981, + "language_loss": 1.01212132, + "learning_rate": 0.0009953097145573577, + "loss": 1.02420104, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 0.6328125, + "step": 376, + "time_per_iteration": 2.7048561573028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121329, + "balance_loss_mlp": 1.1502521, + "epoch": 0.07252789534436321, + "flos": 957568780800.0, + "grad_norm": 0.050651846587156886, + "language_loss": 0.98499894, + "learning_rate": 0.000995267046241766, + "loss": 0.99713182, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 0.62988281, + "step": 377, + "time_per_iteration": 3.287705421447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225924, + "balance_loss_mlp": 1.16341114, + "epoch": 0.07272027702962677, + "flos": 507649503744.0, + "grad_norm": 0.05776369312695448, + "language_loss": 0.98701203, + "learning_rate": 0.0009952241856464547, + "loss": 0.99927127, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 0.62451172, + "step": 378, + "time_per_iteration": 2.5897629261016846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220563, + "balance_loss_mlp": 1.16010034, + "epoch": 0.07291265871489035, + "flos": 612412337664.0, + "grad_norm": 0.05450855675542614, + "language_loss": 1.05642247, + "learning_rate": 0.0009951811327880632, + "loss": 1.06862807, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 0.60351562, + "step": 379, + "time_per_iteration": 2.7320594787597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220943, + "balance_loss_mlp": 1.15924072, + "epoch": 0.0731050404001539, + "flos": 495750025728.0, + "grad_norm": 0.04947645913164449, + "language_loss": 0.99005401, + "learning_rate": 0.0009951378876833063, + "loss": 1.00226343, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 0.61669922, + "step": 380, + "time_per_iteration": 2.595810651779175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196634, + "balance_loss_mlp": 1.13798296, + "epoch": 0.07329742208541747, + "flos": 639966956544.0, + "grad_norm": 0.058807068798268386, + "language_loss": 1.05567527, + "learning_rate": 0.0009950944503489736, + "loss": 1.06764162, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 0.5859375, + "step": 381, + "time_per_iteration": 2.733560562133789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197608, + "balance_loss_mlp": 1.13914812, + "epoch": 0.07348980377068103, + "flos": 816346543104.0, + "grad_norm": 0.06747680453051412, + "language_loss": 0.99337935, + "learning_rate": 0.0009950508208019285, + "loss": 1.00535548, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 0.58398438, + "step": 382, + "time_per_iteration": 2.9895970821380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176507, + "balance_loss_mlp": 1.12062192, + "epoch": 0.0736821854559446, + "flos": 508640269824.0, + "grad_norm": 0.05827239016363537, + "language_loss": 1.03707182, + "learning_rate": 0.0009950069990591096, + "loss": 1.04883695, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 0.55908203, + "step": 383, + "time_per_iteration": 2.6856980323791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.05393736, + "balance_loss_mlp": 5.19079447, + "epoch": 0.07387456714120816, + "flos": 1554648629760.0, + "grad_norm": 0.38241300139143997, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.81795102, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 2.03125, + "step": 384, + "time_per_iteration": 4.860661268234253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128436, + "balance_loss_mlp": 1.07369518, + "epoch": 0.07406694882647172, + "flos": 525503490048.0, + "grad_norm": 0.06005395599718801, + "language_loss": 0.96679938, + "learning_rate": 0.0009949187790542777, + "loss": 0.97808379, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 0.54760742, + "step": 385, + "time_per_iteration": 2.7245922088623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146737, + "balance_loss_mlp": 1.09042215, + "epoch": 0.07425933051173528, + "flos": 497738898432.0, + "grad_norm": 0.06780842756482337, + "language_loss": 0.9270733, + "learning_rate": 0.0009948743808265148, + "loss": 0.93854064, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 0.56298828, + "step": 386, + "time_per_iteration": 2.6745331287384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187036, + "balance_loss_mlp": 1.13334417, + "epoch": 0.07445171219699885, + "flos": 505003175424.0, + "grad_norm": 0.04295711334598506, + "language_loss": 1.02854586, + "learning_rate": 0.0009948297904714782, + "loss": 1.04041624, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 0.53759766, + "step": 387, + "time_per_iteration": 2.681718111038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202671, + "balance_loss_mlp": 1.15167296, + "epoch": 0.07464409388226241, + "flos": 553977294336.0, + "grad_norm": 0.05564614333293379, + "language_loss": 0.94366896, + "learning_rate": 0.0009947850080064796, + "loss": 0.95569569, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 0.51049805, + "step": 388, + "time_per_iteration": 2.788663148880005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216483, + "balance_loss_mlp": 1.16817975, + "epoch": 0.07483647556752597, + "flos": 776862710784.0, + "grad_norm": 0.07112384111458, + "language_loss": 0.99713415, + "learning_rate": 0.0009947400334489047, + "loss": 1.00929892, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 0.48291016, + "step": 389, + "time_per_iteration": 2.9905049800872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227498, + "balance_loss_mlp": 1.17926562, + "epoch": 0.07502885725278953, + "flos": 612540817920.0, + "grad_norm": 0.06900212518032732, + "language_loss": 0.91264081, + "learning_rate": 0.0009946948668162145, + "loss": 0.92491579, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 0.48168945, + "step": 390, + "time_per_iteration": 2.767531394958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012247, + "balance_loss_mlp": 1.17277205, + "epoch": 0.0752212389380531, + "flos": 688629786624.0, + "grad_norm": 0.052104168644034804, + "language_loss": 0.95126128, + "learning_rate": 0.0009946495081259441, + "loss": 0.96350825, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 0.52001953, + "step": 391, + "time_per_iteration": 2.816908597946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192311, + "balance_loss_mlp": 1.14057434, + "epoch": 0.07541362062331666, + "flos": 765699609600.0, + "grad_norm": 0.051504782312047234, + "language_loss": 0.99421549, + "learning_rate": 0.0009946039573957035, + "loss": 1.00613856, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 0.51782227, + "step": 392, + "time_per_iteration": 2.9265222549438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116666, + "balance_loss_mlp": 1.11478019, + "epoch": 0.07560600230858022, + "flos": 588749202432.0, + "grad_norm": 0.055053573084277836, + "language_loss": 0.95799196, + "learning_rate": 0.000994558214643177, + "loss": 0.96965855, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 0.51928711, + "step": 393, + "time_per_iteration": 2.766477584838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165121, + "balance_loss_mlp": 1.11352682, + "epoch": 0.07579838399384378, + "flos": 749834496000.0, + "grad_norm": 0.05925711706254076, + "language_loss": 0.97585773, + "learning_rate": 0.000994512279886123, + "loss": 0.98750889, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 0.51660156, + "step": 394, + "time_per_iteration": 3.0709142684936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143606, + "balance_loss_mlp": 1.09191656, + "epoch": 0.07599076567910736, + "flos": 523457717760.0, + "grad_norm": 0.04191079383555719, + "language_loss": 0.97239089, + "learning_rate": 0.0009944661531423758, + "loss": 0.98382699, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 0.51757812, + "step": 395, + "time_per_iteration": 2.7044599056243896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134219, + "balance_loss_mlp": 1.08338809, + "epoch": 0.07618314736437092, + "flos": 551086488576.0, + "grad_norm": 0.05545815376917658, + "language_loss": 0.96390671, + "learning_rate": 0.000994419834429843, + "loss": 0.97524893, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 0.50854492, + "step": 396, + "time_per_iteration": 2.6767609119415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135922, + "balance_loss_mlp": 1.08525789, + "epoch": 0.07637552904963447, + "flos": 698206708224.0, + "grad_norm": 0.05307630449121137, + "language_loss": 1.01208472, + "learning_rate": 0.0009943733237665069, + "loss": 1.02344394, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 0.50683594, + "step": 397, + "time_per_iteration": 2.819148302078247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124426, + "balance_loss_mlp": 1.07502615, + "epoch": 0.07656791073489803, + "flos": 579379682304.0, + "grad_norm": 0.049844903289807924, + "language_loss": 0.99488425, + "learning_rate": 0.0009943266211704248, + "loss": 1.00612843, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 0.49389648, + "step": 398, + "time_per_iteration": 2.9555482864379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125466, + "balance_loss_mlp": 1.07675719, + "epoch": 0.0767602924201616, + "flos": 417145780224.0, + "grad_norm": 0.05620775813161816, + "language_loss": 1.01430082, + "learning_rate": 0.000994279726659728, + "loss": 1.02555549, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 0.48706055, + "step": 399, + "time_per_iteration": 2.5138003826141357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127137, + "balance_loss_mlp": 1.07761765, + "epoch": 0.07695267410542517, + "flos": 482914109952.0, + "grad_norm": 0.05674792404596756, + "language_loss": 0.99883693, + "learning_rate": 0.0009942326402526231, + "loss": 1.01010823, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 0.49511719, + "step": 400, + "time_per_iteration": 2.5245604515075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127721, + "balance_loss_mlp": 1.07793891, + "epoch": 0.07714505579068873, + "flos": 530998778880.0, + "grad_norm": 0.036646942736225624, + "language_loss": 0.9767518, + "learning_rate": 0.0009941853619673902, + "loss": 0.98802906, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 0.49804688, + "step": 401, + "time_per_iteration": 2.644771099090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123434, + "balance_loss_mlp": 1.07451057, + "epoch": 0.07733743747595229, + "flos": 804995490816.0, + "grad_norm": 0.057554732491620374, + "language_loss": 1.01884329, + "learning_rate": 0.0009941378918223844, + "loss": 1.0300777, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 0.48876953, + "step": 402, + "time_per_iteration": 3.051617383956909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112416, + "balance_loss_mlp": 1.07618988, + "epoch": 0.07752981916121585, + "flos": 622476016128.0, + "grad_norm": 0.04510164642433069, + "language_loss": 0.94372368, + "learning_rate": 0.0009940902298360354, + "loss": 0.95496523, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 0.47924805, + "step": 403, + "time_per_iteration": 2.7302582263946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118279, + "balance_loss_mlp": 1.0687592, + "epoch": 0.07772220084647942, + "flos": 728276603904.0, + "grad_norm": 0.062376946911402976, + "language_loss": 1.04687834, + "learning_rate": 0.0009940423760268473, + "loss": 1.05806112, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 0.49536133, + "step": 404, + "time_per_iteration": 2.856938600540161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118682, + "balance_loss_mlp": 1.07009196, + "epoch": 0.07791458253174298, + "flos": 555412972032.0, + "grad_norm": 0.046838991637930295, + "language_loss": 0.97888398, + "learning_rate": 0.0009939943304133982, + "loss": 0.99007082, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 0.48608398, + "step": 405, + "time_per_iteration": 2.6161091327667236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115161, + "balance_loss_mlp": 1.06881261, + "epoch": 0.07810696421700654, + "flos": 553181819904.0, + "grad_norm": 0.04496148345425058, + "language_loss": 1.04081011, + "learning_rate": 0.0009939460930143416, + "loss": 1.0519619, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 0.46337891, + "step": 406, + "time_per_iteration": 2.6310677528381348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119218, + "balance_loss_mlp": 1.07332289, + "epoch": 0.0782993459022701, + "flos": 650633389056.0, + "grad_norm": 0.037201804651944344, + "language_loss": 0.98071587, + "learning_rate": 0.0009938976638484043, + "loss": 0.99190807, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 0.45874023, + "step": 407, + "time_per_iteration": 2.8977036476135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112887, + "balance_loss_mlp": 1.06844616, + "epoch": 0.07849172758753367, + "flos": 496172542464.0, + "grad_norm": 0.04629061554837057, + "language_loss": 0.97991359, + "learning_rate": 0.0009938490429343887, + "loss": 0.99104249, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 0.44458008, + "step": 408, + "time_per_iteration": 2.562168836593628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111855, + "balance_loss_mlp": 1.07315516, + "epoch": 0.07868410927279723, + "flos": 577971542016.0, + "grad_norm": 0.04004461216150975, + "language_loss": 0.97974342, + "learning_rate": 0.0009938002302911709, + "loss": 0.99092889, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 0.4543457, + "step": 409, + "time_per_iteration": 2.738518238067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123547, + "balance_loss_mlp": 1.07915401, + "epoch": 0.07887649095806079, + "flos": 522970960896.0, + "grad_norm": 0.07048914756312923, + "language_loss": 1.00401747, + "learning_rate": 0.0009937512259377015, + "loss": 1.01525307, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 0.44384766, + "step": 410, + "time_per_iteration": 2.670149564743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110678, + "balance_loss_mlp": 1.0668565, + "epoch": 0.07906887264332435, + "flos": 557253540864.0, + "grad_norm": 0.049646402233970426, + "language_loss": 0.99659574, + "learning_rate": 0.000993702029893006, + "loss": 1.00770259, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 0.4387207, + "step": 411, + "time_per_iteration": 2.7853777408599854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118473, + "balance_loss_mlp": 1.07200527, + "epoch": 0.07926125432858792, + "flos": 821984993280.0, + "grad_norm": 0.04880092350488667, + "language_loss": 0.98862529, + "learning_rate": 0.0009936526421761838, + "loss": 0.99981004, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 0.46435547, + "step": 412, + "time_per_iteration": 3.030674457550049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114951, + "balance_loss_mlp": 1.07043815, + "epoch": 0.07945363601385148, + "flos": 562336224768.0, + "grad_norm": 0.04383720282943398, + "language_loss": 1.01490402, + "learning_rate": 0.000993603062806409, + "loss": 1.02605367, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 0.4453125, + "step": 413, + "time_per_iteration": 2.7101500034332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109682, + "balance_loss_mlp": 1.0637151, + "epoch": 0.07964601769911504, + "flos": 517868826624.0, + "grad_norm": 0.046157231925668944, + "language_loss": 1.04664707, + "learning_rate": 0.0009935532918029298, + "loss": 1.05774391, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 0.45947266, + "step": 414, + "time_per_iteration": 2.593390941619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118947, + "balance_loss_mlp": 1.07278943, + "epoch": 0.0798383993843786, + "flos": 539224086528.0, + "grad_norm": 0.058468816323775735, + "language_loss": 0.97956645, + "learning_rate": 0.0009935033291850694, + "loss": 0.99075592, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 0.46166992, + "step": 415, + "time_per_iteration": 2.6693851947784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111466, + "balance_loss_mlp": 1.0654031, + "epoch": 0.08003078106964218, + "flos": 485145262080.0, + "grad_norm": 0.061030352209764355, + "language_loss": 1.00225627, + "learning_rate": 0.0009934531749722247, + "loss": 1.01337099, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 0.46044922, + "step": 416, + "time_per_iteration": 2.578746795654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119366, + "balance_loss_mlp": 1.07337523, + "epoch": 0.08022316275490574, + "flos": 518254267392.0, + "grad_norm": 0.05071064772829009, + "language_loss": 0.98778659, + "learning_rate": 0.0009934028291838672, + "loss": 0.99898028, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 0.45996094, + "step": 417, + "time_per_iteration": 2.7096333503723145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106202, + "balance_loss_mlp": 1.06166553, + "epoch": 0.0804155444401693, + "flos": 494012971008.0, + "grad_norm": 0.045680808340910005, + "language_loss": 0.94326293, + "learning_rate": 0.0009933522918395433, + "loss": 0.95432496, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 0.44555664, + "step": 418, + "time_per_iteration": 2.644414186477661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04959176, + "balance_loss_mlp": 4.71808767, + "epoch": 0.08060792612543285, + "flos": 1581422455296.0, + "grad_norm": 0.3214703434406663, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.83210278, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 2.40625, + "step": 419, + "time_per_iteration": 4.868964195251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115626, + "balance_loss_mlp": 1.07108891, + "epoch": 0.08080030781069643, + "flos": 525343076352.0, + "grad_norm": 0.08060687528614664, + "language_loss": 1.13036489, + "learning_rate": 0.000993250642561551, + "loss": 1.14152122, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 0.4453125, + "step": 420, + "time_per_iteration": 2.632162094116211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121548, + "balance_loss_mlp": 1.07538986, + "epoch": 0.08099268949595999, + "flos": 546718159872.0, + "grad_norm": 0.08633853635548816, + "language_loss": 0.9784801, + "learning_rate": 0.0009931995306673466, + "loss": 0.98969555, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 0.46118164, + "step": 421, + "time_per_iteration": 2.7046737670898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134412, + "balance_loss_mlp": 1.08815861, + "epoch": 0.08118507118122355, + "flos": 510367412736.0, + "grad_norm": 0.038770411105538145, + "language_loss": 1.03907061, + "learning_rate": 0.000993148227296103, + "loss": 1.05041468, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 0.4621582, + "step": 422, + "time_per_iteration": 2.669496536254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133253, + "balance_loss_mlp": 1.08707166, + "epoch": 0.08137745286648711, + "flos": 720671302656.0, + "grad_norm": 0.053095831055692516, + "language_loss": 0.9112367, + "learning_rate": 0.000993096732467738, + "loss": 0.92256927, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 0.46166992, + "step": 423, + "time_per_iteration": 2.961660861968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150855, + "balance_loss_mlp": 1.10498345, + "epoch": 0.08156983455175067, + "flos": 679613773824.0, + "grad_norm": 0.08137036582560589, + "language_loss": 0.99760056, + "learning_rate": 0.0009930450462022435, + "loss": 1.00910902, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 0.45874023, + "step": 424, + "time_per_iteration": 2.7952311038970947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03600409, + "balance_loss_mlp": 3.48901963, + "epoch": 0.08176221623701424, + "flos": 1453377157632.0, + "grad_norm": 0.18349806711668631, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.82790214, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 1.1171875, + "step": 425, + "time_per_iteration": 4.8854875564575195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159177, + "balance_loss_mlp": 1.11344862, + "epoch": 0.0819545979222778, + "flos": 1556602292736.0, + "grad_norm": 0.06491953183218531, + "language_loss": 0.9776966, + "learning_rate": 0.0009929410994402065, + "loss": 0.98928833, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 0.45703125, + "step": 426, + "time_per_iteration": 4.275091886520386 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169515, + "balance_loss_mlp": 1.12223697, + "epoch": 0.08214697960754136, + "flos": 512724473856.0, + "grad_norm": 0.07437504582125473, + "language_loss": 1.02033544, + "learning_rate": 0.0009928888389840196, + "loss": 1.03203058, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 0.47241211, + "step": 427, + "time_per_iteration": 2.7036454677581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145234, + "balance_loss_mlp": 1.09941018, + "epoch": 0.08233936129280492, + "flos": 595124029440.0, + "grad_norm": 0.05964472172349544, + "language_loss": 1.03706717, + "learning_rate": 0.0009928363871714147, + "loss": 1.04851961, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 0.45849609, + "step": 428, + "time_per_iteration": 2.6669116020202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115004, + "balance_loss_mlp": 1.10254741, + "epoch": 0.08253174297806849, + "flos": 572039055360.0, + "grad_norm": 0.07530468467255677, + "language_loss": 0.97491598, + "learning_rate": 0.0009927837440227556, + "loss": 0.98641634, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 0.47485352, + "step": 429, + "time_per_iteration": 2.8463807106018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120703, + "balance_loss_mlp": 1.07588065, + "epoch": 0.08272412466333205, + "flos": 623380147200.0, + "grad_norm": 0.04140843961960757, + "language_loss": 0.92054397, + "learning_rate": 0.0009927309095584798, + "loss": 0.93175101, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 0.44824219, + "step": 430, + "time_per_iteration": 2.9767606258392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116415, + "balance_loss_mlp": 1.07278419, + "epoch": 0.08291650634859561, + "flos": 513994595328.0, + "grad_norm": 0.04726827868993605, + "language_loss": 1.04780793, + "learning_rate": 0.0009926778837991, + "loss": 1.05897212, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 0.43652344, + "step": 431, + "time_per_iteration": 2.5883395671844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112082, + "balance_loss_mlp": 1.06749809, + "epoch": 0.08310888803385917, + "flos": 667365931008.0, + "grad_norm": 0.049074519776006666, + "language_loss": 1.0243988, + "learning_rate": 0.000992624666765202, + "loss": 1.0355196, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 0.44604492, + "step": 432, + "time_per_iteration": 2.7943906784057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115829, + "balance_loss_mlp": 1.07200766, + "epoch": 0.08330126971912274, + "flos": 583293560832.0, + "grad_norm": 0.04417562175093811, + "language_loss": 1.00109053, + "learning_rate": 0.000992571258477447, + "loss": 1.01224887, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 0.43823242, + "step": 433, + "time_per_iteration": 2.836127758026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116804, + "balance_loss_mlp": 1.07260084, + "epoch": 0.0834936514043863, + "flos": 561350227968.0, + "grad_norm": 0.04319706549365549, + "language_loss": 0.93695247, + "learning_rate": 0.0009925176589565695, + "loss": 0.94812053, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 0.44213867, + "step": 434, + "time_per_iteration": 2.8157734870910645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131219, + "balance_loss_mlp": 1.08756483, + "epoch": 0.08368603308964986, + "flos": 494519551488.0, + "grad_norm": 0.04172416189060796, + "language_loss": 1.04242814, + "learning_rate": 0.0009924638682233791, + "loss": 1.05374026, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 0.43652344, + "step": 435, + "time_per_iteration": 2.5577316284179688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02503783, + "balance_loss_mlp": 2.3527205, + "epoch": 0.08387841477491342, + "flos": 1389017714688.0, + "grad_norm": 0.06968128915635463, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.82068378, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 1.5078125, + "step": 436, + "time_per_iteration": 4.594938516616821 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129118, + "balance_loss_mlp": 1.08348453, + "epoch": 0.084070796460177, + "flos": 798984082944.0, + "grad_norm": 0.0610737753852808, + "language_loss": 0.94037408, + "learning_rate": 0.0009923557132036668, + "loss": 0.95166528, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 0.45629883, + "step": 437, + "time_per_iteration": 3.0716845989227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118795, + "balance_loss_mlp": 1.07430601, + "epoch": 0.08426317814544056, + "flos": 558963431424.0, + "grad_norm": 0.04662895628051273, + "language_loss": 0.97730738, + "learning_rate": 0.0009923013489591345, + "loss": 0.98849535, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 0.4453125, + "step": 438, + "time_per_iteration": 2.726792812347412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110576, + "balance_loss_mlp": 1.06685066, + "epoch": 0.08445555983070412, + "flos": 810421396992.0, + "grad_norm": 0.04626496214247174, + "language_loss": 0.96079296, + "learning_rate": 0.0009922467935862681, + "loss": 0.97189873, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 0.4375, + "step": 439, + "time_per_iteration": 3.0908052921295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119416, + "balance_loss_mlp": 1.07273376, + "epoch": 0.08464794151596768, + "flos": 510184604160.0, + "grad_norm": 0.048922855388473234, + "language_loss": 0.99432743, + "learning_rate": 0.0009921920471062478, + "loss": 1.00552154, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 0.46655273, + "step": 440, + "time_per_iteration": 2.622451066970825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117818, + "balance_loss_mlp": 1.07342434, + "epoch": 0.08484032320123125, + "flos": 556413649920.0, + "grad_norm": 0.07502031783190574, + "language_loss": 0.9797709, + "learning_rate": 0.0009921371095403281, + "loss": 0.99094903, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 0.44433594, + "step": 441, + "time_per_iteration": 2.705152750015259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011116, + "balance_loss_mlp": 1.06863689, + "epoch": 0.08503270488649481, + "flos": 527354343936.0, + "grad_norm": 0.04941418140969711, + "language_loss": 1.00754833, + "learning_rate": 0.0009920819809098379, + "loss": 1.01866436, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 0.42993164, + "step": 442, + "time_per_iteration": 2.5887317657470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119689, + "balance_loss_mlp": 1.07715499, + "epoch": 0.08522508657175837, + "flos": 614267960832.0, + "grad_norm": 0.06964486535702215, + "language_loss": 0.96275294, + "learning_rate": 0.0009920266612361798, + "loss": 0.97394979, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 0.42578125, + "step": 443, + "time_per_iteration": 2.745222330093384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107696, + "balance_loss_mlp": 1.06587708, + "epoch": 0.08541746825702193, + "flos": 619792611840.0, + "grad_norm": 0.05163049083883061, + "language_loss": 0.96866751, + "learning_rate": 0.0009919711505408308, + "loss": 0.97974443, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 0.41821289, + "step": 444, + "time_per_iteration": 2.780095100402832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106314, + "balance_loss_mlp": 1.0654248, + "epoch": 0.08560984994228549, + "flos": 482914109952.0, + "grad_norm": 0.054748359311131624, + "language_loss": 0.94535226, + "learning_rate": 0.000991915448845342, + "loss": 0.95641541, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 0.40893555, + "step": 445, + "time_per_iteration": 2.5229337215423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110283, + "balance_loss_mlp": 1.06279922, + "epoch": 0.08580223162754906, + "flos": 517152273408.0, + "grad_norm": 0.0575820537988498, + "language_loss": 1.03181779, + "learning_rate": 0.000991859556171339, + "loss": 1.04284596, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 0.40039062, + "step": 446, + "time_per_iteration": 2.5957653522491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105649, + "balance_loss_mlp": 1.06497526, + "epoch": 0.08599461331281262, + "flos": 531475623936.0, + "grad_norm": 0.04289742759235468, + "language_loss": 1.05262291, + "learning_rate": 0.000991803472540521, + "loss": 1.06367946, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 0.40673828, + "step": 447, + "time_per_iteration": 2.6220486164093018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105752, + "balance_loss_mlp": 1.06550729, + "epoch": 0.08618699499807618, + "flos": 790299182592.0, + "grad_norm": 0.04330621576945977, + "language_loss": 1.00096428, + "learning_rate": 0.0009917471979746615, + "loss": 1.01202178, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 0.40234375, + "step": 448, + "time_per_iteration": 2.9767467975616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114799, + "balance_loss_mlp": 1.07379115, + "epoch": 0.08637937668333974, + "flos": 565988000256.0, + "grad_norm": 0.03609686036920932, + "language_loss": 0.98485255, + "learning_rate": 0.0009916907324956086, + "loss": 0.99600053, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 0.41015625, + "step": 449, + "time_per_iteration": 2.701143980026245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117003, + "balance_loss_mlp": 1.07480288, + "epoch": 0.08657175836860331, + "flos": 445167332352.0, + "grad_norm": 0.04834207301210501, + "language_loss": 0.95441091, + "learning_rate": 0.0009916340761252837, + "loss": 0.965581, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 0.42211914, + "step": 450, + "time_per_iteration": 2.6036393642425537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129901, + "balance_loss_mlp": 1.08910751, + "epoch": 0.08676414005386687, + "flos": 844148210688.0, + "grad_norm": 0.07269963588094165, + "language_loss": 0.9243114, + "learning_rate": 0.0009915772288856832, + "loss": 0.93561041, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 0.40820312, + "step": 451, + "time_per_iteration": 3.05719256401062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125865, + "balance_loss_mlp": 1.08359361, + "epoch": 0.08695652173913043, + "flos": 603292437504.0, + "grad_norm": 0.05954656443346509, + "language_loss": 0.93746579, + "learning_rate": 0.000991520190798877, + "loss": 0.94872439, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 0.42285156, + "step": 452, + "time_per_iteration": 2.804128885269165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120027, + "balance_loss_mlp": 1.07723105, + "epoch": 0.08714890342439399, + "flos": 730737552384.0, + "grad_norm": 0.05604676795867647, + "language_loss": 1.04000187, + "learning_rate": 0.0009914629618870089, + "loss": 1.05120206, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 0.42797852, + "step": 453, + "time_per_iteration": 2.8959083557128906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02032313, + "balance_loss_mlp": 1.86675501, + "epoch": 0.08734128510965757, + "flos": 1482303214080.0, + "grad_norm": 0.06678910630402063, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.80708182, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 1.65625, + "step": 454, + "time_per_iteration": 4.753306865692139 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01974747, + "balance_loss_mlp": 1.80537415, + "epoch": 0.08753366679492113, + "flos": 1523022289920.0, + "grad_norm": 0.06350102966569023, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.83402705, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 1.6953125, + "step": 455, + "time_per_iteration": 4.909627914428711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100081, + "balance_loss_mlp": 1.05778539, + "epoch": 0.08772604848018468, + "flos": 721252035072.0, + "grad_norm": 0.07384563339861851, + "language_loss": 0.95938599, + "learning_rate": 0.0009912901304235883, + "loss": 0.97038674, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 0.42333984, + "step": 456, + "time_per_iteration": 3.0303096771240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093114, + "balance_loss_mlp": 1.05112898, + "epoch": 0.08791843016544824, + "flos": 708233310720.0, + "grad_norm": 0.061767025741825826, + "language_loss": 0.93898749, + "learning_rate": 0.000991232138434397, + "loss": 0.94991863, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 0.41992188, + "step": 457, + "time_per_iteration": 2.834221601486206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089137, + "balance_loss_mlp": 1.04824805, + "epoch": 0.08811081185071182, + "flos": 473043151872.0, + "grad_norm": 0.05183647995223567, + "language_loss": 1.00765896, + "learning_rate": 0.000991173955731976, + "loss": 1.0185504, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 0.40869141, + "step": 458, + "time_per_iteration": 2.628783702850342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098492, + "balance_loss_mlp": 1.05569601, + "epoch": 0.08830319353597538, + "flos": 684980209152.0, + "grad_norm": 0.052575936673692925, + "language_loss": 1.04489028, + "learning_rate": 0.0009911155823389137, + "loss": 1.0558753, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 0.42797852, + "step": 459, + "time_per_iteration": 2.964416742324829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106983, + "balance_loss_mlp": 1.06523609, + "epoch": 0.08849557522123894, + "flos": 573509237760.0, + "grad_norm": 0.05270293395412616, + "language_loss": 1.00385904, + "learning_rate": 0.000991057018277873, + "loss": 1.01492882, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 0.41748047, + "step": 460, + "time_per_iteration": 2.6944808959960938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104651, + "balance_loss_mlp": 1.06245136, + "epoch": 0.0886879569065025, + "flos": 564567376896.0, + "grad_norm": 0.04953210926048159, + "language_loss": 1.01399374, + "learning_rate": 0.0009909982635715898, + "loss": 1.02504039, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 0.42236328, + "step": 461, + "time_per_iteration": 2.6137924194335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096015, + "balance_loss_mlp": 1.05374336, + "epoch": 0.08888033859176607, + "flos": 563877987840.0, + "grad_norm": 0.050729417377465176, + "language_loss": 1.00123549, + "learning_rate": 0.0009909393182428751, + "loss": 1.01219559, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 0.42285156, + "step": 462, + "time_per_iteration": 2.6657960414886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109539, + "balance_loss_mlp": 1.06891286, + "epoch": 0.08907272027702963, + "flos": 465761622528.0, + "grad_norm": 0.043715633324142876, + "language_loss": 0.94138575, + "learning_rate": 0.000990880182314614, + "loss": 0.95248115, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 0.40625, + "step": 463, + "time_per_iteration": 2.733408212661743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101315, + "balance_loss_mlp": 1.06121325, + "epoch": 0.08926510196229319, + "flos": 681528494592.0, + "grad_norm": 0.051961844945365605, + "language_loss": 0.94176865, + "learning_rate": 0.0009908208558097643, + "loss": 0.9527818, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 0.40087891, + "step": 464, + "time_per_iteration": 2.9006474018096924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105621, + "balance_loss_mlp": 1.06508923, + "epoch": 0.08945748364755675, + "flos": 596692956672.0, + "grad_norm": 0.04470923680131565, + "language_loss": 0.9716863, + "learning_rate": 0.000990761338751359, + "loss": 0.98274255, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 0.40527344, + "step": 465, + "time_per_iteration": 2.775830030441284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01410893, + "balance_loss_mlp": 1.25296497, + "epoch": 0.08964986533282032, + "flos": 1585931747328.0, + "grad_norm": 0.0425617539044403, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.75070524, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 1.578125, + "step": 466, + "time_per_iteration": 5.023500919342041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100083, + "balance_loss_mlp": 1.05869305, + "epoch": 0.08984224701808388, + "flos": 533523967488.0, + "grad_norm": 0.04007163966797277, + "language_loss": 0.9983623, + "learning_rate": 0.0009906417330663815, + "loss": 1.00936306, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 0.41381836, + "step": 467, + "time_per_iteration": 2.6194305419921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099405, + "balance_loss_mlp": 1.05889773, + "epoch": 0.09003462870334744, + "flos": 478931222016.0, + "grad_norm": 0.03985353179312445, + "language_loss": 0.96447593, + "learning_rate": 0.0009905816444862442, + "loss": 0.97546995, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 0.4050293, + "step": 468, + "time_per_iteration": 2.623267889022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107145, + "balance_loss_mlp": 1.06568456, + "epoch": 0.090227010388611, + "flos": 653625510912.0, + "grad_norm": 0.038840192804800056, + "language_loss": 0.93513083, + "learning_rate": 0.0009905213654454216, + "loss": 0.94620228, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 0.41455078, + "step": 469, + "time_per_iteration": 2.9024641513824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105337, + "balance_loss_mlp": 1.06466317, + "epoch": 0.09041939207387456, + "flos": 618186608640.0, + "grad_norm": 0.04985478927164425, + "language_loss": 1.01848495, + "learning_rate": 0.0009904608959673158, + "loss": 1.02953827, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 0.40649414, + "step": 470, + "time_per_iteration": 2.7711682319641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097659, + "balance_loss_mlp": 1.0588448, + "epoch": 0.09061177375913813, + "flos": 454368724992.0, + "grad_norm": 0.04989175862356038, + "language_loss": 1.02851224, + "learning_rate": 0.000990400236075403, + "loss": 1.03948903, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 0.38793945, + "step": 471, + "time_per_iteration": 2.536189317703247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109095, + "balance_loss_mlp": 1.05113411, + "epoch": 0.0908041554444017, + "flos": 544247299584.0, + "grad_norm": 0.03738902964718639, + "language_loss": 0.98994756, + "learning_rate": 0.0009903393857932338, + "loss": 1.000857, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 0.39794922, + "step": 472, + "time_per_iteration": 2.6588857173919678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097802, + "balance_loss_mlp": 1.05908275, + "epoch": 0.09099653712966525, + "flos": 564335009280.0, + "grad_norm": 0.045733529486957185, + "language_loss": 0.97091877, + "learning_rate": 0.0009902783451444317, + "loss": 0.98189688, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 0.38720703, + "step": 473, + "time_per_iteration": 2.6981122493743896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091239, + "balance_loss_mlp": 1.05406976, + "epoch": 0.09118891881492881, + "flos": 474540498432.0, + "grad_norm": 0.04942472768420212, + "language_loss": 1.00819659, + "learning_rate": 0.0009902171141526956, + "loss": 1.01910901, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 0.37158203, + "step": 474, + "time_per_iteration": 2.527256727218628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099497, + "balance_loss_mlp": 1.06225586, + "epoch": 0.09138130050019239, + "flos": 545860643328.0, + "grad_norm": 0.04275448033987936, + "language_loss": 0.88210893, + "learning_rate": 0.000990155692841797, + "loss": 0.8931039, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 0.37231445, + "step": 475, + "time_per_iteration": 2.989063262939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097088, + "balance_loss_mlp": 1.06084871, + "epoch": 0.09157368218545595, + "flos": 732711744000.0, + "grad_norm": 0.04412440376655801, + "language_loss": 1.00229144, + "learning_rate": 0.0009900940812355818, + "loss": 1.01326227, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 0.36254883, + "step": 476, + "time_per_iteration": 2.8778445720672607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105736, + "balance_loss_mlp": 1.07011676, + "epoch": 0.0917660638707195, + "flos": 610981802496.0, + "grad_norm": 0.06417087981964828, + "language_loss": 0.97168529, + "learning_rate": 0.00099003227935797, + "loss": 0.98274267, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 0.35620117, + "step": 477, + "time_per_iteration": 2.708608627319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101416, + "balance_loss_mlp": 1.06369829, + "epoch": 0.09195844555598306, + "flos": 655851893760.0, + "grad_norm": 0.06707216335576115, + "language_loss": 1.01291215, + "learning_rate": 0.000989970287232955, + "loss": 1.02392626, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 0.37695312, + "step": 478, + "time_per_iteration": 2.783325672149658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090795, + "balance_loss_mlp": 1.05431736, + "epoch": 0.09215082724124664, + "flos": 476578930176.0, + "grad_norm": 0.05564878549890474, + "language_loss": 0.9726451, + "learning_rate": 0.0009899081048846043, + "loss": 0.98355305, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 0.36474609, + "step": 479, + "time_per_iteration": 2.6017916202545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097049, + "balance_loss_mlp": 1.05964088, + "epoch": 0.0923432089265102, + "flos": 524305322496.0, + "grad_norm": 0.06044394784495309, + "language_loss": 1.03484094, + "learning_rate": 0.0009898457323370593, + "loss": 1.04581141, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 0.37402344, + "step": 480, + "time_per_iteration": 2.575676918029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091173, + "balance_loss_mlp": 1.0533123, + "epoch": 0.09253559061177376, + "flos": 545569178112.0, + "grad_norm": 0.05778783373137127, + "language_loss": 0.99753714, + "learning_rate": 0.000989783169614535, + "loss": 1.00844884, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 0.37817383, + "step": 481, + "time_per_iteration": 2.646942615509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283887, + "balance_loss_mlp": 1.15876544, + "epoch": 0.09272797229703732, + "flos": 1538042370048.0, + "grad_norm": 0.01956789957612316, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.80036646, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 1.25, + "step": 482, + "time_per_iteration": 4.860741376876831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098828, + "balance_loss_mlp": 1.06158745, + "epoch": 0.09292035398230089, + "flos": 689813273088.0, + "grad_norm": 0.06801501049369231, + "language_loss": 0.97102278, + "learning_rate": 0.000989657473741779, + "loss": 0.98201108, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 0.37231445, + "step": 483, + "time_per_iteration": 2.819138526916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095911, + "balance_loss_mlp": 1.05979109, + "epoch": 0.09311273566756445, + "flos": 509749604352.0, + "grad_norm": 0.038333848574242754, + "language_loss": 0.98462784, + "learning_rate": 0.0009895943406403465, + "loss": 0.99558693, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 0.36132812, + "step": 484, + "time_per_iteration": 2.7088170051574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103881, + "balance_loss_mlp": 1.06854701, + "epoch": 0.09330511735282801, + "flos": 659404924416.0, + "grad_norm": 0.05828015098596693, + "language_loss": 0.92231822, + "learning_rate": 0.0009895310174615338, + "loss": 0.933357, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 0.35351562, + "step": 485, + "time_per_iteration": 2.760511636734009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265799, + "balance_loss_mlp": 1.14983261, + "epoch": 0.09349749903809157, + "flos": 1452845984256.0, + "grad_norm": 0.018538812380254305, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.76984316, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 1.15625, + "step": 486, + "time_per_iteration": 4.656491994857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105382, + "balance_loss_mlp": 1.0699296, + "epoch": 0.09368988072335514, + "flos": 520870860288.0, + "grad_norm": 0.04721263549483299, + "language_loss": 0.95839012, + "learning_rate": 0.0009894038009701782, + "loss": 0.96944392, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 0.35498047, + "step": 487, + "time_per_iteration": 2.6169629096984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103683, + "balance_loss_mlp": 1.06868315, + "epoch": 0.0938822624086187, + "flos": 497751381504.0, + "grad_norm": 0.05102581257360949, + "language_loss": 0.98848963, + "learning_rate": 0.0009893399077070253, + "loss": 0.99952644, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 0.3503418, + "step": 488, + "time_per_iteration": 2.5845744609832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107076, + "balance_loss_mlp": 1.07193291, + "epoch": 0.09407464409388226, + "flos": 533202766848.0, + "grad_norm": 0.05918319403016569, + "language_loss": 0.92944884, + "learning_rate": 0.0009892758244652718, + "loss": 0.94051951, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 0.35180664, + "step": 489, + "time_per_iteration": 2.660200357437134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091731, + "balance_loss_mlp": 1.05801892, + "epoch": 0.09426702577914582, + "flos": 586006700544.0, + "grad_norm": 0.041386989889926534, + "language_loss": 1.00010514, + "learning_rate": 0.0009892115512697968, + "loss": 1.01102245, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 0.33740234, + "step": 490, + "time_per_iteration": 2.6571907997131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108998, + "balance_loss_mlp": 1.05631554, + "epoch": 0.0944594074644094, + "flos": 503357524992.0, + "grad_norm": 0.04182034264497562, + "language_loss": 1.00108159, + "learning_rate": 0.0009891470881455537, + "loss": 1.01198137, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 0.33666992, + "step": 491, + "time_per_iteration": 2.746169328689575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108736, + "balance_loss_mlp": 1.05319476, + "epoch": 0.09465178914967295, + "flos": 571021125120.0, + "grad_norm": 0.0458284589248403, + "language_loss": 0.98654628, + "learning_rate": 0.0009890824351175692, + "loss": 0.99741989, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 0.34204102, + "step": 492, + "time_per_iteration": 2.665170431137085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090612, + "balance_loss_mlp": 1.05654192, + "epoch": 0.09484417083493651, + "flos": 549361916928.0, + "grad_norm": 0.041327442652051224, + "language_loss": 1.0219661, + "learning_rate": 0.0009890175922109435, + "loss": 1.0328722, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 0.34082031, + "step": 493, + "time_per_iteration": 2.6482973098754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010971, + "balance_loss_mlp": 1.06086028, + "epoch": 0.09503655252020007, + "flos": 823894944768.0, + "grad_norm": 0.06926989533772566, + "language_loss": 1.01090789, + "learning_rate": 0.0009889525594508513, + "loss": 1.02187896, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 0.36254883, + "step": 494, + "time_per_iteration": 3.0095505714416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092013, + "balance_loss_mlp": 1.05596447, + "epoch": 0.09522893420546363, + "flos": 404621153280.0, + "grad_norm": 0.04986765426945594, + "language_loss": 0.94310975, + "learning_rate": 0.0009888873368625404, + "loss": 0.95402986, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 0.3605957, + "step": 495, + "time_per_iteration": 2.5451042652130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089381, + "balance_loss_mlp": 1.05426204, + "epoch": 0.0954213158907272, + "flos": 691016583168.0, + "grad_norm": 0.05650320770937666, + "language_loss": 0.98877072, + "learning_rate": 0.0009888219244713326, + "loss": 0.99966443, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 0.3515625, + "step": 496, + "time_per_iteration": 2.8157310485839844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086342, + "balance_loss_mlp": 1.05100799, + "epoch": 0.09561369757599077, + "flos": 519005325312.0, + "grad_norm": 0.05039739829653265, + "language_loss": 0.99588835, + "learning_rate": 0.0009887563223026229, + "loss": 1.00675178, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 0.35375977, + "step": 497, + "time_per_iteration": 2.6563401222229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244906, + "balance_loss_mlp": 1.14648652, + "epoch": 0.09580607926125433, + "flos": 1385614812672.0, + "grad_norm": 0.01649790273231252, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.80313075, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 0.98046875, + "step": 498, + "time_per_iteration": 4.8689799308776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098776, + "balance_loss_mlp": 1.0630604, + "epoch": 0.09599846094651789, + "flos": 717436901376.0, + "grad_norm": 0.06260101269903841, + "language_loss": 0.97272921, + "learning_rate": 0.0009886245487346482, + "loss": 0.98371696, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 0.35742188, + "step": 499, + "time_per_iteration": 3.0292818546295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117577, + "balance_loss_mlp": 1.08159947, + "epoch": 0.09619084263178146, + "flos": 386038130688.0, + "grad_norm": 0.055723050712230264, + "language_loss": 1.00704551, + "learning_rate": 0.0009885583773865422, + "loss": 1.01822114, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 0.35986328, + "step": 500, + "time_per_iteration": 2.395846366882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117756, + "balance_loss_mlp": 1.08137345, + "epoch": 0.09638322431704502, + "flos": 534129292800.0, + "grad_norm": 0.06268683986847115, + "language_loss": 0.9714855, + "learning_rate": 0.0009884920163632524, + "loss": 0.98266304, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 0.36352539, + "step": 501, + "time_per_iteration": 2.666341781616211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111747, + "balance_loss_mlp": 1.07638931, + "epoch": 0.09657560600230858, + "flos": 500671922688.0, + "grad_norm": 0.04553274405873497, + "language_loss": 1.01245189, + "learning_rate": 0.000988425465690543, + "loss": 1.02356935, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 0.35375977, + "step": 502, + "time_per_iteration": 2.55082106590271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103127, + "balance_loss_mlp": 1.06867552, + "epoch": 0.09676798768757214, + "flos": 529261724160.0, + "grad_norm": 0.04373339165225573, + "language_loss": 0.99427342, + "learning_rate": 0.0009883587253942505, + "loss": 1.00530469, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 0.34472656, + "step": 503, + "time_per_iteration": 2.7674455642700195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108284, + "balance_loss_mlp": 1.07378531, + "epoch": 0.09696036937283571, + "flos": 463614534144.0, + "grad_norm": 0.051161986083573203, + "language_loss": 1.04393589, + "learning_rate": 0.0009882917955002862, + "loss": 1.05501866, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 0.3449707, + "step": 504, + "time_per_iteration": 2.549203872680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105617, + "balance_loss_mlp": 1.07116556, + "epoch": 0.09715275105809927, + "flos": 534974326272.0, + "grad_norm": 0.04840022534917253, + "language_loss": 0.95342839, + "learning_rate": 0.0009882246760346343, + "loss": 0.96448457, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 0.3449707, + "step": 505, + "time_per_iteration": 2.653627872467041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115925, + "balance_loss_mlp": 1.08128262, + "epoch": 0.09734513274336283, + "flos": 454946886144.0, + "grad_norm": 0.08271599518488834, + "language_loss": 1.02799106, + "learning_rate": 0.0009881573670233533, + "loss": 1.03915036, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 0.34692383, + "step": 506, + "time_per_iteration": 2.5279319286346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104761, + "balance_loss_mlp": 1.07061946, + "epoch": 0.09753751442862639, + "flos": 508805826048.0, + "grad_norm": 0.05291653517072512, + "language_loss": 0.96169406, + "learning_rate": 0.0009880898684925747, + "loss": 0.97274166, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 0.34179688, + "step": 507, + "time_per_iteration": 2.648574113845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095827, + "balance_loss_mlp": 1.06039834, + "epoch": 0.09772989611388996, + "flos": 484273064448.0, + "grad_norm": 0.053809005456099755, + "language_loss": 0.94680405, + "learning_rate": 0.0009880221804685037, + "loss": 0.95776224, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 0.35424805, + "step": 508, + "time_per_iteration": 2.529299736022949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245061, + "balance_loss_mlp": 1.15503371, + "epoch": 0.09792227779915352, + "flos": 1566106140672.0, + "grad_norm": 0.024665830319341657, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.80589479, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 0.8984375, + "step": 509, + "time_per_iteration": 4.705655574798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094606, + "balance_loss_mlp": 1.05932045, + "epoch": 0.09811465948441708, + "flos": 587805424128.0, + "grad_norm": 0.06644626598388864, + "language_loss": 1.02131915, + "learning_rate": 0.0009878862360456733, + "loss": 1.03226519, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 0.35327148, + "step": 510, + "time_per_iteration": 2.682035446166992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097961, + "balance_loss_mlp": 1.06336641, + "epoch": 0.09830704116968064, + "flos": 613000410624.0, + "grad_norm": 0.06543943311749917, + "language_loss": 0.9266718, + "learning_rate": 0.0009878179796996922, + "loss": 0.9376514, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 0.34619141, + "step": 511, + "time_per_iteration": 2.6972057819366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105256, + "balance_loss_mlp": 1.07030368, + "epoch": 0.09849942285494422, + "flos": 538808910336.0, + "grad_norm": 0.054213046356477584, + "language_loss": 0.96428764, + "learning_rate": 0.0009877495339659754, + "loss": 0.97534013, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 0.34985352, + "step": 512, + "time_per_iteration": 2.746337413787842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105714, + "balance_loss_mlp": 1.07190621, + "epoch": 0.09869180454020778, + "flos": 620474660352.0, + "grad_norm": 0.0573170093193853, + "language_loss": 0.91841626, + "learning_rate": 0.000987680898871096, + "loss": 0.9294734, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 0.33837891, + "step": 513, + "time_per_iteration": 2.7060482501983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110186, + "balance_loss_mlp": 1.07675993, + "epoch": 0.09888418622547133, + "flos": 811711342080.0, + "grad_norm": 0.0786420176645203, + "language_loss": 0.95400196, + "learning_rate": 0.0009876120744417, + "loss": 0.96510386, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 0.33447266, + "step": 514, + "time_per_iteration": 2.9473536014556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105767, + "balance_loss_mlp": 1.07071972, + "epoch": 0.0990765679107349, + "flos": 535809447936.0, + "grad_norm": 0.04861145683213968, + "language_loss": 1.01586378, + "learning_rate": 0.0009875430607045078, + "loss": 1.02692139, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 0.35058594, + "step": 515, + "time_per_iteration": 2.6745734214782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095325, + "balance_loss_mlp": 1.06044412, + "epoch": 0.09926894959599845, + "flos": 587879576064.0, + "grad_norm": 0.061184004848699555, + "language_loss": 0.96467805, + "learning_rate": 0.000987473857686313, + "loss": 0.97563124, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 0.34887695, + "step": 516, + "time_per_iteration": 2.70771861076355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103454, + "balance_loss_mlp": 1.06909752, + "epoch": 0.09946133128126203, + "flos": 641234506752.0, + "grad_norm": 0.06268031252544905, + "language_loss": 1.01795554, + "learning_rate": 0.0009874044654139824, + "loss": 1.02899015, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 0.34399414, + "step": 517, + "time_per_iteration": 2.7501027584075928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104488, + "balance_loss_mlp": 1.07020378, + "epoch": 0.09965371296652559, + "flos": 465781446144.0, + "grad_norm": 0.05802057466070587, + "language_loss": 1.01047516, + "learning_rate": 0.0009873348839144563, + "loss": 1.02152014, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 0.34301758, + "step": 518, + "time_per_iteration": 2.5247762203216553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125408, + "balance_loss_mlp": 1.09100425, + "epoch": 0.09984609465178915, + "flos": 483603499008.0, + "grad_norm": 0.057276560313135924, + "language_loss": 1.0153054, + "learning_rate": 0.000987265113214749, + "loss": 1.02655947, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 0.34448242, + "step": 519, + "time_per_iteration": 2.569776773452759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151248, + "balance_loss_mlp": 1.11705852, + "epoch": 0.1000384763370527, + "flos": 569029681152.0, + "grad_norm": 0.06886779278024428, + "language_loss": 1.05486548, + "learning_rate": 0.0009871951533419476, + "loss": 1.066378, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 0.34204102, + "step": 520, + "time_per_iteration": 2.646489381790161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155904, + "balance_loss_mlp": 1.12085652, + "epoch": 0.10023085802231628, + "flos": 545796403200.0, + "grad_norm": 0.06947260655531057, + "language_loss": 0.93715644, + "learning_rate": 0.0009871250043232132, + "loss": 0.94871557, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 0.35058594, + "step": 521, + "time_per_iteration": 2.729825258255005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145676, + "balance_loss_mlp": 1.11196363, + "epoch": 0.10042323970757984, + "flos": 503454071808.0, + "grad_norm": 0.05700460680955029, + "language_loss": 0.94319808, + "learning_rate": 0.0009870546661857797, + "loss": 0.95465487, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 0.33740234, + "step": 522, + "time_per_iteration": 2.589205026626587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113939, + "balance_loss_mlp": 1.10572577, + "epoch": 0.1006156213928434, + "flos": 770411533824.0, + "grad_norm": 0.0627280587118585, + "language_loss": 1.04607201, + "learning_rate": 0.0009869841389569553, + "loss": 1.05746591, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 0.33666992, + "step": 523, + "time_per_iteration": 3.007927656173706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112546, + "balance_loss_mlp": 1.07816648, + "epoch": 0.10080800307810696, + "flos": 490030083072.0, + "grad_norm": 0.07025860249961899, + "language_loss": 0.94709289, + "learning_rate": 0.0009869134226641206, + "loss": 0.95821834, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 0.34399414, + "step": 524, + "time_per_iteration": 2.5647661685943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096367, + "balance_loss_mlp": 1.06134343, + "epoch": 0.10100038476337053, + "flos": 454724430336.0, + "grad_norm": 0.0754869647085307, + "language_loss": 0.96719551, + "learning_rate": 0.0009868425173347303, + "loss": 0.97815919, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 0.3503418, + "step": 525, + "time_per_iteration": 2.675762414932251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081637, + "balance_loss_mlp": 1.04816294, + "epoch": 0.10119276644863409, + "flos": 556438242816.0, + "grad_norm": 0.04461045481777941, + "language_loss": 1.01427031, + "learning_rate": 0.0009867714229963125, + "loss": 1.02508664, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 0.3347168, + "step": 526, + "time_per_iteration": 2.7551424503326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101413, + "balance_loss_mlp": 1.06672287, + "epoch": 0.10138514813389765, + "flos": 516235659264.0, + "grad_norm": 0.06519670287778681, + "language_loss": 0.99495387, + "learning_rate": 0.000986700139676468, + "loss": 1.00596797, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 0.34716797, + "step": 527, + "time_per_iteration": 2.5689845085144043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111806, + "balance_loss_mlp": 1.08317983, + "epoch": 0.10157752981916121, + "flos": 500570606592.0, + "grad_norm": 0.055001529425537175, + "language_loss": 0.97175169, + "learning_rate": 0.0009866286674028717, + "loss": 0.98293233, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 0.34936523, + "step": 528, + "time_per_iteration": 2.6308236122131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118149, + "balance_loss_mlp": 1.08307743, + "epoch": 0.10176991150442478, + "flos": 656773277184.0, + "grad_norm": 0.06791274268555884, + "language_loss": 0.93964088, + "learning_rate": 0.0009865570062032717, + "loss": 0.95082229, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 0.35083008, + "step": 529, + "time_per_iteration": 2.931939125061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117806, + "balance_loss_mlp": 1.08104193, + "epoch": 0.10196229318968834, + "flos": 573259617792.0, + "grad_norm": 0.05469252484924326, + "language_loss": 0.97321147, + "learning_rate": 0.0009864851561054893, + "loss": 0.98438954, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 0.36743164, + "step": 530, + "time_per_iteration": 2.75875186920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091703, + "balance_loss_mlp": 1.0567745, + "epoch": 0.1021546748749519, + "flos": 518207279616.0, + "grad_norm": 0.053032092698093954, + "language_loss": 0.97237867, + "learning_rate": 0.0009864131171374191, + "loss": 0.9832958, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 0.34936523, + "step": 531, + "time_per_iteration": 2.671963930130005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109157, + "balance_loss_mlp": 1.05704737, + "epoch": 0.10234705656021546, + "flos": 609766009344.0, + "grad_norm": 0.037042660663456926, + "language_loss": 0.97530323, + "learning_rate": 0.0009863408893270292, + "loss": 0.98621887, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 0.34521484, + "step": 532, + "time_per_iteration": 2.8692965507507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080567, + "balance_loss_mlp": 1.0459249, + "epoch": 0.10253943824547904, + "flos": 601760586240.0, + "grad_norm": 0.045189468397627275, + "language_loss": 0.93818736, + "learning_rate": 0.0009862684727023605, + "loss": 0.94899297, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 0.34692383, + "step": 533, + "time_per_iteration": 2.768873691558838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084569, + "balance_loss_mlp": 1.04978406, + "epoch": 0.1027318199307426, + "flos": 662948043264.0, + "grad_norm": 0.041807858593286534, + "language_loss": 0.94846106, + "learning_rate": 0.0009861958672915283, + "loss": 0.95930672, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 0.34814453, + "step": 534, + "time_per_iteration": 2.7894833087921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088029, + "balance_loss_mlp": 1.05348206, + "epoch": 0.10292420161600616, + "flos": 683275461120.0, + "grad_norm": 0.04113334704287127, + "language_loss": 0.93477535, + "learning_rate": 0.0009861230731227201, + "loss": 0.94565558, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 0.34570312, + "step": 535, + "time_per_iteration": 2.8369100093841553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101084, + "balance_loss_mlp": 1.06589389, + "epoch": 0.10311658330126972, + "flos": 490287043584.0, + "grad_norm": 0.06472741174466715, + "language_loss": 0.9716177, + "learning_rate": 0.0009860500902241973, + "loss": 0.98262858, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 0.35205078, + "step": 536, + "time_per_iteration": 2.6308608055114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100072, + "balance_loss_mlp": 1.06559658, + "epoch": 0.10330896498653329, + "flos": 431743343616.0, + "grad_norm": 0.06015330648509861, + "language_loss": 1.02488375, + "learning_rate": 0.0009859769186242942, + "loss": 1.0358845, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 0.34521484, + "step": 537, + "time_per_iteration": 2.4846572875976562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094076, + "balance_loss_mlp": 1.06188989, + "epoch": 0.10350134667179685, + "flos": 549591713280.0, + "grad_norm": 0.04182272700248836, + "language_loss": 0.96166039, + "learning_rate": 0.0009859035583514187, + "loss": 0.97260106, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 0.32177734, + "step": 538, + "time_per_iteration": 2.665483236312866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107546, + "balance_loss_mlp": 1.07497787, + "epoch": 0.10369372835706041, + "flos": 640626610176.0, + "grad_norm": 0.03728554890083732, + "language_loss": 0.9932602, + "learning_rate": 0.0009858300094340517, + "loss": 1.00433564, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 0.32568359, + "step": 539, + "time_per_iteration": 2.772207021713257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110889, + "balance_loss_mlp": 1.07908368, + "epoch": 0.10388611004232397, + "flos": 521752969728.0, + "grad_norm": 0.05284254114338104, + "language_loss": 0.91679931, + "learning_rate": 0.0009857562719007473, + "loss": 0.92790818, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 0.31787109, + "step": 540, + "time_per_iteration": 2.633002519607544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110116, + "balance_loss_mlp": 1.06964111, + "epoch": 0.10407849172758753, + "flos": 702436644864.0, + "grad_norm": 0.07454941449424961, + "language_loss": 0.93962657, + "learning_rate": 0.0009856823457801331, + "loss": 0.95063812, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 0.31494141, + "step": 541, + "time_per_iteration": 2.888354539871216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098965, + "balance_loss_mlp": 1.06682634, + "epoch": 0.1042708734128511, + "flos": 502910415360.0, + "grad_norm": 0.06016078646373104, + "language_loss": 1.01014686, + "learning_rate": 0.00098560823110091, + "loss": 1.02113652, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 0.32128906, + "step": 542, + "time_per_iteration": 2.612365484237671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088807, + "balance_loss_mlp": 1.05664408, + "epoch": 0.10446325509811466, + "flos": 485592371712.0, + "grad_norm": 0.07331709746631812, + "language_loss": 0.99634022, + "learning_rate": 0.000985533927891851, + "loss": 1.00722837, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 0.3215332, + "step": 543, + "time_per_iteration": 2.6642584800720215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087349, + "balance_loss_mlp": 1.05406535, + "epoch": 0.10465563678337822, + "flos": 568634328576.0, + "grad_norm": 0.07406485241554656, + "language_loss": 0.99318308, + "learning_rate": 0.0009854594361818044, + "loss": 1.00405657, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 0.33300781, + "step": 544, + "time_per_iteration": 2.650541067123413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087044, + "balance_loss_mlp": 1.05357027, + "epoch": 0.10484801846864178, + "flos": 626093286912.0, + "grad_norm": 0.05515562757052397, + "language_loss": 0.98072803, + "learning_rate": 0.0009853847559996897, + "loss": 0.99159849, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 0.3347168, + "step": 545, + "time_per_iteration": 2.7268693447113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098973, + "balance_loss_mlp": 1.0640682, + "epoch": 0.10504040015390535, + "flos": 743412681216.0, + "grad_norm": 0.05014767442192859, + "language_loss": 0.9781934, + "learning_rate": 0.0009853098873745, + "loss": 0.98918307, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 0.34936523, + "step": 546, + "time_per_iteration": 3.001844644546509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094885, + "balance_loss_mlp": 1.06010008, + "epoch": 0.10523278183916891, + "flos": 586673694720.0, + "grad_norm": 0.06665960072991474, + "language_loss": 0.96499509, + "learning_rate": 0.0009852348303353027, + "loss": 0.97594392, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 0.34814453, + "step": 547, + "time_per_iteration": 2.7768120765686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109085, + "balance_loss_mlp": 1.05692363, + "epoch": 0.10542516352443247, + "flos": 869644574208.0, + "grad_norm": 0.04477171592325676, + "language_loss": 0.89746928, + "learning_rate": 0.000985159584911237, + "loss": 0.90837783, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 0.33959961, + "step": 548, + "time_per_iteration": 3.1397063732147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109064, + "balance_loss_mlp": 1.0567131, + "epoch": 0.10561754520969603, + "flos": 505428263424.0, + "grad_norm": 0.057455808878804256, + "language_loss": 0.97617745, + "learning_rate": 0.0009850841511315162, + "loss": 0.98708391, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 0.33959961, + "step": 549, + "time_per_iteration": 2.6143858432769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090343, + "balance_loss_mlp": 1.05660701, + "epoch": 0.1058099268949596, + "flos": 559981361664.0, + "grad_norm": 0.04134640300819554, + "language_loss": 0.97230792, + "learning_rate": 0.0009850085290254256, + "loss": 0.98321134, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 0.33740234, + "step": 550, + "time_per_iteration": 2.784057855606079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108804, + "balance_loss_mlp": 1.05478084, + "epoch": 0.10600230858022316, + "flos": 562049528832.0, + "grad_norm": 0.041486348142279396, + "language_loss": 0.9340632, + "learning_rate": 0.0009849327186223246, + "loss": 0.94494367, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 0.33276367, + "step": 551, + "time_per_iteration": 2.822755813598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086641, + "balance_loss_mlp": 1.0536921, + "epoch": 0.10619469026548672, + "flos": 494326831104.0, + "grad_norm": 0.044652358506572586, + "language_loss": 1.00453854, + "learning_rate": 0.000984856719951646, + "loss": 1.01540482, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 0.32958984, + "step": 552, + "time_per_iteration": 2.561384439468384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088507, + "balance_loss_mlp": 1.05577254, + "epoch": 0.10638707195075028, + "flos": 676166828544.0, + "grad_norm": 0.05595352831954139, + "language_loss": 0.98322356, + "learning_rate": 0.0009847805330428943, + "loss": 0.99410868, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 0.32739258, + "step": 553, + "time_per_iteration": 2.8988356590270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082477, + "balance_loss_mlp": 1.04940784, + "epoch": 0.10657945363601386, + "flos": 488055891456.0, + "grad_norm": 0.05618387686115577, + "language_loss": 1.02895415, + "learning_rate": 0.0009847041579256481, + "loss": 1.03977895, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 0.33081055, + "step": 554, + "time_per_iteration": 2.567885398864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088152, + "balance_loss_mlp": 1.05548859, + "epoch": 0.10677183532127742, + "flos": 482958526464.0, + "grad_norm": 0.04459262579832553, + "language_loss": 0.99802542, + "learning_rate": 0.0009846275946295592, + "loss": 1.00890684, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 0.32641602, + "step": 555, + "time_per_iteration": 2.6283833980560303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108533, + "balance_loss_mlp": 1.05347764, + "epoch": 0.10696421700654098, + "flos": 656249444352.0, + "grad_norm": 0.04108965909817336, + "language_loss": 0.92502242, + "learning_rate": 0.0009845508431843518, + "loss": 0.93587577, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 0.31835938, + "step": 556, + "time_per_iteration": 3.0189473628997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087957, + "balance_loss_mlp": 1.05612838, + "epoch": 0.10715659869180454, + "flos": 567744878592.0, + "grad_norm": 0.05029379164990677, + "language_loss": 0.95060432, + "learning_rate": 0.0009844739036198233, + "loss": 0.96148396, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 0.31811523, + "step": 557, + "time_per_iteration": 2.6461007595062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096142, + "balance_loss_mlp": 1.06340766, + "epoch": 0.10734898037706811, + "flos": 540694268928.0, + "grad_norm": 0.047100661757994676, + "language_loss": 1.0152961, + "learning_rate": 0.0009843967759658448, + "loss": 1.02625763, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 0.32739258, + "step": 558, + "time_per_iteration": 2.6677682399749756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264894, + "balance_loss_mlp": 1.19775486, + "epoch": 0.10754136206233167, + "flos": 1476640171008.0, + "grad_norm": 0.03689581784010691, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.74032652, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 0.671875, + "step": 559, + "time_per_iteration": 4.873044013977051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105652, + "balance_loss_mlp": 1.07234466, + "epoch": 0.10773374374759523, + "flos": 512405844480.0, + "grad_norm": 0.06480790167761245, + "language_loss": 1.01098323, + "learning_rate": 0.000984241956509384, + "loss": 1.02203977, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 0.33325195, + "step": 560, + "time_per_iteration": 2.655430555343628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095265, + "balance_loss_mlp": 1.0617907, + "epoch": 0.10792612543285879, + "flos": 496503654912.0, + "grad_norm": 0.05361377514900226, + "language_loss": 1.00074768, + "learning_rate": 0.0009841642647670078, + "loss": 1.01170027, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 0.33496094, + "step": 561, + "time_per_iteration": 2.5627329349517822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089424, + "balance_loss_mlp": 1.05633116, + "epoch": 0.10811850711812235, + "flos": 735471498240.0, + "grad_norm": 0.04993888185520414, + "language_loss": 0.93071151, + "learning_rate": 0.0009840863850553944, + "loss": 0.94160575, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 0.33105469, + "step": 562, + "time_per_iteration": 3.0020592212677 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108807, + "balance_loss_mlp": 1.05686092, + "epoch": 0.10831088880338592, + "flos": 611540140032.0, + "grad_norm": 0.046287089248472475, + "language_loss": 0.97956204, + "learning_rate": 0.0009840083174047782, + "loss": 0.99044275, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 0.31176758, + "step": 563, + "time_per_iteration": 2.7123258113861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093988, + "balance_loss_mlp": 1.06275535, + "epoch": 0.10850327048864948, + "flos": 556317103104.0, + "grad_norm": 0.036863902598139514, + "language_loss": 0.91394317, + "learning_rate": 0.0009839300618454685, + "loss": 0.92488301, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 0.31176758, + "step": 564, + "time_per_iteration": 2.855482578277588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086433, + "balance_loss_mlp": 1.05386496, + "epoch": 0.10869565217391304, + "flos": 603208373760.0, + "grad_norm": 0.0447892393855046, + "language_loss": 0.97269231, + "learning_rate": 0.0009838516184078466, + "loss": 0.98355657, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 0.32568359, + "step": 565, + "time_per_iteration": 2.8027093410491943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090742, + "balance_loss_mlp": 1.05881739, + "epoch": 0.1088880338591766, + "flos": 526178198016.0, + "grad_norm": 0.039430635834492286, + "language_loss": 0.95326865, + "learning_rate": 0.0009837729871223669, + "loss": 0.964176, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 0.3190918, + "step": 566, + "time_per_iteration": 2.621044158935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097443, + "balance_loss_mlp": 1.06473231, + "epoch": 0.10908041554444017, + "flos": 620272028160.0, + "grad_norm": 0.03524126234366562, + "language_loss": 0.96988255, + "learning_rate": 0.0009836941680195568, + "loss": 0.98085701, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 0.32714844, + "step": 567, + "time_per_iteration": 2.8241846561431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095182, + "balance_loss_mlp": 1.06359148, + "epoch": 0.10927279722970373, + "flos": 898125719040.0, + "grad_norm": 0.05940738915226433, + "language_loss": 0.94011569, + "learning_rate": 0.0009836151611300166, + "loss": 0.95106757, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 0.31567383, + "step": 568, + "time_per_iteration": 3.2259325981140137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093362, + "balance_loss_mlp": 1.06327355, + "epoch": 0.10946517891496729, + "flos": 528666310656.0, + "grad_norm": 0.04952949609465528, + "language_loss": 1.01886261, + "learning_rate": 0.0009835359664844194, + "loss": 1.02979624, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 0.30029297, + "step": 569, + "time_per_iteration": 2.61936616897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235986, + "balance_loss_mlp": 1.17113578, + "epoch": 0.10965756060023085, + "flos": 1560751815168.0, + "grad_norm": 0.02580255803672051, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.82272792, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 0.6484375, + "step": 570, + "time_per_iteration": 4.946800470352173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102328, + "balance_loss_mlp": 1.06947398, + "epoch": 0.10984994228549443, + "flos": 513075409920.0, + "grad_norm": 0.04088785760268294, + "language_loss": 0.98121774, + "learning_rate": 0.0009833770140481118, + "loss": 0.99224108, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 0.32861328, + "step": 571, + "time_per_iteration": 2.6676580905914307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103589, + "balance_loss_mlp": 1.07113993, + "epoch": 0.11004232397075799, + "flos": 954705139200.0, + "grad_norm": 0.04146527084622454, + "language_loss": 0.88084227, + "learning_rate": 0.000983297256319112, + "loss": 0.89187813, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 0.32446289, + "step": 572, + "time_per_iteration": 3.1977450847625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098726, + "balance_loss_mlp": 1.06503749, + "epoch": 0.11023470565602154, + "flos": 488181800448.0, + "grad_norm": 0.11112801331440751, + "language_loss": 0.93675387, + "learning_rate": 0.000983217310957477, + "loss": 0.94774115, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 0.33691406, + "step": 573, + "time_per_iteration": 2.771477222442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118998, + "balance_loss_mlp": 1.08530974, + "epoch": 0.1104270873412851, + "flos": 655814817792.0, + "grad_norm": 0.046936313049011164, + "language_loss": 0.98079342, + "learning_rate": 0.000983137177994244, + "loss": 0.99198341, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 0.3371582, + "step": 574, + "time_per_iteration": 2.842641830444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127826, + "balance_loss_mlp": 1.0945909, + "epoch": 0.11061946902654868, + "flos": 723426287616.0, + "grad_norm": 0.047970587572460185, + "language_loss": 0.91368234, + "learning_rate": 0.0009830568574605235, + "loss": 0.92496061, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 0.33227539, + "step": 575, + "time_per_iteration": 2.9841148853302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136053, + "balance_loss_mlp": 1.10260296, + "epoch": 0.11081185071181224, + "flos": 835463310336.0, + "grad_norm": 0.06212944390612344, + "language_loss": 0.95608473, + "learning_rate": 0.0009829763493874992, + "loss": 0.96744525, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 0.3347168, + "step": 576, + "time_per_iteration": 3.094599485397339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122949, + "balance_loss_mlp": 1.08918953, + "epoch": 0.1110042323970758, + "flos": 609076620288.0, + "grad_norm": 0.040009357062280086, + "language_loss": 1.0022918, + "learning_rate": 0.0009828956538064264, + "loss": 1.01352131, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 0.33764648, + "step": 577, + "time_per_iteration": 2.7913765907287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128001, + "balance_loss_mlp": 1.09428823, + "epoch": 0.11119661408233936, + "flos": 595922075136.0, + "grad_norm": 0.07834189266391174, + "language_loss": 0.97103804, + "learning_rate": 0.0009828147707486344, + "loss": 0.98231804, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 0.33740234, + "step": 578, + "time_per_iteration": 2.6967506408691406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099492, + "balance_loss_mlp": 1.0659467, + "epoch": 0.11138899576760293, + "flos": 555835488768.0, + "grad_norm": 0.066476002167881, + "language_loss": 0.94244707, + "learning_rate": 0.0009827337002455245, + "loss": 0.95344198, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 0.33544922, + "step": 579, + "time_per_iteration": 2.6212143898010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010909, + "balance_loss_mlp": 1.05940461, + "epoch": 0.11158137745286649, + "flos": 689746461696.0, + "grad_norm": 0.0598380025645264, + "language_loss": 0.93403691, + "learning_rate": 0.0009826524423285712, + "loss": 0.94494587, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 0.31469727, + "step": 580, + "time_per_iteration": 2.916363000869751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086732, + "balance_loss_mlp": 1.05466461, + "epoch": 0.11177375913813005, + "flos": 763011436032.0, + "grad_norm": 0.051352596452175936, + "language_loss": 0.95457065, + "learning_rate": 0.0009825709970293218, + "loss": 0.96543789, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 0.32055664, + "step": 581, + "time_per_iteration": 2.975459575653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094975, + "balance_loss_mlp": 1.06414759, + "epoch": 0.11196614082339361, + "flos": 806574329856.0, + "grad_norm": 0.06330579048660655, + "language_loss": 1.01360774, + "learning_rate": 0.0009824893643793956, + "loss": 1.02455735, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 0.30810547, + "step": 582, + "time_per_iteration": 3.0850436687469482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109989, + "balance_loss_mlp": 1.06772757, + "epoch": 0.11215852250865718, + "flos": 558624978432.0, + "grad_norm": 0.05517621871728721, + "language_loss": 0.96568394, + "learning_rate": 0.0009824075444104857, + "loss": 0.9766829, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 0.3215332, + "step": 583, + "time_per_iteration": 2.7017738819122314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104353, + "balance_loss_mlp": 1.07214284, + "epoch": 0.11235090419392074, + "flos": 513572078592.0, + "grad_norm": 0.05273776870459213, + "language_loss": 1.00669086, + "learning_rate": 0.000982325537154357, + "loss": 1.01773441, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 0.32202148, + "step": 584, + "time_per_iteration": 2.566066265106201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109419, + "balance_loss_mlp": 1.07768583, + "epoch": 0.1125432858791843, + "flos": 491453277696.0, + "grad_norm": 0.05755454669423396, + "language_loss": 1.01869726, + "learning_rate": 0.0009822433426428484, + "loss": 1.02979159, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 0.31713867, + "step": 585, + "time_per_iteration": 2.611968994140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122157, + "balance_loss_mlp": 1.08987498, + "epoch": 0.11273566756444786, + "flos": 510725689344.0, + "grad_norm": 0.06034275506000564, + "language_loss": 0.93750811, + "learning_rate": 0.0009821609609078697, + "loss": 0.94872963, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 0.32275391, + "step": 586, + "time_per_iteration": 2.584847927093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104971, + "balance_loss_mlp": 1.0726887, + "epoch": 0.11292804924971142, + "flos": 622446280704.0, + "grad_norm": 0.06416707827025614, + "language_loss": 0.95279968, + "learning_rate": 0.0009820783919814045, + "loss": 0.96384937, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 0.32275391, + "step": 587, + "time_per_iteration": 2.7885184288024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096142, + "balance_loss_mlp": 1.06359744, + "epoch": 0.113120430934975, + "flos": 478056453120.0, + "grad_norm": 0.049104346633589514, + "language_loss": 0.92135406, + "learning_rate": 0.0009819956358955095, + "loss": 0.93231547, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 0.32543945, + "step": 588, + "time_per_iteration": 2.560117483139038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086794, + "balance_loss_mlp": 1.05427432, + "epoch": 0.11331281262023855, + "flos": 467039084544.0, + "grad_norm": 0.05114307144868452, + "language_loss": 0.93675017, + "learning_rate": 0.0009819126926823127, + "loss": 0.94761813, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 0.32519531, + "step": 589, + "time_per_iteration": 2.517035722732544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093229, + "balance_loss_mlp": 1.05966008, + "epoch": 0.11350519430550211, + "flos": 650453151744.0, + "grad_norm": 0.04613241529975588, + "language_loss": 0.94437975, + "learning_rate": 0.000981829562374016, + "loss": 0.95531201, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 0.33569336, + "step": 590, + "time_per_iteration": 2.8174262046813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093153, + "balance_loss_mlp": 1.05913091, + "epoch": 0.11369757599076567, + "flos": 557809680384.0, + "grad_norm": 0.05348492004263644, + "language_loss": 1.04949331, + "learning_rate": 0.0009817462450028933, + "loss": 1.0604248, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 0.34057617, + "step": 591, + "time_per_iteration": 2.6302859783172607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101242, + "balance_loss_mlp": 1.0668143, + "epoch": 0.11388995767602925, + "flos": 571080222720.0, + "grad_norm": 0.2030818500746725, + "language_loss": 0.92329478, + "learning_rate": 0.0009816627406012916, + "loss": 0.93430716, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 0.34472656, + "step": 592, + "time_per_iteration": 2.8384313583374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135985, + "balance_loss_mlp": 1.09943521, + "epoch": 0.1140823393612928, + "flos": 740403307008.0, + "grad_norm": 0.0774704650100976, + "language_loss": 0.91851664, + "learning_rate": 0.0009815790492016295, + "loss": 0.92987645, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 0.36523438, + "step": 593, + "time_per_iteration": 2.9409682750701904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136834, + "balance_loss_mlp": 1.10192943, + "epoch": 0.11427472104655637, + "flos": 699004753920.0, + "grad_norm": 0.09332707993556091, + "language_loss": 0.94690275, + "learning_rate": 0.0009814951708363993, + "loss": 0.95827115, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 0.34912109, + "step": 594, + "time_per_iteration": 2.8599631786346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221657, + "balance_loss_mlp": 1.16023993, + "epoch": 0.11446710273181993, + "flos": 1477178684928.0, + "grad_norm": 0.030934197408724044, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79212642, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 0.61328125, + "step": 595, + "time_per_iteration": 4.801583766937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137315, + "balance_loss_mlp": 1.10138512, + "epoch": 0.1146594844170835, + "flos": 494895080448.0, + "grad_norm": 0.0746127254366864, + "language_loss": 0.94972038, + "learning_rate": 0.0009813268533395648, + "loss": 0.96109354, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 0.359375, + "step": 596, + "time_per_iteration": 2.6236753463745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123077, + "balance_loss_mlp": 1.0882678, + "epoch": 0.11485186610234706, + "flos": 474834534912.0, + "grad_norm": 0.061536990211155544, + "language_loss": 0.95371294, + "learning_rate": 0.0009812424142733073, + "loss": 0.96494377, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 0.34765625, + "step": 597, + "time_per_iteration": 2.5663998126983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108592, + "balance_loss_mlp": 1.07387781, + "epoch": 0.11504424778761062, + "flos": 731209254912.0, + "grad_norm": 0.04795398370622496, + "language_loss": 0.91199464, + "learning_rate": 0.000981157788372175, + "loss": 0.92308056, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 0.34716797, + "step": 598, + "time_per_iteration": 3.004436492919922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110274, + "balance_loss_mlp": 1.06864619, + "epoch": 0.11523662947287418, + "flos": 545823567360.0, + "grad_norm": 0.04762632796488997, + "language_loss": 0.94997883, + "learning_rate": 0.0009810729756690223, + "loss": 0.96100628, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 0.34106445, + "step": 599, + "time_per_iteration": 2.704676628112793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104593, + "balance_loss_mlp": 1.06947374, + "epoch": 0.11542901115813775, + "flos": 775066558464.0, + "grad_norm": 0.06699944809564747, + "language_loss": 0.98224139, + "learning_rate": 0.0009809879761967766, + "loss": 0.99328732, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 0.35107422, + "step": 600, + "time_per_iteration": 2.953348159790039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113963, + "balance_loss_mlp": 1.07922578, + "epoch": 0.11562139284340131, + "flos": 730910449152.0, + "grad_norm": 0.06801646297960097, + "language_loss": 0.96874714, + "learning_rate": 0.0009809027899884378, + "loss": 0.97988677, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 0.34765625, + "step": 601, + "time_per_iteration": 2.896559953689575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104267, + "balance_loss_mlp": 1.07014918, + "epoch": 0.11581377452866487, + "flos": 535878457344.0, + "grad_norm": 0.062436318450634756, + "language_loss": 0.9484992, + "learning_rate": 0.0009808174170770779, + "loss": 0.95954192, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 0.34130859, + "step": 602, + "time_per_iteration": 2.814558982849121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220455, + "balance_loss_mlp": 1.16704941, + "epoch": 0.11600615621392843, + "flos": 1555814863872.0, + "grad_norm": 0.025680107820064087, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.86118698, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 0.53515625, + "step": 603, + "time_per_iteration": 4.897503614425659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118739, + "balance_loss_mlp": 1.08566999, + "epoch": 0.116198537899192, + "flos": 537435274752.0, + "grad_norm": 0.05533944227900463, + "language_loss": 1.0028702, + "learning_rate": 0.0009806461112779462, + "loss": 1.01405764, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 0.33081055, + "step": 604, + "time_per_iteration": 2.6172194480895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115094, + "balance_loss_mlp": 1.08281231, + "epoch": 0.11639091958445556, + "flos": 454203168768.0, + "grad_norm": 0.07231087595972972, + "language_loss": 0.97971618, + "learning_rate": 0.0009805601784566814, + "loss": 0.99086702, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 0.32250977, + "step": 605, + "time_per_iteration": 2.4791650772094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125941, + "balance_loss_mlp": 1.09208584, + "epoch": 0.11658330126971912, + "flos": 555081859584.0, + "grad_norm": 0.06015253149930396, + "language_loss": 1.02430916, + "learning_rate": 0.0009804740590654089, + "loss": 1.03556848, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 0.33862305, + "step": 606, + "time_per_iteration": 2.614476442337036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124787, + "balance_loss_mlp": 1.09229016, + "epoch": 0.11677568295498268, + "flos": 716340049920.0, + "grad_norm": 0.08034134565527169, + "language_loss": 0.97153747, + "learning_rate": 0.0009803877531375635, + "loss": 0.9827854, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 0.32495117, + "step": 607, + "time_per_iteration": 2.851011276245117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128193, + "balance_loss_mlp": 1.09228706, + "epoch": 0.11696806464024626, + "flos": 609758668800.0, + "grad_norm": 0.05400582488055185, + "language_loss": 0.97512484, + "learning_rate": 0.0009803012607066523, + "loss": 0.9864068, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 0.35913086, + "step": 608, + "time_per_iteration": 2.700596570968628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128537, + "balance_loss_mlp": 1.09294093, + "epoch": 0.11716044632550981, + "flos": 520384103424.0, + "grad_norm": 0.15792902837654846, + "language_loss": 0.95375645, + "learning_rate": 0.0009802145818062543, + "loss": 0.96504182, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 0.35620117, + "step": 609, + "time_per_iteration": 2.693417549133301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123637, + "balance_loss_mlp": 1.08742094, + "epoch": 0.11735282801077337, + "flos": 507493859328.0, + "grad_norm": 0.06851059455565046, + "language_loss": 0.99132365, + "learning_rate": 0.0009801277164700212, + "loss": 1.00256002, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 0.36254883, + "step": 610, + "time_per_iteration": 2.5825185775756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131797, + "balance_loss_mlp": 1.09541452, + "epoch": 0.11754520969603693, + "flos": 686638342656.0, + "grad_norm": 0.1113382534985323, + "language_loss": 0.96033651, + "learning_rate": 0.0009800406647316776, + "loss": 0.97165447, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 0.36376953, + "step": 611, + "time_per_iteration": 2.8625166416168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231096, + "balance_loss_mlp": 1.18112373, + "epoch": 0.1177375913813005, + "flos": 1542487421952.0, + "grad_norm": 0.03346184177846584, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.78145558, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 0.49804688, + "step": 612, + "time_per_iteration": 4.748431444168091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137214, + "balance_loss_mlp": 1.09880471, + "epoch": 0.11792997306656407, + "flos": 520522495488.0, + "grad_norm": 0.07612220197102978, + "language_loss": 0.95326376, + "learning_rate": 0.000979866002183916, + "loss": 0.96463591, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 0.38378906, + "step": 613, + "time_per_iteration": 2.6311473846435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155666, + "balance_loss_mlp": 1.11482501, + "epoch": 0.11812235475182763, + "flos": 666281189376.0, + "grad_norm": 0.0832714106614858, + "language_loss": 0.96221644, + "learning_rate": 0.0009797783914423082, + "loss": 0.97377312, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 0.40844727, + "step": 614, + "time_per_iteration": 2.8568782806396484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126933, + "balance_loss_mlp": 1.08721232, + "epoch": 0.11831473643709119, + "flos": 621317122560.0, + "grad_norm": 0.08355321383380138, + "language_loss": 0.91733479, + "learning_rate": 0.0009796905944342094, + "loss": 0.92860413, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 0.3972168, + "step": 615, + "time_per_iteration": 2.8348331451416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113873, + "balance_loss_mlp": 1.07517743, + "epoch": 0.11850711812235475, + "flos": 456688710144.0, + "grad_norm": 0.05175964705030883, + "language_loss": 0.94486296, + "learning_rate": 0.0009796026111937057, + "loss": 0.9560017, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 0.38671875, + "step": 616, + "time_per_iteration": 2.609276056289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111065, + "balance_loss_mlp": 1.07393384, + "epoch": 0.11869949980761832, + "flos": 513863543808.0, + "grad_norm": 0.1779679576065946, + "language_loss": 0.94108498, + "learning_rate": 0.0009795144417549552, + "loss": 0.95219147, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 0.3671875, + "step": 617, + "time_per_iteration": 2.7469558715820312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114581, + "balance_loss_mlp": 1.07760203, + "epoch": 0.11889188149288188, + "flos": 535016171520.0, + "grad_norm": 0.0639893702788804, + "language_loss": 0.95137906, + "learning_rate": 0.0009794260861521883, + "loss": 0.96252483, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 0.36987305, + "step": 618, + "time_per_iteration": 2.779780387878418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125632, + "balance_loss_mlp": 1.08908224, + "epoch": 0.11908426317814544, + "flos": 498603755520.0, + "grad_norm": 0.062080445707157726, + "language_loss": 0.94238096, + "learning_rate": 0.0009793375444197075, + "loss": 0.95363724, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 0.3659668, + "step": 619, + "time_per_iteration": 2.6269500255584717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159762, + "balance_loss_mlp": 1.12132859, + "epoch": 0.119276644863409, + "flos": 659891681280.0, + "grad_norm": 0.05728911446624217, + "language_loss": 0.93181753, + "learning_rate": 0.000979248816591888, + "loss": 0.94341516, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 0.38452148, + "step": 620, + "time_per_iteration": 2.7879464626312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155629, + "balance_loss_mlp": 1.11600351, + "epoch": 0.11946902654867257, + "flos": 758746621440.0, + "grad_norm": 0.05539388103354017, + "language_loss": 0.93241715, + "learning_rate": 0.0009791599027031766, + "loss": 0.94397342, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 0.39624023, + "step": 621, + "time_per_iteration": 3.058497667312622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152711, + "balance_loss_mlp": 1.11439681, + "epoch": 0.11966140823393613, + "flos": 680999892480.0, + "grad_norm": 0.05959109763307043, + "language_loss": 0.93889141, + "learning_rate": 0.0009790708027880932, + "loss": 0.95041847, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 0.38330078, + "step": 622, + "time_per_iteration": 2.857905864715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217773, + "balance_loss_mlp": 1.17447615, + "epoch": 0.11985378991919969, + "flos": 1451071853568.0, + "grad_norm": 0.033264976771994935, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.78645062, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 0.43359375, + "step": 623, + "time_per_iteration": 4.817517518997192 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130228, + "balance_loss_mlp": 1.09372652, + "epoch": 0.12004617160446325, + "flos": 527848441344.0, + "grad_norm": 0.07130736684785184, + "language_loss": 0.99442542, + "learning_rate": 0.0009788920450172487, + "loss": 1.00572777, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 0.36499023, + "step": 624, + "time_per_iteration": 2.6089231967926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134217, + "balance_loss_mlp": 1.0987401, + "epoch": 0.12023855328972682, + "flos": 474219297792.0, + "grad_norm": 0.053387747347518576, + "language_loss": 0.97139525, + "learning_rate": 0.0009788023872308875, + "loss": 0.98273742, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 0.35522461, + "step": 625, + "time_per_iteration": 2.5482659339904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171918, + "balance_loss_mlp": 1.12614214, + "epoch": 0.12043093497499038, + "flos": 1531771430400.0, + "grad_norm": 0.016755812295179123, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.76600921, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 0.45703125, + "step": 626, + "time_per_iteration": 4.767898797988892 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142548, + "balance_loss_mlp": 1.10609388, + "epoch": 0.12062331666025394, + "flos": 539839323648.0, + "grad_norm": 0.053046953839951706, + "language_loss": 0.99526918, + "learning_rate": 0.0009786225140303285, + "loss": 1.00669467, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 0.36425781, + "step": 627, + "time_per_iteration": 2.666975975036621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145866, + "balance_loss_mlp": 1.10974586, + "epoch": 0.1208156983455175, + "flos": 511906604544.0, + "grad_norm": 0.06539343990980159, + "language_loss": 0.97403502, + "learning_rate": 0.0009785322986859634, + "loss": 0.98549366, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 0.36132812, + "step": 628, + "time_per_iteration": 2.6613006591796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116443, + "balance_loss_mlp": 1.12830925, + "epoch": 0.12100808003078108, + "flos": 596473072128.0, + "grad_norm": 0.05337423256033143, + "language_loss": 0.99038112, + "learning_rate": 0.0009784418975588838, + "loss": 1.00202537, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 0.36108398, + "step": 629, + "time_per_iteration": 2.7266693115234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148011, + "balance_loss_mlp": 1.11248696, + "epoch": 0.12120046171604464, + "flos": 522970960896.0, + "grad_norm": 0.06598420413892771, + "language_loss": 0.97636682, + "learning_rate": 0.0009783513106841862, + "loss": 0.98784697, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 0.35522461, + "step": 630, + "time_per_iteration": 2.7734336853027344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122661, + "balance_loss_mlp": 1.17663717, + "epoch": 0.1213928434013082, + "flos": 1554463249920.0, + "grad_norm": 0.0364602282496576, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.77959311, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 0.5, + "step": 631, + "time_per_iteration": 4.955650091171265 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118689, + "balance_loss_mlp": 1.08283055, + "epoch": 0.12158522508657175, + "flos": 495391749120.0, + "grad_norm": 0.061523486228641615, + "language_loss": 0.94419873, + "learning_rate": 0.0009781695798326854, + "loss": 0.95538557, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 0.35888672, + "step": 632, + "time_per_iteration": 2.6072514057159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111815, + "balance_loss_mlp": 1.08319819, + "epoch": 0.12177760677183531, + "flos": 475585592832.0, + "grad_norm": 0.05761126083629287, + "language_loss": 0.93996418, + "learning_rate": 0.0009780784359264365, + "loss": 0.95114571, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 0.34985352, + "step": 633, + "time_per_iteration": 2.6186299324035645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201074, + "balance_loss_mlp": 1.15548825, + "epoch": 0.12196998845709889, + "flos": 1468458906624.0, + "grad_norm": 0.024414945484573326, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.75389773, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 0.45507812, + "step": 634, + "time_per_iteration": 4.757866144180298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090802, + "balance_loss_mlp": 1.05732846, + "epoch": 0.12216237014236245, + "flos": 586572378624.0, + "grad_norm": 0.05071444395915749, + "language_loss": 0.91919303, + "learning_rate": 0.000977895591329867, + "loss": 0.93010104, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 0.3347168, + "step": 635, + "time_per_iteration": 2.7802233695983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094425, + "balance_loss_mlp": 1.06006885, + "epoch": 0.12235475182762601, + "flos": 597997582848.0, + "grad_norm": 0.05652682698430024, + "language_loss": 0.93613631, + "learning_rate": 0.000977803890710533, + "loss": 0.94708061, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 0.34399414, + "step": 636, + "time_per_iteration": 2.719989538192749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109182, + "balance_loss_mlp": 1.0546267, + "epoch": 0.12254713351288957, + "flos": 497741469696.0, + "grad_norm": 0.05019916823038997, + "language_loss": 0.97873759, + "learning_rate": 0.0009777120045912774, + "loss": 0.98965579, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 0.37231445, + "step": 637, + "time_per_iteration": 2.5960683822631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099212, + "balance_loss_mlp": 1.06139851, + "epoch": 0.12273951519815314, + "flos": 605847361536.0, + "grad_norm": 0.05186361253186237, + "language_loss": 0.97095829, + "learning_rate": 0.0009776199330077736, + "loss": 0.9819504, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 0.37841797, + "step": 638, + "time_per_iteration": 2.7152581214904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088192, + "balance_loss_mlp": 1.05121303, + "epoch": 0.1229318968834167, + "flos": 597859190784.0, + "grad_norm": 0.05467339203371928, + "language_loss": 0.99154645, + "learning_rate": 0.0009775276759957667, + "loss": 1.00242841, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 0.37011719, + "step": 639, + "time_per_iteration": 2.6985981464385986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090176, + "balance_loss_mlp": 1.05465198, + "epoch": 0.12312427856868026, + "flos": 678383299584.0, + "grad_norm": 0.06600893718108056, + "language_loss": 0.97933781, + "learning_rate": 0.0009774352335910745, + "loss": 0.99023956, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 0.35546875, + "step": 640, + "time_per_iteration": 2.813744306564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086602, + "balance_loss_mlp": 1.05298471, + "epoch": 0.12331666025394382, + "flos": 608933458944.0, + "grad_norm": 0.05927901471916764, + "language_loss": 0.99468219, + "learning_rate": 0.000977342605829586, + "loss": 1.00554824, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 0.33642578, + "step": 641, + "time_per_iteration": 2.73280668258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110577, + "balance_loss_mlp": 1.07240582, + "epoch": 0.12350904193920739, + "flos": 762504855552.0, + "grad_norm": 0.07046674646118828, + "language_loss": 0.92099506, + "learning_rate": 0.0009772497927472623, + "loss": 0.93210077, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 0.38183594, + "step": 642, + "time_per_iteration": 3.1258397102355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134817, + "balance_loss_mlp": 1.09514427, + "epoch": 0.12370142362447095, + "flos": 540968481792.0, + "grad_norm": 0.07438352262018386, + "language_loss": 0.93366879, + "learning_rate": 0.0009771567943801368, + "loss": 0.94501698, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 0.3972168, + "step": 643, + "time_per_iteration": 2.6720776557922363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149366, + "balance_loss_mlp": 1.10912085, + "epoch": 0.12389380530973451, + "flos": 548128871424.0, + "grad_norm": 0.055730629552303436, + "language_loss": 0.96261084, + "learning_rate": 0.0009770636107643152, + "loss": 0.97410446, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 0.40234375, + "step": 644, + "time_per_iteration": 2.7093722820281982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144915, + "balance_loss_mlp": 1.10734022, + "epoch": 0.12408618699499807, + "flos": 540308828160.0, + "grad_norm": 0.05250459899213186, + "language_loss": 0.92937833, + "learning_rate": 0.0009769702419359738, + "loss": 0.94082749, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 0.37597656, + "step": 645, + "time_per_iteration": 2.661512613296509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173428, + "balance_loss_mlp": 1.13146591, + "epoch": 0.12427856868026164, + "flos": 745792137216.0, + "grad_norm": 0.052890865129340166, + "language_loss": 0.94770992, + "learning_rate": 0.000976876687931362, + "loss": 0.95944417, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 0.41943359, + "step": 646, + "time_per_iteration": 2.972522258758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164317, + "balance_loss_mlp": 1.12555003, + "epoch": 0.1244709503655252, + "flos": 533716687872.0, + "grad_norm": 0.07033761546633982, + "language_loss": 0.91270661, + "learning_rate": 0.0009767829487868005, + "loss": 0.92434984, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 0.38769531, + "step": 647, + "time_per_iteration": 2.6150805950164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164183, + "balance_loss_mlp": 1.12281775, + "epoch": 0.12466333205078876, + "flos": 508099184640.0, + "grad_norm": 0.07269814667774141, + "language_loss": 0.95938772, + "learning_rate": 0.000976689024538682, + "loss": 0.97102952, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 0.41381836, + "step": 648, + "time_per_iteration": 2.6567764282226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154104, + "balance_loss_mlp": 1.11497951, + "epoch": 0.12485571373605232, + "flos": 681345686016.0, + "grad_norm": 0.06659282576896536, + "language_loss": 0.94783676, + "learning_rate": 0.0009765949152234716, + "loss": 0.95937783, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 0.39135742, + "step": 649, + "time_per_iteration": 2.9032628536224365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118823, + "balance_loss_mlp": 1.15084565, + "epoch": 0.1250480954213159, + "flos": 1330159781376.0, + "grad_norm": 0.027365485913225348, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.79874313, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 0.37304688, + "step": 650, + "time_per_iteration": 4.6781816482543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145487, + "balance_loss_mlp": 1.10395491, + "epoch": 0.12524047710657946, + "flos": 938550758400.0, + "grad_norm": 0.07758701561639549, + "language_loss": 0.88880539, + "learning_rate": 0.0009764061415379919, + "loss": 0.90026021, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 0.41552734, + "step": 651, + "time_per_iteration": 3.2588987350463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137454, + "balance_loss_mlp": 1.09766221, + "epoch": 0.12543285879184302, + "flos": 513893279232.0, + "grad_norm": 0.08409279007421946, + "language_loss": 0.94380724, + "learning_rate": 0.0009763114772410109, + "loss": 0.95518184, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 0.39794922, + "step": 652, + "time_per_iteration": 2.5698702335357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112103, + "balance_loss_mlp": 1.08359814, + "epoch": 0.12562524047710658, + "flos": 718328922624.0, + "grad_norm": 0.056536251661147445, + "language_loss": 0.92061114, + "learning_rate": 0.0009762166280235146, + "loss": 0.93182147, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 0.37451172, + "step": 653, + "time_per_iteration": 2.938668966293335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117513, + "balance_loss_mlp": 1.08191729, + "epoch": 0.12581762216237014, + "flos": 563712431616.0, + "grad_norm": 0.0771848817407848, + "language_loss": 0.94092464, + "learning_rate": 0.0009761215939223267, + "loss": 0.95209974, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 0.35644531, + "step": 654, + "time_per_iteration": 2.7028610706329346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102912, + "balance_loss_mlp": 1.06834149, + "epoch": 0.1260100038476337, + "flos": 481893608448.0, + "grad_norm": 0.07424845664771389, + "language_loss": 0.9475044, + "learning_rate": 0.0009760263749743428, + "loss": 0.95853353, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 0.34570312, + "step": 655, + "time_per_iteration": 2.5710902214050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101838, + "balance_loss_mlp": 1.06771994, + "epoch": 0.12620238553289725, + "flos": 575555010048.0, + "grad_norm": 0.053259035011575195, + "language_loss": 0.94285154, + "learning_rate": 0.0009759309712165299, + "loss": 0.95386994, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 0.34130859, + "step": 656, + "time_per_iteration": 2.70626163482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101868, + "balance_loss_mlp": 1.06858444, + "epoch": 0.12639476721816084, + "flos": 531164335104.0, + "grad_norm": 0.0693418830287988, + "language_loss": 0.9812479, + "learning_rate": 0.0009758353826859272, + "loss": 0.99226654, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 0.33300781, + "step": 657, + "time_per_iteration": 2.566787004470825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110312, + "balance_loss_mlp": 1.0663563, + "epoch": 0.1265871489034244, + "flos": 689968917504.0, + "grad_norm": 0.06782991509763603, + "language_loss": 0.96008623, + "learning_rate": 0.0009757396094196456, + "loss": 0.97111744, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 0.36791992, + "step": 658, + "time_per_iteration": 2.8277065753936768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115911, + "balance_loss_mlp": 1.07926583, + "epoch": 0.12677953058868796, + "flos": 537138667008.0, + "grad_norm": 0.053606842709613675, + "language_loss": 0.89398581, + "learning_rate": 0.0009756436514548673, + "loss": 0.90514493, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 0.36645508, + "step": 659, + "time_per_iteration": 2.796175718307495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120986, + "balance_loss_mlp": 1.0811224, + "epoch": 0.12697191227395152, + "flos": 519022577664.0, + "grad_norm": 0.060525818769901533, + "language_loss": 0.92384607, + "learning_rate": 0.0009755475088288466, + "loss": 0.93505597, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 0.39916992, + "step": 660, + "time_per_iteration": 2.678682804107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133341, + "balance_loss_mlp": 1.09271395, + "epoch": 0.12716429395921508, + "flos": 566605808640.0, + "grad_norm": 0.08191197530717065, + "language_loss": 0.958794, + "learning_rate": 0.0009754511815789095, + "loss": 0.97012746, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 0.40600586, + "step": 661, + "time_per_iteration": 2.7371177673339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130898, + "balance_loss_mlp": 1.09093928, + "epoch": 0.12735667564447864, + "flos": 514103251968.0, + "grad_norm": 0.08687138171908054, + "language_loss": 0.92166948, + "learning_rate": 0.0009753546697424533, + "loss": 0.93297845, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 0.39941406, + "step": 662, + "time_per_iteration": 2.704432249069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125889, + "balance_loss_mlp": 1.08700323, + "epoch": 0.1275490573297422, + "flos": 541282341888.0, + "grad_norm": 0.06194581367760624, + "language_loss": 0.95628935, + "learning_rate": 0.0009752579733569475, + "loss": 0.96754825, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 0.38891602, + "step": 663, + "time_per_iteration": 2.682892084121704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165087, + "balance_loss_mlp": 1.1326623, + "epoch": 0.12774143901500576, + "flos": 1558700900352.0, + "grad_norm": 0.0245621431528993, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.76046479, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 0.32421875, + "step": 664, + "time_per_iteration": 4.981603622436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146598, + "balance_loss_mlp": 1.1060189, + "epoch": 0.12793382070026935, + "flos": 613744128000.0, + "grad_norm": 0.07818489478946229, + "language_loss": 0.96962506, + "learning_rate": 0.0009750640270890217, + "loss": 0.98109102, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 0.40576172, + "step": 665, + "time_per_iteration": 2.7139556407928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139516, + "balance_loss_mlp": 1.10115409, + "epoch": 0.1281262023855329, + "flos": 707731499520.0, + "grad_norm": 0.10418725554084544, + "language_loss": 1.02824736, + "learning_rate": 0.0009749667772818983, + "loss": 1.03964257, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 0.38354492, + "step": 666, + "time_per_iteration": 3.000227689743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148195, + "balance_loss_mlp": 1.11481678, + "epoch": 0.12831858407079647, + "flos": 1425034404864.0, + "grad_norm": 0.027847994605201966, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.78084135, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 0.33398438, + "step": 667, + "time_per_iteration": 4.858838319778442 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161858, + "balance_loss_mlp": 1.1255703, + "epoch": 0.12851096575606002, + "flos": 449098463232.0, + "grad_norm": 0.0747922247275706, + "language_loss": 1.00932169, + "learning_rate": 0.0009747717245101093, + "loss": 1.0209403, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 0.36303711, + "step": 668, + "time_per_iteration": 2.4917514324188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172854, + "balance_loss_mlp": 1.13518405, + "epoch": 0.12870334744132358, + "flos": 479939240448.0, + "grad_norm": 0.0795363237311063, + "language_loss": 0.91087645, + "learning_rate": 0.00097467392162117, + "loss": 0.92260504, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 0.37719727, + "step": 669, + "time_per_iteration": 2.601151466369629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196603, + "balance_loss_mlp": 1.15540457, + "epoch": 0.12889572912658714, + "flos": 638936543232.0, + "grad_norm": 0.0744221392925499, + "language_loss": 0.95630497, + "learning_rate": 0.0009745759344474708, + "loss": 0.96827102, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 0.41162109, + "step": 670, + "time_per_iteration": 2.878068447113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200159, + "balance_loss_mlp": 1.16012812, + "epoch": 0.1290881108118507, + "flos": 509944896000.0, + "grad_norm": 0.07162427386273244, + "language_loss": 0.95158428, + "learning_rate": 0.0009744777630270536, + "loss": 0.96358585, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 0.40063477, + "step": 671, + "time_per_iteration": 2.5778517723083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220294, + "balance_loss_mlp": 1.17752171, + "epoch": 0.12928049249711426, + "flos": 671054782464.0, + "grad_norm": 0.07459259564874297, + "language_loss": 0.99775112, + "learning_rate": 0.000974379407398032, + "loss": 1.00995398, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 0.42797852, + "step": 672, + "time_per_iteration": 2.862168073654175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191716, + "balance_loss_mlp": 1.15175724, + "epoch": 0.12947287418237785, + "flos": 793525870080.0, + "grad_norm": 0.05795101219152752, + "language_loss": 0.86696863, + "learning_rate": 0.0009742808675985913, + "loss": 0.87888587, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 0.3996582, + "step": 673, + "time_per_iteration": 3.0987160205841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011832, + "balance_loss_mlp": 1.14142871, + "epoch": 0.1296652558676414, + "flos": 485466462720.0, + "grad_norm": 0.06292984682523013, + "language_loss": 0.96893597, + "learning_rate": 0.0009741821436669876, + "loss": 0.98076797, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 0.41772461, + "step": 674, + "time_per_iteration": 2.565317153930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160814, + "balance_loss_mlp": 1.12123656, + "epoch": 0.12985763755290497, + "flos": 453459451392.0, + "grad_norm": 0.07127578315040689, + "language_loss": 0.99621803, + "learning_rate": 0.0009740832356415492, + "loss": 1.00782621, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 0.39550781, + "step": 675, + "time_per_iteration": 2.4777724742889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144275, + "balance_loss_mlp": 1.10538852, + "epoch": 0.13005001923816853, + "flos": 825061178880.0, + "grad_norm": 0.07563598794059366, + "language_loss": 0.94837546, + "learning_rate": 0.0009739841435606756, + "loss": 0.95981824, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 0.38867188, + "step": 676, + "time_per_iteration": 2.9838767051696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131112, + "balance_loss_mlp": 1.09186864, + "epoch": 0.1302424009234321, + "flos": 531381648384.0, + "grad_norm": 0.06693149578557214, + "language_loss": 0.94293654, + "learning_rate": 0.0009738848674628377, + "loss": 0.95424765, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 0.39233398, + "step": 677, + "time_per_iteration": 2.7052054405212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130656, + "balance_loss_mlp": 1.0923903, + "epoch": 0.13043478260869565, + "flos": 525884161536.0, + "grad_norm": 0.05501746541124835, + "language_loss": 0.94784498, + "learning_rate": 0.000973785407386578, + "loss": 0.95915151, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 0.38232422, + "step": 678, + "time_per_iteration": 2.7535152435302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137257, + "balance_loss_mlp": 1.09727383, + "epoch": 0.1306271642939592, + "flos": 626172208128.0, + "grad_norm": 0.05430769504454563, + "language_loss": 0.91185606, + "learning_rate": 0.0009736857633705103, + "loss": 0.92322862, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 0.3996582, + "step": 679, + "time_per_iteration": 2.8686013221740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135266, + "balance_loss_mlp": 1.09575987, + "epoch": 0.13081954597922277, + "flos": 550718300160.0, + "grad_norm": 0.06387426976514826, + "language_loss": 0.97335434, + "learning_rate": 0.0009735859354533196, + "loss": 0.984707, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 0.39501953, + "step": 680, + "time_per_iteration": 2.6952273845672607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136459, + "balance_loss_mlp": 1.09626174, + "epoch": 0.13101192766448633, + "flos": 536911441920.0, + "grad_norm": 0.07637025474680663, + "language_loss": 0.97434723, + "learning_rate": 0.0009734859236737628, + "loss": 0.98571181, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 0.40185547, + "step": 681, + "time_per_iteration": 2.607431173324585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136827, + "balance_loss_mlp": 1.09720194, + "epoch": 0.13120430934974991, + "flos": 503508400128.0, + "grad_norm": 0.06515090437153119, + "language_loss": 0.9831785, + "learning_rate": 0.0009733857280706678, + "loss": 0.99454683, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 0.39599609, + "step": 682, + "time_per_iteration": 2.5730957984924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140739, + "balance_loss_mlp": 1.1007328, + "epoch": 0.13139669103501347, + "flos": 614295124992.0, + "grad_norm": 0.08408851923922504, + "language_loss": 0.89817083, + "learning_rate": 0.000973285348682934, + "loss": 0.90957826, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 0.39990234, + "step": 683, + "time_per_iteration": 2.7041609287261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120077, + "balance_loss_mlp": 1.08460057, + "epoch": 0.13158907272027703, + "flos": 1484971564032.0, + "grad_norm": 0.021197399820989362, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.7901845, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 0.35546875, + "step": 684, + "time_per_iteration": 4.7803051471710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145399, + "balance_loss_mlp": 1.10579789, + "epoch": 0.1317814544055406, + "flos": 985461852672.0, + "grad_norm": 0.06796914093678033, + "language_loss": 0.90116858, + "learning_rate": 0.0009730840387095046, + "loss": 0.91262257, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 0.39575195, + "step": 685, + "time_per_iteration": 3.289513111114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154301, + "balance_loss_mlp": 1.11412716, + "epoch": 0.13197383609080415, + "flos": 611456076288.0, + "grad_norm": 0.0690044047280534, + "language_loss": 0.95956922, + "learning_rate": 0.0009729831082019642, + "loss": 0.97111225, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 0.40185547, + "step": 686, + "time_per_iteration": 2.8214356899261475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131343, + "balance_loss_mlp": 1.09383941, + "epoch": 0.1321662177760677, + "flos": 494403181056.0, + "grad_norm": 0.08080780289155233, + "language_loss": 0.93596351, + "learning_rate": 0.0009728819940660958, + "loss": 0.94727689, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 0.375, + "step": 687, + "time_per_iteration": 2.749385118484497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011246, + "balance_loss_mlp": 1.08542764, + "epoch": 0.13235859946133127, + "flos": 495841430016.0, + "grad_norm": 0.08853955851107219, + "language_loss": 0.91695315, + "learning_rate": 0.0009727806963411557, + "loss": 0.92819917, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 0.39135742, + "step": 688, + "time_per_iteration": 2.592099666595459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128329, + "balance_loss_mlp": 1.08777368, + "epoch": 0.13255098114659483, + "flos": 511686720000.0, + "grad_norm": 0.06370494383790047, + "language_loss": 0.92130053, + "learning_rate": 0.000972679215066471, + "loss": 0.93258381, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 0.40551758, + "step": 689, + "time_per_iteration": 2.7344043254852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114145, + "balance_loss_mlp": 1.10246885, + "epoch": 0.13274336283185842, + "flos": 547370472960.0, + "grad_norm": 0.08478699193898473, + "language_loss": 1.04583168, + "learning_rate": 0.0009725775502814401, + "loss": 1.05724621, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 0.3894043, + "step": 690, + "time_per_iteration": 2.5881311893463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155383, + "balance_loss_mlp": 1.1147325, + "epoch": 0.13293574451712198, + "flos": 640772342784.0, + "grad_norm": 0.07994389842197654, + "language_loss": 0.90077579, + "learning_rate": 0.0009724757020255327, + "loss": 0.91232961, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 0.40649414, + "step": 691, + "time_per_iteration": 2.8452539443969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164889, + "balance_loss_mlp": 1.12566948, + "epoch": 0.13312812620238554, + "flos": 491480441856.0, + "grad_norm": 0.09039906445052394, + "language_loss": 0.91914684, + "learning_rate": 0.0009723736703382902, + "loss": 0.93079573, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 0.39208984, + "step": 692, + "time_per_iteration": 2.5472824573516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198661, + "balance_loss_mlp": 1.15557849, + "epoch": 0.1333205078876491, + "flos": 508944218112.0, + "grad_norm": 0.07689546631051256, + "language_loss": 0.86461794, + "learning_rate": 0.0009722714552593244, + "loss": 0.87660456, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 0.4309082, + "step": 693, + "time_per_iteration": 2.6273465156555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199876, + "balance_loss_mlp": 1.15560198, + "epoch": 0.13351288957291266, + "flos": 418697455104.0, + "grad_norm": 0.08142665414192346, + "language_loss": 1.00438499, + "learning_rate": 0.000972169056828319, + "loss": 1.01638389, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 0.44262695, + "step": 694, + "time_per_iteration": 2.477491617202759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221806, + "balance_loss_mlp": 1.17741275, + "epoch": 0.13370527125817622, + "flos": 615901128192.0, + "grad_norm": 0.07001491486919184, + "language_loss": 0.90590984, + "learning_rate": 0.0009720664750850283, + "loss": 0.91812789, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 0.4440918, + "step": 695, + "time_per_iteration": 2.7817704677581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209285, + "balance_loss_mlp": 1.16870594, + "epoch": 0.13389765294343978, + "flos": 626038958592.0, + "grad_norm": 0.07077521288835904, + "language_loss": 0.97240067, + "learning_rate": 0.0009719637100692784, + "loss": 0.98449349, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 0.40625, + "step": 696, + "time_per_iteration": 2.7099833488464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214589, + "balance_loss_mlp": 1.17069626, + "epoch": 0.13409003462870334, + "flos": 609691857408.0, + "grad_norm": 0.06395797985697109, + "language_loss": 0.87399805, + "learning_rate": 0.0009718607618209661, + "loss": 0.88614392, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 0.43896484, + "step": 697, + "time_per_iteration": 2.8280160427093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226846, + "balance_loss_mlp": 1.18445516, + "epoch": 0.13428241631396692, + "flos": 683816546304.0, + "grad_norm": 0.08853583224950028, + "language_loss": 0.91527486, + "learning_rate": 0.0009717576303800595, + "loss": 0.92754334, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 0.42382812, + "step": 698, + "time_per_iteration": 3.0102553367614746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206724, + "balance_loss_mlp": 1.16385674, + "epoch": 0.13447479799923048, + "flos": 508815737856.0, + "grad_norm": 0.07140979809376953, + "language_loss": 0.90443981, + "learning_rate": 0.0009716543157865975, + "loss": 0.91650712, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 0.4284668, + "step": 699, + "time_per_iteration": 2.713191509246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192988, + "balance_loss_mlp": 1.15047789, + "epoch": 0.13466717968449404, + "flos": 897510481920.0, + "grad_norm": 0.0971528894423257, + "language_loss": 0.87731719, + "learning_rate": 0.0009715508180806907, + "loss": 0.88924706, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 0.42504883, + "step": 700, + "time_per_iteration": 3.183608055114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164502, + "balance_loss_mlp": 1.12189686, + "epoch": 0.1348595613697576, + "flos": 989938838016.0, + "grad_norm": 0.07253928509691168, + "language_loss": 0.94940412, + "learning_rate": 0.0009714471373025202, + "loss": 0.96104908, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 0.42578125, + "step": 701, + "time_per_iteration": 3.4071736335754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154491, + "balance_loss_mlp": 1.10978746, + "epoch": 0.13505194305502116, + "flos": 487826095104.0, + "grad_norm": 0.07349692890686976, + "language_loss": 0.93387866, + "learning_rate": 0.0009713432734923386, + "loss": 0.94542348, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 0.44702148, + "step": 702, + "time_per_iteration": 2.61545467376709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149917, + "balance_loss_mlp": 1.10523736, + "epoch": 0.13524432474028472, + "flos": 613385851392.0, + "grad_norm": 0.07475145021416552, + "language_loss": 0.90919894, + "learning_rate": 0.0009712392266904696, + "loss": 0.92069811, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 0.44702148, + "step": 703, + "time_per_iteration": 2.739295482635498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156115, + "balance_loss_mlp": 1.11219811, + "epoch": 0.13543670642554828, + "flos": 904794582528.0, + "grad_norm": 0.09690331363255131, + "language_loss": 0.90325272, + "learning_rate": 0.0009711349969373076, + "loss": 0.91481388, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 0.43945312, + "step": 704, + "time_per_iteration": 3.1653053760528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175158, + "balance_loss_mlp": 1.12780786, + "epoch": 0.13562908811081184, + "flos": 550616984064.0, + "grad_norm": 0.09111648779989767, + "language_loss": 0.84997714, + "learning_rate": 0.0009710305842733178, + "loss": 0.86172873, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 0.47314453, + "step": 705, + "time_per_iteration": 2.7402727603912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117117, + "balance_loss_mlp": 1.12737262, + "epoch": 0.1358214697960754, + "flos": 508044856320.0, + "grad_norm": 0.10189351673448747, + "language_loss": 0.9379847, + "learning_rate": 0.0009709259887390373, + "loss": 0.94969636, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 0.43774414, + "step": 706, + "time_per_iteration": 2.5640039443969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147043, + "balance_loss_mlp": 1.10467625, + "epoch": 0.136013851481339, + "flos": 528896107008.0, + "grad_norm": 0.07946562356881365, + "language_loss": 0.95178437, + "learning_rate": 0.0009708212103750737, + "loss": 0.96325481, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 0.42382812, + "step": 707, + "time_per_iteration": 2.6138036251068115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153312, + "balance_loss_mlp": 1.1095618, + "epoch": 0.13620623316660255, + "flos": 659081152512.0, + "grad_norm": 0.07708082078191984, + "language_loss": 0.91549516, + "learning_rate": 0.0009707162492221051, + "loss": 0.9270283, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 0.43725586, + "step": 708, + "time_per_iteration": 2.879612684249878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143626, + "balance_loss_mlp": 1.10121179, + "epoch": 0.1363986148518661, + "flos": 671882563584.0, + "grad_norm": 0.08764140181907645, + "language_loss": 0.92509496, + "learning_rate": 0.0009706111053208815, + "loss": 0.93653119, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 0.42431641, + "step": 709, + "time_per_iteration": 2.804469347000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156089, + "balance_loss_mlp": 1.10947847, + "epoch": 0.13659099653712967, + "flos": 473062975488.0, + "grad_norm": 0.07097269092186763, + "language_loss": 0.89579999, + "learning_rate": 0.0009705057787122232, + "loss": 0.90736091, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 0.46630859, + "step": 710, + "time_per_iteration": 2.568406105041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174212, + "balance_loss_mlp": 1.12874603, + "epoch": 0.13678337822239323, + "flos": 452715734016.0, + "grad_norm": 0.06463299548184855, + "language_loss": 0.94250202, + "learning_rate": 0.0009704002694370216, + "loss": 0.9542442, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 0.45410156, + "step": 711, + "time_per_iteration": 2.525240659713745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116324, + "balance_loss_mlp": 1.11820245, + "epoch": 0.13697575990765679, + "flos": 519623133696.0, + "grad_norm": 0.06677275778781674, + "language_loss": 0.90675253, + "learning_rate": 0.0009702945775362388, + "loss": 0.91838491, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 0.45043945, + "step": 712, + "time_per_iteration": 2.572566270828247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171995, + "balance_loss_mlp": 1.12478852, + "epoch": 0.13716814159292035, + "flos": 480388921344.0, + "grad_norm": 0.06549167744569931, + "language_loss": 0.91151595, + "learning_rate": 0.0009701887030509086, + "loss": 0.92323589, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 0.47167969, + "step": 713, + "time_per_iteration": 2.645202875137329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156874, + "balance_loss_mlp": 1.11450684, + "epoch": 0.1373605232781839, + "flos": 545650670592.0, + "grad_norm": 0.07696267649297317, + "language_loss": 0.95333648, + "learning_rate": 0.0009700826460221346, + "loss": 0.96490526, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 0.42382812, + "step": 714, + "time_per_iteration": 2.649831771850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187773, + "balance_loss_mlp": 1.13980293, + "epoch": 0.1375529049634475, + "flos": 708791648256.0, + "grad_norm": 0.08597126409557068, + "language_loss": 0.96336859, + "learning_rate": 0.0009699764064910921, + "loss": 0.97524625, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 0.47998047, + "step": 715, + "time_per_iteration": 2.8645238876342773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178216, + "balance_loss_mlp": 1.1317718, + "epoch": 0.13774528664871105, + "flos": 486696936960.0, + "grad_norm": 0.08366808602410432, + "language_loss": 0.90892398, + "learning_rate": 0.0009698699844990268, + "loss": 0.92070615, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 0.46435547, + "step": 716, + "time_per_iteration": 2.635460376739502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171183, + "balance_loss_mlp": 1.12731409, + "epoch": 0.1379376683339746, + "flos": 680199275520.0, + "grad_norm": 0.051528021496160425, + "language_loss": 0.91132116, + "learning_rate": 0.0009697633800872555, + "loss": 0.923033, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 0.4387207, + "step": 717, + "time_per_iteration": 2.887854814529419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189757, + "balance_loss_mlp": 1.1432178, + "epoch": 0.13813005001923817, + "flos": 610946924544.0, + "grad_norm": 0.07388540586481528, + "language_loss": 0.94422555, + "learning_rate": 0.0009696565932971655, + "loss": 0.95612311, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 0.46557617, + "step": 718, + "time_per_iteration": 2.8565313816070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171127, + "balance_loss_mlp": 1.12580407, + "epoch": 0.13832243170450173, + "flos": 588729378816.0, + "grad_norm": 0.06166568969162735, + "language_loss": 0.92794299, + "learning_rate": 0.0009695496241702153, + "loss": 0.93965423, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 0.45361328, + "step": 719, + "time_per_iteration": 2.827193021774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178611, + "balance_loss_mlp": 1.13152349, + "epoch": 0.1385148133897653, + "flos": 700002860544.0, + "grad_norm": 0.07046673128739296, + "language_loss": 0.8903814, + "learning_rate": 0.0009694424727479339, + "loss": 0.9021675, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 0.47094727, + "step": 720, + "time_per_iteration": 2.958855628967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011684, + "balance_loss_mlp": 1.12150323, + "epoch": 0.13870719507502885, + "flos": 598254543360.0, + "grad_norm": 0.07332050167219753, + "language_loss": 0.91946507, + "learning_rate": 0.0009693351390719213, + "loss": 0.93114913, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 0.46899414, + "step": 721, + "time_per_iteration": 2.6910197734832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012083, + "balance_loss_mlp": 1.15742183, + "epoch": 0.1388995767602924, + "flos": 586572378624.0, + "grad_norm": 0.06188248769550966, + "language_loss": 0.93531096, + "learning_rate": 0.000969227623183848, + "loss": 0.94739395, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 0.50830078, + "step": 722, + "time_per_iteration": 2.791097640991211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119483, + "balance_loss_mlp": 1.14776587, + "epoch": 0.139091958445556, + "flos": 651120145920.0, + "grad_norm": 0.06666345220966835, + "language_loss": 0.93550557, + "learning_rate": 0.0009691199251254554, + "loss": 0.94745386, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 0.47045898, + "step": 723, + "time_per_iteration": 2.8282151222229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173107, + "balance_loss_mlp": 1.13059711, + "epoch": 0.13928434013081956, + "flos": 575737818624.0, + "grad_norm": 0.07191970231420823, + "language_loss": 0.88703346, + "learning_rate": 0.0009690120449385555, + "loss": 0.89876461, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 0.42504883, + "step": 724, + "time_per_iteration": 2.775456190109253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158197, + "balance_loss_mlp": 1.11332655, + "epoch": 0.13947672181608312, + "flos": 563225674752.0, + "grad_norm": 0.06680700276551169, + "language_loss": 0.95181078, + "learning_rate": 0.0009689039826650312, + "loss": 0.96339279, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 0.44824219, + "step": 725, + "time_per_iteration": 2.7623417377471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164951, + "balance_loss_mlp": 1.12756717, + "epoch": 0.13966910350134668, + "flos": 1521546964992.0, + "grad_norm": 0.03995326528410751, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.77688015, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 0.37304688, + "step": 726, + "time_per_iteration": 4.914167642593384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146003, + "balance_loss_mlp": 1.09567261, + "epoch": 0.13986148518661023, + "flos": 499854053376.0, + "grad_norm": 0.07822541163530779, + "language_loss": 0.90488958, + "learning_rate": 0.0009686873120259941, + "loss": 0.91634959, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 0.50341797, + "step": 727, + "time_per_iteration": 2.563333749771118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132914, + "balance_loss_mlp": 1.09092879, + "epoch": 0.1400538668718738, + "flos": 598674488832.0, + "grad_norm": 0.0725242002086287, + "language_loss": 0.89161742, + "learning_rate": 0.0009685787037446004, + "loss": 0.90294659, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 0.41992188, + "step": 728, + "time_per_iteration": 2.7803192138671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137226, + "balance_loss_mlp": 1.09192598, + "epoch": 0.14024624855713735, + "flos": 594039287808.0, + "grad_norm": 0.10183800223701604, + "language_loss": 0.9064362, + "learning_rate": 0.0009684699135448201, + "loss": 0.91780847, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 0.453125, + "step": 729, + "time_per_iteration": 2.750023603439331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142476, + "balance_loss_mlp": 1.0995841, + "epoch": 0.1404386302424009, + "flos": 506584585728.0, + "grad_norm": 0.06503689668024501, + "language_loss": 0.94054115, + "learning_rate": 0.0009683609414688895, + "loss": 0.95196593, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 0.42895508, + "step": 730, + "time_per_iteration": 2.708470344543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116576, + "balance_loss_mlp": 1.11652613, + "epoch": 0.14063101192766447, + "flos": 573407921664.0, + "grad_norm": 0.07277464462784268, + "language_loss": 0.89072424, + "learning_rate": 0.0009682517875591154, + "loss": 0.9023819, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 0.49243164, + "step": 731, + "time_per_iteration": 2.734145402908325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173563, + "balance_loss_mlp": 1.12640429, + "epoch": 0.14082339361292806, + "flos": 564619133952.0, + "grad_norm": 0.08810260071203486, + "language_loss": 0.88790858, + "learning_rate": 0.0009681424518578749, + "loss": 0.8996442, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 0.47192383, + "step": 732, + "time_per_iteration": 2.7139203548431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166119, + "balance_loss_mlp": 1.11900759, + "epoch": 0.14101577529819162, + "flos": 463584798720.0, + "grad_norm": 0.07053265121681873, + "language_loss": 0.9010576, + "learning_rate": 0.000968032934407616, + "loss": 0.91271877, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 0.47143555, + "step": 733, + "time_per_iteration": 2.625128746032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161975, + "balance_loss_mlp": 1.11514974, + "epoch": 0.14120815698345518, + "flos": 596085060096.0, + "grad_norm": 0.08143861058365946, + "language_loss": 0.84579933, + "learning_rate": 0.0009679232352508571, + "loss": 0.85741913, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 0.46850586, + "step": 734, + "time_per_iteration": 2.7461798191070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145124, + "balance_loss_mlp": 1.10046864, + "epoch": 0.14140053866871874, + "flos": 535137311232.0, + "grad_norm": 0.0788084271092868, + "language_loss": 0.83272535, + "learning_rate": 0.0009678133544301871, + "loss": 0.84417665, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 0.44677734, + "step": 735, + "time_per_iteration": 2.68129301071167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130971, + "balance_loss_mlp": 1.08731616, + "epoch": 0.1415929203539823, + "flos": 520265534976.0, + "grad_norm": 0.05044431767963513, + "language_loss": 0.93706036, + "learning_rate": 0.0009677032919882658, + "loss": 0.94837004, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 0.43652344, + "step": 736, + "time_per_iteration": 2.663874387741089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141167, + "balance_loss_mlp": 1.0970124, + "epoch": 0.14178530203924586, + "flos": 482335948800.0, + "grad_norm": 0.07155994363363784, + "language_loss": 0.94151366, + "learning_rate": 0.000967593047967823, + "loss": 0.95292532, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 0.44116211, + "step": 737, + "time_per_iteration": 2.512871265411377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150426, + "balance_loss_mlp": 1.10376751, + "epoch": 0.14197768372450942, + "flos": 676638904320.0, + "grad_norm": 0.07145762863961741, + "language_loss": 0.89657855, + "learning_rate": 0.0009674826224116593, + "loss": 0.90808284, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 0.46655273, + "step": 738, + "time_per_iteration": 2.797337293624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145365, + "balance_loss_mlp": 1.09865868, + "epoch": 0.14217006540977298, + "flos": 446039529984.0, + "grad_norm": 0.07589062836694223, + "language_loss": 0.89765012, + "learning_rate": 0.0009673720153626455, + "loss": 0.90910375, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 0.46728516, + "step": 739, + "time_per_iteration": 2.5743062496185303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113843, + "balance_loss_mlp": 1.09274864, + "epoch": 0.14236244709503657, + "flos": 496503654912.0, + "grad_norm": 0.07239717331604524, + "language_loss": 0.89863205, + "learning_rate": 0.0009672612268637235, + "loss": 0.9100163, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 0.45678711, + "step": 740, + "time_per_iteration": 2.6074059009552 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125723, + "balance_loss_mlp": 1.08125818, + "epoch": 0.14255482878030012, + "flos": 648313403904.0, + "grad_norm": 0.08552249660547784, + "language_loss": 0.8725301, + "learning_rate": 0.0009671502569579048, + "loss": 0.88378727, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 0.44458008, + "step": 741, + "time_per_iteration": 2.729733467102051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116563, + "balance_loss_mlp": 1.07338512, + "epoch": 0.14274721046556368, + "flos": 536165153280.0, + "grad_norm": 0.05753110737252733, + "language_loss": 0.92330521, + "learning_rate": 0.0009670391056882719, + "loss": 0.93447083, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 0.43188477, + "step": 742, + "time_per_iteration": 2.69399356842041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115871, + "balance_loss_mlp": 1.07367063, + "epoch": 0.14293959215082724, + "flos": 957057431040.0, + "grad_norm": 0.06711892894426404, + "language_loss": 0.91615599, + "learning_rate": 0.0009669277730979776, + "loss": 0.92731464, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 0.421875, + "step": 743, + "time_per_iteration": 3.1732802391052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123454, + "balance_loss_mlp": 1.079561, + "epoch": 0.1431319738360908, + "flos": 693089519616.0, + "grad_norm": 0.07488288596065623, + "language_loss": 0.88249421, + "learning_rate": 0.0009668162592302449, + "loss": 0.89372879, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 0.43896484, + "step": 744, + "time_per_iteration": 2.88962459564209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114179, + "balance_loss_mlp": 1.09551311, + "epoch": 0.14332435552135436, + "flos": 565439574528.0, + "grad_norm": 0.08170086657731683, + "language_loss": 0.8873378, + "learning_rate": 0.0009667045641283676, + "loss": 0.89875567, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 0.46289062, + "step": 745, + "time_per_iteration": 2.6374380588531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136601, + "balance_loss_mlp": 1.09158731, + "epoch": 0.14351673720661792, + "flos": 738374787072.0, + "grad_norm": 0.07376324969806651, + "language_loss": 0.9752661, + "learning_rate": 0.0009665926878357092, + "loss": 0.98663211, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 0.44995117, + "step": 746, + "time_per_iteration": 2.908377170562744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138589, + "balance_loss_mlp": 1.09283662, + "epoch": 0.14370911889188148, + "flos": 549230865408.0, + "grad_norm": 0.055840413500964095, + "language_loss": 0.93229979, + "learning_rate": 0.0009664806303957043, + "loss": 0.94368571, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 0.45751953, + "step": 747, + "time_per_iteration": 2.6940197944641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116452, + "balance_loss_mlp": 1.11397541, + "epoch": 0.14390150057714507, + "flos": 590295734784.0, + "grad_norm": 0.07422855656653271, + "language_loss": 0.89923358, + "learning_rate": 0.0009663683918518571, + "loss": 0.91087878, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 0.50463867, + "step": 748, + "time_per_iteration": 2.8905599117279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162632, + "balance_loss_mlp": 1.10977423, + "epoch": 0.14409388226240863, + "flos": 591047165952.0, + "grad_norm": 0.06951396400432043, + "language_loss": 0.88074797, + "learning_rate": 0.0009662559722477428, + "loss": 0.89237428, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 0.52880859, + "step": 749, + "time_per_iteration": 2.6684882640838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111749, + "balance_loss_mlp": 1.09059644, + "epoch": 0.1442862639476722, + "flos": 1511263401984.0, + "grad_norm": 0.031134761916572575, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77280462, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 0.26953125, + "step": 750, + "time_per_iteration": 4.978729009628296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141518, + "balance_loss_mlp": 1.09359622, + "epoch": 0.14447864563293575, + "flos": 496765384704.0, + "grad_norm": 0.06451546089111031, + "language_loss": 0.9124738, + "learning_rate": 0.0009660305900333632, + "loss": 0.92388898, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 0.47973633, + "step": 751, + "time_per_iteration": 2.6556403636932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145582, + "balance_loss_mlp": 1.09849465, + "epoch": 0.1446710273181993, + "flos": 589678299648.0, + "grad_norm": 0.08083819383046088, + "language_loss": 0.8480792, + "learning_rate": 0.0009659176275105992, + "loss": 0.85953498, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 0.47070312, + "step": 752, + "time_per_iteration": 2.6868016719818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154886, + "balance_loss_mlp": 1.10667825, + "epoch": 0.14486340900346287, + "flos": 585818749440.0, + "grad_norm": 0.0601727082776222, + "language_loss": 0.87400204, + "learning_rate": 0.0009658044841025701, + "loss": 0.88555086, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 0.48217773, + "step": 753, + "time_per_iteration": 2.7701456546783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189813, + "balance_loss_mlp": 1.136765, + "epoch": 0.14505579068872643, + "flos": 504672062976.0, + "grad_norm": 0.0800468655776831, + "language_loss": 0.83957088, + "learning_rate": 0.0009656911598532021, + "loss": 0.85146904, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 0.53051758, + "step": 754, + "time_per_iteration": 2.630211353302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192149, + "balance_loss_mlp": 1.13943434, + "epoch": 0.14524817237399, + "flos": 486815505408.0, + "grad_norm": 0.0631545589319864, + "language_loss": 0.9278729, + "learning_rate": 0.0009655776548064917, + "loss": 0.93979442, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 0.52758789, + "step": 755, + "time_per_iteration": 2.6447510719299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176758, + "balance_loss_mlp": 1.12506902, + "epoch": 0.14544055405925355, + "flos": 728175287808.0, + "grad_norm": 0.06497808848967317, + "language_loss": 0.90460694, + "learning_rate": 0.0009654639690065054, + "loss": 0.91637456, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 0.51708984, + "step": 756, + "time_per_iteration": 2.910578727722168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116602, + "balance_loss_mlp": 1.11785972, + "epoch": 0.14563293574451713, + "flos": 593643935232.0, + "grad_norm": 0.0580393303136577, + "language_loss": 0.90340179, + "learning_rate": 0.00096535010249738, + "loss": 0.91506201, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 0.48120117, + "step": 757, + "time_per_iteration": 2.7232277393341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149847, + "balance_loss_mlp": 1.10092402, + "epoch": 0.1458253174297807, + "flos": 560478030336.0, + "grad_norm": 0.07370663524734816, + "language_loss": 0.8531146, + "learning_rate": 0.0009652360553233224, + "loss": 0.86461306, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 0.48901367, + "step": 758, + "time_per_iteration": 2.7501397132873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064081, + "balance_loss_mlp": 1.03528047, + "epoch": 0.14601769911504425, + "flos": 1557855866880.0, + "grad_norm": 0.02263224740377231, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.74837828, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 0.28710938, + "step": 759, + "time_per_iteration": 4.953639268875122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150341, + "balance_loss_mlp": 1.1019187, + "epoch": 0.1462100808003078, + "flos": 866301516288.0, + "grad_norm": 0.05750780582661247, + "language_loss": 0.83513778, + "learning_rate": 0.0009650074191575883, + "loss": 0.84664118, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 0.48388672, + "step": 760, + "time_per_iteration": 3.202252149581909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152626, + "balance_loss_mlp": 1.10179496, + "epoch": 0.14640246248557137, + "flos": 522943796736.0, + "grad_norm": 0.05303129095981597, + "language_loss": 0.88240772, + "learning_rate": 0.0009648928302546766, + "loss": 0.89393395, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 0.50878906, + "step": 761, + "time_per_iteration": 2.65380859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147286, + "balance_loss_mlp": 1.09960222, + "epoch": 0.14659484417083493, + "flos": 1030544487936.0, + "grad_norm": 0.06114398209353547, + "language_loss": 0.87573165, + "learning_rate": 0.0009647780608643613, + "loss": 0.88720453, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 0.47705078, + "step": 762, + "time_per_iteration": 3.3394339084625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153999, + "balance_loss_mlp": 1.10831833, + "epoch": 0.1467872258560985, + "flos": 500671922688.0, + "grad_norm": 0.09093438426480749, + "language_loss": 0.90765309, + "learning_rate": 0.0009646631110312001, + "loss": 0.91919315, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 0.45678711, + "step": 763, + "time_per_iteration": 2.622671604156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157806, + "balance_loss_mlp": 1.11200595, + "epoch": 0.14697960754136205, + "flos": 547797758976.0, + "grad_norm": 0.047784585244551814, + "language_loss": 0.90468627, + "learning_rate": 0.0009645479807998203, + "loss": 0.91626436, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 0.45751953, + "step": 764, + "time_per_iteration": 2.7322580814361572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156125, + "balance_loss_mlp": 1.11487842, + "epoch": 0.14717198922662564, + "flos": 517849003008.0, + "grad_norm": 0.06523928090243644, + "language_loss": 0.94106412, + "learning_rate": 0.0009644326702149196, + "loss": 0.95262539, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 0.41235352, + "step": 765, + "time_per_iteration": 2.7013158798217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174535, + "balance_loss_mlp": 1.12761474, + "epoch": 0.1473643709118892, + "flos": 732024552960.0, + "grad_norm": 0.08055574364553787, + "language_loss": 0.86730242, + "learning_rate": 0.0009643171793212653, + "loss": 0.87904775, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 0.46923828, + "step": 766, + "time_per_iteration": 3.083709478378296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162662, + "balance_loss_mlp": 1.11473966, + "epoch": 0.14755675259715276, + "flos": 620538900480.0, + "grad_norm": 0.07722330054572468, + "language_loss": 0.92188174, + "learning_rate": 0.0009642015081636952, + "loss": 0.93350834, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 0.47949219, + "step": 767, + "time_per_iteration": 2.6836585998535156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161372, + "balance_loss_mlp": 1.1132586, + "epoch": 0.14774913428241632, + "flos": 452219065344.0, + "grad_norm": 0.07123168873353844, + "language_loss": 0.90995437, + "learning_rate": 0.0009640856567871166, + "loss": 0.9215681, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 0.48168945, + "step": 768, + "time_per_iteration": 2.543670177459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156907, + "balance_loss_mlp": 1.10626745, + "epoch": 0.14794151596767988, + "flos": 837234869760.0, + "grad_norm": 0.07039727350928661, + "language_loss": 0.9123286, + "learning_rate": 0.0009639696252365072, + "loss": 0.92389768, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 0.50634766, + "step": 769, + "time_per_iteration": 3.027188539505005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146634, + "balance_loss_mlp": 1.10326576, + "epoch": 0.14813389765294344, + "flos": 686092114944.0, + "grad_norm": 0.06094559984807647, + "language_loss": 0.83659029, + "learning_rate": 0.0009638534135569144, + "loss": 0.84805667, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 0.43359375, + "step": 770, + "time_per_iteration": 2.9126267433166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140788, + "balance_loss_mlp": 1.09489226, + "epoch": 0.148326279338207, + "flos": 509887996416.0, + "grad_norm": 0.06702358278695762, + "language_loss": 0.92293191, + "learning_rate": 0.0009637370217934554, + "loss": 0.93433982, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 0.45922852, + "step": 771, + "time_per_iteration": 2.6426541805267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129633, + "balance_loss_mlp": 1.08600211, + "epoch": 0.14851866102347056, + "flos": 588161129472.0, + "grad_norm": 0.04968709901212579, + "language_loss": 0.84857935, + "learning_rate": 0.0009636204499913175, + "loss": 0.85987568, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 0.43603516, + "step": 772, + "time_per_iteration": 2.830029010772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122236, + "balance_loss_mlp": 1.08478057, + "epoch": 0.14871104270873411, + "flos": 691026494976.0, + "grad_norm": 0.06444605868824185, + "language_loss": 0.90028566, + "learning_rate": 0.0009635036981957581, + "loss": 0.91150796, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 0.37451172, + "step": 773, + "time_per_iteration": 2.850893259048462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128047, + "balance_loss_mlp": 1.08546507, + "epoch": 0.1489034243939977, + "flos": 655098264576.0, + "grad_norm": 0.07558916443605426, + "language_loss": 0.92137265, + "learning_rate": 0.0009633867664521043, + "loss": 0.93265319, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 0.42553711, + "step": 774, + "time_per_iteration": 2.8405416011810303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154281, + "balance_loss_mlp": 1.10614467, + "epoch": 0.14909580607926126, + "flos": 475835212800.0, + "grad_norm": 0.07793461844194936, + "language_loss": 0.8938297, + "learning_rate": 0.0009632696548057527, + "loss": 0.9053725, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 0.48168945, + "step": 775, + "time_per_iteration": 2.5543088912963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158921, + "balance_loss_mlp": 1.11419404, + "epoch": 0.14928818776452482, + "flos": 611087887872.0, + "grad_norm": 0.07948352168051111, + "language_loss": 0.86982578, + "learning_rate": 0.0009631523633021704, + "loss": 0.88141501, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 0.44702148, + "step": 776, + "time_per_iteration": 2.8373982906341553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151944, + "balance_loss_mlp": 1.10726452, + "epoch": 0.14948056944978838, + "flos": 561772744704.0, + "grad_norm": 0.07613081492567164, + "language_loss": 0.90593684, + "learning_rate": 0.0009630348919868936, + "loss": 0.91745627, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 0.4465332, + "step": 777, + "time_per_iteration": 2.688340187072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164011, + "balance_loss_mlp": 1.1162796, + "epoch": 0.14967295113505194, + "flos": 449199779328.0, + "grad_norm": 0.07284380806791231, + "language_loss": 0.83743048, + "learning_rate": 0.0009629172409055293, + "loss": 0.84907055, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 0.47753906, + "step": 778, + "time_per_iteration": 2.496121406555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173184, + "balance_loss_mlp": 1.13260555, + "epoch": 0.1498653328203155, + "flos": 571285426176.0, + "grad_norm": 0.0582041699055768, + "language_loss": 0.89173234, + "learning_rate": 0.0009627994101037531, + "loss": 0.9034642, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 0.40576172, + "step": 779, + "time_per_iteration": 2.7287445068359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116917, + "balance_loss_mlp": 1.12670779, + "epoch": 0.15005771450557906, + "flos": 631215244800.0, + "grad_norm": 0.06429714570378213, + "language_loss": 0.91374522, + "learning_rate": 0.0009626813996273114, + "loss": 0.92543697, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 0.42431641, + "step": 780, + "time_per_iteration": 2.8357532024383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174237, + "balance_loss_mlp": 1.13258517, + "epoch": 0.15025009619084262, + "flos": 577939235328.0, + "grad_norm": 0.07735356487079731, + "language_loss": 0.90820873, + "learning_rate": 0.0009625632095220198, + "loss": 0.91995108, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 0.41625977, + "step": 781, + "time_per_iteration": 2.8360986709594727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165333, + "balance_loss_mlp": 1.12408686, + "epoch": 0.1504424778761062, + "flos": 483887623680.0, + "grad_norm": 0.07591811383481707, + "language_loss": 0.88784671, + "learning_rate": 0.0009624448398337637, + "loss": 0.89950007, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 0.41259766, + "step": 782, + "time_per_iteration": 2.550873041152954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138762, + "balance_loss_mlp": 1.09920812, + "epoch": 0.15063485956136977, + "flos": 762512196096.0, + "grad_norm": 0.06500535683801296, + "language_loss": 0.90907973, + "learning_rate": 0.0009623262906084984, + "loss": 0.92046738, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 0.39550781, + "step": 783, + "time_per_iteration": 3.002237319946289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127947, + "balance_loss_mlp": 1.08622408, + "epoch": 0.15082724124663333, + "flos": 497630241792.0, + "grad_norm": 0.06722303964642193, + "language_loss": 0.92323947, + "learning_rate": 0.0009622075618922486, + "loss": 0.93451893, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 0.41699219, + "step": 784, + "time_per_iteration": 2.669541120529175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117088, + "balance_loss_mlp": 1.07636571, + "epoch": 0.15101962293189689, + "flos": 509725011456.0, + "grad_norm": 0.06286377137641418, + "language_loss": 0.88948303, + "learning_rate": 0.0009620886537311091, + "loss": 0.90065384, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 0.40722656, + "step": 785, + "time_per_iteration": 2.6505391597747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132411, + "balance_loss_mlp": 1.08563375, + "epoch": 0.15121200461716044, + "flos": 457756199424.0, + "grad_norm": 0.06858268632652799, + "language_loss": 0.87318397, + "learning_rate": 0.000961969566171244, + "loss": 0.88450807, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 0.46777344, + "step": 786, + "time_per_iteration": 2.5492002964019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143812, + "balance_loss_mlp": 1.10037243, + "epoch": 0.151404386302424, + "flos": 537986271744.0, + "grad_norm": 0.06762455123923776, + "language_loss": 0.9226557, + "learning_rate": 0.0009618502992588873, + "loss": 0.93409383, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 0.43481445, + "step": 787, + "time_per_iteration": 2.6596381664276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153043, + "balance_loss_mlp": 1.10714722, + "epoch": 0.15159676798768756, + "flos": 688209467904.0, + "grad_norm": 0.07210135364095939, + "language_loss": 0.90213263, + "learning_rate": 0.0009617308530403424, + "loss": 0.91366303, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 0.45922852, + "step": 788, + "time_per_iteration": 2.9965012073516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133874, + "balance_loss_mlp": 1.09358144, + "epoch": 0.15178914967295112, + "flos": 545319558144.0, + "grad_norm": 0.0646084728999688, + "language_loss": 0.89177096, + "learning_rate": 0.0009616112275619825, + "loss": 0.90310967, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 0.40283203, + "step": 789, + "time_per_iteration": 2.702927350997925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128515, + "balance_loss_mlp": 1.08760214, + "epoch": 0.1519815313582147, + "flos": 511770783744.0, + "grad_norm": 0.04914514873585108, + "language_loss": 0.85434246, + "learning_rate": 0.0009614914228702503, + "loss": 0.86562753, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 0.40917969, + "step": 790, + "time_per_iteration": 2.734309196472168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120439, + "balance_loss_mlp": 1.08031344, + "epoch": 0.15217391304347827, + "flos": 684088187904.0, + "grad_norm": 0.0510031662309952, + "language_loss": 0.90581405, + "learning_rate": 0.0009613714390116581, + "loss": 0.91701841, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 0.40112305, + "step": 791, + "time_per_iteration": 2.9846036434173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119178, + "balance_loss_mlp": 1.07890868, + "epoch": 0.15236629472874183, + "flos": 644186981376.0, + "grad_norm": 0.06466161117660295, + "language_loss": 0.87842512, + "learning_rate": 0.0009612512760327879, + "loss": 0.88961697, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 0.40283203, + "step": 792, + "time_per_iteration": 2.879507303237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112132, + "balance_loss_mlp": 1.0749234, + "epoch": 0.1525586764140054, + "flos": 412876196352.0, + "grad_norm": 0.06761791569724282, + "language_loss": 0.86834276, + "learning_rate": 0.0009611309339802909, + "loss": 0.87955594, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 0.46435547, + "step": 793, + "time_per_iteration": 2.4628419876098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125819, + "balance_loss_mlp": 1.08180666, + "epoch": 0.15275105809926895, + "flos": 802801414656.0, + "grad_norm": 0.06955338926819006, + "language_loss": 0.85776877, + "learning_rate": 0.0009610104129008881, + "loss": 0.86902696, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 0.43994141, + "step": 794, + "time_per_iteration": 3.1157610416412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112048, + "balance_loss_mlp": 1.07751703, + "epoch": 0.1529434397845325, + "flos": 612422249472.0, + "grad_norm": 0.0812849574801687, + "language_loss": 0.89832217, + "learning_rate": 0.0009608897128413701, + "loss": 0.90952694, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 0.4296875, + "step": 795, + "time_per_iteration": 2.7580387592315674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121281, + "balance_loss_mlp": 1.08070254, + "epoch": 0.15313582146979607, + "flos": 615246243840.0, + "grad_norm": 0.07320179377966478, + "language_loss": 0.87414771, + "learning_rate": 0.0009607688338485965, + "loss": 0.88536048, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 0.40576172, + "step": 796, + "time_per_iteration": 2.8428006172180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112803, + "balance_loss_mlp": 1.08358848, + "epoch": 0.15332820315505963, + "flos": 793602593280.0, + "grad_norm": 0.08676784428227541, + "language_loss": 0.92063487, + "learning_rate": 0.0009606477759694969, + "loss": 0.93191516, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 0.4440918, + "step": 797, + "time_per_iteration": 3.0136139392852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129989, + "balance_loss_mlp": 1.08547592, + "epoch": 0.1535205848403232, + "flos": 550206950400.0, + "grad_norm": 0.07379760567815713, + "language_loss": 0.89430279, + "learning_rate": 0.0009605265392510703, + "loss": 0.90560269, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 0.44555664, + "step": 798, + "time_per_iteration": 2.604297161102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114561, + "balance_loss_mlp": 1.10169339, + "epoch": 0.15371296652558677, + "flos": 535947840000.0, + "grad_norm": 0.06797963908333281, + "language_loss": 0.93481082, + "learning_rate": 0.0009604051237403846, + "loss": 0.94626689, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 0.43896484, + "step": 799, + "time_per_iteration": 2.613255262374878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167168, + "balance_loss_mlp": 1.1217972, + "epoch": 0.15390534821085033, + "flos": 395219699712.0, + "grad_norm": 0.06891264186704958, + "language_loss": 0.88271165, + "learning_rate": 0.0009602835294845776, + "loss": 0.89438331, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 0.45361328, + "step": 800, + "time_per_iteration": 2.4739739894866943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011684, + "balance_loss_mlp": 1.12188447, + "epoch": 0.1540977298961139, + "flos": 535846523904.0, + "grad_norm": 0.06820302888180714, + "language_loss": 0.91848779, + "learning_rate": 0.0009601617565308565, + "loss": 0.93017173, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 0.46557617, + "step": 801, + "time_per_iteration": 2.599102020263672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196072, + "balance_loss_mlp": 1.14941311, + "epoch": 0.15429011158137745, + "flos": 723727664640.0, + "grad_norm": 0.08155438121007776, + "language_loss": 0.88506758, + "learning_rate": 0.0009600398049264977, + "loss": 0.89702827, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 0.46679688, + "step": 802, + "time_per_iteration": 2.9645981788635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193217, + "balance_loss_mlp": 1.14574742, + "epoch": 0.154482493266641, + "flos": 620516505600.0, + "grad_norm": 0.10468166660144326, + "language_loss": 0.93512642, + "learning_rate": 0.0009599176747188469, + "loss": 0.94705856, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 0.47485352, + "step": 803, + "time_per_iteration": 2.7997000217437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160191, + "balance_loss_mlp": 1.11856318, + "epoch": 0.15467487495190457, + "flos": 525624629760.0, + "grad_norm": 0.07174757520021151, + "language_loss": 0.84728193, + "learning_rate": 0.0009597953659553196, + "loss": 0.85888386, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 0.41625977, + "step": 804, + "time_per_iteration": 2.700530529022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133685, + "balance_loss_mlp": 1.09408379, + "epoch": 0.15486725663716813, + "flos": 527729872896.0, + "grad_norm": 0.4143347029392257, + "language_loss": 0.9033978, + "learning_rate": 0.0009596728786833997, + "loss": 0.91473466, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 0.39575195, + "step": 805, + "time_per_iteration": 2.6122889518737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150784, + "balance_loss_mlp": 1.10772574, + "epoch": 0.1550596383224317, + "flos": 1048549349376.0, + "grad_norm": 0.061887733402931855, + "language_loss": 0.91321814, + "learning_rate": 0.0009595502129506415, + "loss": 0.92472601, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 0.43066406, + "step": 806, + "time_per_iteration": 3.336061716079712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180508, + "balance_loss_mlp": 1.13694847, + "epoch": 0.15525202000769528, + "flos": 613716963840.0, + "grad_norm": 0.06807019640067784, + "language_loss": 0.84292483, + "learning_rate": 0.0009594273688046678, + "loss": 0.85472989, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 0.43579102, + "step": 807, + "time_per_iteration": 2.709182024002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210101, + "balance_loss_mlp": 1.15960383, + "epoch": 0.15544440169295884, + "flos": 533064374784.0, + "grad_norm": 0.0856522073787927, + "language_loss": 0.8780278, + "learning_rate": 0.000959304346293171, + "loss": 0.89012885, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 0.50512695, + "step": 808, + "time_per_iteration": 2.6307153701782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236008, + "balance_loss_mlp": 1.18305564, + "epoch": 0.1556367833782224, + "flos": 644723297280.0, + "grad_norm": 0.09531038088821206, + "language_loss": 0.90107393, + "learning_rate": 0.0009591811454639125, + "loss": 0.91343403, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 0.52954102, + "step": 809, + "time_per_iteration": 2.742725372314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197955, + "balance_loss_mlp": 1.15184498, + "epoch": 0.15582916506348596, + "flos": 543822211584.0, + "grad_norm": 0.06212883071305714, + "language_loss": 0.902493, + "learning_rate": 0.0009590577663647234, + "loss": 0.91447246, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 0.4609375, + "step": 810, + "time_per_iteration": 2.711411237716675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187108, + "balance_loss_mlp": 1.13837492, + "epoch": 0.15602154674874952, + "flos": 580034566656.0, + "grad_norm": 0.06321996034865444, + "language_loss": 0.88015836, + "learning_rate": 0.0009589342090435036, + "loss": 0.8920294, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 0.48779297, + "step": 811, + "time_per_iteration": 2.763784170150757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173697, + "balance_loss_mlp": 1.12610841, + "epoch": 0.15621392843401308, + "flos": 535248539136.0, + "grad_norm": 0.07315119709604147, + "language_loss": 0.89953744, + "learning_rate": 0.0009588104735482223, + "loss": 0.91127443, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 0.47631836, + "step": 812, + "time_per_iteration": 2.645106077194214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169234, + "balance_loss_mlp": 1.12019134, + "epoch": 0.15640631011927664, + "flos": 550903680000.0, + "grad_norm": 0.06895714089970095, + "language_loss": 0.86002952, + "learning_rate": 0.0009586865599269177, + "loss": 0.87172186, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 0.49047852, + "step": 813, + "time_per_iteration": 2.6313953399658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144128, + "balance_loss_mlp": 1.09851837, + "epoch": 0.1565986918045402, + "flos": 637478843904.0, + "grad_norm": 0.06467027207336487, + "language_loss": 0.90443802, + "learning_rate": 0.0009585624682276977, + "loss": 0.91587937, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 0.45605469, + "step": 814, + "time_per_iteration": 2.7377047538757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144046, + "balance_loss_mlp": 1.09705353, + "epoch": 0.15679107348980378, + "flos": 490810876416.0, + "grad_norm": 0.06824176290368998, + "language_loss": 0.89156437, + "learning_rate": 0.0009584381984987386, + "loss": 0.90300483, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 0.47021484, + "step": 815, + "time_per_iteration": 2.5524120330810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134862, + "balance_loss_mlp": 1.09225655, + "epoch": 0.15698345517506734, + "flos": 529951113216.0, + "grad_norm": 0.061358262400161866, + "language_loss": 0.92449033, + "learning_rate": 0.0009583137507882864, + "loss": 0.93583906, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 0.42626953, + "step": 816, + "time_per_iteration": 2.699207305908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134768, + "balance_loss_mlp": 1.08698916, + "epoch": 0.1571758368603309, + "flos": 546038682624.0, + "grad_norm": 0.06309616730716378, + "language_loss": 0.82620019, + "learning_rate": 0.000958189125144656, + "loss": 0.8375479, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 0.47851562, + "step": 817, + "time_per_iteration": 2.6626293659210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142354, + "balance_loss_mlp": 1.09493256, + "epoch": 0.15736821854559446, + "flos": 565649547264.0, + "grad_norm": 0.08013787804574789, + "language_loss": 0.90297949, + "learning_rate": 0.0009580643216162313, + "loss": 0.91440302, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 0.47436523, + "step": 818, + "time_per_iteration": 2.6708288192749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143086, + "balance_loss_mlp": 1.09368527, + "epoch": 0.15756060023085802, + "flos": 500956047360.0, + "grad_norm": 0.06582812199168771, + "language_loss": 0.82167578, + "learning_rate": 0.0009579393402514652, + "loss": 0.83310658, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 0.49389648, + "step": 819, + "time_per_iteration": 2.577592611312866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142904, + "balance_loss_mlp": 1.09898734, + "epoch": 0.15775298191612158, + "flos": 519264857088.0, + "grad_norm": 0.07647809261390527, + "language_loss": 0.92362559, + "learning_rate": 0.0009578141810988801, + "loss": 0.93505466, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 0.43920898, + "step": 820, + "time_per_iteration": 2.5464515686035156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152369, + "balance_loss_mlp": 1.10678363, + "epoch": 0.15794536360138514, + "flos": 466129810944.0, + "grad_norm": 0.07136182637629812, + "language_loss": 0.92042351, + "learning_rate": 0.0009576888442070668, + "loss": 0.93194717, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 0.45556641, + "step": 821, + "time_per_iteration": 2.5755786895751953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114609, + "balance_loss_mlp": 1.10288835, + "epoch": 0.1581377452866487, + "flos": 517162185216.0, + "grad_norm": 0.08295395391365894, + "language_loss": 0.94583452, + "learning_rate": 0.0009575633296246854, + "loss": 0.95729542, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 0.43212891, + "step": 822, + "time_per_iteration": 2.5701425075531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162298, + "balance_loss_mlp": 1.11821485, + "epoch": 0.15833012697191226, + "flos": 549784433664.0, + "grad_norm": 0.06548151577025092, + "language_loss": 0.85385978, + "learning_rate": 0.0009574376374004652, + "loss": 0.86548281, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 0.44116211, + "step": 823, + "time_per_iteration": 2.622905731201172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169738, + "balance_loss_mlp": 1.12019491, + "epoch": 0.15852250865717585, + "flos": 487457906688.0, + "grad_norm": 0.1009087476503521, + "language_loss": 0.82624936, + "learning_rate": 0.000957311767583204, + "loss": 0.83794677, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 0.49536133, + "step": 824, + "time_per_iteration": 2.5683999061584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196161, + "balance_loss_mlp": 1.1752758, + "epoch": 0.1587148903424394, + "flos": 1309770694656.0, + "grad_norm": 0.05150472419389455, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.83267754, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 0.20898438, + "step": 825, + "time_per_iteration": 4.722898960113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176776, + "balance_loss_mlp": 1.12170124, + "epoch": 0.15890727202770297, + "flos": 466873528320.0, + "grad_norm": 0.10062471557735768, + "language_loss": 0.94017303, + "learning_rate": 0.0009570594953650961, + "loss": 0.95194077, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 0.55029297, + "step": 826, + "time_per_iteration": 2.5394840240478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173437, + "balance_loss_mlp": 1.12091362, + "epoch": 0.15909965371296653, + "flos": 777107188224.0, + "grad_norm": 0.0719939675894647, + "language_loss": 0.8219676, + "learning_rate": 0.00095693309306219, + "loss": 0.83370197, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 0.52612305, + "step": 827, + "time_per_iteration": 3.0926811695098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178421, + "balance_loss_mlp": 1.12434745, + "epoch": 0.1592920353982301, + "flos": 1078273451520.0, + "grad_norm": 0.06038838021195225, + "language_loss": 0.90083122, + "learning_rate": 0.0009568065133621244, + "loss": 0.91261542, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 0.54077148, + "step": 828, + "time_per_iteration": 3.315122604370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164888, + "balance_loss_mlp": 1.12013662, + "epoch": 0.15948441708349365, + "flos": 725622935040.0, + "grad_norm": 0.07025990147709567, + "language_loss": 0.87178355, + "learning_rate": 0.0009566797563140422, + "loss": 0.88343245, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 0.44775391, + "step": 829, + "time_per_iteration": 2.8680243492126465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116659, + "balance_loss_mlp": 1.11912107, + "epoch": 0.1596767987687572, + "flos": 578771785728.0, + "grad_norm": 0.061296828426512996, + "language_loss": 0.89984798, + "learning_rate": 0.0009565528219671547, + "loss": 0.91151381, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 0.47460938, + "step": 830, + "time_per_iteration": 2.9325318336486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160899, + "balance_loss_mlp": 1.1076839, + "epoch": 0.15986918045402077, + "flos": 528987511296.0, + "grad_norm": 0.07652275644998038, + "language_loss": 0.86699682, + "learning_rate": 0.0009564257103707418, + "loss": 0.87860584, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 0.53198242, + "step": 831, + "time_per_iteration": 2.598191976547241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184474, + "balance_loss_mlp": 1.12973261, + "epoch": 0.16006156213928435, + "flos": 574584067584.0, + "grad_norm": 0.08337472663089728, + "language_loss": 0.92543364, + "learning_rate": 0.0009562984215741533, + "loss": 0.93727839, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 0.54736328, + "step": 832, + "time_per_iteration": 2.676666736602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160796, + "balance_loss_mlp": 1.11177731, + "epoch": 0.1602539438245479, + "flos": 515541127680.0, + "grad_norm": 0.05762908483075192, + "language_loss": 0.8408711, + "learning_rate": 0.0009561709556268065, + "loss": 0.85247904, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 0.49047852, + "step": 833, + "time_per_iteration": 2.7075538635253906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162528, + "balance_loss_mlp": 1.11141133, + "epoch": 0.16044632550981147, + "flos": 621015745536.0, + "grad_norm": 0.06044842900072245, + "language_loss": 0.96042889, + "learning_rate": 0.0009560433125781884, + "loss": 0.97205412, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 0.51171875, + "step": 834, + "time_per_iteration": 2.7619521617889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144978, + "balance_loss_mlp": 1.09130979, + "epoch": 0.16063870719507503, + "flos": 561078586368.0, + "grad_norm": 0.06441579465763399, + "language_loss": 0.94159138, + "learning_rate": 0.0009559154924778544, + "loss": 0.95304114, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 0.53686523, + "step": 835, + "time_per_iteration": 2.7467222213745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112951, + "balance_loss_mlp": 1.08218372, + "epoch": 0.1608310888803386, + "flos": 805133882880.0, + "grad_norm": 0.07312538570388089, + "language_loss": 0.86469144, + "learning_rate": 0.0009557874953754284, + "loss": 0.87598646, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 0.47314453, + "step": 836, + "time_per_iteration": 3.0907793045043945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126281, + "balance_loss_mlp": 1.07618928, + "epoch": 0.16102347056560215, + "flos": 600587011584.0, + "grad_norm": 0.08101808751207061, + "language_loss": 0.85894346, + "learning_rate": 0.0009556593213206038, + "loss": 0.87020624, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 0.5012207, + "step": 837, + "time_per_iteration": 2.7060487270355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122479, + "balance_loss_mlp": 1.07765627, + "epoch": 0.1612158522508657, + "flos": 553510361088.0, + "grad_norm": 0.060960398488271, + "language_loss": 0.89031309, + "learning_rate": 0.0009555309703631414, + "loss": 0.9015379, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 0.44848633, + "step": 838, + "time_per_iteration": 2.6838622093200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131245, + "balance_loss_mlp": 1.07853079, + "epoch": 0.16140823393612927, + "flos": 555963969024.0, + "grad_norm": 0.0637381399971671, + "language_loss": 0.88547724, + "learning_rate": 0.0009554024425528722, + "loss": 0.89678967, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 0.52685547, + "step": 839, + "time_per_iteration": 2.7301504611968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124486, + "balance_loss_mlp": 1.07978272, + "epoch": 0.16160061562139286, + "flos": 543871770624.0, + "grad_norm": 0.0692663948027758, + "language_loss": 0.90811443, + "learning_rate": 0.0009552737379396948, + "loss": 0.91935933, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 0.44726562, + "step": 840, + "time_per_iteration": 2.6181893348693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129368, + "balance_loss_mlp": 1.08208978, + "epoch": 0.16179299730665642, + "flos": 603873169920.0, + "grad_norm": 0.06449676765287365, + "language_loss": 0.89640445, + "learning_rate": 0.0009551448565735767, + "loss": 0.90769809, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 0.47265625, + "step": 841, + "time_per_iteration": 2.731602907180786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135502, + "balance_loss_mlp": 1.08555281, + "epoch": 0.16198537899191998, + "flos": 787166097408.0, + "grad_norm": 0.07291825437583387, + "language_loss": 0.86443651, + "learning_rate": 0.0009550157985045543, + "loss": 0.87579155, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 0.49926758, + "step": 842, + "time_per_iteration": 3.0523600578308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113388, + "balance_loss_mlp": 1.08724499, + "epoch": 0.16217776067718354, + "flos": 519805942272.0, + "grad_norm": 0.06222432903322319, + "language_loss": 0.90556312, + "learning_rate": 0.0009548865637827321, + "loss": 0.91690183, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 0.46630859, + "step": 843, + "time_per_iteration": 2.6370396614074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113271, + "balance_loss_mlp": 1.08757734, + "epoch": 0.1623701423624471, + "flos": 505262707200.0, + "grad_norm": 0.07459586377830821, + "language_loss": 0.91347718, + "learning_rate": 0.0009547571524582838, + "loss": 0.92480427, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 0.45141602, + "step": 844, + "time_per_iteration": 2.5717859268188477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142931, + "balance_loss_mlp": 1.09460354, + "epoch": 0.16256252404771065, + "flos": 497183132160.0, + "grad_norm": 0.08463351541898638, + "language_loss": 0.94371468, + "learning_rate": 0.0009546275645814512, + "loss": 0.95514405, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 0.48339844, + "step": 845, + "time_per_iteration": 2.632861375808716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117157, + "balance_loss_mlp": 1.12107265, + "epoch": 0.16275490573297421, + "flos": 502344737280.0, + "grad_norm": 0.08033911629378378, + "language_loss": 0.92129737, + "learning_rate": 0.0009544978002025446, + "loss": 0.93301302, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 0.50561523, + "step": 846, + "time_per_iteration": 2.7044737339019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193795, + "balance_loss_mlp": 1.14096177, + "epoch": 0.16294728741823777, + "flos": 507231756288.0, + "grad_norm": 0.052695226385161484, + "language_loss": 0.88037688, + "learning_rate": 0.0009543678593719434, + "loss": 0.89231491, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 0.52880859, + "step": 847, + "time_per_iteration": 2.798231601715088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208721, + "balance_loss_mlp": 1.15734136, + "epoch": 0.16313966910350133, + "flos": 509685364224.0, + "grad_norm": 0.056853368929671785, + "language_loss": 0.88963962, + "learning_rate": 0.0009542377421400945, + "loss": 0.90172684, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 0.51391602, + "step": 848, + "time_per_iteration": 2.7955727577209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122402, + "balance_loss_mlp": 1.16584587, + "epoch": 0.16333205078876492, + "flos": 543980427264.0, + "grad_norm": 0.06352967983147602, + "language_loss": 0.85259467, + "learning_rate": 0.0009541074485575145, + "loss": 0.86483485, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 0.58154297, + "step": 849, + "time_per_iteration": 2.703871488571167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225002, + "balance_loss_mlp": 1.17088127, + "epoch": 0.16352443247402848, + "flos": 507723655680.0, + "grad_norm": 0.07774946886845908, + "language_loss": 0.93468195, + "learning_rate": 0.0009539769786747874, + "loss": 0.94693196, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 0.54125977, + "step": 850, + "time_per_iteration": 2.6687557697296143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012154, + "balance_loss_mlp": 1.16130245, + "epoch": 0.16371681415929204, + "flos": 542124804096.0, + "grad_norm": 0.057605035940766894, + "language_loss": 0.82393861, + "learning_rate": 0.0009538463325425665, + "loss": 0.83609259, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 0.54101562, + "step": 851, + "time_per_iteration": 2.751335382461548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199288, + "balance_loss_mlp": 1.1491015, + "epoch": 0.1639091958445556, + "flos": 520752291840.0, + "grad_norm": 0.06621147850271279, + "language_loss": 0.87526274, + "learning_rate": 0.0009537155102115728, + "loss": 0.88725561, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 0.50170898, + "step": 852, + "time_per_iteration": 2.568573474884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168884, + "balance_loss_mlp": 1.12236834, + "epoch": 0.16410157752981916, + "flos": 547414889472.0, + "grad_norm": 0.07419725806034035, + "language_loss": 0.85374665, + "learning_rate": 0.0009535845117325961, + "loss": 0.86543554, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 0.46533203, + "step": 853, + "time_per_iteration": 2.628973960876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137862, + "balance_loss_mlp": 1.09511375, + "epoch": 0.16429395921508272, + "flos": 582853791744.0, + "grad_norm": 0.05551255594321189, + "language_loss": 0.94495642, + "learning_rate": 0.0009534533371564946, + "loss": 0.95633507, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 0.42724609, + "step": 854, + "time_per_iteration": 2.780510902404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133546, + "balance_loss_mlp": 1.09003448, + "epoch": 0.16448634090034628, + "flos": 530934538752.0, + "grad_norm": 0.08632067881035285, + "language_loss": 0.90547508, + "learning_rate": 0.0009533219865341949, + "loss": 0.91681051, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 0.43530273, + "step": 855, + "time_per_iteration": 2.583874464035034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116377, + "balance_loss_mlp": 1.07188785, + "epoch": 0.16467872258560984, + "flos": 491890475520.0, + "grad_norm": 0.06082853882497287, + "language_loss": 0.88071746, + "learning_rate": 0.0009531904599166916, + "loss": 0.89188123, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 0.44482422, + "step": 856, + "time_per_iteration": 2.626354217529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107396, + "balance_loss_mlp": 1.06231081, + "epoch": 0.16487110427087343, + "flos": 506263385088.0, + "grad_norm": 0.0709999882269981, + "language_loss": 0.86807954, + "learning_rate": 0.0009530587573550478, + "loss": 0.87915355, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 0.45068359, + "step": 857, + "time_per_iteration": 2.5761454105377197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142125, + "balance_loss_mlp": 1.11237001, + "epoch": 0.16506348595613698, + "flos": 1432824712704.0, + "grad_norm": 0.04095057850479287, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75461513, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 0.296875, + "step": 858, + "time_per_iteration": 5.055138349533081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101038, + "balance_loss_mlp": 1.06165087, + "epoch": 0.16525586764140054, + "flos": 477129927168.0, + "grad_norm": 0.08838989258306214, + "language_loss": 0.91845137, + "learning_rate": 0.0009527948246039337, + "loss": 0.92946172, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 0.39379883, + "step": 859, + "time_per_iteration": 2.582608461380005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111338, + "balance_loss_mlp": 1.0715934, + "epoch": 0.1654482493266641, + "flos": 881096942592.0, + "grad_norm": 0.06489567580347368, + "language_loss": 0.89263308, + "learning_rate": 0.000952662594516931, + "loss": 0.90374649, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 0.39746094, + "step": 860, + "time_per_iteration": 3.067707061767578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110521, + "balance_loss_mlp": 1.07018054, + "epoch": 0.16564063101192766, + "flos": 626841773568.0, + "grad_norm": 0.055059247831062384, + "language_loss": 0.88479781, + "learning_rate": 0.0009525301886907234, + "loss": 0.89590299, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 0.40307617, + "step": 861, + "time_per_iteration": 2.8873865604400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112184, + "balance_loss_mlp": 1.07758975, + "epoch": 0.16583301269719122, + "flos": 561518355456.0, + "grad_norm": 0.06995538812096423, + "language_loss": 0.89499515, + "learning_rate": 0.0009523976071767155, + "loss": 0.90621358, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 0.44262695, + "step": 862, + "time_per_iteration": 2.6588613986968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124987, + "balance_loss_mlp": 1.08183372, + "epoch": 0.16602539438245478, + "flos": 567803976192.0, + "grad_norm": 0.06313062043432274, + "language_loss": 0.89038265, + "learning_rate": 0.00095226485002638, + "loss": 0.90163255, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 0.43115234, + "step": 863, + "time_per_iteration": 2.797896146774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113265, + "balance_loss_mlp": 1.07232881, + "epoch": 0.16621777606771834, + "flos": 574875532800.0, + "grad_norm": 0.054774526957085325, + "language_loss": 0.90381318, + "learning_rate": 0.0009521319172912576, + "loss": 0.91494584, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 0.40917969, + "step": 864, + "time_per_iteration": 2.7238612174987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126771, + "balance_loss_mlp": 1.08132839, + "epoch": 0.16641015775298193, + "flos": 514552932864.0, + "grad_norm": 0.05854649520245602, + "language_loss": 0.96491337, + "learning_rate": 0.0009519988090229579, + "loss": 0.97618109, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 0.4543457, + "step": 865, + "time_per_iteration": 2.683509111404419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123132, + "balance_loss_mlp": 1.07907248, + "epoch": 0.1666025394382455, + "flos": 621685310976.0, + "grad_norm": 0.05699467986566688, + "language_loss": 0.89545953, + "learning_rate": 0.0009518655252731576, + "loss": 0.90669084, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 0.44067383, + "step": 866, + "time_per_iteration": 2.729865550994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131126, + "balance_loss_mlp": 1.08456326, + "epoch": 0.16679492112350905, + "flos": 548808348672.0, + "grad_norm": 0.06482393342324422, + "language_loss": 0.9171015, + "learning_rate": 0.0009517320660936022, + "loss": 0.9284128, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 0.46557617, + "step": 867, + "time_per_iteration": 2.732815742492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133142, + "balance_loss_mlp": 1.08843839, + "epoch": 0.1669873028087726, + "flos": 665675864064.0, + "grad_norm": 0.06614373571764609, + "language_loss": 0.84472704, + "learning_rate": 0.0009515984315361051, + "loss": 0.85605848, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 0.44702148, + "step": 868, + "time_per_iteration": 2.796868085861206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121806, + "balance_loss_mlp": 1.07657838, + "epoch": 0.16717968449403617, + "flos": 538564432896.0, + "grad_norm": 0.08270078218547869, + "language_loss": 0.88773656, + "learning_rate": 0.000951464621652548, + "loss": 0.89895463, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 0.45239258, + "step": 869, + "time_per_iteration": 2.666438341140747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141117, + "balance_loss_mlp": 1.09751046, + "epoch": 0.16737206617929973, + "flos": 530121438720.0, + "grad_norm": 0.06072661062765564, + "language_loss": 0.80103016, + "learning_rate": 0.0009513306364948804, + "loss": 0.81244129, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 0.43579102, + "step": 870, + "time_per_iteration": 2.799009084701538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148868, + "balance_loss_mlp": 1.10373545, + "epoch": 0.1675644478645633, + "flos": 480774362112.0, + "grad_norm": 0.09261319168225486, + "language_loss": 0.90277344, + "learning_rate": 0.0009511964761151197, + "loss": 0.91426206, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 0.45117188, + "step": 871, + "time_per_iteration": 2.5934712886810303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158743, + "balance_loss_mlp": 1.1145407, + "epoch": 0.16775682954982685, + "flos": 494556627456.0, + "grad_norm": 0.06739805293344515, + "language_loss": 0.91524243, + "learning_rate": 0.0009510621405653521, + "loss": 0.92682987, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 0.44213867, + "step": 872, + "time_per_iteration": 2.5557620525360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156856, + "balance_loss_mlp": 1.11627746, + "epoch": 0.1679492112350904, + "flos": 752035912704.0, + "grad_norm": 0.06267535529199315, + "language_loss": 0.85553813, + "learning_rate": 0.0009509276298977309, + "loss": 0.86710668, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 0.40576172, + "step": 873, + "time_per_iteration": 2.9965007305145264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187981, + "balance_loss_mlp": 1.13760364, + "epoch": 0.168141592920354, + "flos": 1135875571200.0, + "grad_norm": 0.07409010972210926, + "language_loss": 0.82916558, + "learning_rate": 0.0009507929441644778, + "loss": 0.84104538, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 0.50415039, + "step": 874, + "time_per_iteration": 3.5573699474334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118449, + "balance_loss_mlp": 1.14097893, + "epoch": 0.16833397460561755, + "flos": 632401302528.0, + "grad_norm": 0.07388150752212762, + "language_loss": 0.8737148, + "learning_rate": 0.0009506580834178826, + "loss": 0.88555974, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 0.43530273, + "step": 875, + "time_per_iteration": 2.7659120559692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215839, + "balance_loss_mlp": 1.16841793, + "epoch": 0.1685263562908811, + "flos": 541445326848.0, + "grad_norm": 0.06935842584614806, + "language_loss": 0.92793226, + "learning_rate": 0.0009505230477103028, + "loss": 0.94009066, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 0.47436523, + "step": 876, + "time_per_iteration": 2.7306137084960938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226641, + "balance_loss_mlp": 1.18224776, + "epoch": 0.16871873797614467, + "flos": 619325678592.0, + "grad_norm": 0.10053146783154573, + "language_loss": 0.82997662, + "learning_rate": 0.0009503878370941641, + "loss": 0.84224302, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 0.44433594, + "step": 877, + "time_per_iteration": 2.7356183528900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211684, + "balance_loss_mlp": 1.16793382, + "epoch": 0.16891111966140823, + "flos": 606344030208.0, + "grad_norm": 0.10508781605450683, + "language_loss": 0.9020679, + "learning_rate": 0.0009502524516219595, + "loss": 0.91418481, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 0.43798828, + "step": 878, + "time_per_iteration": 2.7525370121002197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185028, + "balance_loss_mlp": 1.14232683, + "epoch": 0.1691035013466718, + "flos": 552326874624.0, + "grad_norm": 0.07887273759437702, + "language_loss": 0.91364408, + "learning_rate": 0.0009501168913462506, + "loss": 0.92549431, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 0.42724609, + "step": 879, + "time_per_iteration": 2.7009639739990234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115086, + "balance_loss_mlp": 1.11919844, + "epoch": 0.16929588303193535, + "flos": 1476294377472.0, + "grad_norm": 0.04902821320434346, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.80272782, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 0.31640625, + "step": 880, + "time_per_iteration": 4.812703609466553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116281, + "balance_loss_mlp": 1.11748707, + "epoch": 0.1694882647171989, + "flos": 926248587264.0, + "grad_norm": 0.06555145426806878, + "language_loss": 0.86756283, + "learning_rate": 0.0009498452465949042, + "loss": 0.87919092, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 0.453125, + "step": 881, + "time_per_iteration": 3.230407476425171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159747, + "balance_loss_mlp": 1.1133033, + "epoch": 0.1696806464024625, + "flos": 546093010944.0, + "grad_norm": 0.0753185527775994, + "language_loss": 0.92756218, + "learning_rate": 0.0009497091622247285, + "loss": 0.93915963, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 0.46459961, + "step": 882, + "time_per_iteration": 2.7412030696868896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141259, + "balance_loss_mlp": 1.09734213, + "epoch": 0.16987302808772606, + "flos": 529234560000.0, + "grad_norm": 0.07197762243887564, + "language_loss": 0.94941783, + "learning_rate": 0.0009495729032619723, + "loss": 0.96083045, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 0.43945312, + "step": 883, + "time_per_iteration": 2.6705245971679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141279, + "balance_loss_mlp": 1.09724283, + "epoch": 0.17006540977298962, + "flos": 755178909696.0, + "grad_norm": 0.07033792867334165, + "language_loss": 0.85310471, + "learning_rate": 0.0009494364697595354, + "loss": 0.86451751, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 0.44018555, + "step": 884, + "time_per_iteration": 2.9024457931518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115677, + "balance_loss_mlp": 1.10977769, + "epoch": 0.17025779145825318, + "flos": 558800446464.0, + "grad_norm": 0.0673266035955572, + "language_loss": 0.90739167, + "learning_rate": 0.0009492998617703867, + "loss": 0.91895938, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 0.47045898, + "step": 885, + "time_per_iteration": 2.6497459411621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151127, + "balance_loss_mlp": 1.10813999, + "epoch": 0.17045017314351674, + "flos": 512213124096.0, + "grad_norm": 0.0863252086663651, + "language_loss": 0.89101255, + "learning_rate": 0.0009491630793475619, + "loss": 0.90252388, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 0.42993164, + "step": 886, + "time_per_iteration": 2.6258063316345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159756, + "balance_loss_mlp": 1.11231089, + "epoch": 0.1706425548287803, + "flos": 508941646848.0, + "grad_norm": 0.0686214928272948, + "language_loss": 0.85993534, + "learning_rate": 0.0009490261225441643, + "loss": 0.87153292, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 0.47412109, + "step": 887, + "time_per_iteration": 2.9036519527435303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168173, + "balance_loss_mlp": 1.12370825, + "epoch": 0.17083493651404386, + "flos": 717355408896.0, + "grad_norm": 0.07914830411429463, + "language_loss": 0.91452426, + "learning_rate": 0.0009488889914133656, + "loss": 0.92620599, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 0.44458008, + "step": 888, + "time_per_iteration": 3.0038132667541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155268, + "balance_loss_mlp": 1.10706019, + "epoch": 0.17102731819930742, + "flos": 559121647104.0, + "grad_norm": 0.07300075385020723, + "language_loss": 0.90558064, + "learning_rate": 0.0009487516860084047, + "loss": 0.91713333, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 0.48193359, + "step": 889, + "time_per_iteration": 2.7158679962158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147984, + "balance_loss_mlp": 1.0996089, + "epoch": 0.17121969988457098, + "flos": 494786423808.0, + "grad_norm": 0.09172908653222724, + "language_loss": 0.90068781, + "learning_rate": 0.0009486142063825884, + "loss": 0.91216767, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 0.48364258, + "step": 890, + "time_per_iteration": 2.5330443382263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084927, + "balance_loss_mlp": 1.06175303, + "epoch": 0.17141208156983456, + "flos": 1548889413120.0, + "grad_norm": 0.031797672969882694, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.73511147, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 0.23144531, + "step": 891, + "time_per_iteration": 4.953175783157349 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167611, + "balance_loss_mlp": 1.11835372, + "epoch": 0.17160446325509812, + "flos": 619565386752.0, + "grad_norm": 0.06989736404119995, + "language_loss": 0.91231126, + "learning_rate": 0.0009483387246819542, + "loss": 0.92398739, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 0.49243164, + "step": 892, + "time_per_iteration": 2.7500009536743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010653, + "balance_loss_mlp": 1.0426023, + "epoch": 0.17179684494036168, + "flos": 1381758206976.0, + "grad_norm": 0.022698270048783192, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.83350885, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 0.2265625, + "step": 893, + "time_per_iteration": 4.662828683853149 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166562, + "balance_loss_mlp": 1.12312233, + "epoch": 0.17198922662562524, + "flos": 492636764160.0, + "grad_norm": 0.06047387129149895, + "language_loss": 0.90360647, + "learning_rate": 0.0009480625467392688, + "loss": 0.91527206, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 0.43481445, + "step": 894, + "time_per_iteration": 2.615447521209717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046079, + "balance_loss_mlp": 1.02433491, + "epoch": 0.1721816083108888, + "flos": 1458318878208.0, + "grad_norm": 0.017910617622931155, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79040754, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 0.21777344, + "step": 895, + "time_per_iteration": 4.802469968795776 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196327, + "balance_loss_mlp": 1.15264833, + "epoch": 0.17237398999615236, + "flos": 528122654208.0, + "grad_norm": 0.0591778940977726, + "language_loss": 0.88960874, + "learning_rate": 0.0009477856729834196, + "loss": 0.90157199, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 0.43652344, + "step": 896, + "time_per_iteration": 2.743036985397339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214543, + "balance_loss_mlp": 1.17217648, + "epoch": 0.17256637168141592, + "flos": 603920157696.0, + "grad_norm": 0.09709817551063968, + "language_loss": 0.91585428, + "learning_rate": 0.0009476469753098809, + "loss": 0.92799973, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 0.42358398, + "step": 897, + "time_per_iteration": 2.688457489013672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206917, + "balance_loss_mlp": 1.16080689, + "epoch": 0.17275875336667948, + "flos": 509687935488.0, + "grad_norm": 0.08785360527314089, + "language_loss": 0.87616539, + "learning_rate": 0.0009475081038443738, + "loss": 0.88823456, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 0.46118164, + "step": 898, + "time_per_iteration": 2.5958664417266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178905, + "balance_loss_mlp": 1.13436794, + "epoch": 0.17295113505194307, + "flos": 665260687872.0, + "grad_norm": 0.08099470404026293, + "language_loss": 0.87109447, + "learning_rate": 0.0009473690586408124, + "loss": 0.88288355, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 0.44482422, + "step": 899, + "time_per_iteration": 2.885279417037964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176953, + "balance_loss_mlp": 1.13184392, + "epoch": 0.17314351673720663, + "flos": 555385807872.0, + "grad_norm": 0.060075693842180825, + "language_loss": 0.87349975, + "learning_rate": 0.0009472298397531792, + "loss": 0.88526928, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 0.45141602, + "step": 900, + "time_per_iteration": 2.6987335681915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117213, + "balance_loss_mlp": 1.12244344, + "epoch": 0.17333589842247019, + "flos": 503609716224.0, + "grad_norm": 0.06597136758704356, + "language_loss": 0.87749296, + "learning_rate": 0.0009470904472355235, + "loss": 0.88921428, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 0.49707031, + "step": 901, + "time_per_iteration": 2.6920526027679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133522, + "balance_loss_mlp": 1.08898544, + "epoch": 0.17352828010773375, + "flos": 556208446464.0, + "grad_norm": 0.06929151708835651, + "language_loss": 0.8084361, + "learning_rate": 0.0009469508811419626, + "loss": 0.81977129, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 0.44555664, + "step": 902, + "time_per_iteration": 2.7087764739990234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037611, + "balance_loss_mlp": 1.01825094, + "epoch": 0.1737206617929973, + "flos": 1554525292032.0, + "grad_norm": 0.018918236495105482, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.7265144, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 0.19335938, + "step": 903, + "time_per_iteration": 4.831868648529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130376, + "balance_loss_mlp": 1.08429003, + "epoch": 0.17391304347826086, + "flos": 516662945280.0, + "grad_norm": 0.06904883588321564, + "language_loss": 0.84871197, + "learning_rate": 0.0009466712284439292, + "loss": 0.86001575, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 0.46118164, + "step": 904, + "time_per_iteration": 2.727154493331909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135856, + "balance_loss_mlp": 1.08867335, + "epoch": 0.17410542516352442, + "flos": 541049974272.0, + "grad_norm": 0.0797697294198037, + "language_loss": 0.90077758, + "learning_rate": 0.0009465311419480276, + "loss": 0.9121362, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 0.47216797, + "step": 905, + "time_per_iteration": 2.659696340560913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130539, + "balance_loss_mlp": 1.0859549, + "epoch": 0.17429780684878798, + "flos": 623849651712.0, + "grad_norm": 0.0780460064240459, + "language_loss": 0.89685637, + "learning_rate": 0.0009463908820933622, + "loss": 0.90816176, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 0.44604492, + "step": 906, + "time_per_iteration": 2.845508337020874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153159, + "balance_loss_mlp": 1.10657179, + "epoch": 0.17449018853405157, + "flos": 575663666688.0, + "grad_norm": 0.06621529993663824, + "language_loss": 0.83420271, + "learning_rate": 0.0009462504489343868, + "loss": 0.84573436, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 0.46582031, + "step": 907, + "time_per_iteration": 2.7415342330932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152101, + "balance_loss_mlp": 1.10246193, + "epoch": 0.17468257021931513, + "flos": 533753763840.0, + "grad_norm": 0.0823987818854668, + "language_loss": 0.9018122, + "learning_rate": 0.0009461098425256222, + "loss": 0.91333324, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 0.49633789, + "step": 908, + "time_per_iteration": 2.5904529094696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160396, + "balance_loss_mlp": 1.11457169, + "epoch": 0.1748749519045787, + "flos": 540758509056.0, + "grad_norm": 0.0762262609163865, + "language_loss": 0.87090451, + "learning_rate": 0.0009459690629216567, + "loss": 0.88250846, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 0.45874023, + "step": 909, + "time_per_iteration": 2.61710524559021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155202, + "balance_loss_mlp": 1.10921121, + "epoch": 0.17506733358984225, + "flos": 498623579136.0, + "grad_norm": 0.06657664395828655, + "language_loss": 0.88943893, + "learning_rate": 0.0009458281101771457, + "loss": 0.90099096, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 0.46020508, + "step": 910, + "time_per_iteration": 2.6421282291412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176316, + "balance_loss_mlp": 1.12810779, + "epoch": 0.1752597152751058, + "flos": 622923125760.0, + "grad_norm": 0.08799417436837091, + "language_loss": 0.8354404, + "learning_rate": 0.0009456869843468122, + "loss": 0.84720349, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 0.48217773, + "step": 911, + "time_per_iteration": 2.8633837699890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178078, + "balance_loss_mlp": 1.12688971, + "epoch": 0.17545209696036937, + "flos": 520972176384.0, + "grad_norm": 0.08410877580390771, + "language_loss": 0.79552639, + "learning_rate": 0.0009455456854854459, + "loss": 0.80730712, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 0.51220703, + "step": 912, + "time_per_iteration": 2.661038875579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180916, + "balance_loss_mlp": 1.13564038, + "epoch": 0.17564447864563293, + "flos": 461988707328.0, + "grad_norm": 0.17307911593328887, + "language_loss": 0.85480136, + "learning_rate": 0.0009454042136479039, + "loss": 0.86661053, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 0.45263672, + "step": 913, + "time_per_iteration": 2.561790943145752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198041, + "balance_loss_mlp": 1.15183568, + "epoch": 0.1758368603308965, + "flos": 480655793664.0, + "grad_norm": 0.06959724621682493, + "language_loss": 0.8438077, + "learning_rate": 0.0009452625688891103, + "loss": 0.85578811, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 0.4621582, + "step": 914, + "time_per_iteration": 2.5396227836608887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092507, + "balance_loss_mlp": 1.07600832, + "epoch": 0.17602924201616005, + "flos": 1478942903808.0, + "grad_norm": 0.034614734916794516, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79827243, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 0.16503906, + "step": 915, + "time_per_iteration": 4.550157308578491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264875, + "balance_loss_mlp": 1.21347213, + "epoch": 0.17622162370142364, + "flos": 602301671424.0, + "grad_norm": 0.08235911171958209, + "language_loss": 0.94223297, + "learning_rate": 0.0009449787608278015, + "loss": 0.95488179, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 0.51489258, + "step": 916, + "time_per_iteration": 2.8292665481567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243722, + "balance_loss_mlp": 1.19525158, + "epoch": 0.1764140053866872, + "flos": 442699043328.0, + "grad_norm": 0.08361954447634375, + "language_loss": 0.9338274, + "learning_rate": 0.0009448365976354704, + "loss": 0.94626462, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 0.48461914, + "step": 917, + "time_per_iteration": 2.543883800506592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216482, + "balance_loss_mlp": 1.16622329, + "epoch": 0.17660638707195075, + "flos": 500607682560.0, + "grad_norm": 0.08482517786251102, + "language_loss": 0.91736883, + "learning_rate": 0.0009446942617422558, + "loss": 0.9295336, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 0.50317383, + "step": 918, + "time_per_iteration": 2.6130669116973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118235, + "balance_loss_mlp": 1.13740778, + "epoch": 0.17679876875721431, + "flos": 538892974080.0, + "grad_norm": 0.07957198864097685, + "language_loss": 0.8648746, + "learning_rate": 0.0009445517532034176, + "loss": 0.87669808, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 0.44970703, + "step": 919, + "time_per_iteration": 2.7341010570526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116033, + "balance_loss_mlp": 1.11002386, + "epoch": 0.17699115044247787, + "flos": 497724217344.0, + "grad_norm": 0.08371374964142012, + "language_loss": 0.9020586, + "learning_rate": 0.0009444090720742824, + "loss": 0.9136619, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 0.50341797, + "step": 920, + "time_per_iteration": 2.628169298171997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158892, + "balance_loss_mlp": 1.1083951, + "epoch": 0.17718353212774143, + "flos": 662738070528.0, + "grad_norm": 0.07483188289837522, + "language_loss": 0.89025688, + "learning_rate": 0.0009442662184102439, + "loss": 0.90184581, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 0.50512695, + "step": 921, + "time_per_iteration": 2.7538435459136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154828, + "balance_loss_mlp": 1.11210358, + "epoch": 0.177375913813005, + "flos": 582641247744.0, + "grad_norm": 0.05276545299780942, + "language_loss": 0.88537991, + "learning_rate": 0.000944123192266763, + "loss": 0.89692819, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 0.42724609, + "step": 922, + "time_per_iteration": 2.788759469985962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190709, + "balance_loss_mlp": 1.13887644, + "epoch": 0.17756829549826855, + "flos": 552564011520.0, + "grad_norm": 0.07681776188261369, + "language_loss": 0.84657156, + "learning_rate": 0.0009439799936993671, + "loss": 0.85847867, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 0.51904297, + "step": 923, + "time_per_iteration": 2.7123734951019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196866, + "balance_loss_mlp": 1.14787149, + "epoch": 0.17776067718353214, + "flos": 556322245632.0, + "grad_norm": 0.09732559260361714, + "language_loss": 0.89131558, + "learning_rate": 0.0009438366227636511, + "loss": 0.90328419, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 0.49047852, + "step": 924, + "time_per_iteration": 2.6907341480255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171193, + "balance_loss_mlp": 1.12396216, + "epoch": 0.1779530588687957, + "flos": 658458574848.0, + "grad_norm": 0.07379366042998667, + "language_loss": 0.86971134, + "learning_rate": 0.0009436930795152763, + "loss": 0.88142323, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 0.47241211, + "step": 925, + "time_per_iteration": 2.865673065185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168215, + "balance_loss_mlp": 1.12174773, + "epoch": 0.17814544055405926, + "flos": 644483589120.0, + "grad_norm": 0.07469970420174622, + "language_loss": 0.8767308, + "learning_rate": 0.0009435493640099713, + "loss": 0.88841295, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 0.46411133, + "step": 926, + "time_per_iteration": 2.779188394546509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154616, + "balance_loss_mlp": 1.10388088, + "epoch": 0.17833782223932282, + "flos": 460913877504.0, + "grad_norm": 0.06972760602295516, + "language_loss": 0.85458124, + "learning_rate": 0.0009434054763035314, + "loss": 0.86612737, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 0.50756836, + "step": 927, + "time_per_iteration": 2.5972957611083984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147995, + "balance_loss_mlp": 1.09983397, + "epoch": 0.17853020392458638, + "flos": 759539897856.0, + "grad_norm": 0.05666425765353489, + "language_loss": 0.86302543, + "learning_rate": 0.0009432614164518185, + "loss": 0.8745054, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 0.48168945, + "step": 928, + "time_per_iteration": 3.0064406394958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150828, + "balance_loss_mlp": 1.09780383, + "epoch": 0.17872258560984994, + "flos": 782666717184.0, + "grad_norm": 0.07484249942420804, + "language_loss": 0.85464913, + "learning_rate": 0.000943117184510762, + "loss": 0.86615741, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 0.53027344, + "step": 929, + "time_per_iteration": 2.9855945110321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124448, + "balance_loss_mlp": 1.10556555, + "epoch": 0.1789149672951135, + "flos": 1459880464896.0, + "grad_norm": 0.03465095249088487, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.79914415, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 0.18847656, + "step": 930, + "time_per_iteration": 5.016055583953857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148447, + "balance_loss_mlp": 1.09997642, + "epoch": 0.17910734898037706, + "flos": 503864105472.0, + "grad_norm": 0.07304481613225793, + "language_loss": 0.89790976, + "learning_rate": 0.0009428282045846674, + "loss": 0.90939426, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 0.48461914, + "step": 931, + "time_per_iteration": 2.787473678588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134009, + "balance_loss_mlp": 1.08797026, + "epoch": 0.17929973066564064, + "flos": 746249158656.0, + "grad_norm": 0.05043968313129053, + "language_loss": 0.90432143, + "learning_rate": 0.0009426834567118214, + "loss": 0.91566151, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 0.46044922, + "step": 932, + "time_per_iteration": 3.1106340885162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149699, + "balance_loss_mlp": 1.10091829, + "epoch": 0.1794921123509042, + "flos": 713214305280.0, + "grad_norm": 0.0884624873286247, + "language_loss": 0.81563932, + "learning_rate": 0.0009425385369740155, + "loss": 0.82713628, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 0.48779297, + "step": 933, + "time_per_iteration": 3.056328296661377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164469, + "balance_loss_mlp": 1.1138767, + "epoch": 0.17968449403616776, + "flos": 633142448640.0, + "grad_norm": 0.0672899912264689, + "language_loss": 0.88411558, + "learning_rate": 0.0009423934454275125, + "loss": 0.8957603, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 0.50561523, + "step": 934, + "time_per_iteration": 2.827507495880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162381, + "balance_loss_mlp": 1.11333871, + "epoch": 0.17987687572143132, + "flos": 536323368960.0, + "grad_norm": 0.07880287247644589, + "language_loss": 0.92845738, + "learning_rate": 0.0009422481821286418, + "loss": 0.94008112, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 0.49072266, + "step": 935, + "time_per_iteration": 2.7188265323638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164456, + "balance_loss_mlp": 1.11918044, + "epoch": 0.18006925740669488, + "flos": 538077676032.0, + "grad_norm": 0.07978340192275198, + "language_loss": 0.88968349, + "learning_rate": 0.0009421027471337998, + "loss": 0.90132797, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 0.45239258, + "step": 936, + "time_per_iteration": 2.6140947341918945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176728, + "balance_loss_mlp": 1.1271131, + "epoch": 0.18026163909195844, + "flos": 539510782464.0, + "grad_norm": 0.07049523693926517, + "language_loss": 0.83782339, + "learning_rate": 0.0009419571404994493, + "loss": 0.84959066, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 0.49584961, + "step": 937, + "time_per_iteration": 2.641847610473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162302, + "balance_loss_mlp": 1.11354589, + "epoch": 0.180454020777222, + "flos": 500642187264.0, + "grad_norm": 0.06745021535989586, + "language_loss": 0.91665328, + "learning_rate": 0.00094181136228212, + "loss": 0.92827624, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 0.48803711, + "step": 938, + "time_per_iteration": 2.622314453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146811, + "balance_loss_mlp": 1.10334706, + "epoch": 0.18064640246248556, + "flos": 498952120320.0, + "grad_norm": 0.06209482952821168, + "language_loss": 0.87085009, + "learning_rate": 0.0009416654125384077, + "loss": 0.88231826, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 0.43432617, + "step": 939, + "time_per_iteration": 2.735565423965454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167753, + "balance_loss_mlp": 1.15230346, + "epoch": 0.18083878414774912, + "flos": 1519313988096.0, + "grad_norm": 0.039552666267989665, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.80940127, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 0.15429688, + "step": 940, + "time_per_iteration": 4.9464662075042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147706, + "balance_loss_mlp": 1.10293126, + "epoch": 0.1810311658330127, + "flos": 727337594880.0, + "grad_norm": 0.06405620484007693, + "language_loss": 0.85002685, + "learning_rate": 0.000941372998698552, + "loss": 0.86150396, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 0.44750977, + "step": 941, + "time_per_iteration": 2.9421255588531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152914, + "balance_loss_mlp": 1.10344219, + "epoch": 0.18122354751827627, + "flos": 564923082240.0, + "grad_norm": 0.07883971857950696, + "language_loss": 0.82437575, + "learning_rate": 0.0009412265347159336, + "loss": 0.8359049, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 0.49487305, + "step": 942, + "time_per_iteration": 2.727071762084961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135445, + "balance_loss_mlp": 1.09083664, + "epoch": 0.18141592920353983, + "flos": 519282109440.0, + "grad_norm": 0.10057326993772005, + "language_loss": 0.85614288, + "learning_rate": 0.0009410798994339829, + "loss": 0.86749732, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 0.44604492, + "step": 943, + "time_per_iteration": 2.6305696964263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134993, + "balance_loss_mlp": 1.09248304, + "epoch": 0.1816083108888034, + "flos": 512470084608.0, + "grad_norm": 0.05478952043416941, + "language_loss": 0.88907182, + "learning_rate": 0.000940933092909628, + "loss": 0.90042174, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 0.42529297, + "step": 944, + "time_per_iteration": 2.631101369857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149384, + "balance_loss_mlp": 1.10530019, + "epoch": 0.18180069257406695, + "flos": 492389715456.0, + "grad_norm": 0.06051663433249254, + "language_loss": 0.84961444, + "learning_rate": 0.0009407861151998649, + "loss": 0.8611083, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 0.44067383, + "step": 945, + "time_per_iteration": 2.5717978477478027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116837, + "balance_loss_mlp": 1.12040067, + "epoch": 0.1819930742593305, + "flos": 570158839296.0, + "grad_norm": 0.06666982795430461, + "language_loss": 0.87044382, + "learning_rate": 0.0009406389663617552, + "loss": 0.88212758, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 0.47998047, + "step": 946, + "time_per_iteration": 2.6768407821655273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170796, + "balance_loss_mlp": 1.12757087, + "epoch": 0.18218545594459407, + "flos": 605975841792.0, + "grad_norm": 0.0759743739596538, + "language_loss": 0.87192827, + "learning_rate": 0.000940491646452427, + "loss": 0.88363624, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 0.43212891, + "step": 947, + "time_per_iteration": 2.7174758911132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174804, + "balance_loss_mlp": 1.1271199, + "epoch": 0.18237783762985763, + "flos": 548682439680.0, + "grad_norm": 0.06285362616764655, + "language_loss": 0.91503757, + "learning_rate": 0.000940344155529075, + "loss": 0.92678559, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 0.47680664, + "step": 948, + "time_per_iteration": 2.6130924224853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175811, + "balance_loss_mlp": 1.12643504, + "epoch": 0.1825702193151212, + "flos": 450741542400.0, + "grad_norm": 0.07182633578445446, + "language_loss": 0.88395435, + "learning_rate": 0.0009401964936489605, + "loss": 0.89571244, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 0.4934082, + "step": 949, + "time_per_iteration": 2.518735885620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154077, + "balance_loss_mlp": 1.11173368, + "epoch": 0.18276260100038477, + "flos": 589245871104.0, + "grad_norm": 0.08616214546245322, + "language_loss": 0.86381257, + "learning_rate": 0.0009400486608694108, + "loss": 0.87535334, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 0.42358398, + "step": 950, + "time_per_iteration": 2.7356269359588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147825, + "balance_loss_mlp": 1.10071373, + "epoch": 0.18295498268564833, + "flos": 787331653632.0, + "grad_norm": 0.05684050086710682, + "language_loss": 0.88146299, + "learning_rate": 0.0009399006572478195, + "loss": 0.89294124, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 0.47119141, + "step": 951, + "time_per_iteration": 3.0829784870147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113898, + "balance_loss_mlp": 1.09449124, + "epoch": 0.1831473643709119, + "flos": 578147010048.0, + "grad_norm": 0.06809630737889293, + "language_loss": 0.91594249, + "learning_rate": 0.0009397524828416468, + "loss": 0.92733228, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 0.44482422, + "step": 952, + "time_per_iteration": 2.710500478744507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141316, + "balance_loss_mlp": 1.09339356, + "epoch": 0.18333974605617545, + "flos": 566889933312.0, + "grad_norm": 0.06814185159234107, + "language_loss": 0.97457635, + "learning_rate": 0.0009396041377084192, + "loss": 0.98598951, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 0.47949219, + "step": 953, + "time_per_iteration": 2.6530585289001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011339, + "balance_loss_mlp": 1.08716977, + "epoch": 0.183532127741439, + "flos": 526993496064.0, + "grad_norm": 0.06688505748067412, + "language_loss": 0.88496006, + "learning_rate": 0.0009394556219057295, + "loss": 0.896299, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 0.46704102, + "step": 954, + "time_per_iteration": 2.662543773651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135904, + "balance_loss_mlp": 1.08948374, + "epoch": 0.18372450942670257, + "flos": 594535956480.0, + "grad_norm": 0.08148035498798997, + "language_loss": 0.84775722, + "learning_rate": 0.0009393069354912362, + "loss": 0.85911626, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 0.46386719, + "step": 955, + "time_per_iteration": 2.7262632846832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139168, + "balance_loss_mlp": 1.0954181, + "epoch": 0.18391689111196613, + "flos": 645032014848.0, + "grad_norm": 0.07343823471440349, + "language_loss": 0.83466816, + "learning_rate": 0.0009391580785226649, + "loss": 0.8460598, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 0.43774414, + "step": 956, + "time_per_iteration": 2.8661141395568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066727, + "balance_loss_mlp": 1.04708123, + "epoch": 0.18410927279722972, + "flos": 1457073349632.0, + "grad_norm": 0.029557521366383285, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.80407178, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 0.19628906, + "step": 957, + "time_per_iteration": 4.751030921936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134727, + "balance_loss_mlp": 1.08978534, + "epoch": 0.18430165448249328, + "flos": 658750040064.0, + "grad_norm": 0.06490118531587029, + "language_loss": 0.87677503, + "learning_rate": 0.0009388598531545196, + "loss": 0.88812232, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 0.44946289, + "step": 958, + "time_per_iteration": 2.8378970623016357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143042, + "balance_loss_mlp": 1.09702718, + "epoch": 0.18449403616775684, + "flos": 517933066752.0, + "grad_norm": 0.07391212127287443, + "language_loss": 0.86896807, + "learning_rate": 0.000938710484870727, + "loss": 0.88039851, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 0.46044922, + "step": 959, + "time_per_iteration": 4.31168794631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128823, + "balance_loss_mlp": 1.08416748, + "epoch": 0.1846864178530204, + "flos": 552749391360.0, + "grad_norm": 0.0638837232249089, + "language_loss": 0.86957002, + "learning_rate": 0.0009385609462644189, + "loss": 0.88085824, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 0.44702148, + "step": 960, + "time_per_iteration": 2.6793572902679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118991, + "balance_loss_mlp": 1.07233214, + "epoch": 0.18487879953828396, + "flos": 466166886912.0, + "grad_norm": 0.07248975394705585, + "language_loss": 0.86711299, + "learning_rate": 0.0009384112373936514, + "loss": 0.87830293, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 0.46679688, + "step": 961, + "time_per_iteration": 2.6220860481262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119858, + "balance_loss_mlp": 1.07334304, + "epoch": 0.18507118122354752, + "flos": 648496212480.0, + "grad_norm": 0.06813544125014795, + "language_loss": 0.92053163, + "learning_rate": 0.0009382613583165467, + "loss": 0.93173021, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 0.46533203, + "step": 962, + "time_per_iteration": 2.8032093048095703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108588, + "balance_loss_mlp": 1.06142831, + "epoch": 0.18526356290881107, + "flos": 626772764160.0, + "grad_norm": 0.07296294799157402, + "language_loss": 0.9064188, + "learning_rate": 0.0009381113090912928, + "loss": 0.91750467, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 0.47167969, + "step": 963, + "time_per_iteration": 2.7358789443969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109929, + "balance_loss_mlp": 1.06741881, + "epoch": 0.18545594459407463, + "flos": 432726769152.0, + "grad_norm": 0.07962159601741099, + "language_loss": 0.90353996, + "learning_rate": 0.000937961089776144, + "loss": 0.91463923, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 0.42480469, + "step": 964, + "time_per_iteration": 2.5761237144470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128672, + "balance_loss_mlp": 1.07924736, + "epoch": 0.1856483262793382, + "flos": 749061043200.0, + "grad_norm": 0.09082243760489998, + "language_loss": 0.83673573, + "learning_rate": 0.0009378107004294208, + "loss": 0.84802246, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 0.49438477, + "step": 965, + "time_per_iteration": 2.9681291580200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132442, + "balance_loss_mlp": 1.08542585, + "epoch": 0.18584070796460178, + "flos": 530326642176.0, + "grad_norm": 0.08405098410424734, + "language_loss": 0.92054594, + "learning_rate": 0.0009376601411095096, + "loss": 0.93187034, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 0.4699707, + "step": 966, + "time_per_iteration": 2.696122407913208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138698, + "balance_loss_mlp": 1.09773731, + "epoch": 0.18603308964986534, + "flos": 483106830336.0, + "grad_norm": 0.07104128547690361, + "language_loss": 0.87554526, + "learning_rate": 0.0009375094118748622, + "loss": 0.88693225, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 0.40991211, + "step": 967, + "time_per_iteration": 2.6025850772857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179223, + "balance_loss_mlp": 1.13373268, + "epoch": 0.1862254713351289, + "flos": 801316551168.0, + "grad_norm": 0.0728928893981835, + "language_loss": 0.91626799, + "learning_rate": 0.0009373585127839976, + "loss": 0.92806023, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 0.45507812, + "step": 968, + "time_per_iteration": 2.9854021072387695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212732, + "balance_loss_mlp": 1.16905367, + "epoch": 0.18641785302039246, + "flos": 478323325440.0, + "grad_norm": 0.08777237711590531, + "language_loss": 0.91368866, + "learning_rate": 0.0009372074438954994, + "loss": 0.92581606, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 0.43652344, + "step": 969, + "time_per_iteration": 2.5014536380767822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211792, + "balance_loss_mlp": 1.16539574, + "epoch": 0.18661023470565602, + "flos": 388911684096.0, + "grad_norm": 0.0704882552763471, + "language_loss": 0.92436379, + "learning_rate": 0.0009370562052680181, + "loss": 0.93648171, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 0.46411133, + "step": 970, + "time_per_iteration": 2.453458070755005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120766, + "balance_loss_mlp": 1.16183591, + "epoch": 0.18680261639091958, + "flos": 564676033536.0, + "grad_norm": 0.07372597108689087, + "language_loss": 0.89988613, + "learning_rate": 0.0009369047969602695, + "loss": 0.91196281, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 0.45825195, + "step": 971, + "time_per_iteration": 2.703948497772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192702, + "balance_loss_mlp": 1.14396954, + "epoch": 0.18699499807618314, + "flos": 479259763200.0, + "grad_norm": 0.08557962606734577, + "language_loss": 0.8750906, + "learning_rate": 0.0009367532190310357, + "loss": 0.88701761, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 0.48657227, + "step": 972, + "time_per_iteration": 4.1564977169036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148896, + "balance_loss_mlp": 1.1052649, + "epoch": 0.1871873797614467, + "flos": 553283136000.0, + "grad_norm": 0.06811184838385763, + "language_loss": 0.89467651, + "learning_rate": 0.0009366014715391644, + "loss": 0.90616548, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 0.43603516, + "step": 973, + "time_per_iteration": 2.695730209350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134701, + "balance_loss_mlp": 1.09307301, + "epoch": 0.18737976144671029, + "flos": 552811060224.0, + "grad_norm": 0.054567817192194557, + "language_loss": 0.84347546, + "learning_rate": 0.0009364495545435693, + "loss": 0.85482252, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 0.41625977, + "step": 974, + "time_per_iteration": 2.828831672668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146218, + "balance_loss_mlp": 1.09970224, + "epoch": 0.18757214313197385, + "flos": 502250761728.0, + "grad_norm": 0.08256927623824414, + "language_loss": 0.89333141, + "learning_rate": 0.0009362974681032297, + "loss": 0.90479362, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 0.46484375, + "step": 975, + "time_per_iteration": 2.5982418060302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143654, + "balance_loss_mlp": 1.09909391, + "epoch": 0.1877645248172374, + "flos": 675010506240.0, + "grad_norm": 0.07754570301250979, + "language_loss": 0.89447427, + "learning_rate": 0.0009361452122771907, + "loss": 0.90591079, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 0.44555664, + "step": 976, + "time_per_iteration": 2.881242275238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133689, + "balance_loss_mlp": 1.08834195, + "epoch": 0.18795690650250096, + "flos": 404989341696.0, + "grad_norm": 0.0965092241218366, + "language_loss": 0.84541976, + "learning_rate": 0.0009359927871245635, + "loss": 0.85675669, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 0.45361328, + "step": 977, + "time_per_iteration": 2.4720265865325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113596, + "balance_loss_mlp": 1.09039843, + "epoch": 0.18814928818776452, + "flos": 637891448832.0, + "grad_norm": 0.09227923665031239, + "language_loss": 0.87538362, + "learning_rate": 0.0009358401927045246, + "loss": 0.88674331, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 0.45581055, + "step": 978, + "time_per_iteration": 2.8225297927856445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140746, + "balance_loss_mlp": 1.0945406, + "epoch": 0.18834166987302808, + "flos": 1138282191360.0, + "grad_norm": 0.05953389716062443, + "language_loss": 0.88990903, + "learning_rate": 0.0009356874290763166, + "loss": 0.90131652, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 0.46264648, + "step": 979, + "time_per_iteration": 3.4754927158355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140535, + "balance_loss_mlp": 1.09494936, + "epoch": 0.18853405155829164, + "flos": 504793202688.0, + "grad_norm": 0.06969100284100371, + "language_loss": 0.89955008, + "learning_rate": 0.0009355344962992474, + "loss": 0.91095543, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 0.45581055, + "step": 980, + "time_per_iteration": 2.6008429527282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138568, + "balance_loss_mlp": 1.09291101, + "epoch": 0.1887264332435552, + "flos": 608177258496.0, + "grad_norm": 0.07021551702573088, + "language_loss": 0.88888156, + "learning_rate": 0.0009353813944326908, + "loss": 0.90026724, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 0.45654297, + "step": 981, + "time_per_iteration": 2.9102253913879395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141833, + "balance_loss_mlp": 1.09352899, + "epoch": 0.1889188149288188, + "flos": 552529506816.0, + "grad_norm": 0.0640154196439605, + "language_loss": 0.83560127, + "learning_rate": 0.0009352281235360863, + "loss": 0.84701967, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 0.4831543, + "step": 982, + "time_per_iteration": 2.690695285797119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149234, + "balance_loss_mlp": 1.10627127, + "epoch": 0.18911119661408235, + "flos": 418559063040.0, + "grad_norm": 0.06254433649037737, + "language_loss": 0.85791624, + "learning_rate": 0.0009350746836689389, + "loss": 0.86940861, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 0.4296875, + "step": 983, + "time_per_iteration": 2.524491548538208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104727, + "balance_loss_mlp": 1.02905524, + "epoch": 0.1893035782993459, + "flos": 1481974299648.0, + "grad_norm": 0.024687708549402564, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82486492, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.18261719, + "step": 984, + "time_per_iteration": 5.200335741043091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156154, + "balance_loss_mlp": 1.1069684, + "epoch": 0.18949595998460947, + "flos": 508467373056.0, + "grad_norm": 0.08202626484000469, + "language_loss": 0.84151661, + "learning_rate": 0.0009347672972613634, + "loss": 0.85307819, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.49145508, + "step": 985, + "time_per_iteration": 2.6939473152160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011517, + "balance_loss_mlp": 1.10756862, + "epoch": 0.18968834166987303, + "flos": 531087611904.0, + "grad_norm": 0.061889675774481866, + "language_loss": 0.8651796, + "learning_rate": 0.0009346133508402735, + "loss": 0.87669659, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.44140625, + "step": 986, + "time_per_iteration": 2.695004463195801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146205, + "balance_loss_mlp": 1.1000948, + "epoch": 0.1898807233551366, + "flos": 499762649088.0, + "grad_norm": 0.07730871241699967, + "language_loss": 0.84821075, + "learning_rate": 0.0009344592356873166, + "loss": 0.85967278, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.46118164, + "step": 987, + "time_per_iteration": 2.635143518447876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143677, + "balance_loss_mlp": 1.0975666, + "epoch": 0.19007310504040015, + "flos": 602220178944.0, + "grad_norm": 0.058246004489727894, + "language_loss": 0.79289091, + "learning_rate": 0.0009343049518623255, + "loss": 0.80432773, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.46142578, + "step": 988, + "time_per_iteration": 2.7257165908813477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126709, + "balance_loss_mlp": 1.08503366, + "epoch": 0.1902654867256637, + "flos": 601651929600.0, + "grad_norm": 0.06464318177286693, + "language_loss": 0.83752143, + "learning_rate": 0.0009341504994251985, + "loss": 0.84878862, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.41674805, + "step": 989, + "time_per_iteration": 2.8336057662963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052089, + "balance_loss_mlp": 1.03692603, + "epoch": 0.19045786841092727, + "flos": 1575784005120.0, + "grad_norm": 0.01962059038868396, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74572587, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.15136719, + "step": 990, + "time_per_iteration": 4.980287551879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118455, + "balance_loss_mlp": 1.07682681, + "epoch": 0.19065025009619085, + "flos": 681634579968.0, + "grad_norm": 0.06360467015426281, + "language_loss": 0.82411575, + "learning_rate": 0.0009338410889544574, + "loss": 0.83530033, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 0.41601562, + "step": 991, + "time_per_iteration": 3.0192768573760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123102, + "balance_loss_mlp": 1.0790422, + "epoch": 0.1908426317814544, + "flos": 602264595456.0, + "grad_norm": 0.06107834506241764, + "language_loss": 0.88440853, + "learning_rate": 0.000933686131040967, + "loss": 0.89563954, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 0.44067383, + "step": 992, + "time_per_iteration": 2.795952796936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118187, + "balance_loss_mlp": 1.07479525, + "epoch": 0.19103501346671797, + "flos": 586308077568.0, + "grad_norm": 0.08075044213119366, + "language_loss": 0.91145802, + "learning_rate": 0.0009335310047555883, + "loss": 0.92263985, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 0.43383789, + "step": 993, + "time_per_iteration": 2.6966800689697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144036, + "balance_loss_mlp": 1.10052443, + "epoch": 0.19122739515198153, + "flos": 545761898496.0, + "grad_norm": 0.06789475617385991, + "language_loss": 0.89048505, + "learning_rate": 0.0009333757101585467, + "loss": 0.90192544, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 0.43554688, + "step": 994, + "time_per_iteration": 2.659120559692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159671, + "balance_loss_mlp": 1.11687493, + "epoch": 0.1914197768372451, + "flos": 521446450176.0, + "grad_norm": 0.05475551086737561, + "language_loss": 0.94071913, + "learning_rate": 0.0009332202473101329, + "loss": 0.95231587, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 0.42822266, + "step": 995, + "time_per_iteration": 2.672307014465332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153956, + "balance_loss_mlp": 1.11011088, + "epoch": 0.19161215852250865, + "flos": 611246103552.0, + "grad_norm": 0.060816834986447306, + "language_loss": 0.8370983, + "learning_rate": 0.0009330646162707028, + "loss": 0.84863788, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 0.4387207, + "step": 996, + "time_per_iteration": 2.7483248710632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155376, + "balance_loss_mlp": 1.11274719, + "epoch": 0.1918045402077722, + "flos": 846660916224.0, + "grad_norm": 0.05013127115514869, + "language_loss": 0.85195571, + "learning_rate": 0.0009329088171006779, + "loss": 0.86350954, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.42626953, + "step": 997, + "time_per_iteration": 3.1445202827453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163134, + "balance_loss_mlp": 1.1197654, + "epoch": 0.19199692189303577, + "flos": 465937090560.0, + "grad_norm": 0.07353815647154911, + "language_loss": 0.86074895, + "learning_rate": 0.0009327528498605446, + "loss": 0.87238026, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 0.43383789, + "step": 998, + "time_per_iteration": 2.536146402359009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159094, + "balance_loss_mlp": 1.11844337, + "epoch": 0.19218930357829936, + "flos": 531576940032.0, + "grad_norm": 0.06861677349241169, + "language_loss": 0.9080506, + "learning_rate": 0.0009325967146108548, + "loss": 0.91964149, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 0.40649414, + "step": 999, + "time_per_iteration": 2.634549617767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151834, + "balance_loss_mlp": 1.11049271, + "epoch": 0.19238168526356292, + "flos": 601624765440.0, + "grad_norm": 0.0672850368289366, + "language_loss": 0.88138115, + "learning_rate": 0.0009324404114122258, + "loss": 0.89289951, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 0.41357422, + "step": 1000, + "time_per_iteration": 2.677651882171631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164653, + "balance_loss_mlp": 1.12221444, + "epoch": 0.19257406694882648, + "flos": 571982155776.0, + "grad_norm": 0.06402741154285656, + "language_loss": 0.8710497, + "learning_rate": 0.0009322839403253397, + "loss": 0.88269627, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 0.42431641, + "step": 1001, + "time_per_iteration": 2.7528679370880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169088, + "balance_loss_mlp": 1.12440836, + "epoch": 0.19276644863409004, + "flos": 801813219840.0, + "grad_norm": 0.07104878229054386, + "language_loss": 0.84949791, + "learning_rate": 0.0009321273014109439, + "loss": 0.86118877, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.44702148, + "step": 1002, + "time_per_iteration": 2.9990484714508057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114889, + "balance_loss_mlp": 1.10523582, + "epoch": 0.1929588303193536, + "flos": 563314507776.0, + "grad_norm": 0.0673469195429183, + "language_loss": 0.85240018, + "learning_rate": 0.0009319704947298513, + "loss": 0.8638891, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.43676758, + "step": 1003, + "time_per_iteration": 2.8755459785461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141118, + "balance_loss_mlp": 1.10127831, + "epoch": 0.19315121200461716, + "flos": 626837004288.0, + "grad_norm": 0.0925310675323854, + "language_loss": 0.89122581, + "learning_rate": 0.0009318135203429393, + "loss": 0.902637, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.3984375, + "step": 1004, + "time_per_iteration": 2.771192789077759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127979, + "balance_loss_mlp": 1.0866611, + "epoch": 0.19334359368988072, + "flos": 517451079168.0, + "grad_norm": 0.05779097302789, + "language_loss": 0.88602638, + "learning_rate": 0.0009316563783111511, + "loss": 0.8973062, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.41308594, + "step": 1005, + "time_per_iteration": 2.7739861011505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113092, + "balance_loss_mlp": 1.08638334, + "epoch": 0.19353597537514428, + "flos": 694080285696.0, + "grad_norm": 0.06006842888316194, + "language_loss": 0.83199531, + "learning_rate": 0.0009314990686954943, + "loss": 0.84330451, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.44506836, + "step": 1006, + "time_per_iteration": 2.935081720352173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140843, + "balance_loss_mlp": 1.09561515, + "epoch": 0.19372835706040784, + "flos": 1210170585600.0, + "grad_norm": 0.0666735983489841, + "language_loss": 0.81657201, + "learning_rate": 0.000931341591557042, + "loss": 0.82798046, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.45263672, + "step": 1007, + "time_per_iteration": 3.7212610244750977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155404, + "balance_loss_mlp": 1.1041683, + "epoch": 0.19392073874567142, + "flos": 520631152128.0, + "grad_norm": 0.08115294197805281, + "language_loss": 0.87899536, + "learning_rate": 0.0009311839469569325, + "loss": 0.89054936, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.51171875, + "step": 1008, + "time_per_iteration": 2.6384472846984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150065, + "balance_loss_mlp": 1.10030699, + "epoch": 0.19411312043093498, + "flos": 588816013824.0, + "grad_norm": 0.07776470075981182, + "language_loss": 0.88065994, + "learning_rate": 0.0009310261349563687, + "loss": 0.89216053, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.49804688, + "step": 1009, + "time_per_iteration": 2.703058958053589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157544, + "balance_loss_mlp": 1.11160064, + "epoch": 0.19430550211619854, + "flos": 579382253568.0, + "grad_norm": 0.05519618089274153, + "language_loss": 0.86250293, + "learning_rate": 0.0009308681556166186, + "loss": 0.87407839, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.45922852, + "step": 1010, + "time_per_iteration": 2.8404791355133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177928, + "balance_loss_mlp": 1.12480855, + "epoch": 0.1944978838014621, + "flos": 621126973440.0, + "grad_norm": 0.10323239067467582, + "language_loss": 0.8870275, + "learning_rate": 0.0009307100089990152, + "loss": 0.89880681, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.53100586, + "step": 1011, + "time_per_iteration": 2.7103512287139893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185201, + "balance_loss_mlp": 1.13530004, + "epoch": 0.19469026548672566, + "flos": 598714136064.0, + "grad_norm": 0.08766026563197518, + "language_loss": 0.84582877, + "learning_rate": 0.0009305516951649568, + "loss": 0.8576808, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.49902344, + "step": 1012, + "time_per_iteration": 2.6905276775360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175674, + "balance_loss_mlp": 1.12818122, + "epoch": 0.19488264717198922, + "flos": 552161318400.0, + "grad_norm": 0.07259628373080033, + "language_loss": 0.87723738, + "learning_rate": 0.0009303932141759057, + "loss": 0.8889941, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.47485352, + "step": 1013, + "time_per_iteration": 2.7738490104675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161359, + "balance_loss_mlp": 1.11200666, + "epoch": 0.19507502885725278, + "flos": 666135456768.0, + "grad_norm": 0.07589756885314788, + "language_loss": 0.84698361, + "learning_rate": 0.0009302345660933902, + "loss": 0.85859716, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.49291992, + "step": 1014, + "time_per_iteration": 2.7809414863586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152692, + "balance_loss_mlp": 1.10579538, + "epoch": 0.19526741054251634, + "flos": 671081946624.0, + "grad_norm": 0.06636914889533592, + "language_loss": 0.85938931, + "learning_rate": 0.0009300757509790026, + "loss": 0.87091625, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.46875, + "step": 1015, + "time_per_iteration": 2.886200189590454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151123, + "balance_loss_mlp": 1.10324848, + "epoch": 0.19545979222777993, + "flos": 447215675904.0, + "grad_norm": 0.08384883211824797, + "language_loss": 0.91210115, + "learning_rate": 0.0009299167688944005, + "loss": 0.92361236, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.47827148, + "step": 1016, + "time_per_iteration": 2.5308799743652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135215, + "balance_loss_mlp": 1.09036839, + "epoch": 0.1956521739130435, + "flos": 569084009472.0, + "grad_norm": 0.07612639660839114, + "language_loss": 0.86733758, + "learning_rate": 0.0009297576199013063, + "loss": 0.87868977, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.44873047, + "step": 1017, + "time_per_iteration": 2.699352264404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156475, + "balance_loss_mlp": 1.14159799, + "epoch": 0.19584455559830705, + "flos": 1455749273088.0, + "grad_norm": 0.04987694814110311, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.74158609, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.1484375, + "step": 1018, + "time_per_iteration": 4.927512168884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099249, + "balance_loss_mlp": 1.08494341, + "epoch": 0.1960369372835706, + "flos": 1591150252032.0, + "grad_norm": 0.032347612483235935, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.80525547, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.14257812, + "step": 1019, + "time_per_iteration": 5.494646787643433 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128896, + "balance_loss_mlp": 1.08855522, + "epoch": 0.19622931896883417, + "flos": 616017125376.0, + "grad_norm": 0.06601293097738069, + "language_loss": 0.87223667, + "learning_rate": 0.0009292791720892659, + "loss": 0.88352561, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.40332031, + "step": 1020, + "time_per_iteration": 2.8718464374542236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133682, + "balance_loss_mlp": 1.08823943, + "epoch": 0.19642170065409773, + "flos": 466201391616.0, + "grad_norm": 0.07136038826441608, + "language_loss": 0.89387941, + "learning_rate": 0.0009291193560807218, + "loss": 0.90521628, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.45483398, + "step": 1021, + "time_per_iteration": 2.588604211807251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132851, + "balance_loss_mlp": 1.09141409, + "epoch": 0.19661408233936128, + "flos": 515289309696.0, + "grad_norm": 0.06738480994857221, + "language_loss": 0.87651652, + "learning_rate": 0.0009289593734732688, + "loss": 0.88784504, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.41430664, + "step": 1022, + "time_per_iteration": 2.5915818214416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129633, + "balance_loss_mlp": 1.09036541, + "epoch": 0.19680646402462484, + "flos": 392640182784.0, + "grad_norm": 0.06942729809827348, + "language_loss": 0.94984972, + "learning_rate": 0.0009287992243290175, + "loss": 0.96114612, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.39282227, + "step": 1023, + "time_per_iteration": 2.4477546215057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142342, + "balance_loss_mlp": 1.09880638, + "epoch": 0.19699884570988843, + "flos": 626421828096.0, + "grad_norm": 0.1017247644504036, + "language_loss": 0.91891634, + "learning_rate": 0.0009286389087101435, + "loss": 0.93033981, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.43554688, + "step": 1024, + "time_per_iteration": 2.765334129333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142412, + "balance_loss_mlp": 1.09942544, + "epoch": 0.197191227395152, + "flos": 557982577152.0, + "grad_norm": 0.07195718640229302, + "language_loss": 0.8893857, + "learning_rate": 0.0009284784266788864, + "loss": 0.90080982, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.42993164, + "step": 1025, + "time_per_iteration": 2.7323853969573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141995, + "balance_loss_mlp": 1.10327554, + "epoch": 0.19738360908041555, + "flos": 664993815552.0, + "grad_norm": 0.069193395974369, + "language_loss": 0.93259764, + "learning_rate": 0.0009283177782975512, + "loss": 0.94401753, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.38696289, + "step": 1026, + "time_per_iteration": 2.9729068279266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114551, + "balance_loss_mlp": 1.10142589, + "epoch": 0.1975759907656791, + "flos": 522496687104.0, + "grad_norm": 0.08755988500201482, + "language_loss": 0.88955659, + "learning_rate": 0.000928156963628507, + "loss": 0.90101171, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.44067383, + "step": 1027, + "time_per_iteration": 2.594200849533081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138947, + "balance_loss_mlp": 1.09855926, + "epoch": 0.19776837245094267, + "flos": 462482804736.0, + "grad_norm": 0.07316483198701504, + "language_loss": 0.89277303, + "learning_rate": 0.0009279959827341877, + "loss": 0.90416259, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.40405273, + "step": 1028, + "time_per_iteration": 2.7378368377685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140451, + "balance_loss_mlp": 1.09727335, + "epoch": 0.19796075413620623, + "flos": 503058719232.0, + "grad_norm": 0.059550544329949856, + "language_loss": 0.88526183, + "learning_rate": 0.0009278348356770915, + "loss": 0.89666629, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.43188477, + "step": 1029, + "time_per_iteration": 2.5737922191619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133825, + "balance_loss_mlp": 1.0914098, + "epoch": 0.1981531358214698, + "flos": 507538275840.0, + "grad_norm": 0.06393748023743129, + "language_loss": 0.8587814, + "learning_rate": 0.0009276735225197814, + "loss": 0.87011963, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.42431641, + "step": 1030, + "time_per_iteration": 2.648477077484131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146668, + "balance_loss_mlp": 1.10170269, + "epoch": 0.19834551750673335, + "flos": 531547204608.0, + "grad_norm": 0.06069855374703422, + "language_loss": 0.86812896, + "learning_rate": 0.0009275120433248847, + "loss": 0.87959564, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.44946289, + "step": 1031, + "time_per_iteration": 2.6862802505493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148453, + "balance_loss_mlp": 1.10327268, + "epoch": 0.1985378991919969, + "flos": 775511096832.0, + "grad_norm": 0.06482797348212818, + "language_loss": 0.87033594, + "learning_rate": 0.0009273503981550931, + "loss": 0.8818205, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.45166016, + "step": 1032, + "time_per_iteration": 3.0549416542053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157268, + "balance_loss_mlp": 1.11235023, + "epoch": 0.1987302808772605, + "flos": 434288355840.0, + "grad_norm": 0.07571303407420105, + "language_loss": 0.87661642, + "learning_rate": 0.0009271885870731626, + "loss": 0.88818914, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.44946289, + "step": 1033, + "time_per_iteration": 2.4938008785247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172373, + "balance_loss_mlp": 1.12495148, + "epoch": 0.19892266256252406, + "flos": 553604336640.0, + "grad_norm": 0.07801561202279184, + "language_loss": 0.89466584, + "learning_rate": 0.0009270266101419143, + "loss": 0.90638959, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.47460938, + "step": 1034, + "time_per_iteration": 2.61181378364563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169875, + "balance_loss_mlp": 1.12681675, + "epoch": 0.19911504424778761, + "flos": 549865926144.0, + "grad_norm": 0.07487269237991181, + "language_loss": 0.85762119, + "learning_rate": 0.0009268644674242328, + "loss": 0.86931992, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.43066406, + "step": 1035, + "time_per_iteration": 2.6761085987091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163027, + "balance_loss_mlp": 1.1147716, + "epoch": 0.19930742593305117, + "flos": 518281431552.0, + "grad_norm": 0.06997084642295975, + "language_loss": 0.81697071, + "learning_rate": 0.0009267021589830678, + "loss": 0.828601, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.4831543, + "step": 1036, + "time_per_iteration": 2.6166343688964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162737, + "balance_loss_mlp": 1.14547551, + "epoch": 0.19949980761831473, + "flos": 1509338769408.0, + "grad_norm": 0.04224955266067769, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.78789818, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 0.17285156, + "step": 1037, + "time_per_iteration": 4.932336330413818 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124804, + "balance_loss_mlp": 1.08224678, + "epoch": 0.1996921893035783, + "flos": 698129985024.0, + "grad_norm": 0.07370646472771722, + "language_loss": 0.9354341, + "learning_rate": 0.000926377045182406, + "loss": 0.94668216, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.42553711, + "step": 1038, + "time_per_iteration": 2.89486026763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122228, + "balance_loss_mlp": 1.07704759, + "epoch": 0.19988457098884185, + "flos": 727023734784.0, + "grad_norm": 0.06351485696264159, + "language_loss": 0.88915765, + "learning_rate": 0.0009262142399491296, + "loss": 0.9003799, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.4519043, + "step": 1039, + "time_per_iteration": 3.0843544006347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132553, + "balance_loss_mlp": 1.08784938, + "epoch": 0.2000769526741054, + "flos": 560544841728.0, + "grad_norm": 0.06429886269356283, + "language_loss": 0.89007306, + "learning_rate": 0.0009260512692448105, + "loss": 0.9013986, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.44677734, + "step": 1040, + "time_per_iteration": 2.7221181392669678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143871, + "balance_loss_mlp": 1.10071695, + "epoch": 0.200269334359369, + "flos": 572039055360.0, + "grad_norm": 0.0714265416650486, + "language_loss": 0.85044324, + "learning_rate": 0.000925888133132719, + "loss": 0.86188197, + "num_input_tokens_seen": 86289824, + "router_z_loss_mlp": 0.43164062, + "step": 1041, + "time_per_iteration": 2.7112865447998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113685, + "balance_loss_mlp": 1.09566069, + "epoch": 0.20046171604463256, + "flos": 1486118347776.0, + "grad_norm": 0.0301437897992815, + "language_loss": 0.79610431, + "learning_rate": 0.0009257248316761906, + "loss": 0.8072412, + "num_input_tokens_seen": 86516384, + "router_z_loss_mlp": 0.18066406, + "step": 1042, + "time_per_iteration": 4.913869380950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179762, + "balance_loss_mlp": 1.13338971, + "epoch": 0.20065409772989612, + "flos": 496528247808.0, + "grad_norm": 0.11345429965909062, + "language_loss": 0.82242954, + "learning_rate": 0.0009255613649386244, + "loss": 0.83422714, + "num_input_tokens_seen": 86587296, + "router_z_loss_mlp": 0.46337891, + "step": 1043, + "time_per_iteration": 2.6586339473724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153581, + "balance_loss_mlp": 1.11133325, + "epoch": 0.20084647941515968, + "flos": 579367572480.0, + "grad_norm": 0.07362734504976313, + "language_loss": 0.79954398, + "learning_rate": 0.0009253977329834838, + "loss": 0.81107974, + "num_input_tokens_seen": 86662656, + "router_z_loss_mlp": 0.42236328, + "step": 1044, + "time_per_iteration": 2.7028462886810303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143902, + "balance_loss_mlp": 1.0951457, + "epoch": 0.20103886110042324, + "flos": 642076968960.0, + "grad_norm": 0.07842723007783056, + "language_loss": 0.8753317, + "learning_rate": 0.0009252339358742965, + "loss": 0.88677073, + "num_input_tokens_seen": 86734704, + "router_z_loss_mlp": 0.48779297, + "step": 1045, + "time_per_iteration": 2.8069612979888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139165, + "balance_loss_mlp": 1.0902648, + "epoch": 0.2012312427856868, + "flos": 441970007040.0, + "grad_norm": 0.07197327624603128, + "language_loss": 0.84128577, + "learning_rate": 0.000925069973674654, + "loss": 0.85267735, + "num_input_tokens_seen": 86806512, + "router_z_loss_mlp": 0.48925781, + "step": 1046, + "time_per_iteration": 2.603602409362793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136955, + "balance_loss_mlp": 1.09303868, + "epoch": 0.20142362447095036, + "flos": 554402382336.0, + "grad_norm": 0.06199919012721526, + "language_loss": 0.89849102, + "learning_rate": 0.000924905846448212, + "loss": 0.90986055, + "num_input_tokens_seen": 86883440, + "router_z_loss_mlp": 0.43896484, + "step": 1047, + "time_per_iteration": 2.733009099960327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166193, + "balance_loss_mlp": 1.11726964, + "epoch": 0.20161600615621392, + "flos": 670301153280.0, + "grad_norm": 0.08010189097684783, + "language_loss": 0.86224002, + "learning_rate": 0.0009247415542586906, + "loss": 0.87390196, + "num_input_tokens_seen": 86960208, + "router_z_loss_mlp": 0.48950195, + "step": 1048, + "time_per_iteration": 2.8471555709838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186895, + "balance_loss_mlp": 1.13675559, + "epoch": 0.2018083878414775, + "flos": 573091490304.0, + "grad_norm": 0.050762349186412876, + "language_loss": 0.83535373, + "learning_rate": 0.0009245770971698735, + "loss": 0.84722269, + "num_input_tokens_seen": 87044144, + "router_z_loss_mlp": 0.50170898, + "step": 1049, + "time_per_iteration": 2.889474630355835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183513, + "balance_loss_mlp": 1.13671136, + "epoch": 0.20200076952674106, + "flos": 425857844736.0, + "grad_norm": 0.07506320746734087, + "language_loss": 0.8918047, + "learning_rate": 0.0009244124752456087, + "loss": 0.90363979, + "num_input_tokens_seen": 87109136, + "router_z_loss_mlp": 0.46826172, + "step": 1050, + "time_per_iteration": 2.5762786865234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205722, + "balance_loss_mlp": 1.15453339, + "epoch": 0.20219315121200462, + "flos": 536597581824.0, + "grad_norm": 0.08917577036116058, + "language_loss": 0.86475039, + "learning_rate": 0.0009242476885498081, + "loss": 0.87680757, + "num_input_tokens_seen": 87184320, + "router_z_loss_mlp": 0.51220703, + "step": 1051, + "time_per_iteration": 2.720395565032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193502, + "balance_loss_mlp": 1.14009643, + "epoch": 0.20238553289726818, + "flos": 477873644544.0, + "grad_norm": 0.08090891256677915, + "language_loss": 0.81871718, + "learning_rate": 0.0009240827371464474, + "loss": 0.83065224, + "num_input_tokens_seen": 87248224, + "router_z_loss_mlp": 0.53442383, + "step": 1052, + "time_per_iteration": 2.535388231277466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162702, + "balance_loss_mlp": 1.11833251, + "epoch": 0.20257791458253174, + "flos": 1152057116160.0, + "grad_norm": 0.08177732735855556, + "language_loss": 0.84886205, + "learning_rate": 0.0009239176210995666, + "loss": 0.86048913, + "num_input_tokens_seen": 87333088, + "router_z_loss_mlp": 0.4440918, + "step": 1053, + "time_per_iteration": 3.4955379962921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148392, + "balance_loss_mlp": 1.0973227, + "epoch": 0.2027702962677953, + "flos": 666913678848.0, + "grad_norm": 0.9822109545682867, + "language_loss": 0.94933617, + "learning_rate": 0.0009237523404732695, + "loss": 0.96082008, + "num_input_tokens_seen": 87413840, + "router_z_loss_mlp": 0.51074219, + "step": 1054, + "time_per_iteration": 2.90132737159729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137464, + "balance_loss_mlp": 1.09118664, + "epoch": 0.20296267795305886, + "flos": 641298746880.0, + "grad_norm": 0.09331279688006895, + "language_loss": 0.85504258, + "learning_rate": 0.0009235868953317235, + "loss": 0.86641729, + "num_input_tokens_seen": 87487168, + "router_z_loss_mlp": 0.46264648, + "step": 1055, + "time_per_iteration": 2.813202381134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212355, + "balance_loss_mlp": 1.16388512, + "epoch": 0.20315505963832242, + "flos": 930575070720.0, + "grad_norm": 0.08645469446577787, + "language_loss": 0.86679947, + "learning_rate": 0.0009234212857391602, + "loss": 0.87892294, + "num_input_tokens_seen": 87573184, + "router_z_loss_mlp": 0.48486328, + "step": 1056, + "time_per_iteration": 3.184723377227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01289494, + "balance_loss_mlp": 1.23723245, + "epoch": 0.20334744132358598, + "flos": 562111197696.0, + "grad_norm": 0.11402704661401492, + "language_loss": 0.90548229, + "learning_rate": 0.000923255511759875, + "loss": 0.91837716, + "num_input_tokens_seen": 87651968, + "router_z_loss_mlp": 0.52319336, + "step": 1057, + "time_per_iteration": 2.8404476642608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01374128, + "balance_loss_mlp": 1.3215096, + "epoch": 0.20353982300884957, + "flos": 644206804992.0, + "grad_norm": 0.12448379126392096, + "language_loss": 0.86306804, + "learning_rate": 0.000923089573458227, + "loss": 0.87680936, + "num_input_tokens_seen": 87727792, + "router_z_loss_mlp": 0.52661133, + "step": 1058, + "time_per_iteration": 2.921942949295044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01411943, + "balance_loss_mlp": 1.35701096, + "epoch": 0.20373220469411313, + "flos": 651421522944.0, + "grad_norm": 0.12614323996078466, + "language_loss": 0.84856015, + "learning_rate": 0.0009229234708986392, + "loss": 0.8626796, + "num_input_tokens_seen": 87806048, + "router_z_loss_mlp": 0.54931641, + "step": 1059, + "time_per_iteration": 2.922795057296753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01629047, + "balance_loss_mlp": 1.60253465, + "epoch": 0.2039245863793767, + "flos": 1437628787712.0, + "grad_norm": 0.12493252943786969, + "language_loss": 0.81666899, + "learning_rate": 0.0009227572041455982, + "loss": 0.83295941, + "num_input_tokens_seen": 88018160, + "router_z_loss_mlp": 0.265625, + "step": 1060, + "time_per_iteration": 4.733684062957764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01333622, + "balance_loss_mlp": 1.27976346, + "epoch": 0.20411696806464025, + "flos": 596967169536.0, + "grad_norm": 0.0936460184690869, + "language_loss": 0.86563337, + "learning_rate": 0.0009225907732636548, + "loss": 0.87896961, + "num_input_tokens_seen": 88090864, + "router_z_loss_mlp": 0.53881836, + "step": 1061, + "time_per_iteration": 2.761353015899658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01296883, + "balance_loss_mlp": 1.24183202, + "epoch": 0.2043093497499038, + "flos": 573803274240.0, + "grad_norm": 0.09002543594031559, + "language_loss": 0.87698424, + "learning_rate": 0.0009224241783174227, + "loss": 0.88995302, + "num_input_tokens_seen": 88161360, + "router_z_loss_mlp": 0.55078125, + "step": 1062, + "time_per_iteration": 2.7161052227020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252808, + "balance_loss_mlp": 1.19947362, + "epoch": 0.20450173143516737, + "flos": 630352958976.0, + "grad_norm": 0.08928798499879465, + "language_loss": 0.87254798, + "learning_rate": 0.0009222574193715802, + "loss": 0.88507611, + "num_input_tokens_seen": 88234960, + "router_z_loss_mlp": 0.53369141, + "step": 1063, + "time_per_iteration": 2.779623031616211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122391, + "balance_loss_mlp": 1.16757131, + "epoch": 0.20469411312043093, + "flos": 574003335168.0, + "grad_norm": 0.06606001070927259, + "language_loss": 0.87212694, + "learning_rate": 0.000922090496490869, + "loss": 0.88436604, + "num_input_tokens_seen": 88308176, + "router_z_loss_mlp": 0.56323242, + "step": 1064, + "time_per_iteration": 2.7111196517944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217897, + "balance_loss_mlp": 1.16120076, + "epoch": 0.20488649480569449, + "flos": 637053755904.0, + "grad_norm": 0.3109146854617931, + "language_loss": 0.90918952, + "learning_rate": 0.0009219234097400937, + "loss": 0.92136848, + "num_input_tokens_seen": 88386768, + "router_z_loss_mlp": 0.56665039, + "step": 1065, + "time_per_iteration": 2.804588556289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245438, + "balance_loss_mlp": 1.18359244, + "epoch": 0.20507887649095807, + "flos": 975793526784.0, + "grad_norm": 0.06908392980849179, + "language_loss": 0.84456235, + "learning_rate": 0.0009217561591841237, + "loss": 0.85701674, + "num_input_tokens_seen": 88476576, + "router_z_loss_mlp": 0.61816406, + "step": 1066, + "time_per_iteration": 3.303875207901001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01287048, + "balance_loss_mlp": 1.21867001, + "epoch": 0.20527125817622163, + "flos": 486183015936.0, + "grad_norm": 0.1162597514909173, + "language_loss": 0.82140827, + "learning_rate": 0.0009215887448878913, + "loss": 0.83427876, + "num_input_tokens_seen": 88541968, + "router_z_loss_mlp": 0.68408203, + "step": 1067, + "time_per_iteration": 2.568690776824951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01293452, + "balance_loss_mlp": 1.22288036, + "epoch": 0.2054636398614852, + "flos": 527178875904.0, + "grad_norm": 0.08586469474305494, + "language_loss": 0.85986763, + "learning_rate": 0.0009214211669163922, + "loss": 0.87280214, + "num_input_tokens_seen": 88615296, + "router_z_loss_mlp": 0.70654297, + "step": 1068, + "time_per_iteration": 2.700090169906616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01279646, + "balance_loss_mlp": 1.21408105, + "epoch": 0.20565602154674875, + "flos": 558182638080.0, + "grad_norm": 0.06609725061841937, + "language_loss": 0.94520444, + "learning_rate": 0.0009212534253346862, + "loss": 0.95800096, + "num_input_tokens_seen": 88691584, + "router_z_loss_mlp": 0.65478516, + "step": 1069, + "time_per_iteration": 2.696699857711792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01285979, + "balance_loss_mlp": 1.21912634, + "epoch": 0.2058484032320123, + "flos": 504224953344.0, + "grad_norm": 0.07442061186670905, + "language_loss": 0.85475862, + "learning_rate": 0.0009210855202078964, + "loss": 0.86761844, + "num_input_tokens_seen": 88756592, + "router_z_loss_mlp": 0.66845703, + "step": 1070, + "time_per_iteration": 2.5769481658935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01284239, + "balance_loss_mlp": 1.21771979, + "epoch": 0.20604078491727587, + "flos": 433169109504.0, + "grad_norm": 0.07631989099853977, + "language_loss": 0.88063252, + "learning_rate": 0.0009209174516012091, + "loss": 0.89347488, + "num_input_tokens_seen": 88820928, + "router_z_loss_mlp": 0.66601562, + "step": 1071, + "time_per_iteration": 2.6154239177703857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261362, + "balance_loss_mlp": 1.19317448, + "epoch": 0.20623316660253943, + "flos": 608711003136.0, + "grad_norm": 0.05883273983798781, + "language_loss": 0.90461957, + "learning_rate": 0.0009207492195798747, + "loss": 0.91723317, + "num_input_tokens_seen": 88895440, + "router_z_loss_mlp": 0.68164062, + "step": 1072, + "time_per_iteration": 2.764965534210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261739, + "balance_loss_mlp": 1.18997467, + "epoch": 0.206425548287803, + "flos": 480425997312.0, + "grad_norm": 0.07316980575900926, + "language_loss": 0.86156094, + "learning_rate": 0.0009205808242092061, + "loss": 0.87417829, + "num_input_tokens_seen": 88964400, + "router_z_loss_mlp": 0.71728516, + "step": 1073, + "time_per_iteration": 2.6222856044769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258718, + "balance_loss_mlp": 1.18952858, + "epoch": 0.20661792997306658, + "flos": 949429734912.0, + "grad_norm": 0.06600331144021966, + "language_loss": 0.83598334, + "learning_rate": 0.0009204122655545808, + "loss": 0.84857053, + "num_input_tokens_seen": 89049600, + "router_z_loss_mlp": 0.69189453, + "step": 1074, + "time_per_iteration": 3.313964605331421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252953, + "balance_loss_mlp": 1.18571925, + "epoch": 0.20681031165833014, + "flos": 603487729152.0, + "grad_norm": 0.06834339296378739, + "language_loss": 0.82186073, + "learning_rate": 0.0009202435436814388, + "loss": 0.83439028, + "num_input_tokens_seen": 89119024, + "router_z_loss_mlp": 0.67236328, + "step": 1075, + "time_per_iteration": 2.68725848197937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260409, + "balance_loss_mlp": 1.1926024, + "epoch": 0.2070026933435937, + "flos": 708984368640.0, + "grad_norm": 0.07476886245144747, + "language_loss": 0.91110998, + "learning_rate": 0.0009200746586552836, + "loss": 0.92371404, + "num_input_tokens_seen": 89197344, + "router_z_loss_mlp": 0.67773438, + "step": 1076, + "time_per_iteration": 2.889910936355591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238308, + "balance_loss_mlp": 1.17145491, + "epoch": 0.20719507502885726, + "flos": 829814948352.0, + "grad_norm": 0.06855298516082668, + "language_loss": 0.84957182, + "learning_rate": 0.0009199056105416825, + "loss": 0.86195493, + "num_input_tokens_seen": 89280464, + "router_z_loss_mlp": 0.66894531, + "step": 1077, + "time_per_iteration": 3.0826096534729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242457, + "balance_loss_mlp": 1.17312455, + "epoch": 0.20738745671412082, + "flos": 638294141952.0, + "grad_norm": 0.0732932371665923, + "language_loss": 0.87494361, + "learning_rate": 0.0009197363994062654, + "loss": 0.8873682, + "num_input_tokens_seen": 89353344, + "router_z_loss_mlp": 0.69287109, + "step": 1078, + "time_per_iteration": 2.814481735229492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121373, + "balance_loss_mlp": 1.15455508, + "epoch": 0.20757983839938438, + "flos": 685602786816.0, + "grad_norm": 0.060498447021287705, + "language_loss": 0.85097158, + "learning_rate": 0.0009195670253147262, + "loss": 0.86310887, + "num_input_tokens_seen": 89439328, + "router_z_loss_mlp": 0.59179688, + "step": 1079, + "time_per_iteration": 2.989818572998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216427, + "balance_loss_mlp": 1.15286458, + "epoch": 0.20777222008464794, + "flos": 519282109440.0, + "grad_norm": 0.0563328194871683, + "language_loss": 0.83052152, + "learning_rate": 0.0009193974883328216, + "loss": 0.84268576, + "num_input_tokens_seen": 89510160, + "router_z_loss_mlp": 0.63574219, + "step": 1080, + "time_per_iteration": 2.611929416656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209864, + "balance_loss_mlp": 1.14553857, + "epoch": 0.2079646017699115, + "flos": 511402595328.0, + "grad_norm": 0.06150097183917509, + "language_loss": 0.87932825, + "learning_rate": 0.0009192277885263718, + "loss": 0.89142686, + "num_input_tokens_seen": 89582960, + "router_z_loss_mlp": 0.64306641, + "step": 1081, + "time_per_iteration": 2.65731143951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198356, + "balance_loss_mlp": 1.13264751, + "epoch": 0.20815698345517505, + "flos": 931820226048.0, + "grad_norm": 0.05302154537588453, + "language_loss": 0.86579674, + "learning_rate": 0.0009190579259612602, + "loss": 0.87778032, + "num_input_tokens_seen": 89675488, + "router_z_loss_mlp": 0.65722656, + "step": 1082, + "time_per_iteration": 3.2999303340911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207666, + "balance_loss_mlp": 1.14300656, + "epoch": 0.20834936514043864, + "flos": 632401302528.0, + "grad_norm": 0.07988409962843289, + "language_loss": 0.87673134, + "learning_rate": 0.000918887900703433, + "loss": 0.88880801, + "num_input_tokens_seen": 89747872, + "router_z_loss_mlp": 0.64648438, + "step": 1083, + "time_per_iteration": 2.7956981658935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204411, + "balance_loss_mlp": 1.14361465, + "epoch": 0.2085417468257022, + "flos": 394384578048.0, + "grad_norm": 0.07357181622228276, + "language_loss": 0.91242653, + "learning_rate": 0.0009187177128188999, + "loss": 0.92447066, + "num_input_tokens_seen": 89810176, + "router_z_loss_mlp": 0.60693359, + "step": 1084, + "time_per_iteration": 2.4656450748443604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194774, + "balance_loss_mlp": 1.16902518, + "epoch": 0.20873412851096576, + "flos": 1402147293696.0, + "grad_norm": 0.038082499218869, + "language_loss": 0.77156538, + "learning_rate": 0.0009185473623737339, + "loss": 0.78351313, + "num_input_tokens_seen": 90038432, + "router_z_loss_mlp": 0.2578125, + "step": 1085, + "time_per_iteration": 4.855400323867798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181967, + "balance_loss_mlp": 1.12419796, + "epoch": 0.20892651019622932, + "flos": 447830913024.0, + "grad_norm": 0.07376491342946172, + "language_loss": 0.86747313, + "learning_rate": 0.000918376849434071, + "loss": 0.87929279, + "num_input_tokens_seen": 90101568, + "router_z_loss_mlp": 0.57739258, + "step": 1086, + "time_per_iteration": 2.5493998527526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192292, + "balance_loss_mlp": 1.1305418, + "epoch": 0.20911889188149288, + "flos": 493106268672.0, + "grad_norm": 0.07728027722551846, + "language_loss": 0.9155581, + "learning_rate": 0.0009182061740661098, + "loss": 0.92748106, + "num_input_tokens_seen": 90169344, + "router_z_loss_mlp": 0.61767578, + "step": 1087, + "time_per_iteration": 2.5755503177642822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192286, + "balance_loss_mlp": 1.13144195, + "epoch": 0.20931127356675644, + "flos": 841291909632.0, + "grad_norm": 0.057753656338862314, + "language_loss": 0.85712528, + "learning_rate": 0.0009180353363361127, + "loss": 0.86904812, + "num_input_tokens_seen": 90252416, + "router_z_loss_mlp": 0.60888672, + "step": 1088, + "time_per_iteration": 3.1143646240234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180296, + "balance_loss_mlp": 1.11868906, + "epoch": 0.20950365525202, + "flos": 757140618240.0, + "grad_norm": 0.07221088423930573, + "language_loss": 0.83469599, + "learning_rate": 0.0009178643363104044, + "loss": 0.84649897, + "num_input_tokens_seen": 90337952, + "router_z_loss_mlp": 0.61621094, + "step": 1089, + "time_per_iteration": 3.092656135559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199477, + "balance_loss_mlp": 1.138394, + "epoch": 0.20969603693728356, + "flos": 472539142656.0, + "grad_norm": 0.08745424257973078, + "language_loss": 0.92463166, + "learning_rate": 0.0009176931740553735, + "loss": 0.93662637, + "num_input_tokens_seen": 90401488, + "router_z_loss_mlp": 0.61083984, + "step": 1090, + "time_per_iteration": 2.53558349609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207875, + "balance_loss_mlp": 1.14850855, + "epoch": 0.20988841862254715, + "flos": 976930025472.0, + "grad_norm": 0.07295134358518522, + "language_loss": 0.83623219, + "learning_rate": 0.0009175218496374708, + "loss": 0.84831095, + "num_input_tokens_seen": 90486144, + "router_z_loss_mlp": 0.59277344, + "step": 1091, + "time_per_iteration": 3.3514459133148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226261, + "balance_loss_mlp": 1.16503549, + "epoch": 0.2100808003078107, + "flos": 1093120634880.0, + "grad_norm": 0.0645587086921242, + "language_loss": 0.86590576, + "learning_rate": 0.0009173503631232103, + "loss": 0.87816834, + "num_input_tokens_seen": 90571504, + "router_z_loss_mlp": 0.61181641, + "step": 1092, + "time_per_iteration": 3.3893167972564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122226, + "balance_loss_mlp": 1.16194034, + "epoch": 0.21027318199307427, + "flos": 1012964714496.0, + "grad_norm": 0.12026645314545058, + "language_loss": 0.8245008, + "learning_rate": 0.0009171787145791691, + "loss": 0.83672333, + "num_input_tokens_seen": 90646016, + "router_z_loss_mlp": 0.60351562, + "step": 1093, + "time_per_iteration": 3.251084327697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251584, + "balance_loss_mlp": 1.18854666, + "epoch": 0.21046556367833782, + "flos": 521394693120.0, + "grad_norm": 0.08481501206118727, + "language_loss": 0.8143028, + "learning_rate": 0.000917006904071987, + "loss": 0.82681859, + "num_input_tokens_seen": 90713440, + "router_z_loss_mlp": 0.63037109, + "step": 1094, + "time_per_iteration": 2.613060712814331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272512, + "balance_loss_mlp": 1.20551634, + "epoch": 0.21065794536360138, + "flos": 603717525504.0, + "grad_norm": 0.08143629367900677, + "language_loss": 0.87639427, + "learning_rate": 0.0009168349316683669, + "loss": 0.88911939, + "num_input_tokens_seen": 90788208, + "router_z_loss_mlp": 0.66992188, + "step": 1095, + "time_per_iteration": 2.705172538757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269472, + "balance_loss_mlp": 1.20462179, + "epoch": 0.21085032704886494, + "flos": 603346765824.0, + "grad_norm": 0.05512017255927588, + "language_loss": 0.83512938, + "learning_rate": 0.0009166627974350741, + "loss": 0.8478241, + "num_input_tokens_seen": 90873776, + "router_z_loss_mlp": 0.64746094, + "step": 1096, + "time_per_iteration": 2.8979411125183105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01259233, + "balance_loss_mlp": 1.19390619, + "epoch": 0.2110427087341285, + "flos": 637671564288.0, + "grad_norm": 0.06519728045913388, + "language_loss": 0.90715098, + "learning_rate": 0.0009164905014389373, + "loss": 0.91974336, + "num_input_tokens_seen": 90945872, + "router_z_loss_mlp": 0.65283203, + "step": 1097, + "time_per_iteration": 2.7965359687805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291926, + "balance_loss_mlp": 1.22445381, + "epoch": 0.21123509041939206, + "flos": 522919203840.0, + "grad_norm": 0.07891140172991894, + "language_loss": 0.87571776, + "learning_rate": 0.0009163180437468476, + "loss": 0.88863701, + "num_input_tokens_seen": 91016224, + "router_z_loss_mlp": 0.67480469, + "step": 1098, + "time_per_iteration": 2.678684949874878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012877, + "balance_loss_mlp": 1.22065675, + "epoch": 0.21142747210465565, + "flos": 451188652032.0, + "grad_norm": 0.06282838131309415, + "language_loss": 0.86816525, + "learning_rate": 0.000916145424425759, + "loss": 0.88104224, + "num_input_tokens_seen": 91086752, + "router_z_loss_mlp": 0.67041016, + "step": 1099, + "time_per_iteration": 2.6685678958892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01305165, + "balance_loss_mlp": 1.23554707, + "epoch": 0.2116198537899192, + "flos": 876175045632.0, + "grad_norm": 0.08616648204830919, + "language_loss": 0.916682, + "learning_rate": 0.0009159726435426885, + "loss": 0.92973363, + "num_input_tokens_seen": 91162960, + "router_z_loss_mlp": 0.69628906, + "step": 1100, + "time_per_iteration": 3.0852713584899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01282199, + "balance_loss_mlp": 1.21677744, + "epoch": 0.21181223547518277, + "flos": 523662921216.0, + "grad_norm": 0.07323647205544051, + "language_loss": 0.91053265, + "learning_rate": 0.0009157997011647154, + "loss": 0.92335469, + "num_input_tokens_seen": 91229840, + "router_z_loss_mlp": 0.65380859, + "step": 1101, + "time_per_iteration": 2.6137943267822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01270647, + "balance_loss_mlp": 1.20784807, + "epoch": 0.21200461716044633, + "flos": 572296015872.0, + "grad_norm": 0.05451247925490285, + "language_loss": 0.87014931, + "learning_rate": 0.0009156265973589817, + "loss": 0.88285577, + "num_input_tokens_seen": 91307936, + "router_z_loss_mlp": 0.62792969, + "step": 1102, + "time_per_iteration": 2.7920916080474854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01255362, + "balance_loss_mlp": 1.1928488, + "epoch": 0.2121969988457099, + "flos": 545129409024.0, + "grad_norm": 0.06310879580708054, + "language_loss": 0.90527534, + "learning_rate": 0.0009154533321926926, + "loss": 0.91782892, + "num_input_tokens_seen": 91372848, + "router_z_loss_mlp": 0.62548828, + "step": 1103, + "time_per_iteration": 2.646440029144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234037, + "balance_loss_mlp": 1.17214394, + "epoch": 0.21238938053097345, + "flos": 843861514752.0, + "grad_norm": 0.07831819024350671, + "language_loss": 0.88472342, + "learning_rate": 0.0009152799057331156, + "loss": 0.89706385, + "num_input_tokens_seen": 91452768, + "router_z_loss_mlp": 0.61865234, + "step": 1104, + "time_per_iteration": 3.122450590133667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214804, + "balance_loss_mlp": 1.15462673, + "epoch": 0.212581762216237, + "flos": 446214998016.0, + "grad_norm": 0.06719929320387279, + "language_loss": 0.91964042, + "learning_rate": 0.0009151063180475805, + "loss": 0.9317885, + "num_input_tokens_seen": 91519888, + "router_z_loss_mlp": 0.6015625, + "step": 1105, + "time_per_iteration": 2.5321173667907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181276, + "balance_loss_mlp": 1.12772751, + "epoch": 0.21277414390150057, + "flos": 514380036096.0, + "grad_norm": 0.07726558156265032, + "language_loss": 0.8518455, + "learning_rate": 0.0009149325692034803, + "loss": 0.86365819, + "num_input_tokens_seen": 91585744, + "router_z_loss_mlp": 0.53613281, + "step": 1106, + "time_per_iteration": 2.6019790172576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129115, + "balance_loss_mlp": 1.10660839, + "epoch": 0.21296652558676413, + "flos": 1485532846080.0, + "grad_norm": 0.0458739418309424, + "language_loss": 0.79203427, + "learning_rate": 0.0009147586592682702, + "loss": 0.80332541, + "num_input_tokens_seen": 91805840, + "router_z_loss_mlp": 0.22460938, + "step": 1107, + "time_per_iteration": 4.859830856323242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180766, + "balance_loss_mlp": 1.12478542, + "epoch": 0.21315890727202771, + "flos": 846040909824.0, + "grad_norm": 0.08338906086238376, + "language_loss": 0.88186961, + "learning_rate": 0.0009145845883094678, + "loss": 0.89367729, + "num_input_tokens_seen": 91885936, + "router_z_loss_mlp": 0.56005859, + "step": 1108, + "time_per_iteration": 3.04249906539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153455, + "balance_loss_mlp": 1.10114598, + "epoch": 0.21335128895729127, + "flos": 629379445248.0, + "grad_norm": 0.07708602471843919, + "language_loss": 0.85793281, + "learning_rate": 0.000914410356394654, + "loss": 0.86946738, + "num_input_tokens_seen": 91959888, + "router_z_loss_mlp": 0.5234375, + "step": 1109, + "time_per_iteration": 4.412867307662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163449, + "balance_loss_mlp": 1.10751617, + "epoch": 0.21354367064255483, + "flos": 710975812608.0, + "grad_norm": 0.08187458054057056, + "language_loss": 0.85334879, + "learning_rate": 0.0009142359635914709, + "loss": 0.86498332, + "num_input_tokens_seen": 92043728, + "router_z_loss_mlp": 0.55957031, + "step": 1110, + "time_per_iteration": 3.023928642272949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148781, + "balance_loss_mlp": 1.09570932, + "epoch": 0.2137360523278184, + "flos": 456201953280.0, + "grad_norm": 0.0669404625356857, + "language_loss": 0.85089076, + "learning_rate": 0.0009140614099676245, + "loss": 0.86237848, + "num_input_tokens_seen": 92114096, + "router_z_loss_mlp": 0.53076172, + "step": 1111, + "time_per_iteration": 2.625797748565674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148537, + "balance_loss_mlp": 1.09632301, + "epoch": 0.21392843401308195, + "flos": 666051393024.0, + "grad_norm": 0.06784083874149466, + "language_loss": 0.83744586, + "learning_rate": 0.0009138866955908821, + "loss": 0.84893119, + "num_input_tokens_seen": 92193552, + "router_z_loss_mlp": 0.52246094, + "step": 1112, + "time_per_iteration": 2.9033186435699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152374, + "balance_loss_mlp": 1.10042286, + "epoch": 0.2141208156983455, + "flos": 748996803072.0, + "grad_norm": 0.0756009236441896, + "language_loss": 0.81778276, + "learning_rate": 0.0009137118205290738, + "loss": 0.82930648, + "num_input_tokens_seen": 92279248, + "router_z_loss_mlp": 0.51977539, + "step": 1113, + "time_per_iteration": 3.00955867767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163421, + "balance_loss_mlp": 1.10677314, + "epoch": 0.21431319738360907, + "flos": 419119971840.0, + "grad_norm": 0.07649003777848401, + "language_loss": 0.90946341, + "learning_rate": 0.0009135367848500924, + "loss": 0.92109764, + "num_input_tokens_seen": 92344064, + "router_z_loss_mlp": 0.56591797, + "step": 1114, + "time_per_iteration": 2.50858211517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167845, + "balance_loss_mlp": 1.11472559, + "epoch": 0.21450557906887263, + "flos": 609126179328.0, + "grad_norm": 0.0823134598214501, + "language_loss": 0.87556803, + "learning_rate": 0.0009133615886218927, + "loss": 0.88724649, + "num_input_tokens_seen": 92410544, + "router_z_loss_mlp": 0.53125, + "step": 1115, + "time_per_iteration": 2.717454195022583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178358, + "balance_loss_mlp": 1.11651218, + "epoch": 0.21469796075413622, + "flos": 561913708032.0, + "grad_norm": 0.06887665628973552, + "language_loss": 0.89567351, + "learning_rate": 0.0009131862319124917, + "loss": 0.90745711, + "num_input_tokens_seen": 92480272, + "router_z_loss_mlp": 0.61816406, + "step": 1116, + "time_per_iteration": 2.623767852783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176568, + "balance_loss_mlp": 1.1235671, + "epoch": 0.21489034243939978, + "flos": 594637272576.0, + "grad_norm": 0.08365937432877864, + "language_loss": 0.85244483, + "learning_rate": 0.0009130107147899691, + "loss": 0.86421049, + "num_input_tokens_seen": 92555584, + "router_z_loss_mlp": 0.53051758, + "step": 1117, + "time_per_iteration": 2.795011281967163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178642, + "balance_loss_mlp": 1.12561774, + "epoch": 0.21508272412466334, + "flos": 441898426368.0, + "grad_norm": 0.06665693704910039, + "language_loss": 0.8600654, + "learning_rate": 0.0009128350373224665, + "loss": 0.8718518, + "num_input_tokens_seen": 92623136, + "router_z_loss_mlp": 0.53076172, + "step": 1118, + "time_per_iteration": 2.5644795894622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011837, + "balance_loss_mlp": 1.15928602, + "epoch": 0.2152751058099269, + "flos": 1496162202624.0, + "grad_norm": 0.058896568697900505, + "language_loss": 0.81456429, + "learning_rate": 0.0009126591995781883, + "loss": 0.82640129, + "num_input_tokens_seen": 92842608, + "router_z_loss_mlp": 0.24414062, + "step": 1119, + "time_per_iteration": 4.683669090270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204932, + "balance_loss_mlp": 1.15031052, + "epoch": 0.21546748749519046, + "flos": 494005630464.0, + "grad_norm": 0.07135490421069918, + "language_loss": 0.85804355, + "learning_rate": 0.0009124832016254005, + "loss": 0.87009287, + "num_input_tokens_seen": 92912960, + "router_z_loss_mlp": 0.54663086, + "step": 1120, + "time_per_iteration": 2.6158647537231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206508, + "balance_loss_mlp": 1.14571166, + "epoch": 0.21565986918045402, + "flos": 634531138560.0, + "grad_norm": 0.055578106746994274, + "language_loss": 0.89113355, + "learning_rate": 0.0009123070435324316, + "loss": 0.9031986, + "num_input_tokens_seen": 92982272, + "router_z_loss_mlp": 0.60791016, + "step": 1121, + "time_per_iteration": 2.755823850631714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102102, + "balance_loss_mlp": 1.07988179, + "epoch": 0.21585225086571758, + "flos": 1583359570944.0, + "grad_norm": 0.03051163671975961, + "language_loss": 0.77875781, + "learning_rate": 0.0009121307253676722, + "loss": 0.78977883, + "num_input_tokens_seen": 93218752, + "router_z_loss_mlp": 0.22265625, + "step": 1122, + "time_per_iteration": 4.996071100234985 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211089, + "balance_loss_mlp": 1.15358257, + "epoch": 0.21604463255098114, + "flos": 684103242240.0, + "grad_norm": 0.06035521524280068, + "language_loss": 0.87145722, + "learning_rate": 0.0009119542471995752, + "loss": 0.88356811, + "num_input_tokens_seen": 93293968, + "router_z_loss_mlp": 0.57446289, + "step": 1123, + "time_per_iteration": 2.8323612213134766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204972, + "balance_loss_mlp": 1.14675009, + "epoch": 0.2162370142362447, + "flos": 780989133312.0, + "grad_norm": 0.060035653180353525, + "language_loss": 0.8248235, + "learning_rate": 0.0009117776090966554, + "loss": 0.83687323, + "num_input_tokens_seen": 93367088, + "router_z_loss_mlp": 0.58251953, + "step": 1124, + "time_per_iteration": 2.954216480255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216387, + "balance_loss_mlp": 1.1558764, + "epoch": 0.21642939592150828, + "flos": 1002147406848.0, + "grad_norm": 0.07791040933307145, + "language_loss": 0.876288, + "learning_rate": 0.0009116008111274899, + "loss": 0.88845193, + "num_input_tokens_seen": 93452944, + "router_z_loss_mlp": 0.60498047, + "step": 1125, + "time_per_iteration": 3.2826616764068604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102381, + "balance_loss_mlp": 1.08216333, + "epoch": 0.21662177760677184, + "flos": 1482644238336.0, + "grad_norm": 0.030294405796961115, + "language_loss": 0.79106927, + "learning_rate": 0.0009114238533607176, + "loss": 0.80209303, + "num_input_tokens_seen": 93677328, + "router_z_loss_mlp": 0.20214844, + "step": 1126, + "time_per_iteration": 4.8284173011779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202163, + "balance_loss_mlp": 1.1455152, + "epoch": 0.2168141592920354, + "flos": 887395046400.0, + "grad_norm": 0.10762952047928877, + "language_loss": 0.8553561, + "learning_rate": 0.0009112467358650396, + "loss": 0.86737764, + "num_input_tokens_seen": 93756848, + "router_z_loss_mlp": 0.56640625, + "step": 1127, + "time_per_iteration": 3.1621291637420654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192643, + "balance_loss_mlp": 1.13561273, + "epoch": 0.21700654097729896, + "flos": 545961959424.0, + "grad_norm": 0.06435190440672867, + "language_loss": 0.87181705, + "learning_rate": 0.0009110694587092192, + "loss": 0.88374346, + "num_input_tokens_seen": 93834704, + "router_z_loss_mlp": 0.56982422, + "step": 1128, + "time_per_iteration": 2.7597765922546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194699, + "balance_loss_mlp": 1.13452196, + "epoch": 0.21719892266256252, + "flos": 509522379264.0, + "grad_norm": 0.06894978951163175, + "language_loss": 0.8223331, + "learning_rate": 0.0009108920219620815, + "loss": 0.83428001, + "num_input_tokens_seen": 93904448, + "router_z_loss_mlp": 0.6015625, + "step": 1129, + "time_per_iteration": 2.6658482551574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198898, + "balance_loss_mlp": 1.14072335, + "epoch": 0.21739130434782608, + "flos": 543412177920.0, + "grad_norm": 0.06550313542995663, + "language_loss": 0.90210444, + "learning_rate": 0.0009107144256925133, + "loss": 0.91409343, + "num_input_tokens_seen": 93979312, + "router_z_loss_mlp": 0.58154297, + "step": 1130, + "time_per_iteration": 2.7298777103424072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211101, + "balance_loss_mlp": 1.15464389, + "epoch": 0.21758368603308964, + "flos": 616847477760.0, + "grad_norm": 0.08430456831611369, + "language_loss": 0.82975614, + "learning_rate": 0.0009105366699694638, + "loss": 0.84186715, + "num_input_tokens_seen": 94052032, + "router_z_loss_mlp": 0.56445312, + "step": 1131, + "time_per_iteration": 2.7422807216644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121305, + "balance_loss_mlp": 1.15263498, + "epoch": 0.2177760677183532, + "flos": 635116640256.0, + "grad_norm": 0.05499133039406014, + "language_loss": 0.82219702, + "learning_rate": 0.0009103587548619439, + "loss": 0.83432752, + "num_input_tokens_seen": 94124944, + "router_z_loss_mlp": 0.60400391, + "step": 1132, + "time_per_iteration": 2.8834011554718018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202147, + "balance_loss_mlp": 1.14468873, + "epoch": 0.2179684494036168, + "flos": 532463818752.0, + "grad_norm": 0.12855794167944481, + "language_loss": 0.87174821, + "learning_rate": 0.0009101806804390261, + "loss": 0.88376963, + "num_input_tokens_seen": 94200384, + "router_z_loss_mlp": 0.57421875, + "step": 1133, + "time_per_iteration": 2.8493435382843018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186046, + "balance_loss_mlp": 1.13082814, + "epoch": 0.21816083108888035, + "flos": 475219975680.0, + "grad_norm": 0.07046865468216726, + "language_loss": 0.91345453, + "learning_rate": 0.0009100024467698453, + "loss": 0.92531502, + "num_input_tokens_seen": 94266992, + "router_z_loss_mlp": 0.55175781, + "step": 1134, + "time_per_iteration": 2.6036603450775146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184337, + "balance_loss_mlp": 1.12613893, + "epoch": 0.2183532127741439, + "flos": 577467532800.0, + "grad_norm": 0.07929007457036284, + "language_loss": 0.8353889, + "learning_rate": 0.0009098240539235981, + "loss": 0.84723222, + "num_input_tokens_seen": 94334304, + "router_z_loss_mlp": 0.58227539, + "step": 1135, + "time_per_iteration": 2.6736483573913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176396, + "balance_loss_mlp": 1.12122619, + "epoch": 0.21854559445940747, + "flos": 594120780288.0, + "grad_norm": 0.06661367385494366, + "language_loss": 0.88575935, + "learning_rate": 0.0009096455019695423, + "loss": 0.89752328, + "num_input_tokens_seen": 94413296, + "router_z_loss_mlp": 0.55224609, + "step": 1136, + "time_per_iteration": 2.8438823223114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172318, + "balance_loss_mlp": 1.1156702, + "epoch": 0.21873797614467103, + "flos": 408680764416.0, + "grad_norm": 0.07075177433605506, + "language_loss": 0.90707165, + "learning_rate": 0.000909466790976998, + "loss": 0.91879487, + "num_input_tokens_seen": 94475840, + "router_z_loss_mlp": 0.56616211, + "step": 1137, + "time_per_iteration": 2.4795870780944824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185042, + "balance_loss_mlp": 1.12801182, + "epoch": 0.21893035782993459, + "flos": 894189818880.0, + "grad_norm": 0.07051320604800417, + "language_loss": 0.83409071, + "learning_rate": 0.0009092879210153473, + "loss": 0.84594113, + "num_input_tokens_seen": 94555184, + "router_z_loss_mlp": 0.57080078, + "step": 1138, + "time_per_iteration": 3.1328911781311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186779, + "balance_loss_mlp": 1.13284826, + "epoch": 0.21912273951519814, + "flos": 467627157504.0, + "grad_norm": 0.06458215213012623, + "language_loss": 0.89566886, + "learning_rate": 0.0009091088921540333, + "loss": 0.90753663, + "num_input_tokens_seen": 94622656, + "router_z_loss_mlp": 0.54003906, + "step": 1139, + "time_per_iteration": 2.5608675479888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046887, + "balance_loss_mlp": 1.03115106, + "epoch": 0.2193151212004617, + "flos": 1532043445248.0, + "grad_norm": 0.027642480599540168, + "language_loss": 0.75508678, + "learning_rate": 0.0009089297044625615, + "loss": 0.76555562, + "num_input_tokens_seen": 94856496, + "router_z_loss_mlp": 0.15722656, + "step": 1140, + "time_per_iteration": 4.908522605895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117392, + "balance_loss_mlp": 1.11908412, + "epoch": 0.2195075028857253, + "flos": 591175646208.0, + "grad_norm": 0.0906322081519832, + "language_loss": 0.84775734, + "learning_rate": 0.0009087503580104985, + "loss": 0.85949653, + "num_input_tokens_seen": 94926880, + "router_z_loss_mlp": 0.54882812, + "step": 1141, + "time_per_iteration": 2.696129083633423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181851, + "balance_loss_mlp": 1.12558413, + "epoch": 0.21969988457098885, + "flos": 636329862144.0, + "grad_norm": 0.16226849767110665, + "language_loss": 0.80068243, + "learning_rate": 0.0009085708528674728, + "loss": 0.81250095, + "num_input_tokens_seen": 95000528, + "router_z_loss_mlp": 0.56347656, + "step": 1142, + "time_per_iteration": 2.7995505332946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157793, + "balance_loss_mlp": 1.09985733, + "epoch": 0.2198922662562524, + "flos": 912350324736.0, + "grad_norm": 0.08217329602320493, + "language_loss": 0.874843, + "learning_rate": 0.0009083911891031745, + "loss": 0.88642091, + "num_input_tokens_seen": 95081040, + "router_z_loss_mlp": 0.57958984, + "step": 1143, + "time_per_iteration": 3.1351919174194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115076, + "balance_loss_mlp": 1.09578109, + "epoch": 0.22008464794151597, + "flos": 822980528640.0, + "grad_norm": 0.06169995263224583, + "language_loss": 0.92273706, + "learning_rate": 0.0009082113667873553, + "loss": 0.93424463, + "num_input_tokens_seen": 95167328, + "router_z_loss_mlp": 0.55029297, + "step": 1144, + "time_per_iteration": 3.1171934604644775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153616, + "balance_loss_mlp": 1.10087752, + "epoch": 0.22027702962677953, + "flos": 459656239104.0, + "grad_norm": 0.07183124767141379, + "language_loss": 0.91221762, + "learning_rate": 0.0009080313859898283, + "loss": 0.9237538, + "num_input_tokens_seen": 95230304, + "router_z_loss_mlp": 0.52758789, + "step": 1145, + "time_per_iteration": 2.506591796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153774, + "balance_loss_mlp": 1.09986758, + "epoch": 0.2204694113120431, + "flos": 531255739392.0, + "grad_norm": 0.07077080612529597, + "language_loss": 0.92340779, + "learning_rate": 0.0009078512467804684, + "loss": 0.93494552, + "num_input_tokens_seen": 95299520, + "router_z_loss_mlp": 0.53881836, + "step": 1146, + "time_per_iteration": 2.591327667236328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172392, + "balance_loss_mlp": 1.11800838, + "epoch": 0.22066179299730665, + "flos": 522642419712.0, + "grad_norm": 0.07651793216141736, + "language_loss": 0.91144007, + "learning_rate": 0.0009076709492292119, + "loss": 0.92316401, + "num_input_tokens_seen": 95368912, + "router_z_loss_mlp": 0.54418945, + "step": 1147, + "time_per_iteration": 2.609628438949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169189, + "balance_loss_mlp": 1.11723804, + "epoch": 0.2208541746825702, + "flos": 546451287552.0, + "grad_norm": 0.07920780045429675, + "language_loss": 0.89603102, + "learning_rate": 0.0009074904934060562, + "loss": 0.90772295, + "num_input_tokens_seen": 95440800, + "router_z_loss_mlp": 0.51928711, + "step": 1148, + "time_per_iteration": 2.6755712032318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173959, + "balance_loss_mlp": 1.11697721, + "epoch": 0.22104655636783377, + "flos": 708734748672.0, + "grad_norm": 0.08245317941840166, + "language_loss": 0.8559376, + "learning_rate": 0.0009073098793810607, + "loss": 0.86767721, + "num_input_tokens_seen": 95519904, + "router_z_loss_mlp": 0.57006836, + "step": 1149, + "time_per_iteration": 2.9874348640441895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177675, + "balance_loss_mlp": 1.12293434, + "epoch": 0.22123893805309736, + "flos": 584867630592.0, + "grad_norm": 0.08525751827962168, + "language_loss": 0.88982397, + "learning_rate": 0.000907129107224346, + "loss": 0.90160072, + "num_input_tokens_seen": 95591568, + "router_z_loss_mlp": 0.54785156, + "step": 1150, + "time_per_iteration": 2.739461660385132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180589, + "balance_loss_mlp": 1.12658715, + "epoch": 0.22143131973836092, + "flos": 492251323392.0, + "grad_norm": 0.05205595876874212, + "language_loss": 0.88991034, + "learning_rate": 0.0009069481770060939, + "loss": 0.90171623, + "num_input_tokens_seen": 95664480, + "router_z_loss_mlp": 0.54077148, + "step": 1151, + "time_per_iteration": 2.7024669647216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187248, + "balance_loss_mlp": 1.13212562, + "epoch": 0.22162370142362448, + "flos": 1079674251264.0, + "grad_norm": 0.06739531662392768, + "language_loss": 0.84448045, + "learning_rate": 0.000906767088796548, + "loss": 0.85635293, + "num_input_tokens_seen": 95754400, + "router_z_loss_mlp": 0.55126953, + "step": 1152, + "time_per_iteration": 3.4467508792877197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117836, + "balance_loss_mlp": 1.12571764, + "epoch": 0.22181608310888803, + "flos": 492508283904.0, + "grad_norm": 0.05411857974090042, + "language_loss": 0.8779093, + "learning_rate": 0.0009065858426660127, + "loss": 0.8896929, + "num_input_tokens_seen": 95826944, + "router_z_loss_mlp": 0.52661133, + "step": 1153, + "time_per_iteration": 2.6216752529144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182109, + "balance_loss_mlp": 1.12736845, + "epoch": 0.2220084647941516, + "flos": 724014360576.0, + "grad_norm": 0.07769931213358174, + "language_loss": 0.84979808, + "learning_rate": 0.0009064044386848543, + "loss": 0.86161917, + "num_input_tokens_seen": 95902688, + "router_z_loss_mlp": 0.54833984, + "step": 1154, + "time_per_iteration": 2.91601824760437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172512, + "balance_loss_mlp": 1.11381316, + "epoch": 0.22220084647941515, + "flos": 489239377920.0, + "grad_norm": 0.0711084155390928, + "language_loss": 0.89741302, + "learning_rate": 0.0009062228769234997, + "loss": 0.90913814, + "num_input_tokens_seen": 95969952, + "router_z_loss_mlp": 0.58691406, + "step": 1155, + "time_per_iteration": 2.5972864627838135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116208, + "balance_loss_mlp": 1.10690951, + "epoch": 0.2223932281646787, + "flos": 536278952448.0, + "grad_norm": 0.09100503083112628, + "language_loss": 0.81526613, + "learning_rate": 0.0009060411574524376, + "loss": 0.82688695, + "num_input_tokens_seen": 96037344, + "router_z_loss_mlp": 0.55224609, + "step": 1156, + "time_per_iteration": 2.6763274669647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182591, + "balance_loss_mlp": 1.12684917, + "epoch": 0.22258560984994227, + "flos": 931420104192.0, + "grad_norm": 0.06563385289017937, + "language_loss": 0.88585329, + "learning_rate": 0.0009058592803422178, + "loss": 0.89767921, + "num_input_tokens_seen": 96115616, + "router_z_loss_mlp": 0.55810547, + "step": 1157, + "time_per_iteration": 3.1414153575897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026819, + "balance_loss_mlp": 1.00955701, + "epoch": 0.22277799153520586, + "flos": 1199675930112.0, + "grad_norm": 0.012760142008093896, + "language_loss": 0.78710288, + "learning_rate": 0.0009056772456634512, + "loss": 0.79737109, + "num_input_tokens_seen": 96333600, + "router_z_loss_mlp": 0.17285156, + "step": 1158, + "time_per_iteration": 4.802858352661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171905, + "balance_loss_mlp": 1.12126482, + "epoch": 0.22297037322046942, + "flos": 501304412160.0, + "grad_norm": 0.060083734909452326, + "language_loss": 0.90886426, + "learning_rate": 0.00090549505348681, + "loss": 0.92058331, + "num_input_tokens_seen": 96402544, + "router_z_loss_mlp": 0.50683594, + "step": 1159, + "time_per_iteration": 2.5810928344726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168069, + "balance_loss_mlp": 1.11137354, + "epoch": 0.22316275490573298, + "flos": 752752465920.0, + "grad_norm": 0.07069918091424116, + "language_loss": 0.85149121, + "learning_rate": 0.0009053127038830275, + "loss": 0.86317194, + "num_input_tokens_seen": 96487600, + "router_z_loss_mlp": 0.56689453, + "step": 1160, + "time_per_iteration": 3.009434223175049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162107, + "balance_loss_mlp": 1.1050297, + "epoch": 0.22335513659099654, + "flos": 514802552832.0, + "grad_norm": 0.07200535138488619, + "language_loss": 0.87409687, + "learning_rate": 0.000905130196922898, + "loss": 0.88571799, + "num_input_tokens_seen": 96554912, + "router_z_loss_mlp": 0.57080078, + "step": 1161, + "time_per_iteration": 2.5972068309783936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157457, + "balance_loss_mlp": 1.10223973, + "epoch": 0.2235475182762601, + "flos": 484530024960.0, + "grad_norm": 0.053497533436564174, + "language_loss": 0.8808614, + "learning_rate": 0.0009049475326772769, + "loss": 0.89243597, + "num_input_tokens_seen": 96624192, + "router_z_loss_mlp": 0.55224609, + "step": 1162, + "time_per_iteration": 2.580254316329956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167432, + "balance_loss_mlp": 1.11092722, + "epoch": 0.22373989996152366, + "flos": 469971735552.0, + "grad_norm": 0.105825736895628, + "language_loss": 0.83639884, + "learning_rate": 0.0009047647112170811, + "loss": 0.84807312, + "num_input_tokens_seen": 96701040, + "router_z_loss_mlp": 0.56469727, + "step": 1163, + "time_per_iteration": 2.7509572505950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170262, + "balance_loss_mlp": 1.11041939, + "epoch": 0.22393228164678722, + "flos": 1271012249088.0, + "grad_norm": 0.11729347611284674, + "language_loss": 0.8833853, + "learning_rate": 0.0009045817326132876, + "loss": 0.89508796, + "num_input_tokens_seen": 96791200, + "router_z_loss_mlp": 0.59814453, + "step": 1164, + "time_per_iteration": 3.6648380756378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170775, + "balance_loss_mlp": 1.11226714, + "epoch": 0.22412466333205078, + "flos": 596334680064.0, + "grad_norm": 0.05704665841604838, + "language_loss": 0.83974147, + "learning_rate": 0.0009043985969369357, + "loss": 0.85144925, + "num_input_tokens_seen": 96869360, + "router_z_loss_mlp": 0.58544922, + "step": 1165, + "time_per_iteration": 2.868560314178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176977, + "balance_loss_mlp": 1.11665666, + "epoch": 0.22431704501731436, + "flos": 608434219008.0, + "grad_norm": 0.059940537627208516, + "language_loss": 0.84960037, + "learning_rate": 0.0009042153042591245, + "loss": 0.86137015, + "num_input_tokens_seen": 96945840, + "router_z_loss_mlp": 0.60302734, + "step": 1166, + "time_per_iteration": 2.8023743629455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116839, + "balance_loss_mlp": 1.11271954, + "epoch": 0.22450942670257792, + "flos": 906583394304.0, + "grad_norm": 0.054742371261080745, + "language_loss": 0.85761929, + "learning_rate": 0.0009040318546510146, + "loss": 0.86930317, + "num_input_tokens_seen": 97029296, + "router_z_loss_mlp": 0.55639648, + "step": 1167, + "time_per_iteration": 3.141993999481201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117745, + "balance_loss_mlp": 1.1215651, + "epoch": 0.22470180838784148, + "flos": 565301182464.0, + "grad_norm": 0.07712318573741421, + "language_loss": 0.8582288, + "learning_rate": 0.0009038482481838275, + "loss": 0.87000328, + "num_input_tokens_seen": 97097776, + "router_z_loss_mlp": 0.55957031, + "step": 1168, + "time_per_iteration": 2.675204038619995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116517, + "balance_loss_mlp": 1.1128844, + "epoch": 0.22489419007310504, + "flos": 834469972992.0, + "grad_norm": 0.05640688657343365, + "language_loss": 0.88303328, + "learning_rate": 0.0009036644849288455, + "loss": 0.89468497, + "num_input_tokens_seen": 97181424, + "router_z_loss_mlp": 0.52319336, + "step": 1169, + "time_per_iteration": 3.0777511596679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148681, + "balance_loss_mlp": 1.09441662, + "epoch": 0.2250865717583686, + "flos": 581057639424.0, + "grad_norm": 0.07174166621143864, + "language_loss": 0.86291218, + "learning_rate": 0.0009034805649574118, + "loss": 0.87439895, + "num_input_tokens_seen": 97252128, + "router_z_loss_mlp": 0.54394531, + "step": 1170, + "time_per_iteration": 2.7120091915130615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157496, + "balance_loss_mlp": 1.10513926, + "epoch": 0.22527895344363216, + "flos": 600406401024.0, + "grad_norm": 0.05497638968028837, + "language_loss": 0.85883957, + "learning_rate": 0.0009032964883409308, + "loss": 0.87041461, + "num_input_tokens_seen": 97326640, + "router_z_loss_mlp": 0.52441406, + "step": 1171, + "time_per_iteration": 2.8770556449890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104728, + "balance_loss_mlp": 1.03001809, + "epoch": 0.22547133512889572, + "flos": 1440751587840.0, + "grad_norm": 0.027786176955518046, + "language_loss": 0.73050535, + "learning_rate": 0.000903112255150867, + "loss": 0.74097812, + "num_input_tokens_seen": 97553952, + "router_z_loss_mlp": 0.17285156, + "step": 1172, + "time_per_iteration": 4.997943639755249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150837, + "balance_loss_mlp": 1.0977174, + "epoch": 0.22566371681415928, + "flos": 490618156032.0, + "grad_norm": 0.06380875138992877, + "language_loss": 0.87640917, + "learning_rate": 0.0009029278654587462, + "loss": 0.88791752, + "num_input_tokens_seen": 97623584, + "router_z_loss_mlp": 0.53173828, + "step": 1173, + "time_per_iteration": 2.6070940494537354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148484, + "balance_loss_mlp": 1.09546018, + "epoch": 0.22585609849942284, + "flos": 604616887296.0, + "grad_norm": 0.057211485944593306, + "language_loss": 0.83189976, + "learning_rate": 0.0009027433193361548, + "loss": 0.84338462, + "num_input_tokens_seen": 97695952, + "router_z_loss_mlp": 0.53027344, + "step": 1174, + "time_per_iteration": 2.7072205543518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114932, + "balance_loss_mlp": 1.09708285, + "epoch": 0.22604848018468643, + "flos": 635568892416.0, + "grad_norm": 0.06182212989299174, + "language_loss": 0.86948568, + "learning_rate": 0.00090255861685474, + "loss": 0.88097882, + "num_input_tokens_seen": 97764544, + "router_z_loss_mlp": 0.52246094, + "step": 1175, + "time_per_iteration": 2.7387607097625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146248, + "balance_loss_mlp": 1.09284246, + "epoch": 0.22624086186995, + "flos": 479875000320.0, + "grad_norm": 0.06871471519475823, + "language_loss": 0.92170686, + "learning_rate": 0.0009023737580862095, + "loss": 0.93316931, + "num_input_tokens_seen": 97830976, + "router_z_loss_mlp": 0.53442383, + "step": 1176, + "time_per_iteration": 2.6016342639923096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160546, + "balance_loss_mlp": 1.11035883, + "epoch": 0.22643324355521355, + "flos": 495814265856.0, + "grad_norm": 0.0563237464245993, + "language_loss": 0.83948356, + "learning_rate": 0.0009021887431023321, + "loss": 0.851089, + "num_input_tokens_seen": 97898800, + "router_z_loss_mlp": 0.50219727, + "step": 1177, + "time_per_iteration": 2.5911412239074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161678, + "balance_loss_mlp": 1.11063254, + "epoch": 0.2266256252404771, + "flos": 561552860160.0, + "grad_norm": 0.06510699727290163, + "language_loss": 0.88054293, + "learning_rate": 0.0009020035719749369, + "loss": 0.8921597, + "num_input_tokens_seen": 97974112, + "router_z_loss_mlp": 0.51098633, + "step": 1178, + "time_per_iteration": 2.747715473175049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182179, + "balance_loss_mlp": 1.1255312, + "epoch": 0.22681800692574067, + "flos": 579688399872.0, + "grad_norm": 0.0760827261000747, + "language_loss": 0.78592283, + "learning_rate": 0.0009018182447759136, + "loss": 0.79774463, + "num_input_tokens_seen": 98056640, + "router_z_loss_mlp": 0.56616211, + "step": 1179, + "time_per_iteration": 2.9912376403808594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177946, + "balance_loss_mlp": 1.12287188, + "epoch": 0.22701038861100423, + "flos": 740166170112.0, + "grad_norm": 0.05857060866656224, + "language_loss": 0.80403864, + "learning_rate": 0.0009016327615772126, + "loss": 0.81581813, + "num_input_tokens_seen": 98135952, + "router_z_loss_mlp": 0.55126953, + "step": 1180, + "time_per_iteration": 2.951934337615967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178867, + "balance_loss_mlp": 1.1241498, + "epoch": 0.2272027702962678, + "flos": 577257560064.0, + "grad_norm": 0.07803208794693026, + "language_loss": 0.88654709, + "learning_rate": 0.0009014471224508451, + "loss": 0.8983357, + "num_input_tokens_seen": 98204288, + "router_z_loss_mlp": 0.54711914, + "step": 1181, + "time_per_iteration": 2.6834704875946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175396, + "balance_loss_mlp": 1.12280107, + "epoch": 0.22739515198153135, + "flos": 544267123200.0, + "grad_norm": 0.07891792311297686, + "language_loss": 0.84171915, + "learning_rate": 0.0009012613274688823, + "loss": 0.85347319, + "num_input_tokens_seen": 98269856, + "router_z_loss_mlp": 0.52636719, + "step": 1182, + "time_per_iteration": 2.6773135662078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193932, + "balance_loss_mlp": 1.13711679, + "epoch": 0.22758753366679493, + "flos": 440163942912.0, + "grad_norm": 0.06685387295915801, + "language_loss": 0.88334668, + "learning_rate": 0.0009010753767034565, + "loss": 0.89528602, + "num_input_tokens_seen": 98335632, + "router_z_loss_mlp": 0.56811523, + "step": 1183, + "time_per_iteration": 2.53671932220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192681, + "balance_loss_mlp": 1.13732028, + "epoch": 0.2277799153520585, + "flos": 729447607296.0, + "grad_norm": 0.05676884979808662, + "language_loss": 0.79381895, + "learning_rate": 0.0009008892702267599, + "loss": 0.80574578, + "num_input_tokens_seen": 98420592, + "router_z_loss_mlp": 0.55297852, + "step": 1184, + "time_per_iteration": 2.9609317779541016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218637, + "balance_loss_mlp": 1.16055822, + "epoch": 0.22797229703732205, + "flos": 526894751232.0, + "grad_norm": 0.11080255811352213, + "language_loss": 0.897048, + "learning_rate": 0.0009007030081110457, + "loss": 0.9092344, + "num_input_tokens_seen": 98488096, + "router_z_loss_mlp": 0.58105469, + "step": 1185, + "time_per_iteration": 2.6087658405303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212656, + "balance_loss_mlp": 1.15872598, + "epoch": 0.2281646787225856, + "flos": 535431347712.0, + "grad_norm": 0.06215110995007368, + "language_loss": 0.8510564, + "learning_rate": 0.000900516590428627, + "loss": 0.8631829, + "num_input_tokens_seen": 98561664, + "router_z_loss_mlp": 0.53955078, + "step": 1186, + "time_per_iteration": 2.66469407081604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206955, + "balance_loss_mlp": 1.15416956, + "epoch": 0.22835706040784917, + "flos": 541381086720.0, + "grad_norm": 0.07510292852734143, + "language_loss": 0.90231287, + "learning_rate": 0.0009003300172518778, + "loss": 0.91438246, + "num_input_tokens_seen": 98634336, + "router_z_loss_mlp": 0.52807617, + "step": 1187, + "time_per_iteration": 2.6872987747192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189379, + "balance_loss_mlp": 1.13559163, + "epoch": 0.22854944209311273, + "flos": 790637635584.0, + "grad_norm": 0.06187047573177096, + "language_loss": 0.84854043, + "learning_rate": 0.0009001432886532321, + "loss": 0.86043417, + "num_input_tokens_seen": 98709600, + "router_z_loss_mlp": 0.53808594, + "step": 1188, + "time_per_iteration": 2.961327314376831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185796, + "balance_loss_mlp": 1.13248527, + "epoch": 0.2287418237783763, + "flos": 469280148480.0, + "grad_norm": 0.0670290505569486, + "language_loss": 0.87277937, + "learning_rate": 0.0008999564047051843, + "loss": 0.88463724, + "num_input_tokens_seen": 98775024, + "router_z_loss_mlp": 0.53320312, + "step": 1189, + "time_per_iteration": 2.5120058059692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119823, + "balance_loss_mlp": 1.14773321, + "epoch": 0.22893420546363985, + "flos": 468029850624.0, + "grad_norm": 0.07775817493182749, + "language_loss": 0.85562766, + "learning_rate": 0.0008997693654802894, + "loss": 0.86760998, + "num_input_tokens_seen": 98845248, + "router_z_loss_mlp": 0.50537109, + "step": 1190, + "time_per_iteration": 2.6584115028381348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203195, + "balance_loss_mlp": 1.15343666, + "epoch": 0.22912658714890344, + "flos": 626258843136.0, + "grad_norm": 0.08092173087187808, + "language_loss": 0.87245274, + "learning_rate": 0.0008995821710511625, + "loss": 0.88448465, + "num_input_tokens_seen": 98913584, + "router_z_loss_mlp": 0.49780273, + "step": 1191, + "time_per_iteration": 2.75514817237854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189711, + "balance_loss_mlp": 1.14376771, + "epoch": 0.229318968834167, + "flos": 503031555072.0, + "grad_norm": 0.058050392882622655, + "language_loss": 0.85975361, + "learning_rate": 0.0008993948214904786, + "loss": 0.8716507, + "num_input_tokens_seen": 98978608, + "router_z_loss_mlp": 0.45922852, + "step": 1192, + "time_per_iteration": 2.5808064937591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132885, + "balance_loss_mlp": 1.11629128, + "epoch": 0.22951135051943056, + "flos": 1374827613696.0, + "grad_norm": 0.04438752541684951, + "language_loss": 0.78422213, + "learning_rate": 0.0008992073168709733, + "loss": 0.795551, + "num_input_tokens_seen": 99207424, + "router_z_loss_mlp": 0.16601562, + "step": 1193, + "time_per_iteration": 4.915351629257202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170271, + "balance_loss_mlp": 1.11338401, + "epoch": 0.22970373220469412, + "flos": 644345197056.0, + "grad_norm": 0.06516354982073377, + "language_loss": 0.79226351, + "learning_rate": 0.0008990196572654427, + "loss": 0.80396616, + "num_input_tokens_seen": 99290592, + "router_z_loss_mlp": 0.56933594, + "step": 1194, + "time_per_iteration": 2.914353609085083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159508, + "balance_loss_mlp": 1.10982203, + "epoch": 0.22989611388995768, + "flos": 500209758720.0, + "grad_norm": 0.053033431306196574, + "language_loss": 0.88186455, + "learning_rate": 0.0008988318427467426, + "loss": 0.89345956, + "num_input_tokens_seen": 99366096, + "router_z_loss_mlp": 0.49707031, + "step": 1195, + "time_per_iteration": 2.763303756713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146949, + "balance_loss_mlp": 1.09754825, + "epoch": 0.23008849557522124, + "flos": 1096522790400.0, + "grad_norm": 0.06471781599702997, + "language_loss": 0.87142104, + "learning_rate": 0.0008986438733877887, + "loss": 0.88289052, + "num_input_tokens_seen": 99456768, + "router_z_loss_mlp": 0.49414062, + "step": 1196, + "time_per_iteration": 3.453037738800049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138036, + "balance_loss_mlp": 1.08901691, + "epoch": 0.2302808772604848, + "flos": 683648418816.0, + "grad_norm": 0.05831436273017673, + "language_loss": 0.84795159, + "learning_rate": 0.0008984557492615576, + "loss": 0.85933197, + "num_input_tokens_seen": 99539616, + "router_z_loss_mlp": 0.49023438, + "step": 1197, + "time_per_iteration": 2.9209883213043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147831, + "balance_loss_mlp": 1.09816873, + "epoch": 0.23047325894574835, + "flos": 528923271168.0, + "grad_norm": 0.06183090029168821, + "language_loss": 0.90001792, + "learning_rate": 0.0008982674704410854, + "loss": 0.91149628, + "num_input_tokens_seen": 99612064, + "router_z_loss_mlp": 0.49658203, + "step": 1198, + "time_per_iteration": 2.723980665206909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115357, + "balance_loss_mlp": 1.10364521, + "epoch": 0.23066564063101191, + "flos": 682766309376.0, + "grad_norm": 0.06439147944581719, + "language_loss": 0.78128076, + "learning_rate": 0.0008980790369994682, + "loss": 0.7928164, + "num_input_tokens_seen": 99691040, + "router_z_loss_mlp": 0.49926758, + "step": 1199, + "time_per_iteration": 2.968733787536621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148741, + "balance_loss_mlp": 1.09817219, + "epoch": 0.2308580223162755, + "flos": 558523662336.0, + "grad_norm": 0.060755539801175186, + "language_loss": 0.8790828, + "learning_rate": 0.000897890449009863, + "loss": 0.89057022, + "num_input_tokens_seen": 99762016, + "router_z_loss_mlp": 0.50561523, + "step": 1200, + "time_per_iteration": 2.7373695373535156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159063, + "balance_loss_mlp": 1.11052144, + "epoch": 0.23105040400153906, + "flos": 555669932544.0, + "grad_norm": 0.09508340337221405, + "language_loss": 0.9041636, + "learning_rate": 0.0008977017065454853, + "loss": 0.91575426, + "num_input_tokens_seen": 99835552, + "router_z_loss_mlp": 0.4855957, + "step": 1201, + "time_per_iteration": 2.6561479568481445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172289, + "balance_loss_mlp": 1.12393796, + "epoch": 0.23124278568680262, + "flos": 704788936704.0, + "grad_norm": 0.06896397472633412, + "language_loss": 0.8110497, + "learning_rate": 0.0008975128096796121, + "loss": 0.82277262, + "num_input_tokens_seen": 99910784, + "router_z_loss_mlp": 0.48413086, + "step": 1202, + "time_per_iteration": 2.850882053375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166428, + "balance_loss_mlp": 1.11583591, + "epoch": 0.23143516737206618, + "flos": 612768043008.0, + "grad_norm": 0.07234791297382964, + "language_loss": 0.86751068, + "learning_rate": 0.0008973237584855794, + "loss": 0.87917495, + "num_input_tokens_seen": 99991120, + "router_z_loss_mlp": 0.50610352, + "step": 1203, + "time_per_iteration": 2.898651599884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201199, + "balance_loss_mlp": 1.14912796, + "epoch": 0.23162754905732974, + "flos": 389242796544.0, + "grad_norm": 0.0647782155366788, + "language_loss": 0.82535917, + "learning_rate": 0.0008971345530367832, + "loss": 0.83737111, + "num_input_tokens_seen": 100053888, + "router_z_loss_mlp": 0.52172852, + "step": 1204, + "time_per_iteration": 2.479710102081299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188056, + "balance_loss_mlp": 1.13743997, + "epoch": 0.2318199307425933, + "flos": 667778535936.0, + "grad_norm": 0.07976239468268423, + "language_loss": 0.86050093, + "learning_rate": 0.0008969451934066799, + "loss": 0.87238145, + "num_input_tokens_seen": 100124176, + "router_z_loss_mlp": 0.50561523, + "step": 1205, + "time_per_iteration": 2.7891948223114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190172, + "balance_loss_mlp": 1.13834012, + "epoch": 0.23201231242785686, + "flos": 666399757824.0, + "grad_norm": 0.08603625620414594, + "language_loss": 0.8068459, + "learning_rate": 0.0008967556796687854, + "loss": 0.81874764, + "num_input_tokens_seen": 100205296, + "router_z_loss_mlp": 0.51879883, + "step": 1206, + "time_per_iteration": 2.879742383956909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182453, + "balance_loss_mlp": 1.1313839, + "epoch": 0.23220469411312042, + "flos": 748816565760.0, + "grad_norm": 0.06613018456643845, + "language_loss": 0.8416872, + "learning_rate": 0.0008965660118966752, + "loss": 0.85351169, + "num_input_tokens_seen": 100279440, + "router_z_loss_mlp": 0.51098633, + "step": 1207, + "time_per_iteration": 2.8900513648986816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163905, + "balance_loss_mlp": 1.11610246, + "epoch": 0.232397075798384, + "flos": 667061982720.0, + "grad_norm": 0.06058209183838784, + "language_loss": 0.90754479, + "learning_rate": 0.0008963761901639851, + "loss": 0.91918385, + "num_input_tokens_seen": 100354512, + "router_z_loss_mlp": 0.47802734, + "step": 1208, + "time_per_iteration": 2.805534601211548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176094, + "balance_loss_mlp": 1.12457156, + "epoch": 0.23258945748364757, + "flos": 610218261504.0, + "grad_norm": 0.06993420403149982, + "language_loss": 0.83909518, + "learning_rate": 0.0008961862145444103, + "loss": 0.85085618, + "num_input_tokens_seen": 100426848, + "router_z_loss_mlp": 0.51538086, + "step": 1209, + "time_per_iteration": 2.6882550716400146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197419, + "balance_loss_mlp": 1.14587319, + "epoch": 0.23278183916891113, + "flos": 489651982848.0, + "grad_norm": 0.08646594069324176, + "language_loss": 0.85994279, + "learning_rate": 0.0008959960851117059, + "loss": 0.87191701, + "num_input_tokens_seen": 100496176, + "router_z_loss_mlp": 0.51611328, + "step": 1210, + "time_per_iteration": 2.6176648139953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118703, + "balance_loss_mlp": 1.13340998, + "epoch": 0.23297422085417469, + "flos": 511585403904.0, + "grad_norm": 0.06670419812311852, + "language_loss": 0.84013158, + "learning_rate": 0.0008958058019396868, + "loss": 0.85200191, + "num_input_tokens_seen": 100575072, + "router_z_loss_mlp": 0.53637695, + "step": 1211, + "time_per_iteration": 2.7867624759674072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177443, + "balance_loss_mlp": 1.12754154, + "epoch": 0.23316660253943824, + "flos": 546421552128.0, + "grad_norm": 0.08722593193124767, + "language_loss": 0.87226063, + "learning_rate": 0.0008956153651022274, + "loss": 0.88403505, + "num_input_tokens_seen": 100648304, + "router_z_loss_mlp": 0.49926758, + "step": 1212, + "time_per_iteration": 2.671705961227417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169147, + "balance_loss_mlp": 1.11726665, + "epoch": 0.2333589842247018, + "flos": 510256184832.0, + "grad_norm": 0.06082314874639417, + "language_loss": 0.84296238, + "learning_rate": 0.0008954247746732618, + "loss": 0.85465384, + "num_input_tokens_seen": 100717616, + "router_z_loss_mlp": 0.51904297, + "step": 1213, + "time_per_iteration": 2.58005952835083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163838, + "balance_loss_mlp": 1.1156534, + "epoch": 0.23355136590996536, + "flos": 663148104192.0, + "grad_norm": 0.06006865966510304, + "language_loss": 0.91204965, + "learning_rate": 0.0008952340307267837, + "loss": 0.92368799, + "num_input_tokens_seen": 100797056, + "router_z_loss_mlp": 0.48144531, + "step": 1214, + "time_per_iteration": 2.842824697494507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149903, + "balance_loss_mlp": 1.09983516, + "epoch": 0.23374374759522892, + "flos": 508457461248.0, + "grad_norm": 0.07140080071894721, + "language_loss": 0.84202802, + "learning_rate": 0.0008950431333368468, + "loss": 0.85352707, + "num_input_tokens_seen": 100863632, + "router_z_loss_mlp": 0.50097656, + "step": 1215, + "time_per_iteration": 2.5616672039031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155221, + "balance_loss_mlp": 1.10656011, + "epoch": 0.2339361292804925, + "flos": 1294455499776.0, + "grad_norm": 0.083723319453273, + "language_loss": 0.85366404, + "learning_rate": 0.0008948520825775634, + "loss": 0.86521626, + "num_input_tokens_seen": 100950272, + "router_z_loss_mlp": 0.48657227, + "step": 1216, + "time_per_iteration": 3.652561664581299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114764, + "balance_loss_mlp": 1.09895492, + "epoch": 0.23412851096575607, + "flos": 705928006656.0, + "grad_norm": 0.05781662545039131, + "language_loss": 0.84181142, + "learning_rate": 0.0008946608785231067, + "loss": 0.85328782, + "num_input_tokens_seen": 101031008, + "router_z_loss_mlp": 0.48706055, + "step": 1217, + "time_per_iteration": 2.861449956893921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131497, + "balance_loss_mlp": 1.08352745, + "epoch": 0.23432089265101963, + "flos": 438263903232.0, + "grad_norm": 0.06428977242182035, + "language_loss": 0.85432529, + "learning_rate": 0.0008944695212477084, + "loss": 0.86564028, + "num_input_tokens_seen": 101094688, + "router_z_loss_mlp": 0.47973633, + "step": 1218, + "time_per_iteration": 2.540524959564209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148618, + "balance_loss_mlp": 1.09907508, + "epoch": 0.2345132743362832, + "flos": 480939918336.0, + "grad_norm": 0.060914019840806265, + "language_loss": 0.86493349, + "learning_rate": 0.0008942780108256599, + "loss": 0.87641972, + "num_input_tokens_seen": 101163744, + "router_z_loss_mlp": 0.49560547, + "step": 1219, + "time_per_iteration": 2.613769769668579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142119, + "balance_loss_mlp": 1.09100199, + "epoch": 0.23470565602154675, + "flos": 411453001728.0, + "grad_norm": 0.05108155821019921, + "language_loss": 0.87340164, + "learning_rate": 0.0008940863473313121, + "loss": 0.88482285, + "num_input_tokens_seen": 101226480, + "router_z_loss_mlp": 0.51123047, + "step": 1220, + "time_per_iteration": 2.4549899101257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145999, + "balance_loss_mlp": 1.09702742, + "epoch": 0.2348980377068103, + "flos": 545450609664.0, + "grad_norm": 0.07702998226564757, + "language_loss": 0.8851074, + "learning_rate": 0.0008938945308390756, + "loss": 0.8965674, + "num_input_tokens_seen": 101291824, + "router_z_loss_mlp": 0.48974609, + "step": 1221, + "time_per_iteration": 2.6133854389190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149321, + "balance_loss_mlp": 1.10211444, + "epoch": 0.23509041939207387, + "flos": 575740389888.0, + "grad_norm": 0.057479910137590906, + "language_loss": 0.88199294, + "learning_rate": 0.00089370256142342, + "loss": 0.89348614, + "num_input_tokens_seen": 101367216, + "router_z_loss_mlp": 0.47192383, + "step": 1222, + "time_per_iteration": 2.713489532470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141929, + "balance_loss_mlp": 1.09286284, + "epoch": 0.23528280107733743, + "flos": 588843177984.0, + "grad_norm": 0.05442066188859713, + "language_loss": 0.85879123, + "learning_rate": 0.0008935104391588746, + "loss": 0.87021047, + "num_input_tokens_seen": 101438992, + "router_z_loss_mlp": 0.49121094, + "step": 1223, + "time_per_iteration": 2.7304563522338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145924, + "balance_loss_mlp": 1.09447336, + "epoch": 0.235475182762601, + "flos": 823328893440.0, + "grad_norm": 0.05049406517739995, + "language_loss": 0.8341555, + "learning_rate": 0.0008933181641200276, + "loss": 0.84561473, + "num_input_tokens_seen": 101534464, + "router_z_loss_mlp": 0.51513672, + "step": 1224, + "time_per_iteration": 3.122603416442871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139619, + "balance_loss_mlp": 1.09279394, + "epoch": 0.23566756444786457, + "flos": 680164770816.0, + "grad_norm": 0.0678885239417847, + "language_loss": 0.8627063, + "learning_rate": 0.0008931257363815271, + "loss": 0.87410253, + "num_input_tokens_seen": 101616496, + "router_z_loss_mlp": 0.46826172, + "step": 1225, + "time_per_iteration": 2.86014986038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142208, + "balance_loss_mlp": 1.09490585, + "epoch": 0.23585994613312813, + "flos": 701811495936.0, + "grad_norm": 0.0639396043769501, + "language_loss": 0.90318632, + "learning_rate": 0.0008929331560180798, + "loss": 0.91460842, + "num_input_tokens_seen": 101694496, + "router_z_loss_mlp": 0.47338867, + "step": 1226, + "time_per_iteration": 2.9069020748138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158077, + "balance_loss_mlp": 1.10924876, + "epoch": 0.2360523278183917, + "flos": 524176842240.0, + "grad_norm": 0.05735405278544162, + "language_loss": 0.9124881, + "learning_rate": 0.0008927404231044525, + "loss": 0.92406881, + "num_input_tokens_seen": 101766160, + "router_z_loss_mlp": 0.48828125, + "step": 1227, + "time_per_iteration": 2.745591163635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154284, + "balance_loss_mlp": 1.10571766, + "epoch": 0.23624470950365525, + "flos": 524310091776.0, + "grad_norm": 0.062458312515348655, + "language_loss": 0.8233285, + "learning_rate": 0.0008925475377154703, + "loss": 0.83487129, + "num_input_tokens_seen": 101844160, + "router_z_loss_mlp": 0.48583984, + "step": 1228, + "time_per_iteration": 2.7165796756744385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147881, + "balance_loss_mlp": 1.09664452, + "epoch": 0.2364370911889188, + "flos": 596811525120.0, + "grad_norm": 0.06307879716822463, + "language_loss": 0.82915187, + "learning_rate": 0.0008923544999260183, + "loss": 0.84063065, + "num_input_tokens_seen": 101917968, + "router_z_loss_mlp": 0.51293945, + "step": 1229, + "time_per_iteration": 2.787444829940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156102, + "balance_loss_mlp": 1.10567617, + "epoch": 0.23662947287418237, + "flos": 756849153024.0, + "grad_norm": 0.06236445133400911, + "language_loss": 0.92471206, + "learning_rate": 0.00089216130981104, + "loss": 0.9362731, + "num_input_tokens_seen": 101996880, + "router_z_loss_mlp": 0.50439453, + "step": 1230, + "time_per_iteration": 3.0671463012695312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148349, + "balance_loss_mlp": 1.09816241, + "epoch": 0.23682185455944593, + "flos": 546167162880.0, + "grad_norm": 0.06420697058211047, + "language_loss": 0.82893002, + "learning_rate": 0.000891967967445539, + "loss": 0.84041357, + "num_input_tokens_seen": 102067936, + "router_z_loss_mlp": 0.50195312, + "step": 1231, + "time_per_iteration": 2.692819356918335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147263, + "balance_loss_mlp": 1.09733796, + "epoch": 0.2370142362447095, + "flos": 662285818368.0, + "grad_norm": 0.044472050821074895, + "language_loss": 0.89257467, + "learning_rate": 0.0008917744729045772, + "loss": 0.90404725, + "num_input_tokens_seen": 102147552, + "router_z_loss_mlp": 0.49975586, + "step": 1232, + "time_per_iteration": 2.911123037338257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151473, + "balance_loss_mlp": 1.10190618, + "epoch": 0.23720661792997308, + "flos": 683670813696.0, + "grad_norm": 0.055115174481180494, + "language_loss": 0.84317499, + "learning_rate": 0.0008915808262632757, + "loss": 0.85468972, + "num_input_tokens_seen": 102224480, + "router_z_loss_mlp": 0.49633789, + "step": 1233, + "time_per_iteration": 2.8429055213928223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164245, + "balance_loss_mlp": 1.1117928, + "epoch": 0.23739899961523664, + "flos": 558909103104.0, + "grad_norm": 0.07089823280283834, + "language_loss": 0.93916011, + "learning_rate": 0.0008913870275968148, + "loss": 0.95080256, + "num_input_tokens_seen": 102297392, + "router_z_loss_mlp": 0.52392578, + "step": 1234, + "time_per_iteration": 2.7355082035064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152305, + "balance_loss_mlp": 1.10321498, + "epoch": 0.2375913813005002, + "flos": 889546904064.0, + "grad_norm": 0.06512180670183462, + "language_loss": 0.87916219, + "learning_rate": 0.0008911930769804342, + "loss": 0.8906852, + "num_input_tokens_seen": 102386032, + "router_z_loss_mlp": 0.49145508, + "step": 1235, + "time_per_iteration": 3.320653200149536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115702, + "balance_loss_mlp": 1.10549772, + "epoch": 0.23778376298576376, + "flos": 641120707584.0, + "grad_norm": 0.04926889071384256, + "language_loss": 0.91928077, + "learning_rate": 0.0008909989744894318, + "loss": 0.93085092, + "num_input_tokens_seen": 102463504, + "router_z_loss_mlp": 0.51513672, + "step": 1236, + "time_per_iteration": 2.860095500946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114863, + "balance_loss_mlp": 1.09808517, + "epoch": 0.23797614467102732, + "flos": 616820313600.0, + "grad_norm": 0.06373579401102465, + "language_loss": 0.81724823, + "learning_rate": 0.0008908047201991649, + "loss": 0.82873452, + "num_input_tokens_seen": 102529632, + "router_z_loss_mlp": 0.50512695, + "step": 1237, + "time_per_iteration": 2.7173092365264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146815, + "balance_loss_mlp": 1.10065758, + "epoch": 0.23816852635629088, + "flos": 624245004288.0, + "grad_norm": 0.06973577397583665, + "language_loss": 0.86895192, + "learning_rate": 0.0008906103141850502, + "loss": 0.88042009, + "num_input_tokens_seen": 102610192, + "router_z_loss_mlp": 0.46142578, + "step": 1238, + "time_per_iteration": 2.9070518016815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149112, + "balance_loss_mlp": 1.10068893, + "epoch": 0.23836090804155444, + "flos": 521431769088.0, + "grad_norm": 0.07438040904238923, + "language_loss": 0.88608682, + "learning_rate": 0.0008904157565225621, + "loss": 0.897578, + "num_input_tokens_seen": 102681216, + "router_z_loss_mlp": 0.48461914, + "step": 1239, + "time_per_iteration": 2.598175287246704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114606, + "balance_loss_mlp": 1.09758997, + "epoch": 0.238553289726818, + "flos": 1153991660544.0, + "grad_norm": 0.07265689268382322, + "language_loss": 0.82424903, + "learning_rate": 0.000890221047287235, + "loss": 0.83570957, + "num_input_tokens_seen": 102777184, + "router_z_loss_mlp": 0.48486328, + "step": 1240, + "time_per_iteration": 3.5255463123321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149116, + "balance_loss_mlp": 1.10207629, + "epoch": 0.23874567141208156, + "flos": 499861393920.0, + "grad_norm": 0.07692592831537566, + "language_loss": 0.91524613, + "learning_rate": 0.0008900261865546615, + "loss": 0.92673725, + "num_input_tokens_seen": 102845744, + "router_z_loss_mlp": 0.47021484, + "step": 1241, + "time_per_iteration": 2.626298189163208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150585, + "balance_loss_mlp": 1.10101807, + "epoch": 0.23893805309734514, + "flos": 556934911488.0, + "grad_norm": 0.06193436068824588, + "language_loss": 0.85487348, + "learning_rate": 0.0008898311744004936, + "loss": 0.86637932, + "num_input_tokens_seen": 102918064, + "router_z_loss_mlp": 0.49584961, + "step": 1242, + "time_per_iteration": 2.6845884323120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143159, + "balance_loss_mlp": 1.09638107, + "epoch": 0.2391304347826087, + "flos": 549270512640.0, + "grad_norm": 0.06489370510499948, + "language_loss": 0.87195957, + "learning_rate": 0.0008896360109004414, + "loss": 0.88339114, + "num_input_tokens_seen": 102983920, + "router_z_loss_mlp": 0.46801758, + "step": 1243, + "time_per_iteration": 2.6279244422912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149339, + "balance_loss_mlp": 1.10239482, + "epoch": 0.23932281646787226, + "flos": 516050279424.0, + "grad_norm": 0.05690023470638135, + "language_loss": 0.84913921, + "learning_rate": 0.0008894406961302742, + "loss": 0.8606326, + "num_input_tokens_seen": 103053328, + "router_z_loss_mlp": 0.46948242, + "step": 1244, + "time_per_iteration": 2.5823607444763184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161498, + "balance_loss_mlp": 1.11591244, + "epoch": 0.23951519815313582, + "flos": 743682124800.0, + "grad_norm": 0.06599652790645752, + "language_loss": 0.84225279, + "learning_rate": 0.0008892452301658201, + "loss": 0.85386777, + "num_input_tokens_seen": 103128208, + "router_z_loss_mlp": 0.45581055, + "step": 1245, + "time_per_iteration": 3.0007240772247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153792, + "balance_loss_mlp": 1.1045351, + "epoch": 0.23970757983839938, + "flos": 554118257664.0, + "grad_norm": 0.05569216777143309, + "language_loss": 0.83851659, + "learning_rate": 0.0008890496130829653, + "loss": 0.8500545, + "num_input_tokens_seen": 103197392, + "router_z_loss_mlp": 0.49316406, + "step": 1246, + "time_per_iteration": 2.656524658203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149976, + "balance_loss_mlp": 1.10424757, + "epoch": 0.23989996152366294, + "flos": 480655793664.0, + "grad_norm": 0.0643203237989141, + "language_loss": 0.85808307, + "learning_rate": 0.0008888538449576555, + "loss": 0.86958289, + "num_input_tokens_seen": 103265328, + "router_z_loss_mlp": 0.45751953, + "step": 1247, + "time_per_iteration": 2.5420141220092773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148571, + "balance_loss_mlp": 1.09993315, + "epoch": 0.2400923432089265, + "flos": 485310818304.0, + "grad_norm": 0.07931889136759729, + "language_loss": 0.83083689, + "learning_rate": 0.0008886579258658944, + "loss": 0.84232259, + "num_input_tokens_seen": 103331632, + "router_z_loss_mlp": 0.48632812, + "step": 1248, + "time_per_iteration": 2.574025869369507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136833, + "balance_loss_mlp": 1.08786154, + "epoch": 0.24028472489419006, + "flos": 623555615232.0, + "grad_norm": 0.057547694087262784, + "language_loss": 0.85210383, + "learning_rate": 0.0008884618558837446, + "loss": 0.8634721, + "num_input_tokens_seen": 103405408, + "router_z_loss_mlp": 0.48974609, + "step": 1249, + "time_per_iteration": 2.808790922164917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146334, + "balance_loss_mlp": 1.09407234, + "epoch": 0.24047710657945365, + "flos": 601602370560.0, + "grad_norm": 0.05843363394571656, + "language_loss": 0.87170362, + "learning_rate": 0.0008882656350873273, + "loss": 0.88316691, + "num_input_tokens_seen": 103487216, + "router_z_loss_mlp": 0.52319336, + "step": 1250, + "time_per_iteration": 2.839341163635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139888, + "balance_loss_mlp": 1.08998704, + "epoch": 0.2406694882647172, + "flos": 841558781952.0, + "grad_norm": 0.06920486589868534, + "language_loss": 0.87495792, + "learning_rate": 0.0008880692635528219, + "loss": 0.88635677, + "num_input_tokens_seen": 103568640, + "router_z_loss_mlp": 0.49975586, + "step": 1251, + "time_per_iteration": 3.0422415733337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141134, + "balance_loss_mlp": 1.09404635, + "epoch": 0.24086186994998077, + "flos": 527057736192.0, + "grad_norm": 0.09445201185980338, + "language_loss": 0.89987123, + "learning_rate": 0.0008878727413564669, + "loss": 0.91128266, + "num_input_tokens_seen": 103640784, + "router_z_loss_mlp": 0.47094727, + "step": 1252, + "time_per_iteration": 2.7974343299865723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110917, + "balance_loss_mlp": 1.09066832, + "epoch": 0.24105425163524433, + "flos": 1338261378048.0, + "grad_norm": 0.0270998190046769, + "language_loss": 0.80135596, + "learning_rate": 0.0008876760685745588, + "loss": 0.81244767, + "num_input_tokens_seen": 103865824, + "router_z_loss_mlp": 0.18457031, + "step": 1253, + "time_per_iteration": 4.892668724060059 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150056, + "balance_loss_mlp": 1.09707963, + "epoch": 0.24124663332050789, + "flos": 614102404608.0, + "grad_norm": 0.06472275672686992, + "language_loss": 0.79044139, + "learning_rate": 0.0008874792452834528, + "loss": 0.80194199, + "num_input_tokens_seen": 103939872, + "router_z_loss_mlp": 0.53076172, + "step": 1254, + "time_per_iteration": 2.759533643722534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144784, + "balance_loss_mlp": 1.09397733, + "epoch": 0.24143901500577145, + "flos": 575540328960.0, + "grad_norm": 0.08671647217417044, + "language_loss": 0.87847424, + "learning_rate": 0.0008872822715595626, + "loss": 0.88992208, + "num_input_tokens_seen": 104011120, + "router_z_loss_mlp": 0.50878906, + "step": 1255, + "time_per_iteration": 2.6758921146392822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136115, + "balance_loss_mlp": 1.08731091, + "epoch": 0.241631396691035, + "flos": 495181776384.0, + "grad_norm": 0.07818195128513271, + "language_loss": 0.87750483, + "learning_rate": 0.0008870851474793598, + "loss": 0.88886595, + "num_input_tokens_seen": 104077040, + "router_z_loss_mlp": 0.48803711, + "step": 1256, + "time_per_iteration": 2.5903451442718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140246, + "balance_loss_mlp": 1.09196591, + "epoch": 0.24182377837629856, + "flos": 636191470080.0, + "grad_norm": 0.06462138017812241, + "language_loss": 0.90108514, + "learning_rate": 0.0008868878731193752, + "loss": 0.91248751, + "num_input_tokens_seen": 104150880, + "router_z_loss_mlp": 0.48291016, + "step": 1257, + "time_per_iteration": 2.9156484603881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131411, + "balance_loss_mlp": 1.08611095, + "epoch": 0.24201616006156215, + "flos": 515219927040.0, + "grad_norm": 0.06839520252820154, + "language_loss": 0.89823216, + "learning_rate": 0.0008866904485561973, + "loss": 0.90954626, + "num_input_tokens_seen": 104223696, + "router_z_loss_mlp": 0.45361328, + "step": 1258, + "time_per_iteration": 2.709073066711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128152, + "balance_loss_mlp": 1.07698727, + "epoch": 0.2422085417468257, + "flos": 615144927744.0, + "grad_norm": 0.061516465429869265, + "language_loss": 0.83619797, + "learning_rate": 0.000886492873866473, + "loss": 0.84747952, + "num_input_tokens_seen": 104301728, + "router_z_loss_mlp": 0.51245117, + "step": 1259, + "time_per_iteration": 2.8063783645629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122174, + "balance_loss_mlp": 1.07315516, + "epoch": 0.24240092343208927, + "flos": 585794156544.0, + "grad_norm": 0.07532562043269028, + "language_loss": 0.85057306, + "learning_rate": 0.000886295149126908, + "loss": 0.86179483, + "num_input_tokens_seen": 104374480, + "router_z_loss_mlp": 0.49023438, + "step": 1260, + "time_per_iteration": 2.7702596187591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120309, + "balance_loss_mlp": 1.07291138, + "epoch": 0.24259330511735283, + "flos": 762257806848.0, + "grad_norm": 0.06506459806255929, + "language_loss": 0.86249155, + "learning_rate": 0.0008860972744142655, + "loss": 0.87369466, + "num_input_tokens_seen": 104452384, + "router_z_loss_mlp": 0.47363281, + "step": 1261, + "time_per_iteration": 2.9010353088378906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111356, + "balance_loss_mlp": 1.06575668, + "epoch": 0.2427856868026164, + "flos": 626878849536.0, + "grad_norm": 0.05333874014607912, + "language_loss": 0.82215619, + "learning_rate": 0.0008858992498053671, + "loss": 0.83329183, + "num_input_tokens_seen": 104532576, + "router_z_loss_mlp": 0.47729492, + "step": 1262, + "time_per_iteration": 2.8307647705078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105759, + "balance_loss_mlp": 1.08506405, + "epoch": 0.24297806848787995, + "flos": 1511653985280.0, + "grad_norm": 0.04388178085496151, + "language_loss": 0.7658875, + "learning_rate": 0.0008857010753770934, + "loss": 0.77694511, + "num_input_tokens_seen": 104765216, + "router_z_loss_mlp": 0.20703125, + "step": 1263, + "time_per_iteration": 4.839150428771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113047, + "balance_loss_mlp": 1.06517243, + "epoch": 0.2431704501731435, + "flos": 541949336064.0, + "grad_norm": 0.07576677138650743, + "language_loss": 0.83877796, + "learning_rate": 0.0008855027512063817, + "loss": 0.84990847, + "num_input_tokens_seen": 104836912, + "router_z_loss_mlp": 0.47924805, + "step": 1264, + "time_per_iteration": 2.6955387592315674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116702, + "balance_loss_mlp": 1.06847, + "epoch": 0.24336283185840707, + "flos": 523845729792.0, + "grad_norm": 0.08737911579836782, + "language_loss": 0.86160326, + "learning_rate": 0.0008853042773702292, + "loss": 0.87277025, + "num_input_tokens_seen": 104909280, + "router_z_loss_mlp": 0.48217773, + "step": 1265, + "time_per_iteration": 2.718477725982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123795, + "balance_loss_mlp": 1.07191551, + "epoch": 0.24355521354367063, + "flos": 537111502848.0, + "grad_norm": 0.05410456343654981, + "language_loss": 0.87916005, + "learning_rate": 0.0008851056539456896, + "loss": 0.89039803, + "num_input_tokens_seen": 104982560, + "router_z_loss_mlp": 0.51855469, + "step": 1266, + "time_per_iteration": 2.668398380279541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127352, + "balance_loss_mlp": 1.07792759, + "epoch": 0.24374759522893422, + "flos": 930461271552.0, + "grad_norm": 0.06341671281787149, + "language_loss": 0.82546353, + "learning_rate": 0.0008849068810098755, + "loss": 0.8367371, + "num_input_tokens_seen": 105075056, + "router_z_loss_mlp": 0.49414062, + "step": 1267, + "time_per_iteration": 3.348644971847534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132731, + "balance_loss_mlp": 1.08523834, + "epoch": 0.24393997691419778, + "flos": 427787619840.0, + "grad_norm": 0.08675992555990221, + "language_loss": 0.8333391, + "learning_rate": 0.0008847079586399575, + "loss": 0.84466636, + "num_input_tokens_seen": 105137536, + "router_z_loss_mlp": 0.47509766, + "step": 1268, + "time_per_iteration": 2.549433946609497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126198, + "balance_loss_mlp": 1.07994461, + "epoch": 0.24413235859946134, + "flos": 578853651456.0, + "grad_norm": 0.07249150513377325, + "language_loss": 0.8672694, + "learning_rate": 0.0008845088869131641, + "loss": 0.87853134, + "num_input_tokens_seen": 105204848, + "router_z_loss_mlp": 0.46289062, + "step": 1269, + "time_per_iteration": 2.6586451530456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149975, + "balance_loss_mlp": 1.10145724, + "epoch": 0.2443247402847249, + "flos": 529859708928.0, + "grad_norm": 0.06266770628228314, + "language_loss": 0.89411461, + "learning_rate": 0.0008843096659067818, + "loss": 0.90561438, + "num_input_tokens_seen": 105273456, + "router_z_loss_mlp": 0.48510742, + "step": 1270, + "time_per_iteration": 2.626946210861206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146652, + "balance_loss_mlp": 1.10228229, + "epoch": 0.24451712196998845, + "flos": 696321349632.0, + "grad_norm": 0.056965438466979365, + "language_loss": 0.86992264, + "learning_rate": 0.000884110295698155, + "loss": 0.88138914, + "num_input_tokens_seen": 105355488, + "router_z_loss_mlp": 0.44335938, + "step": 1271, + "time_per_iteration": 2.970078706741333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160922, + "balance_loss_mlp": 1.11080623, + "epoch": 0.24470950365525201, + "flos": 529832544768.0, + "grad_norm": 0.06894839907125858, + "language_loss": 0.86557794, + "learning_rate": 0.0008839107763646861, + "loss": 0.87718713, + "num_input_tokens_seen": 105421568, + "router_z_loss_mlp": 0.5012207, + "step": 1272, + "time_per_iteration": 2.592349052429199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183532, + "balance_loss_mlp": 1.13437057, + "epoch": 0.24490188534051557, + "flos": 491342049792.0, + "grad_norm": 0.06647703149266906, + "language_loss": 0.90856385, + "learning_rate": 0.0008837111079838353, + "loss": 0.92039919, + "num_input_tokens_seen": 105493072, + "router_z_loss_mlp": 0.49194336, + "step": 1273, + "time_per_iteration": 2.7098910808563232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118943, + "balance_loss_mlp": 1.14289117, + "epoch": 0.24509426702577913, + "flos": 474155057664.0, + "grad_norm": 0.05923779703064254, + "language_loss": 0.90316379, + "learning_rate": 0.000883511290633121, + "loss": 0.91505814, + "num_input_tokens_seen": 105559840, + "router_z_loss_mlp": 0.46533203, + "step": 1274, + "time_per_iteration": 2.5714197158813477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186005, + "balance_loss_mlp": 1.13739181, + "epoch": 0.24528664871104272, + "flos": 550592391168.0, + "grad_norm": 0.060927364177961095, + "language_loss": 0.92697686, + "learning_rate": 0.000883311324390119, + "loss": 0.93883693, + "num_input_tokens_seen": 105634448, + "router_z_loss_mlp": 0.48608398, + "step": 1275, + "time_per_iteration": 2.740896224975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189584, + "balance_loss_mlp": 1.13474798, + "epoch": 0.24547903039630628, + "flos": 825903641088.0, + "grad_norm": 0.07775603238406727, + "language_loss": 0.82056022, + "learning_rate": 0.0008831112093324629, + "loss": 0.83245611, + "num_input_tokens_seen": 105711936, + "router_z_loss_mlp": 0.5480957, + "step": 1276, + "time_per_iteration": 3.0821468830108643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190059, + "balance_loss_mlp": 1.13927567, + "epoch": 0.24567141208156984, + "flos": 591598162944.0, + "grad_norm": 0.05600773018776359, + "language_loss": 0.89543378, + "learning_rate": 0.0008829109455378444, + "loss": 0.90733445, + "num_input_tokens_seen": 105780240, + "router_z_loss_mlp": 0.50830078, + "step": 1277, + "time_per_iteration": 2.7299413681030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192458, + "balance_loss_mlp": 1.14241397, + "epoch": 0.2458637937668334, + "flos": 547874482176.0, + "grad_norm": 0.05156937738675093, + "language_loss": 0.87083036, + "learning_rate": 0.000882710533084013, + "loss": 0.88275498, + "num_input_tokens_seen": 105849840, + "router_z_loss_mlp": 0.5, + "step": 1278, + "time_per_iteration": 2.6295228004455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185847, + "balance_loss_mlp": 1.13568354, + "epoch": 0.24605617545209696, + "flos": 515894635008.0, + "grad_norm": 0.05927927368096647, + "language_loss": 0.90088928, + "learning_rate": 0.0008825099720487755, + "loss": 0.91274774, + "num_input_tokens_seen": 105921488, + "router_z_loss_mlp": 0.50195312, + "step": 1279, + "time_per_iteration": 2.630868434906006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149494, + "balance_loss_mlp": 1.13461673, + "epoch": 0.24624855713736052, + "flos": 1511772553728.0, + "grad_norm": 0.04555367127523109, + "language_loss": 0.7526114, + "learning_rate": 0.0008823092625099967, + "loss": 0.76410633, + "num_input_tokens_seen": 106146816, + "router_z_loss_mlp": 0.1484375, + "step": 1280, + "time_per_iteration": 4.843670129776001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118256, + "balance_loss_mlp": 1.10366488, + "epoch": 0.24644093882262408, + "flos": 1527608305152.0, + "grad_norm": 0.038204832859796624, + "language_loss": 0.77944112, + "learning_rate": 0.0008821084045455987, + "loss": 0.79062366, + "num_input_tokens_seen": 106361568, + "router_z_loss_mlp": 0.14550781, + "step": 1281, + "time_per_iteration": 4.784554481506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115452, + "balance_loss_mlp": 1.10547721, + "epoch": 0.24663332050788764, + "flos": 659118228480.0, + "grad_norm": 0.05852441511604794, + "language_loss": 0.89541078, + "learning_rate": 0.0008819073982335619, + "loss": 0.90695602, + "num_input_tokens_seen": 106435296, + "router_z_loss_mlp": 0.49047852, + "step": 1282, + "time_per_iteration": 2.8370161056518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141297, + "balance_loss_mlp": 1.09726083, + "epoch": 0.24682570219315123, + "flos": 541769098752.0, + "grad_norm": 0.07515840278086762, + "language_loss": 0.84908974, + "learning_rate": 0.0008817062436519235, + "loss": 0.86050272, + "num_input_tokens_seen": 106507184, + "router_z_loss_mlp": 0.44042969, + "step": 1283, + "time_per_iteration": 2.6532042026519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114078, + "balance_loss_mlp": 1.09164214, + "epoch": 0.24701808387841478, + "flos": 440695116288.0, + "grad_norm": 0.051214690731677004, + "language_loss": 0.9022612, + "learning_rate": 0.0008815049408787788, + "loss": 0.91366905, + "num_input_tokens_seen": 106571472, + "router_z_loss_mlp": 0.49072266, + "step": 1284, + "time_per_iteration": 2.577040195465088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145806, + "balance_loss_mlp": 1.09857535, + "epoch": 0.24721046556367834, + "flos": 468066926592.0, + "grad_norm": 0.06399849872592922, + "language_loss": 0.86388409, + "learning_rate": 0.0008813034899922805, + "loss": 0.87534213, + "num_input_tokens_seen": 106638368, + "router_z_loss_mlp": 0.47216797, + "step": 1285, + "time_per_iteration": 2.586411476135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153157, + "balance_loss_mlp": 1.10366094, + "epoch": 0.2474028472489419, + "flos": 504427585536.0, + "grad_norm": 0.05962621730359375, + "language_loss": 0.90523338, + "learning_rate": 0.0008811018910706387, + "loss": 0.91676497, + "num_input_tokens_seen": 106705312, + "router_z_loss_mlp": 0.49536133, + "step": 1286, + "time_per_iteration": 2.558340311050415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150564, + "balance_loss_mlp": 1.0996381, + "epoch": 0.24759522893420546, + "flos": 479956492800.0, + "grad_norm": 0.08171747444285254, + "language_loss": 0.82914776, + "learning_rate": 0.0008809001441921211, + "loss": 0.84065336, + "num_input_tokens_seen": 106778624, + "router_z_loss_mlp": 0.50976562, + "step": 1287, + "time_per_iteration": 2.7096829414367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134679, + "balance_loss_mlp": 1.08651865, + "epoch": 0.24778761061946902, + "flos": 533706776064.0, + "grad_norm": 0.061876473909820096, + "language_loss": 0.86037469, + "learning_rate": 0.0008806982494350528, + "loss": 0.87172151, + "num_input_tokens_seen": 106847744, + "router_z_loss_mlp": 0.48144531, + "step": 1288, + "time_per_iteration": 2.6826744079589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138715, + "balance_loss_mlp": 1.0885514, + "epoch": 0.24797999230473258, + "flos": 559798553088.0, + "grad_norm": 0.05818805427718153, + "language_loss": 0.90965348, + "learning_rate": 0.0008804962068778161, + "loss": 0.92104065, + "num_input_tokens_seen": 106927584, + "router_z_loss_mlp": 0.50195312, + "step": 1289, + "time_per_iteration": 2.9314775466918945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137271, + "balance_loss_mlp": 1.08872867, + "epoch": 0.24817237398999614, + "flos": 624225180672.0, + "grad_norm": 0.06661216201088474, + "language_loss": 0.81390089, + "learning_rate": 0.0008802940165988511, + "loss": 0.82527363, + "num_input_tokens_seen": 107006656, + "router_z_loss_mlp": 0.48510742, + "step": 1290, + "time_per_iteration": 2.8629136085510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113117, + "balance_loss_mlp": 1.08389127, + "epoch": 0.2483647556752597, + "flos": 612281286144.0, + "grad_norm": 0.06960392685137955, + "language_loss": 0.89268786, + "learning_rate": 0.000880091678676655, + "loss": 0.90399957, + "num_input_tokens_seen": 107084352, + "router_z_loss_mlp": 0.47265625, + "step": 1291, + "time_per_iteration": 2.8345038890838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136195, + "balance_loss_mlp": 1.08882165, + "epoch": 0.2485571373605233, + "flos": 583553092608.0, + "grad_norm": 0.058047960295431696, + "language_loss": 0.89150697, + "learning_rate": 0.0008798891931897821, + "loss": 0.90286887, + "num_input_tokens_seen": 107158368, + "router_z_loss_mlp": 0.47338867, + "step": 1292, + "time_per_iteration": 2.7299227714538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128008, + "balance_loss_mlp": 1.07949018, + "epoch": 0.24874951904578685, + "flos": 494749347840.0, + "grad_norm": 0.09954343743221296, + "language_loss": 0.84998739, + "learning_rate": 0.0008796865602168447, + "loss": 0.86126745, + "num_input_tokens_seen": 107224256, + "router_z_loss_mlp": 0.48535156, + "step": 1293, + "time_per_iteration": 2.5342278480529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127533, + "balance_loss_mlp": 1.08220935, + "epoch": 0.2489419007310504, + "flos": 456174789120.0, + "grad_norm": 0.05777797953149353, + "language_loss": 0.89527249, + "learning_rate": 0.0008794837798365115, + "loss": 0.90654784, + "num_input_tokens_seen": 107292720, + "router_z_loss_mlp": 0.45361328, + "step": 1294, + "time_per_iteration": 2.6889185905456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136807, + "balance_loss_mlp": 1.08886147, + "epoch": 0.24913428241631397, + "flos": 485471232000.0, + "grad_norm": 0.07754051928079464, + "language_loss": 0.89232659, + "learning_rate": 0.0008792808521275089, + "loss": 0.90369469, + "num_input_tokens_seen": 107368576, + "router_z_loss_mlp": 0.47924805, + "step": 1295, + "time_per_iteration": 2.7635927200317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136837, + "balance_loss_mlp": 1.09027398, + "epoch": 0.24932666410157753, + "flos": 518906580480.0, + "grad_norm": 0.09989296116771008, + "language_loss": 0.87984705, + "learning_rate": 0.0008790777771686206, + "loss": 0.89121538, + "num_input_tokens_seen": 107433856, + "router_z_loss_mlp": 0.46557617, + "step": 1296, + "time_per_iteration": 2.579235076904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124595, + "balance_loss_mlp": 1.07853234, + "epoch": 0.2495190457868411, + "flos": 472603382784.0, + "grad_norm": 0.08251132162328097, + "language_loss": 0.85680348, + "learning_rate": 0.0008788745550386872, + "loss": 0.86804938, + "num_input_tokens_seen": 107500944, + "router_z_loss_mlp": 0.46044922, + "step": 1297, + "time_per_iteration": 2.598031759262085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128617, + "balance_loss_mlp": 1.08152938, + "epoch": 0.24971142747210465, + "flos": 745886112768.0, + "grad_norm": 0.06717402893383145, + "language_loss": 0.80945367, + "learning_rate": 0.0008786711858166063, + "loss": 0.82073987, + "num_input_tokens_seen": 107580000, + "router_z_loss_mlp": 0.47070312, + "step": 1298, + "time_per_iteration": 2.9720141887664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133144, + "balance_loss_mlp": 1.08696246, + "epoch": 0.2499038091573682, + "flos": 749557711872.0, + "grad_norm": 0.058753985131359356, + "language_loss": 0.84356344, + "learning_rate": 0.0008784676695813332, + "loss": 0.85489488, + "num_input_tokens_seen": 107660384, + "router_z_loss_mlp": 0.46166992, + "step": 1299, + "time_per_iteration": 3.003113031387329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154452, + "balance_loss_mlp": 1.10700631, + "epoch": 0.2500961908426318, + "flos": 745060902912.0, + "grad_norm": 0.07081449776085671, + "language_loss": 0.85444576, + "learning_rate": 0.0008782640064118796, + "loss": 0.86599028, + "num_input_tokens_seen": 107736320, + "router_z_loss_mlp": 0.47436523, + "step": 1300, + "time_per_iteration": 2.8769848346710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166343, + "balance_loss_mlp": 1.14946294, + "epoch": 0.2502885725278953, + "flos": 1417424334336.0, + "grad_norm": 0.041859158942630086, + "language_loss": 0.7618475, + "learning_rate": 0.0008780601963873149, + "loss": 0.77351093, + "num_input_tokens_seen": 107972608, + "router_z_loss_mlp": 0.16894531, + "step": 1301, + "time_per_iteration": 4.951652526855469 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191692, + "balance_loss_mlp": 1.14701271, + "epoch": 0.2504809542131589, + "flos": 515215157760.0, + "grad_norm": 0.07273634964220443, + "language_loss": 0.8750245, + "learning_rate": 0.0008778562395867648, + "loss": 0.88694143, + "num_input_tokens_seen": 108043312, + "router_z_loss_mlp": 0.44677734, + "step": 1302, + "time_per_iteration": 2.604402542114258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181408, + "balance_loss_mlp": 1.13629961, + "epoch": 0.25067333589842244, + "flos": 525819921408.0, + "grad_norm": 0.07562070017846675, + "language_loss": 0.84288502, + "learning_rate": 0.0008776521360894127, + "loss": 0.85469913, + "num_input_tokens_seen": 108114144, + "router_z_loss_mlp": 0.45092773, + "step": 1303, + "time_per_iteration": 2.5878565311431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103671, + "balance_loss_mlp": 1.08784008, + "epoch": 0.25086571758368603, + "flos": 1473897295872.0, + "grad_norm": 0.0317480068151838, + "language_loss": 0.78962064, + "learning_rate": 0.0008774478859744984, + "loss": 0.80065739, + "num_input_tokens_seen": 108338720, + "router_z_loss_mlp": 0.15820312, + "step": 1304, + "time_per_iteration": 4.7717835903167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116688, + "balance_loss_mlp": 1.12220049, + "epoch": 0.2510580992689496, + "flos": 528382185984.0, + "grad_norm": 0.05690422496958516, + "language_loss": 0.90951985, + "learning_rate": 0.0008772434893213186, + "loss": 0.92118865, + "num_input_tokens_seen": 108405456, + "router_z_loss_mlp": 0.44702148, + "step": 1305, + "time_per_iteration": 2.604490280151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160948, + "balance_loss_mlp": 1.11405063, + "epoch": 0.25125048095421315, + "flos": 517446309888.0, + "grad_norm": 0.058263181320018995, + "language_loss": 0.85050523, + "learning_rate": 0.0008770389462092276, + "loss": 0.86211473, + "num_input_tokens_seen": 108474368, + "router_z_loss_mlp": 0.46875, + "step": 1306, + "time_per_iteration": 2.6470468044281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011567, + "balance_loss_mlp": 1.1099937, + "epoch": 0.25144286263947674, + "flos": 620462177280.0, + "grad_norm": 0.058464254330546805, + "language_loss": 0.87023067, + "learning_rate": 0.0008768342567176357, + "loss": 0.88179767, + "num_input_tokens_seen": 108548864, + "router_z_loss_mlp": 0.46704102, + "step": 1307, + "time_per_iteration": 2.8168630599975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155047, + "balance_loss_mlp": 1.10753012, + "epoch": 0.25163524432474027, + "flos": 503799865344.0, + "grad_norm": 0.05479935706331158, + "language_loss": 0.90999937, + "learning_rate": 0.0008766294209260107, + "loss": 0.9215498, + "num_input_tokens_seen": 108623072, + "router_z_loss_mlp": 0.4753418, + "step": 1308, + "time_per_iteration": 2.721531629562378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144469, + "balance_loss_mlp": 1.09704781, + "epoch": 0.25182762601000386, + "flos": 509072698368.0, + "grad_norm": 0.06755027454964987, + "language_loss": 0.91936618, + "learning_rate": 0.0008764244389138767, + "loss": 0.93081093, + "num_input_tokens_seen": 108690128, + "router_z_loss_mlp": 0.47436523, + "step": 1309, + "time_per_iteration": 2.574913263320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146846, + "balance_loss_mlp": 1.10061693, + "epoch": 0.2520200076952674, + "flos": 633896077824.0, + "grad_norm": 0.09614568206927013, + "language_loss": 0.82912982, + "learning_rate": 0.000876219310760815, + "loss": 0.84059829, + "num_input_tokens_seen": 108770272, + "router_z_loss_mlp": 0.46240234, + "step": 1310, + "time_per_iteration": 2.8861234188079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140262, + "balance_loss_mlp": 1.09419942, + "epoch": 0.252212389380531, + "flos": 494638119936.0, + "grad_norm": 0.07943381545238665, + "language_loss": 0.82026285, + "learning_rate": 0.0008760140365464631, + "loss": 0.83166546, + "num_input_tokens_seen": 108840592, + "router_z_loss_mlp": 0.46020508, + "step": 1311, + "time_per_iteration": 2.615981340408325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157686, + "balance_loss_mlp": 1.11212397, + "epoch": 0.2524047710657945, + "flos": 490544004096.0, + "grad_norm": 0.0923524312347507, + "language_loss": 0.8768574, + "learning_rate": 0.0008758086163505156, + "loss": 0.88843429, + "num_input_tokens_seen": 108910064, + "router_z_loss_mlp": 0.45532227, + "step": 1312, + "time_per_iteration": 2.6723434925079346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144164, + "balance_loss_mlp": 1.09872115, + "epoch": 0.2525971527510581, + "flos": 647431294464.0, + "grad_norm": 0.06443576206069311, + "language_loss": 0.90026277, + "learning_rate": 0.0008756030502527239, + "loss": 0.91170442, + "num_input_tokens_seen": 108986336, + "router_z_loss_mlp": 0.45458984, + "step": 1313, + "time_per_iteration": 2.841367721557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114771, + "balance_loss_mlp": 1.10291111, + "epoch": 0.2527895344363217, + "flos": 569266818048.0, + "grad_norm": 0.057466156915965357, + "language_loss": 0.90976274, + "learning_rate": 0.0008753973383328954, + "loss": 0.92123979, + "num_input_tokens_seen": 109059712, + "router_z_loss_mlp": 0.44824219, + "step": 1314, + "time_per_iteration": 2.7198092937469482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135642, + "balance_loss_mlp": 1.08912706, + "epoch": 0.2529819161215852, + "flos": 514048923648.0, + "grad_norm": 0.0651730634150067, + "language_loss": 0.84640622, + "learning_rate": 0.0008751914806708952, + "loss": 0.85776269, + "num_input_tokens_seen": 109127504, + "router_z_loss_mlp": 0.46508789, + "step": 1315, + "time_per_iteration": 2.619739532470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138249, + "balance_loss_mlp": 1.0955956, + "epoch": 0.2531742978068488, + "flos": 531253168128.0, + "grad_norm": 0.06535523514746128, + "language_loss": 0.82706141, + "learning_rate": 0.0008749854773466439, + "loss": 0.83844388, + "num_input_tokens_seen": 109198080, + "router_z_loss_mlp": 0.42700195, + "step": 1316, + "time_per_iteration": 2.6750850677490234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126734, + "balance_loss_mlp": 1.08594072, + "epoch": 0.25336667949211233, + "flos": 596638628352.0, + "grad_norm": 0.07438570972797282, + "language_loss": 0.85103095, + "learning_rate": 0.0008747793284401192, + "loss": 0.86229837, + "num_input_tokens_seen": 109268368, + "router_z_loss_mlp": 0.40771484, + "step": 1317, + "time_per_iteration": 2.667684316635132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127851, + "balance_loss_mlp": 1.08231306, + "epoch": 0.2535590611773759, + "flos": 602061963264.0, + "grad_norm": 0.06662830476911753, + "language_loss": 0.8637262, + "learning_rate": 0.0008745730340313551, + "loss": 0.87500465, + "num_input_tokens_seen": 109344112, + "router_z_loss_mlp": 0.45532227, + "step": 1318, + "time_per_iteration": 2.783167839050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126523, + "balance_loss_mlp": 1.08298802, + "epoch": 0.25375144286263945, + "flos": 495327508992.0, + "grad_norm": 0.06014849970215255, + "language_loss": 0.84828806, + "learning_rate": 0.0008743665942004422, + "loss": 0.85955328, + "num_input_tokens_seen": 109414112, + "router_z_loss_mlp": 0.43554688, + "step": 1319, + "time_per_iteration": 2.6454880237579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128022, + "balance_loss_mlp": 1.08334279, + "epoch": 0.25394382454790304, + "flos": 512470084608.0, + "grad_norm": 0.10116204644494126, + "language_loss": 0.93301231, + "learning_rate": 0.0008741600090275277, + "loss": 0.94429255, + "num_input_tokens_seen": 109484336, + "router_z_loss_mlp": 0.44702148, + "step": 1320, + "time_per_iteration": 2.565373182296753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112488, + "balance_loss_mlp": 1.07884121, + "epoch": 0.25413620623316663, + "flos": 959038589952.0, + "grad_norm": 0.06655436432492466, + "language_loss": 0.84446663, + "learning_rate": 0.0008739532785928151, + "loss": 0.85571539, + "num_input_tokens_seen": 109590128, + "router_z_loss_mlp": 0.45996094, + "step": 1321, + "time_per_iteration": 3.479727268218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080328, + "balance_loss_mlp": 1.06325758, + "epoch": 0.25432858791843016, + "flos": 1577283922944.0, + "grad_norm": 0.0281051137535917, + "language_loss": 0.74893582, + "learning_rate": 0.0008737464029765639, + "loss": 0.7597391, + "num_input_tokens_seen": 109816592, + "router_z_loss_mlp": 0.17089844, + "step": 1322, + "time_per_iteration": 4.7930076122283936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136178, + "balance_loss_mlp": 1.08921003, + "epoch": 0.25452096960369375, + "flos": 583802712576.0, + "grad_norm": 0.06285601142266005, + "language_loss": 0.83366752, + "learning_rate": 0.0008735393822590908, + "loss": 0.84502923, + "num_input_tokens_seen": 109890464, + "router_z_loss_mlp": 0.46923828, + "step": 1323, + "time_per_iteration": 2.672137498855591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145864, + "balance_loss_mlp": 1.10192394, + "epoch": 0.2547133512889573, + "flos": 508603193856.0, + "grad_norm": 0.05471127015298985, + "language_loss": 0.8775813, + "learning_rate": 0.0008733322165207681, + "loss": 0.88903993, + "num_input_tokens_seen": 109963408, + "router_z_loss_mlp": 0.43969727, + "step": 1324, + "time_per_iteration": 2.6422736644744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157775, + "balance_loss_mlp": 1.11292815, + "epoch": 0.25490573297422087, + "flos": 782619729408.0, + "grad_norm": 0.058409122955484685, + "language_loss": 0.83687508, + "learning_rate": 0.0008731249058420247, + "loss": 0.84845281, + "num_input_tokens_seen": 110048800, + "router_z_loss_mlp": 0.44824219, + "step": 1325, + "time_per_iteration": 3.02577805519104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165947, + "balance_loss_mlp": 1.11995602, + "epoch": 0.2550981146594844, + "flos": 509878084608.0, + "grad_norm": 0.0843662219595253, + "language_loss": 0.90814316, + "learning_rate": 0.0008729174503033459, + "loss": 0.91980267, + "num_input_tokens_seen": 110118096, + "router_z_loss_mlp": 0.45947266, + "step": 1326, + "time_per_iteration": 2.700956344604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160817, + "balance_loss_mlp": 1.11418188, + "epoch": 0.255290496344748, + "flos": 676673409024.0, + "grad_norm": 0.07395752020353057, + "language_loss": 0.83274329, + "learning_rate": 0.0008727098499852728, + "loss": 0.84435147, + "num_input_tokens_seen": 110190160, + "router_z_loss_mlp": 0.46630859, + "step": 1327, + "time_per_iteration": 2.8289363384246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138805, + "balance_loss_mlp": 1.0946734, + "epoch": 0.2554828780300115, + "flos": 537815572992.0, + "grad_norm": 0.05433597882612883, + "language_loss": 0.90389377, + "learning_rate": 0.0008725021049684034, + "loss": 0.91528177, + "num_input_tokens_seen": 110268000, + "router_z_loss_mlp": 0.44165039, + "step": 1328, + "time_per_iteration": 2.766871452331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134733, + "balance_loss_mlp": 1.09057808, + "epoch": 0.2556752597152751, + "flos": 824186409984.0, + "grad_norm": 0.04999939134312536, + "language_loss": 0.83732843, + "learning_rate": 0.000872294215333391, + "loss": 0.84867573, + "num_input_tokens_seen": 110354816, + "router_z_loss_mlp": 0.44165039, + "step": 1329, + "time_per_iteration": 3.181687116622925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133543, + "balance_loss_mlp": 1.08941174, + "epoch": 0.2558676414005387, + "flos": 570791328768.0, + "grad_norm": 0.053270875218317436, + "language_loss": 0.83338815, + "learning_rate": 0.0008720861811609457, + "loss": 0.84472358, + "num_input_tokens_seen": 110427968, + "router_z_loss_mlp": 0.44140625, + "step": 1330, + "time_per_iteration": 2.753095865249634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139869, + "balance_loss_mlp": 1.09282851, + "epoch": 0.2560600230858022, + "flos": 486684453888.0, + "grad_norm": 0.0744958299593676, + "language_loss": 0.83801699, + "learning_rate": 0.0008718780025318338, + "loss": 0.84941566, + "num_input_tokens_seen": 110501184, + "router_z_loss_mlp": 0.4699707, + "step": 1331, + "time_per_iteration": 2.74076771736145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141571, + "balance_loss_mlp": 1.09913218, + "epoch": 0.2562524047710658, + "flos": 513122397696.0, + "grad_norm": 0.06658506014654758, + "language_loss": 0.84681445, + "learning_rate": 0.0008716696795268771, + "loss": 0.85823017, + "num_input_tokens_seen": 110573008, + "router_z_loss_mlp": 0.42456055, + "step": 1332, + "time_per_iteration": 2.6771953105926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141914, + "balance_loss_mlp": 1.09718704, + "epoch": 0.25644478645632934, + "flos": 634820032512.0, + "grad_norm": 0.06458865940403113, + "language_loss": 0.86108088, + "learning_rate": 0.0008714612122269538, + "loss": 0.87250006, + "num_input_tokens_seen": 110646704, + "router_z_loss_mlp": 0.44750977, + "step": 1333, + "time_per_iteration": 2.872405767440796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145867, + "balance_loss_mlp": 1.09944701, + "epoch": 0.25663716814159293, + "flos": 436591088640.0, + "grad_norm": 0.06078246423813374, + "language_loss": 0.89285004, + "learning_rate": 0.0008712526007129982, + "loss": 0.90430868, + "num_input_tokens_seen": 110712208, + "router_z_loss_mlp": 0.46411133, + "step": 1334, + "time_per_iteration": 2.575467586517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148778, + "balance_loss_mlp": 1.10517156, + "epoch": 0.25682954982685646, + "flos": 498161415168.0, + "grad_norm": 0.06822349657501799, + "language_loss": 0.91275418, + "learning_rate": 0.0008710438450660003, + "loss": 0.92424202, + "num_input_tokens_seen": 110783936, + "router_z_loss_mlp": 0.43603516, + "step": 1335, + "time_per_iteration": 2.6461987495422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149209, + "balance_loss_mlp": 1.10157323, + "epoch": 0.25702193151212005, + "flos": 457701871104.0, + "grad_norm": 0.08158488021096956, + "language_loss": 0.88278055, + "learning_rate": 0.0008708349453670064, + "loss": 0.89427269, + "num_input_tokens_seen": 110848560, + "router_z_loss_mlp": 0.47583008, + "step": 1336, + "time_per_iteration": 2.5001657009124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128588, + "balance_loss_mlp": 1.08297849, + "epoch": 0.2572143131973836, + "flos": 598281707520.0, + "grad_norm": 0.0603403973753485, + "language_loss": 0.91654134, + "learning_rate": 0.0008706259016971185, + "loss": 0.92782724, + "num_input_tokens_seen": 110922672, + "router_z_loss_mlp": 0.45629883, + "step": 1337, + "time_per_iteration": 2.817657947540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127771, + "balance_loss_mlp": 1.07865644, + "epoch": 0.25740669488264717, + "flos": 698308024320.0, + "grad_norm": 0.08421665296665147, + "language_loss": 0.83723027, + "learning_rate": 0.0008704167141374944, + "loss": 0.848508, + "num_input_tokens_seen": 110995456, + "router_z_loss_mlp": 0.49145508, + "step": 1338, + "time_per_iteration": 2.808487892150879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128146, + "balance_loss_mlp": 1.08003271, + "epoch": 0.25759907656791076, + "flos": 502379241984.0, + "grad_norm": 0.05813050369368248, + "language_loss": 0.88781357, + "learning_rate": 0.0008702073827693482, + "loss": 0.89909494, + "num_input_tokens_seen": 111069568, + "router_z_loss_mlp": 0.48144531, + "step": 1339, + "time_per_iteration": 2.687836170196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131918, + "balance_loss_mlp": 1.08711886, + "epoch": 0.2577914582531743, + "flos": 773880500736.0, + "grad_norm": 0.05714278292432699, + "language_loss": 0.89388514, + "learning_rate": 0.0008699979076739494, + "loss": 0.9052043, + "num_input_tokens_seen": 111142608, + "router_z_loss_mlp": 0.44799805, + "step": 1340, + "time_per_iteration": 2.9907524585723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157888, + "balance_loss_mlp": 1.11089551, + "epoch": 0.2579838399384379, + "flos": 459666150912.0, + "grad_norm": 0.06321899043923618, + "language_loss": 0.8949765, + "learning_rate": 0.0008697882889326234, + "loss": 0.90655541, + "num_input_tokens_seen": 111206336, + "router_z_loss_mlp": 0.4699707, + "step": 1341, + "time_per_iteration": 2.5261731147766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182653, + "balance_loss_mlp": 1.13513625, + "epoch": 0.2581762216237014, + "flos": 569185325568.0, + "grad_norm": 0.06545350512623192, + "language_loss": 0.87013066, + "learning_rate": 0.0008695785266267515, + "loss": 0.88195717, + "num_input_tokens_seen": 111276736, + "router_z_loss_mlp": 0.4753418, + "step": 1342, + "time_per_iteration": 2.719949722290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194656, + "balance_loss_mlp": 1.14585173, + "epoch": 0.258368603308965, + "flos": 604201711104.0, + "grad_norm": 0.07227104516109029, + "language_loss": 0.8379634, + "learning_rate": 0.0008693686208377704, + "loss": 0.84991002, + "num_input_tokens_seen": 111353856, + "router_z_loss_mlp": 0.48828125, + "step": 1343, + "time_per_iteration": 2.789046049118042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011909, + "balance_loss_mlp": 1.14572012, + "epoch": 0.2585609849942285, + "flos": 491460618240.0, + "grad_norm": 0.08291144049116697, + "language_loss": 0.89388204, + "learning_rate": 0.0008691585716471733, + "loss": 0.90579104, + "num_input_tokens_seen": 111424960, + "router_z_loss_mlp": 0.45214844, + "step": 1344, + "time_per_iteration": 2.63281512260437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182868, + "balance_loss_mlp": 1.1348505, + "epoch": 0.2587533666794921, + "flos": 640755090432.0, + "grad_norm": 0.05462335243620436, + "language_loss": 0.86349607, + "learning_rate": 0.0008689483791365079, + "loss": 0.87532479, + "num_input_tokens_seen": 111505248, + "router_z_loss_mlp": 0.48022461, + "step": 1345, + "time_per_iteration": 2.8293464183807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165648, + "balance_loss_mlp": 1.11879873, + "epoch": 0.2589457483647557, + "flos": 576849724416.0, + "grad_norm": 0.060641418043912716, + "language_loss": 0.89744675, + "learning_rate": 0.0008687380433873786, + "loss": 0.90910327, + "num_input_tokens_seen": 111581936, + "router_z_loss_mlp": 0.46875, + "step": 1346, + "time_per_iteration": 2.757361650466919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150314, + "balance_loss_mlp": 1.100389, + "epoch": 0.25913813005001923, + "flos": 535424007168.0, + "grad_norm": 0.0738804898683007, + "language_loss": 0.83070856, + "learning_rate": 0.0008685275644814448, + "loss": 0.84221172, + "num_input_tokens_seen": 111651456, + "router_z_loss_mlp": 0.49926758, + "step": 1347, + "time_per_iteration": 2.716006278991699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147842, + "balance_loss_mlp": 1.10087395, + "epoch": 0.2593305117352828, + "flos": 721039491072.0, + "grad_norm": 0.07544817120788133, + "language_loss": 0.85244781, + "learning_rate": 0.0008683169425004216, + "loss": 0.86392623, + "num_input_tokens_seen": 111731712, + "router_z_loss_mlp": 0.46972656, + "step": 1348, + "time_per_iteration": 2.900754451751709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114914, + "balance_loss_mlp": 1.09842825, + "epoch": 0.25952289342054635, + "flos": 710096274432.0, + "grad_norm": 0.08404854247051008, + "language_loss": 0.83688962, + "learning_rate": 0.0008681061775260799, + "loss": 0.84838104, + "num_input_tokens_seen": 111800752, + "router_z_loss_mlp": 0.50708008, + "step": 1349, + "time_per_iteration": 2.8356235027313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140271, + "balance_loss_mlp": 1.09356534, + "epoch": 0.25971527510580994, + "flos": 455920399872.0, + "grad_norm": 0.08196022848482862, + "language_loss": 0.92983842, + "learning_rate": 0.0008678952696402458, + "loss": 0.94124115, + "num_input_tokens_seen": 111866752, + "router_z_loss_mlp": 0.46704102, + "step": 1350, + "time_per_iteration": 2.5051889419555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132188, + "balance_loss_mlp": 1.0865308, + "epoch": 0.25990765679107347, + "flos": 612528334848.0, + "grad_norm": 0.052642437263987304, + "language_loss": 0.86759204, + "learning_rate": 0.000867684218924801, + "loss": 0.87891388, + "num_input_tokens_seen": 111951328, + "router_z_loss_mlp": 0.45629883, + "step": 1351, + "time_per_iteration": 2.8635144233703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089623, + "balance_loss_mlp": 1.0725522, + "epoch": 0.26010003847633706, + "flos": 1537963075584.0, + "grad_norm": 0.04013302579778462, + "language_loss": 0.78947091, + "learning_rate": 0.0008674730254616827, + "loss": 0.80036712, + "num_input_tokens_seen": 112182272, + "router_z_loss_mlp": 0.17089844, + "step": 1352, + "time_per_iteration": 4.89817476272583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121624, + "balance_loss_mlp": 1.07587171, + "epoch": 0.2602924201616006, + "flos": 716265897984.0, + "grad_norm": 0.055845692832442596, + "language_loss": 0.85694808, + "learning_rate": 0.0008672616893328834, + "loss": 0.8681643, + "num_input_tokens_seen": 112261760, + "router_z_loss_mlp": 0.45751953, + "step": 1353, + "time_per_iteration": 2.9335103034973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123767, + "balance_loss_mlp": 1.07877684, + "epoch": 0.2604848018468642, + "flos": 643529899008.0, + "grad_norm": 0.07010977425409264, + "language_loss": 0.9082427, + "learning_rate": 0.0008670502106204512, + "loss": 0.91948032, + "num_input_tokens_seen": 112339136, + "router_z_loss_mlp": 0.44970703, + "step": 1354, + "time_per_iteration": 2.8469178676605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138616, + "balance_loss_mlp": 1.08840501, + "epoch": 0.26067718353212777, + "flos": 517033704960.0, + "grad_norm": 0.056353527093492256, + "language_loss": 0.82360619, + "learning_rate": 0.0008668385894064892, + "loss": 0.83499235, + "num_input_tokens_seen": 112409872, + "router_z_loss_mlp": 0.50195312, + "step": 1355, + "time_per_iteration": 2.672883987426758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149756, + "balance_loss_mlp": 1.10321617, + "epoch": 0.2608695652173913, + "flos": 822733479936.0, + "grad_norm": 0.05383030346289838, + "language_loss": 0.89593899, + "learning_rate": 0.0008666268257731562, + "loss": 0.90743661, + "num_input_tokens_seen": 112495616, + "router_z_loss_mlp": 0.46557617, + "step": 1356, + "time_per_iteration": 3.1050939559936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169178, + "balance_loss_mlp": 1.12127948, + "epoch": 0.2610619469026549, + "flos": 1007850097152.0, + "grad_norm": 0.05849819020383372, + "language_loss": 0.85968256, + "learning_rate": 0.0008664149198026662, + "loss": 0.87137431, + "num_input_tokens_seen": 112575168, + "router_z_loss_mlp": 0.47900391, + "step": 1357, + "time_per_iteration": 3.226966619491577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156465, + "balance_loss_mlp": 1.10932934, + "epoch": 0.2612543285879184, + "flos": 536782961664.0, + "grad_norm": 0.07293583935871151, + "language_loss": 0.89518476, + "learning_rate": 0.0008662028715772883, + "loss": 0.90674949, + "num_input_tokens_seen": 112648480, + "router_z_loss_mlp": 0.47143555, + "step": 1358, + "time_per_iteration": 2.5949370861053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163078, + "balance_loss_mlp": 1.11718237, + "epoch": 0.261446710273182, + "flos": 519420501504.0, + "grad_norm": 0.05890556701012809, + "language_loss": 0.86217821, + "learning_rate": 0.0008659906811793467, + "loss": 0.87380904, + "num_input_tokens_seen": 112719856, + "router_z_loss_mlp": 0.45898438, + "step": 1359, + "time_per_iteration": 2.651193857192993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151481, + "balance_loss_mlp": 1.10699224, + "epoch": 0.26163909195844554, + "flos": 583259056128.0, + "grad_norm": 0.06298146111957026, + "language_loss": 0.90418088, + "learning_rate": 0.0008657783486912215, + "loss": 0.91569573, + "num_input_tokens_seen": 112795088, + "router_z_loss_mlp": 0.44482422, + "step": 1360, + "time_per_iteration": 2.723550319671631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156338, + "balance_loss_mlp": 1.11022782, + "epoch": 0.2618314736437091, + "flos": 958762179072.0, + "grad_norm": 0.055299708084911615, + "language_loss": 0.90110713, + "learning_rate": 0.0008655658741953472, + "loss": 0.91267049, + "num_input_tokens_seen": 112879888, + "router_z_loss_mlp": 0.4609375, + "step": 1361, + "time_per_iteration": 3.216830015182495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139946, + "balance_loss_mlp": 1.09564757, + "epoch": 0.26202385532897265, + "flos": 574803952128.0, + "grad_norm": 0.04868556149108388, + "language_loss": 0.89168048, + "learning_rate": 0.0008653532577742136, + "loss": 0.90307987, + "num_input_tokens_seen": 112952208, + "router_z_loss_mlp": 0.44311523, + "step": 1362, + "time_per_iteration": 2.718886375427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143457, + "balance_loss_mlp": 1.0986346, + "epoch": 0.26221623701423624, + "flos": 445471280640.0, + "grad_norm": 0.058057923999792295, + "language_loss": 0.87558335, + "learning_rate": 0.0008651404995103659, + "loss": 0.88701797, + "num_input_tokens_seen": 113017472, + "router_z_loss_mlp": 0.44824219, + "step": 1363, + "time_per_iteration": 2.594294309616089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137373, + "balance_loss_mlp": 1.09338474, + "epoch": 0.26240861869949983, + "flos": 535718043648.0, + "grad_norm": 0.06330728330165165, + "language_loss": 0.87334514, + "learning_rate": 0.0008649275994864041, + "loss": 0.88471884, + "num_input_tokens_seen": 113090000, + "router_z_loss_mlp": 0.43994141, + "step": 1364, + "time_per_iteration": 2.707449197769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144999, + "balance_loss_mlp": 1.09879303, + "epoch": 0.26260100038476336, + "flos": 565249052160.0, + "grad_norm": 0.05276541609050752, + "language_loss": 0.84391934, + "learning_rate": 0.0008647145577849834, + "loss": 0.85536933, + "num_input_tokens_seen": 113169424, + "router_z_loss_mlp": 0.46191406, + "step": 1365, + "time_per_iteration": 2.8216350078582764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131283, + "balance_loss_mlp": 1.08560157, + "epoch": 0.26279338207002695, + "flos": 613059508224.0, + "grad_norm": 0.05376997595185902, + "language_loss": 0.83317888, + "learning_rate": 0.0008645013744888139, + "loss": 0.84449172, + "num_input_tokens_seen": 113256752, + "router_z_loss_mlp": 0.45678711, + "step": 1366, + "time_per_iteration": 2.866891622543335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149616, + "balance_loss_mlp": 1.10536587, + "epoch": 0.2629857637552905, + "flos": 522832568832.0, + "grad_norm": 0.06316724717597957, + "language_loss": 0.87992281, + "learning_rate": 0.0008642880496806607, + "loss": 0.89141893, + "num_input_tokens_seen": 113330512, + "router_z_loss_mlp": 0.44287109, + "step": 1367, + "time_per_iteration": 2.7763173580169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142909, + "balance_loss_mlp": 1.09772861, + "epoch": 0.26317814544055407, + "flos": 534549238272.0, + "grad_norm": 0.05877759558608074, + "language_loss": 0.84959197, + "learning_rate": 0.0008640745834433437, + "loss": 0.86102104, + "num_input_tokens_seen": 113409088, + "router_z_loss_mlp": 0.4519043, + "step": 1368, + "time_per_iteration": 2.738328218460083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134336, + "balance_loss_mlp": 1.09018087, + "epoch": 0.2633705271258176, + "flos": 555543650304.0, + "grad_norm": 0.05935956886320276, + "language_loss": 0.87054664, + "learning_rate": 0.000863860975859738, + "loss": 0.88189, + "num_input_tokens_seen": 113486624, + "router_z_loss_mlp": 0.44165039, + "step": 1369, + "time_per_iteration": 2.9206831455230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131372, + "balance_loss_mlp": 1.0855242, + "epoch": 0.2635629088110812, + "flos": 552401026560.0, + "grad_norm": 0.06691392922801855, + "language_loss": 0.88684422, + "learning_rate": 0.0008636472270127733, + "loss": 0.89815795, + "num_input_tokens_seen": 113555776, + "router_z_loss_mlp": 0.45825195, + "step": 1370, + "time_per_iteration": 2.6078739166259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116186, + "balance_loss_mlp": 1.07021928, + "epoch": 0.2637552904963448, + "flos": 455984640000.0, + "grad_norm": 0.06515524250359679, + "language_loss": 0.90367895, + "learning_rate": 0.0008634333369854345, + "loss": 0.91484082, + "num_input_tokens_seen": 113624208, + "router_z_loss_mlp": 0.45947266, + "step": 1371, + "time_per_iteration": 2.6001384258270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110327, + "balance_loss_mlp": 1.0667206, + "epoch": 0.2639476721816083, + "flos": 613128890880.0, + "grad_norm": 0.056061894150206536, + "language_loss": 0.87892628, + "learning_rate": 0.0008632193058607608, + "loss": 0.89002955, + "num_input_tokens_seen": 113698544, + "router_z_loss_mlp": 0.43554688, + "step": 1372, + "time_per_iteration": 2.711435317993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113538, + "balance_loss_mlp": 1.06628299, + "epoch": 0.2641400538668719, + "flos": 571920486912.0, + "grad_norm": 0.060513983317086996, + "language_loss": 0.81023312, + "learning_rate": 0.0008630051337218466, + "loss": 0.82136846, + "num_input_tokens_seen": 113769024, + "router_z_loss_mlp": 0.47314453, + "step": 1373, + "time_per_iteration": 2.656416893005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110026, + "balance_loss_mlp": 1.0668484, + "epoch": 0.2643324355521354, + "flos": 582251037696.0, + "grad_norm": 0.0689512550651149, + "language_loss": 0.82808203, + "learning_rate": 0.0008627908206518409, + "loss": 0.83918226, + "num_input_tokens_seen": 113836320, + "router_z_loss_mlp": 0.43188477, + "step": 1374, + "time_per_iteration": 2.673738956451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039854, + "balance_loss_mlp": 1.02716982, + "epoch": 0.264524817237399, + "flos": 1544678926848.0, + "grad_norm": 0.01820003864645097, + "language_loss": 0.75151253, + "learning_rate": 0.0008625763667339472, + "loss": 0.76191109, + "num_input_tokens_seen": 114065040, + "router_z_loss_mlp": 0.12695312, + "step": 1375, + "time_per_iteration": 5.317140817642212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115308, + "balance_loss_mlp": 1.07272696, + "epoch": 0.26471719892266254, + "flos": 518034382848.0, + "grad_norm": 0.062338636090573274, + "language_loss": 0.91769958, + "learning_rate": 0.0008623617720514241, + "loss": 0.92885268, + "num_input_tokens_seen": 114133488, + "router_z_loss_mlp": 0.42578125, + "step": 1376, + "time_per_iteration": 2.666618585586548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117829, + "balance_loss_mlp": 1.07255304, + "epoch": 0.26490958060792613, + "flos": 517189349376.0, + "grad_norm": 0.08321054400070194, + "language_loss": 0.85169828, + "learning_rate": 0.0008621470366875848, + "loss": 0.86287659, + "num_input_tokens_seen": 114200704, + "router_z_loss_mlp": 0.45288086, + "step": 1377, + "time_per_iteration": 2.5939900875091553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011137, + "balance_loss_mlp": 1.0724293, + "epoch": 0.26510196229318966, + "flos": 596574388224.0, + "grad_norm": 0.0756812485553519, + "language_loss": 0.88528687, + "learning_rate": 0.0008619321607257966, + "loss": 0.89642382, + "num_input_tokens_seen": 114272160, + "router_z_loss_mlp": 0.41259766, + "step": 1378, + "time_per_iteration": 2.675719976425171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112322, + "balance_loss_mlp": 1.08109117, + "epoch": 0.26529434397845325, + "flos": 685800649728.0, + "grad_norm": 0.05967522341676015, + "language_loss": 0.8244732, + "learning_rate": 0.000861717144249482, + "loss": 0.8357054, + "num_input_tokens_seen": 114347904, + "router_z_loss_mlp": 0.42138672, + "step": 1379, + "time_per_iteration": 2.8289949893951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132354, + "balance_loss_mlp": 1.09170318, + "epoch": 0.26548672566371684, + "flos": 424353157632.0, + "grad_norm": 0.06486885922060631, + "language_loss": 0.90334523, + "learning_rate": 0.0008615019873421175, + "loss": 0.91466868, + "num_input_tokens_seen": 114409952, + "router_z_loss_mlp": 0.40649414, + "step": 1380, + "time_per_iteration": 2.4665510654449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141805, + "balance_loss_mlp": 1.09798408, + "epoch": 0.26567910734898037, + "flos": 489864526848.0, + "grad_norm": 0.06471812563896691, + "language_loss": 0.86262017, + "learning_rate": 0.0008612866900872349, + "loss": 0.87403822, + "num_input_tokens_seen": 114474832, + "router_z_loss_mlp": 0.43823242, + "step": 1381, + "time_per_iteration": 2.553489923477173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140972, + "balance_loss_mlp": 1.10017824, + "epoch": 0.26587148903424396, + "flos": 534203444736.0, + "grad_norm": 0.07006288293307902, + "language_loss": 0.88817614, + "learning_rate": 0.0008610712525684197, + "loss": 0.89958596, + "num_input_tokens_seen": 114545152, + "router_z_loss_mlp": 0.40771484, + "step": 1382, + "time_per_iteration": 2.623844861984253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156525, + "balance_loss_mlp": 1.11341906, + "epoch": 0.2660638707195075, + "flos": 1017464094720.0, + "grad_norm": 0.06690376769295572, + "language_loss": 0.85084939, + "learning_rate": 0.0008608556748693121, + "loss": 0.8624146, + "num_input_tokens_seen": 114626512, + "router_z_loss_mlp": 0.43115234, + "step": 1383, + "time_per_iteration": 3.248947858810425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149603, + "balance_loss_mlp": 1.10549557, + "epoch": 0.2662562524047711, + "flos": 523981550592.0, + "grad_norm": 0.05893966497122096, + "language_loss": 0.86648834, + "learning_rate": 0.000860639957073607, + "loss": 0.8779844, + "num_input_tokens_seen": 114701008, + "router_z_loss_mlp": 0.44116211, + "step": 1384, + "time_per_iteration": 2.6954376697540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161137, + "balance_loss_mlp": 1.11838901, + "epoch": 0.2664486340900346, + "flos": 552381202944.0, + "grad_norm": 0.05777577847879513, + "language_loss": 0.88325369, + "learning_rate": 0.0008604240992650534, + "loss": 0.8948651, + "num_input_tokens_seen": 114771984, + "router_z_loss_mlp": 0.42749023, + "step": 1385, + "time_per_iteration": 2.6810553073883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116884, + "balance_loss_mlp": 1.12613928, + "epoch": 0.2666410157752982, + "flos": 470157115392.0, + "grad_norm": 0.1266990207417539, + "language_loss": 0.89650941, + "learning_rate": 0.0008602081015274545, + "loss": 0.90819776, + "num_input_tokens_seen": 114844800, + "router_z_loss_mlp": 0.42724609, + "step": 1386, + "time_per_iteration": 2.7079007625579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169207, + "balance_loss_mlp": 1.12602973, + "epoch": 0.2668333974605617, + "flos": 569919131136.0, + "grad_norm": 0.05666517988787923, + "language_loss": 0.83684492, + "learning_rate": 0.0008599919639446684, + "loss": 0.84853697, + "num_input_tokens_seen": 114918544, + "router_z_loss_mlp": 0.43139648, + "step": 1387, + "time_per_iteration": 2.67275333404541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184027, + "balance_loss_mlp": 1.13755894, + "epoch": 0.2670257791458253, + "flos": 398982703104.0, + "grad_norm": 0.06873806966805297, + "language_loss": 0.80686462, + "learning_rate": 0.000859775686600607, + "loss": 0.81870484, + "num_input_tokens_seen": 114984272, + "router_z_loss_mlp": 0.46459961, + "step": 1388, + "time_per_iteration": 2.568384885787964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192065, + "balance_loss_mlp": 1.14676547, + "epoch": 0.2672181608310889, + "flos": 515847647232.0, + "grad_norm": 0.07413400256287127, + "language_loss": 0.85524642, + "learning_rate": 0.0008595592695792367, + "loss": 0.86716712, + "num_input_tokens_seen": 115054800, + "router_z_loss_mlp": 0.453125, + "step": 1389, + "time_per_iteration": 2.6748523712158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182907, + "balance_loss_mlp": 1.13884759, + "epoch": 0.26741054251635243, + "flos": 507521023488.0, + "grad_norm": 0.06676524761439688, + "language_loss": 0.9117986, + "learning_rate": 0.0008593427129645778, + "loss": 0.92362767, + "num_input_tokens_seen": 115120928, + "router_z_loss_mlp": 0.44042969, + "step": 1390, + "time_per_iteration": 2.5506954193115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186114, + "balance_loss_mlp": 1.14205468, + "epoch": 0.267602924201616, + "flos": 576647092224.0, + "grad_norm": 0.056989477345309104, + "language_loss": 0.85532665, + "learning_rate": 0.0008591260168407052, + "loss": 0.86718786, + "num_input_tokens_seen": 115196688, + "router_z_loss_mlp": 0.44067383, + "step": 1391, + "time_per_iteration": 2.759000778198242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181583, + "balance_loss_mlp": 1.13714194, + "epoch": 0.26779530588687955, + "flos": 523984121856.0, + "grad_norm": 0.12230490659722075, + "language_loss": 0.83154678, + "learning_rate": 0.0008589091812917479, + "loss": 0.84336257, + "num_input_tokens_seen": 115264912, + "router_z_loss_mlp": 0.4440918, + "step": 1392, + "time_per_iteration": 2.6213910579681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183464, + "balance_loss_mlp": 1.14030981, + "epoch": 0.26798768757214314, + "flos": 556771926528.0, + "grad_norm": 0.07403824045185783, + "language_loss": 0.8547672, + "learning_rate": 0.0008586922064018887, + "loss": 0.86660182, + "num_input_tokens_seen": 115334672, + "router_z_loss_mlp": 0.43139648, + "step": 1393, + "time_per_iteration": 2.6706490516662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170846, + "balance_loss_mlp": 1.12375855, + "epoch": 0.2681800692574067, + "flos": 930614717952.0, + "grad_norm": 0.06891205333434622, + "language_loss": 0.89827204, + "learning_rate": 0.0008584750922553651, + "loss": 0.90998048, + "num_input_tokens_seen": 115420032, + "router_z_loss_mlp": 0.47021484, + "step": 1394, + "time_per_iteration": 3.1465976238250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164798, + "balance_loss_mlp": 1.1222403, + "epoch": 0.26837245094267026, + "flos": 701080261632.0, + "grad_norm": 0.06253124916771012, + "language_loss": 0.84102368, + "learning_rate": 0.0008582578389364677, + "loss": 0.85267168, + "num_input_tokens_seen": 115492576, + "router_z_loss_mlp": 0.42529297, + "step": 1395, + "time_per_iteration": 2.853278875350952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170721, + "balance_loss_mlp": 1.12573135, + "epoch": 0.26856483262793385, + "flos": 593191683072.0, + "grad_norm": 0.0656545534576685, + "language_loss": 0.92268932, + "learning_rate": 0.0008580404465295422, + "loss": 0.93439656, + "num_input_tokens_seen": 115568368, + "router_z_loss_mlp": 0.44970703, + "step": 1396, + "time_per_iteration": 2.773932695388794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152323, + "balance_loss_mlp": 1.10826349, + "epoch": 0.2687572143131974, + "flos": 714271882752.0, + "grad_norm": 0.07972324646927738, + "language_loss": 0.88789833, + "learning_rate": 0.0008578229151189876, + "loss": 0.89942157, + "num_input_tokens_seen": 115651536, + "router_z_loss_mlp": 0.44067383, + "step": 1397, + "time_per_iteration": 2.934276819229126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144571, + "balance_loss_mlp": 1.10151267, + "epoch": 0.26894959599846097, + "flos": 467718561792.0, + "grad_norm": 0.10010461149900847, + "language_loss": 0.8178823, + "learning_rate": 0.0008576052447892573, + "loss": 0.82932794, + "num_input_tokens_seen": 115715696, + "router_z_loss_mlp": 0.43115234, + "step": 1398, + "time_per_iteration": 2.5337071418762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131122, + "balance_loss_mlp": 1.08768189, + "epoch": 0.2691419776837245, + "flos": 468701987328.0, + "grad_norm": 0.07718983812215899, + "language_loss": 0.86768365, + "learning_rate": 0.000857387435624858, + "loss": 0.87899494, + "num_input_tokens_seen": 115780928, + "router_z_loss_mlp": 0.43457031, + "step": 1399, + "time_per_iteration": 2.5189273357391357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127749, + "balance_loss_mlp": 1.08404672, + "epoch": 0.2693343593689881, + "flos": 937651396608.0, + "grad_norm": 0.0707561541840249, + "language_loss": 0.88852745, + "learning_rate": 0.0008571694877103513, + "loss": 0.89980495, + "num_input_tokens_seen": 115874432, + "router_z_loss_mlp": 0.43701172, + "step": 1400, + "time_per_iteration": 3.287325859069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126684, + "balance_loss_mlp": 1.08372128, + "epoch": 0.2695267410542516, + "flos": 577600782336.0, + "grad_norm": 0.08476375879770352, + "language_loss": 0.88499445, + "learning_rate": 0.0008569514011303515, + "loss": 0.89626133, + "num_input_tokens_seen": 115956608, + "router_z_loss_mlp": 0.4296875, + "step": 1401, + "time_per_iteration": 2.849506378173828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120044, + "balance_loss_mlp": 1.07770109, + "epoch": 0.2697191227395152, + "flos": 556823683584.0, + "grad_norm": 0.12418270059874827, + "language_loss": 0.88531977, + "learning_rate": 0.0008567331759695277, + "loss": 0.89652026, + "num_input_tokens_seen": 116031728, + "router_z_loss_mlp": 0.42358398, + "step": 1402, + "time_per_iteration": 2.7033023834228516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119932, + "balance_loss_mlp": 1.07584798, + "epoch": 0.26991150442477874, + "flos": 529281547776.0, + "grad_norm": 0.09855769315853927, + "language_loss": 0.86756563, + "learning_rate": 0.0008565148123126023, + "loss": 0.87876499, + "num_input_tokens_seen": 116104288, + "router_z_loss_mlp": 0.44091797, + "step": 1403, + "time_per_iteration": 2.645425319671631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119876, + "balance_loss_mlp": 1.07769978, + "epoch": 0.2701038861100423, + "flos": 532006797312.0, + "grad_norm": 0.15226973878739974, + "language_loss": 0.86578166, + "learning_rate": 0.0008562963102443516, + "loss": 0.87698042, + "num_input_tokens_seen": 116177920, + "router_z_loss_mlp": 0.421875, + "step": 1404, + "time_per_iteration": 2.6965179443359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130222, + "balance_loss_mlp": 1.08668637, + "epoch": 0.2702962677953059, + "flos": 735227020800.0, + "grad_norm": 0.09156828725831004, + "language_loss": 0.85926664, + "learning_rate": 0.0008560776698496056, + "loss": 0.87056887, + "num_input_tokens_seen": 116251680, + "router_z_loss_mlp": 0.43530273, + "step": 1405, + "time_per_iteration": 2.868159532546997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141969, + "balance_loss_mlp": 1.09707534, + "epoch": 0.27048864948056944, + "flos": 574761733632.0, + "grad_norm": 0.07226677638641436, + "language_loss": 0.86433703, + "learning_rate": 0.0008558588912132481, + "loss": 0.87575674, + "num_input_tokens_seen": 116327664, + "router_z_loss_mlp": 0.44873047, + "step": 1406, + "time_per_iteration": 2.8309988975524902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066671, + "balance_loss_mlp": 1.05236614, + "epoch": 0.27068103116583303, + "flos": 1423853489664.0, + "grad_norm": 0.03207539465139433, + "language_loss": 0.76458991, + "learning_rate": 0.0008556399744202163, + "loss": 0.77525663, + "num_input_tokens_seen": 116555152, + "router_z_loss_mlp": 0.14257812, + "step": 1407, + "time_per_iteration": 4.926543235778809 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136459, + "balance_loss_mlp": 1.09220862, + "epoch": 0.27087341285109656, + "flos": 531999456768.0, + "grad_norm": 0.06146298960376288, + "language_loss": 0.83448923, + "learning_rate": 0.0008554209195555016, + "loss": 0.84585381, + "num_input_tokens_seen": 116626016, + "router_z_loss_mlp": 0.44287109, + "step": 1408, + "time_per_iteration": 2.6698648929595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136456, + "balance_loss_mlp": 1.08965421, + "epoch": 0.27106579453636015, + "flos": 581378840064.0, + "grad_norm": 0.1627330563817166, + "language_loss": 0.89102834, + "learning_rate": 0.0008552017267041483, + "loss": 0.90239286, + "num_input_tokens_seen": 116699152, + "router_z_loss_mlp": 0.46801758, + "step": 1409, + "time_per_iteration": 2.6957972049713135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127578, + "balance_loss_mlp": 1.08349395, + "epoch": 0.2712581762216237, + "flos": 506801899008.0, + "grad_norm": 0.06560812899143556, + "language_loss": 0.83656335, + "learning_rate": 0.0008549823959512549, + "loss": 0.84783912, + "num_input_tokens_seen": 116770912, + "router_z_loss_mlp": 0.44091797, + "step": 1410, + "time_per_iteration": 2.7068376541137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011101, + "balance_loss_mlp": 1.06708908, + "epoch": 0.27145055790688727, + "flos": 997442823168.0, + "grad_norm": 0.08175260567644033, + "language_loss": 0.87610555, + "learning_rate": 0.0008547629273819728, + "loss": 0.88720655, + "num_input_tokens_seen": 116863088, + "router_z_loss_mlp": 0.43041992, + "step": 1411, + "time_per_iteration": 3.366260290145874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108005, + "balance_loss_mlp": 1.06542349, + "epoch": 0.2716429395921508, + "flos": 546681083904.0, + "grad_norm": 0.10517352924457117, + "language_loss": 0.84009993, + "learning_rate": 0.0008545433210815074, + "loss": 0.85118002, + "num_input_tokens_seen": 116929504, + "router_z_loss_mlp": 0.42578125, + "step": 1412, + "time_per_iteration": 2.630105972290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112252, + "balance_loss_mlp": 1.07931852, + "epoch": 0.2718353212774144, + "flos": 573225113088.0, + "grad_norm": 0.09841738404648297, + "language_loss": 0.87974489, + "learning_rate": 0.0008543235771351176, + "loss": 0.89097011, + "num_input_tokens_seen": 117004064, + "router_z_loss_mlp": 0.43188477, + "step": 1413, + "time_per_iteration": 2.725048065185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129323, + "balance_loss_mlp": 1.08635998, + "epoch": 0.272027702962678, + "flos": 644305549824.0, + "grad_norm": 0.059677420125308425, + "language_loss": 0.84918916, + "learning_rate": 0.0008541036956281154, + "loss": 0.86048239, + "num_input_tokens_seen": 117081328, + "router_z_loss_mlp": 0.42993164, + "step": 1414, + "time_per_iteration": 2.897216796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133545, + "balance_loss_mlp": 1.08898425, + "epoch": 0.2722200846479415, + "flos": 653726827008.0, + "grad_norm": 0.08487151018546404, + "language_loss": 0.82919049, + "learning_rate": 0.0008538836766458665, + "loss": 0.84052598, + "num_input_tokens_seen": 117156544, + "router_z_loss_mlp": 0.44580078, + "step": 1415, + "time_per_iteration": 2.8930981159210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137425, + "balance_loss_mlp": 1.0942955, + "epoch": 0.2724124663332051, + "flos": 579631873536.0, + "grad_norm": 0.09871518143765563, + "language_loss": 0.85738099, + "learning_rate": 0.0008536635202737897, + "loss": 0.86875528, + "num_input_tokens_seen": 117230208, + "router_z_loss_mlp": 0.43164062, + "step": 1416, + "time_per_iteration": 2.7891178131103516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137299, + "balance_loss_mlp": 1.0931915, + "epoch": 0.2726048480184686, + "flos": 537435274752.0, + "grad_norm": 0.10766210404252562, + "language_loss": 0.82790214, + "learning_rate": 0.0008534432265973573, + "loss": 0.83927512, + "num_input_tokens_seen": 117298080, + "router_z_loss_mlp": 0.44091797, + "step": 1417, + "time_per_iteration": 2.6409006118774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141948, + "balance_loss_mlp": 1.09691095, + "epoch": 0.2727972297037322, + "flos": 995797172736.0, + "grad_norm": 0.07824380469589887, + "language_loss": 0.88708508, + "learning_rate": 0.000853222795702095, + "loss": 0.89850456, + "num_input_tokens_seen": 117396256, + "router_z_loss_mlp": 0.45092773, + "step": 1418, + "time_per_iteration": 3.4312241077423096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115343, + "balance_loss_mlp": 1.10767758, + "epoch": 0.27298961138899575, + "flos": 606205638144.0, + "grad_norm": 0.06262628073505326, + "language_loss": 0.84196067, + "learning_rate": 0.0008530022276735813, + "loss": 0.85349494, + "num_input_tokens_seen": 117467936, + "router_z_loss_mlp": 0.45727539, + "step": 1419, + "time_per_iteration": 2.742341995239258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169959, + "balance_loss_mlp": 1.12742519, + "epoch": 0.27318199307425933, + "flos": 529325964288.0, + "grad_norm": 0.07008703106338479, + "language_loss": 0.86301696, + "learning_rate": 0.0008527815225974489, + "loss": 0.87471658, + "num_input_tokens_seen": 117538256, + "router_z_loss_mlp": 0.42529297, + "step": 1420, + "time_per_iteration": 2.643151044845581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172801, + "balance_loss_mlp": 1.12731028, + "epoch": 0.2733743747595229, + "flos": 409029129216.0, + "grad_norm": 0.10800570533054084, + "language_loss": 0.88767672, + "learning_rate": 0.0008525606805593829, + "loss": 0.8994047, + "num_input_tokens_seen": 117599488, + "router_z_loss_mlp": 0.45483398, + "step": 1421, + "time_per_iteration": 2.4374186992645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115892, + "balance_loss_mlp": 1.11283422, + "epoch": 0.27356675644478645, + "flos": 516225747456.0, + "grad_norm": 0.11472023337789067, + "language_loss": 0.83181965, + "learning_rate": 0.0008523397016451213, + "loss": 0.84340894, + "num_input_tokens_seen": 117664240, + "router_z_loss_mlp": 0.46142578, + "step": 1422, + "time_per_iteration": 2.585376739501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152063, + "balance_loss_mlp": 1.10824132, + "epoch": 0.27375913813005004, + "flos": 1052342088192.0, + "grad_norm": 0.08784028487991961, + "language_loss": 0.87910116, + "learning_rate": 0.0008521185859404564, + "loss": 0.89062172, + "num_input_tokens_seen": 117754768, + "router_z_loss_mlp": 0.43847656, + "step": 1423, + "time_per_iteration": 3.399348020553589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150781, + "balance_loss_mlp": 1.10634017, + "epoch": 0.27395151981531357, + "flos": 624805913088.0, + "grad_norm": 0.06323160386311827, + "language_loss": 0.89755672, + "learning_rate": 0.0008518973335312326, + "loss": 0.90906453, + "num_input_tokens_seen": 117832816, + "router_z_loss_mlp": 0.44433594, + "step": 1424, + "time_per_iteration": 2.771397352218628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141797, + "balance_loss_mlp": 1.09628344, + "epoch": 0.27414390150057716, + "flos": 550372506624.0, + "grad_norm": 0.0741893947597381, + "language_loss": 0.83755773, + "learning_rate": 0.0008516759445033477, + "loss": 0.84897572, + "num_input_tokens_seen": 117899168, + "router_z_loss_mlp": 0.45532227, + "step": 1425, + "time_per_iteration": 2.623136520385742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148744, + "balance_loss_mlp": 1.10227656, + "epoch": 0.2743362831858407, + "flos": 539866487808.0, + "grad_norm": 0.08118081060083703, + "language_loss": 0.85448551, + "learning_rate": 0.0008514544189427526, + "loss": 0.865973, + "num_input_tokens_seen": 117972384, + "router_z_loss_mlp": 0.46484375, + "step": 1426, + "time_per_iteration": 2.695749044418335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156426, + "balance_loss_mlp": 1.11208034, + "epoch": 0.2745286648711043, + "flos": 468590759424.0, + "grad_norm": 0.0837156631450272, + "language_loss": 0.86976963, + "learning_rate": 0.0008512327569354511, + "loss": 0.88133389, + "num_input_tokens_seen": 118039584, + "router_z_loss_mlp": 0.44360352, + "step": 1427, + "time_per_iteration": 2.5354061126708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160051, + "balance_loss_mlp": 1.11353528, + "epoch": 0.2747210465563678, + "flos": 472867683840.0, + "grad_norm": 0.09189170382991782, + "language_loss": 0.84034801, + "learning_rate": 0.0008510109585675001, + "loss": 0.8519485, + "num_input_tokens_seen": 118108352, + "router_z_loss_mlp": 0.46508789, + "step": 1428, + "time_per_iteration": 2.5996179580688477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093492, + "balance_loss_mlp": 1.07680273, + "epoch": 0.2749134282416314, + "flos": 1315085372928.0, + "grad_norm": 0.03549776566589832, + "language_loss": 0.81153345, + "learning_rate": 0.0008507890239250093, + "loss": 0.8224684, + "num_input_tokens_seen": 118331120, + "router_z_loss_mlp": 0.16699219, + "step": 1429, + "time_per_iteration": 4.714696407318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172648, + "balance_loss_mlp": 1.1280638, + "epoch": 0.275105809926895, + "flos": 970861718016.0, + "grad_norm": 0.1239425770540774, + "language_loss": 0.81035018, + "learning_rate": 0.0008505669530941415, + "loss": 0.82207668, + "num_input_tokens_seen": 118415872, + "router_z_loss_mlp": 0.44580078, + "step": 1430, + "time_per_iteration": 3.346867322921753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171144, + "balance_loss_mlp": 1.12613082, + "epoch": 0.2752981916121585, + "flos": 527344432128.0, + "grad_norm": 0.0741807723541833, + "language_loss": 0.84519219, + "learning_rate": 0.000850344746161112, + "loss": 0.85690367, + "num_input_tokens_seen": 118483008, + "router_z_loss_mlp": 0.45019531, + "step": 1431, + "time_per_iteration": 2.6365530490875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178527, + "balance_loss_mlp": 1.13418126, + "epoch": 0.2754905732974221, + "flos": 453709071360.0, + "grad_norm": 0.09683250699138053, + "language_loss": 0.88287663, + "learning_rate": 0.0008501224032121894, + "loss": 0.8946619, + "num_input_tokens_seen": 118545840, + "router_z_loss_mlp": 0.44360352, + "step": 1432, + "time_per_iteration": 2.5015640258789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178788, + "balance_loss_mlp": 1.13406062, + "epoch": 0.27568295498268564, + "flos": 497474597376.0, + "grad_norm": 0.06051880699738469, + "language_loss": 0.82098711, + "learning_rate": 0.0008498999243336946, + "loss": 0.832775, + "num_input_tokens_seen": 118615168, + "router_z_loss_mlp": 0.44726562, + "step": 1433, + "time_per_iteration": 2.643663167953491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198526, + "balance_loss_mlp": 1.15129471, + "epoch": 0.2758753366679492, + "flos": 608194510848.0, + "grad_norm": 0.07173936681504893, + "language_loss": 0.87897062, + "learning_rate": 0.0008496773096120021, + "loss": 0.89095587, + "num_input_tokens_seen": 118690384, + "router_z_loss_mlp": 0.47241211, + "step": 1434, + "time_per_iteration": 2.8680803775787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198281, + "balance_loss_mlp": 1.15164685, + "epoch": 0.27606771835321275, + "flos": 740129094144.0, + "grad_norm": 0.07924459326066897, + "language_loss": 0.84949142, + "learning_rate": 0.0008494545591335381, + "loss": 0.86147422, + "num_input_tokens_seen": 118763024, + "router_z_loss_mlp": 0.46630859, + "step": 1435, + "time_per_iteration": 2.9436187744140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197184, + "balance_loss_mlp": 1.15176487, + "epoch": 0.27626010003847634, + "flos": 554572707840.0, + "grad_norm": 0.05338969573395925, + "language_loss": 0.87283278, + "learning_rate": 0.0008492316729847823, + "loss": 0.88480461, + "num_input_tokens_seen": 118845536, + "router_z_loss_mlp": 0.4543457, + "step": 1436, + "time_per_iteration": 2.817201614379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195413, + "balance_loss_mlp": 1.14739525, + "epoch": 0.2764524817237399, + "flos": 542554661376.0, + "grad_norm": 0.08524745340475512, + "language_loss": 0.80082995, + "learning_rate": 0.0008490086512522664, + "loss": 0.81278408, + "num_input_tokens_seen": 118919008, + "router_z_loss_mlp": 0.47998047, + "step": 1437, + "time_per_iteration": 2.7126290798187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196486, + "balance_loss_mlp": 1.14870656, + "epoch": 0.27664486340900346, + "flos": 406246980096.0, + "grad_norm": 0.06867103991167788, + "language_loss": 0.90572739, + "learning_rate": 0.0008487854940225755, + "loss": 0.9176923, + "num_input_tokens_seen": 118981376, + "router_z_loss_mlp": 0.47729492, + "step": 1438, + "time_per_iteration": 2.431755542755127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207177, + "balance_loss_mlp": 1.15858746, + "epoch": 0.27683724509426705, + "flos": 522138410496.0, + "grad_norm": 0.13716227323677116, + "language_loss": 0.90202403, + "learning_rate": 0.0008485622013823466, + "loss": 0.91409582, + "num_input_tokens_seen": 119050560, + "router_z_loss_mlp": 0.48608398, + "step": 1439, + "time_per_iteration": 2.647594451904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198257, + "balance_loss_mlp": 1.15062046, + "epoch": 0.2770296267795306, + "flos": 535349855232.0, + "grad_norm": 0.09985187013126534, + "language_loss": 0.836923, + "learning_rate": 0.00084833877341827, + "loss": 0.84890562, + "num_input_tokens_seen": 119121104, + "router_z_loss_mlp": 0.47680664, + "step": 1440, + "time_per_iteration": 2.652665138244629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215433, + "balance_loss_mlp": 1.16562724, + "epoch": 0.27722200846479417, + "flos": 487991651328.0, + "grad_norm": 0.09777751450797587, + "language_loss": 0.81022394, + "learning_rate": 0.000848115210217088, + "loss": 0.82237822, + "num_input_tokens_seen": 119187712, + "router_z_loss_mlp": 0.49853516, + "step": 1441, + "time_per_iteration": 2.550879955291748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120133, + "balance_loss_mlp": 1.15166724, + "epoch": 0.2774143901500577, + "flos": 618297836544.0, + "grad_norm": 0.06658099231370791, + "language_loss": 0.82249796, + "learning_rate": 0.0008478915118655952, + "loss": 0.83451128, + "num_input_tokens_seen": 119259264, + "router_z_loss_mlp": 0.49658203, + "step": 1442, + "time_per_iteration": 2.7541940212249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209129, + "balance_loss_mlp": 1.16261363, + "epoch": 0.2776067718353213, + "flos": 513819127296.0, + "grad_norm": 0.05385742523937431, + "language_loss": 0.86750221, + "learning_rate": 0.0008476676784506393, + "loss": 0.87959349, + "num_input_tokens_seen": 119328304, + "router_z_loss_mlp": 0.46557617, + "step": 1443, + "time_per_iteration": 2.6595921516418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120895, + "balance_loss_mlp": 1.16083765, + "epoch": 0.2777991535205848, + "flos": 1004395811328.0, + "grad_norm": 0.07541643273231594, + "language_loss": 0.82715142, + "learning_rate": 0.0008474437100591201, + "loss": 0.83924091, + "num_input_tokens_seen": 119412352, + "router_z_loss_mlp": 0.48120117, + "step": 1444, + "time_per_iteration": 3.285985231399536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209577, + "balance_loss_mlp": 1.16258454, + "epoch": 0.2779915352058484, + "flos": 550278531072.0, + "grad_norm": 0.07952238187909891, + "language_loss": 0.8560605, + "learning_rate": 0.0008472196067779898, + "loss": 0.86815625, + "num_input_tokens_seen": 119484464, + "router_z_loss_mlp": 0.47021484, + "step": 1445, + "time_per_iteration": 2.677077293395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204567, + "balance_loss_mlp": 1.15600109, + "epoch": 0.278183916891112, + "flos": 873798160896.0, + "grad_norm": 0.10163023549653756, + "language_loss": 0.86494523, + "learning_rate": 0.0008469953686942531, + "loss": 0.87699091, + "num_input_tokens_seen": 119557280, + "router_z_loss_mlp": 0.48583984, + "step": 1446, + "time_per_iteration": 3.10603928565979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188864, + "balance_loss_mlp": 1.14158559, + "epoch": 0.2783762985763755, + "flos": 624064766976.0, + "grad_norm": 0.0769454608790312, + "language_loss": 0.83537692, + "learning_rate": 0.0008467709958949668, + "loss": 0.84726554, + "num_input_tokens_seen": 119631232, + "router_z_loss_mlp": 0.47265625, + "step": 1447, + "time_per_iteration": 2.7602903842926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116478, + "balance_loss_mlp": 1.11943233, + "epoch": 0.2785686802616391, + "flos": 581838432768.0, + "grad_norm": 0.08244080074007111, + "language_loss": 0.86534739, + "learning_rate": 0.0008465464884672403, + "loss": 0.87699515, + "num_input_tokens_seen": 119700224, + "router_z_loss_mlp": 0.45410156, + "step": 1448, + "time_per_iteration": 2.702974796295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178355, + "balance_loss_mlp": 1.13424778, + "epoch": 0.27876106194690264, + "flos": 587333348352.0, + "grad_norm": 0.061441667483596626, + "language_loss": 0.85982984, + "learning_rate": 0.0008463218464982348, + "loss": 0.87161338, + "num_input_tokens_seen": 119781376, + "router_z_loss_mlp": 0.44091797, + "step": 1449, + "time_per_iteration": 2.832615852355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185601, + "balance_loss_mlp": 1.14058757, + "epoch": 0.27895344363216623, + "flos": 875982325248.0, + "grad_norm": 0.07503412994840371, + "language_loss": 0.88168389, + "learning_rate": 0.0008460970700751645, + "loss": 0.89353991, + "num_input_tokens_seen": 119856672, + "router_z_loss_mlp": 0.45019531, + "step": 1450, + "time_per_iteration": 3.0487136840820312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185626, + "balance_loss_mlp": 1.13977861, + "epoch": 0.27914582531742976, + "flos": 603910245888.0, + "grad_norm": 0.06352945894963989, + "language_loss": 0.88538259, + "learning_rate": 0.000845872159285295, + "loss": 0.89723885, + "num_input_tokens_seen": 119929008, + "router_z_loss_mlp": 0.45849609, + "step": 1451, + "time_per_iteration": 2.715423822402954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067116, + "balance_loss_mlp": 1.04985404, + "epoch": 0.27933820700269335, + "flos": 1497738097152.0, + "grad_norm": 0.02807340123185793, + "language_loss": 0.77766848, + "learning_rate": 0.0008456471142159447, + "loss": 0.78833961, + "num_input_tokens_seen": 120164032, + "router_z_loss_mlp": 0.17285156, + "step": 1452, + "time_per_iteration": 4.906192302703857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197684, + "balance_loss_mlp": 1.15064442, + "epoch": 0.2795305886879569, + "flos": 1031859025920.0, + "grad_norm": 0.06703382456082828, + "language_loss": 0.86617672, + "learning_rate": 0.0008454219349544836, + "loss": 0.87815356, + "num_input_tokens_seen": 120246784, + "router_z_loss_mlp": 0.47045898, + "step": 1453, + "time_per_iteration": 3.3534200191497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198016, + "balance_loss_mlp": 1.15343201, + "epoch": 0.27972297037322047, + "flos": 607058012160.0, + "grad_norm": 0.08552050648295068, + "language_loss": 0.82341981, + "learning_rate": 0.000845196621588334, + "loss": 0.83540004, + "num_input_tokens_seen": 120318208, + "router_z_loss_mlp": 0.44580078, + "step": 1454, + "time_per_iteration": 2.743699073791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204394, + "balance_loss_mlp": 1.1566391, + "epoch": 0.27991535205848406, + "flos": 630380123136.0, + "grad_norm": 0.05325666962256515, + "language_loss": 0.7637955, + "learning_rate": 0.0008449711742049706, + "loss": 0.77583951, + "num_input_tokens_seen": 120393248, + "router_z_loss_mlp": 0.4777832, + "step": 1455, + "time_per_iteration": 2.782561779022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208188, + "balance_loss_mlp": 1.16222095, + "epoch": 0.2801077337437476, + "flos": 549297676800.0, + "grad_norm": 0.09912152167704158, + "language_loss": 0.84447122, + "learning_rate": 0.0008447455928919196, + "loss": 0.85655314, + "num_input_tokens_seen": 120461040, + "router_z_loss_mlp": 0.45996094, + "step": 1456, + "time_per_iteration": 2.597557306289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242882, + "balance_loss_mlp": 1.19460225, + "epoch": 0.2803001154290112, + "flos": 486761177088.0, + "grad_norm": 0.060789109492995964, + "language_loss": 0.87272859, + "learning_rate": 0.0008445198777367595, + "loss": 0.88515741, + "num_input_tokens_seen": 120530400, + "router_z_loss_mlp": 0.48291016, + "step": 1457, + "time_per_iteration": 2.5689990520477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283391, + "balance_loss_mlp": 1.23394287, + "epoch": 0.2804924971142747, + "flos": 522074170368.0, + "grad_norm": 0.0840599244275116, + "language_loss": 0.80820799, + "learning_rate": 0.0008442940288271208, + "loss": 0.82104188, + "num_input_tokens_seen": 120598304, + "router_z_loss_mlp": 0.49365234, + "step": 1458, + "time_per_iteration": 2.674907922744751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01299064, + "balance_loss_mlp": 1.24899602, + "epoch": 0.2806848787995383, + "flos": 527697566208.0, + "grad_norm": 0.06912303271008884, + "language_loss": 0.87410611, + "learning_rate": 0.0008440680462506856, + "loss": 0.88709676, + "num_input_tokens_seen": 120675712, + "router_z_loss_mlp": 0.50073242, + "step": 1459, + "time_per_iteration": 2.73905873298645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01312423, + "balance_loss_mlp": 1.26221192, + "epoch": 0.2808772604848018, + "flos": 485493626880.0, + "grad_norm": 0.11964292138845481, + "language_loss": 0.86650789, + "learning_rate": 0.0008438419300951883, + "loss": 0.87963212, + "num_input_tokens_seen": 120746544, + "router_z_loss_mlp": 0.50219727, + "step": 1460, + "time_per_iteration": 2.6775193214416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01277494, + "balance_loss_mlp": 1.22690177, + "epoch": 0.2810696421700654, + "flos": 618139620864.0, + "grad_norm": 0.08967430845786024, + "language_loss": 0.86711442, + "learning_rate": 0.0008436156804484148, + "loss": 0.87988937, + "num_input_tokens_seen": 120823520, + "router_z_loss_mlp": 0.50610352, + "step": 1461, + "time_per_iteration": 2.8446624279022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225027, + "balance_loss_mlp": 1.17615128, + "epoch": 0.28126202385532895, + "flos": 454754165760.0, + "grad_norm": 0.06778030965882964, + "language_loss": 0.88354933, + "learning_rate": 0.0008433892973982031, + "loss": 0.89579964, + "num_input_tokens_seen": 120889568, + "router_z_loss_mlp": 0.48901367, + "step": 1462, + "time_per_iteration": 2.5101869106292725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212759, + "balance_loss_mlp": 1.16168988, + "epoch": 0.28145440554059253, + "flos": 530704742400.0, + "grad_norm": 0.07940790981700917, + "language_loss": 0.85705763, + "learning_rate": 0.0008431627810324431, + "loss": 0.86918521, + "num_input_tokens_seen": 120958480, + "router_z_loss_mlp": 0.51098633, + "step": 1463, + "time_per_iteration": 2.6701931953430176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208608, + "balance_loss_mlp": 1.15906441, + "epoch": 0.2816467872258561, + "flos": 452228977152.0, + "grad_norm": 0.1112721524597414, + "language_loss": 0.81312853, + "learning_rate": 0.000842936131439076, + "loss": 0.82521462, + "num_input_tokens_seen": 121028032, + "router_z_loss_mlp": 0.49584961, + "step": 1464, + "time_per_iteration": 2.626397132873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182235, + "balance_loss_mlp": 1.13440847, + "epoch": 0.28183916891111965, + "flos": 472712039424.0, + "grad_norm": 0.10805991000078381, + "language_loss": 0.88305855, + "learning_rate": 0.0008427093487060951, + "loss": 0.89488095, + "num_input_tokens_seen": 121099280, + "router_z_loss_mlp": 0.4777832, + "step": 1465, + "time_per_iteration": 2.6287689208984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152025, + "balance_loss_mlp": 1.10815573, + "epoch": 0.28203155059638324, + "flos": 557053479936.0, + "grad_norm": 0.05392746655550109, + "language_loss": 0.85014635, + "learning_rate": 0.000842482432921545, + "loss": 0.86166662, + "num_input_tokens_seen": 121180240, + "router_z_loss_mlp": 0.4387207, + "step": 1466, + "time_per_iteration": 2.843055009841919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140929, + "balance_loss_mlp": 1.09691691, + "epoch": 0.28222393228164677, + "flos": 416980224000.0, + "grad_norm": 0.12216249404138245, + "language_loss": 0.8786549, + "learning_rate": 0.0008422553841735225, + "loss": 0.89006418, + "num_input_tokens_seen": 121242736, + "router_z_loss_mlp": 0.44018555, + "step": 1467, + "time_per_iteration": 2.4870855808258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130953, + "balance_loss_mlp": 1.08686972, + "epoch": 0.28241631396691036, + "flos": 604910923776.0, + "grad_norm": 0.0834179705505054, + "language_loss": 0.85186172, + "learning_rate": 0.0008420282025501757, + "loss": 0.86317128, + "num_input_tokens_seen": 121319248, + "router_z_loss_mlp": 0.44091797, + "step": 1468, + "time_per_iteration": 2.746919631958008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139526, + "balance_loss_mlp": 1.09730196, + "epoch": 0.2826086956521739, + "flos": 572968152576.0, + "grad_norm": 0.07747841896553878, + "language_loss": 0.85862702, + "learning_rate": 0.0008418008881397043, + "loss": 0.8700223, + "num_input_tokens_seen": 121392064, + "router_z_loss_mlp": 0.42236328, + "step": 1469, + "time_per_iteration": 2.7157111167907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011536, + "balance_loss_mlp": 1.11108959, + "epoch": 0.2828010773374375, + "flos": 842756949504.0, + "grad_norm": 0.09196817065592088, + "language_loss": 0.83090472, + "learning_rate": 0.0008415734410303595, + "loss": 0.84244066, + "num_input_tokens_seen": 121475984, + "router_z_loss_mlp": 0.42529297, + "step": 1470, + "time_per_iteration": 3.2546660900115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159701, + "balance_loss_mlp": 1.1166662, + "epoch": 0.28299345902270107, + "flos": 542675801088.0, + "grad_norm": 0.07745609031802311, + "language_loss": 0.91133046, + "learning_rate": 0.0008413458613104444, + "loss": 0.92292744, + "num_input_tokens_seen": 121551024, + "router_z_loss_mlp": 0.43017578, + "step": 1471, + "time_per_iteration": 2.683119773864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124215, + "balance_loss_mlp": 1.08091772, + "epoch": 0.2831858407079646, + "flos": 571606626816.0, + "grad_norm": 0.06716648824100378, + "language_loss": 0.83225214, + "learning_rate": 0.0008411181490683129, + "loss": 0.84349424, + "num_input_tokens_seen": 121624528, + "router_z_loss_mlp": 0.43334961, + "step": 1472, + "time_per_iteration": 2.7247512340545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112102, + "balance_loss_mlp": 1.06692195, + "epoch": 0.2833782223932282, + "flos": 763826734080.0, + "grad_norm": 0.08730853561294576, + "language_loss": 0.83099282, + "learning_rate": 0.0008408903043923707, + "loss": 0.84211385, + "num_input_tokens_seen": 121706736, + "router_z_loss_mlp": 0.45166016, + "step": 1473, + "time_per_iteration": 2.9982750415802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011136, + "balance_loss_mlp": 1.06675041, + "epoch": 0.2835706040784917, + "flos": 539051189760.0, + "grad_norm": 0.09441991509127853, + "language_loss": 0.81456125, + "learning_rate": 0.0008406623273710754, + "loss": 0.82569724, + "num_input_tokens_seen": 121773008, + "router_z_loss_mlp": 0.46826172, + "step": 1474, + "time_per_iteration": 2.6457254886627197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107143, + "balance_loss_mlp": 1.06482363, + "epoch": 0.2837629857637553, + "flos": 530593514496.0, + "grad_norm": 0.08147557265850319, + "language_loss": 0.83874208, + "learning_rate": 0.0008404342180929351, + "loss": 0.84981352, + "num_input_tokens_seen": 121840016, + "router_z_loss_mlp": 0.42358398, + "step": 1475, + "time_per_iteration": 2.6071481704711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110668, + "balance_loss_mlp": 1.06758618, + "epoch": 0.28395536744901884, + "flos": 540032044032.0, + "grad_norm": 0.0682383784230515, + "language_loss": 0.81900609, + "learning_rate": 0.00084020597664651, + "loss": 0.83011281, + "num_input_tokens_seen": 121915008, + "router_z_loss_mlp": 0.43066406, + "step": 1476, + "time_per_iteration": 2.831547260284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118821, + "balance_loss_mlp": 1.07149458, + "epoch": 0.2841477491342824, + "flos": 573635146752.0, + "grad_norm": 0.08199753583087593, + "language_loss": 0.84526181, + "learning_rate": 0.0008399776031204111, + "loss": 0.85645002, + "num_input_tokens_seen": 121987456, + "router_z_loss_mlp": 0.47290039, + "step": 1477, + "time_per_iteration": 2.7336621284484863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112444, + "balance_loss_mlp": 1.07832992, + "epoch": 0.28434013081954596, + "flos": 572068790784.0, + "grad_norm": 0.07183050675580523, + "language_loss": 0.80975109, + "learning_rate": 0.0008397490976033009, + "loss": 0.82099551, + "num_input_tokens_seen": 122058720, + "router_z_loss_mlp": 0.46118164, + "step": 1478, + "time_per_iteration": 2.668337345123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053875, + "balance_loss_mlp": 1.03766239, + "epoch": 0.28453251250480954, + "flos": 1553376310272.0, + "grad_norm": 0.035679392232843235, + "language_loss": 0.77879643, + "learning_rate": 0.000839520460183893, + "loss": 0.78933525, + "num_input_tokens_seen": 122285792, + "router_z_loss_mlp": 0.16210938, + "step": 1479, + "time_per_iteration": 4.813107252120972 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132957, + "balance_loss_mlp": 1.08925462, + "epoch": 0.28472489419007313, + "flos": 749061043200.0, + "grad_norm": 0.06426749014533666, + "language_loss": 0.85708797, + "learning_rate": 0.0008392916909509525, + "loss": 0.86841756, + "num_input_tokens_seen": 122366608, + "router_z_loss_mlp": 0.43725586, + "step": 1480, + "time_per_iteration": 3.105465888977051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145673, + "balance_loss_mlp": 1.10180378, + "epoch": 0.28491727587533666, + "flos": 490158563328.0, + "grad_norm": 0.12099224111333258, + "language_loss": 0.8583495, + "learning_rate": 0.0008390627899932954, + "loss": 0.86980623, + "num_input_tokens_seen": 122435536, + "router_z_loss_mlp": 0.43847656, + "step": 1481, + "time_per_iteration": 2.5961339473724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146403, + "balance_loss_mlp": 1.1041795, + "epoch": 0.28510965756060025, + "flos": 729007838208.0, + "grad_norm": 0.09850404509995118, + "language_loss": 0.88747412, + "learning_rate": 0.000838833757399789, + "loss": 0.89893812, + "num_input_tokens_seen": 122515584, + "router_z_loss_mlp": 0.42211914, + "step": 1482, + "time_per_iteration": 2.9445223808288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160742, + "balance_loss_mlp": 1.11513209, + "epoch": 0.2853020392458638, + "flos": 551573245440.0, + "grad_norm": 0.09258701289693592, + "language_loss": 0.81233478, + "learning_rate": 0.0008386045932593515, + "loss": 0.82394218, + "num_input_tokens_seen": 122585552, + "router_z_loss_mlp": 0.45605469, + "step": 1483, + "time_per_iteration": 2.696171283721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172022, + "balance_loss_mlp": 1.12853456, + "epoch": 0.28549442093112737, + "flos": 754783557120.0, + "grad_norm": 0.07718327666813503, + "language_loss": 0.8687939, + "learning_rate": 0.0008383752976609525, + "loss": 0.88051414, + "num_input_tokens_seen": 122658928, + "router_z_loss_mlp": 0.43481445, + "step": 1484, + "time_per_iteration": 2.948983907699585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159194, + "balance_loss_mlp": 1.11508679, + "epoch": 0.2856868026163909, + "flos": 538589025792.0, + "grad_norm": 0.06564205880415652, + "language_loss": 0.80617285, + "learning_rate": 0.0008381458706936123, + "loss": 0.81776482, + "num_input_tokens_seen": 122729056, + "router_z_loss_mlp": 0.44116211, + "step": 1485, + "time_per_iteration": 2.689715623855591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117177, + "balance_loss_mlp": 1.12740064, + "epoch": 0.2858791843016545, + "flos": 583772977152.0, + "grad_norm": 0.06570872016312425, + "language_loss": 0.87734085, + "learning_rate": 0.0008379163124464025, + "loss": 0.88905853, + "num_input_tokens_seen": 122802832, + "router_z_loss_mlp": 0.44384766, + "step": 1486, + "time_per_iteration": 2.7226197719573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166912, + "balance_loss_mlp": 1.12526059, + "epoch": 0.286071565986918, + "flos": 644812130304.0, + "grad_norm": 0.0915307653224295, + "language_loss": 0.77564812, + "learning_rate": 0.0008376866230084452, + "loss": 0.78731728, + "num_input_tokens_seen": 122881328, + "router_z_loss_mlp": 0.41650391, + "step": 1487, + "time_per_iteration": 2.82708477973938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154293, + "balance_loss_mlp": 1.10901785, + "epoch": 0.2862639476721816, + "flos": 491361873408.0, + "grad_norm": 0.07232162522245564, + "language_loss": 0.86754864, + "learning_rate": 0.000837456802468914, + "loss": 0.87909162, + "num_input_tokens_seen": 122949680, + "router_z_loss_mlp": 0.45239258, + "step": 1488, + "time_per_iteration": 2.6107335090637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115391, + "balance_loss_mlp": 1.1082294, + "epoch": 0.2864563293574452, + "flos": 521639170560.0, + "grad_norm": 0.06580975478488113, + "language_loss": 0.85965604, + "learning_rate": 0.0008372268509170331, + "loss": 0.8711952, + "num_input_tokens_seen": 123024736, + "router_z_loss_mlp": 0.45678711, + "step": 1489, + "time_per_iteration": 2.682190418243408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147981, + "balance_loss_mlp": 1.10554218, + "epoch": 0.2866487110427087, + "flos": 547118281728.0, + "grad_norm": 0.0640942252200205, + "language_loss": 0.85215169, + "learning_rate": 0.0008369967684420779, + "loss": 0.86363149, + "num_input_tokens_seen": 123097344, + "router_z_loss_mlp": 0.42431641, + "step": 1490, + "time_per_iteration": 2.708315372467041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011558, + "balance_loss_mlp": 1.11154985, + "epoch": 0.2868410927279723, + "flos": 482224720896.0, + "grad_norm": 0.07293711729105107, + "language_loss": 0.84566355, + "learning_rate": 0.0008367665551333736, + "loss": 0.85722154, + "num_input_tokens_seen": 123166240, + "router_z_loss_mlp": 0.44262695, + "step": 1491, + "time_per_iteration": 2.605665445327759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159368, + "balance_loss_mlp": 1.11216116, + "epoch": 0.28703347441323585, + "flos": 724889129472.0, + "grad_norm": 0.0802107480821924, + "language_loss": 0.85808468, + "learning_rate": 0.0008365362110802977, + "loss": 0.86967838, + "num_input_tokens_seen": 123238160, + "router_z_loss_mlp": 0.47241211, + "step": 1492, + "time_per_iteration": 2.879655122756958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155437, + "balance_loss_mlp": 1.109303, + "epoch": 0.28722585609849943, + "flos": 634978248192.0, + "grad_norm": 0.06007050516222503, + "language_loss": 0.82957923, + "learning_rate": 0.0008363057363722773, + "loss": 0.84113365, + "num_input_tokens_seen": 123319504, + "router_z_loss_mlp": 0.46142578, + "step": 1493, + "time_per_iteration": 2.8600335121154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154458, + "balance_loss_mlp": 1.11085081, + "epoch": 0.28741823778376296, + "flos": 510229020672.0, + "grad_norm": 0.060904552171674266, + "language_loss": 0.8464222, + "learning_rate": 0.0008360751310987906, + "loss": 0.85796678, + "num_input_tokens_seen": 123387008, + "router_z_loss_mlp": 0.4362793, + "step": 1494, + "time_per_iteration": 2.602029800415039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151781, + "balance_loss_mlp": 1.11160707, + "epoch": 0.28761061946902655, + "flos": 603752030208.0, + "grad_norm": 0.06255193118064963, + "language_loss": 0.86073208, + "learning_rate": 0.0008358443953493666, + "loss": 0.87224984, + "num_input_tokens_seen": 123471056, + "router_z_loss_mlp": 0.40185547, + "step": 1495, + "time_per_iteration": 2.8682689666748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116061, + "balance_loss_mlp": 1.11702669, + "epoch": 0.28780300115429014, + "flos": 407193329664.0, + "grad_norm": 0.06637793594414569, + "language_loss": 0.89093578, + "learning_rate": 0.0008356135292135851, + "loss": 0.90254188, + "num_input_tokens_seen": 123535024, + "router_z_loss_mlp": 0.43579102, + "step": 1496, + "time_per_iteration": 2.519700288772583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162426, + "balance_loss_mlp": 1.11760294, + "epoch": 0.28799538283955367, + "flos": 374929357824.0, + "grad_norm": 0.07926576541007177, + "language_loss": 0.92873323, + "learning_rate": 0.0008353825327810758, + "loss": 0.94035745, + "num_input_tokens_seen": 123596224, + "router_z_loss_mlp": 0.44873047, + "step": 1497, + "time_per_iteration": 2.4195892810821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140417, + "balance_loss_mlp": 1.09852648, + "epoch": 0.28818776452481726, + "flos": 591919363584.0, + "grad_norm": 0.05522330058639147, + "language_loss": 0.81832987, + "learning_rate": 0.00083515140614152, + "loss": 0.82973409, + "num_input_tokens_seen": 123668640, + "router_z_loss_mlp": 0.41894531, + "step": 1498, + "time_per_iteration": 2.6989245414733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151843, + "balance_loss_mlp": 1.10992932, + "epoch": 0.2883801462100808, + "flos": 535075642368.0, + "grad_norm": 0.08112895482541128, + "language_loss": 0.87581354, + "learning_rate": 0.0008349201493846485, + "loss": 0.88733196, + "num_input_tokens_seen": 123740816, + "router_z_loss_mlp": 0.41894531, + "step": 1499, + "time_per_iteration": 2.647165298461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113639, + "balance_loss_mlp": 1.09364128, + "epoch": 0.2885725278953444, + "flos": 480094884864.0, + "grad_norm": 0.06188269799142739, + "language_loss": 0.89485824, + "learning_rate": 0.0008346887626002432, + "loss": 0.90622216, + "num_input_tokens_seen": 123805968, + "router_z_loss_mlp": 0.42724609, + "step": 1500, + "time_per_iteration": 2.546494960784912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138741, + "balance_loss_mlp": 1.09546816, + "epoch": 0.2887649095806079, + "flos": 464044391424.0, + "grad_norm": 0.07756887509348087, + "language_loss": 0.86612689, + "learning_rate": 0.000834457245878137, + "loss": 0.87751424, + "num_input_tokens_seen": 123876576, + "router_z_loss_mlp": 0.43261719, + "step": 1501, + "time_per_iteration": 2.6271145343780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132854, + "balance_loss_mlp": 1.08993816, + "epoch": 0.2889572912658715, + "flos": 931032092160.0, + "grad_norm": 0.07465598629984396, + "language_loss": 0.8176384, + "learning_rate": 0.000834225599308212, + "loss": 0.82896686, + "num_input_tokens_seen": 123967664, + "router_z_loss_mlp": 0.42895508, + "step": 1502, + "time_per_iteration": 3.2550971508026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150677, + "balance_loss_mlp": 1.10580611, + "epoch": 0.28914967295113503, + "flos": 570129103872.0, + "grad_norm": 0.07581203663628927, + "language_loss": 0.85830456, + "learning_rate": 0.0008339938229804016, + "loss": 0.8698113, + "num_input_tokens_seen": 124039680, + "router_z_loss_mlp": 0.44897461, + "step": 1503, + "time_per_iteration": 2.704310417175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132016, + "balance_loss_mlp": 1.11475468, + "epoch": 0.2893420546363986, + "flos": 1486614643200.0, + "grad_norm": 0.04995777902546146, + "language_loss": 0.75434822, + "learning_rate": 0.0008337619169846895, + "loss": 0.76566839, + "num_input_tokens_seen": 124278848, + "router_z_loss_mlp": 0.17285156, + "step": 1504, + "time_per_iteration": 4.959474563598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157244, + "balance_loss_mlp": 1.10965538, + "epoch": 0.2895344363216622, + "flos": 470186850816.0, + "grad_norm": 0.06157445053236475, + "language_loss": 0.84505653, + "learning_rate": 0.0008335298814111094, + "loss": 0.85662901, + "num_input_tokens_seen": 124346736, + "router_z_loss_mlp": 0.47607422, + "step": 1505, + "time_per_iteration": 2.5612986087799072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178976, + "balance_loss_mlp": 1.13374829, + "epoch": 0.28972681800692573, + "flos": 648194835456.0, + "grad_norm": 0.05887296654917154, + "language_loss": 0.88222575, + "learning_rate": 0.0008332977163497455, + "loss": 0.89401549, + "num_input_tokens_seen": 124420816, + "router_z_loss_mlp": 0.4519043, + "step": 1506, + "time_per_iteration": 2.8017849922180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183741, + "balance_loss_mlp": 1.13696313, + "epoch": 0.2899191996921893, + "flos": 572224435200.0, + "grad_norm": 0.07773532252894584, + "language_loss": 0.83964998, + "learning_rate": 0.0008330654218907325, + "loss": 0.8514874, + "num_input_tokens_seen": 124490480, + "router_z_loss_mlp": 0.46801758, + "step": 1507, + "time_per_iteration": 2.6568052768707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167782, + "balance_loss_mlp": 1.12016964, + "epoch": 0.29011158137745285, + "flos": 661356721152.0, + "grad_norm": 0.05364053536005051, + "language_loss": 0.82260346, + "learning_rate": 0.0008328329981242548, + "loss": 0.83428133, + "num_input_tokens_seen": 124564960, + "router_z_loss_mlp": 0.47631836, + "step": 1508, + "time_per_iteration": 2.8732171058654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161954, + "balance_loss_mlp": 1.11479485, + "epoch": 0.29030396306271644, + "flos": 536226822144.0, + "grad_norm": 0.06776855665971031, + "language_loss": 0.88091129, + "learning_rate": 0.0008326004451405475, + "loss": 0.8925308, + "num_input_tokens_seen": 124637424, + "router_z_loss_mlp": 0.47143555, + "step": 1509, + "time_per_iteration": 2.762476921081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156841, + "balance_loss_mlp": 1.11104107, + "epoch": 0.29049634474798, + "flos": 511956163584.0, + "grad_norm": 0.08089915602738365, + "language_loss": 0.82757521, + "learning_rate": 0.0008323677630298957, + "loss": 0.83914363, + "num_input_tokens_seen": 124704832, + "router_z_loss_mlp": 0.45800781, + "step": 1510, + "time_per_iteration": 2.554558753967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152926, + "balance_loss_mlp": 1.1073643, + "epoch": 0.29068872643324356, + "flos": 613758809088.0, + "grad_norm": 0.07106066660777852, + "language_loss": 0.85773015, + "learning_rate": 0.0008321349518826345, + "loss": 0.86925942, + "num_input_tokens_seen": 124779600, + "router_z_loss_mlp": 0.45556641, + "step": 1511, + "time_per_iteration": 2.8341891765594482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144812, + "balance_loss_mlp": 1.09870172, + "epoch": 0.2908811081185071, + "flos": 546424123392.0, + "grad_norm": 0.06994476337169399, + "language_loss": 0.95554525, + "learning_rate": 0.0008319020117891491, + "loss": 0.96699333, + "num_input_tokens_seen": 124844128, + "router_z_loss_mlp": 0.4609375, + "step": 1512, + "time_per_iteration": 2.6152215003967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147304, + "balance_loss_mlp": 1.09902406, + "epoch": 0.2910734898037707, + "flos": 604792355328.0, + "grad_norm": 0.09218377020634298, + "language_loss": 0.87772787, + "learning_rate": 0.0008316689428398751, + "loss": 0.88920093, + "num_input_tokens_seen": 124915376, + "router_z_loss_mlp": 0.4831543, + "step": 1513, + "time_per_iteration": 2.687288522720337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148068, + "balance_loss_mlp": 1.10407972, + "epoch": 0.29126587148903427, + "flos": 574672900608.0, + "grad_norm": 0.05407373665960582, + "language_loss": 0.89050305, + "learning_rate": 0.0008314357451252979, + "loss": 0.90198368, + "num_input_tokens_seen": 124995504, + "router_z_loss_mlp": 0.44018555, + "step": 1514, + "time_per_iteration": 2.7870078086853027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151939, + "balance_loss_mlp": 1.10644853, + "epoch": 0.2914582531742978, + "flos": 571068112896.0, + "grad_norm": 0.11283198751561448, + "language_loss": 0.88657945, + "learning_rate": 0.0008312024187359527, + "loss": 0.89809883, + "num_input_tokens_seen": 125064192, + "router_z_loss_mlp": 0.45483398, + "step": 1515, + "time_per_iteration": 2.6400256156921387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144613, + "balance_loss_mlp": 1.10060108, + "epoch": 0.2916506348595614, + "flos": 730878142464.0, + "grad_norm": 0.08270455580526427, + "language_loss": 0.87534022, + "learning_rate": 0.000830968963762425, + "loss": 0.8867864, + "num_input_tokens_seen": 125150560, + "router_z_loss_mlp": 0.43994141, + "step": 1516, + "time_per_iteration": 3.0442028045654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151597, + "balance_loss_mlp": 1.10617828, + "epoch": 0.2918430165448249, + "flos": 510468728832.0, + "grad_norm": 0.06364079743342543, + "language_loss": 0.84482789, + "learning_rate": 0.0008307353802953497, + "loss": 0.85634387, + "num_input_tokens_seen": 125219264, + "router_z_loss_mlp": 0.45361328, + "step": 1517, + "time_per_iteration": 2.672921895980835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171551, + "balance_loss_mlp": 1.12281811, + "epoch": 0.2920353982300885, + "flos": 630397375488.0, + "grad_norm": 0.060139597091390135, + "language_loss": 0.86612219, + "learning_rate": 0.0008305016684254125, + "loss": 0.87783766, + "num_input_tokens_seen": 125301904, + "router_z_loss_mlp": 0.48803711, + "step": 1518, + "time_per_iteration": 2.7845590114593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174317, + "balance_loss_mlp": 1.12947094, + "epoch": 0.29222777991535204, + "flos": 501662688768.0, + "grad_norm": 0.09151635615922826, + "language_loss": 0.87469971, + "learning_rate": 0.0008302678282433479, + "loss": 0.88644284, + "num_input_tokens_seen": 125367712, + "router_z_loss_mlp": 0.44848633, + "step": 1519, + "time_per_iteration": 2.562605619430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163342, + "balance_loss_mlp": 1.11999798, + "epoch": 0.2924201616006156, + "flos": 486785769984.0, + "grad_norm": 0.07068722957296131, + "language_loss": 0.85016668, + "learning_rate": 0.0008300338598399411, + "loss": 0.86180007, + "num_input_tokens_seen": 125437648, + "router_z_loss_mlp": 0.43359375, + "step": 1520, + "time_per_iteration": 2.61773943901062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155651, + "balance_loss_mlp": 1.11111403, + "epoch": 0.2926125432858792, + "flos": 476450449920.0, + "grad_norm": 0.07704766336953982, + "language_loss": 0.95187533, + "learning_rate": 0.0008297997633060263, + "loss": 0.96343178, + "num_input_tokens_seen": 125502432, + "router_z_loss_mlp": 0.44506836, + "step": 1521, + "time_per_iteration": 2.5206730365753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127672, + "balance_loss_mlp": 1.08468485, + "epoch": 0.29280492497114274, + "flos": 676675980288.0, + "grad_norm": 0.07256926042070597, + "language_loss": 0.85441822, + "learning_rate": 0.0008295655387324883, + "loss": 0.865695, + "num_input_tokens_seen": 125575424, + "router_z_loss_mlp": 0.42993164, + "step": 1522, + "time_per_iteration": 2.8186635971069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126011, + "balance_loss_mlp": 1.08090246, + "epoch": 0.29299730665640633, + "flos": 458408512512.0, + "grad_norm": 0.07210388942873598, + "language_loss": 0.8532753, + "learning_rate": 0.0008293311862102609, + "loss": 0.86453545, + "num_input_tokens_seen": 125639040, + "router_z_loss_mlp": 0.45092773, + "step": 1523, + "time_per_iteration": 2.4982752799987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118339, + "balance_loss_mlp": 1.07334912, + "epoch": 0.29318968834166986, + "flos": 446573274624.0, + "grad_norm": 0.0579845522804068, + "language_loss": 0.89434093, + "learning_rate": 0.0008290967058303275, + "loss": 0.90552431, + "num_input_tokens_seen": 125701712, + "router_z_loss_mlp": 0.44995117, + "step": 1524, + "time_per_iteration": 2.469200611114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116837, + "balance_loss_mlp": 1.07575774, + "epoch": 0.29338207002693345, + "flos": 450319025664.0, + "grad_norm": 0.07735764089304721, + "language_loss": 0.86793721, + "learning_rate": 0.0008288620976837219, + "loss": 0.87910557, + "num_input_tokens_seen": 125765088, + "router_z_loss_mlp": 0.41088867, + "step": 1525, + "time_per_iteration": 2.4877853393554688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112181, + "balance_loss_mlp": 1.06881261, + "epoch": 0.293574451712197, + "flos": 502277925888.0, + "grad_norm": 0.06064034312392981, + "language_loss": 0.83118868, + "learning_rate": 0.000828627361861527, + "loss": 0.84231043, + "num_input_tokens_seen": 125831328, + "router_z_loss_mlp": 0.43383789, + "step": 1526, + "time_per_iteration": 2.567406415939331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109149, + "balance_loss_mlp": 1.06620967, + "epoch": 0.29376683339746057, + "flos": 696462312960.0, + "grad_norm": 0.0729369607745646, + "language_loss": 0.84539104, + "learning_rate": 0.0008283924984548752, + "loss": 0.85648245, + "num_input_tokens_seen": 125903664, + "router_z_loss_mlp": 0.42919922, + "step": 1527, + "time_per_iteration": 2.8396716117858887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117649, + "balance_loss_mlp": 1.07480514, + "epoch": 0.2939592150827241, + "flos": 478590197760.0, + "grad_norm": 0.05516048868040139, + "language_loss": 0.85423326, + "learning_rate": 0.0008281575075549485, + "loss": 0.86540973, + "num_input_tokens_seen": 125971856, + "router_z_loss_mlp": 0.4284668, + "step": 1528, + "time_per_iteration": 2.596402645111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093475, + "balance_loss_mlp": 1.0787884, + "epoch": 0.2941515967679877, + "flos": 1485260831232.0, + "grad_norm": 0.03776357558455706, + "language_loss": 0.77352691, + "learning_rate": 0.000827922389252979, + "loss": 0.78446174, + "num_input_tokens_seen": 126183968, + "router_z_loss_mlp": 0.14648438, + "step": 1529, + "time_per_iteration": 4.641916513442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118614, + "balance_loss_mlp": 1.07436347, + "epoch": 0.2943439784532513, + "flos": 674158132224.0, + "grad_norm": 0.11599739785132454, + "language_loss": 0.90857148, + "learning_rate": 0.0008276871436402469, + "loss": 0.91975754, + "num_input_tokens_seen": 126254448, + "router_z_loss_mlp": 0.44238281, + "step": 1530, + "time_per_iteration": 2.8211593627929688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113901, + "balance_loss_mlp": 1.07239282, + "epoch": 0.2945363601385148, + "flos": 576301298688.0, + "grad_norm": 0.06834093724659761, + "language_loss": 0.87937176, + "learning_rate": 0.000827451770808083, + "loss": 0.8905108, + "num_input_tokens_seen": 126328208, + "router_z_loss_mlp": 0.41503906, + "step": 1531, + "time_per_iteration": 2.7127888202667236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106991, + "balance_loss_mlp": 1.06357539, + "epoch": 0.2947287418237784, + "flos": 480655793664.0, + "grad_norm": 0.06489723039655686, + "language_loss": 0.8385976, + "learning_rate": 0.0008272162708478674, + "loss": 0.84966749, + "num_input_tokens_seen": 126396464, + "router_z_loss_mlp": 0.43457031, + "step": 1532, + "time_per_iteration": 2.580057144165039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119293, + "balance_loss_mlp": 1.07749844, + "epoch": 0.2949211235090419, + "flos": 558185209344.0, + "grad_norm": 0.06938693493012958, + "language_loss": 0.86437017, + "learning_rate": 0.000826980643851029, + "loss": 0.87556309, + "num_input_tokens_seen": 126468960, + "router_z_loss_mlp": 0.41821289, + "step": 1533, + "time_per_iteration": 2.689450740814209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118363, + "balance_loss_mlp": 1.07518554, + "epoch": 0.2951135051943055, + "flos": 483887623680.0, + "grad_norm": 0.057495804655394826, + "language_loss": 0.85101378, + "learning_rate": 0.0008267448899090464, + "loss": 0.8621974, + "num_input_tokens_seen": 126536496, + "router_z_loss_mlp": 0.43188477, + "step": 1534, + "time_per_iteration": 2.5541234016418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139738, + "balance_loss_mlp": 1.09460509, + "epoch": 0.29530588687956905, + "flos": 550295783424.0, + "grad_norm": 0.0763188518859088, + "language_loss": 0.81071836, + "learning_rate": 0.0008265090091134473, + "loss": 0.82211578, + "num_input_tokens_seen": 126614048, + "router_z_loss_mlp": 0.45117188, + "step": 1535, + "time_per_iteration": 2.851494789123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136514, + "balance_loss_mlp": 1.09309804, + "epoch": 0.29549826856483263, + "flos": 673046226432.0, + "grad_norm": 0.06589165398662913, + "language_loss": 0.80565453, + "learning_rate": 0.0008262730015558088, + "loss": 0.8170197, + "num_input_tokens_seen": 126697248, + "router_z_loss_mlp": 0.43432617, + "step": 1536, + "time_per_iteration": 2.8671340942382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113965, + "balance_loss_mlp": 1.09423184, + "epoch": 0.29569065025009617, + "flos": 764666625024.0, + "grad_norm": 0.08099910548300644, + "language_loss": 0.82513618, + "learning_rate": 0.0008260368673277574, + "loss": 0.83653271, + "num_input_tokens_seen": 126782496, + "router_z_loss_mlp": 0.45410156, + "step": 1537, + "time_per_iteration": 3.114685297012329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134888, + "balance_loss_mlp": 1.08973145, + "epoch": 0.29588303193535975, + "flos": 543683819520.0, + "grad_norm": 0.06868209454347093, + "language_loss": 0.84501362, + "learning_rate": 0.0008258006065209682, + "loss": 0.85636258, + "num_input_tokens_seen": 126857328, + "router_z_loss_mlp": 0.45141602, + "step": 1538, + "time_per_iteration": 2.7343428134918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112017, + "balance_loss_mlp": 1.07341647, + "epoch": 0.29607541362062334, + "flos": 596947345920.0, + "grad_norm": 0.07819005704771397, + "language_loss": 0.80795646, + "learning_rate": 0.0008255642192271657, + "loss": 0.8191582, + "num_input_tokens_seen": 126932608, + "router_z_loss_mlp": 0.4675293, + "step": 1539, + "time_per_iteration": 2.7900264263153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123831, + "balance_loss_mlp": 1.0775305, + "epoch": 0.29626779530588687, + "flos": 609877237248.0, + "grad_norm": 0.06984070899888078, + "language_loss": 0.84251219, + "learning_rate": 0.0008253277055381241, + "loss": 0.85375053, + "num_input_tokens_seen": 127008928, + "router_z_loss_mlp": 0.46313477, + "step": 1540, + "time_per_iteration": 2.7936105728149414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126968, + "balance_loss_mlp": 1.08383858, + "epoch": 0.29646017699115046, + "flos": 867430674432.0, + "grad_norm": 0.09213105437911238, + "language_loss": 0.86479163, + "learning_rate": 0.0008250910655456658, + "loss": 0.87606132, + "num_input_tokens_seen": 127097104, + "router_z_loss_mlp": 0.43115234, + "step": 1541, + "time_per_iteration": 3.119706392288208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141818, + "balance_loss_mlp": 1.09723353, + "epoch": 0.296652558676414, + "flos": 495868594176.0, + "grad_norm": 0.06264221574110865, + "language_loss": 0.84348595, + "learning_rate": 0.0008248542993416625, + "loss": 0.85490412, + "num_input_tokens_seen": 127165264, + "router_z_loss_mlp": 0.44628906, + "step": 1542, + "time_per_iteration": 2.6273162364959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136977, + "balance_loss_mlp": 1.09224987, + "epoch": 0.2968449403616776, + "flos": 571544957952.0, + "grad_norm": 0.062187844768518095, + "language_loss": 0.838552, + "learning_rate": 0.0008246174070180352, + "loss": 0.84992176, + "num_input_tokens_seen": 127238992, + "router_z_loss_mlp": 0.44702148, + "step": 1543, + "time_per_iteration": 2.6559441089630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155532, + "balance_loss_mlp": 1.11099529, + "epoch": 0.2970373220469411, + "flos": 794168271360.0, + "grad_norm": 0.09249403217806111, + "language_loss": 0.84424686, + "learning_rate": 0.0008243803886667537, + "loss": 0.85580218, + "num_input_tokens_seen": 127328160, + "router_z_loss_mlp": 0.44506836, + "step": 1544, + "time_per_iteration": 3.161595582962036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155762, + "balance_loss_mlp": 1.11196482, + "epoch": 0.2972297037322047, + "flos": 661038091776.0, + "grad_norm": 0.11473976054569617, + "language_loss": 0.79569989, + "learning_rate": 0.0008241432443798364, + "loss": 0.80725753, + "num_input_tokens_seen": 127407328, + "router_z_loss_mlp": 0.43774414, + "step": 1545, + "time_per_iteration": 2.8056137561798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154045, + "balance_loss_mlp": 1.11160624, + "epoch": 0.29742208541746823, + "flos": 597125385216.0, + "grad_norm": 0.05050947415994233, + "language_loss": 0.86053026, + "learning_rate": 0.0008239059742493512, + "loss": 0.87207067, + "num_input_tokens_seen": 127477136, + "router_z_loss_mlp": 0.42456055, + "step": 1546, + "time_per_iteration": 2.6890687942504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146751, + "balance_loss_mlp": 1.10383546, + "epoch": 0.2976144671027318, + "flos": 769882558464.0, + "grad_norm": 0.060404475813103174, + "language_loss": 0.87675822, + "learning_rate": 0.0008236685783674142, + "loss": 0.88822567, + "num_input_tokens_seen": 127565680, + "router_z_loss_mlp": 0.42944336, + "step": 1547, + "time_per_iteration": 3.0594639778137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176135, + "balance_loss_mlp": 1.15439153, + "epoch": 0.2978068487879954, + "flos": 1484764162560.0, + "grad_norm": 0.05730794129930028, + "language_loss": 0.76221192, + "learning_rate": 0.0008234310568261911, + "loss": 0.77397329, + "num_input_tokens_seen": 127791584, + "router_z_loss_mlp": 0.21777344, + "step": 1548, + "time_per_iteration": 4.907459020614624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115635, + "balance_loss_mlp": 1.11174202, + "epoch": 0.29799923047325894, + "flos": 475328632320.0, + "grad_norm": 0.08902597202075696, + "language_loss": 0.82813615, + "learning_rate": 0.0008231934097178955, + "loss": 0.83969963, + "num_input_tokens_seen": 127860112, + "router_z_loss_mlp": 0.44604492, + "step": 1549, + "time_per_iteration": 2.622082471847534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147675, + "balance_loss_mlp": 1.1013267, + "epoch": 0.2981916121585225, + "flos": 759804198912.0, + "grad_norm": 0.06733871211748228, + "language_loss": 0.85700476, + "learning_rate": 0.0008229556371347903, + "loss": 0.86848152, + "num_input_tokens_seen": 127938752, + "router_z_loss_mlp": 0.46362305, + "step": 1550, + "time_per_iteration": 3.0081942081451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133769, + "balance_loss_mlp": 1.09018564, + "epoch": 0.29838399384378606, + "flos": 875016152064.0, + "grad_norm": 0.09176779567237862, + "language_loss": 0.79384351, + "learning_rate": 0.0008227177391691874, + "loss": 0.80518115, + "num_input_tokens_seen": 128022192, + "router_z_loss_mlp": 0.43554688, + "step": 1551, + "time_per_iteration": 3.1698648929595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126053, + "balance_loss_mlp": 1.08218408, + "epoch": 0.29857637552904964, + "flos": 579661608960.0, + "grad_norm": 0.07033401560901072, + "language_loss": 0.89799201, + "learning_rate": 0.0008224797159134463, + "loss": 0.90925252, + "num_input_tokens_seen": 128097776, + "router_z_loss_mlp": 0.4387207, + "step": 1552, + "time_per_iteration": 2.714494228363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118143, + "balance_loss_mlp": 1.07816052, + "epoch": 0.2987687572143132, + "flos": 836399748096.0, + "grad_norm": 0.05144631995573129, + "language_loss": 0.83942962, + "learning_rate": 0.0008222415674599765, + "loss": 0.85061103, + "num_input_tokens_seen": 128179888, + "router_z_loss_mlp": 0.39990234, + "step": 1553, + "time_per_iteration": 3.0642828941345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130247, + "balance_loss_mlp": 1.08563888, + "epoch": 0.29896113889957676, + "flos": 567072741888.0, + "grad_norm": 0.07574846124683007, + "language_loss": 0.83871847, + "learning_rate": 0.0008220032939012349, + "loss": 0.85002089, + "num_input_tokens_seen": 128251152, + "router_z_loss_mlp": 0.44628906, + "step": 1554, + "time_per_iteration": 2.714172840118408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129637, + "balance_loss_mlp": 1.08810425, + "epoch": 0.29915352058484035, + "flos": 498662853120.0, + "grad_norm": 0.05026836342639273, + "language_loss": 0.8851645, + "learning_rate": 0.0008217648953297277, + "loss": 0.89646089, + "num_input_tokens_seen": 128327600, + "router_z_loss_mlp": 0.41503906, + "step": 1555, + "time_per_iteration": 2.8413305282592773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139651, + "balance_loss_mlp": 1.09692693, + "epoch": 0.2993459022701039, + "flos": 592112083968.0, + "grad_norm": 0.07726233455877282, + "language_loss": 0.78621179, + "learning_rate": 0.0008215263718380095, + "loss": 0.79760832, + "num_input_tokens_seen": 128398432, + "router_z_loss_mlp": 0.42749023, + "step": 1556, + "time_per_iteration": 2.6995439529418945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153013, + "balance_loss_mlp": 1.10766625, + "epoch": 0.29953828395536747, + "flos": 572380079616.0, + "grad_norm": 0.07367356569931041, + "language_loss": 0.8461448, + "learning_rate": 0.0008212877235186833, + "loss": 0.85767496, + "num_input_tokens_seen": 128469696, + "router_z_loss_mlp": 0.45361328, + "step": 1557, + "time_per_iteration": 2.655294895172119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105489, + "balance_loss_mlp": 1.09290004, + "epoch": 0.299730665640631, + "flos": 1504698425856.0, + "grad_norm": 0.039126881386902713, + "language_loss": 0.77737558, + "learning_rate": 0.0008210489504644005, + "loss": 0.78843045, + "num_input_tokens_seen": 128698560, + "router_z_loss_mlp": 0.12597656, + "step": 1558, + "time_per_iteration": 4.953773021697998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148338, + "balance_loss_mlp": 1.10647154, + "epoch": 0.2999230473258946, + "flos": 513791963136.0, + "grad_norm": 0.07045252665170362, + "language_loss": 0.81300378, + "learning_rate": 0.0008208100527678611, + "loss": 0.82448721, + "num_input_tokens_seen": 128765952, + "router_z_loss_mlp": 0.41870117, + "step": 1559, + "time_per_iteration": 2.5706257820129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142691, + "balance_loss_mlp": 1.10223174, + "epoch": 0.3001154290111581, + "flos": 834472544256.0, + "grad_norm": 0.09371754463761041, + "language_loss": 0.79173958, + "learning_rate": 0.0008205710305218135, + "loss": 0.80316657, + "num_input_tokens_seen": 128840048, + "router_z_loss_mlp": 0.40454102, + "step": 1560, + "time_per_iteration": 3.001490354537964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152428, + "balance_loss_mlp": 1.11292171, + "epoch": 0.3003078106964217, + "flos": 556776695808.0, + "grad_norm": 0.06044421333553386, + "language_loss": 0.90459639, + "learning_rate": 0.0008203318838190541, + "loss": 0.91612065, + "num_input_tokens_seen": 128912496, + "router_z_loss_mlp": 0.39501953, + "step": 1561, + "time_per_iteration": 2.753243923187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166566, + "balance_loss_mlp": 1.1229353, + "epoch": 0.30050019238168524, + "flos": 526151033856.0, + "grad_norm": 0.07449479195038491, + "language_loss": 0.85542631, + "learning_rate": 0.0008200926127524281, + "loss": 0.86709195, + "num_input_tokens_seen": 128980624, + "router_z_loss_mlp": 0.43676758, + "step": 1562, + "time_per_iteration": 2.6388282775878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184921, + "balance_loss_mlp": 1.14045644, + "epoch": 0.3006925740669488, + "flos": 577852973568.0, + "grad_norm": 0.07268784417656445, + "language_loss": 0.83160597, + "learning_rate": 0.0008198532174148289, + "loss": 0.8434552, + "num_input_tokens_seen": 129050576, + "router_z_loss_mlp": 0.44482422, + "step": 1563, + "time_per_iteration": 2.71712589263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076623, + "balance_loss_mlp": 1.06308043, + "epoch": 0.3008849557522124, + "flos": 1490246595072.0, + "grad_norm": 0.03416296623034226, + "language_loss": 0.8068617, + "learning_rate": 0.0008196136978991977, + "loss": 0.81762791, + "num_input_tokens_seen": 129278880, + "router_z_loss_mlp": 0.13574219, + "step": 1564, + "time_per_iteration": 4.830719232559204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194058, + "balance_loss_mlp": 1.15185785, + "epoch": 0.30107733743747594, + "flos": 509816415744.0, + "grad_norm": 0.08914748552149089, + "language_loss": 0.88889605, + "learning_rate": 0.0008193740542985244, + "loss": 0.90083665, + "num_input_tokens_seen": 129346560, + "router_z_loss_mlp": 0.421875, + "step": 1565, + "time_per_iteration": 2.6047041416168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199035, + "balance_loss_mlp": 1.15647733, + "epoch": 0.30126971912273953, + "flos": 587704108032.0, + "grad_norm": 0.07863054385005203, + "language_loss": 0.8685202, + "learning_rate": 0.0008191342867058467, + "loss": 0.88051057, + "num_input_tokens_seen": 129420448, + "router_z_loss_mlp": 0.42578125, + "step": 1566, + "time_per_iteration": 2.715708017349243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196816, + "balance_loss_mlp": 1.15280378, + "epoch": 0.30146210080800306, + "flos": 602101610496.0, + "grad_norm": 0.087093537774187, + "language_loss": 0.83839655, + "learning_rate": 0.0008188943952142509, + "loss": 0.85036469, + "num_input_tokens_seen": 129494032, + "router_z_loss_mlp": 0.43994141, + "step": 1567, + "time_per_iteration": 2.831888198852539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118972, + "balance_loss_mlp": 1.14663815, + "epoch": 0.30165448249326665, + "flos": 917796054528.0, + "grad_norm": 0.09637975850341399, + "language_loss": 0.82476509, + "learning_rate": 0.0008186543799168711, + "loss": 0.83666229, + "num_input_tokens_seen": 129569088, + "router_z_loss_mlp": 0.43041992, + "step": 1568, + "time_per_iteration": 3.121755599975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177064, + "balance_loss_mlp": 1.13324285, + "epoch": 0.3018468641785302, + "flos": 777287798784.0, + "grad_norm": 0.08024736909630528, + "language_loss": 0.88665748, + "learning_rate": 0.0008184142409068892, + "loss": 0.89842814, + "num_input_tokens_seen": 129647968, + "router_z_loss_mlp": 0.43847656, + "step": 1569, + "time_per_iteration": 2.990497350692749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163968, + "balance_loss_mlp": 1.12343669, + "epoch": 0.30203924586379377, + "flos": 522358295040.0, + "grad_norm": 0.05684047424393967, + "language_loss": 0.86850333, + "learning_rate": 0.000818173978277536, + "loss": 0.88014305, + "num_input_tokens_seen": 129718928, + "router_z_loss_mlp": 0.40551758, + "step": 1570, + "time_per_iteration": 2.636310338973999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171599, + "balance_loss_mlp": 1.12956595, + "epoch": 0.3022316275490573, + "flos": 524559711744.0, + "grad_norm": 0.07636807389642969, + "language_loss": 0.84349716, + "learning_rate": 0.000817933592122089, + "loss": 0.85521317, + "num_input_tokens_seen": 129790128, + "router_z_loss_mlp": 0.4206543, + "step": 1571, + "time_per_iteration": 2.699178695678711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163998, + "balance_loss_mlp": 1.11984301, + "epoch": 0.3024240092343209, + "flos": 479912076288.0, + "grad_norm": 0.07546742874281152, + "language_loss": 0.83585215, + "learning_rate": 0.0008176930825338749, + "loss": 0.8474921, + "num_input_tokens_seen": 129857536, + "router_z_loss_mlp": 0.44189453, + "step": 1572, + "time_per_iteration": 2.550837516784668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166441, + "balance_loss_mlp": 1.12385964, + "epoch": 0.3026163909195845, + "flos": 687206592000.0, + "grad_norm": 0.07092433148156627, + "language_loss": 0.89086282, + "learning_rate": 0.0008174524496062679, + "loss": 0.90252721, + "num_input_tokens_seen": 129931440, + "router_z_loss_mlp": 0.42578125, + "step": 1573, + "time_per_iteration": 2.883683919906616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116421, + "balance_loss_mlp": 1.11907697, + "epoch": 0.302808772604848, + "flos": 542940102144.0, + "grad_norm": 0.061103918995996154, + "language_loss": 0.8587321, + "learning_rate": 0.0008172116934326894, + "loss": 0.8703742, + "num_input_tokens_seen": 130005200, + "router_z_loss_mlp": 0.45092773, + "step": 1574, + "time_per_iteration": 2.7379467487335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162954, + "balance_loss_mlp": 1.12132585, + "epoch": 0.3030011542901116, + "flos": 475091495424.0, + "grad_norm": 0.07023429776023385, + "language_loss": 0.87709713, + "learning_rate": 0.0008169708141066097, + "loss": 0.88872665, + "num_input_tokens_seen": 130069136, + "router_z_loss_mlp": 0.41625977, + "step": 1575, + "time_per_iteration": 2.571963310241699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154168, + "balance_loss_mlp": 1.11435199, + "epoch": 0.30319353597537513, + "flos": 481481003520.0, + "grad_norm": 0.11601472076904104, + "language_loss": 0.90864658, + "learning_rate": 0.0008167298117215465, + "loss": 0.92018831, + "num_input_tokens_seen": 130135456, + "router_z_loss_mlp": 0.39819336, + "step": 1576, + "time_per_iteration": 2.562636375427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153517, + "balance_loss_mlp": 1.11141217, + "epoch": 0.3033859176606387, + "flos": 704786365440.0, + "grad_norm": 0.08960201833145559, + "language_loss": 0.88355744, + "learning_rate": 0.0008164886863710649, + "loss": 0.89509267, + "num_input_tokens_seen": 130213712, + "router_z_loss_mlp": 0.42138672, + "step": 1577, + "time_per_iteration": 2.921163320541382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151824, + "balance_loss_mlp": 1.11212754, + "epoch": 0.30357829934590225, + "flos": 764696360448.0, + "grad_norm": 0.07034131144929774, + "language_loss": 0.86199445, + "learning_rate": 0.0008162474381487783, + "loss": 0.87351274, + "num_input_tokens_seen": 130290928, + "router_z_loss_mlp": 0.39697266, + "step": 1578, + "time_per_iteration": 3.029076337814331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152316, + "balance_loss_mlp": 1.11016417, + "epoch": 0.30377068103116583, + "flos": 532355162112.0, + "grad_norm": 0.07584256466560314, + "language_loss": 0.85196549, + "learning_rate": 0.0008160060671483475, + "loss": 0.86348867, + "num_input_tokens_seen": 130362672, + "router_z_loss_mlp": 0.42163086, + "step": 1579, + "time_per_iteration": 2.7073986530303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142614, + "balance_loss_mlp": 1.10289371, + "epoch": 0.3039630627164294, + "flos": 510191944704.0, + "grad_norm": 0.08686038732079729, + "language_loss": 0.83729678, + "learning_rate": 0.0008157645734634809, + "loss": 0.84872293, + "num_input_tokens_seen": 130428848, + "router_z_loss_mlp": 0.3972168, + "step": 1580, + "time_per_iteration": 2.6613049507141113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090857, + "balance_loss_mlp": 1.07302368, + "epoch": 0.30415544440169295, + "flos": 1506000854016.0, + "grad_norm": 0.0332286598930082, + "language_loss": 0.76896489, + "learning_rate": 0.000815522957187935, + "loss": 0.77987349, + "num_input_tokens_seen": 130665440, + "router_z_loss_mlp": 0.17871094, + "step": 1581, + "time_per_iteration": 4.915473699569702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074598, + "balance_loss_mlp": 1.05705047, + "epoch": 0.30434782608695654, + "flos": 1458736625664.0, + "grad_norm": 0.028649014265593315, + "language_loss": 0.73214495, + "learning_rate": 0.0008152812184155132, + "loss": 0.74289095, + "num_input_tokens_seen": 130895248, + "router_z_loss_mlp": 0.17578125, + "step": 1582, + "time_per_iteration": 4.889309883117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129405, + "balance_loss_mlp": 1.08827806, + "epoch": 0.3045402077722201, + "flos": 482555833344.0, + "grad_norm": 0.06812522797045092, + "language_loss": 0.84052569, + "learning_rate": 0.000815039357240067, + "loss": 0.85181975, + "num_input_tokens_seen": 130964544, + "router_z_loss_mlp": 0.41113281, + "step": 1583, + "time_per_iteration": 2.6366286277770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138467, + "balance_loss_mlp": 1.09672034, + "epoch": 0.30473258945748366, + "flos": 543501010944.0, + "grad_norm": 0.06492424308297744, + "language_loss": 0.85869169, + "learning_rate": 0.0008147973737554952, + "loss": 0.87007636, + "num_input_tokens_seen": 131041744, + "router_z_loss_mlp": 0.41748047, + "step": 1584, + "time_per_iteration": 2.7854599952697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136804, + "balance_loss_mlp": 1.095963, + "epoch": 0.3049249711427472, + "flos": 567055489536.0, + "grad_norm": 0.08202571879527615, + "language_loss": 0.86834013, + "learning_rate": 0.000814555268055744, + "loss": 0.87970817, + "num_input_tokens_seen": 131108864, + "router_z_loss_mlp": 0.40844727, + "step": 1585, + "time_per_iteration": 2.6199045181274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132861, + "balance_loss_mlp": 1.09130502, + "epoch": 0.3051173528280108, + "flos": 528233882112.0, + "grad_norm": 0.07393752668393892, + "language_loss": 0.87929702, + "learning_rate": 0.0008143130402348073, + "loss": 0.89062566, + "num_input_tokens_seen": 131181104, + "router_z_loss_mlp": 0.41625977, + "step": 1586, + "time_per_iteration": 2.638741970062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129239, + "balance_loss_mlp": 1.08868384, + "epoch": 0.3053097345132743, + "flos": 586396910592.0, + "grad_norm": 0.06849121050203105, + "language_loss": 0.7939502, + "learning_rate": 0.0008140706903867265, + "loss": 0.80524254, + "num_input_tokens_seen": 131258704, + "router_z_loss_mlp": 0.4050293, + "step": 1587, + "time_per_iteration": 2.810335874557495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134042, + "balance_loss_mlp": 1.0908649, + "epoch": 0.3055021161985379, + "flos": 607087747584.0, + "grad_norm": 0.07851663365650921, + "language_loss": 0.91122121, + "learning_rate": 0.0008138282186055897, + "loss": 0.92256165, + "num_input_tokens_seen": 131325712, + "router_z_loss_mlp": 0.43188477, + "step": 1588, + "time_per_iteration": 2.7237448692321777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137411, + "balance_loss_mlp": 1.09661722, + "epoch": 0.3056944978838015, + "flos": 573867514368.0, + "grad_norm": 0.06832590097240848, + "language_loss": 0.8307212, + "learning_rate": 0.0008135856249855331, + "loss": 0.84209532, + "num_input_tokens_seen": 131397568, + "router_z_loss_mlp": 0.40771484, + "step": 1589, + "time_per_iteration": 2.7399301528930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153972, + "balance_loss_mlp": 1.11241579, + "epoch": 0.305886879569065, + "flos": 633925813248.0, + "grad_norm": 0.09162978556143483, + "language_loss": 0.89933717, + "learning_rate": 0.0008133429096207398, + "loss": 0.91087687, + "num_input_tokens_seen": 131467632, + "router_z_loss_mlp": 0.41577148, + "step": 1590, + "time_per_iteration": 2.8074302673339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01029225, + "balance_loss_mlp": 1.0156827, + "epoch": 0.3060792612543286, + "flos": 1369005981696.0, + "grad_norm": 0.025543227678258826, + "language_loss": 0.75312257, + "learning_rate": 0.0008131000726054403, + "loss": 0.76341486, + "num_input_tokens_seen": 131702224, + "router_z_loss_mlp": 0.13574219, + "step": 1591, + "time_per_iteration": 4.961095094680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153411, + "balance_loss_mlp": 1.11330891, + "epoch": 0.30627164293959214, + "flos": 518555644416.0, + "grad_norm": 0.05628096053427355, + "language_loss": 0.87358719, + "learning_rate": 0.0008128571140339123, + "loss": 0.88512129, + "num_input_tokens_seen": 131774608, + "router_z_loss_mlp": 0.40087891, + "step": 1592, + "time_per_iteration": 2.6484899520874023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137482, + "balance_loss_mlp": 1.09497237, + "epoch": 0.3064640246248557, + "flos": 455589287424.0, + "grad_norm": 0.058132540851188214, + "language_loss": 0.87688839, + "learning_rate": 0.0008126140340004805, + "loss": 0.88826323, + "num_input_tokens_seen": 131841216, + "router_z_loss_mlp": 0.42529297, + "step": 1593, + "time_per_iteration": 2.509239912033081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144438, + "balance_loss_mlp": 1.10316801, + "epoch": 0.30665640631011926, + "flos": 850095378432.0, + "grad_norm": 0.06371804566889869, + "language_loss": 0.82466245, + "learning_rate": 0.0008123708325995172, + "loss": 0.83610678, + "num_input_tokens_seen": 131937584, + "router_z_loss_mlp": 0.4128418, + "step": 1594, + "time_per_iteration": 3.1773130893707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133345, + "balance_loss_mlp": 1.09240818, + "epoch": 0.30684878799538284, + "flos": 758319335424.0, + "grad_norm": 0.06060698504548286, + "language_loss": 0.79972136, + "learning_rate": 0.0008121275099254414, + "loss": 0.81105477, + "num_input_tokens_seen": 132012656, + "router_z_loss_mlp": 0.40942383, + "step": 1595, + "time_per_iteration": 2.9426517486572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142693, + "balance_loss_mlp": 1.10244751, + "epoch": 0.3070411696806464, + "flos": 517574790144.0, + "grad_norm": 0.06149446857353131, + "language_loss": 0.88748306, + "learning_rate": 0.0008118840660727194, + "loss": 0.89890993, + "num_input_tokens_seen": 132083728, + "router_z_loss_mlp": 0.40283203, + "step": 1596, + "time_per_iteration": 2.665166139602661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135904, + "balance_loss_mlp": 1.09553957, + "epoch": 0.30723355136590996, + "flos": 844264207872.0, + "grad_norm": 0.15751252363629464, + "language_loss": 0.88104224, + "learning_rate": 0.0008116405011358644, + "loss": 0.89240128, + "num_input_tokens_seen": 132170896, + "router_z_loss_mlp": 0.40380859, + "step": 1597, + "time_per_iteration": 3.1415486335754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145189, + "balance_loss_mlp": 1.10291696, + "epoch": 0.30742593305117355, + "flos": 466139722752.0, + "grad_norm": 0.06428245482632208, + "language_loss": 0.80117774, + "learning_rate": 0.0008113968152094369, + "loss": 0.81262958, + "num_input_tokens_seen": 132234592, + "router_z_loss_mlp": 0.42285156, + "step": 1598, + "time_per_iteration": 2.50484037399292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140725, + "balance_loss_mlp": 1.09781003, + "epoch": 0.3076183147364371, + "flos": 686591354880.0, + "grad_norm": 0.069373282908973, + "language_loss": 0.82692802, + "learning_rate": 0.0008111530083880438, + "loss": 0.83833528, + "num_input_tokens_seen": 132314720, + "router_z_loss_mlp": 0.42895508, + "step": 1599, + "time_per_iteration": 2.9072136878967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155699, + "balance_loss_mlp": 1.11211586, + "epoch": 0.30781069642170067, + "flos": 614018340864.0, + "grad_norm": 0.09326308305844169, + "language_loss": 0.86715603, + "learning_rate": 0.0008109090807663399, + "loss": 0.87871301, + "num_input_tokens_seen": 132388768, + "router_z_loss_mlp": 0.43554688, + "step": 1600, + "time_per_iteration": 2.8556277751922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154517, + "balance_loss_mlp": 1.1142, + "epoch": 0.3080030781069642, + "flos": 590318129664.0, + "grad_norm": 0.07163974647376076, + "language_loss": 0.89029115, + "learning_rate": 0.0008106650324390257, + "loss": 0.90183634, + "num_input_tokens_seen": 132472544, + "router_z_loss_mlp": 0.40307617, + "step": 1601, + "time_per_iteration": 2.8016483783721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115055, + "balance_loss_mlp": 1.10768259, + "epoch": 0.3081954597922278, + "flos": 562620349440.0, + "grad_norm": 0.06437682840273379, + "language_loss": 0.81480461, + "learning_rate": 0.0008104208635008493, + "loss": 0.82631016, + "num_input_tokens_seen": 132541968, + "router_z_loss_mlp": 0.42871094, + "step": 1602, + "time_per_iteration": 2.6587209701538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150496, + "balance_loss_mlp": 1.10631728, + "epoch": 0.3083878414774913, + "flos": 447830913024.0, + "grad_norm": 0.13502170342564263, + "language_loss": 0.8243258, + "learning_rate": 0.0008101765740466058, + "loss": 0.83583081, + "num_input_tokens_seen": 132606976, + "router_z_loss_mlp": 0.44165039, + "step": 1603, + "time_per_iteration": 2.506427049636841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144916, + "balance_loss_mlp": 1.10135674, + "epoch": 0.3085802231627549, + "flos": 493546037760.0, + "grad_norm": 0.0649160929519563, + "language_loss": 0.84340334, + "learning_rate": 0.0008099321641711364, + "loss": 0.85485256, + "num_input_tokens_seen": 132677984, + "router_z_loss_mlp": 0.43579102, + "step": 1604, + "time_per_iteration": 2.6318166255950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151756, + "balance_loss_mlp": 1.10938883, + "epoch": 0.3087726048480185, + "flos": 487687703040.0, + "grad_norm": 0.0523010874933109, + "language_loss": 0.83940029, + "learning_rate": 0.0008096876339693295, + "loss": 0.85091782, + "num_input_tokens_seen": 132749136, + "router_z_loss_mlp": 0.42407227, + "step": 1605, + "time_per_iteration": 2.620199680328369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150228, + "balance_loss_mlp": 1.1086241, + "epoch": 0.308964986533282, + "flos": 730589248512.0, + "grad_norm": 0.07539888612246932, + "language_loss": 0.8184768, + "learning_rate": 0.0008094429835361206, + "loss": 0.82997912, + "num_input_tokens_seen": 132823824, + "router_z_loss_mlp": 0.41625977, + "step": 1606, + "time_per_iteration": 2.9251575469970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147276, + "balance_loss_mlp": 1.10679281, + "epoch": 0.3091573682185456, + "flos": 605407592448.0, + "grad_norm": 0.07700051037162058, + "language_loss": 0.85932112, + "learning_rate": 0.0008091982129664908, + "loss": 0.87079388, + "num_input_tokens_seen": 132895936, + "router_z_loss_mlp": 0.40478516, + "step": 1607, + "time_per_iteration": 2.7032129764556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169169, + "balance_loss_mlp": 1.12427497, + "epoch": 0.30934974990380915, + "flos": 460325804544.0, + "grad_norm": 0.11394505928871175, + "language_loss": 0.83292013, + "learning_rate": 0.0008089533223554687, + "loss": 0.84461182, + "num_input_tokens_seen": 132968960, + "router_z_loss_mlp": 0.44897461, + "step": 1608, + "time_per_iteration": 2.6975207328796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161949, + "balance_loss_mlp": 1.12115526, + "epoch": 0.30954213158907273, + "flos": 553426297344.0, + "grad_norm": 0.06275490202685644, + "language_loss": 0.85402906, + "learning_rate": 0.0008087083117981294, + "loss": 0.86564851, + "num_input_tokens_seen": 133048448, + "router_z_loss_mlp": 0.40795898, + "step": 1609, + "time_per_iteration": 2.8709142208099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158469, + "balance_loss_mlp": 1.11402774, + "epoch": 0.30973451327433627, + "flos": 553043427840.0, + "grad_norm": 0.06357956742359384, + "language_loss": 0.88521934, + "learning_rate": 0.0008084631813895943, + "loss": 0.89680409, + "num_input_tokens_seen": 133121680, + "router_z_loss_mlp": 0.44433594, + "step": 1610, + "time_per_iteration": 2.7704904079437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148821, + "balance_loss_mlp": 1.1059773, + "epoch": 0.30992689495959985, + "flos": 565696535040.0, + "grad_norm": 0.07818022356789546, + "language_loss": 0.84349322, + "learning_rate": 0.0008082179312250315, + "loss": 0.85498142, + "num_input_tokens_seen": 133190176, + "router_z_loss_mlp": 0.42871094, + "step": 1611, + "time_per_iteration": 2.6352171897888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118188, + "balance_loss_mlp": 1.10588562, + "epoch": 0.3101192766448634, + "flos": 1442406776832.0, + "grad_norm": 0.03204939869531237, + "language_loss": 0.79855847, + "learning_rate": 0.0008079725613996555, + "loss": 0.8097403, + "num_input_tokens_seen": 133420512, + "router_z_loss_mlp": 0.12255859, + "step": 1612, + "time_per_iteration": 4.865812301635742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095093, + "balance_loss_mlp": 1.08288634, + "epoch": 0.31031165833012697, + "flos": 1531892570112.0, + "grad_norm": 0.024031397097536, + "language_loss": 0.76629329, + "learning_rate": 0.0008077270720087273, + "loss": 0.77724421, + "num_input_tokens_seen": 133651984, + "router_z_loss_mlp": 0.12207031, + "step": 1613, + "time_per_iteration": 5.057459831237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163336, + "balance_loss_mlp": 1.12020612, + "epoch": 0.31050404001539056, + "flos": 991952676864.0, + "grad_norm": 0.056757119691581794, + "language_loss": 0.82255232, + "learning_rate": 0.0008074814631475545, + "loss": 0.83418566, + "num_input_tokens_seen": 133741648, + "router_z_loss_mlp": 0.43139648, + "step": 1614, + "time_per_iteration": 3.3026204109191895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164621, + "balance_loss_mlp": 1.12153852, + "epoch": 0.3106964217006541, + "flos": 445992542208.0, + "grad_norm": 0.0685570598787085, + "language_loss": 0.79806983, + "learning_rate": 0.0008072357349114907, + "loss": 0.80971605, + "num_input_tokens_seen": 133813344, + "router_z_loss_mlp": 0.4309082, + "step": 1615, + "time_per_iteration": 2.663853645324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187873, + "balance_loss_mlp": 1.14369345, + "epoch": 0.3108888033859177, + "flos": 510505804800.0, + "grad_norm": 0.06371446427292905, + "language_loss": 0.8904891, + "learning_rate": 0.0008069898873959363, + "loss": 0.90236783, + "num_input_tokens_seen": 133884192, + "router_z_loss_mlp": 0.44189453, + "step": 1616, + "time_per_iteration": 2.675607919692993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199097, + "balance_loss_mlp": 1.15773141, + "epoch": 0.3110811850711812, + "flos": 520732468224.0, + "grad_norm": 0.10138062428343411, + "language_loss": 0.8626408, + "learning_rate": 0.0008067439206963375, + "loss": 0.87463176, + "num_input_tokens_seen": 133954848, + "router_z_loss_mlp": 0.41381836, + "step": 1617, + "time_per_iteration": 2.6264841556549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193178, + "balance_loss_mlp": 1.15119278, + "epoch": 0.3112735667564448, + "flos": 686413315584.0, + "grad_norm": 0.06654120721966555, + "language_loss": 0.8650856, + "learning_rate": 0.0008064978349081873, + "loss": 0.87701744, + "num_input_tokens_seen": 134031824, + "router_z_loss_mlp": 0.41967773, + "step": 1618, + "time_per_iteration": 2.9114232063293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180658, + "balance_loss_mlp": 1.13712287, + "epoch": 0.31146594844170833, + "flos": 533061803520.0, + "grad_norm": 0.06279818174684408, + "language_loss": 0.86905777, + "learning_rate": 0.0008062516301270245, + "loss": 0.88086432, + "num_input_tokens_seen": 134104480, + "router_z_loss_mlp": 0.43530273, + "step": 1619, + "time_per_iteration": 2.697016477584839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174783, + "balance_loss_mlp": 1.13341749, + "epoch": 0.3116583301269719, + "flos": 679517227008.0, + "grad_norm": 0.07259268941115717, + "language_loss": 0.89074606, + "learning_rate": 0.0008060053064484343, + "loss": 0.90249389, + "num_input_tokens_seen": 134185632, + "router_z_loss_mlp": 0.41381836, + "step": 1620, + "time_per_iteration": 2.9220941066741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160759, + "balance_loss_mlp": 1.11996579, + "epoch": 0.31185071181223545, + "flos": 586149861888.0, + "grad_norm": 0.054906942105454146, + "language_loss": 0.85286081, + "learning_rate": 0.0008057588639680482, + "loss": 0.8644684, + "num_input_tokens_seen": 134261600, + "router_z_loss_mlp": 0.40795898, + "step": 1621, + "time_per_iteration": 2.7432475090026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161698, + "balance_loss_mlp": 1.11754274, + "epoch": 0.31204309349749904, + "flos": 725403050496.0, + "grad_norm": 0.08428579582226577, + "language_loss": 0.83045304, + "learning_rate": 0.0008055123027815434, + "loss": 0.84207004, + "num_input_tokens_seen": 134334368, + "router_z_loss_mlp": 0.44165039, + "step": 1622, + "time_per_iteration": 2.888124465942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149406, + "balance_loss_mlp": 1.10947073, + "epoch": 0.3122354751827626, + "flos": 576825131520.0, + "grad_norm": 0.06442378780427988, + "language_loss": 0.85635763, + "learning_rate": 0.0008052656229846436, + "loss": 0.86785173, + "num_input_tokens_seen": 134403824, + "router_z_loss_mlp": 0.39916992, + "step": 1623, + "time_per_iteration": 2.7215354442596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154937, + "balance_loss_mlp": 1.11259365, + "epoch": 0.31242785686802615, + "flos": 575943022080.0, + "grad_norm": 0.1013205930173775, + "language_loss": 0.90875685, + "learning_rate": 0.0008050188246731182, + "loss": 0.92030621, + "num_input_tokens_seen": 134471296, + "router_z_loss_mlp": 0.42333984, + "step": 1624, + "time_per_iteration": 2.6636321544647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146822, + "balance_loss_mlp": 1.10655355, + "epoch": 0.31262023855328974, + "flos": 736830452736.0, + "grad_norm": 0.08961406202901398, + "language_loss": 0.82641953, + "learning_rate": 0.0008047719079427834, + "loss": 0.83788776, + "num_input_tokens_seen": 134551360, + "router_z_loss_mlp": 0.40283203, + "step": 1625, + "time_per_iteration": 2.9943442344665527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067888, + "balance_loss_mlp": 1.05425012, + "epoch": 0.3128126202385533, + "flos": 1559232073728.0, + "grad_norm": 0.02225722433359613, + "language_loss": 0.74351704, + "learning_rate": 0.0008045248728895, + "loss": 0.75419593, + "num_input_tokens_seen": 134761328, + "router_z_loss_mlp": 0.13671875, + "step": 1626, + "time_per_iteration": 4.865052700042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124122, + "balance_loss_mlp": 1.0819937, + "epoch": 0.31300500192381686, + "flos": 514921121280.0, + "grad_norm": 0.05828883069087806, + "language_loss": 0.86570215, + "learning_rate": 0.0008042777196091757, + "loss": 0.87694335, + "num_input_tokens_seen": 134833136, + "router_z_loss_mlp": 0.42138672, + "step": 1627, + "time_per_iteration": 2.668349266052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127492, + "balance_loss_mlp": 1.08481538, + "epoch": 0.3131973836090804, + "flos": 526627878912.0, + "grad_norm": 0.08399253674550058, + "language_loss": 0.82332879, + "learning_rate": 0.0008040304481977643, + "loss": 0.83460367, + "num_input_tokens_seen": 134904352, + "router_z_loss_mlp": 0.42675781, + "step": 1628, + "time_per_iteration": 2.6445093154907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130913, + "balance_loss_mlp": 1.09224153, + "epoch": 0.313389765294344, + "flos": 822820114944.0, + "grad_norm": 0.06122809929096989, + "language_loss": 0.86751842, + "learning_rate": 0.0008037830587512649, + "loss": 0.87882763, + "num_input_tokens_seen": 134984880, + "router_z_loss_mlp": 0.38671875, + "step": 1629, + "time_per_iteration": 3.0830209255218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131503, + "balance_loss_mlp": 1.09068549, + "epoch": 0.31358214697960757, + "flos": 393823669248.0, + "grad_norm": 0.06235185724616104, + "language_loss": 0.7940957, + "learning_rate": 0.0008035355513657224, + "loss": 0.80541074, + "num_input_tokens_seen": 135047456, + "router_z_loss_mlp": 0.40820312, + "step": 1630, + "time_per_iteration": 2.4804115295410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135326, + "balance_loss_mlp": 1.09326935, + "epoch": 0.3137745286648711, + "flos": 571908003840.0, + "grad_norm": 0.06249119555349938, + "language_loss": 0.9321425, + "learning_rate": 0.0008032879261372279, + "loss": 0.94349587, + "num_input_tokens_seen": 135124256, + "router_z_loss_mlp": 0.42089844, + "step": 1631, + "time_per_iteration": 2.7995047569274902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0103074, + "balance_loss_mlp": 1.01777005, + "epoch": 0.3139669103501347, + "flos": 1498415376384.0, + "grad_norm": 0.019617221588718974, + "language_loss": 0.79635841, + "learning_rate": 0.0008030401831619178, + "loss": 0.80666578, + "num_input_tokens_seen": 135353024, + "router_z_loss_mlp": 0.12988281, + "step": 1632, + "time_per_iteration": 5.3968565464019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149415, + "balance_loss_mlp": 1.10959888, + "epoch": 0.3141592920353982, + "flos": 525343076352.0, + "grad_norm": 0.05783646939860944, + "language_loss": 0.87576675, + "learning_rate": 0.0008027923225359748, + "loss": 0.88726091, + "num_input_tokens_seen": 135422464, + "router_z_loss_mlp": 0.39819336, + "step": 1633, + "time_per_iteration": 2.5933566093444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153635, + "balance_loss_mlp": 1.11145878, + "epoch": 0.3143516737206618, + "flos": 593268406272.0, + "grad_norm": 0.05944909670445279, + "language_loss": 0.88579285, + "learning_rate": 0.0008025443443556267, + "loss": 0.89732921, + "num_input_tokens_seen": 135490928, + "router_z_loss_mlp": 0.421875, + "step": 1634, + "time_per_iteration": 2.728522777557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149168, + "balance_loss_mlp": 1.109519, + "epoch": 0.31454405540592534, + "flos": 648362589696.0, + "grad_norm": 0.0772983201911997, + "language_loss": 0.88333809, + "learning_rate": 0.000802296248717147, + "loss": 0.89482975, + "num_input_tokens_seen": 135576288, + "router_z_loss_mlp": 0.39648438, + "step": 1635, + "time_per_iteration": 2.9030401706695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140791, + "balance_loss_mlp": 1.0971607, + "epoch": 0.3147364370911889, + "flos": 642847850496.0, + "grad_norm": 0.06629024784700413, + "language_loss": 0.7930302, + "learning_rate": 0.0008020480357168554, + "loss": 0.80443811, + "num_input_tokens_seen": 135652320, + "router_z_loss_mlp": 0.43603516, + "step": 1636, + "time_per_iteration": 2.839134931564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145583, + "balance_loss_mlp": 1.1038121, + "epoch": 0.31492881877645246, + "flos": 471849753600.0, + "grad_norm": 0.06656267016529639, + "language_loss": 0.88396037, + "learning_rate": 0.0008017997054511165, + "loss": 0.89541626, + "num_input_tokens_seen": 135719632, + "router_z_loss_mlp": 0.41796875, + "step": 1637, + "time_per_iteration": 2.5937085151672363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148115, + "balance_loss_mlp": 1.10424566, + "epoch": 0.31512120046171604, + "flos": 629433773568.0, + "grad_norm": 0.06622170213435077, + "language_loss": 0.85649616, + "learning_rate": 0.0008015512580163407, + "loss": 0.86797726, + "num_input_tokens_seen": 135796544, + "router_z_loss_mlp": 0.43896484, + "step": 1638, + "time_per_iteration": 2.8432726860046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138121, + "balance_loss_mlp": 1.09639752, + "epoch": 0.31531358214697963, + "flos": 703778347008.0, + "grad_norm": 0.06676164925493694, + "language_loss": 0.81149763, + "learning_rate": 0.0008013026935089838, + "loss": 0.82287884, + "num_input_tokens_seen": 135871344, + "router_z_loss_mlp": 0.41699219, + "step": 1639, + "time_per_iteration": 2.8703761100769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142857, + "balance_loss_mlp": 1.1031127, + "epoch": 0.31550596383224316, + "flos": 572545635840.0, + "grad_norm": 0.060786667538297263, + "language_loss": 0.84702241, + "learning_rate": 0.0008010540120255472, + "loss": 0.85845095, + "num_input_tokens_seen": 135944320, + "router_z_loss_mlp": 0.3972168, + "step": 1640, + "time_per_iteration": 2.6741273403167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136768, + "balance_loss_mlp": 1.09511614, + "epoch": 0.31569834551750675, + "flos": 658340006400.0, + "grad_norm": 0.06934658167266547, + "language_loss": 0.86723542, + "learning_rate": 0.0008008052136625774, + "loss": 0.8786031, + "num_input_tokens_seen": 136019456, + "router_z_loss_mlp": 0.41650391, + "step": 1641, + "time_per_iteration": 2.8395094871520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135427, + "balance_loss_mlp": 1.09272623, + "epoch": 0.3158907272027703, + "flos": 566282036736.0, + "grad_norm": 0.07613576058544219, + "language_loss": 0.87025082, + "learning_rate": 0.0008005562985166666, + "loss": 0.88160515, + "num_input_tokens_seen": 136091232, + "router_z_loss_mlp": 0.42675781, + "step": 1642, + "time_per_iteration": 2.708812713623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127626, + "balance_loss_mlp": 1.08621287, + "epoch": 0.31608310888803387, + "flos": 536891618304.0, + "grad_norm": 0.05118616143218352, + "language_loss": 0.85440576, + "learning_rate": 0.0008003072666844524, + "loss": 0.86568201, + "num_input_tokens_seen": 136165088, + "router_z_loss_mlp": 0.41430664, + "step": 1643, + "time_per_iteration": 2.74019193649292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127922, + "balance_loss_mlp": 1.08746231, + "epoch": 0.3162754905732974, + "flos": 486669772800.0, + "grad_norm": 0.07457594622010144, + "language_loss": 0.82632107, + "learning_rate": 0.0008000581182626173, + "loss": 0.83760029, + "num_input_tokens_seen": 136230368, + "router_z_loss_mlp": 0.40478516, + "step": 1644, + "time_per_iteration": 2.6125125885009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011327, + "balance_loss_mlp": 1.09159672, + "epoch": 0.316467872258561, + "flos": 530052429312.0, + "grad_norm": 0.0586598658040055, + "language_loss": 0.86714005, + "learning_rate": 0.0007998088533478894, + "loss": 0.87846708, + "num_input_tokens_seen": 136302512, + "router_z_loss_mlp": 0.41137695, + "step": 1645, + "time_per_iteration": 2.674678087234497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130403, + "balance_loss_mlp": 1.08805966, + "epoch": 0.3166602539438245, + "flos": 443440189440.0, + "grad_norm": 0.10428151324619617, + "language_loss": 0.84319067, + "learning_rate": 0.000799559472037042, + "loss": 0.85449469, + "num_input_tokens_seen": 136368064, + "router_z_loss_mlp": 0.4230957, + "step": 1646, + "time_per_iteration": 2.5389983654022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130022, + "balance_loss_mlp": 1.08939528, + "epoch": 0.3168526356290881, + "flos": 645830060544.0, + "grad_norm": 0.05498023868715711, + "language_loss": 0.8798641, + "learning_rate": 0.0007993099744268932, + "loss": 0.8911643, + "num_input_tokens_seen": 136451520, + "router_z_loss_mlp": 0.40625, + "step": 1647, + "time_per_iteration": 2.919410467147827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127448, + "balance_loss_mlp": 1.0858674, + "epoch": 0.3170450173143517, + "flos": 586162344960.0, + "grad_norm": 0.07648109375468225, + "language_loss": 0.88298547, + "learning_rate": 0.000799060360614307, + "loss": 0.89425999, + "num_input_tokens_seen": 136521184, + "router_z_loss_mlp": 0.41577148, + "step": 1648, + "time_per_iteration": 2.679098606109619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132184, + "balance_loss_mlp": 1.09117627, + "epoch": 0.3172373989996152, + "flos": 827124203520.0, + "grad_norm": 0.17676844539598618, + "language_loss": 0.83707428, + "learning_rate": 0.0007988106306961917, + "loss": 0.84839618, + "num_input_tokens_seen": 136612592, + "router_z_loss_mlp": 0.41015625, + "step": 1649, + "time_per_iteration": 3.1304876804351807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139719, + "balance_loss_mlp": 1.09809113, + "epoch": 0.3174297806848788, + "flos": 527408672256.0, + "grad_norm": 0.06731506602110418, + "language_loss": 0.84557772, + "learning_rate": 0.0007985607847695014, + "loss": 0.85697484, + "num_input_tokens_seen": 136684336, + "router_z_loss_mlp": 0.41625977, + "step": 1650, + "time_per_iteration": 2.6152966022491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151307, + "balance_loss_mlp": 1.11087108, + "epoch": 0.31762216237014235, + "flos": 713179800576.0, + "grad_norm": 0.08658277444707524, + "language_loss": 0.83160597, + "learning_rate": 0.0007983108229312345, + "loss": 0.84311903, + "num_input_tokens_seen": 136766400, + "router_z_loss_mlp": 0.40454102, + "step": 1651, + "time_per_iteration": 2.9157605171203613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180085, + "balance_loss_mlp": 1.13864803, + "epoch": 0.31781454405540593, + "flos": 483813471744.0, + "grad_norm": 0.12326743545136284, + "language_loss": 0.86631948, + "learning_rate": 0.0007980607452784351, + "loss": 0.8781203, + "num_input_tokens_seen": 136834016, + "router_z_loss_mlp": 0.4140625, + "step": 1652, + "time_per_iteration": 2.5533528327941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170234, + "balance_loss_mlp": 1.12798643, + "epoch": 0.31800692574066947, + "flos": 548746679808.0, + "grad_norm": 0.07656805667485655, + "language_loss": 0.90550399, + "learning_rate": 0.0007978105519081919, + "loss": 0.91720629, + "num_input_tokens_seen": 136906288, + "router_z_loss_mlp": 0.42236328, + "step": 1653, + "time_per_iteration": 2.683962821960449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162895, + "balance_loss_mlp": 1.12088561, + "epoch": 0.31819930742593305, + "flos": 516895312896.0, + "grad_norm": 0.06859901935764132, + "language_loss": 0.88378012, + "learning_rate": 0.0007975602429176385, + "loss": 0.89540899, + "num_input_tokens_seen": 136972416, + "router_z_loss_mlp": 0.42041016, + "step": 1654, + "time_per_iteration": 2.563507556915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165514, + "balance_loss_mlp": 1.12421989, + "epoch": 0.31839168911119664, + "flos": 455991980544.0, + "grad_norm": 0.07830522948057009, + "language_loss": 0.81779003, + "learning_rate": 0.0007973098184039536, + "loss": 0.82944512, + "num_input_tokens_seen": 137044576, + "router_z_loss_mlp": 0.4128418, + "step": 1655, + "time_per_iteration": 2.6503560543060303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154556, + "balance_loss_mlp": 1.11433494, + "epoch": 0.3185840707964602, + "flos": 626033816064.0, + "grad_norm": 0.07004293098644994, + "language_loss": 0.87212098, + "learning_rate": 0.0007970592784643602, + "loss": 0.88366652, + "num_input_tokens_seen": 137125120, + "router_z_loss_mlp": 0.40185547, + "step": 1656, + "time_per_iteration": 2.8598649501800537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167315, + "balance_loss_mlp": 1.12366056, + "epoch": 0.31877645248172376, + "flos": 567478006272.0, + "grad_norm": 0.08267452239342069, + "language_loss": 0.8563, + "learning_rate": 0.0007968086231961272, + "loss": 0.86797309, + "num_input_tokens_seen": 137195344, + "router_z_loss_mlp": 0.43676758, + "step": 1657, + "time_per_iteration": 2.637216806411743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158327, + "balance_loss_mlp": 1.11343288, + "epoch": 0.3189688341669873, + "flos": 489580402176.0, + "grad_norm": 0.09173012098392071, + "language_loss": 0.83764172, + "learning_rate": 0.0007965578526965671, + "loss": 0.84922498, + "num_input_tokens_seen": 137261040, + "router_z_loss_mlp": 0.44897461, + "step": 1658, + "time_per_iteration": 2.607729911804199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154777, + "balance_loss_mlp": 1.11307764, + "epoch": 0.3191612158522509, + "flos": 576234487296.0, + "grad_norm": 0.08650327787833377, + "language_loss": 0.86397582, + "learning_rate": 0.0007963069670630377, + "loss": 0.87552357, + "num_input_tokens_seen": 137334400, + "router_z_loss_mlp": 0.41723633, + "step": 1659, + "time_per_iteration": 2.7385904788970947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154696, + "balance_loss_mlp": 1.11175728, + "epoch": 0.3193535975375144, + "flos": 538132004352.0, + "grad_norm": 0.06815630012467462, + "language_loss": 0.88107586, + "learning_rate": 0.0007960559663929416, + "loss": 0.89262283, + "num_input_tokens_seen": 137405344, + "router_z_loss_mlp": 0.4296875, + "step": 1660, + "time_per_iteration": 2.696936845779419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155709, + "balance_loss_mlp": 1.11372399, + "epoch": 0.319545979222778, + "flos": 734288011776.0, + "grad_norm": 0.07443207173064395, + "language_loss": 0.8773188, + "learning_rate": 0.0007958048507837259, + "loss": 0.88887584, + "num_input_tokens_seen": 137486016, + "router_z_loss_mlp": 0.41992188, + "step": 1661, + "time_per_iteration": 3.0276992321014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165796, + "balance_loss_mlp": 1.12168884, + "epoch": 0.31973836090804153, + "flos": 764461794816.0, + "grad_norm": 0.07361086812440759, + "language_loss": 0.87900233, + "learning_rate": 0.0007955536203328822, + "loss": 0.89066029, + "num_input_tokens_seen": 137562304, + "router_z_loss_mlp": 0.44116211, + "step": 1662, + "time_per_iteration": 2.9181947708129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167079, + "balance_loss_mlp": 1.12497449, + "epoch": 0.3199307425933051, + "flos": 560549611008.0, + "grad_norm": 0.0536049497981301, + "language_loss": 0.8375597, + "learning_rate": 0.0007953022751379469, + "loss": 0.84923047, + "num_input_tokens_seen": 137639248, + "router_z_loss_mlp": 0.42089844, + "step": 1663, + "time_per_iteration": 2.8502774238586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160364, + "balance_loss_mlp": 1.11749601, + "epoch": 0.3201231242785687, + "flos": 751349094912.0, + "grad_norm": 0.09076105210561375, + "language_loss": 0.82297581, + "learning_rate": 0.000795050815296501, + "loss": 0.83457941, + "num_input_tokens_seen": 137718256, + "router_z_loss_mlp": 0.42871094, + "step": 1664, + "time_per_iteration": 2.990253210067749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149395, + "balance_loss_mlp": 1.10821986, + "epoch": 0.32031550596383224, + "flos": 496402338816.0, + "grad_norm": 0.05392034602485258, + "language_loss": 0.93401325, + "learning_rate": 0.0007947992409061695, + "loss": 0.94550717, + "num_input_tokens_seen": 137785216, + "router_z_loss_mlp": 0.41162109, + "step": 1665, + "time_per_iteration": 2.5734803676605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146101, + "balance_loss_mlp": 1.10456824, + "epoch": 0.3205078876490958, + "flos": 731609750016.0, + "grad_norm": 0.07147454481835314, + "language_loss": 0.86398005, + "learning_rate": 0.0007945475520646226, + "loss": 0.87544107, + "num_input_tokens_seen": 137863424, + "router_z_loss_mlp": 0.4152832, + "step": 1666, + "time_per_iteration": 2.9147067070007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144126, + "balance_loss_mlp": 1.10156846, + "epoch": 0.32070026933435936, + "flos": 549436068864.0, + "grad_norm": 0.08541845552139904, + "language_loss": 0.85159481, + "learning_rate": 0.0007942957488695743, + "loss": 0.8630361, + "num_input_tokens_seen": 137930384, + "router_z_loss_mlp": 0.42578125, + "step": 1667, + "time_per_iteration": 2.6842408180236816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138005, + "balance_loss_mlp": 1.09725952, + "epoch": 0.32089265101962294, + "flos": 745295468544.0, + "grad_norm": 0.06001483498827303, + "language_loss": 0.81309706, + "learning_rate": 0.0007940438314187833, + "loss": 0.82447714, + "num_input_tokens_seen": 138017200, + "router_z_loss_mlp": 0.4074707, + "step": 1668, + "time_per_iteration": 3.0340676307678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128156, + "balance_loss_mlp": 1.08769631, + "epoch": 0.3210850327048865, + "flos": 494188439040.0, + "grad_norm": 0.06998559069767052, + "language_loss": 0.81191337, + "learning_rate": 0.0007937917998100529, + "loss": 0.82319492, + "num_input_tokens_seen": 138084048, + "router_z_loss_mlp": 0.40454102, + "step": 1669, + "time_per_iteration": 2.635629177093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138644, + "balance_loss_mlp": 1.09313023, + "epoch": 0.32127741439015006, + "flos": 530640502272.0, + "grad_norm": 0.08304565240235381, + "language_loss": 0.79254091, + "learning_rate": 0.0007935396541412302, + "loss": 0.80392736, + "num_input_tokens_seen": 138153280, + "router_z_loss_mlp": 0.45532227, + "step": 1670, + "time_per_iteration": 2.6226065158843994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141669, + "balance_loss_mlp": 1.09896851, + "epoch": 0.3214697960754136, + "flos": 501203096064.0, + "grad_norm": 0.07816166477955887, + "language_loss": 0.85914934, + "learning_rate": 0.0007932873945102068, + "loss": 0.87056601, + "num_input_tokens_seen": 138222320, + "router_z_loss_mlp": 0.42724609, + "step": 1671, + "time_per_iteration": 2.559443473815918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047646, + "balance_loss_mlp": 1.03238678, + "epoch": 0.3216621777606772, + "flos": 1383341815296.0, + "grad_norm": 0.025388272809080015, + "language_loss": 0.75761777, + "learning_rate": 0.0007930350210149188, + "loss": 0.76809424, + "num_input_tokens_seen": 138449488, + "router_z_loss_mlp": 0.15234375, + "step": 1672, + "time_per_iteration": 4.8329596519470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176113, + "balance_loss_mlp": 1.13319826, + "epoch": 0.32185455944594077, + "flos": 571535046144.0, + "grad_norm": 0.10680060394368475, + "language_loss": 0.86589128, + "learning_rate": 0.0007927825337533461, + "loss": 0.87765247, + "num_input_tokens_seen": 138522496, + "router_z_loss_mlp": 0.42895508, + "step": 1673, + "time_per_iteration": 2.670067071914673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117015, + "balance_loss_mlp": 1.12651968, + "epoch": 0.3220469411312043, + "flos": 543908846592.0, + "grad_norm": 0.0659920492524482, + "language_loss": 0.84953517, + "learning_rate": 0.0007925299328235131, + "loss": 0.86123669, + "num_input_tokens_seen": 138590096, + "router_z_loss_mlp": 0.43652344, + "step": 1674, + "time_per_iteration": 2.6559884548187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169615, + "balance_loss_mlp": 1.12543643, + "epoch": 0.3222393228164679, + "flos": 491139417600.0, + "grad_norm": 0.10142438885407562, + "language_loss": 0.85307467, + "learning_rate": 0.000792277218323488, + "loss": 0.86477083, + "num_input_tokens_seen": 138658224, + "router_z_loss_mlp": 0.44189453, + "step": 1675, + "time_per_iteration": 2.5843372344970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158765, + "balance_loss_mlp": 1.11673164, + "epoch": 0.3224317045017314, + "flos": 490388359680.0, + "grad_norm": 0.06840501438298492, + "language_loss": 0.85418063, + "learning_rate": 0.0007920243903513833, + "loss": 0.86576831, + "num_input_tokens_seen": 138722864, + "router_z_loss_mlp": 0.4206543, + "step": 1676, + "time_per_iteration": 2.562697649002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138179, + "balance_loss_mlp": 1.09280825, + "epoch": 0.322624086186995, + "flos": 575777465856.0, + "grad_norm": 0.06731593225582447, + "language_loss": 0.84609574, + "learning_rate": 0.0007917714490053556, + "loss": 0.85747755, + "num_input_tokens_seen": 138791472, + "router_z_loss_mlp": 0.45361328, + "step": 1677, + "time_per_iteration": 2.685854434967041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131879, + "balance_loss_mlp": 1.09029913, + "epoch": 0.32281646787225854, + "flos": 629292810240.0, + "grad_norm": 0.06310440112326268, + "language_loss": 0.86562228, + "learning_rate": 0.0007915183943836055, + "loss": 0.87694108, + "num_input_tokens_seen": 138873424, + "router_z_loss_mlp": 0.41601562, + "step": 1678, + "time_per_iteration": 2.8568227291107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128366, + "balance_loss_mlp": 1.08466363, + "epoch": 0.3230088495575221, + "flos": 781389255168.0, + "grad_norm": 0.07690366782162197, + "language_loss": 0.84428912, + "learning_rate": 0.0007912652265843773, + "loss": 0.85557282, + "num_input_tokens_seen": 138956880, + "router_z_loss_mlp": 0.43725586, + "step": 1679, + "time_per_iteration": 3.079998254776001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110817, + "balance_loss_mlp": 1.06930852, + "epoch": 0.3232012312427857, + "flos": 536110824960.0, + "grad_norm": 0.07712564159484636, + "language_loss": 0.8213551, + "learning_rate": 0.0007910119457059597, + "loss": 0.83246326, + "num_input_tokens_seen": 139031296, + "router_z_loss_mlp": 0.4152832, + "step": 1680, + "time_per_iteration": 2.6812973022460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112077, + "balance_loss_mlp": 1.06975782, + "epoch": 0.32339361292804925, + "flos": 704857946112.0, + "grad_norm": 0.10745693955939492, + "language_loss": 0.81109858, + "learning_rate": 0.0007907585518466849, + "loss": 0.82221937, + "num_input_tokens_seen": 139109776, + "router_z_loss_mlp": 0.42333984, + "step": 1681, + "time_per_iteration": 2.9406683444976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115262, + "balance_loss_mlp": 1.07265627, + "epoch": 0.32358599461331283, + "flos": 452330293248.0, + "grad_norm": 0.07157404686533678, + "language_loss": 0.89948541, + "learning_rate": 0.000790505045104929, + "loss": 0.91063797, + "num_input_tokens_seen": 139174736, + "router_z_loss_mlp": 0.42602539, + "step": 1682, + "time_per_iteration": 2.5241646766662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119953, + "balance_loss_mlp": 1.07606041, + "epoch": 0.32377837629857636, + "flos": 600895729152.0, + "grad_norm": 0.06937214564576595, + "language_loss": 0.87034553, + "learning_rate": 0.0007902514255791125, + "loss": 0.88154507, + "num_input_tokens_seen": 139252064, + "router_z_loss_mlp": 0.43896484, + "step": 1683, + "time_per_iteration": 2.8741068840026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111917, + "balance_loss_mlp": 1.076231, + "epoch": 0.32397075798383995, + "flos": 807523250688.0, + "grad_norm": 0.06778435640114842, + "language_loss": 0.87994444, + "learning_rate": 0.0007899976933676986, + "loss": 0.89113617, + "num_input_tokens_seen": 139333328, + "router_z_loss_mlp": 0.42919922, + "step": 1684, + "time_per_iteration": 2.959290027618408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117467, + "balance_loss_mlp": 1.07469463, + "epoch": 0.3241631396691035, + "flos": 601689005568.0, + "grad_norm": 0.06453517439379398, + "language_loss": 0.87573123, + "learning_rate": 0.0007897438485691955, + "loss": 0.88690597, + "num_input_tokens_seen": 139400976, + "router_z_loss_mlp": 0.42773438, + "step": 1685, + "time_per_iteration": 2.6591978073120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131588, + "balance_loss_mlp": 1.08655035, + "epoch": 0.32435552135436707, + "flos": 474219297792.0, + "grad_norm": 0.13512041919643347, + "language_loss": 0.82386112, + "learning_rate": 0.0007894898912821542, + "loss": 0.835177, + "num_input_tokens_seen": 139465664, + "router_z_loss_mlp": 0.45043945, + "step": 1686, + "time_per_iteration": 2.5375750064849854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134689, + "balance_loss_mlp": 1.09201205, + "epoch": 0.3245479030396306, + "flos": 538102268928.0, + "grad_norm": 0.07414292899066016, + "language_loss": 0.8748548, + "learning_rate": 0.0007892358216051695, + "loss": 0.88620168, + "num_input_tokens_seen": 139541984, + "router_z_loss_mlp": 0.42675781, + "step": 1687, + "time_per_iteration": 2.73968243598938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132707, + "balance_loss_mlp": 1.09098339, + "epoch": 0.3247402847248942, + "flos": 547654597632.0, + "grad_norm": 0.06337992950379638, + "language_loss": 0.92269105, + "learning_rate": 0.0007889816396368803, + "loss": 0.93401814, + "num_input_tokens_seen": 139607408, + "router_z_loss_mlp": 0.41699219, + "step": 1688, + "time_per_iteration": 2.6067299842834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131771, + "balance_loss_mlp": 1.08961868, + "epoch": 0.3249326664101578, + "flos": 378151276032.0, + "grad_norm": 0.07885708031147778, + "language_loss": 0.85782814, + "learning_rate": 0.0007887273454759687, + "loss": 0.86914587, + "num_input_tokens_seen": 139670000, + "router_z_loss_mlp": 0.421875, + "step": 1689, + "time_per_iteration": 2.484260320663452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122278, + "balance_loss_mlp": 1.08031607, + "epoch": 0.3251250480954213, + "flos": 528078237696.0, + "grad_norm": 0.06527022407794938, + "language_loss": 0.82859224, + "learning_rate": 0.0007884729392211603, + "loss": 0.83981502, + "num_input_tokens_seen": 139739872, + "router_z_loss_mlp": 0.41943359, + "step": 1690, + "time_per_iteration": 2.642786741256714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129634, + "balance_loss_mlp": 1.08812594, + "epoch": 0.3253174297806849, + "flos": 449659372032.0, + "grad_norm": 0.09568065131307975, + "language_loss": 0.86132944, + "learning_rate": 0.0007882184209712245, + "loss": 0.87262577, + "num_input_tokens_seen": 139802032, + "router_z_loss_mlp": 0.41503906, + "step": 1691, + "time_per_iteration": 2.5199530124664307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123067, + "balance_loss_mlp": 1.08234525, + "epoch": 0.32550981146594843, + "flos": 704181040128.0, + "grad_norm": 0.06282055281729462, + "language_loss": 0.86132228, + "learning_rate": 0.000787963790824974, + "loss": 0.87255299, + "num_input_tokens_seen": 139885648, + "router_z_loss_mlp": 0.40722656, + "step": 1692, + "time_per_iteration": 2.9768075942993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124522, + "balance_loss_mlp": 1.08427668, + "epoch": 0.325702193151212, + "flos": 392704422912.0, + "grad_norm": 0.07612118071262816, + "language_loss": 0.89543802, + "learning_rate": 0.0007877090488812651, + "loss": 0.90668321, + "num_input_tokens_seen": 139947920, + "router_z_loss_mlp": 0.40258789, + "step": 1693, + "time_per_iteration": 2.4604485034942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124012, + "balance_loss_mlp": 1.08207428, + "epoch": 0.32589457483647555, + "flos": 577494696960.0, + "grad_norm": 0.1035661329718289, + "language_loss": 0.83982152, + "learning_rate": 0.0007874541952389973, + "loss": 0.85106164, + "num_input_tokens_seen": 140020048, + "router_z_loss_mlp": 0.41943359, + "step": 1694, + "time_per_iteration": 2.6709587574005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113814, + "balance_loss_mlp": 1.09753752, + "epoch": 0.32608695652173914, + "flos": 498339454464.0, + "grad_norm": 0.08446561178027004, + "language_loss": 0.86949492, + "learning_rate": 0.0007871992299971136, + "loss": 0.8808763, + "num_input_tokens_seen": 140085600, + "router_z_loss_mlp": 0.40625, + "step": 1695, + "time_per_iteration": 2.5585403442382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150743, + "balance_loss_mlp": 1.11023593, + "epoch": 0.32627933820700267, + "flos": 591145910784.0, + "grad_norm": 0.05830689117178756, + "language_loss": 0.84793502, + "learning_rate": 0.0007869441532546001, + "loss": 0.85944247, + "num_input_tokens_seen": 140155152, + "router_z_loss_mlp": 0.4050293, + "step": 1696, + "time_per_iteration": 2.7510788440704346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148317, + "balance_loss_mlp": 1.1100266, + "epoch": 0.32647171989226625, + "flos": 609086532096.0, + "grad_norm": 0.06976949490853021, + "language_loss": 0.79791546, + "learning_rate": 0.0007866889651104867, + "loss": 0.80939865, + "num_input_tokens_seen": 140228560, + "router_z_loss_mlp": 0.38256836, + "step": 1697, + "time_per_iteration": 2.7944459915161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152152, + "balance_loss_mlp": 1.11114383, + "epoch": 0.32666410157752984, + "flos": 477154520064.0, + "grad_norm": 0.06767982610774756, + "language_loss": 0.83777177, + "learning_rate": 0.000786433665663846, + "loss": 0.84929335, + "num_input_tokens_seen": 140297952, + "router_z_loss_mlp": 0.40991211, + "step": 1698, + "time_per_iteration": 2.6864194869995117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167936, + "balance_loss_mlp": 1.12514019, + "epoch": 0.3268564832627934, + "flos": 718385822208.0, + "grad_norm": 0.0725657973515617, + "language_loss": 0.87005848, + "learning_rate": 0.0007861782550137942, + "loss": 0.88173789, + "num_input_tokens_seen": 140373408, + "router_z_loss_mlp": 0.42797852, + "step": 1699, + "time_per_iteration": 2.896897792816162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160393, + "balance_loss_mlp": 1.11986172, + "epoch": 0.32704886494805696, + "flos": 768795618816.0, + "grad_norm": 0.0774952835645251, + "language_loss": 0.86092401, + "learning_rate": 0.0007859227332594901, + "loss": 0.87252796, + "num_input_tokens_seen": 140451840, + "router_z_loss_mlp": 0.40527344, + "step": 1700, + "time_per_iteration": 2.8986380100250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165908, + "balance_loss_mlp": 1.12449527, + "epoch": 0.3272412466333205, + "flos": 849912569856.0, + "grad_norm": 0.09509515836767467, + "language_loss": 0.85007191, + "learning_rate": 0.0007856671005001365, + "loss": 0.86173105, + "num_input_tokens_seen": 140537696, + "router_z_loss_mlp": 0.41381836, + "step": 1701, + "time_per_iteration": 3.148084878921509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168963, + "balance_loss_mlp": 1.12726378, + "epoch": 0.3274336283185841, + "flos": 831586507776.0, + "grad_norm": 0.07560076292899535, + "language_loss": 0.82363045, + "learning_rate": 0.0007854113568349787, + "loss": 0.83532006, + "num_input_tokens_seen": 140623536, + "router_z_loss_mlp": 0.41699219, + "step": 1702, + "time_per_iteration": 3.1411454677581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191314, + "balance_loss_mlp": 1.14882779, + "epoch": 0.3276260100038476, + "flos": 692027172864.0, + "grad_norm": 0.08142047178498793, + "language_loss": 0.81090933, + "learning_rate": 0.0007851555023633052, + "loss": 0.82282251, + "num_input_tokens_seen": 140700688, + "router_z_loss_mlp": 0.42504883, + "step": 1703, + "time_per_iteration": 2.9109766483306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197058, + "balance_loss_mlp": 1.1559788, + "epoch": 0.3278183916891112, + "flos": 436059915264.0, + "grad_norm": 0.07993965020483434, + "language_loss": 0.82561779, + "learning_rate": 0.0007848995371844474, + "loss": 0.83758843, + "num_input_tokens_seen": 140765808, + "router_z_loss_mlp": 0.41088867, + "step": 1704, + "time_per_iteration": 2.531611680984497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197334, + "balance_loss_mlp": 1.15267849, + "epoch": 0.3280107733743748, + "flos": 461109169152.0, + "grad_norm": 0.11293951672356671, + "language_loss": 0.81012988, + "learning_rate": 0.0007846434613977801, + "loss": 0.82210326, + "num_input_tokens_seen": 140830512, + "router_z_loss_mlp": 0.44677734, + "step": 1705, + "time_per_iteration": 2.5413970947265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175519, + "balance_loss_mlp": 1.1340816, + "epoch": 0.3282031550596383, + "flos": 679319737344.0, + "grad_norm": 0.10106481858624654, + "language_loss": 0.78958142, + "learning_rate": 0.0007843872751027203, + "loss": 0.80133665, + "num_input_tokens_seen": 140902816, + "router_z_loss_mlp": 0.41455078, + "step": 1706, + "time_per_iteration": 2.817387580871582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158115, + "balance_loss_mlp": 1.1166296, + "epoch": 0.3283955367449019, + "flos": 545107014144.0, + "grad_norm": 0.06764312208644677, + "language_loss": 0.87366319, + "learning_rate": 0.0007841309783987287, + "loss": 0.88524431, + "num_input_tokens_seen": 140975488, + "router_z_loss_mlp": 0.41503906, + "step": 1707, + "time_per_iteration": 2.7335729598999023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155907, + "balance_loss_mlp": 1.11117959, + "epoch": 0.32858791843016544, + "flos": 481261118976.0, + "grad_norm": 0.06220723681544313, + "language_loss": 0.89445031, + "learning_rate": 0.0007838745713853084, + "loss": 0.90600932, + "num_input_tokens_seen": 141043248, + "router_z_loss_mlp": 0.44702148, + "step": 1708, + "time_per_iteration": 2.6179606914520264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114633, + "balance_loss_mlp": 1.10207939, + "epoch": 0.328780300115429, + "flos": 566805869568.0, + "grad_norm": 0.09473479000062662, + "language_loss": 0.84092307, + "learning_rate": 0.0007836180541620053, + "loss": 0.85238636, + "num_input_tokens_seen": 141119408, + "router_z_loss_mlp": 0.44213867, + "step": 1709, + "time_per_iteration": 2.703660249710083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160153, + "balance_loss_mlp": 1.11723721, + "epoch": 0.32897268180069256, + "flos": 476027933184.0, + "grad_norm": 0.06816782803484764, + "language_loss": 0.86778289, + "learning_rate": 0.0007833614268284082, + "loss": 0.8793844, + "num_input_tokens_seen": 141184112, + "router_z_loss_mlp": 0.42944336, + "step": 1710, + "time_per_iteration": 2.548859119415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077221, + "balance_loss_mlp": 1.06558585, + "epoch": 0.32916506348595614, + "flos": 1577301548544.0, + "grad_norm": 0.029019472878356288, + "language_loss": 0.74109769, + "learning_rate": 0.0007831046894841489, + "loss": 0.75186992, + "num_input_tokens_seen": 141414960, + "router_z_loss_mlp": 0.11621094, + "step": 1711, + "time_per_iteration": 4.9234619140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117236, + "balance_loss_mlp": 1.12934983, + "epoch": 0.3293574451712197, + "flos": 482886945792.0, + "grad_norm": 0.10714861433418864, + "language_loss": 0.78928375, + "learning_rate": 0.0007828478422289016, + "loss": 0.80100739, + "num_input_tokens_seen": 141485744, + "router_z_loss_mlp": 0.43017578, + "step": 1712, + "time_per_iteration": 2.584307909011841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167703, + "balance_loss_mlp": 1.12228465, + "epoch": 0.32954982685648326, + "flos": 622557508608.0, + "grad_norm": 0.08165577234876795, + "language_loss": 0.89409995, + "learning_rate": 0.0007825908851623833, + "loss": 0.90577698, + "num_input_tokens_seen": 141560592, + "router_z_loss_mlp": 0.45410156, + "step": 1713, + "time_per_iteration": 2.7400283813476562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158648, + "balance_loss_mlp": 1.11475515, + "epoch": 0.32974220854174685, + "flos": 544971193344.0, + "grad_norm": 0.08464988169520862, + "language_loss": 0.85764992, + "learning_rate": 0.0007823338183843533, + "loss": 0.86923635, + "num_input_tokens_seen": 141630400, + "router_z_loss_mlp": 0.43896484, + "step": 1714, + "time_per_iteration": 2.671375036239624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157012, + "balance_loss_mlp": 1.11419201, + "epoch": 0.3299345902270104, + "flos": 982155870720.0, + "grad_norm": 0.0730773907324959, + "language_loss": 0.81870985, + "learning_rate": 0.0007820766419946141, + "loss": 0.83028001, + "num_input_tokens_seen": 141721552, + "router_z_loss_mlp": 0.4284668, + "step": 1715, + "time_per_iteration": 3.3361854553222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027512, + "balance_loss_mlp": 1.01473284, + "epoch": 0.33012697191227397, + "flos": 1403664090624.0, + "grad_norm": 0.017749933707714268, + "language_loss": 0.7967248, + "learning_rate": 0.0007818193560930102, + "loss": 0.80699992, + "num_input_tokens_seen": 141956464, + "router_z_loss_mlp": 0.12792969, + "step": 1716, + "time_per_iteration": 4.933880567550659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193401, + "balance_loss_mlp": 1.14895988, + "epoch": 0.3303193535975375, + "flos": 505151479296.0, + "grad_norm": 0.1003306893312863, + "language_loss": 0.76434684, + "learning_rate": 0.0007815619607794288, + "loss": 0.77628088, + "num_input_tokens_seen": 142029552, + "router_z_loss_mlp": 0.4440918, + "step": 1717, + "time_per_iteration": 2.6259148120880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191125, + "balance_loss_mlp": 1.14823365, + "epoch": 0.3305117352828011, + "flos": 937977739776.0, + "grad_norm": 0.07927399877074098, + "language_loss": 0.83156073, + "learning_rate": 0.0007813044561538001, + "loss": 0.84347194, + "num_input_tokens_seen": 142117344, + "router_z_loss_mlp": 0.42895508, + "step": 1718, + "time_per_iteration": 3.1473774909973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190014, + "balance_loss_mlp": 1.145239, + "epoch": 0.3307041169680646, + "flos": 721499083776.0, + "grad_norm": 0.06905487251407855, + "language_loss": 0.88941157, + "learning_rate": 0.0007810468423160958, + "loss": 0.9013117, + "num_input_tokens_seen": 142190096, + "router_z_loss_mlp": 0.44799805, + "step": 1719, + "time_per_iteration": 2.895155906677246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181044, + "balance_loss_mlp": 1.13943982, + "epoch": 0.3308964986533282, + "flos": 583614761472.0, + "grad_norm": 0.06204943336400955, + "language_loss": 0.82643551, + "learning_rate": 0.0007807891193663306, + "loss": 0.83824587, + "num_input_tokens_seen": 142265584, + "router_z_loss_mlp": 0.41625977, + "step": 1720, + "time_per_iteration": 2.7824859619140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165341, + "balance_loss_mlp": 1.12357068, + "epoch": 0.33108888033859174, + "flos": 473576896512.0, + "grad_norm": 0.07732363095630222, + "language_loss": 0.82492876, + "learning_rate": 0.0007805312874045614, + "loss": 0.83658212, + "num_input_tokens_seen": 142330352, + "router_z_loss_mlp": 0.41796875, + "step": 1721, + "time_per_iteration": 2.5710601806640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170989, + "balance_loss_mlp": 1.12807381, + "epoch": 0.3312812620238553, + "flos": 386129534976.0, + "grad_norm": 0.07358039625922873, + "language_loss": 0.86639178, + "learning_rate": 0.0007802733465308874, + "loss": 0.87810171, + "num_input_tokens_seen": 142392208, + "router_z_loss_mlp": 0.42895508, + "step": 1722, + "time_per_iteration": 2.4402778148651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171295, + "balance_loss_mlp": 1.12632966, + "epoch": 0.3314736437091189, + "flos": 494554056192.0, + "grad_norm": 0.06616160911514579, + "language_loss": 0.8424235, + "learning_rate": 0.0007800152968454501, + "loss": 0.85413647, + "num_input_tokens_seen": 142462112, + "router_z_loss_mlp": 0.44970703, + "step": 1723, + "time_per_iteration": 2.689309597015381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115688, + "balance_loss_mlp": 1.11634886, + "epoch": 0.33166602539438245, + "flos": 653662586880.0, + "grad_norm": 0.06191321033146657, + "language_loss": 0.90671206, + "learning_rate": 0.0007797571384484334, + "loss": 0.91828084, + "num_input_tokens_seen": 142539120, + "router_z_loss_mlp": 0.40527344, + "step": 1724, + "time_per_iteration": 2.8473238945007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147699, + "balance_loss_mlp": 1.10421109, + "epoch": 0.33185840707964603, + "flos": 520806620160.0, + "grad_norm": 0.06062690844208358, + "language_loss": 0.92524576, + "learning_rate": 0.0007794988714400633, + "loss": 0.93672276, + "num_input_tokens_seen": 142611520, + "router_z_loss_mlp": 0.43530273, + "step": 1725, + "time_per_iteration": 2.62685227394104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146389, + "balance_loss_mlp": 1.10118532, + "epoch": 0.33205078876490957, + "flos": 436949365248.0, + "grad_norm": 0.09351886782013036, + "language_loss": 0.85586655, + "learning_rate": 0.0007792404959206079, + "loss": 0.86733043, + "num_input_tokens_seen": 142676064, + "router_z_loss_mlp": 0.45214844, + "step": 1726, + "time_per_iteration": 2.487520694732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150134, + "balance_loss_mlp": 1.10707533, + "epoch": 0.33224317045017315, + "flos": 768738719232.0, + "grad_norm": 0.09481341164405561, + "language_loss": 0.81825417, + "learning_rate": 0.0007789820119903774, + "loss": 0.82975549, + "num_input_tokens_seen": 142750944, + "router_z_loss_mlp": 0.4309082, + "step": 1727, + "time_per_iteration": 2.9732954502105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118165, + "balance_loss_mlp": 1.16734493, + "epoch": 0.3324355521354367, + "flos": 1466381574144.0, + "grad_norm": 0.0769954731958624, + "language_loss": 0.78492665, + "learning_rate": 0.0007787234197497242, + "loss": 0.79674315, + "num_input_tokens_seen": 142974032, + "router_z_loss_mlp": 0.14257812, + "step": 1728, + "time_per_iteration": 4.8314409255981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149054, + "balance_loss_mlp": 1.10599601, + "epoch": 0.3326279338207003, + "flos": 496691232768.0, + "grad_norm": 0.06765949793064117, + "language_loss": 0.84123361, + "learning_rate": 0.0007784647192990428, + "loss": 0.85272419, + "num_input_tokens_seen": 143047280, + "router_z_loss_mlp": 0.43041992, + "step": 1729, + "time_per_iteration": 2.715163230895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147649, + "balance_loss_mlp": 1.10799968, + "epoch": 0.33282031550596386, + "flos": 635890093056.0, + "grad_norm": 0.06156065876328187, + "language_loss": 0.80939102, + "learning_rate": 0.0007782059107387696, + "loss": 0.82086754, + "num_input_tokens_seen": 143124224, + "router_z_loss_mlp": 0.39672852, + "step": 1730, + "time_per_iteration": 2.865858554840088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165768, + "balance_loss_mlp": 1.12247074, + "epoch": 0.3330126971912274, + "flos": 689511896064.0, + "grad_norm": 0.07708666526094303, + "language_loss": 0.88668191, + "learning_rate": 0.0007779469941693826, + "loss": 0.89833963, + "num_input_tokens_seen": 143194048, + "router_z_loss_mlp": 0.43261719, + "step": 1731, + "time_per_iteration": 2.8640921115875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166075, + "balance_loss_mlp": 1.12351775, + "epoch": 0.333205078876491, + "flos": 566457504768.0, + "grad_norm": 0.08600344935746515, + "language_loss": 0.76943499, + "learning_rate": 0.0007776879696914029, + "loss": 0.78109574, + "num_input_tokens_seen": 143272976, + "router_z_loss_mlp": 0.42553711, + "step": 1732, + "time_per_iteration": 2.8162899017333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159987, + "balance_loss_mlp": 1.11745262, + "epoch": 0.3333974605617545, + "flos": 640927987200.0, + "grad_norm": 0.07534435583192022, + "language_loss": 0.89131331, + "learning_rate": 0.000777428837405392, + "loss": 0.90291321, + "num_input_tokens_seen": 143346496, + "router_z_loss_mlp": 0.42553711, + "step": 1733, + "time_per_iteration": 2.869436740875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151668, + "balance_loss_mlp": 1.11042213, + "epoch": 0.3335898422470181, + "flos": 461833062912.0, + "grad_norm": 0.0649827105829465, + "language_loss": 0.87220478, + "learning_rate": 0.0007771695974119544, + "loss": 0.88372147, + "num_input_tokens_seen": 143410448, + "router_z_loss_mlp": 0.41259766, + "step": 1734, + "time_per_iteration": 2.5153088569641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138148, + "balance_loss_mlp": 1.0959959, + "epoch": 0.33378222393228163, + "flos": 852870187008.0, + "grad_norm": 0.07614790264044081, + "language_loss": 0.76295686, + "learning_rate": 0.0007769102498117359, + "loss": 0.77433836, + "num_input_tokens_seen": 143492416, + "router_z_loss_mlp": 0.42163086, + "step": 1735, + "time_per_iteration": 3.1105504035949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136381, + "balance_loss_mlp": 1.09430027, + "epoch": 0.3339746056175452, + "flos": 954665491968.0, + "grad_norm": 0.06230250245944302, + "language_loss": 0.80020654, + "learning_rate": 0.000776650794705424, + "loss": 0.81157035, + "num_input_tokens_seen": 143590096, + "router_z_loss_mlp": 0.42089844, + "step": 1736, + "time_per_iteration": 3.269490957260132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141082, + "balance_loss_mlp": 1.09890568, + "epoch": 0.33416698730280875, + "flos": 544825460736.0, + "grad_norm": 0.053956568858798265, + "language_loss": 0.82610357, + "learning_rate": 0.0007763912321937483, + "loss": 0.8375144, + "num_input_tokens_seen": 143663344, + "router_z_loss_mlp": 0.421875, + "step": 1737, + "time_per_iteration": 2.6871769428253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126175, + "balance_loss_mlp": 1.0870508, + "epoch": 0.33435936898807234, + "flos": 1014096070656.0, + "grad_norm": 0.06336651482337263, + "language_loss": 0.82955027, + "learning_rate": 0.0007761315623774799, + "loss": 0.84081209, + "num_input_tokens_seen": 143753072, + "router_z_loss_mlp": 0.39111328, + "step": 1738, + "time_per_iteration": 3.4055540561676025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113225, + "balance_loss_mlp": 1.09088469, + "epoch": 0.3345517506733359, + "flos": 615221650944.0, + "grad_norm": 0.08278309899958468, + "language_loss": 0.88244802, + "learning_rate": 0.0007758717853574313, + "loss": 0.89377058, + "num_input_tokens_seen": 143827280, + "router_z_loss_mlp": 0.41381836, + "step": 1739, + "time_per_iteration": 2.7666313648223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120554, + "balance_loss_mlp": 1.08114362, + "epoch": 0.33474413235859946, + "flos": 494593703424.0, + "grad_norm": 0.0696820530517557, + "language_loss": 0.90798807, + "learning_rate": 0.0007756119012344571, + "loss": 0.91919363, + "num_input_tokens_seen": 143895072, + "router_z_loss_mlp": 0.39404297, + "step": 1740, + "time_per_iteration": 2.5491223335266113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115915, + "balance_loss_mlp": 1.07428706, + "epoch": 0.33493651404386304, + "flos": 628381338624.0, + "grad_norm": 0.06589349032225494, + "language_loss": 0.85103011, + "learning_rate": 0.0007753519101094535, + "loss": 0.86218929, + "num_input_tokens_seen": 143965728, + "router_z_loss_mlp": 0.41625977, + "step": 1741, + "time_per_iteration": 2.765583038330078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112401, + "balance_loss_mlp": 1.0837177, + "epoch": 0.3351288957291266, + "flos": 513727723008.0, + "grad_norm": 0.0662644502369307, + "language_loss": 0.86452365, + "learning_rate": 0.0007750918120833575, + "loss": 0.87576377, + "num_input_tokens_seen": 144030272, + "router_z_loss_mlp": 0.40283203, + "step": 1742, + "time_per_iteration": 2.6085479259490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140409, + "balance_loss_mlp": 1.10240483, + "epoch": 0.33532127741439016, + "flos": 647302814208.0, + "grad_norm": 0.07280628286033199, + "language_loss": 0.87783647, + "learning_rate": 0.0007748316072571485, + "loss": 0.88924056, + "num_input_tokens_seen": 144104048, + "router_z_loss_mlp": 0.37963867, + "step": 1743, + "time_per_iteration": 2.793119192123413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133272, + "balance_loss_mlp": 1.09259784, + "epoch": 0.3355136590996537, + "flos": 768464506368.0, + "grad_norm": 0.0850070564381928, + "language_loss": 0.79522568, + "learning_rate": 0.0007745712957318467, + "loss": 0.80655837, + "num_input_tokens_seen": 144180432, + "router_z_loss_mlp": 0.40698242, + "step": 1744, + "time_per_iteration": 2.943847417831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137205, + "balance_loss_mlp": 1.09700739, + "epoch": 0.3357060407849173, + "flos": 595536634368.0, + "grad_norm": 0.06831295126385283, + "language_loss": 0.86807823, + "learning_rate": 0.0007743108776085141, + "loss": 0.87945032, + "num_input_tokens_seen": 144258704, + "router_z_loss_mlp": 0.40136719, + "step": 1745, + "time_per_iteration": 2.771634101867676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011368, + "balance_loss_mlp": 1.09743714, + "epoch": 0.3358984224701808, + "flos": 598590425088.0, + "grad_norm": 0.05902486087385494, + "language_loss": 0.83364028, + "learning_rate": 0.0007740503529882543, + "loss": 0.84500825, + "num_input_tokens_seen": 144335104, + "router_z_loss_mlp": 0.39331055, + "step": 1746, + "time_per_iteration": 2.7896366119384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139374, + "balance_loss_mlp": 1.09831822, + "epoch": 0.3360908041554444, + "flos": 578329818624.0, + "grad_norm": 0.061665767711377016, + "language_loss": 0.90955931, + "learning_rate": 0.0007737897219722114, + "loss": 0.92095304, + "num_input_tokens_seen": 144402912, + "router_z_loss_mlp": 0.41088867, + "step": 1747, + "time_per_iteration": 2.7088165283203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129332, + "balance_loss_mlp": 1.08725071, + "epoch": 0.336283185840708, + "flos": 513589330944.0, + "grad_norm": 0.08528813851267185, + "language_loss": 0.81553382, + "learning_rate": 0.0007735289846615716, + "loss": 0.82682711, + "num_input_tokens_seen": 144475328, + "router_z_loss_mlp": 0.42089844, + "step": 1748, + "time_per_iteration": 2.635098934173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129982, + "balance_loss_mlp": 1.09119081, + "epoch": 0.3364755675259715, + "flos": 524974887936.0, + "grad_norm": 0.09169024401551043, + "language_loss": 0.82026851, + "learning_rate": 0.0007732681411575621, + "loss": 0.83156836, + "num_input_tokens_seen": 144548288, + "router_z_loss_mlp": 0.38818359, + "step": 1749, + "time_per_iteration": 2.6693224906921387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134785, + "balance_loss_mlp": 1.09437299, + "epoch": 0.3366679492112351, + "flos": 554869315584.0, + "grad_norm": 0.0698579909367107, + "language_loss": 0.88035583, + "learning_rate": 0.0007730071915614514, + "loss": 0.89170372, + "num_input_tokens_seen": 144619488, + "router_z_loss_mlp": 0.40405273, + "step": 1750, + "time_per_iteration": 2.6900789737701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137991, + "balance_loss_mlp": 1.09800839, + "epoch": 0.33686033089649864, + "flos": 427273698816.0, + "grad_norm": 0.09938227861633823, + "language_loss": 0.89158392, + "learning_rate": 0.0007727461359745489, + "loss": 0.90296388, + "num_input_tokens_seen": 144682560, + "router_z_loss_mlp": 0.3996582, + "step": 1751, + "time_per_iteration": 2.5086123943328857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154901, + "balance_loss_mlp": 1.1132257, + "epoch": 0.3370527125817622, + "flos": 541729451520.0, + "grad_norm": 0.06249007419708336, + "language_loss": 0.86569941, + "learning_rate": 0.0007724849744982056, + "loss": 0.87724847, + "num_input_tokens_seen": 144753328, + "router_z_loss_mlp": 0.41674805, + "step": 1752, + "time_per_iteration": 2.700474739074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169913, + "balance_loss_mlp": 1.12737882, + "epoch": 0.33724509426702576, + "flos": 542114892288.0, + "grad_norm": 0.06015013269361517, + "language_loss": 0.8195309, + "learning_rate": 0.0007722237072338131, + "loss": 0.83123004, + "num_input_tokens_seen": 144827312, + "router_z_loss_mlp": 0.42529297, + "step": 1753, + "time_per_iteration": 2.7111313343048096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119284, + "balance_loss_mlp": 1.14816022, + "epoch": 0.33743747595228935, + "flos": 472796103168.0, + "grad_norm": 0.11537307258838475, + "language_loss": 0.85648489, + "learning_rate": 0.0007719623342828046, + "loss": 0.86841327, + "num_input_tokens_seen": 144893488, + "router_z_loss_mlp": 0.44726562, + "step": 1754, + "time_per_iteration": 2.517010450363159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191581, + "balance_loss_mlp": 1.14685392, + "epoch": 0.33762985763755293, + "flos": 469818662400.0, + "grad_norm": 0.06847069318075473, + "language_loss": 0.84535718, + "learning_rate": 0.000771700855746654, + "loss": 0.85727292, + "num_input_tokens_seen": 144961152, + "router_z_loss_mlp": 0.44750977, + "step": 1755, + "time_per_iteration": 2.5961217880249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164795, + "balance_loss_mlp": 1.1231432, + "epoch": 0.33782223932281646, + "flos": 492251323392.0, + "grad_norm": 0.05626734330263072, + "language_loss": 0.8872534, + "learning_rate": 0.0007714392717268763, + "loss": 0.89890134, + "num_input_tokens_seen": 145030576, + "router_z_loss_mlp": 0.41674805, + "step": 1756, + "time_per_iteration": 2.5784223079681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166558, + "balance_loss_mlp": 1.12185431, + "epoch": 0.33801462100808005, + "flos": 465064892928.0, + "grad_norm": 0.07105398160496887, + "language_loss": 0.8649826, + "learning_rate": 0.0007711775823250273, + "loss": 0.87664813, + "num_input_tokens_seen": 145095648, + "router_z_loss_mlp": 0.44702148, + "step": 1757, + "time_per_iteration": 2.5373613834381104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115594, + "balance_loss_mlp": 1.11207056, + "epoch": 0.3382070026933436, + "flos": 795668189184.0, + "grad_norm": 0.06341765106008965, + "language_loss": 0.83797616, + "learning_rate": 0.0007709157876427039, + "loss": 0.84953558, + "num_input_tokens_seen": 145181248, + "router_z_loss_mlp": 0.43896484, + "step": 1758, + "time_per_iteration": 3.1393754482269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144144, + "balance_loss_mlp": 1.10027504, + "epoch": 0.33839938437860717, + "flos": 508430297088.0, + "grad_norm": 0.0573406658982909, + "language_loss": 0.85933769, + "learning_rate": 0.0007706538877815439, + "loss": 0.8707791, + "num_input_tokens_seen": 145252944, + "router_z_loss_mlp": 0.4387207, + "step": 1759, + "time_per_iteration": 2.6080896854400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152987, + "balance_loss_mlp": 1.11054862, + "epoch": 0.3385917660638707, + "flos": 484243329024.0, + "grad_norm": 0.06135171113161323, + "language_loss": 0.83615482, + "learning_rate": 0.0007703918828432259, + "loss": 0.84768468, + "num_input_tokens_seen": 145323168, + "router_z_loss_mlp": 0.42456055, + "step": 1760, + "time_per_iteration": 2.5886309146881104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148897, + "balance_loss_mlp": 1.10464644, + "epoch": 0.3387841477491343, + "flos": 545339381760.0, + "grad_norm": 0.05937499082636783, + "language_loss": 0.88942921, + "learning_rate": 0.000770129772929469, + "loss": 0.90091813, + "num_input_tokens_seen": 145395776, + "router_z_loss_mlp": 0.44238281, + "step": 1761, + "time_per_iteration": 2.645293951034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140708, + "balance_loss_mlp": 1.09629107, + "epoch": 0.3389765294343978, + "flos": 719801676288.0, + "grad_norm": 0.07244625367361128, + "language_loss": 0.88504505, + "learning_rate": 0.0007698675581420334, + "loss": 0.89645213, + "num_input_tokens_seen": 145470576, + "router_z_loss_mlp": 0.4440918, + "step": 1762, + "time_per_iteration": 2.849560022354126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149138, + "balance_loss_mlp": 1.10469711, + "epoch": 0.3391689111196614, + "flos": 699928708608.0, + "grad_norm": 0.06385607916775927, + "language_loss": 0.79163915, + "learning_rate": 0.0007696052385827199, + "loss": 0.80313051, + "num_input_tokens_seen": 145548896, + "router_z_loss_mlp": 0.44458008, + "step": 1763, + "time_per_iteration": 2.9164280891418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138684, + "balance_loss_mlp": 1.09765172, + "epoch": 0.339361292804925, + "flos": 627093964800.0, + "grad_norm": 0.07477333876977248, + "language_loss": 0.78203613, + "learning_rate": 0.00076934281435337, + "loss": 0.79342294, + "num_input_tokens_seen": 145617136, + "router_z_loss_mlp": 0.41040039, + "step": 1764, + "time_per_iteration": 2.7213284969329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131547, + "balance_loss_mlp": 1.08922768, + "epoch": 0.33955367449018853, + "flos": 609600453120.0, + "grad_norm": 0.0661700543843282, + "language_loss": 0.86476332, + "learning_rate": 0.0007690802855558658, + "loss": 0.87607884, + "num_input_tokens_seen": 145696416, + "router_z_loss_mlp": 0.4230957, + "step": 1765, + "time_per_iteration": 2.8648691177368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144213, + "balance_loss_mlp": 1.12981212, + "epoch": 0.3397460561754521, + "flos": 1453310346240.0, + "grad_norm": 0.0393682164062729, + "language_loss": 0.76374954, + "learning_rate": 0.0007688176522921302, + "loss": 0.77519166, + "num_input_tokens_seen": 145919680, + "router_z_loss_mlp": 0.14355469, + "step": 1766, + "time_per_iteration": 4.883134603500366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138855, + "balance_loss_mlp": 1.09441423, + "epoch": 0.33993843786071565, + "flos": 487312174080.0, + "grad_norm": 0.06478844738748038, + "language_loss": 0.89260793, + "learning_rate": 0.0007685549146641262, + "loss": 0.90399647, + "num_input_tokens_seen": 145984272, + "router_z_loss_mlp": 0.44458008, + "step": 1767, + "time_per_iteration": 2.5584475994110107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138308, + "balance_loss_mlp": 1.09780085, + "epoch": 0.34013081954597923, + "flos": 417338500608.0, + "grad_norm": 0.0552886410345199, + "language_loss": 0.8865279, + "learning_rate": 0.0007682920727738579, + "loss": 0.89791095, + "num_input_tokens_seen": 146047248, + "router_z_loss_mlp": 0.4050293, + "step": 1768, + "time_per_iteration": 2.462104558944702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113405, + "balance_loss_mlp": 1.09170651, + "epoch": 0.34032320123124277, + "flos": 437520185856.0, + "grad_norm": 0.07550967393636049, + "language_loss": 0.84987569, + "learning_rate": 0.000768029126723369, + "loss": 0.86121619, + "num_input_tokens_seen": 146111872, + "router_z_loss_mlp": 0.42333984, + "step": 1769, + "time_per_iteration": 2.5362985134124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134777, + "balance_loss_mlp": 1.09360242, + "epoch": 0.34051558291650635, + "flos": 457590643200.0, + "grad_norm": 0.0745429404709064, + "language_loss": 0.82167029, + "learning_rate": 0.0007677660766147447, + "loss": 0.83301806, + "num_input_tokens_seen": 146172608, + "router_z_loss_mlp": 0.41186523, + "step": 1770, + "time_per_iteration": 2.516824960708618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079488, + "balance_loss_mlp": 1.06356168, + "epoch": 0.3407079646017699, + "flos": 1558849204224.0, + "grad_norm": 0.02503514207226814, + "language_loss": 0.72470945, + "learning_rate": 0.0007675029225501102, + "loss": 0.73550433, + "num_input_tokens_seen": 146413584, + "router_z_loss_mlp": 0.15917969, + "step": 1771, + "time_per_iteration": 4.943475008010864 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137395, + "balance_loss_mlp": 1.09543359, + "epoch": 0.3409003462870335, + "flos": 492555271680.0, + "grad_norm": 0.06960763795190199, + "language_loss": 0.80136019, + "learning_rate": 0.0007672396646316306, + "loss": 0.81273413, + "num_input_tokens_seen": 146476992, + "router_z_loss_mlp": 0.41918945, + "step": 1772, + "time_per_iteration": 2.5425803661346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145424, + "balance_loss_mlp": 1.10341442, + "epoch": 0.34109272797229706, + "flos": 808479512064.0, + "grad_norm": 0.05748114386543088, + "language_loss": 0.80760133, + "learning_rate": 0.000766976302961512, + "loss": 0.81905556, + "num_input_tokens_seen": 146552848, + "router_z_loss_mlp": 0.42041016, + "step": 1773, + "time_per_iteration": 2.982287645339966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155937, + "balance_loss_mlp": 1.11330807, + "epoch": 0.3412851096575606, + "flos": 470142434304.0, + "grad_norm": 0.06912006035569716, + "language_loss": 0.81549138, + "learning_rate": 0.0007667128376420003, + "loss": 0.82705075, + "num_input_tokens_seen": 146617504, + "router_z_loss_mlp": 0.42626953, + "step": 1774, + "time_per_iteration": 2.5396063327789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151156, + "balance_loss_mlp": 1.10926604, + "epoch": 0.3414774913428242, + "flos": 595675026432.0, + "grad_norm": 0.07768471353958366, + "language_loss": 0.84963071, + "learning_rate": 0.0007664492687753817, + "loss": 0.86114228, + "num_input_tokens_seen": 146691568, + "router_z_loss_mlp": 0.41894531, + "step": 1775, + "time_per_iteration": 2.7326042652130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139013, + "balance_loss_mlp": 1.09845805, + "epoch": 0.3416698730280877, + "flos": 527463000576.0, + "grad_norm": 0.10552495092435867, + "language_loss": 0.81927752, + "learning_rate": 0.000766185596463983, + "loss": 0.83066773, + "num_input_tokens_seen": 146764208, + "router_z_loss_mlp": 0.40551758, + "step": 1776, + "time_per_iteration": 2.622465133666992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126657, + "balance_loss_mlp": 1.08455205, + "epoch": 0.3418622547133513, + "flos": 874640623104.0, + "grad_norm": 0.06005887645947995, + "language_loss": 0.77224028, + "learning_rate": 0.0007659218208101706, + "loss": 0.78350687, + "num_input_tokens_seen": 146847744, + "router_z_loss_mlp": 0.42114258, + "step": 1777, + "time_per_iteration": 3.099862575531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124902, + "balance_loss_mlp": 1.0852288, + "epoch": 0.34205463639861483, + "flos": 603744689664.0, + "grad_norm": 0.057585659974550854, + "language_loss": 0.85272229, + "learning_rate": 0.0007656579419163515, + "loss": 0.86397129, + "num_input_tokens_seen": 146918336, + "router_z_loss_mlp": 0.39672852, + "step": 1778, + "time_per_iteration": 2.7696709632873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129572, + "balance_loss_mlp": 1.08794475, + "epoch": 0.3422470180838784, + "flos": 463780090368.0, + "grad_norm": 0.07376046533358642, + "language_loss": 0.77272999, + "learning_rate": 0.0007653939598849724, + "loss": 0.78402567, + "num_input_tokens_seen": 146982496, + "router_z_loss_mlp": 0.41650391, + "step": 1779, + "time_per_iteration": 2.5601556301116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131616, + "balance_loss_mlp": 1.11511779, + "epoch": 0.34243939976914195, + "flos": 1586428416000.0, + "grad_norm": 0.05276839393693404, + "language_loss": 0.82880205, + "learning_rate": 0.0007651298748185204, + "loss": 0.84011823, + "num_input_tokens_seen": 147213600, + "router_z_loss_mlp": 0.16503906, + "step": 1780, + "time_per_iteration": 4.96061897277832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112473, + "balance_loss_mlp": 1.08267307, + "epoch": 0.34263178145440554, + "flos": 873017367552.0, + "grad_norm": 0.07129012841004771, + "language_loss": 0.80831903, + "learning_rate": 0.000764865686819522, + "loss": 0.81956631, + "num_input_tokens_seen": 147287664, + "router_z_loss_mlp": 0.4206543, + "step": 1781, + "time_per_iteration": 3.089735507965088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126433, + "balance_loss_mlp": 1.08492422, + "epoch": 0.3428241631396691, + "flos": 506878622208.0, + "grad_norm": 0.0622927262326037, + "language_loss": 0.86375809, + "learning_rate": 0.0007646013959905449, + "loss": 0.87502241, + "num_input_tokens_seen": 147356800, + "router_z_loss_mlp": 0.41503906, + "step": 1782, + "time_per_iteration": 2.6112704277038574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123127, + "balance_loss_mlp": 1.08130884, + "epoch": 0.34301654482493266, + "flos": 880039365120.0, + "grad_norm": 0.10167310682771787, + "language_loss": 0.81018484, + "learning_rate": 0.0007643370024341949, + "loss": 0.82141614, + "num_input_tokens_seen": 147432496, + "router_z_loss_mlp": 0.41821289, + "step": 1783, + "time_per_iteration": 3.1074132919311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115046, + "balance_loss_mlp": 1.07563567, + "epoch": 0.34320892651019624, + "flos": 431763167232.0, + "grad_norm": 0.057781870331099924, + "language_loss": 0.83518296, + "learning_rate": 0.0007640725062531195, + "loss": 0.84633338, + "num_input_tokens_seen": 147495856, + "router_z_loss_mlp": 0.39404297, + "step": 1784, + "time_per_iteration": 2.491313934326172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112121, + "balance_loss_mlp": 1.07228112, + "epoch": 0.3434013081954598, + "flos": 463641698304.0, + "grad_norm": 0.12476428026998775, + "language_loss": 0.86600161, + "learning_rate": 0.0007638079075500047, + "loss": 0.87712288, + "num_input_tokens_seen": 147559632, + "router_z_loss_mlp": 0.39819336, + "step": 1785, + "time_per_iteration": 2.5236706733703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070785, + "balance_loss_mlp": 1.05457258, + "epoch": 0.34359368988072336, + "flos": 1557332034048.0, + "grad_norm": 0.032988320908807454, + "language_loss": 0.75180668, + "learning_rate": 0.0007635432064275772, + "loss": 0.76251453, + "num_input_tokens_seen": 147794576, + "router_z_loss_mlp": 0.16210938, + "step": 1786, + "time_per_iteration": 4.938300609588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011343, + "balance_loss_mlp": 1.09274352, + "epoch": 0.3437860715659869, + "flos": 495527569920.0, + "grad_norm": 0.06899034270313556, + "language_loss": 0.83409935, + "learning_rate": 0.0007632784029886026, + "loss": 0.84544241, + "num_input_tokens_seen": 147866960, + "router_z_loss_mlp": 0.41552734, + "step": 1787, + "time_per_iteration": 2.6218347549438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140121, + "balance_loss_mlp": 1.09968519, + "epoch": 0.3439784532512505, + "flos": 718274594304.0, + "grad_norm": 0.05777013506444436, + "language_loss": 0.85674673, + "learning_rate": 0.0007630134973358873, + "loss": 0.86814797, + "num_input_tokens_seen": 147947808, + "router_z_loss_mlp": 0.40429688, + "step": 1788, + "time_per_iteration": 2.9675180912017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156957, + "balance_loss_mlp": 1.11780846, + "epoch": 0.34417083493651407, + "flos": 565862091264.0, + "grad_norm": 0.11323624876812292, + "language_loss": 0.86969185, + "learning_rate": 0.0007627484895722763, + "loss": 0.88126147, + "num_input_tokens_seen": 148015936, + "router_z_loss_mlp": 0.39160156, + "step": 1789, + "time_per_iteration": 2.6400198936462402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164783, + "balance_loss_mlp": 1.1222018, + "epoch": 0.3443632166217776, + "flos": 796330414080.0, + "grad_norm": 0.06957715435201431, + "language_loss": 0.80509681, + "learning_rate": 0.0007624833798006552, + "loss": 0.81674469, + "num_input_tokens_seen": 148099776, + "router_z_loss_mlp": 0.42602539, + "step": 1790, + "time_per_iteration": 3.042621374130249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162497, + "balance_loss_mlp": 1.11924767, + "epoch": 0.3445555983070412, + "flos": 569313805824.0, + "grad_norm": 0.09367673394256656, + "language_loss": 0.84194326, + "learning_rate": 0.0007622181681239483, + "loss": 0.85356832, + "num_input_tokens_seen": 148169616, + "router_z_loss_mlp": 0.43261719, + "step": 1791, + "time_per_iteration": 2.642648220062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140416, + "balance_loss_mlp": 1.09907472, + "epoch": 0.3447479799923047, + "flos": 568814565888.0, + "grad_norm": 0.07487034842421487, + "language_loss": 0.84962463, + "learning_rate": 0.0007619528546451202, + "loss": 0.86102873, + "num_input_tokens_seen": 148247824, + "router_z_loss_mlp": 0.41333008, + "step": 1792, + "time_per_iteration": 2.8014347553253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130282, + "balance_loss_mlp": 1.08941662, + "epoch": 0.3449403616775683, + "flos": 967723863552.0, + "grad_norm": 0.05771787988130437, + "language_loss": 0.84187096, + "learning_rate": 0.0007616874394671745, + "loss": 0.85317373, + "num_input_tokens_seen": 148333040, + "router_z_loss_mlp": 0.40869141, + "step": 1793, + "time_per_iteration": 3.336076498031616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137422, + "balance_loss_mlp": 1.09276664, + "epoch": 0.34513274336283184, + "flos": 568607164416.0, + "grad_norm": 0.08239177777048284, + "language_loss": 0.85433841, + "learning_rate": 0.0007614219226931547, + "loss": 0.86571258, + "num_input_tokens_seen": 148401840, + "router_z_loss_mlp": 0.44677734, + "step": 1794, + "time_per_iteration": 2.6596035957336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136792, + "balance_loss_mlp": 1.0951401, + "epoch": 0.3453251250480954, + "flos": 460943612928.0, + "grad_norm": 0.06809904369873732, + "language_loss": 0.85092592, + "learning_rate": 0.0007611563044261435, + "loss": 0.86229378, + "num_input_tokens_seen": 148466576, + "router_z_loss_mlp": 0.41674805, + "step": 1795, + "time_per_iteration": 2.545440435409546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140576, + "balance_loss_mlp": 1.09601521, + "epoch": 0.34551750673335896, + "flos": 415621269504.0, + "grad_norm": 0.08865061616635866, + "language_loss": 0.8722235, + "learning_rate": 0.0007608905847692631, + "loss": 0.88362932, + "num_input_tokens_seen": 148530016, + "router_z_loss_mlp": 0.44555664, + "step": 1796, + "time_per_iteration": 2.471306800842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112492, + "balance_loss_mlp": 1.08486605, + "epoch": 0.34570988841862255, + "flos": 587854609920.0, + "grad_norm": 0.07442154430907115, + "language_loss": 0.86828166, + "learning_rate": 0.0007606247638256749, + "loss": 0.87953079, + "num_input_tokens_seen": 148610064, + "router_z_loss_mlp": 0.40039062, + "step": 1797, + "time_per_iteration": 2.8728272914886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045486, + "balance_loss_mlp": 1.03099036, + "epoch": 0.34590227010388613, + "flos": 1567694518272.0, + "grad_norm": 0.022391201486326673, + "language_loss": 0.78170294, + "learning_rate": 0.0007603588416985798, + "loss": 0.79215777, + "num_input_tokens_seen": 148835872, + "router_z_loss_mlp": 0.14453125, + "step": 1798, + "time_per_iteration": 4.99533486366272 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036587, + "balance_loss_mlp": 1.0224725, + "epoch": 0.34609465178914967, + "flos": 1537743564288.0, + "grad_norm": 0.020693498138200886, + "language_loss": 0.79327202, + "learning_rate": 0.0007600928184912179, + "loss": 0.80363786, + "num_input_tokens_seen": 149066864, + "router_z_loss_mlp": 0.14160156, + "step": 1799, + "time_per_iteration": 4.871920347213745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131321, + "balance_loss_mlp": 1.086761, + "epoch": 0.34628703347441325, + "flos": 609363316224.0, + "grad_norm": 0.06425687332848114, + "language_loss": 0.8622126, + "learning_rate": 0.0007598266943068686, + "loss": 0.8735258, + "num_input_tokens_seen": 149141600, + "router_z_loss_mlp": 0.44555664, + "step": 1800, + "time_per_iteration": 2.7352967262268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128705, + "balance_loss_mlp": 1.0892942, + "epoch": 0.3464794151596768, + "flos": 473319936000.0, + "grad_norm": 0.06122285990583016, + "language_loss": 0.84089196, + "learning_rate": 0.0007595604692488507, + "loss": 0.85217899, + "num_input_tokens_seen": 149205888, + "router_z_loss_mlp": 0.39404297, + "step": 1801, + "time_per_iteration": 2.520047664642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145052, + "balance_loss_mlp": 1.10182643, + "epoch": 0.34667179684494037, + "flos": 605681805312.0, + "grad_norm": 0.08959882775364528, + "language_loss": 0.83156121, + "learning_rate": 0.0007592941434205215, + "loss": 0.84301168, + "num_input_tokens_seen": 149281280, + "router_z_loss_mlp": 0.43237305, + "step": 1802, + "time_per_iteration": 2.774533987045288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102388, + "balance_loss_mlp": 1.01191127, + "epoch": 0.3468641785302039, + "flos": 1564912369152.0, + "grad_norm": 0.0173366039721641, + "language_loss": 0.73571062, + "learning_rate": 0.0007590277169252782, + "loss": 0.74594939, + "num_input_tokens_seen": 149525008, + "router_z_loss_mlp": 0.11962891, + "step": 1803, + "time_per_iteration": 5.441190004348755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130945, + "balance_loss_mlp": 1.08481145, + "epoch": 0.3470565602154675, + "flos": 907265442816.0, + "grad_norm": 0.07392614166366455, + "language_loss": 0.80754089, + "learning_rate": 0.0007587611898665566, + "loss": 0.81885034, + "num_input_tokens_seen": 149600624, + "router_z_loss_mlp": 0.4609375, + "step": 1804, + "time_per_iteration": 3.0738565921783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126694, + "balance_loss_mlp": 1.08320653, + "epoch": 0.347248941900731, + "flos": 638902038528.0, + "grad_norm": 0.052717282161679486, + "language_loss": 0.82365519, + "learning_rate": 0.0007584945623478315, + "loss": 0.83492208, + "num_input_tokens_seen": 149674224, + "router_z_loss_mlp": 0.43530273, + "step": 1805, + "time_per_iteration": 2.810065269470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130527, + "balance_loss_mlp": 1.08773112, + "epoch": 0.3474413235859946, + "flos": 847362788352.0, + "grad_norm": 0.0654216117506123, + "language_loss": 0.81839657, + "learning_rate": 0.000758227834472617, + "loss": 0.8297019, + "num_input_tokens_seen": 149758688, + "router_z_loss_mlp": 0.42822266, + "step": 1806, + "time_per_iteration": 3.0400753021240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129234, + "balance_loss_mlp": 1.08631909, + "epoch": 0.3476337052712582, + "flos": 515654926848.0, + "grad_norm": 0.06780310502945991, + "language_loss": 0.77468187, + "learning_rate": 0.0007579610063444664, + "loss": 0.78597426, + "num_input_tokens_seen": 149831648, + "router_z_loss_mlp": 0.42895508, + "step": 1807, + "time_per_iteration": 2.720200538635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123608, + "balance_loss_mlp": 1.0805254, + "epoch": 0.34782608695652173, + "flos": 913551063552.0, + "grad_norm": 0.056464817781099026, + "language_loss": 0.87875664, + "learning_rate": 0.0007576940780669712, + "loss": 0.88999271, + "num_input_tokens_seen": 149919440, + "router_z_loss_mlp": 0.4309082, + "step": 1808, + "time_per_iteration": 3.1972455978393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119319, + "balance_loss_mlp": 1.07723832, + "epoch": 0.3480184686417853, + "flos": 773714944512.0, + "grad_norm": 0.06350201854913072, + "language_loss": 0.84194762, + "learning_rate": 0.0007574270497437624, + "loss": 0.85314083, + "num_input_tokens_seen": 150001632, + "router_z_loss_mlp": 0.42089844, + "step": 1809, + "time_per_iteration": 2.956308364868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112252, + "balance_loss_mlp": 1.08036816, + "epoch": 0.34821085032704885, + "flos": 576839812608.0, + "grad_norm": 0.05949268624371524, + "language_loss": 0.88030243, + "learning_rate": 0.000757159921478509, + "loss": 0.89152765, + "num_input_tokens_seen": 150077552, + "router_z_loss_mlp": 0.42138672, + "step": 1810, + "time_per_iteration": 2.7515318393707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058755, + "balance_loss_mlp": 1.04769194, + "epoch": 0.34840323201231244, + "flos": 1524947295744.0, + "grad_norm": 0.027450813841054106, + "language_loss": 0.74450636, + "learning_rate": 0.0007568926933749201, + "loss": 0.75509393, + "num_input_tokens_seen": 150295328, + "router_z_loss_mlp": 0.11083984, + "step": 1811, + "time_per_iteration": 4.719837427139282 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136641, + "balance_loss_mlp": 1.09272385, + "epoch": 0.34859561369757597, + "flos": 509164102656.0, + "grad_norm": 0.06099509375847796, + "language_loss": 0.87676752, + "learning_rate": 0.0007566253655367423, + "loss": 0.88813394, + "num_input_tokens_seen": 150360496, + "router_z_loss_mlp": 0.43896484, + "step": 1812, + "time_per_iteration": 2.6117310523986816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145498, + "balance_loss_mlp": 1.10196316, + "epoch": 0.34878799538283956, + "flos": 548662616064.0, + "grad_norm": 0.26075237363376164, + "language_loss": 0.90086293, + "learning_rate": 0.000756357938067762, + "loss": 0.91231787, + "num_input_tokens_seen": 150432064, + "router_z_loss_mlp": 0.43554688, + "step": 1813, + "time_per_iteration": 2.6537845134735107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137235, + "balance_loss_mlp": 1.09305573, + "epoch": 0.34898037706810314, + "flos": 983638536192.0, + "grad_norm": 0.07803772738029488, + "language_loss": 0.8299284, + "learning_rate": 0.0007560904110718033, + "loss": 0.84130079, + "num_input_tokens_seen": 150512176, + "router_z_loss_mlp": 0.44165039, + "step": 1814, + "time_per_iteration": 3.2229981422424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131299, + "balance_loss_mlp": 1.08549881, + "epoch": 0.3491727587533667, + "flos": 681605217792.0, + "grad_norm": 0.06602375994559181, + "language_loss": 0.83648008, + "learning_rate": 0.0007558227846527297, + "loss": 0.8477931, + "num_input_tokens_seen": 150586416, + "router_z_loss_mlp": 0.45751953, + "step": 1815, + "time_per_iteration": 2.8217966556549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137186, + "balance_loss_mlp": 1.09300709, + "epoch": 0.34936514043863026, + "flos": 394026301440.0, + "grad_norm": 0.06552880481969095, + "language_loss": 0.83563447, + "learning_rate": 0.0007555550589144429, + "loss": 0.84700632, + "num_input_tokens_seen": 150648944, + "router_z_loss_mlp": 0.44189453, + "step": 1816, + "time_per_iteration": 2.4231276512145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148289, + "balance_loss_mlp": 1.1026082, + "epoch": 0.3495575221238938, + "flos": 461363558400.0, + "grad_norm": 0.05960251663438414, + "language_loss": 0.84705317, + "learning_rate": 0.000755287233960883, + "loss": 0.85853606, + "num_input_tokens_seen": 150717200, + "router_z_loss_mlp": 0.45678711, + "step": 1817, + "time_per_iteration": 2.5598244667053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148754, + "balance_loss_mlp": 1.10297787, + "epoch": 0.3497499038091574, + "flos": 724172576256.0, + "grad_norm": 0.06564730471203778, + "language_loss": 0.78051704, + "learning_rate": 0.0007550193098960292, + "loss": 0.79200459, + "num_input_tokens_seen": 150790368, + "router_z_loss_mlp": 0.45751953, + "step": 1818, + "time_per_iteration": 2.8570642471313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115418, + "balance_loss_mlp": 1.11033523, + "epoch": 0.3499422854944209, + "flos": 827729528832.0, + "grad_norm": 0.05538445579726575, + "language_loss": 0.8654325, + "learning_rate": 0.0007547512868238988, + "loss": 0.87697428, + "num_input_tokens_seen": 150879872, + "router_z_loss_mlp": 0.43847656, + "step": 1819, + "time_per_iteration": 3.1437833309173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170578, + "balance_loss_mlp": 1.12499213, + "epoch": 0.3501346671796845, + "flos": 493479226368.0, + "grad_norm": 0.0822966351911203, + "language_loss": 0.83893883, + "learning_rate": 0.0007544831648485473, + "loss": 0.85064459, + "num_input_tokens_seen": 150953712, + "router_z_loss_mlp": 0.45605469, + "step": 1820, + "time_per_iteration": 2.660233736038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162235, + "balance_loss_mlp": 1.11684048, + "epoch": 0.35032704886494803, + "flos": 578752335360.0, + "grad_norm": 0.06443547558053964, + "language_loss": 0.81439716, + "learning_rate": 0.0007542149440740694, + "loss": 0.82601953, + "num_input_tokens_seen": 151026192, + "router_z_loss_mlp": 0.45385742, + "step": 1821, + "time_per_iteration": 2.6618528366088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154684, + "balance_loss_mlp": 1.10938418, + "epoch": 0.3505194305502116, + "flos": 584672338944.0, + "grad_norm": 0.06960442221541481, + "language_loss": 0.86201102, + "learning_rate": 0.000753946624604597, + "loss": 0.87355781, + "num_input_tokens_seen": 151100720, + "router_z_loss_mlp": 0.45288086, + "step": 1822, + "time_per_iteration": 2.7180583477020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138697, + "balance_loss_mlp": 1.09466076, + "epoch": 0.3507118122354752, + "flos": 526958991360.0, + "grad_norm": 0.11840223630221765, + "language_loss": 0.88456279, + "learning_rate": 0.0007536782065443015, + "loss": 0.89594972, + "num_input_tokens_seen": 151166032, + "router_z_loss_mlp": 0.44042969, + "step": 1823, + "time_per_iteration": 2.6035680770874023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147734, + "balance_loss_mlp": 1.1024822, + "epoch": 0.35090419392073874, + "flos": 511523735040.0, + "grad_norm": 0.08971754998357863, + "language_loss": 0.75357497, + "learning_rate": 0.0007534096899973919, + "loss": 0.76505232, + "num_input_tokens_seen": 151232208, + "router_z_loss_mlp": 0.45263672, + "step": 1824, + "time_per_iteration": 2.592313528060913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136405, + "balance_loss_mlp": 1.095397, + "epoch": 0.3510965756060023, + "flos": 564021149184.0, + "grad_norm": 0.056380284358423516, + "language_loss": 0.8296026, + "learning_rate": 0.0007531410750681154, + "loss": 0.84096658, + "num_input_tokens_seen": 151308128, + "router_z_loss_mlp": 0.41015625, + "step": 1825, + "time_per_iteration": 2.7599031925201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149352, + "balance_loss_mlp": 1.10710466, + "epoch": 0.35128895729126586, + "flos": 1020535137792.0, + "grad_norm": 0.06329210930184016, + "language_loss": 0.8686763, + "learning_rate": 0.0007528723618607575, + "loss": 0.88016987, + "num_input_tokens_seen": 151402560, + "router_z_loss_mlp": 0.42236328, + "step": 1826, + "time_per_iteration": 3.423145055770874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156709, + "balance_loss_mlp": 1.11808527, + "epoch": 0.35148133897652944, + "flos": 588262445568.0, + "grad_norm": 0.05752886424443174, + "language_loss": 0.8293525, + "learning_rate": 0.0007526035504796422, + "loss": 0.84091961, + "num_input_tokens_seen": 151478816, + "router_z_loss_mlp": 0.38598633, + "step": 1827, + "time_per_iteration": 2.774202346801758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164193, + "balance_loss_mlp": 1.12080038, + "epoch": 0.351673720661793, + "flos": 495300344832.0, + "grad_norm": 0.08334994788856638, + "language_loss": 0.87348354, + "learning_rate": 0.0007523346410291312, + "loss": 0.8851254, + "num_input_tokens_seen": 151554528, + "router_z_loss_mlp": 0.43408203, + "step": 1828, + "time_per_iteration": 2.7933921813964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172191, + "balance_loss_mlp": 1.13127816, + "epoch": 0.35186610234705656, + "flos": 762670411776.0, + "grad_norm": 0.05847449829546615, + "language_loss": 0.85163879, + "learning_rate": 0.0007520656336136245, + "loss": 0.86336064, + "num_input_tokens_seen": 151629440, + "router_z_loss_mlp": 0.40942383, + "step": 1829, + "time_per_iteration": 2.9654810428619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167386, + "balance_loss_mlp": 1.12675905, + "epoch": 0.3520584840323201, + "flos": 626135132160.0, + "grad_norm": 0.06508844853371867, + "language_loss": 0.88540596, + "learning_rate": 0.0007517965283375599, + "loss": 0.89707983, + "num_input_tokens_seen": 151708544, + "router_z_loss_mlp": 0.40625, + "step": 1830, + "time_per_iteration": 2.833653211593628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161789, + "balance_loss_mlp": 1.12078059, + "epoch": 0.3522508657175837, + "flos": 537388286976.0, + "grad_norm": 0.05306701185260888, + "language_loss": 0.89636958, + "learning_rate": 0.0007515273253054132, + "loss": 0.90798748, + "num_input_tokens_seen": 151779152, + "router_z_loss_mlp": 0.41015625, + "step": 1831, + "time_per_iteration": 2.648688554763794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162371, + "balance_loss_mlp": 1.11788237, + "epoch": 0.35244324740284727, + "flos": 567384030720.0, + "grad_norm": 0.060637132075448665, + "language_loss": 0.8317945, + "learning_rate": 0.0007512580246216988, + "loss": 0.84341824, + "num_input_tokens_seen": 151853216, + "router_z_loss_mlp": 0.44482422, + "step": 1832, + "time_per_iteration": 2.695558786392212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152178, + "balance_loss_mlp": 1.11288619, + "epoch": 0.3526356290881108, + "flos": 513058157568.0, + "grad_norm": 0.06652239867864222, + "language_loss": 0.8520152, + "learning_rate": 0.000750988626390968, + "loss": 0.86353695, + "num_input_tokens_seen": 151920416, + "router_z_loss_mlp": 0.39306641, + "step": 1833, + "time_per_iteration": 2.5903215408325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114923, + "balance_loss_mlp": 1.10810232, + "epoch": 0.3528280107733744, + "flos": 595791023616.0, + "grad_norm": 0.05520517467567221, + "language_loss": 0.85274744, + "learning_rate": 0.0007507191307178108, + "loss": 0.86423969, + "num_input_tokens_seen": 151990848, + "router_z_loss_mlp": 0.41137695, + "step": 1834, + "time_per_iteration": 2.7567453384399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132557, + "balance_loss_mlp": 1.0890696, + "epoch": 0.3530203924586379, + "flos": 551234792448.0, + "grad_norm": 0.06897138795442613, + "language_loss": 0.75032014, + "learning_rate": 0.0007504495377068543, + "loss": 0.76164567, + "num_input_tokens_seen": 152064864, + "router_z_loss_mlp": 0.43481445, + "step": 1835, + "time_per_iteration": 2.7309370040893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134622, + "balance_loss_mlp": 1.08972788, + "epoch": 0.3532127741439015, + "flos": 652990450176.0, + "grad_norm": 0.09099083327189633, + "language_loss": 0.81936944, + "learning_rate": 0.0007501798474627642, + "loss": 0.8307156, + "num_input_tokens_seen": 152150096, + "router_z_loss_mlp": 0.44873047, + "step": 1836, + "time_per_iteration": 2.9126806259155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113171, + "balance_loss_mlp": 1.08853245, + "epoch": 0.35340515582916504, + "flos": 722791226880.0, + "grad_norm": 0.058808043239055564, + "language_loss": 0.8375026, + "learning_rate": 0.0007499100600902433, + "loss": 0.84881973, + "num_input_tokens_seen": 152232528, + "router_z_loss_mlp": 0.43164062, + "step": 1837, + "time_per_iteration": 2.9810633659362793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124171, + "balance_loss_mlp": 1.08118403, + "epoch": 0.35359753751442863, + "flos": 594894233088.0, + "grad_norm": 0.08552727697149294, + "language_loss": 0.8450433, + "learning_rate": 0.0007496401756940324, + "loss": 0.85628498, + "num_input_tokens_seen": 152299584, + "router_z_loss_mlp": 0.43017578, + "step": 1838, + "time_per_iteration": 2.670412540435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130914, + "balance_loss_mlp": 1.08897638, + "epoch": 0.3537899191996922, + "flos": 632668174848.0, + "grad_norm": 0.06964876492363449, + "language_loss": 0.82608843, + "learning_rate": 0.0007493701943789098, + "loss": 0.83739758, + "num_input_tokens_seen": 152370368, + "router_z_loss_mlp": 0.41967773, + "step": 1839, + "time_per_iteration": 2.772620677947998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113855, + "balance_loss_mlp": 1.09537208, + "epoch": 0.35398230088495575, + "flos": 506364701184.0, + "grad_norm": 0.07045943234490067, + "language_loss": 0.83116889, + "learning_rate": 0.000749100116249692, + "loss": 0.84255433, + "num_input_tokens_seen": 152436928, + "router_z_loss_mlp": 0.43188477, + "step": 1840, + "time_per_iteration": 2.6031582355499268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144616, + "balance_loss_mlp": 1.10110414, + "epoch": 0.35417468257021933, + "flos": 508034944512.0, + "grad_norm": 0.08424265710124153, + "language_loss": 0.86582088, + "learning_rate": 0.0007488299414112321, + "loss": 0.87726706, + "num_input_tokens_seen": 152505952, + "router_z_loss_mlp": 0.43505859, + "step": 1841, + "time_per_iteration": 2.5864784717559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151277, + "balance_loss_mlp": 1.10726476, + "epoch": 0.35436706425548287, + "flos": 656437395456.0, + "grad_norm": 0.058600000923872894, + "language_loss": 0.77847576, + "learning_rate": 0.0007485596699684215, + "loss": 0.78998852, + "num_input_tokens_seen": 152577408, + "router_z_loss_mlp": 0.43994141, + "step": 1842, + "time_per_iteration": 2.8149642944335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156484, + "balance_loss_mlp": 1.11266279, + "epoch": 0.35455944594074645, + "flos": 652634744832.0, + "grad_norm": 0.055073821734726955, + "language_loss": 0.85694617, + "learning_rate": 0.000748289302026189, + "loss": 0.86851102, + "num_input_tokens_seen": 152654480, + "router_z_loss_mlp": 0.43823242, + "step": 1843, + "time_per_iteration": 2.8475751876831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158372, + "balance_loss_mlp": 1.11688685, + "epoch": 0.35475182762601, + "flos": 848593262592.0, + "grad_norm": 0.057565803102883874, + "language_loss": 0.85718876, + "learning_rate": 0.0007480188376895004, + "loss": 0.86877251, + "num_input_tokens_seen": 152732304, + "router_z_loss_mlp": 0.41479492, + "step": 1844, + "time_per_iteration": 3.0344529151916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140478, + "balance_loss_mlp": 1.12693632, + "epoch": 0.3549442093112736, + "flos": 1521468043776.0, + "grad_norm": 0.05127204690943662, + "language_loss": 0.7381134, + "learning_rate": 0.0007477482770633596, + "loss": 0.74951822, + "num_input_tokens_seen": 152965952, + "router_z_loss_mlp": 0.13574219, + "step": 1845, + "time_per_iteration": 4.8589537143707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176931, + "balance_loss_mlp": 1.13518405, + "epoch": 0.3551365909965371, + "flos": 651411611136.0, + "grad_norm": 0.08988090291235612, + "language_loss": 0.78641856, + "learning_rate": 0.0007474776202528074, + "loss": 0.79818785, + "num_input_tokens_seen": 153053088, + "router_z_loss_mlp": 0.41772461, + "step": 1846, + "time_per_iteration": 2.9269866943359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184559, + "balance_loss_mlp": 1.14243031, + "epoch": 0.3553289726818007, + "flos": 897458724864.0, + "grad_norm": 0.08000045078310114, + "language_loss": 0.81513619, + "learning_rate": 0.000747206867362922, + "loss": 0.82698178, + "num_input_tokens_seen": 153129216, + "router_z_loss_mlp": 0.42114258, + "step": 1847, + "time_per_iteration": 3.067870616912842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169442, + "balance_loss_mlp": 1.12573957, + "epoch": 0.3555213543670643, + "flos": 688491394560.0, + "grad_norm": 0.0760432300690223, + "language_loss": 0.84328806, + "learning_rate": 0.0007469360184988194, + "loss": 0.85498255, + "num_input_tokens_seen": 153199360, + "router_z_loss_mlp": 0.43701172, + "step": 1848, + "time_per_iteration": 2.8130369186401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159569, + "balance_loss_mlp": 1.11837053, + "epoch": 0.3557137360523278, + "flos": 538564432896.0, + "grad_norm": 0.08168000095068725, + "language_loss": 0.86707914, + "learning_rate": 0.0007466650737656518, + "loss": 0.87867486, + "num_input_tokens_seen": 153269168, + "router_z_loss_mlp": 0.41162109, + "step": 1849, + "time_per_iteration": 2.592503309249878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115621, + "balance_loss_mlp": 1.11324644, + "epoch": 0.3559061177375914, + "flos": 402261520896.0, + "grad_norm": 0.06757272046168854, + "language_loss": 0.89898217, + "learning_rate": 0.0007463940332686098, + "loss": 0.91054422, + "num_input_tokens_seen": 153333120, + "router_z_loss_mlp": 0.42944336, + "step": 1850, + "time_per_iteration": 2.4776744842529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148398, + "balance_loss_mlp": 1.10607898, + "epoch": 0.35609849942285493, + "flos": 696568398336.0, + "grad_norm": 0.05922624538442341, + "language_loss": 0.84461212, + "learning_rate": 0.0007461228971129205, + "loss": 0.85609609, + "num_input_tokens_seen": 153407600, + "router_z_loss_mlp": 0.42358398, + "step": 1851, + "time_per_iteration": 2.9012656211853027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154694, + "balance_loss_mlp": 1.11387658, + "epoch": 0.3562908811081185, + "flos": 568928365056.0, + "grad_norm": 0.058626739978073765, + "language_loss": 0.85743707, + "learning_rate": 0.0007458516654038483, + "loss": 0.86898398, + "num_input_tokens_seen": 153477408, + "router_z_loss_mlp": 0.40820312, + "step": 1852, + "time_per_iteration": 2.666947603225708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165665, + "balance_loss_mlp": 1.12160563, + "epoch": 0.35648326279338205, + "flos": 682386011136.0, + "grad_norm": 0.06798765543406252, + "language_loss": 0.86475062, + "learning_rate": 0.0007455803382466946, + "loss": 0.87640727, + "num_input_tokens_seen": 153551888, + "router_z_loss_mlp": 0.44042969, + "step": 1853, + "time_per_iteration": 2.804776191711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162987, + "balance_loss_mlp": 1.11985719, + "epoch": 0.35667564447864564, + "flos": 629139737088.0, + "grad_norm": 0.07311152518110202, + "language_loss": 0.87308323, + "learning_rate": 0.0007453089157467979, + "loss": 0.88471317, + "num_input_tokens_seen": 153626912, + "router_z_loss_mlp": 0.43139648, + "step": 1854, + "time_per_iteration": 2.8038864135742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159292, + "balance_loss_mlp": 1.1161381, + "epoch": 0.35686802616390917, + "flos": 814048579584.0, + "grad_norm": 0.06621845487790666, + "language_loss": 0.82129812, + "learning_rate": 0.0007450373980095341, + "loss": 0.83289105, + "num_input_tokens_seen": 153711312, + "router_z_loss_mlp": 0.43164062, + "step": 1855, + "time_per_iteration": 3.0980496406555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154286, + "balance_loss_mlp": 1.11268187, + "epoch": 0.35706040784917276, + "flos": 526178198016.0, + "grad_norm": 0.05908088829108725, + "language_loss": 0.87076378, + "learning_rate": 0.0007447657851403155, + "loss": 0.88230669, + "num_input_tokens_seen": 153780208, + "router_z_loss_mlp": 0.41601562, + "step": 1856, + "time_per_iteration": 2.6393351554870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148054, + "balance_loss_mlp": 1.10609269, + "epoch": 0.35725278953443634, + "flos": 511970844672.0, + "grad_norm": 0.07116077808597938, + "language_loss": 0.79415643, + "learning_rate": 0.0007444940772445915, + "loss": 0.805637, + "num_input_tokens_seen": 153853152, + "router_z_loss_mlp": 0.41943359, + "step": 1857, + "time_per_iteration": 2.7049038410186768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148544, + "balance_loss_mlp": 1.10770321, + "epoch": 0.3574451712196999, + "flos": 487428171264.0, + "grad_norm": 0.06303496934817837, + "language_loss": 0.80443203, + "learning_rate": 0.0007442222744278484, + "loss": 0.81591749, + "num_input_tokens_seen": 153924160, + "router_z_loss_mlp": 0.40844727, + "step": 1858, + "time_per_iteration": 2.6416029930114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140624, + "balance_loss_mlp": 1.10056937, + "epoch": 0.35763755290496346, + "flos": 550671312384.0, + "grad_norm": 0.06290523981550739, + "language_loss": 0.84690839, + "learning_rate": 0.0007439503767956099, + "loss": 0.85831463, + "num_input_tokens_seen": 153998688, + "router_z_loss_mlp": 0.40063477, + "step": 1859, + "time_per_iteration": 2.697295665740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095769, + "balance_loss_mlp": 1.08213139, + "epoch": 0.357829934590227, + "flos": 1504083561984.0, + "grad_norm": 0.02707100394521806, + "language_loss": 0.79671603, + "learning_rate": 0.0007436783844534352, + "loss": 0.80767375, + "num_input_tokens_seen": 154230960, + "router_z_loss_mlp": 0.13671875, + "step": 1860, + "time_per_iteration": 4.896381139755249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157881, + "balance_loss_mlp": 1.11744571, + "epoch": 0.3580223162754906, + "flos": 568695997440.0, + "grad_norm": 0.054355964588402354, + "language_loss": 0.86204398, + "learning_rate": 0.000743406297506922, + "loss": 0.87362283, + "num_input_tokens_seen": 154309104, + "router_z_loss_mlp": 0.40478516, + "step": 1861, + "time_per_iteration": 2.7121450901031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154988, + "balance_loss_mlp": 1.11362243, + "epoch": 0.3582146979607541, + "flos": 626473585152.0, + "grad_norm": 0.056412092641732435, + "language_loss": 0.8442747, + "learning_rate": 0.0007431341160617031, + "loss": 0.85582459, + "num_input_tokens_seen": 154387424, + "router_z_loss_mlp": 0.41381836, + "step": 1862, + "time_per_iteration": 2.902806520462036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172251, + "balance_loss_mlp": 1.13052833, + "epoch": 0.3584070796460177, + "flos": 507271403520.0, + "grad_norm": 0.06986467819319542, + "language_loss": 0.88734752, + "learning_rate": 0.0007428618402234491, + "loss": 0.89907002, + "num_input_tokens_seen": 154459952, + "router_z_loss_mlp": 0.41723633, + "step": 1863, + "time_per_iteration": 2.644352436065674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159657, + "balance_loss_mlp": 1.11831546, + "epoch": 0.3585994613312813, + "flos": 606479851008.0, + "grad_norm": 0.06293448628505635, + "language_loss": 0.8061077, + "learning_rate": 0.0007425894700978668, + "loss": 0.81770432, + "num_input_tokens_seen": 154535456, + "router_z_loss_mlp": 0.41357422, + "step": 1864, + "time_per_iteration": 2.782757043838501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148168, + "balance_loss_mlp": 1.10699308, + "epoch": 0.3587918430165448, + "flos": 1412886799872.0, + "grad_norm": 0.056888458094662434, + "language_loss": 0.79858804, + "learning_rate": 0.0007423170057906996, + "loss": 0.81006974, + "num_input_tokens_seen": 154627568, + "router_z_loss_mlp": 0.41162109, + "step": 1865, + "time_per_iteration": 3.848773956298828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133926, + "balance_loss_mlp": 1.09391952, + "epoch": 0.3589842247018084, + "flos": 478553121792.0, + "grad_norm": 0.06447904861600703, + "language_loss": 0.86500657, + "learning_rate": 0.0007420444474077275, + "loss": 0.87634581, + "num_input_tokens_seen": 154694640, + "router_z_loss_mlp": 0.40014648, + "step": 1866, + "time_per_iteration": 2.542572498321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126566, + "balance_loss_mlp": 1.0855341, + "epoch": 0.35917660638707194, + "flos": 504711710208.0, + "grad_norm": 0.07300351460408123, + "language_loss": 0.8986578, + "learning_rate": 0.0007417717950547671, + "loss": 0.90992349, + "num_input_tokens_seen": 154762048, + "router_z_loss_mlp": 0.41040039, + "step": 1867, + "time_per_iteration": 2.5633254051208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073925, + "balance_loss_mlp": 1.06143153, + "epoch": 0.3593689880723355, + "flos": 1492129382400.0, + "grad_norm": 0.026482390846264015, + "language_loss": 0.75996608, + "learning_rate": 0.0007414990488376713, + "loss": 0.77070534, + "num_input_tokens_seen": 154989952, + "router_z_loss_mlp": 0.125, + "step": 1868, + "time_per_iteration": 4.904905557632446 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111694, + "balance_loss_mlp": 1.07345176, + "epoch": 0.35956136975759906, + "flos": 528629234688.0, + "grad_norm": 0.053992922509511466, + "language_loss": 0.850173, + "learning_rate": 0.0007412262088623299, + "loss": 0.86128998, + "num_input_tokens_seen": 155066992, + "router_z_loss_mlp": 0.38232422, + "step": 1869, + "time_per_iteration": 2.7310874462127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110773, + "balance_loss_mlp": 1.07200575, + "epoch": 0.35975375144286265, + "flos": 534917426688.0, + "grad_norm": 0.08370102618564679, + "language_loss": 0.79675972, + "learning_rate": 0.0007409532752346684, + "loss": 0.80786741, + "num_input_tokens_seen": 155137616, + "router_z_loss_mlp": 0.38769531, + "step": 1870, + "time_per_iteration": 2.6629347801208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110981, + "balance_loss_mlp": 1.07166612, + "epoch": 0.3599461331281262, + "flos": 504941506560.0, + "grad_norm": 0.06403903481871269, + "language_loss": 0.88829064, + "learning_rate": 0.0007406802480606491, + "loss": 0.89940047, + "num_input_tokens_seen": 155209248, + "router_z_loss_mlp": 0.39306641, + "step": 1871, + "time_per_iteration": 2.6200008392333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107121, + "balance_loss_mlp": 1.06835461, + "epoch": 0.36013851481338977, + "flos": 511533646848.0, + "grad_norm": 0.0729370697679506, + "language_loss": 0.90798759, + "learning_rate": 0.0007404071274462707, + "loss": 0.9190588, + "num_input_tokens_seen": 155274176, + "router_z_loss_mlp": 0.38769531, + "step": 1872, + "time_per_iteration": 2.5693628787994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111805, + "balance_loss_mlp": 1.07978415, + "epoch": 0.36033089649865335, + "flos": 547590357504.0, + "grad_norm": 0.06627703814726228, + "language_loss": 0.84024733, + "learning_rate": 0.0007401339134975682, + "loss": 0.85142779, + "num_input_tokens_seen": 155343232, + "router_z_loss_mlp": 0.38208008, + "step": 1873, + "time_per_iteration": 2.7031140327453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127585, + "balance_loss_mlp": 1.08760262, + "epoch": 0.3605232781839169, + "flos": 458655561216.0, + "grad_norm": 0.06845959531373838, + "language_loss": 0.84298885, + "learning_rate": 0.0007398606063206122, + "loss": 0.85426462, + "num_input_tokens_seen": 155410080, + "router_z_loss_mlp": 0.39990234, + "step": 1874, + "time_per_iteration": 2.6090316772460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115185, + "balance_loss_mlp": 1.07598901, + "epoch": 0.36071565986918047, + "flos": 509559455232.0, + "grad_norm": 0.06521397848462201, + "language_loss": 0.78764814, + "learning_rate": 0.0007395872060215101, + "loss": 0.79879999, + "num_input_tokens_seen": 155476240, + "router_z_loss_mlp": 0.3918457, + "step": 1875, + "time_per_iteration": 2.620976448059082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122162, + "balance_loss_mlp": 1.0831089, + "epoch": 0.360908041554444, + "flos": 559195799040.0, + "grad_norm": 0.06345733178575377, + "language_loss": 0.88705117, + "learning_rate": 0.0007393137127064056, + "loss": 0.89827275, + "num_input_tokens_seen": 155543392, + "router_z_loss_mlp": 0.39013672, + "step": 1876, + "time_per_iteration": 2.7320597171783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125006, + "balance_loss_mlp": 1.08511841, + "epoch": 0.3611004232397076, + "flos": 523845729792.0, + "grad_norm": 0.056097062255587686, + "language_loss": 0.84576774, + "learning_rate": 0.0007390401264814779, + "loss": 0.85701776, + "num_input_tokens_seen": 155613264, + "router_z_loss_mlp": 0.39868164, + "step": 1877, + "time_per_iteration": 2.605865478515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123607, + "balance_loss_mlp": 1.08503079, + "epoch": 0.3612928049249711, + "flos": 540988305408.0, + "grad_norm": 0.06159732683880817, + "language_loss": 0.84937686, + "learning_rate": 0.0007387664474529427, + "loss": 0.86061299, + "num_input_tokens_seen": 155683712, + "router_z_loss_mlp": 0.38598633, + "step": 1878, + "time_per_iteration": 2.6548514366149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138034, + "balance_loss_mlp": 1.09750319, + "epoch": 0.3614851866102347, + "flos": 552556670976.0, + "grad_norm": 0.05796680079252983, + "language_loss": 0.91768891, + "learning_rate": 0.0007384926757270518, + "loss": 0.92906928, + "num_input_tokens_seen": 155751760, + "router_z_loss_mlp": 0.40527344, + "step": 1879, + "time_per_iteration": 2.6339149475097656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137039, + "balance_loss_mlp": 1.09791493, + "epoch": 0.36167756829549824, + "flos": 772071865344.0, + "grad_norm": 0.05405313293747941, + "language_loss": 0.79881001, + "learning_rate": 0.0007382188114100924, + "loss": 0.81018037, + "num_input_tokens_seen": 155830464, + "router_z_loss_mlp": 0.39111328, + "step": 1880, + "time_per_iteration": 2.983384132385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139197, + "balance_loss_mlp": 1.09964395, + "epoch": 0.36186994998076183, + "flos": 711885086208.0, + "grad_norm": 0.12141150358978081, + "language_loss": 0.82206392, + "learning_rate": 0.0007379448546083884, + "loss": 0.83345592, + "num_input_tokens_seen": 155906208, + "router_z_loss_mlp": 0.39575195, + "step": 1881, + "time_per_iteration": 2.9186532497406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140707, + "balance_loss_mlp": 1.10127282, + "epoch": 0.3620623316660254, + "flos": 747546444288.0, + "grad_norm": 0.06284373597557333, + "language_loss": 0.88377333, + "learning_rate": 0.0007376708054282992, + "loss": 0.8951804, + "num_input_tokens_seen": 155983584, + "router_z_loss_mlp": 0.39428711, + "step": 1882, + "time_per_iteration": 2.9895970821380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144635, + "balance_loss_mlp": 1.10605919, + "epoch": 0.36225471335128895, + "flos": 482555833344.0, + "grad_norm": 0.05224621202588268, + "language_loss": 0.84316945, + "learning_rate": 0.0007373966639762201, + "loss": 0.85461575, + "num_input_tokens_seen": 156052464, + "router_z_loss_mlp": 0.38574219, + "step": 1883, + "time_per_iteration": 2.623133659362793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147786, + "balance_loss_mlp": 1.10620606, + "epoch": 0.36244709503655254, + "flos": 506905786368.0, + "grad_norm": 0.06751899300287477, + "language_loss": 0.89170045, + "learning_rate": 0.0007371224303585822, + "loss": 0.90317833, + "num_input_tokens_seen": 156121424, + "router_z_loss_mlp": 0.41577148, + "step": 1884, + "time_per_iteration": 2.628394842147827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021984, + "balance_loss_mlp": 1.01154125, + "epoch": 0.36263947672181607, + "flos": 1394050466304.0, + "grad_norm": 0.007236456832270123, + "language_loss": 0.80357069, + "learning_rate": 0.0007368481046818524, + "loss": 0.8137905, + "num_input_tokens_seen": 156346144, + "router_z_loss_mlp": 0.10449219, + "step": 1885, + "time_per_iteration": 4.717620849609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114112, + "balance_loss_mlp": 1.10049307, + "epoch": 0.36283185840707965, + "flos": 653296969728.0, + "grad_norm": 0.057116908748179596, + "language_loss": 0.82560247, + "learning_rate": 0.0007365736870525335, + "loss": 0.83701366, + "num_input_tokens_seen": 156420880, + "router_z_loss_mlp": 0.40625, + "step": 1886, + "time_per_iteration": 2.8198611736297607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132227, + "balance_loss_mlp": 1.09310222, + "epoch": 0.3630242400923432, + "flos": 488863848960.0, + "grad_norm": 0.06530442713985495, + "language_loss": 0.83123338, + "learning_rate": 0.000736299177577164, + "loss": 0.84255564, + "num_input_tokens_seen": 156485616, + "router_z_loss_mlp": 0.39135742, + "step": 1887, + "time_per_iteration": 2.613863945007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128864, + "balance_loss_mlp": 1.08992994, + "epoch": 0.3632166217776068, + "flos": 517159613952.0, + "grad_norm": 0.0666501464088242, + "language_loss": 0.84363097, + "learning_rate": 0.0007360245763623174, + "loss": 0.85491955, + "num_input_tokens_seen": 156557840, + "router_z_loss_mlp": 0.3894043, + "step": 1888, + "time_per_iteration": 2.6378068923950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115221, + "balance_loss_mlp": 1.07702661, + "epoch": 0.36340900346287036, + "flos": 646173656064.0, + "grad_norm": 0.06993226621121658, + "language_loss": 0.90142351, + "learning_rate": 0.0007357498835146039, + "loss": 0.91257572, + "num_input_tokens_seen": 156632496, + "router_z_loss_mlp": 0.38183594, + "step": 1889, + "time_per_iteration": 2.8125081062316895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128167, + "balance_loss_mlp": 1.08878016, + "epoch": 0.3636013851481339, + "flos": 553327552512.0, + "grad_norm": 0.07359030033413445, + "language_loss": 0.87316656, + "learning_rate": 0.0007354750991406684, + "loss": 0.88444823, + "num_input_tokens_seen": 156705296, + "router_z_loss_mlp": 0.39379883, + "step": 1890, + "time_per_iteration": 2.714569568634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121285, + "balance_loss_mlp": 1.0807066, + "epoch": 0.3637937668333975, + "flos": 546653919744.0, + "grad_norm": 0.07836036923074335, + "language_loss": 0.80991101, + "learning_rate": 0.0007352002233471919, + "loss": 0.8211239, + "num_input_tokens_seen": 156773376, + "router_z_loss_mlp": 0.40576172, + "step": 1891, + "time_per_iteration": 2.6287412643432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121974, + "balance_loss_mlp": 1.08180022, + "epoch": 0.363986148518661, + "flos": 538112180736.0, + "grad_norm": 0.058839902089765785, + "language_loss": 0.79524523, + "learning_rate": 0.0007349252562408906, + "loss": 0.80646491, + "num_input_tokens_seen": 156844336, + "router_z_loss_mlp": 0.40161133, + "step": 1892, + "time_per_iteration": 2.669903039932251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125098, + "balance_loss_mlp": 1.08449531, + "epoch": 0.3641785302039246, + "flos": 660217651200.0, + "grad_norm": 0.057079030651025625, + "language_loss": 0.81590033, + "learning_rate": 0.0007346501979285158, + "loss": 0.8271513, + "num_input_tokens_seen": 156918848, + "router_z_loss_mlp": 0.40600586, + "step": 1893, + "time_per_iteration": 2.9146764278411865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083448, + "balance_loss_mlp": 1.07238543, + "epoch": 0.36437091188918813, + "flos": 1468743031296.0, + "grad_norm": 0.036364529291757694, + "language_loss": 0.80539101, + "learning_rate": 0.0007343750485168551, + "loss": 0.81622547, + "num_input_tokens_seen": 157134736, + "router_z_loss_mlp": 0.11083984, + "step": 1894, + "time_per_iteration": 4.784435272216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126267, + "balance_loss_mlp": 1.08444858, + "epoch": 0.3645632935744517, + "flos": 597298281984.0, + "grad_norm": 0.06549610472034906, + "language_loss": 0.86352968, + "learning_rate": 0.0007340998081127308, + "loss": 0.87479234, + "num_input_tokens_seen": 157211920, + "router_z_loss_mlp": 0.41796875, + "step": 1895, + "time_per_iteration": 2.7702367305755615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130662, + "balance_loss_mlp": 1.09113181, + "epoch": 0.36475567525971525, + "flos": 599509610496.0, + "grad_norm": 0.06520113052193731, + "language_loss": 0.91046786, + "learning_rate": 0.0007338244768230007, + "loss": 0.92177445, + "num_input_tokens_seen": 157284224, + "router_z_loss_mlp": 0.39550781, + "step": 1896, + "time_per_iteration": 2.7612760066986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133468, + "balance_loss_mlp": 1.09315181, + "epoch": 0.36494805694497884, + "flos": 798403350528.0, + "grad_norm": 0.058734972315737245, + "language_loss": 0.89108521, + "learning_rate": 0.0007335490547545578, + "loss": 0.90241992, + "num_input_tokens_seen": 157367920, + "router_z_loss_mlp": 0.40307617, + "step": 1897, + "time_per_iteration": 3.024462938308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135084, + "balance_loss_mlp": 1.09343266, + "epoch": 0.3651404386302424, + "flos": 637313287680.0, + "grad_norm": 0.06208128991116815, + "language_loss": 0.82833707, + "learning_rate": 0.0007332735420143308, + "loss": 0.83968788, + "num_input_tokens_seen": 157438672, + "router_z_loss_mlp": 0.41650391, + "step": 1898, + "time_per_iteration": 2.725468158721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112873, + "balance_loss_mlp": 1.08669686, + "epoch": 0.36533282031550596, + "flos": 491581757952.0, + "grad_norm": 0.09645190116324148, + "language_loss": 0.86573303, + "learning_rate": 0.0007329979387092826, + "loss": 0.8770203, + "num_input_tokens_seen": 157505888, + "router_z_loss_mlp": 0.42016602, + "step": 1899, + "time_per_iteration": 2.6357531547546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133626, + "balance_loss_mlp": 1.09259379, + "epoch": 0.36552520200076954, + "flos": 855970965504.0, + "grad_norm": 0.06150604002201611, + "language_loss": 0.84294677, + "learning_rate": 0.0007327222449464124, + "loss": 0.85428298, + "num_input_tokens_seen": 157601568, + "router_z_loss_mlp": 0.41040039, + "step": 1900, + "time_per_iteration": 3.2381174564361572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136855, + "balance_loss_mlp": 1.09382069, + "epoch": 0.3657175836860331, + "flos": 483702243840.0, + "grad_norm": 0.07567830151973255, + "language_loss": 0.89052904, + "learning_rate": 0.0007324464608327538, + "loss": 0.90189761, + "num_input_tokens_seen": 157670992, + "router_z_loss_mlp": 0.4309082, + "step": 1901, + "time_per_iteration": 2.597569227218628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150007, + "balance_loss_mlp": 1.10814035, + "epoch": 0.36590996537129666, + "flos": 434792365056.0, + "grad_norm": 0.07712085030005716, + "language_loss": 0.88794601, + "learning_rate": 0.0007321705864753758, + "loss": 0.89944601, + "num_input_tokens_seen": 157743616, + "router_z_loss_mlp": 0.41870117, + "step": 1902, + "time_per_iteration": 2.6877686977386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151954, + "balance_loss_mlp": 1.11097002, + "epoch": 0.3661023470565602, + "flos": 712206286848.0, + "grad_norm": 0.05591922142148154, + "language_loss": 0.84586883, + "learning_rate": 0.0007318946219813823, + "loss": 0.85738844, + "num_input_tokens_seen": 157823520, + "router_z_loss_mlp": 0.40991211, + "step": 1903, + "time_per_iteration": 3.0283257961273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151611, + "balance_loss_mlp": 1.11341679, + "epoch": 0.3662947287418238, + "flos": 564760097280.0, + "grad_norm": 0.0702623940180467, + "language_loss": 0.90117764, + "learning_rate": 0.000731618567457912, + "loss": 0.91269374, + "num_input_tokens_seen": 157893248, + "router_z_loss_mlp": 0.38208008, + "step": 1904, + "time_per_iteration": 2.651491165161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114788, + "balance_loss_mlp": 1.10522676, + "epoch": 0.3664871104270873, + "flos": 789752954880.0, + "grad_norm": 0.07047012066076976, + "language_loss": 0.87036794, + "learning_rate": 0.000731342423012139, + "loss": 0.88184673, + "num_input_tokens_seen": 157973216, + "router_z_loss_mlp": 0.42700195, + "step": 1905, + "time_per_iteration": 3.0361618995666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143776, + "balance_loss_mlp": 1.10331631, + "epoch": 0.3666794921123509, + "flos": 752557174272.0, + "grad_norm": 0.06969182334255739, + "language_loss": 0.82982039, + "learning_rate": 0.0007310661887512722, + "loss": 0.84125817, + "num_input_tokens_seen": 158051088, + "router_z_loss_mlp": 0.40478516, + "step": 1906, + "time_per_iteration": 3.020333766937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134716, + "balance_loss_mlp": 1.09592557, + "epoch": 0.3668718737976145, + "flos": 523531869696.0, + "grad_norm": 0.056548054453958524, + "language_loss": 0.82503444, + "learning_rate": 0.0007307898647825549, + "loss": 0.83638155, + "num_input_tokens_seen": 158124368, + "router_z_loss_mlp": 0.38793945, + "step": 1907, + "time_per_iteration": 2.6819958686828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128623, + "balance_loss_mlp": 1.08568358, + "epoch": 0.367064255482878, + "flos": 571967474688.0, + "grad_norm": 0.0662764931561561, + "language_loss": 0.89910614, + "learning_rate": 0.0007305134512132659, + "loss": 0.9103924, + "num_input_tokens_seen": 158191472, + "router_z_loss_mlp": 0.42944336, + "step": 1908, + "time_per_iteration": 2.688716411590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120758, + "balance_loss_mlp": 1.08063269, + "epoch": 0.3672566371681416, + "flos": 447114359808.0, + "grad_norm": 0.07972147303822336, + "language_loss": 0.83329952, + "learning_rate": 0.0007302369481507183, + "loss": 0.8445071, + "num_input_tokens_seen": 158254384, + "router_z_loss_mlp": 0.40136719, + "step": 1909, + "time_per_iteration": 2.520551919937134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043123, + "balance_loss_mlp": 1.03272831, + "epoch": 0.36744901885340514, + "flos": 1540090713600.0, + "grad_norm": 0.028970701382128577, + "language_loss": 0.79961759, + "learning_rate": 0.00072996035570226, + "loss": 0.81004882, + "num_input_tokens_seen": 158486160, + "router_z_loss_mlp": 0.10400391, + "step": 1910, + "time_per_iteration": 4.862990140914917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111533, + "balance_loss_mlp": 1.07534695, + "epoch": 0.36764140053866873, + "flos": 563685267456.0, + "grad_norm": 0.0535153553246422, + "language_loss": 0.85860741, + "learning_rate": 0.000729683673975274, + "loss": 0.86976075, + "num_input_tokens_seen": 158555616, + "router_z_loss_mlp": 0.3996582, + "step": 1911, + "time_per_iteration": 2.6834514141082764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117796, + "balance_loss_mlp": 1.07783747, + "epoch": 0.36783378222393226, + "flos": 1216663981056.0, + "grad_norm": 0.07394300555179863, + "language_loss": 0.83108044, + "learning_rate": 0.0007294069030771774, + "loss": 0.84225845, + "num_input_tokens_seen": 158653984, + "router_z_loss_mlp": 0.39941406, + "step": 1912, + "time_per_iteration": 3.6458523273468018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124936, + "balance_loss_mlp": 1.08483398, + "epoch": 0.36802616390919585, + "flos": 498724895232.0, + "grad_norm": 0.05916806609098389, + "language_loss": 0.90897858, + "learning_rate": 0.0007291300431154224, + "loss": 0.920228, + "num_input_tokens_seen": 158719728, + "router_z_loss_mlp": 0.40112305, + "step": 1913, + "time_per_iteration": 2.5737557411193848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0103288, + "balance_loss_mlp": 1.02157927, + "epoch": 0.36821854559445943, + "flos": 1582146349056.0, + "grad_norm": 0.013681752942923219, + "language_loss": 0.70389736, + "learning_rate": 0.0007288530941974955, + "loss": 0.71422619, + "num_input_tokens_seen": 158952544, + "router_z_loss_mlp": 0.11279297, + "step": 1914, + "time_per_iteration": 5.031456232070923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113953, + "balance_loss_mlp": 1.07499564, + "epoch": 0.36841092727972297, + "flos": 835626295296.0, + "grad_norm": 0.06158754254944219, + "language_loss": 0.79961407, + "learning_rate": 0.0007285760564309179, + "loss": 0.81075364, + "num_input_tokens_seen": 159039680, + "router_z_loss_mlp": 0.38964844, + "step": 1915, + "time_per_iteration": 3.152339458465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122924, + "balance_loss_mlp": 1.08346629, + "epoch": 0.36860330896498655, + "flos": 689855118336.0, + "grad_norm": 0.10197178679971165, + "language_loss": 0.85308397, + "learning_rate": 0.0007282989299232448, + "loss": 0.86431319, + "num_input_tokens_seen": 159128128, + "router_z_loss_mlp": 0.39453125, + "step": 1916, + "time_per_iteration": 3.0152268409729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119949, + "balance_loss_mlp": 1.08013296, + "epoch": 0.3687956906502501, + "flos": 554182497792.0, + "grad_norm": 0.05980283450468872, + "language_loss": 0.8385278, + "learning_rate": 0.0007280217147820668, + "loss": 0.84972733, + "num_input_tokens_seen": 159193248, + "router_z_loss_mlp": 0.39794922, + "step": 1917, + "time_per_iteration": 2.625802755355835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114962, + "balance_loss_mlp": 1.07512259, + "epoch": 0.3689880723355137, + "flos": 576703991808.0, + "grad_norm": 0.06755957483710798, + "language_loss": 0.79489267, + "learning_rate": 0.0007277444111150079, + "loss": 0.80604231, + "num_input_tokens_seen": 159265824, + "router_z_loss_mlp": 0.3984375, + "step": 1918, + "time_per_iteration": 2.6753525733947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112846, + "balance_loss_mlp": 1.08785725, + "epoch": 0.3691804540207772, + "flos": 528868942848.0, + "grad_norm": 0.07157808177363079, + "language_loss": 0.84730321, + "learning_rate": 0.0007274670190297272, + "loss": 0.8585878, + "num_input_tokens_seen": 159332992, + "router_z_loss_mlp": 0.40576172, + "step": 1919, + "time_per_iteration": 2.6149959564208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142135, + "balance_loss_mlp": 1.09986341, + "epoch": 0.3693728357060408, + "flos": 561019115520.0, + "grad_norm": 0.05944559747374387, + "language_loss": 0.8264004, + "learning_rate": 0.0007271895386339179, + "loss": 0.83782172, + "num_input_tokens_seen": 159409808, + "router_z_loss_mlp": 0.42285156, + "step": 1920, + "time_per_iteration": 2.7611513137817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140712, + "balance_loss_mlp": 1.09970427, + "epoch": 0.3695652173913043, + "flos": 579770265600.0, + "grad_norm": 0.059089751588204814, + "language_loss": 0.83542717, + "learning_rate": 0.0007269119700353073, + "loss": 0.8468343, + "num_input_tokens_seen": 159486128, + "router_z_loss_mlp": 0.41015625, + "step": 1921, + "time_per_iteration": 2.782167911529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148229, + "balance_loss_mlp": 1.10738814, + "epoch": 0.3697575990765679, + "flos": 512914622976.0, + "grad_norm": 0.06644949508392005, + "language_loss": 0.85268104, + "learning_rate": 0.0007266343133416571, + "loss": 0.8641634, + "num_input_tokens_seen": 159562224, + "router_z_loss_mlp": 0.40844727, + "step": 1922, + "time_per_iteration": 2.7218997478485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076086, + "balance_loss_mlp": 1.06340241, + "epoch": 0.3699499807618315, + "flos": 1570640025600.0, + "grad_norm": 0.03214674667569998, + "language_loss": 0.77116919, + "learning_rate": 0.0007263565686607632, + "loss": 0.78192997, + "num_input_tokens_seen": 159784768, + "router_z_loss_mlp": 0.12695312, + "step": 1923, + "time_per_iteration": 4.837427854537964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145902, + "balance_loss_mlp": 1.1028676, + "epoch": 0.37014236244709503, + "flos": 497338776576.0, + "grad_norm": 0.07518583721861193, + "language_loss": 0.84417462, + "learning_rate": 0.0007260787361004556, + "loss": 0.85563368, + "num_input_tokens_seen": 159848608, + "router_z_loss_mlp": 0.43041992, + "step": 1924, + "time_per_iteration": 2.5874598026275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050683, + "balance_loss_mlp": 1.03880954, + "epoch": 0.3703347441323586, + "flos": 1444368485376.0, + "grad_norm": 0.023888622594867324, + "language_loss": 0.73761505, + "learning_rate": 0.0007258008157685987, + "loss": 0.74812186, + "num_input_tokens_seen": 160080928, + "router_z_loss_mlp": 0.11865234, + "step": 1925, + "time_per_iteration": 4.961286544799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137865, + "balance_loss_mlp": 1.09571242, + "epoch": 0.37052712581762215, + "flos": 563601203712.0, + "grad_norm": 0.05584746966952834, + "language_loss": 0.87657702, + "learning_rate": 0.0007255228077730903, + "loss": 0.88795567, + "num_input_tokens_seen": 160148976, + "router_z_loss_mlp": 0.42163086, + "step": 1926, + "time_per_iteration": 2.663482666015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140781, + "balance_loss_mlp": 1.09786606, + "epoch": 0.37071950750288574, + "flos": 926078261760.0, + "grad_norm": 0.05562014185368244, + "language_loss": 0.81976974, + "learning_rate": 0.0007252447122218632, + "loss": 0.83117759, + "num_input_tokens_seen": 160233504, + "router_z_loss_mlp": 0.42919922, + "step": 1927, + "time_per_iteration": 3.1484758853912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138853, + "balance_loss_mlp": 1.09655809, + "epoch": 0.37091188918814927, + "flos": 418312014336.0, + "grad_norm": 0.06601877155853234, + "language_loss": 0.88791764, + "learning_rate": 0.0007249665292228834, + "loss": 0.89930612, + "num_input_tokens_seen": 160299696, + "router_z_loss_mlp": 0.4230957, + "step": 1928, + "time_per_iteration": 2.5840864181518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140563, + "balance_loss_mlp": 1.09872091, + "epoch": 0.37110427087341286, + "flos": 463182105600.0, + "grad_norm": 0.05314866644458525, + "language_loss": 0.83534646, + "learning_rate": 0.000724688258884151, + "loss": 0.84675211, + "num_input_tokens_seen": 160367904, + "router_z_loss_mlp": 0.41845703, + "step": 1929, + "time_per_iteration": 2.6063482761383057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129765, + "balance_loss_mlp": 1.09166527, + "epoch": 0.3712966525586764, + "flos": 849658180608.0, + "grad_norm": 0.06946275153671234, + "language_loss": 0.86767673, + "learning_rate": 0.0007244099013137002, + "loss": 0.87897444, + "num_input_tokens_seen": 160453600, + "router_z_loss_mlp": 0.38085938, + "step": 1930, + "time_per_iteration": 3.0539071559906006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127594, + "balance_loss_mlp": 1.0873971, + "epoch": 0.37148903424394, + "flos": 925954550784.0, + "grad_norm": 0.05696415350586704, + "language_loss": 0.89040637, + "learning_rate": 0.0007241314566195993, + "loss": 0.90168232, + "num_input_tokens_seen": 160543472, + "router_z_loss_mlp": 0.40185547, + "step": 1931, + "time_per_iteration": 3.2625389099121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111855, + "balance_loss_mlp": 1.07861531, + "epoch": 0.37168141592920356, + "flos": 519815854080.0, + "grad_norm": 0.08463017827171934, + "language_loss": 0.85909784, + "learning_rate": 0.0007238529249099496, + "loss": 0.87028337, + "num_input_tokens_seen": 160614016, + "router_z_loss_mlp": 0.39941406, + "step": 1932, + "time_per_iteration": 2.6740944385528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101582, + "balance_loss_mlp": 1.09080601, + "epoch": 0.3718737976144671, + "flos": 1445895567360.0, + "grad_norm": 0.046016525030599324, + "language_loss": 0.77856874, + "learning_rate": 0.0007235743062928872, + "loss": 0.78958464, + "num_input_tokens_seen": 160828640, + "router_z_loss_mlp": 0.10791016, + "step": 1933, + "time_per_iteration": 4.862685203552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125745, + "balance_loss_mlp": 1.08347321, + "epoch": 0.3720661792997307, + "flos": 759564490752.0, + "grad_norm": 0.10032321862894769, + "language_loss": 0.80747449, + "learning_rate": 0.000723295600876581, + "loss": 0.81873196, + "num_input_tokens_seen": 160913088, + "router_z_loss_mlp": 0.42285156, + "step": 1934, + "time_per_iteration": 2.990391969680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125218, + "balance_loss_mlp": 1.08406699, + "epoch": 0.3722585609849942, + "flos": 516956981760.0, + "grad_norm": 0.057414096803471676, + "language_loss": 0.87956464, + "learning_rate": 0.0007230168087692344, + "loss": 0.89081681, + "num_input_tokens_seen": 160982960, + "router_z_loss_mlp": 0.41162109, + "step": 1935, + "time_per_iteration": 2.656625270843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119009, + "balance_loss_mlp": 1.07924092, + "epoch": 0.3724509426702578, + "flos": 782464084992.0, + "grad_norm": 0.060205825913767164, + "language_loss": 0.82307911, + "learning_rate": 0.0007227379300790839, + "loss": 0.83426917, + "num_input_tokens_seen": 161066000, + "router_z_loss_mlp": 0.39770508, + "step": 1936, + "time_per_iteration": 2.997037649154663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114267, + "balance_loss_mlp": 1.07218599, + "epoch": 0.37264332435552133, + "flos": 391720997376.0, + "grad_norm": 0.06128365804507508, + "language_loss": 0.86067426, + "learning_rate": 0.0007224589649143997, + "loss": 0.87181687, + "num_input_tokens_seen": 161131040, + "router_z_loss_mlp": 0.4206543, + "step": 1937, + "time_per_iteration": 2.5290677547454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124508, + "balance_loss_mlp": 1.08228397, + "epoch": 0.3728357060407849, + "flos": 542861180928.0, + "grad_norm": 0.06605047879793914, + "language_loss": 0.81297445, + "learning_rate": 0.0007221799133834861, + "loss": 0.82421947, + "num_input_tokens_seen": 161201248, + "router_z_loss_mlp": 0.42236328, + "step": 1938, + "time_per_iteration": 2.613140106201172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122203, + "balance_loss_mlp": 1.08195794, + "epoch": 0.3730280877260485, + "flos": 433571802624.0, + "grad_norm": 0.09318016716004435, + "language_loss": 0.8198092, + "learning_rate": 0.00072190077559468, + "loss": 0.83103126, + "num_input_tokens_seen": 161266288, + "router_z_loss_mlp": 0.40209961, + "step": 1939, + "time_per_iteration": 2.517237424850464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115818, + "balance_loss_mlp": 1.07578754, + "epoch": 0.37322046941131204, + "flos": 531485535744.0, + "grad_norm": 0.0553068133661429, + "language_loss": 0.8932575, + "learning_rate": 0.0007216215516563527, + "loss": 0.90441567, + "num_input_tokens_seen": 161335648, + "router_z_loss_mlp": 0.40014648, + "step": 1940, + "time_per_iteration": 2.7175915241241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111901, + "balance_loss_mlp": 1.07089305, + "epoch": 0.3734128510965756, + "flos": 531549775872.0, + "grad_norm": 0.06982995582267476, + "language_loss": 0.83827746, + "learning_rate": 0.0007213422416769083, + "loss": 0.84939647, + "num_input_tokens_seen": 161403440, + "router_z_loss_mlp": 0.41015625, + "step": 1941, + "time_per_iteration": 2.5922279357910156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116664, + "balance_loss_mlp": 1.07684803, + "epoch": 0.37360523278183916, + "flos": 500442126336.0, + "grad_norm": 0.050249137281424494, + "language_loss": 0.75479639, + "learning_rate": 0.0007210628457647849, + "loss": 0.76596296, + "num_input_tokens_seen": 161472864, + "router_z_loss_mlp": 0.39819336, + "step": 1942, + "time_per_iteration": 2.583151340484619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118751, + "balance_loss_mlp": 1.07781446, + "epoch": 0.37379761446710275, + "flos": 547943491584.0, + "grad_norm": 0.0794488438004998, + "language_loss": 0.79022861, + "learning_rate": 0.000720783364028453, + "loss": 0.8014161, + "num_input_tokens_seen": 161548096, + "router_z_loss_mlp": 0.40942383, + "step": 1943, + "time_per_iteration": 2.7737677097320557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114071, + "balance_loss_mlp": 1.07418346, + "epoch": 0.3739899961523663, + "flos": 475761060864.0, + "grad_norm": 0.05694655733140731, + "language_loss": 0.87941283, + "learning_rate": 0.0007205037965764177, + "loss": 0.89055347, + "num_input_tokens_seen": 161615600, + "router_z_loss_mlp": 0.39868164, + "step": 1944, + "time_per_iteration": 2.558089256286621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123121, + "balance_loss_mlp": 1.08430672, + "epoch": 0.37418237783762986, + "flos": 611915668992.0, + "grad_norm": 0.07621334150317126, + "language_loss": 0.85730159, + "learning_rate": 0.0007202241435172161, + "loss": 0.86853278, + "num_input_tokens_seen": 161687408, + "router_z_loss_mlp": 0.38769531, + "step": 1945, + "time_per_iteration": 2.7602779865264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125439, + "balance_loss_mlp": 1.08574176, + "epoch": 0.3743747595228934, + "flos": 766287682560.0, + "grad_norm": 0.07927003262790512, + "language_loss": 0.88465476, + "learning_rate": 0.0007199444049594198, + "loss": 0.89590919, + "num_input_tokens_seen": 161764224, + "router_z_loss_mlp": 0.39697266, + "step": 1946, + "time_per_iteration": 2.9583580493927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119027, + "balance_loss_mlp": 1.07665968, + "epoch": 0.374567141208157, + "flos": 524394155520.0, + "grad_norm": 0.055396154938164174, + "language_loss": 0.8346498, + "learning_rate": 0.0007196645810116322, + "loss": 0.8458401, + "num_input_tokens_seen": 161835520, + "router_z_loss_mlp": 0.42382812, + "step": 1947, + "time_per_iteration": 2.6851320266723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131178, + "balance_loss_mlp": 1.09045637, + "epoch": 0.37475952289342057, + "flos": 681375421440.0, + "grad_norm": 0.05889971918419499, + "language_loss": 0.84302223, + "learning_rate": 0.0007193846717824912, + "loss": 0.854334, + "num_input_tokens_seen": 161912000, + "router_z_loss_mlp": 0.40698242, + "step": 1948, + "time_per_iteration": 2.9035325050354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129423, + "balance_loss_mlp": 1.08848619, + "epoch": 0.3749519045786841, + "flos": 460291299840.0, + "grad_norm": 0.07994215642664601, + "language_loss": 0.88549483, + "learning_rate": 0.0007191046773806669, + "loss": 0.89678907, + "num_input_tokens_seen": 161977296, + "router_z_loss_mlp": 0.40942383, + "step": 1949, + "time_per_iteration": 2.574697256088257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135159, + "balance_loss_mlp": 1.09224343, + "epoch": 0.3751442862639477, + "flos": 954853443072.0, + "grad_norm": 0.07615017139071276, + "language_loss": 0.8356899, + "learning_rate": 0.0007188245979148631, + "loss": 0.84704149, + "num_input_tokens_seen": 162051888, + "router_z_loss_mlp": 0.42919922, + "step": 1950, + "time_per_iteration": 3.216397285461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137761, + "balance_loss_mlp": 1.09475029, + "epoch": 0.3753366679492112, + "flos": 527747125248.0, + "grad_norm": 0.061651705216508604, + "language_loss": 0.87894762, + "learning_rate": 0.0007185444334938157, + "loss": 0.89032525, + "num_input_tokens_seen": 162124384, + "router_z_loss_mlp": 0.43041992, + "step": 1951, + "time_per_iteration": 2.6782584190368652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127424, + "balance_loss_mlp": 1.08879972, + "epoch": 0.3755290496344748, + "flos": 521797386240.0, + "grad_norm": 0.07782676746029546, + "language_loss": 0.84900033, + "learning_rate": 0.0007182641842262947, + "loss": 0.86027455, + "num_input_tokens_seen": 162191440, + "router_z_loss_mlp": 0.38647461, + "step": 1952, + "time_per_iteration": 2.639446258544922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125752, + "balance_loss_mlp": 1.08603168, + "epoch": 0.37572143131973834, + "flos": 621121830912.0, + "grad_norm": 0.05954692469221933, + "language_loss": 0.78027642, + "learning_rate": 0.0007179838502211022, + "loss": 0.79153389, + "num_input_tokens_seen": 162268480, + "router_z_loss_mlp": 0.3972168, + "step": 1953, + "time_per_iteration": 2.84329891204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131364, + "balance_loss_mlp": 1.09028411, + "epoch": 0.37591381300500193, + "flos": 770962530816.0, + "grad_norm": 0.10232430816689406, + "language_loss": 0.86411202, + "learning_rate": 0.0007177034315870738, + "loss": 0.8754257, + "num_input_tokens_seen": 162346752, + "router_z_loss_mlp": 0.41064453, + "step": 1954, + "time_per_iteration": 2.957648992538452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124051, + "balance_loss_mlp": 1.08325803, + "epoch": 0.37610619469026546, + "flos": 520448343552.0, + "grad_norm": 0.06271313302399782, + "language_loss": 0.91398948, + "learning_rate": 0.0007174229284330773, + "loss": 0.92523003, + "num_input_tokens_seen": 162415120, + "router_z_loss_mlp": 0.40795898, + "step": 1955, + "time_per_iteration": 2.5879859924316406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128257, + "balance_loss_mlp": 1.08879828, + "epoch": 0.37629857637552905, + "flos": 598812880896.0, + "grad_norm": 0.06607511431735706, + "language_loss": 0.86850858, + "learning_rate": 0.0007171423408680141, + "loss": 0.87979114, + "num_input_tokens_seen": 162493280, + "router_z_loss_mlp": 0.39453125, + "step": 1956, + "time_per_iteration": 2.7903566360473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123297, + "balance_loss_mlp": 1.08295655, + "epoch": 0.37649095806079264, + "flos": 564952817664.0, + "grad_norm": 0.06886679209235984, + "language_loss": 0.90041375, + "learning_rate": 0.0007168616690008176, + "loss": 0.91164672, + "num_input_tokens_seen": 162560736, + "router_z_loss_mlp": 0.40356445, + "step": 1957, + "time_per_iteration": 2.6327474117279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116463, + "balance_loss_mlp": 1.07705224, + "epoch": 0.37668333974605617, + "flos": 592470360576.0, + "grad_norm": 0.062429689069725576, + "language_loss": 0.85725892, + "learning_rate": 0.0007165809129404545, + "loss": 0.86842352, + "num_input_tokens_seen": 162630688, + "router_z_loss_mlp": 0.39404297, + "step": 1958, + "time_per_iteration": 2.7385900020599365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124898, + "balance_loss_mlp": 1.08527279, + "epoch": 0.37687572143131975, + "flos": 419478248448.0, + "grad_norm": 0.05793527093847313, + "language_loss": 0.85962278, + "learning_rate": 0.0007163000727959239, + "loss": 0.87087178, + "num_input_tokens_seen": 162694304, + "router_z_loss_mlp": 0.39624023, + "step": 1959, + "time_per_iteration": 2.485438585281372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046527, + "balance_loss_mlp": 1.0320313, + "epoch": 0.3770681031165833, + "flos": 1357262148096.0, + "grad_norm": 0.027906108498427614, + "language_loss": 0.77959073, + "learning_rate": 0.0007160191486762575, + "loss": 0.79005599, + "num_input_tokens_seen": 162920336, + "router_z_loss_mlp": 0.14453125, + "step": 1960, + "time_per_iteration": 4.834578275680542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117854, + "balance_loss_mlp": 1.07865775, + "epoch": 0.3772604848018469, + "flos": 644903534592.0, + "grad_norm": 0.05325294699236946, + "language_loss": 0.84349847, + "learning_rate": 0.00071573814069052, + "loss": 0.85467696, + "num_input_tokens_seen": 163000720, + "router_z_loss_mlp": 0.39208984, + "step": 1961, + "time_per_iteration": 2.9086802005767822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120534, + "balance_loss_mlp": 1.08219612, + "epoch": 0.3774528664871104, + "flos": 901651585536.0, + "grad_norm": 0.09498383670658105, + "language_loss": 0.88074362, + "learning_rate": 0.0007154570489478081, + "loss": 0.89194894, + "num_input_tokens_seen": 163085680, + "router_z_loss_mlp": 0.38330078, + "step": 1962, + "time_per_iteration": 3.2217841148376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117183, + "balance_loss_mlp": 1.07889283, + "epoch": 0.377645248172374, + "flos": 788065459200.0, + "grad_norm": 0.05466788938828107, + "language_loss": 0.86278516, + "learning_rate": 0.0007151758735572514, + "loss": 0.87395698, + "num_input_tokens_seen": 163162224, + "router_z_loss_mlp": 0.38232422, + "step": 1963, + "time_per_iteration": 3.01104998588562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130106, + "balance_loss_mlp": 1.08921766, + "epoch": 0.3778376298576376, + "flos": 586718111232.0, + "grad_norm": 0.06218420858169212, + "language_loss": 0.81413925, + "learning_rate": 0.0007148946146280119, + "loss": 0.82544029, + "num_input_tokens_seen": 163237920, + "router_z_loss_mlp": 0.40893555, + "step": 1964, + "time_per_iteration": 2.8039112091064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026607, + "balance_loss_mlp": 1.01440012, + "epoch": 0.3780300115429011, + "flos": 1396743782400.0, + "grad_norm": 0.022738468700431315, + "language_loss": 0.72192144, + "learning_rate": 0.000714613272269284, + "loss": 0.73218751, + "num_input_tokens_seen": 163455760, + "router_z_loss_mlp": 0.12207031, + "step": 1965, + "time_per_iteration": 4.8600172996521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024213, + "balance_loss_mlp": 1.0124352, + "epoch": 0.3782223932281647, + "flos": 1357672555008.0, + "grad_norm": 0.018349030303600054, + "language_loss": 0.75341946, + "learning_rate": 0.0007143318465902943, + "loss": 0.76366156, + "num_input_tokens_seen": 163678064, + "router_z_loss_mlp": 0.11767578, + "step": 1966, + "time_per_iteration": 4.918729782104492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135372, + "balance_loss_mlp": 1.09648633, + "epoch": 0.37841477491342823, + "flos": 704151304704.0, + "grad_norm": 0.2766921299066869, + "language_loss": 0.83812642, + "learning_rate": 0.0007140503377003022, + "loss": 0.84948009, + "num_input_tokens_seen": 163764320, + "router_z_loss_mlp": 0.38891602, + "step": 1967, + "time_per_iteration": 3.015761613845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149326, + "balance_loss_mlp": 1.10862756, + "epoch": 0.3786071565986918, + "flos": 529115991552.0, + "grad_norm": 0.07158509383086724, + "language_loss": 0.8519339, + "learning_rate": 0.000713768745708599, + "loss": 0.8634271, + "num_input_tokens_seen": 163831808, + "router_z_loss_mlp": 0.40698242, + "step": 1968, + "time_per_iteration": 2.6109209060668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140905, + "balance_loss_mlp": 1.09996843, + "epoch": 0.37879953828395535, + "flos": 993277126656.0, + "grad_norm": 0.05954158443482363, + "language_loss": 0.774553, + "learning_rate": 0.0007134870707245085, + "loss": 0.78596205, + "num_input_tokens_seen": 163918128, + "router_z_loss_mlp": 0.40893555, + "step": 1969, + "time_per_iteration": 3.2631757259368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150809, + "balance_loss_mlp": 1.11008716, + "epoch": 0.37899191996921894, + "flos": 626644283904.0, + "grad_norm": 0.05763521765218817, + "language_loss": 0.84313977, + "learning_rate": 0.0007132053128573864, + "loss": 0.85464787, + "num_input_tokens_seen": 163987552, + "router_z_loss_mlp": 0.40698242, + "step": 1970, + "time_per_iteration": 2.7791051864624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143919, + "balance_loss_mlp": 1.10353041, + "epoch": 0.37918430165448247, + "flos": 686307230208.0, + "grad_norm": 0.06905446326925666, + "language_loss": 0.84168518, + "learning_rate": 0.0007129234722166211, + "loss": 0.85312432, + "num_input_tokens_seen": 164063248, + "router_z_loss_mlp": 0.40356445, + "step": 1971, + "time_per_iteration": 2.8210554122924805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149932, + "balance_loss_mlp": 1.11152232, + "epoch": 0.37937668333974606, + "flos": 475622668800.0, + "grad_norm": 0.07023460279096982, + "language_loss": 0.91057038, + "learning_rate": 0.0007126415489116328, + "loss": 0.92206967, + "num_input_tokens_seen": 164133776, + "router_z_loss_mlp": 0.3840332, + "step": 1972, + "time_per_iteration": 2.672755002975464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153021, + "balance_loss_mlp": 1.11210799, + "epoch": 0.37956906502500964, + "flos": 707580997632.0, + "grad_norm": 0.06814261110374484, + "language_loss": 0.81719398, + "learning_rate": 0.0007123595430518736, + "loss": 0.82872415, + "num_input_tokens_seen": 164206672, + "router_z_loss_mlp": 0.40917969, + "step": 1973, + "time_per_iteration": 2.8325109481811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139649, + "balance_loss_mlp": 1.10081029, + "epoch": 0.3797614467102732, + "flos": 426648549888.0, + "grad_norm": 0.06503005991167149, + "language_loss": 0.86840981, + "learning_rate": 0.0007120774547468282, + "loss": 0.87980628, + "num_input_tokens_seen": 164271968, + "router_z_loss_mlp": 0.38793945, + "step": 1974, + "time_per_iteration": 2.6115715503692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148199, + "balance_loss_mlp": 1.10781133, + "epoch": 0.37995382839553676, + "flos": 481846620672.0, + "grad_norm": 0.05441443516000103, + "language_loss": 0.81729043, + "learning_rate": 0.0007117952841060128, + "loss": 0.82877243, + "num_input_tokens_seen": 164342800, + "router_z_loss_mlp": 0.40380859, + "step": 1975, + "time_per_iteration": 2.6378135681152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135454, + "balance_loss_mlp": 1.09389758, + "epoch": 0.3801462100808003, + "flos": 560562094080.0, + "grad_norm": 0.08133175482890537, + "language_loss": 0.83869064, + "learning_rate": 0.0007115130312389756, + "loss": 0.85004514, + "num_input_tokens_seen": 164414928, + "router_z_loss_mlp": 0.41552734, + "step": 1976, + "time_per_iteration": 2.664318084716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139177, + "balance_loss_mlp": 1.0974772, + "epoch": 0.3803385917660639, + "flos": 464936412672.0, + "grad_norm": 0.06620518382871708, + "language_loss": 0.79781663, + "learning_rate": 0.0007112306962552973, + "loss": 0.80920839, + "num_input_tokens_seen": 164483312, + "router_z_loss_mlp": 0.41699219, + "step": 1977, + "time_per_iteration": 2.6198599338531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129586, + "balance_loss_mlp": 1.0891974, + "epoch": 0.3805309734513274, + "flos": 521871538176.0, + "grad_norm": 0.05972767263520316, + "language_loss": 0.85605282, + "learning_rate": 0.0007109482792645896, + "loss": 0.86734867, + "num_input_tokens_seen": 164555760, + "router_z_loss_mlp": 0.40356445, + "step": 1978, + "time_per_iteration": 2.728576898574829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132218, + "balance_loss_mlp": 1.09066188, + "epoch": 0.380723355136591, + "flos": 591412783104.0, + "grad_norm": 0.09572440125940551, + "language_loss": 0.84308225, + "learning_rate": 0.0007106657803764969, + "loss": 0.85440445, + "num_input_tokens_seen": 164626768, + "router_z_loss_mlp": 0.41552734, + "step": 1979, + "time_per_iteration": 2.7279720306396484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126537, + "balance_loss_mlp": 1.08340704, + "epoch": 0.38091573682185453, + "flos": 622685988864.0, + "grad_norm": 0.05862837672704736, + "language_loss": 0.82269728, + "learning_rate": 0.0007103831997006948, + "loss": 0.83396262, + "num_input_tokens_seen": 164698016, + "router_z_loss_mlp": 0.43164062, + "step": 1980, + "time_per_iteration": 2.746915817260742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127621, + "balance_loss_mlp": 1.08663654, + "epoch": 0.3811081185071181, + "flos": 569007286272.0, + "grad_norm": 0.05821983888794681, + "language_loss": 0.85798764, + "learning_rate": 0.0007101005373468908, + "loss": 0.86926389, + "num_input_tokens_seen": 164780320, + "router_z_loss_mlp": 0.40991211, + "step": 1981, + "time_per_iteration": 2.878394365310669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131348, + "balance_loss_mlp": 1.09060264, + "epoch": 0.3813005001923817, + "flos": 584837895168.0, + "grad_norm": 0.057148713710776886, + "language_loss": 0.86977971, + "learning_rate": 0.0007098177934248242, + "loss": 0.88109326, + "num_input_tokens_seen": 164854400, + "router_z_loss_mlp": 0.40771484, + "step": 1982, + "time_per_iteration": 2.7281908988952637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142672, + "balance_loss_mlp": 1.09918451, + "epoch": 0.38149288187764524, + "flos": 621591335424.0, + "grad_norm": 0.07304374640444197, + "language_loss": 0.85583997, + "learning_rate": 0.0007095349680442661, + "loss": 0.86726665, + "num_input_tokens_seen": 164932896, + "router_z_loss_mlp": 0.43505859, + "step": 1983, + "time_per_iteration": 2.831989288330078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132213, + "balance_loss_mlp": 1.09015596, + "epoch": 0.3816852635629088, + "flos": 570690012672.0, + "grad_norm": 0.059661631452858944, + "language_loss": 0.79073238, + "learning_rate": 0.0007092520613150188, + "loss": 0.80205452, + "num_input_tokens_seen": 165002896, + "router_z_loss_mlp": 0.4206543, + "step": 1984, + "time_per_iteration": 2.6566810607910156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136369, + "balance_loss_mlp": 1.09416926, + "epoch": 0.38187764524817236, + "flos": 565585307136.0, + "grad_norm": 0.0624399969319272, + "language_loss": 0.81395054, + "learning_rate": 0.0007089690733469165, + "loss": 0.82531422, + "num_input_tokens_seen": 165074704, + "router_z_loss_mlp": 0.42236328, + "step": 1985, + "time_per_iteration": 2.713041067123413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133128, + "balance_loss_mlp": 1.09023643, + "epoch": 0.38207002693343595, + "flos": 631225156608.0, + "grad_norm": 0.0833415836593691, + "language_loss": 0.83054602, + "learning_rate": 0.000708686004249825, + "loss": 0.84187728, + "num_input_tokens_seen": 165149136, + "router_z_loss_mlp": 0.42895508, + "step": 1986, + "time_per_iteration": 2.7708489894866943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135389, + "balance_loss_mlp": 1.09311724, + "epoch": 0.3822624086186995, + "flos": 548773843968.0, + "grad_norm": 0.050231849807362665, + "language_loss": 0.91983181, + "learning_rate": 0.0007084028541336413, + "loss": 0.93118572, + "num_input_tokens_seen": 165220864, + "router_z_loss_mlp": 0.42260742, + "step": 1987, + "time_per_iteration": 2.7049031257629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135282, + "balance_loss_mlp": 1.09205675, + "epoch": 0.38245479030396307, + "flos": 613870036992.0, + "grad_norm": 0.07987509930436443, + "language_loss": 0.86416399, + "learning_rate": 0.0007081196231082942, + "loss": 0.87551689, + "num_input_tokens_seen": 165301568, + "router_z_loss_mlp": 0.43212891, + "step": 1988, + "time_per_iteration": 2.769559860229492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143055, + "balance_loss_mlp": 1.09949565, + "epoch": 0.38264717198922665, + "flos": 668089824768.0, + "grad_norm": 0.09872496004335095, + "language_loss": 0.80492568, + "learning_rate": 0.0007078363112837436, + "loss": 0.81635618, + "num_input_tokens_seen": 165373152, + "router_z_loss_mlp": 0.43579102, + "step": 1989, + "time_per_iteration": 2.836904525756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144237, + "balance_loss_mlp": 1.10065365, + "epoch": 0.3828395536744902, + "flos": 454754165760.0, + "grad_norm": 0.05755280117815587, + "language_loss": 0.85391158, + "learning_rate": 0.000707552918769981, + "loss": 0.86535394, + "num_input_tokens_seen": 165439136, + "router_z_loss_mlp": 0.43579102, + "step": 1990, + "time_per_iteration": 2.552560806274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114164, + "balance_loss_mlp": 1.09846199, + "epoch": 0.3830319353597538, + "flos": 499448788992.0, + "grad_norm": 0.058237292508227935, + "language_loss": 0.83844453, + "learning_rate": 0.000707269445677029, + "loss": 0.84986091, + "num_input_tokens_seen": 165514624, + "router_z_loss_mlp": 0.43188477, + "step": 1991, + "time_per_iteration": 2.717240571975708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155192, + "balance_loss_mlp": 1.11270583, + "epoch": 0.3832243170450173, + "flos": 744121893888.0, + "grad_norm": 0.08345502818850435, + "language_loss": 0.85774487, + "learning_rate": 0.0007069858921149416, + "loss": 0.86929679, + "num_input_tokens_seen": 165594512, + "router_z_loss_mlp": 0.42480469, + "step": 1992, + "time_per_iteration": 2.937901496887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143498, + "balance_loss_mlp": 1.10120225, + "epoch": 0.3834166987302809, + "flos": 578218590720.0, + "grad_norm": 0.06679457573221616, + "language_loss": 0.86415881, + "learning_rate": 0.0007067022581938043, + "loss": 0.87559378, + "num_input_tokens_seen": 165673968, + "router_z_loss_mlp": 0.4230957, + "step": 1993, + "time_per_iteration": 2.8283159732818604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147458, + "balance_loss_mlp": 1.10614026, + "epoch": 0.3836090804155444, + "flos": 536476442112.0, + "grad_norm": 0.06079929242541683, + "language_loss": 0.83476102, + "learning_rate": 0.0007064185440237334, + "loss": 0.84623557, + "num_input_tokens_seen": 165747664, + "router_z_loss_mlp": 0.41333008, + "step": 1994, + "time_per_iteration": 2.738664150238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148789, + "balance_loss_mlp": 1.10627878, + "epoch": 0.383801462100808, + "flos": 601879154688.0, + "grad_norm": 0.05320553517563596, + "language_loss": 0.8495338, + "learning_rate": 0.0007061347497148764, + "loss": 0.8610217, + "num_input_tokens_seen": 165824624, + "router_z_loss_mlp": 0.42504883, + "step": 1995, + "time_per_iteration": 2.7379775047302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147554, + "balance_loss_mlp": 1.10444832, + "epoch": 0.38399384378607154, + "flos": 572701280256.0, + "grad_norm": 0.059351713178290334, + "language_loss": 0.86747766, + "learning_rate": 0.0007058508753774122, + "loss": 0.87895322, + "num_input_tokens_seen": 165896304, + "router_z_loss_mlp": 0.4309082, + "step": 1996, + "time_per_iteration": 2.6882424354553223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144268, + "balance_loss_mlp": 1.10242534, + "epoch": 0.38418622547133513, + "flos": 536765709312.0, + "grad_norm": 0.08780844300106258, + "language_loss": 0.87086272, + "learning_rate": 0.0007055669211215505, + "loss": 0.88230544, + "num_input_tokens_seen": 165961312, + "router_z_loss_mlp": 0.41870117, + "step": 1997, + "time_per_iteration": 2.5902607440948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136259, + "balance_loss_mlp": 1.09236586, + "epoch": 0.3843786071565987, + "flos": 572940988416.0, + "grad_norm": 0.0743501008638896, + "language_loss": 0.77852333, + "learning_rate": 0.0007052828870575322, + "loss": 0.78988594, + "num_input_tokens_seen": 166028064, + "router_z_loss_mlp": 0.43896484, + "step": 1998, + "time_per_iteration": 2.643887519836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113691, + "balance_loss_mlp": 1.09521055, + "epoch": 0.38457098884186225, + "flos": 728703889920.0, + "grad_norm": 0.05655172042288627, + "language_loss": 0.87035221, + "learning_rate": 0.0007049987732956291, + "loss": 0.88172132, + "num_input_tokens_seen": 166110272, + "router_z_loss_mlp": 0.41723633, + "step": 1999, + "time_per_iteration": 2.9655773639678955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132979, + "balance_loss_mlp": 1.09325886, + "epoch": 0.38476337052712584, + "flos": 583422041088.0, + "grad_norm": 0.061738893850828154, + "language_loss": 0.83046496, + "learning_rate": 0.0007047145799461439, + "loss": 0.84179473, + "num_input_tokens_seen": 166193088, + "router_z_loss_mlp": 0.39746094, + "step": 2000, + "time_per_iteration": 2.8509583473205566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132875, + "balance_loss_mlp": 1.0917958, + "epoch": 0.38495575221238937, + "flos": 553060680192.0, + "grad_norm": 0.06203375299954445, + "language_loss": 0.82530397, + "learning_rate": 0.00070443030711941, + "loss": 0.83663273, + "num_input_tokens_seen": 166271776, + "router_z_loss_mlp": 0.41088867, + "step": 2001, + "time_per_iteration": 2.759324312210083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134639, + "balance_loss_mlp": 1.09386945, + "epoch": 0.38514813389765296, + "flos": 654473115648.0, + "grad_norm": 0.05757301433327453, + "language_loss": 0.83082199, + "learning_rate": 0.0007041459549257924, + "loss": 0.84216839, + "num_input_tokens_seen": 166350000, + "router_z_loss_mlp": 0.40771484, + "step": 2002, + "time_per_iteration": 2.8542449474334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121155, + "balance_loss_mlp": 1.08014655, + "epoch": 0.3853405155829165, + "flos": 868100239872.0, + "grad_norm": 0.07528883527847323, + "language_loss": 0.78547823, + "learning_rate": 0.0007038615234756859, + "loss": 0.79668975, + "num_input_tokens_seen": 166434336, + "router_z_loss_mlp": 0.41015625, + "step": 2003, + "time_per_iteration": 3.211712598800659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125228, + "balance_loss_mlp": 1.08257461, + "epoch": 0.3855328972681801, + "flos": 546424123392.0, + "grad_norm": 0.05751633762771481, + "language_loss": 0.83558142, + "learning_rate": 0.000703577012879517, + "loss": 0.84683371, + "num_input_tokens_seen": 166503952, + "router_z_loss_mlp": 0.42651367, + "step": 2004, + "time_per_iteration": 2.628211498260498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130283, + "balance_loss_mlp": 1.08956099, + "epoch": 0.3857252789534436, + "flos": 534074964480.0, + "grad_norm": 0.08619617913051347, + "language_loss": 0.89379585, + "learning_rate": 0.0007032924232477423, + "loss": 0.90509868, + "num_input_tokens_seen": 166575168, + "router_z_loss_mlp": 0.40722656, + "step": 2005, + "time_per_iteration": 2.631619930267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128848, + "balance_loss_mlp": 1.08743477, + "epoch": 0.3859176606387072, + "flos": 491764566528.0, + "grad_norm": 0.06586843636176778, + "language_loss": 0.80831605, + "learning_rate": 0.0007030077546908493, + "loss": 0.81960452, + "num_input_tokens_seen": 166647552, + "router_z_loss_mlp": 0.4140625, + "step": 2006, + "time_per_iteration": 2.6160101890563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336479, + "balance_loss_mlp": 1.3253212, + "epoch": 0.3861100423239708, + "flos": 1487052214272.0, + "grad_norm": 0.11294410837330418, + "language_loss": 0.83064663, + "learning_rate": 0.0007027230073193561, + "loss": 0.84401143, + "num_input_tokens_seen": 166875088, + "router_z_loss_mlp": 0.11181641, + "step": 2007, + "time_per_iteration": 4.7873475551605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131514, + "balance_loss_mlp": 1.09014845, + "epoch": 0.3863024240092343, + "flos": 473732540928.0, + "grad_norm": 0.06382618687285554, + "language_loss": 0.79329109, + "learning_rate": 0.0007024381812438117, + "loss": 0.8046062, + "num_input_tokens_seen": 166939344, + "router_z_loss_mlp": 0.41381836, + "step": 2008, + "time_per_iteration": 2.5387141704559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152986, + "balance_loss_mlp": 1.11390948, + "epoch": 0.3864948056944979, + "flos": 716601779712.0, + "grad_norm": 0.0811673363837608, + "language_loss": 0.83681285, + "learning_rate": 0.0007021532765747951, + "loss": 0.84834278, + "num_input_tokens_seen": 167014992, + "router_z_loss_mlp": 0.390625, + "step": 2009, + "time_per_iteration": 2.9795420169830322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164171, + "balance_loss_mlp": 1.12082672, + "epoch": 0.38668718737976143, + "flos": 727631631360.0, + "grad_norm": 0.11123688030830275, + "language_loss": 0.7961666, + "learning_rate": 0.0007018682934229162, + "loss": 0.80780828, + "num_input_tokens_seen": 167092096, + "router_z_loss_mlp": 0.43334961, + "step": 2010, + "time_per_iteration": 2.9108352661132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164881, + "balance_loss_mlp": 1.1216315, + "epoch": 0.386879569065025, + "flos": 525471556608.0, + "grad_norm": 0.07913719393788664, + "language_loss": 0.83099723, + "learning_rate": 0.0007015832318988152, + "loss": 0.842646, + "num_input_tokens_seen": 167162144, + "router_z_loss_mlp": 0.43237305, + "step": 2011, + "time_per_iteration": 2.605280637741089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082789, + "balance_loss_mlp": 1.07096386, + "epoch": 0.38707195075028855, + "flos": 1527771663360.0, + "grad_norm": 0.024547203760462325, + "language_loss": 0.73890078, + "learning_rate": 0.000701298092113163, + "loss": 0.74972868, + "num_input_tokens_seen": 167391536, + "router_z_loss_mlp": 0.11816406, + "step": 2012, + "time_per_iteration": 4.955415964126587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161774, + "balance_loss_mlp": 1.12167192, + "epoch": 0.38726433243555214, + "flos": 557313011712.0, + "grad_norm": 0.062010894867637535, + "language_loss": 0.84259552, + "learning_rate": 0.0007010128741766604, + "loss": 0.85421324, + "num_input_tokens_seen": 167466000, + "router_z_loss_mlp": 0.40112305, + "step": 2013, + "time_per_iteration": 2.738905906677246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162675, + "balance_loss_mlp": 1.12080884, + "epoch": 0.38745671412081567, + "flos": 553695740928.0, + "grad_norm": 0.08443522979585812, + "language_loss": 0.84619504, + "learning_rate": 0.0007007275782000391, + "loss": 0.85782182, + "num_input_tokens_seen": 167536144, + "router_z_loss_mlp": 0.41870117, + "step": 2014, + "time_per_iteration": 2.6049582958221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178912, + "balance_loss_mlp": 1.13528132, + "epoch": 0.38764909580607926, + "flos": 458408512512.0, + "grad_norm": 0.05901822901260885, + "language_loss": 0.84836662, + "learning_rate": 0.0007004422042940605, + "loss": 0.8601557, + "num_input_tokens_seen": 167600064, + "router_z_loss_mlp": 0.43603516, + "step": 2015, + "time_per_iteration": 2.5449817180633545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174031, + "balance_loss_mlp": 1.13106763, + "epoch": 0.38784147749134285, + "flos": 522229814784.0, + "grad_norm": 0.07137462797198264, + "language_loss": 0.89881837, + "learning_rate": 0.0007001567525695169, + "loss": 0.9105587, + "num_input_tokens_seen": 167666576, + "router_z_loss_mlp": 0.42993164, + "step": 2016, + "time_per_iteration": 2.5804128646850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191346, + "balance_loss_mlp": 1.14921737, + "epoch": 0.3880338591766064, + "flos": 666036338688.0, + "grad_norm": 0.11416128839824946, + "language_loss": 0.84030014, + "learning_rate": 0.0006998712231372303, + "loss": 0.85221362, + "num_input_tokens_seen": 167753296, + "router_z_loss_mlp": 0.42138672, + "step": 2017, + "time_per_iteration": 2.9779462814331055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182085, + "balance_loss_mlp": 1.13845432, + "epoch": 0.38822624086186996, + "flos": 593962564608.0, + "grad_norm": 0.06300984009010882, + "language_loss": 0.86622429, + "learning_rate": 0.0006995856161080532, + "loss": 0.87804508, + "num_input_tokens_seen": 167834080, + "router_z_loss_mlp": 0.43652344, + "step": 2018, + "time_per_iteration": 2.8405675888061523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160301, + "balance_loss_mlp": 1.11588371, + "epoch": 0.3884186225471335, + "flos": 612540817920.0, + "grad_norm": 0.0764923139512956, + "language_loss": 0.8250891, + "learning_rate": 0.0006992999315928679, + "loss": 0.83669221, + "num_input_tokens_seen": 167912368, + "router_z_loss_mlp": 0.44433594, + "step": 2019, + "time_per_iteration": 2.7929439544677734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146323, + "balance_loss_mlp": 1.10407472, + "epoch": 0.3886110042323971, + "flos": 607038188544.0, + "grad_norm": 0.09156853050649941, + "language_loss": 0.86159158, + "learning_rate": 0.0006990141697025871, + "loss": 0.8730548, + "num_input_tokens_seen": 167991968, + "router_z_loss_mlp": 0.42236328, + "step": 2020, + "time_per_iteration": 2.7913589477539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137863, + "balance_loss_mlp": 1.12422562, + "epoch": 0.3888033859176606, + "flos": 1528067897856.0, + "grad_norm": 0.035838926183426385, + "language_loss": 0.76359642, + "learning_rate": 0.0006987283305481533, + "loss": 0.77497506, + "num_input_tokens_seen": 168212128, + "router_z_loss_mlp": 0.13671875, + "step": 2021, + "time_per_iteration": 4.727250576019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011348, + "balance_loss_mlp": 1.09398317, + "epoch": 0.3889957676029242, + "flos": 692449689600.0, + "grad_norm": 0.0717580829802053, + "language_loss": 0.82676983, + "learning_rate": 0.0006984424142405392, + "loss": 0.8381179, + "num_input_tokens_seen": 168287440, + "router_z_loss_mlp": 0.40771484, + "step": 2022, + "time_per_iteration": 2.810420513153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130527, + "balance_loss_mlp": 1.09006715, + "epoch": 0.3891881492881878, + "flos": 515187993600.0, + "grad_norm": 0.11474151925346394, + "language_loss": 0.8263585, + "learning_rate": 0.0006981564208907474, + "loss": 0.83766377, + "num_input_tokens_seen": 168354704, + "router_z_loss_mlp": 0.40454102, + "step": 2023, + "time_per_iteration": 2.604849100112915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139234, + "balance_loss_mlp": 1.09763026, + "epoch": 0.3893805309734513, + "flos": 629050904064.0, + "grad_norm": 0.05701984367640102, + "language_loss": 0.90312237, + "learning_rate": 0.0006978703506098102, + "loss": 0.91451472, + "num_input_tokens_seen": 168424272, + "router_z_loss_mlp": 0.41601562, + "step": 2024, + "time_per_iteration": 2.7345082759857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115758, + "balance_loss_mlp": 1.11683416, + "epoch": 0.3895729126587149, + "flos": 544155895296.0, + "grad_norm": 0.06830457595999238, + "language_loss": 0.87819719, + "learning_rate": 0.00069758420350879, + "loss": 0.88977301, + "num_input_tokens_seen": 168488912, + "router_z_loss_mlp": 0.40722656, + "step": 2025, + "time_per_iteration": 2.6252336502075195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160672, + "balance_loss_mlp": 1.11689889, + "epoch": 0.38976529434397844, + "flos": 618270672384.0, + "grad_norm": 0.07405760759256953, + "language_loss": 0.8637889, + "learning_rate": 0.000697297979698779, + "loss": 0.87539566, + "num_input_tokens_seen": 168563248, + "router_z_loss_mlp": 0.43774414, + "step": 2026, + "time_per_iteration": 2.709831476211548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151416, + "balance_loss_mlp": 1.11291099, + "epoch": 0.38995767602924203, + "flos": 834882577920.0, + "grad_norm": 0.06812366476721117, + "language_loss": 0.83983821, + "learning_rate": 0.0006970116792908992, + "loss": 0.85135239, + "num_input_tokens_seen": 168648272, + "router_z_loss_mlp": 0.38500977, + "step": 2027, + "time_per_iteration": 3.0651228427886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149777, + "balance_loss_mlp": 1.10976994, + "epoch": 0.39015005771450556, + "flos": 541603542528.0, + "grad_norm": 0.06881031116362346, + "language_loss": 0.82086015, + "learning_rate": 0.000696725302396302, + "loss": 0.832358, + "num_input_tokens_seen": 168721760, + "router_z_loss_mlp": 0.39990234, + "step": 2028, + "time_per_iteration": 2.6441640853881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134129, + "balance_loss_mlp": 1.09400284, + "epoch": 0.39034243939976915, + "flos": 1007509072896.0, + "grad_norm": 0.05768401763088921, + "language_loss": 0.86036873, + "learning_rate": 0.0006964388491261692, + "loss": 0.87171006, + "num_input_tokens_seen": 168803664, + "router_z_loss_mlp": 0.40136719, + "step": 2029, + "time_per_iteration": 3.3004355430603027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129182, + "balance_loss_mlp": 1.08941352, + "epoch": 0.3905348210850327, + "flos": 679025700864.0, + "grad_norm": 0.06928638271863855, + "language_loss": 0.87596297, + "learning_rate": 0.0006961523195917114, + "loss": 0.88725477, + "num_input_tokens_seen": 168879184, + "router_z_loss_mlp": 0.39770508, + "step": 2030, + "time_per_iteration": 2.8312549591064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112269, + "balance_loss_mlp": 1.08041883, + "epoch": 0.39072720277029627, + "flos": 548882500608.0, + "grad_norm": 0.06430070846126967, + "language_loss": 0.78209358, + "learning_rate": 0.0006958657139041696, + "loss": 0.79332048, + "num_input_tokens_seen": 168957808, + "router_z_loss_mlp": 0.4230957, + "step": 2031, + "time_per_iteration": 2.789843797683716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172125, + "balance_loss_mlp": 1.1593461, + "epoch": 0.39091958445555985, + "flos": 1547737860096.0, + "grad_norm": 0.04676690558545683, + "language_loss": 0.76712966, + "learning_rate": 0.0006955790321748136, + "loss": 0.77885091, + "num_input_tokens_seen": 169194416, + "router_z_loss_mlp": 0.12792969, + "step": 2032, + "time_per_iteration": 4.9584527015686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118419, + "balance_loss_mlp": 1.07781672, + "epoch": 0.3911119661408234, + "flos": 504002497536.0, + "grad_norm": 0.06222398192409584, + "language_loss": 0.78433788, + "learning_rate": 0.0006952922745149434, + "loss": 0.79552209, + "num_input_tokens_seen": 169263552, + "router_z_loss_mlp": 0.40600586, + "step": 2033, + "time_per_iteration": 2.6696994304656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125088, + "balance_loss_mlp": 1.08288765, + "epoch": 0.391304347826087, + "flos": 557238859776.0, + "grad_norm": 0.06080690179225973, + "language_loss": 0.88040847, + "learning_rate": 0.000695005441035888, + "loss": 0.89165938, + "num_input_tokens_seen": 169333696, + "router_z_loss_mlp": 0.421875, + "step": 2034, + "time_per_iteration": 2.675685167312622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126781, + "balance_loss_mlp": 1.11333418, + "epoch": 0.3914967295113505, + "flos": 1500034235904.0, + "grad_norm": 0.02489517999219278, + "language_loss": 0.73723435, + "learning_rate": 0.0006947185318490064, + "loss": 0.74850214, + "num_input_tokens_seen": 169556416, + "router_z_loss_mlp": 0.13476562, + "step": 2035, + "time_per_iteration": 4.8780670166015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114086, + "balance_loss_mlp": 1.10006714, + "epoch": 0.3916891111966141, + "flos": 707037341184.0, + "grad_norm": 0.09902005838056731, + "language_loss": 0.81387436, + "learning_rate": 0.0006944315470656863, + "loss": 0.82528299, + "num_input_tokens_seen": 169643312, + "router_z_loss_mlp": 0.40795898, + "step": 2036, + "time_per_iteration": 3.04048228263855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132051, + "balance_loss_mlp": 1.08858752, + "epoch": 0.3918814928818776, + "flos": 556349409792.0, + "grad_norm": 0.07431126960541347, + "language_loss": 0.91352618, + "learning_rate": 0.000694144486797345, + "loss": 0.92484671, + "num_input_tokens_seen": 169712560, + "router_z_loss_mlp": 0.43432617, + "step": 2037, + "time_per_iteration": 2.692013740539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110594, + "balance_loss_mlp": 1.09695601, + "epoch": 0.3920738745671412, + "flos": 1538610992640.0, + "grad_norm": 0.027663679576331687, + "language_loss": 0.79520434, + "learning_rate": 0.0006938573511554296, + "loss": 0.8063103, + "num_input_tokens_seen": 169914912, + "router_z_loss_mlp": 0.13671875, + "step": 2038, + "time_per_iteration": 4.626150369644165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128452, + "balance_loss_mlp": 1.08796859, + "epoch": 0.39226625625240474, + "flos": 498836123136.0, + "grad_norm": 0.06987974305662424, + "language_loss": 0.90060711, + "learning_rate": 0.0006935701402514156, + "loss": 0.91189158, + "num_input_tokens_seen": 169978848, + "router_z_loss_mlp": 0.40454102, + "step": 2039, + "time_per_iteration": 2.5738487243652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099838, + "balance_loss_mlp": 1.0864867, + "epoch": 0.39245863793766833, + "flos": 1347260138496.0, + "grad_norm": 0.03469500580229188, + "language_loss": 0.73034894, + "learning_rate": 0.0006932828541968083, + "loss": 0.74134731, + "num_input_tokens_seen": 170211488, + "router_z_loss_mlp": 0.13378906, + "step": 2040, + "time_per_iteration": 4.957871437072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140825, + "balance_loss_mlp": 1.10112846, + "epoch": 0.3926510196229319, + "flos": 1346113022976.0, + "grad_norm": 0.08036310752647091, + "language_loss": 0.84965599, + "learning_rate": 0.0006929954931031422, + "loss": 0.86106431, + "num_input_tokens_seen": 170298528, + "router_z_loss_mlp": 0.39672852, + "step": 2041, + "time_per_iteration": 4.232867956161499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129686, + "balance_loss_mlp": 1.09039509, + "epoch": 0.39284340130819545, + "flos": 499587181056.0, + "grad_norm": 0.05705738410966496, + "language_loss": 0.8864727, + "learning_rate": 0.0006927080570819805, + "loss": 0.89776957, + "num_input_tokens_seen": 170365680, + "router_z_loss_mlp": 0.39282227, + "step": 2042, + "time_per_iteration": 2.6111459732055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143323, + "balance_loss_mlp": 1.10252953, + "epoch": 0.39303578299345904, + "flos": 520329775104.0, + "grad_norm": 0.08862983476083965, + "language_loss": 0.81371272, + "learning_rate": 0.0006924205462449161, + "loss": 0.82514596, + "num_input_tokens_seen": 170432224, + "router_z_loss_mlp": 0.40795898, + "step": 2043, + "time_per_iteration": 2.6160669326782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128783, + "balance_loss_mlp": 1.08932424, + "epoch": 0.39322816467872257, + "flos": 907929865728.0, + "grad_norm": 0.06601435567751561, + "language_loss": 0.82073617, + "learning_rate": 0.0006921329607035702, + "loss": 0.83202398, + "num_input_tokens_seen": 170517920, + "router_z_loss_mlp": 0.39453125, + "step": 2044, + "time_per_iteration": 3.2338860034942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121467, + "balance_loss_mlp": 1.08441699, + "epoch": 0.39342054636398616, + "flos": 517592042496.0, + "grad_norm": 0.06846789620147704, + "language_loss": 0.88441163, + "learning_rate": 0.0006918453005695938, + "loss": 0.89562631, + "num_input_tokens_seen": 170589072, + "router_z_loss_mlp": 0.37011719, + "step": 2045, + "time_per_iteration": 2.6499555110931396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135104, + "balance_loss_mlp": 1.09426332, + "epoch": 0.3936129280492497, + "flos": 547918898688.0, + "grad_norm": 0.05142411082006327, + "language_loss": 0.84655213, + "learning_rate": 0.0006915575659546662, + "loss": 0.85790318, + "num_input_tokens_seen": 170657856, + "router_z_loss_mlp": 0.40869141, + "step": 2046, + "time_per_iteration": 2.652902364730835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133716, + "balance_loss_mlp": 1.09339929, + "epoch": 0.3938053097345133, + "flos": 526113957888.0, + "grad_norm": 0.08744808643608758, + "language_loss": 0.80837369, + "learning_rate": 0.0006912697569704959, + "loss": 0.81971085, + "num_input_tokens_seen": 170723696, + "router_z_loss_mlp": 0.40307617, + "step": 2047, + "time_per_iteration": 2.6129064559936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131331, + "balance_loss_mlp": 1.09158659, + "epoch": 0.39399769141977686, + "flos": 471629869056.0, + "grad_norm": 0.07468037026935817, + "language_loss": 0.86945641, + "learning_rate": 0.0006909818737288205, + "loss": 0.88076973, + "num_input_tokens_seen": 170789536, + "router_z_loss_mlp": 0.3972168, + "step": 2048, + "time_per_iteration": 2.5576181411743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146235, + "balance_loss_mlp": 1.10632348, + "epoch": 0.3941900731050404, + "flos": 501736840704.0, + "grad_norm": 0.07110132916922086, + "language_loss": 0.81226838, + "learning_rate": 0.000690693916341406, + "loss": 0.82373071, + "num_input_tokens_seen": 170859232, + "router_z_loss_mlp": 0.39916992, + "step": 2049, + "time_per_iteration": 2.5884814262390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154156, + "balance_loss_mlp": 1.11398268, + "epoch": 0.394382454790304, + "flos": 580862347776.0, + "grad_norm": 0.05472880535545416, + "language_loss": 0.82429487, + "learning_rate": 0.0006904058849200475, + "loss": 0.83583641, + "num_input_tokens_seen": 170931568, + "router_z_loss_mlp": 0.40185547, + "step": 2050, + "time_per_iteration": 2.7662599086761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144327, + "balance_loss_mlp": 1.10565519, + "epoch": 0.3945748364755675, + "flos": 513819127296.0, + "grad_norm": 0.06127353443593348, + "language_loss": 0.85204089, + "learning_rate": 0.0006901177795765683, + "loss": 0.86348414, + "num_input_tokens_seen": 170999856, + "router_z_loss_mlp": 0.38647461, + "step": 2051, + "time_per_iteration": 2.577353000640869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011475, + "balance_loss_mlp": 1.10768366, + "epoch": 0.3947672181608311, + "flos": 593957795328.0, + "grad_norm": 0.10882102145067868, + "language_loss": 0.81508064, + "learning_rate": 0.0006898296004228213, + "loss": 0.82655561, + "num_input_tokens_seen": 171072320, + "router_z_loss_mlp": 0.39819336, + "step": 2052, + "time_per_iteration": 2.7242588996887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118361, + "balance_loss_mlp": 1.10605848, + "epoch": 0.39495959984609463, + "flos": 1547671048704.0, + "grad_norm": 0.03880030883121314, + "language_loss": 0.7812674, + "learning_rate": 0.0006895413475706873, + "loss": 0.79245102, + "num_input_tokens_seen": 171304128, + "router_z_loss_mlp": 0.12304688, + "step": 2053, + "time_per_iteration": 4.852335691452026 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160033, + "balance_loss_mlp": 1.1204555, + "epoch": 0.3951519815313582, + "flos": 496520907264.0, + "grad_norm": 0.06533456514809383, + "language_loss": 0.79943091, + "learning_rate": 0.0006892530211320763, + "loss": 0.81103128, + "num_input_tokens_seen": 171377392, + "router_z_loss_mlp": 0.39575195, + "step": 2054, + "time_per_iteration": 2.726592779159546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163981, + "balance_loss_mlp": 1.12528563, + "epoch": 0.39534436321662175, + "flos": 531191499264.0, + "grad_norm": 0.06955061494726521, + "language_loss": 0.8399905, + "learning_rate": 0.000688964621218926, + "loss": 0.85163033, + "num_input_tokens_seen": 171447424, + "router_z_loss_mlp": 0.38696289, + "step": 2055, + "time_per_iteration": 2.6089248657226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156737, + "balance_loss_mlp": 1.11737382, + "epoch": 0.39553674490188534, + "flos": 702523279872.0, + "grad_norm": 0.06754212988294535, + "language_loss": 0.80637926, + "learning_rate": 0.0006886761479432037, + "loss": 0.81794661, + "num_input_tokens_seen": 171519920, + "router_z_loss_mlp": 0.39379883, + "step": 2056, + "time_per_iteration": 2.8334691524505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169364, + "balance_loss_mlp": 1.12866604, + "epoch": 0.3957291265871489, + "flos": 409772846592.0, + "grad_norm": 0.08783588969410645, + "language_loss": 0.85058302, + "learning_rate": 0.0006883876014169045, + "loss": 0.86227667, + "num_input_tokens_seen": 171583856, + "router_z_loss_mlp": 0.40698242, + "step": 2057, + "time_per_iteration": 2.4859981536865234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163618, + "balance_loss_mlp": 1.12344468, + "epoch": 0.39592150827241246, + "flos": 618490556928.0, + "grad_norm": 0.07066278036752763, + "language_loss": 0.90527105, + "learning_rate": 0.000688098981752052, + "loss": 0.91690719, + "num_input_tokens_seen": 171656064, + "router_z_loss_mlp": 0.40161133, + "step": 2058, + "time_per_iteration": 2.737825393676758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169191, + "balance_loss_mlp": 1.12849319, + "epoch": 0.39611388995767605, + "flos": 821332680192.0, + "grad_norm": 0.08574741875980238, + "language_loss": 0.80283022, + "learning_rate": 0.0006878102890606982, + "loss": 0.81452215, + "num_input_tokens_seen": 171738800, + "router_z_loss_mlp": 0.40722656, + "step": 2059, + "time_per_iteration": 3.0589451789855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159966, + "balance_loss_mlp": 1.12034082, + "epoch": 0.3963062716429396, + "flos": 492224159232.0, + "grad_norm": 0.07158976818793618, + "language_loss": 0.81510139, + "learning_rate": 0.0006875215234549239, + "loss": 0.8267011, + "num_input_tokens_seen": 171803664, + "router_z_loss_mlp": 0.39648438, + "step": 2060, + "time_per_iteration": 2.5404529571533203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151611, + "balance_loss_mlp": 1.11150885, + "epoch": 0.39649865332820317, + "flos": 584739150336.0, + "grad_norm": 0.11168111879418678, + "language_loss": 0.86092877, + "learning_rate": 0.0006872326850468376, + "loss": 0.87244487, + "num_input_tokens_seen": 171871968, + "router_z_loss_mlp": 0.40087891, + "step": 2061, + "time_per_iteration": 2.6653215885162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153197, + "balance_loss_mlp": 1.11133087, + "epoch": 0.3966910350134667, + "flos": 458564156928.0, + "grad_norm": 0.0731410886524803, + "language_loss": 0.79433036, + "learning_rate": 0.0006869437739485762, + "loss": 0.80586231, + "num_input_tokens_seen": 171942368, + "router_z_loss_mlp": 0.41870117, + "step": 2062, + "time_per_iteration": 2.6032299995422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147299, + "balance_loss_mlp": 1.1086272, + "epoch": 0.3968834166987303, + "flos": 508632929280.0, + "grad_norm": 0.06685158443863869, + "language_loss": 0.9296748, + "learning_rate": 0.0006866547902723053, + "loss": 0.9411478, + "num_input_tokens_seen": 172012336, + "router_z_loss_mlp": 0.38647461, + "step": 2063, + "time_per_iteration": 2.676166534423828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150184, + "balance_loss_mlp": 1.11148858, + "epoch": 0.3970757983839938, + "flos": 572627128320.0, + "grad_norm": 0.10136223850880095, + "language_loss": 0.80330342, + "learning_rate": 0.000686365734130218, + "loss": 0.81480527, + "num_input_tokens_seen": 172084640, + "router_z_loss_mlp": 0.38696289, + "step": 2064, + "time_per_iteration": 2.6844232082366943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143564, + "balance_loss_mlp": 1.10420108, + "epoch": 0.3972681800692574, + "flos": 481629307392.0, + "grad_norm": 0.06083764513088428, + "language_loss": 0.84282482, + "learning_rate": 0.000686076605634536, + "loss": 0.85426044, + "num_input_tokens_seen": 172152992, + "router_z_loss_mlp": 0.39379883, + "step": 2065, + "time_per_iteration": 2.6315250396728516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156007, + "balance_loss_mlp": 1.11704922, + "epoch": 0.397460561754521, + "flos": 487927411200.0, + "grad_norm": 0.07154960647229537, + "language_loss": 0.84777498, + "learning_rate": 0.0006857874048975088, + "loss": 0.85933506, + "num_input_tokens_seen": 172219312, + "router_z_loss_mlp": 0.38964844, + "step": 2066, + "time_per_iteration": 2.651740074157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144331, + "balance_loss_mlp": 1.10298944, + "epoch": 0.3976529434397845, + "flos": 421993525248.0, + "grad_norm": 0.06215318135177391, + "language_loss": 0.87357152, + "learning_rate": 0.0006854981320314142, + "loss": 0.88501477, + "num_input_tokens_seen": 172282112, + "router_z_loss_mlp": 0.41381836, + "step": 2067, + "time_per_iteration": 2.5062263011932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150736, + "balance_loss_mlp": 1.11089611, + "epoch": 0.3978453251250481, + "flos": 545589001728.0, + "grad_norm": 0.07144157906743025, + "language_loss": 0.87282014, + "learning_rate": 0.0006852087871485579, + "loss": 0.88432747, + "num_input_tokens_seen": 172347872, + "router_z_loss_mlp": 0.3984375, + "step": 2068, + "time_per_iteration": 2.6593010425567627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141379, + "balance_loss_mlp": 1.10206354, + "epoch": 0.39803770681031164, + "flos": 650838592512.0, + "grad_norm": 0.08492249089395289, + "language_loss": 0.82224536, + "learning_rate": 0.0006849193703612735, + "loss": 0.83365911, + "num_input_tokens_seen": 172418560, + "router_z_loss_mlp": 0.39282227, + "step": 2069, + "time_per_iteration": 2.755782127380371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137654, + "balance_loss_mlp": 1.09817159, + "epoch": 0.39823008849557523, + "flos": 740072194560.0, + "grad_norm": 0.07327967142242812, + "language_loss": 0.78054988, + "learning_rate": 0.0006846298817819225, + "loss": 0.79192644, + "num_input_tokens_seen": 172497984, + "router_z_loss_mlp": 0.39477539, + "step": 2070, + "time_per_iteration": 2.987943410873413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148271, + "balance_loss_mlp": 1.10909855, + "epoch": 0.39842247018083876, + "flos": 385037452800.0, + "grad_norm": 0.08050617332568782, + "language_loss": 0.81162381, + "learning_rate": 0.0006843403215228945, + "loss": 0.82310653, + "num_input_tokens_seen": 172560112, + "router_z_loss_mlp": 0.3918457, + "step": 2071, + "time_per_iteration": 2.4827940464019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165055, + "balance_loss_mlp": 1.12585878, + "epoch": 0.39861485186610235, + "flos": 533696864256.0, + "grad_norm": 0.07083437878036915, + "language_loss": 0.80721962, + "learning_rate": 0.0006840506896966065, + "loss": 0.81887019, + "num_input_tokens_seen": 172636192, + "router_z_loss_mlp": 0.3918457, + "step": 2072, + "time_per_iteration": 2.6827309131622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166963, + "balance_loss_mlp": 1.12621748, + "epoch": 0.39880723355136594, + "flos": 643149227520.0, + "grad_norm": 0.06725102297232902, + "language_loss": 0.8278873, + "learning_rate": 0.0006837609864155038, + "loss": 0.83955693, + "num_input_tokens_seen": 172715264, + "router_z_loss_mlp": 0.40771484, + "step": 2073, + "time_per_iteration": 2.9130313396453857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116361, + "balance_loss_mlp": 1.12584436, + "epoch": 0.39899961523662947, + "flos": 515847647232.0, + "grad_norm": 0.07471059517929624, + "language_loss": 0.8375988, + "learning_rate": 0.0006834712117920592, + "loss": 0.84923482, + "num_input_tokens_seen": 172783456, + "router_z_loss_mlp": 0.37768555, + "step": 2074, + "time_per_iteration": 2.61501145362854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162616, + "balance_loss_mlp": 1.12325335, + "epoch": 0.39919199692189306, + "flos": 464385415680.0, + "grad_norm": 0.13245970923224126, + "language_loss": 0.85901093, + "learning_rate": 0.0006831813659387729, + "loss": 0.87063706, + "num_input_tokens_seen": 172848928, + "router_z_loss_mlp": 0.39331055, + "step": 2075, + "time_per_iteration": 2.563549041748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149354, + "balance_loss_mlp": 1.11075377, + "epoch": 0.3993843786071566, + "flos": 531641180160.0, + "grad_norm": 0.06732512968880089, + "language_loss": 0.84738618, + "learning_rate": 0.0006828914489681733, + "loss": 0.85887969, + "num_input_tokens_seen": 172921152, + "router_z_loss_mlp": 0.38574219, + "step": 2076, + "time_per_iteration": 2.7011008262634277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142979, + "balance_loss_mlp": 1.10440326, + "epoch": 0.3995767602924202, + "flos": 503965421568.0, + "grad_norm": 0.050728888200014394, + "language_loss": 0.85780215, + "learning_rate": 0.0006826014609928162, + "loss": 0.86923194, + "num_input_tokens_seen": 172998864, + "router_z_loss_mlp": 0.38598633, + "step": 2077, + "time_per_iteration": 2.699880838394165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026884, + "balance_loss_mlp": 1.01472485, + "epoch": 0.3997691419776837, + "flos": 1454516600832.0, + "grad_norm": 0.012471286598558728, + "language_loss": 0.83199388, + "learning_rate": 0.0006823114021252846, + "loss": 0.84226274, + "num_input_tokens_seen": 173219216, + "router_z_loss_mlp": 0.12158203, + "step": 2078, + "time_per_iteration": 4.819272518157959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112436, + "balance_loss_mlp": 1.08549809, + "epoch": 0.3999615236629473, + "flos": 530684918784.0, + "grad_norm": 0.08765386089658693, + "language_loss": 0.80571902, + "learning_rate": 0.0006820212724781896, + "loss": 0.81696254, + "num_input_tokens_seen": 173292000, + "router_z_loss_mlp": 0.38842773, + "step": 2079, + "time_per_iteration": 2.6927945613861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112693, + "balance_loss_mlp": 1.07526088, + "epoch": 0.4001539053482108, + "flos": 695130522624.0, + "grad_norm": 0.06830833334646268, + "language_loss": 0.84229112, + "learning_rate": 0.0006817310721641694, + "loss": 0.85341799, + "num_input_tokens_seen": 173365568, + "router_z_loss_mlp": 0.37402344, + "step": 2080, + "time_per_iteration": 2.8158507347106934 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 173365568, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4718509996638208.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/training_args.bin b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3b28f0633932ff84d8e0fde7beb2f9c59f0d04be --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b54b92ce31f27a60f5f91da41c22febbdc5fe6a9ac82c4d361c2b9dbc9096639 +size 7992 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/zero_to_fp32.py b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-2080/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/added_tokens.json b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/config.json b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/config.json new file mode 100644 index 0000000000000000000000000000000000000000..4477091c8e5e4d06ea14a8a918edb0ae2310c298 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_sigmoidgating", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/generation_config.json b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/global_step3120/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/global_step3120/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..131d65ca3683b8072545d336820f0cb1c008066a --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/global_step3120/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51cf24b33e7d8b304baec627186a06c767151c065ff66e11743045d5303c4a2f +size 396582032 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/global_step3120/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/global_step3120/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..acffb2d8bd291ae244676305b60163ad5ce1ffe1 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/global_step3120/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:625498de63b813d68b3fa0b6397e6dfa2e784acf4f7118568dbdf2f231174f87 +size 396582032 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/global_step3120/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/global_step3120/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..18e4a6bba7feb6ae16f89f0c525611c0703bcc7f --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/global_step3120/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:701cad8c404d1877e1fe86077d4dcaf56a08edf5875b04f42cb06aa836e9195d +size 396582032 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/global_step3120/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/global_step3120/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dcf3809ca3649ee05cc2af28b085cee8168bdc8a --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/global_step3120/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e06aed082f2a586177c6498bd48b2cf24189d625527313ad8ecce5f2db1b173 +size 396582032 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/global_step3120/zero_pp_rank_0_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/global_step3120/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..026d758686fbaa3fcd6e10e640d0ccaca9e21941 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/global_step3120/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:978d7fa033bc21325331aa273e1183bf9b74a3003df5b1778bbbb425c0bb600e +size 2117321544 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/global_step3120/zero_pp_rank_1_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/global_step3120/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5697a2a6437b023cfed3d44892a0b8371718625c --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/global_step3120/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98e03c185b40024f51b21bc34f985bf181c0a8b95a620a61d97123a6373c994c +size 2117321544 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/global_step3120/zero_pp_rank_2_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/global_step3120/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0902337cf19bbd996fdc68c6d6c80803743e4c4a --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/global_step3120/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1d19fb5f90294b793dffdb480587f12de860ba3d16b7a3015acf652e7a139fa +size 2117321544 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/global_step3120/zero_pp_rank_3_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/global_step3120/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..96c0887f6f2215336ebce89f3dc7e03f7e2d3f65 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/global_step3120/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbceb88b9b18e9e82d75d0b52594bdccb49ce1885fe638b4b66c9c9e3e10cbd9 +size 2117321544 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/latest b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/latest new file mode 100644 index 0000000000000000000000000000000000000000..804da059f781bacb3f274fb2103e4bc7f9bb7407 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/latest @@ -0,0 +1 @@ +global_step3120 \ No newline at end of file diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/model-00001-of-00002.safetensors b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/model-00002-of-00002.safetensors b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9bbf59336ea84e5f000d9d20c626b9d783a7dd71 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b91123a76a95c88a5d5fd33cf140c031925dd2f3cd1ecdce0b52f8c89b90c5d7 +size 3759025152 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/model.safetensors.index.json b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..2b3448fcaafe26e098595b9e2e5bd9e68d63ee24 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/model.safetensors.index.json @@ -0,0 +1,672 @@ +{ + "metadata": { + "total_size": 8731424736 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/rng_state_0.pth b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef4849062bcdc8ffd2246c07673ba196a8d61a6d --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fae2114fffe9b1eea30e28bbdb4ce59046b0079ea5b8dc4682079f609d49d787 +size 14960 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/rng_state_1.pth b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fcb2b640bc236c26aa841680d34a91240247970 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ff5f3a53530ac868291e2667c8f824bfa1f4fa1ce880df8223a7165ef38e11 +size 14960 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/rng_state_2.pth b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..00c3f989de00e6d58ca7345ae6f65fee0afcbdcd --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f80a7779b0034e70106ba6cb0e3e686052334c20ce54453ee3977cc0219d15 +size 14960 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/rng_state_3.pth b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f289913854ee3fa52a86e282421da07d85b8a4c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece3bc0d0e16c43ef245cc787cbd0d63d08d460f489c4cd52adf6501b9281a18 +size 14960 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/special_tokens_map.json b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/tokenizer.model b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/tokenizer_config.json b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/trainer_state.json b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ab017b8e699ac7adfba3902c5863b5b28f376eea --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/trainer_state.json @@ -0,0 +1,46833 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6002308580223162, + "eval_steps": 500, + "global_step": 3120, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03935784, + "balance_loss_mlp": 2.84935808, + "epoch": 0.00019238168526356292, + "flos": 470575609344.0, + "grad_norm": 13.498251331228948, + "language_loss": 2.81572914, + "learning_rate": 0.0, + "loss": 1.90346789, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 10.859375, + "step": 1, + "time_per_iteration": 24.30480647087097 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0351246, + "balance_loss_mlp": 2.65644169, + "epoch": 0.00038476337052712584, + "flos": 504556065792.0, + "grad_norm": 27.482987886380492, + "language_loss": 8.76816368, + "learning_rate": 0.00013726078121135892, + "loss": 8.80328846, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 8.578125, + "step": 2, + "time_per_iteration": 2.6929261684417725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03513305, + "balance_loss_mlp": 2.65728736, + "epoch": 0.0005771450557906887, + "flos": 599161245696.0, + "grad_norm": 28.576563245741852, + "language_loss": 9.00053596, + "learning_rate": 0.00021755319103969496, + "loss": 9.03566933, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 8.578125, + "step": 3, + "time_per_iteration": 2.7945075035095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03667009, + "balance_loss_mlp": 2.78657675, + "epoch": 0.0007695267410542517, + "flos": 580405326336.0, + "grad_norm": 15.694146018083416, + "language_loss": 2.74122858, + "learning_rate": 0.00027452156242271784, + "loss": 2.77789879, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 8.828125, + "step": 4, + "time_per_iteration": 2.7773804664611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03933422, + "balance_loss_mlp": 3.01102829, + "epoch": 0.0009619084263178145, + "flos": 486116204544.0, + "grad_norm": 3.505338851882968, + "language_loss": 1.83478093, + "learning_rate": 0.0003187096642208417, + "loss": 1.87411511, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 9.2109375, + "step": 5, + "time_per_iteration": 2.651094675064087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04005588, + "balance_loss_mlp": 3.05420256, + "epoch": 0.0011542901115813775, + "flos": 560028349440.0, + "grad_norm": 3.050600048840319, + "language_loss": 1.61776543, + "learning_rate": 0.0003548139722510539, + "loss": 1.65782118, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 9.4921875, + "step": 6, + "time_per_iteration": 2.697614908218384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03708502, + "balance_loss_mlp": 2.7708497, + "epoch": 0.0013466717968449403, + "flos": 533966307840.0, + "grad_norm": 0.7974788691124679, + "language_loss": 1.32417345, + "learning_rate": 0.00038533972973918044, + "loss": 1.36125851, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 9.3515625, + "step": 7, + "time_per_iteration": 2.6407949924468994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0332405, + "balance_loss_mlp": 2.38868618, + "epoch": 0.0015390534821085034, + "flos": 492295739904.0, + "grad_norm": 0.7144720842381633, + "language_loss": 1.25956392, + "learning_rate": 0.0004117823436340768, + "loss": 1.29280448, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 9.3359375, + "step": 8, + "time_per_iteration": 2.6287930011749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02785454, + "balance_loss_mlp": 1.8508532, + "epoch": 0.0017314351673720662, + "flos": 564676033536.0, + "grad_norm": 0.3140255221758466, + "language_loss": 1.29993415, + "learning_rate": 0.00043510638207938993, + "loss": 1.32778871, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 9.3203125, + "step": 9, + "time_per_iteration": 2.8048858642578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0244685, + "balance_loss_mlp": 1.50004196, + "epoch": 0.001923816852635629, + "flos": 593406798336.0, + "grad_norm": 0.19799802642524775, + "language_loss": 1.19032216, + "learning_rate": 0.00045597044543220066, + "loss": 1.2147907, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 9.4453125, + "step": 10, + "time_per_iteration": 2.7669434547424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02310187, + "balance_loss_mlp": 1.35117221, + "epoch": 0.002116198537899192, + "flos": 609625046016.0, + "grad_norm": 0.14485632700798082, + "language_loss": 1.18421102, + "learning_rate": 0.00047484428652143135, + "loss": 1.20731282, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 9.5703125, + "step": 11, + "time_per_iteration": 2.9067423343658447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02309394, + "balance_loss_mlp": 1.33740926, + "epoch": 0.002308580223162755, + "flos": 545129409024.0, + "grad_norm": 0.1366980934684776, + "language_loss": 1.24379897, + "learning_rate": 0.0004920747534624128, + "loss": 1.26689291, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 9.703125, + "step": 12, + "time_per_iteration": 2.612813949584961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.022984, + "balance_loss_mlp": 1.32565212, + "epoch": 0.002500961908426318, + "flos": 644750461440.0, + "grad_norm": 0.11957957623458634, + "language_loss": 1.26615512, + "learning_rate": 0.0005079252465375872, + "loss": 1.28913903, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 9.7109375, + "step": 13, + "time_per_iteration": 2.879688262939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02311662, + "balance_loss_mlp": 1.34730673, + "epoch": 0.0026933435936898806, + "flos": 487853259264.0, + "grad_norm": 0.10749127497061137, + "language_loss": 1.14448667, + "learning_rate": 0.0005226005109505393, + "loss": 1.16760325, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 9.625, + "step": 14, + "time_per_iteration": 2.568699836730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02285502, + "balance_loss_mlp": 1.3615818, + "epoch": 0.0028857252789534437, + "flos": 434599644672.0, + "grad_norm": 0.11405493545380829, + "language_loss": 1.20514369, + "learning_rate": 0.0005362628552605367, + "loss": 1.22799873, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 9.21875, + "step": 15, + "time_per_iteration": 2.6814210414886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02243131, + "balance_loss_mlp": 1.36117291, + "epoch": 0.0030781069642170067, + "flos": 596739944448.0, + "grad_norm": 0.10465613456634369, + "language_loss": 1.24307358, + "learning_rate": 0.0005490431248454357, + "loss": 1.26550484, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 8.84375, + "step": 16, + "time_per_iteration": 2.681443929672241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02323403, + "balance_loss_mlp": 1.52994621, + "epoch": 0.0032704886494805694, + "flos": 1538188102656.0, + "grad_norm": 0.2929644268686402, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.78028512, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 7.90625, + "step": 17, + "time_per_iteration": 6.376815319061279 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02154669, + "balance_loss_mlp": 1.37418151, + "epoch": 0.0034628703347441324, + "flos": 473969677824.0, + "grad_norm": 0.15081794947454089, + "language_loss": 1.11159086, + "learning_rate": 0.0005723671632907488, + "loss": 1.13313746, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 7.80078125, + "step": 18, + "time_per_iteration": 2.721731424331665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02067628, + "balance_loss_mlp": 1.35466075, + "epoch": 0.0036552520200076955, + "flos": 448537554432.0, + "grad_norm": 0.11430094844987627, + "language_loss": 1.15730095, + "learning_rate": 0.0005830738490244919, + "loss": 1.1779772, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 7.12890625, + "step": 19, + "time_per_iteration": 2.691012382507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01966178, + "balance_loss_mlp": 1.31958628, + "epoch": 0.003847633705271258, + "flos": 636174217728.0, + "grad_norm": 0.10166759343386816, + "language_loss": 1.17760253, + "learning_rate": 0.0005932312266435596, + "loss": 1.19726431, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 6.46484375, + "step": 20, + "time_per_iteration": 2.779218912124634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01836812, + "balance_loss_mlp": 1.26727819, + "epoch": 0.004040015390534821, + "flos": 589495491072.0, + "grad_norm": 0.12846528828878043, + "language_loss": 1.12106359, + "learning_rate": 0.0006028929207788754, + "loss": 1.13943172, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 5.70703125, + "step": 21, + "time_per_iteration": 2.716970443725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01720951, + "balance_loss_mlp": 1.21970022, + "epoch": 0.004232397075798384, + "flos": 756574940160.0, + "grad_norm": 0.09445288880840001, + "language_loss": 1.16516471, + "learning_rate": 0.0006121050677327902, + "loss": 1.18237424, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 5.0078125, + "step": 22, + "time_per_iteration": 2.92696475982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01630624, + "balance_loss_mlp": 1.19193399, + "epoch": 0.004424778761061947, + "flos": 526692119040.0, + "grad_norm": 0.11621712848760359, + "language_loss": 1.06380248, + "learning_rate": 0.0006209076479463684, + "loss": 1.08010876, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 4.3984375, + "step": 23, + "time_per_iteration": 2.666133165359497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01572853, + "balance_loss_mlp": 1.18394423, + "epoch": 0.00461716044632551, + "flos": 548168518656.0, + "grad_norm": 0.10970997088624258, + "language_loss": 1.16519284, + "learning_rate": 0.0006293355346737718, + "loss": 1.18092132, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 3.88476562, + "step": 24, + "time_per_iteration": 2.727487802505493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0152954, + "balance_loss_mlp": 1.18755198, + "epoch": 0.004809542131589073, + "flos": 567584091648.0, + "grad_norm": 0.09735665571869598, + "language_loss": 1.12784922, + "learning_rate": 0.0006374193284416834, + "loss": 1.14314473, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 3.42382812, + "step": 25, + "time_per_iteration": 2.7919249534606934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0148827, + "balance_loss_mlp": 1.19282198, + "epoch": 0.005001923816852636, + "flos": 470629191168.0, + "grad_norm": 0.09233879954989622, + "language_loss": 1.11062908, + "learning_rate": 0.0006451860277489461, + "loss": 1.12551177, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 2.953125, + "step": 26, + "time_per_iteration": 2.581066131591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462989, + "balance_loss_mlp": 1.20988345, + "epoch": 0.005194305502116198, + "flos": 415502701056.0, + "grad_norm": 0.12330238493557526, + "language_loss": 1.19441557, + "learning_rate": 0.0006526595731190848, + "loss": 1.20904553, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 2.52929688, + "step": 27, + "time_per_iteration": 2.49725604057312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01423898, + "balance_loss_mlp": 1.20874906, + "epoch": 0.005386687187379761, + "flos": 628771548672.0, + "grad_norm": 0.09841719698503415, + "language_loss": 1.12322927, + "learning_rate": 0.0006598612921618983, + "loss": 1.13746822, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 2.15625, + "step": 28, + "time_per_iteration": 2.822068929672241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01399446, + "balance_loss_mlp": 1.21443295, + "epoch": 0.005579068872643324, + "flos": 886483201536.0, + "grad_norm": 0.2589331093265968, + "language_loss": 1.06232262, + "learning_rate": 0.0006668102665011454, + "loss": 1.07631707, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 1.84765625, + "step": 29, + "time_per_iteration": 3.2402820587158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01444994, + "balance_loss_mlp": 1.28353739, + "epoch": 0.005771450557906887, + "flos": 547560622080.0, + "grad_norm": 0.1317361033328709, + "language_loss": 1.14859319, + "learning_rate": 0.0006735236364718957, + "loss": 1.16304302, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 1.61425781, + "step": 30, + "time_per_iteration": 2.6861231327056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01333301, + "balance_loss_mlp": 1.20445967, + "epoch": 0.00596383224317045, + "flos": 532026620928.0, + "grad_norm": 0.07039345614882069, + "language_loss": 1.13512135, + "learning_rate": 0.0006800168558381346, + "loss": 1.14845431, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 1.28808594, + "step": 31, + "time_per_iteration": 2.6444640159606934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01254242, + "balance_loss_mlp": 1.153772, + "epoch": 0.0061562139284340135, + "flos": 589082886144.0, + "grad_norm": 0.07602265872136475, + "language_loss": 1.1720531, + "learning_rate": 0.0006863039060567947, + "loss": 1.18459558, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 1.00439453, + "step": 32, + "time_per_iteration": 2.7225399017333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117915, + "balance_loss_mlp": 1.10071015, + "epoch": 0.006348595613697576, + "flos": 618231025152.0, + "grad_norm": 0.062098451262649575, + "language_loss": 1.09530759, + "learning_rate": 0.0006923974775611263, + "loss": 1.10709918, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 0.78417969, + "step": 33, + "time_per_iteration": 2.795565366744995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155392, + "balance_loss_mlp": 1.09416604, + "epoch": 0.006540977298961139, + "flos": 777910376448.0, + "grad_norm": 0.0750568617782567, + "language_loss": 1.06307364, + "learning_rate": 0.0006983091239737814, + "loss": 1.0746274, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 0.61132812, + "step": 34, + "time_per_iteration": 3.0703423023223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138238, + "balance_loss_mlp": 1.0903163, + "epoch": 0.006733358984224702, + "flos": 667143475200.0, + "grad_norm": 0.057198892540160154, + "language_loss": 1.05094206, + "learning_rate": 0.0007040493939600222, + "loss": 1.06232452, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 0.47949219, + "step": 35, + "time_per_iteration": 2.8476996421813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136821, + "balance_loss_mlp": 1.09926963, + "epoch": 0.006925740669488265, + "flos": 564372085248.0, + "grad_norm": 0.07105443011946577, + "language_loss": 1.05056715, + "learning_rate": 0.0007096279445021078, + "loss": 1.06193542, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 0.37548828, + "step": 36, + "time_per_iteration": 2.8306472301483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115252, + "balance_loss_mlp": 1.12274194, + "epoch": 0.007118122354751828, + "flos": 549887947776.0, + "grad_norm": 0.09366404592926651, + "language_loss": 1.11846077, + "learning_rate": 0.0007150536386503726, + "loss": 1.12998605, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 0.29736328, + "step": 37, + "time_per_iteration": 2.875190258026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150569, + "balance_loss_mlp": 1.12677491, + "epoch": 0.007310504040015391, + "flos": 702490973184.0, + "grad_norm": 0.0928332145488954, + "language_loss": 1.04548562, + "learning_rate": 0.0007203346302358509, + "loss": 1.05699134, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 0.23791504, + "step": 38, + "time_per_iteration": 3.0075292587280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128748, + "balance_loss_mlp": 1.10757613, + "epoch": 0.007502885725278953, + "flos": 599316890112.0, + "grad_norm": 0.056043607360260886, + "language_loss": 1.09224963, + "learning_rate": 0.000725478437577282, + "loss": 1.10353708, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 0.21179199, + "step": 39, + "time_per_iteration": 2.78564715385437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111769, + "balance_loss_mlp": 1.09953475, + "epoch": 0.007695267410542516, + "flos": 560285309952.0, + "grad_norm": 0.2122838817863008, + "language_loss": 1.04638147, + "learning_rate": 0.0007304920078549186, + "loss": 1.0575583, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 0.18151855, + "step": 40, + "time_per_iteration": 2.745100975036621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133734, + "balance_loss_mlp": 1.11621058, + "epoch": 0.007887649095806078, + "flos": 508170765312.0, + "grad_norm": 0.14528393981530327, + "language_loss": 1.06509054, + "learning_rate": 0.0007353817735343603, + "loss": 1.07642794, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 0.17529297, + "step": 41, + "time_per_iteration": 2.7425575256347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119894, + "balance_loss_mlp": 1.10357416, + "epoch": 0.008080030781069641, + "flos": 503893840896.0, + "grad_norm": 0.06769616325508275, + "language_loss": 1.0188365, + "learning_rate": 0.0007401537019902344, + "loss": 1.03003538, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 0.16308594, + "step": 42, + "time_per_iteration": 2.6797902584075928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118919, + "balance_loss_mlp": 1.10271883, + "epoch": 0.008272412466333205, + "flos": 518031811584.0, + "grad_norm": 0.14916902722339276, + "language_loss": 1.05194306, + "learning_rate": 0.0007448133392900729, + "loss": 1.06313229, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 0.1619873, + "step": 43, + "time_per_iteration": 2.779276132583618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116153, + "balance_loss_mlp": 1.09945166, + "epoch": 0.008464794151596768, + "flos": 607974626304.0, + "grad_norm": 0.052417895665492535, + "language_loss": 1.00651026, + "learning_rate": 0.0007493658489441491, + "loss": 1.0176717, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 0.16711426, + "step": 44, + "time_per_iteration": 2.965435028076172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108906, + "balance_loss_mlp": 1.09195447, + "epoch": 0.00865717583686033, + "flos": 537929372160.0, + "grad_norm": 0.04248825884697869, + "language_loss": 1.04600978, + "learning_rate": 0.0007538160463002316, + "loss": 1.05709875, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 0.16967773, + "step": 45, + "time_per_iteration": 2.7024173736572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105316, + "balance_loss_mlp": 1.08735132, + "epoch": 0.008849557522123894, + "flos": 508007780352.0, + "grad_norm": 0.08538228051147774, + "language_loss": 1.08093452, + "learning_rate": 0.0007581684291577274, + "loss": 1.09198785, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 0.17980957, + "step": 46, + "time_per_iteration": 2.6020169258117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105368, + "balance_loss_mlp": 1.08635402, + "epoch": 0.009041939207387457, + "flos": 625339657728.0, + "grad_norm": 0.04723509056908367, + "language_loss": 1.10695386, + "learning_rate": 0.0007624272050891776, + "loss": 1.11800754, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 0.19006348, + "step": 47, + "time_per_iteration": 2.8620407581329346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103189, + "balance_loss_mlp": 1.08244705, + "epoch": 0.00923432089265102, + "flos": 549421014528.0, + "grad_norm": 0.07235265954126073, + "language_loss": 1.00601125, + "learning_rate": 0.0007665963158851307, + "loss": 1.01704311, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 0.20751953, + "step": 48, + "time_per_iteration": 2.8312995433807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114938, + "balance_loss_mlp": 1.09308696, + "epoch": 0.009426702577914583, + "flos": 562496638464.0, + "grad_norm": 0.10505304652404167, + "language_loss": 1.09914839, + "learning_rate": 0.0007706794594783609, + "loss": 1.1102978, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 0.21850586, + "step": 49, + "time_per_iteration": 2.779561758041382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110011, + "balance_loss_mlp": 1.0874207, + "epoch": 0.009619084263178146, + "flos": 616773325824.0, + "grad_norm": 0.04709564792407722, + "language_loss": 1.08694363, + "learning_rate": 0.0007746801096530423, + "loss": 1.09804368, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 0.22583008, + "step": 50, + "time_per_iteration": 2.785332441329956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111628, + "balance_loss_mlp": 1.09285581, + "epoch": 0.009811465948441709, + "flos": 541437986304.0, + "grad_norm": 0.09574874491356838, + "language_loss": 1.13402438, + "learning_rate": 0.0007786015338021173, + "loss": 1.14518726, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 0.23425293, + "step": 51, + "time_per_iteration": 2.676326274871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118129, + "balance_loss_mlp": 1.09500206, + "epoch": 0.010003847633705272, + "flos": 535881028608.0, + "grad_norm": 0.12325193255180054, + "language_loss": 1.06019998, + "learning_rate": 0.0007824468089603051, + "loss": 1.07138121, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 0.23144531, + "step": 52, + "time_per_iteration": 2.688828945159912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113509, + "balance_loss_mlp": 1.11038983, + "epoch": 0.010196229318968833, + "flos": 909254315520.0, + "grad_norm": 0.07208467676878935, + "language_loss": 1.05329835, + "learning_rate": 0.0007862188363098669, + "loss": 1.06464922, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 0.24707031, + "step": 53, + "time_per_iteration": 3.3342933654785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126914, + "balance_loss_mlp": 1.10158229, + "epoch": 0.010388611004232396, + "flos": 585868308480.0, + "grad_norm": 0.09794855088059086, + "language_loss": 1.06043434, + "learning_rate": 0.0007899203543304438, + "loss": 1.07170355, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 0.25354004, + "step": 54, + "time_per_iteration": 2.933236837387085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145083, + "balance_loss_mlp": 1.12053776, + "epoch": 0.01058099268949596, + "flos": 502480558080.0, + "grad_norm": 0.1404118977896248, + "language_loss": 1.20000231, + "learning_rate": 0.0007935539507422731, + "loss": 1.2114532, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 0.24536133, + "step": 55, + "time_per_iteration": 2.8257975578308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153246, + "balance_loss_mlp": 1.12969017, + "epoch": 0.010773374374759523, + "flos": 544447360512.0, + "grad_norm": 0.05382700946372506, + "language_loss": 1.10560298, + "learning_rate": 0.0007971220733732573, + "loss": 1.11713552, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 0.2355957, + "step": 56, + "time_per_iteration": 2.749382495880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154989, + "balance_loss_mlp": 1.13151693, + "epoch": 0.010965756060023086, + "flos": 526155803136.0, + "grad_norm": 0.17392462927294325, + "language_loss": 1.05995011, + "learning_rate": 0.0008006270400641869, + "loss": 1.07150006, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 0.23474121, + "step": 57, + "time_per_iteration": 2.743929147720337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125904, + "balance_loss_mlp": 1.10234821, + "epoch": 0.011158137745286649, + "flos": 576941128704.0, + "grad_norm": 0.10169017538987117, + "language_loss": 1.06833839, + "learning_rate": 0.0008040710477125043, + "loss": 1.07959747, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 0.23547363, + "step": 58, + "time_per_iteration": 2.7300469875335693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111797, + "balance_loss_mlp": 1.08861065, + "epoch": 0.011350519430550212, + "flos": 529281547776.0, + "grad_norm": 0.059941584643697095, + "language_loss": 1.07409072, + "learning_rate": 0.0008074561805429771, + "loss": 1.08520865, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 0.23181152, + "step": 59, + "time_per_iteration": 2.6550745964050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123772, + "balance_loss_mlp": 1.09970331, + "epoch": 0.011542901115813775, + "flos": 555879905280.0, + "grad_norm": 0.06438674129900752, + "language_loss": 1.04891515, + "learning_rate": 0.0008107844176832545, + "loss": 1.06015277, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 0.24072266, + "step": 60, + "time_per_iteration": 2.7009053230285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139619, + "balance_loss_mlp": 1.11569333, + "epoch": 0.011735282801077338, + "flos": 572095954944.0, + "grad_norm": 0.09833112160800331, + "language_loss": 1.0671711, + "learning_rate": 0.0008140576401132568, + "loss": 1.07856739, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 0.23913574, + "step": 61, + "time_per_iteration": 2.678501844406128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114169, + "balance_loss_mlp": 1.11887348, + "epoch": 0.0119276644863409, + "flos": 615589839360.0, + "grad_norm": 0.11014501355567002, + "language_loss": 1.07748628, + "learning_rate": 0.0008172776370494935, + "loss": 1.08890319, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 0.22814941, + "step": 62, + "time_per_iteration": 2.7718141078948975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116479, + "balance_loss_mlp": 1.09356666, + "epoch": 0.012120046171604464, + "flos": 501084527616.0, + "grad_norm": 0.06441650429015075, + "language_loss": 1.15269816, + "learning_rate": 0.0008204461118185703, + "loss": 1.16386294, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 0.22912598, + "step": 63, + "time_per_iteration": 2.5839178562164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117138, + "balance_loss_mlp": 1.09543014, + "epoch": 0.012312427856868027, + "flos": 473347100160.0, + "grad_norm": 0.06608006175674933, + "language_loss": 1.04523873, + "learning_rate": 0.0008235646872681536, + "loss": 1.05641007, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 0.21728516, + "step": 64, + "time_per_iteration": 2.5611703395843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127906, + "balance_loss_mlp": 1.10659182, + "epoch": 0.012504809542131588, + "flos": 538369141248.0, + "grad_norm": 0.07834673611922068, + "language_loss": 1.04319417, + "learning_rate": 0.0008266349107584288, + "loss": 1.05447328, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 0.21313477, + "step": 65, + "time_per_iteration": 2.727666139602661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141841, + "balance_loss_mlp": 1.1207881, + "epoch": 0.012697191227395151, + "flos": 608730826752.0, + "grad_norm": 0.06003338375813584, + "language_loss": 1.07126927, + "learning_rate": 0.0008296582587724851, + "loss": 1.08268762, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 0.21057129, + "step": 66, + "time_per_iteration": 2.716701030731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127113, + "balance_loss_mlp": 1.10609627, + "epoch": 0.012889572912658714, + "flos": 768079065600.0, + "grad_norm": 0.04807876202194694, + "language_loss": 1.04662776, + "learning_rate": 0.0008326361411800136, + "loss": 1.05789876, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 0.21008301, + "step": 67, + "time_per_iteration": 2.9571592807769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114337, + "balance_loss_mlp": 1.09446514, + "epoch": 0.013081954597922277, + "flos": 533887013376.0, + "grad_norm": 0.05551510449528945, + "language_loss": 1.05008268, + "learning_rate": 0.0008355699051851403, + "loss": 1.06122601, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 0.1986084, + "step": 68, + "time_per_iteration": 2.725504159927368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143621, + "balance_loss_mlp": 1.1242373, + "epoch": 0.01327433628318584, + "flos": 573096632832.0, + "grad_norm": 0.0697970629442659, + "language_loss": 1.12296045, + "learning_rate": 0.0008384608389860635, + "loss": 1.13439655, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 0.19372559, + "step": 69, + "time_per_iteration": 2.685215711593628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141098, + "balance_loss_mlp": 1.122311, + "epoch": 0.013466717968449404, + "flos": 497274536448.0, + "grad_norm": 0.08511613263061502, + "language_loss": 1.02745342, + "learning_rate": 0.000841310175171381, + "loss": 1.03886437, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 0.18774414, + "step": 70, + "time_per_iteration": 2.649937868118286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142561, + "balance_loss_mlp": 1.12464356, + "epoch": 0.013659099653712967, + "flos": 565511155200.0, + "grad_norm": 0.055787325190813475, + "language_loss": 1.0065217, + "learning_rate": 0.000844119093875517, + "loss": 1.0179472, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 0.17944336, + "step": 71, + "time_per_iteration": 2.753220319747925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152267, + "balance_loss_mlp": 1.13508892, + "epoch": 0.01385148133897653, + "flos": 573820526592.0, + "grad_norm": 0.08668312915327946, + "language_loss": 1.05463254, + "learning_rate": 0.0008468887257134666, + "loss": 1.0661552, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 0.17199707, + "step": 72, + "time_per_iteration": 2.7056305408477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117134, + "balance_loss_mlp": 1.15478206, + "epoch": 0.014043863024240093, + "flos": 576822560256.0, + "grad_norm": 0.07356095482564125, + "language_loss": 1.08388793, + "learning_rate": 0.0008496201545131264, + "loss": 1.09560132, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 0.16564941, + "step": 73, + "time_per_iteration": 2.7202537059783936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152437, + "balance_loss_mlp": 1.13545001, + "epoch": 0.014236244709503656, + "flos": 938681809920.0, + "grad_norm": 0.06787935984484554, + "language_loss": 1.06090975, + "learning_rate": 0.0008523144198617317, + "loss": 1.07243395, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 0.16992188, + "step": 74, + "time_per_iteration": 3.2090003490448 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139286, + "balance_loss_mlp": 1.1223346, + "epoch": 0.014428626394767219, + "flos": 528483502080.0, + "grad_norm": 0.04825332815792917, + "language_loss": 1.053195, + "learning_rate": 0.0008549725194813783, + "loss": 1.06458783, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 0.16967773, + "step": 75, + "time_per_iteration": 2.654343605041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112208, + "balance_loss_mlp": 1.10599899, + "epoch": 0.014621008080030782, + "flos": 803752533504.0, + "grad_norm": 0.03887402020767282, + "language_loss": 1.04797029, + "learning_rate": 0.0008575954114472099, + "loss": 1.05919111, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 0.1607666, + "step": 76, + "time_per_iteration": 3.119884967803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134479, + "balance_loss_mlp": 1.1187191, + "epoch": 0.014813389765294343, + "flos": 696941356032.0, + "grad_norm": 0.056937643991546806, + "language_loss": 1.02038705, + "learning_rate": 0.0008601840162606118, + "loss": 1.03173184, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 0.1574707, + "step": 77, + "time_per_iteration": 3.025688886642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146725, + "balance_loss_mlp": 1.13034582, + "epoch": 0.015005771450557906, + "flos": 596994333696.0, + "grad_norm": 0.04989291514363055, + "language_loss": 1.08127129, + "learning_rate": 0.000862739218788641, + "loss": 1.09273863, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 0.16381836, + "step": 78, + "time_per_iteration": 2.7922520637512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149681, + "balance_loss_mlp": 1.13339734, + "epoch": 0.01519815313582147, + "flos": 549416245248.0, + "grad_norm": 0.06709094188277621, + "language_loss": 1.06189477, + "learning_rate": 0.0008652618700799138, + "loss": 1.07339156, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 0.1628418, + "step": 79, + "time_per_iteration": 2.6902618408203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153255, + "balance_loss_mlp": 1.1367681, + "epoch": 0.015390534821085032, + "flos": 430532692992.0, + "grad_norm": 0.062162504049989416, + "language_loss": 1.05161238, + "learning_rate": 0.0008677527890662774, + "loss": 1.06314492, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 0.16491699, + "step": 80, + "time_per_iteration": 2.475771188735962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147193, + "balance_loss_mlp": 1.13076603, + "epoch": 0.015582916506348595, + "flos": 524119942656.0, + "grad_norm": 0.04934081686369646, + "language_loss": 1.06529951, + "learning_rate": 0.0008702127641587799, + "loss": 1.0767715, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 0.16430664, + "step": 81, + "time_per_iteration": 2.634038209915161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142434, + "balance_loss_mlp": 1.12558985, + "epoch": 0.015775298191612157, + "flos": 575443782144.0, + "grad_norm": 0.08879987127008451, + "language_loss": 1.0221808, + "learning_rate": 0.0008726425547457192, + "loss": 1.0336051, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 0.16845703, + "step": 82, + "time_per_iteration": 2.74308705329895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147568, + "balance_loss_mlp": 1.13108134, + "epoch": 0.01596767987687572, + "flos": 610319577600.0, + "grad_norm": 0.06313420095488197, + "language_loss": 1.01906681, + "learning_rate": 0.0008750428925998964, + "loss": 1.03054249, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 0.16491699, + "step": 83, + "time_per_iteration": 2.777132511138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146759, + "balance_loss_mlp": 1.13009322, + "epoch": 0.016160061562139283, + "flos": 567136982016.0, + "grad_norm": 0.11663644047392754, + "language_loss": 1.07169831, + "learning_rate": 0.0008774144832015932, + "loss": 1.08316588, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 0.16674805, + "step": 84, + "time_per_iteration": 2.733287811279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01524523, + "balance_loss_mlp": 1.51412809, + "epoch": 0.016352443247402846, + "flos": 1411343543808.0, + "grad_norm": 0.22860236459315994, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.76298833, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 0.10400391, + "step": 85, + "time_per_iteration": 4.57580041885376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166169, + "balance_loss_mlp": 1.1501826, + "epoch": 0.01654482493266641, + "flos": 730497844224.0, + "grad_norm": 0.05249425037579876, + "language_loss": 1.01959693, + "learning_rate": 0.0008820741205014318, + "loss": 1.03125858, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 0.15979004, + "step": 86, + "time_per_iteration": 2.8773865699768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223619, + "balance_loss_mlp": 1.20703709, + "epoch": 0.016737206617929972, + "flos": 536293633536.0, + "grad_norm": 0.10761462625124436, + "language_loss": 1.03955913, + "learning_rate": 0.0008843634575408404, + "loss": 1.05179524, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 0.16577148, + "step": 87, + "time_per_iteration": 2.6694159507751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228231, + "balance_loss_mlp": 1.21267366, + "epoch": 0.016929588303193535, + "flos": 536990363136.0, + "grad_norm": 0.10737104518045529, + "language_loss": 1.05078888, + "learning_rate": 0.0008866266301555082, + "loss": 1.06307125, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 0.15551758, + "step": 88, + "time_per_iteration": 2.7686069011688232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212138, + "balance_loss_mlp": 1.19609249, + "epoch": 0.017121969988457098, + "flos": 526756359168.0, + "grad_norm": 0.1616084590878673, + "language_loss": 1.0609467, + "learning_rate": 0.0008888642296509615, + "loss": 1.07306814, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 0.16040039, + "step": 89, + "time_per_iteration": 2.625988721847534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199649, + "balance_loss_mlp": 1.18316197, + "epoch": 0.01731435167372066, + "flos": 625596618240.0, + "grad_norm": 0.07585409016808545, + "language_loss": 1.1065979, + "learning_rate": 0.0008910768275115906, + "loss": 1.11859453, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 0.16491699, + "step": 90, + "time_per_iteration": 2.793017864227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173524, + "balance_loss_mlp": 1.15697813, + "epoch": 0.017506733358984224, + "flos": 496402338816.0, + "grad_norm": 0.07277460951060387, + "language_loss": 1.06493175, + "learning_rate": 0.0008932649762767675, + "loss": 1.07666695, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 0.16552734, + "step": 91, + "time_per_iteration": 2.5919723510742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169355, + "balance_loss_mlp": 1.15323818, + "epoch": 0.017699115044247787, + "flos": 745933100544.0, + "grad_norm": 0.10172519854243242, + "language_loss": 1.09112859, + "learning_rate": 0.0008954292103690864, + "loss": 1.10282218, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 0.16113281, + "step": 92, + "time_per_iteration": 2.9366836547851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174542, + "balance_loss_mlp": 1.15828145, + "epoch": 0.01789149672951135, + "flos": 515509194240.0, + "grad_norm": 0.07803491111319032, + "language_loss": 1.10981905, + "learning_rate": 0.0008975700468778296, + "loss": 1.12156439, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 0.16259766, + "step": 93, + "time_per_iteration": 2.592458963394165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156862, + "balance_loss_mlp": 1.14067388, + "epoch": 0.018083878414774913, + "flos": 586125268992.0, + "grad_norm": 0.09102852745954727, + "language_loss": 1.04703569, + "learning_rate": 0.0008996879863005366, + "loss": 1.05860424, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 0.16186523, + "step": 94, + "time_per_iteration": 2.71566104888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148536, + "balance_loss_mlp": 1.13235974, + "epoch": 0.018276260100038477, + "flos": 497356028928.0, + "grad_norm": 0.03859462796979438, + "language_loss": 1.04768109, + "learning_rate": 0.0009017835132453337, + "loss": 1.05916631, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 0.16174316, + "step": 95, + "time_per_iteration": 2.664511203765869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137887, + "balance_loss_mlp": 1.121889, + "epoch": 0.01846864178530204, + "flos": 640058360832.0, + "grad_norm": 0.060963703759419355, + "language_loss": 1.04675508, + "learning_rate": 0.0009038570970964896, + "loss": 1.05813384, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 0.15991211, + "step": 96, + "time_per_iteration": 2.7669789791107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112506, + "balance_loss_mlp": 1.10899043, + "epoch": 0.018661023470565603, + "flos": 511662127104.0, + "grad_norm": 0.0943042692373462, + "language_loss": 1.02071011, + "learning_rate": 0.0009059091926454854, + "loss": 1.03196073, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 0.16064453, + "step": 97, + "time_per_iteration": 2.6028668880462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126665, + "balance_loss_mlp": 1.11052442, + "epoch": 0.018853405155829166, + "flos": 931106244096.0, + "grad_norm": 0.06745462513624549, + "language_loss": 1.0144124, + "learning_rate": 0.0009079402406897198, + "loss": 1.02567911, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 0.16137695, + "step": 98, + "time_per_iteration": 3.2679431438446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127975, + "balance_loss_mlp": 1.11166739, + "epoch": 0.01904578684109273, + "flos": 576484107264.0, + "grad_norm": 0.10523687850003575, + "language_loss": 1.03251696, + "learning_rate": 0.0009099506686008212, + "loss": 1.04379678, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 0.16308594, + "step": 99, + "time_per_iteration": 2.8251914978027344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116643, + "balance_loss_mlp": 1.10100293, + "epoch": 0.019238168526356292, + "flos": 558442169856.0, + "grad_norm": 0.08495157768411668, + "language_loss": 1.0609076, + "learning_rate": 0.0009119408908644013, + "loss": 1.07207406, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 0.15625, + "step": 100, + "time_per_iteration": 2.6573309898376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113727, + "balance_loss_mlp": 1.12211871, + "epoch": 0.019430550211619855, + "flos": 723851375616.0, + "grad_norm": 0.09022378013673595, + "language_loss": 1.11755276, + "learning_rate": 0.0009139113095929519, + "loss": 1.12892556, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 0.15124512, + "step": 101, + "time_per_iteration": 2.844698429107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159661, + "balance_loss_mlp": 1.14373517, + "epoch": 0.019622931896883418, + "flos": 499478524416.0, + "grad_norm": 0.0892612752622512, + "language_loss": 1.05698013, + "learning_rate": 0.0009158623150134762, + "loss": 1.06857681, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 0.15917969, + "step": 102, + "time_per_iteration": 2.589857339859009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137313, + "balance_loss_mlp": 1.12158906, + "epoch": 0.01981531358214698, + "flos": 509188695552.0, + "grad_norm": 0.06508497546963277, + "language_loss": 1.05496848, + "learning_rate": 0.000917794285931332, + "loss": 1.06634164, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 0.15710449, + "step": 103, + "time_per_iteration": 2.6433918476104736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117696, + "balance_loss_mlp": 1.1019367, + "epoch": 0.020007695267410544, + "flos": 521347705344.0, + "grad_norm": 0.07675487095909958, + "language_loss": 0.97610366, + "learning_rate": 0.0009197075901716639, + "loss": 0.98728061, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 0.1574707, + "step": 104, + "time_per_iteration": 2.709157943725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137693, + "balance_loss_mlp": 1.12159956, + "epoch": 0.020200076952674107, + "flos": 533298940416.0, + "grad_norm": 0.05257934075389246, + "language_loss": 1.0758431, + "learning_rate": 0.0009216025849997171, + "loss": 1.08722019, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 0.16088867, + "step": 105, + "time_per_iteration": 2.7638583183288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111903, + "balance_loss_mlp": 1.09596467, + "epoch": 0.020392458637937667, + "flos": 684760324608.0, + "grad_norm": 0.07457888312135433, + "language_loss": 1.02261579, + "learning_rate": 0.0009234796175212258, + "loss": 1.03373492, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 0.15930176, + "step": 106, + "time_per_iteration": 2.9391980171203613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117989, + "balance_loss_mlp": 1.10228872, + "epoch": 0.02058484032320123, + "flos": 702115444224.0, + "grad_norm": 0.06024423434996524, + "language_loss": 1.05948544, + "learning_rate": 0.000925339025064007, + "loss": 1.07066536, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 0.15686035, + "step": 107, + "time_per_iteration": 2.975294828414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118819, + "balance_loss_mlp": 1.10334611, + "epoch": 0.020777222008464793, + "flos": 639082275840.0, + "grad_norm": 0.07105297051955457, + "language_loss": 0.99294066, + "learning_rate": 0.0009271811355418027, + "loss": 1.00412893, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 0.15454102, + "step": 108, + "time_per_iteration": 2.8750014305114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125269, + "balance_loss_mlp": 1.10940242, + "epoch": 0.020969603693728356, + "flos": 682091974656.0, + "grad_norm": 0.09212378946406244, + "language_loss": 1.05636311, + "learning_rate": 0.0009290062678013548, + "loss": 1.06761575, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 0.15856934, + "step": 109, + "time_per_iteration": 2.8552017211914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119898, + "balance_loss_mlp": 1.10393572, + "epoch": 0.02116198537899192, + "flos": 533395487232.0, + "grad_norm": 0.059465971869905314, + "language_loss": 1.04477715, + "learning_rate": 0.0009308147319536321, + "loss": 1.05597615, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 0.1595459, + "step": 110, + "time_per_iteration": 2.6493232250213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129754, + "balance_loss_mlp": 1.11385095, + "epoch": 0.021354367064255482, + "flos": 717479119872.0, + "grad_norm": 0.08324280754141193, + "language_loss": 1.10257316, + "learning_rate": 0.0009326068296900676, + "loss": 1.11387074, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 0.15893555, + "step": 111, + "time_per_iteration": 2.8384125232696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112769, + "balance_loss_mlp": 1.11171615, + "epoch": 0.021546748749519045, + "flos": 519556322304.0, + "grad_norm": 0.06941460102767082, + "language_loss": 1.01355243, + "learning_rate": 0.0009343828545846161, + "loss": 1.02482939, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 0.15966797, + "step": 112, + "time_per_iteration": 2.7743477821350098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114893, + "balance_loss_mlp": 1.13326573, + "epoch": 0.021739130434782608, + "flos": 505161391104.0, + "grad_norm": 0.047977415311889204, + "language_loss": 1.05199587, + "learning_rate": 0.0009361430923823841, + "loss": 1.06348515, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 0.15649414, + "step": 113, + "time_per_iteration": 2.6022982597351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118143, + "balance_loss_mlp": 1.10308659, + "epoch": 0.02193151212004617, + "flos": 463486053888.0, + "grad_norm": 0.080001842017843, + "language_loss": 1.09258401, + "learning_rate": 0.0009378878212755459, + "loss": 1.10376549, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 0.15039062, + "step": 114, + "time_per_iteration": 2.491594076156616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115419, + "balance_loss_mlp": 1.09967113, + "epoch": 0.022123893805309734, + "flos": 552272546304.0, + "grad_norm": 0.05036418666557463, + "language_loss": 0.9906168, + "learning_rate": 0.0009396173121672103, + "loss": 1.00177097, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 0.15734863, + "step": 115, + "time_per_iteration": 2.668848991394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112479, + "balance_loss_mlp": 1.10945916, + "epoch": 0.022316275490573297, + "flos": 636211293696.0, + "grad_norm": 0.05918191636932359, + "language_loss": 1.04414749, + "learning_rate": 0.0009413318289238633, + "loss": 1.05539548, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 0.15307617, + "step": 116, + "time_per_iteration": 2.7496132850646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106823, + "balance_loss_mlp": 1.09139705, + "epoch": 0.02250865717583686, + "flos": 798890107392.0, + "grad_norm": 0.1124204963758038, + "language_loss": 0.96924931, + "learning_rate": 0.0009430316286169771, + "loss": 0.98031747, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 0.15405273, + "step": 117, + "time_per_iteration": 3.026118278503418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135302, + "balance_loss_mlp": 1.11998308, + "epoch": 0.022701038861100423, + "flos": 456093296640.0, + "grad_norm": 0.03693994945601898, + "language_loss": 1.02417183, + "learning_rate": 0.0009447169617543361, + "loss": 1.03552485, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 0.15307617, + "step": 118, + "time_per_iteration": 2.575666666030884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156925, + "balance_loss_mlp": 1.14185703, + "epoch": 0.022893420546363986, + "flos": 583086159360.0, + "grad_norm": 0.10959367855453626, + "language_loss": 1.09001684, + "learning_rate": 0.0009463880725016029, + "loss": 1.1015861, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 0.15039062, + "step": 119, + "time_per_iteration": 2.6811347007751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115422, + "balance_loss_mlp": 1.10052109, + "epoch": 0.02308580223162755, + "flos": 561303240192.0, + "grad_norm": 0.05068852434870314, + "language_loss": 1.03909945, + "learning_rate": 0.0009480451988946134, + "loss": 1.05025363, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 0.14880371, + "step": 120, + "time_per_iteration": 2.801814079284668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106982, + "balance_loss_mlp": 1.09179425, + "epoch": 0.023278183916891113, + "flos": 771300983808.0, + "grad_norm": 0.05688398470992871, + "language_loss": 1.05377555, + "learning_rate": 0.0009496885730428627, + "loss": 1.06484532, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 0.1517334, + "step": 121, + "time_per_iteration": 3.04720139503479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121469, + "balance_loss_mlp": 1.10574555, + "epoch": 0.023470565602154676, + "flos": 553374540288.0, + "grad_norm": 0.08369646841136469, + "language_loss": 1.03908122, + "learning_rate": 0.0009513184213246156, + "loss": 1.05029583, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 0.15710449, + "step": 122, + "time_per_iteration": 2.61790132522583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129626, + "balance_loss_mlp": 1.11406958, + "epoch": 0.02366294728741824, + "flos": 560028349440.0, + "grad_norm": 0.05522871343558165, + "language_loss": 1.07008672, + "learning_rate": 0.0009529349645740552, + "loss": 1.08138299, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 0.15539551, + "step": 123, + "time_per_iteration": 2.69759464263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129797, + "balance_loss_mlp": 1.11481285, + "epoch": 0.0238553289726818, + "flos": 468553683456.0, + "grad_norm": 0.053769267634074955, + "language_loss": 1.05687594, + "learning_rate": 0.0009545384182608524, + "loss": 1.06817389, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 0.1496582, + "step": 124, + "time_per_iteration": 2.550584316253662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126, + "balance_loss_mlp": 1.11114669, + "epoch": 0.024047710657945365, + "flos": 560030920704.0, + "grad_norm": 0.08700167249890467, + "language_loss": 1.02945745, + "learning_rate": 0.0009561289926625252, + "loss": 1.04071736, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 0.14831543, + "step": 125, + "time_per_iteration": 2.6619794368743896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123522, + "balance_loss_mlp": 1.10831082, + "epoch": 0.024240092343208928, + "flos": 504775950336.0, + "grad_norm": 0.07114777459455598, + "language_loss": 1.07932711, + "learning_rate": 0.0009577068930299292, + "loss": 1.09056234, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 0.15209961, + "step": 126, + "time_per_iteration": 2.553642749786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125226, + "balance_loss_mlp": 1.11038458, + "epoch": 0.02443247402847249, + "flos": 435763307520.0, + "grad_norm": 0.08279894264625885, + "language_loss": 1.03556633, + "learning_rate": 0.0009592723197462087, + "loss": 1.04681861, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 0.14819336, + "step": 127, + "time_per_iteration": 2.7255966663360596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124545, + "balance_loss_mlp": 1.10936916, + "epoch": 0.024624855713736054, + "flos": 683769558528.0, + "grad_norm": 0.07600858050716931, + "language_loss": 0.99905002, + "learning_rate": 0.0009608254684795125, + "loss": 1.01029539, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 0.15148926, + "step": 128, + "time_per_iteration": 2.9839587211608887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113247, + "balance_loss_mlp": 1.11718702, + "epoch": 0.024817237398999614, + "flos": 524999480832.0, + "grad_norm": 0.08573045125619827, + "language_loss": 1.02976727, + "learning_rate": 0.0009623665303297678, + "loss": 1.04109192, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 0.15258789, + "step": 129, + "time_per_iteration": 2.7344865798950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119398, + "balance_loss_mlp": 1.10497391, + "epoch": 0.025009619084263177, + "flos": 655656602112.0, + "grad_norm": 0.07510500588649292, + "language_loss": 1.07057762, + "learning_rate": 0.0009638956919697878, + "loss": 1.08177161, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 0.14416504, + "step": 130, + "time_per_iteration": 2.864952802658081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104135, + "balance_loss_mlp": 1.08930528, + "epoch": 0.02520200076952674, + "flos": 454423053312.0, + "grad_norm": 0.0567118244953117, + "language_loss": 0.99135083, + "learning_rate": 0.0009654131357809714, + "loss": 1.00239229, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 0.14819336, + "step": 131, + "time_per_iteration": 2.6095099449157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123713, + "balance_loss_mlp": 1.1081202, + "epoch": 0.025394382454790303, + "flos": 839794563072.0, + "grad_norm": 0.05892082702998288, + "language_loss": 1.08188879, + "learning_rate": 0.0009669190399838441, + "loss": 1.09312594, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 0.15576172, + "step": 132, + "time_per_iteration": 3.096733331680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100803, + "balance_loss_mlp": 1.08531809, + "epoch": 0.025586764140053866, + "flos": 581025332736.0, + "grad_norm": 0.09564892115109941, + "language_loss": 1.01233923, + "learning_rate": 0.0009684135787636724, + "loss": 1.02334726, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 0.15478516, + "step": 133, + "time_per_iteration": 2.8120856285095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111325, + "balance_loss_mlp": 1.09529161, + "epoch": 0.02577914582531743, + "flos": 790249623552.0, + "grad_norm": 0.04870542745948935, + "language_loss": 1.05797207, + "learning_rate": 0.0009698969223913726, + "loss": 1.06908536, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 0.16027832, + "step": 134, + "time_per_iteration": 3.0269176959991455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123378, + "balance_loss_mlp": 1.10735679, + "epoch": 0.025971527510580992, + "flos": 594958473216.0, + "grad_norm": 0.04083122637660085, + "language_loss": 1.08225274, + "learning_rate": 0.0009713692373399265, + "loss": 1.09348655, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 0.16015625, + "step": 135, + "time_per_iteration": 2.690932273864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01798361, + "balance_loss_mlp": 1.75773478, + "epoch": 0.026163909195844555, + "flos": 1577629716480.0, + "grad_norm": 0.2058674005568875, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.8125459, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 0.40625, + "step": 136, + "time_per_iteration": 5.460411548614502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01507549, + "balance_loss_mlp": 1.47512448, + "epoch": 0.026356290881108118, + "flos": 1502074865664.0, + "grad_norm": 0.12866590611947104, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.79318589, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 0.32421875, + "step": 137, + "time_per_iteration": 4.989046335220337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146765, + "balance_loss_mlp": 1.13081443, + "epoch": 0.02654867256637168, + "flos": 597140066304.0, + "grad_norm": 0.04917093034878699, + "language_loss": 1.00934815, + "learning_rate": 0.0009757216201974225, + "loss": 1.02081585, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 0.1595459, + "step": 138, + "time_per_iteration": 2.9566736221313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162305, + "balance_loss_mlp": 1.1448524, + "epoch": 0.026741054251635244, + "flos": 545035433472.0, + "grad_norm": 0.06281235859244827, + "language_loss": 1.0596863, + "learning_rate": 0.0009771514130396581, + "loss": 1.07130933, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 0.17468262, + "step": 139, + "time_per_iteration": 2.683931350708008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150087, + "balance_loss_mlp": 1.1330874, + "epoch": 0.026933435936898807, + "flos": 506841546240.0, + "grad_norm": 0.09254080332591261, + "language_loss": 1.06202602, + "learning_rate": 0.00097857095638274, + "loss": 1.07352686, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 0.17016602, + "step": 140, + "time_per_iteration": 2.558708906173706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149417, + "balance_loss_mlp": 1.13241768, + "epoch": 0.02712581762216237, + "flos": 740860328448.0, + "grad_norm": 0.03864103733020509, + "language_loss": 0.97399604, + "learning_rate": 0.0009799803961288726, + "loss": 0.9854902, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 0.17016602, + "step": 141, + "time_per_iteration": 2.992034673690796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112347, + "balance_loss_mlp": 1.10685217, + "epoch": 0.027318199307425933, + "flos": 848373378048.0, + "grad_norm": 0.06378420241673269, + "language_loss": 1.03629804, + "learning_rate": 0.000981379875086876, + "loss": 1.0475328, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 0.16625977, + "step": 142, + "time_per_iteration": 3.063534736633301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121821, + "balance_loss_mlp": 1.10560894, + "epoch": 0.027510580992689496, + "flos": 575557581312.0, + "grad_norm": 0.046520134554953796, + "language_loss": 0.98784387, + "learning_rate": 0.0009827695330590185, + "loss": 0.99906206, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 0.1619873, + "step": 143, + "time_per_iteration": 2.6495330333709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124398, + "balance_loss_mlp": 1.1078757, + "epoch": 0.02770296267795306, + "flos": 772420230144.0, + "grad_norm": 0.05485832849515215, + "language_loss": 0.98036379, + "learning_rate": 0.0009841495069248256, + "loss": 0.99160779, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 0.1652832, + "step": 144, + "time_per_iteration": 2.9577834606170654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145392, + "balance_loss_mlp": 1.12901306, + "epoch": 0.027895344363216622, + "flos": 569387957760.0, + "grad_norm": 0.09798795242100523, + "language_loss": 0.97478735, + "learning_rate": 0.0009855199307219871, + "loss": 0.98624128, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 0.16381836, + "step": 145, + "time_per_iteration": 2.6759142875671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148365, + "balance_loss_mlp": 1.13168764, + "epoch": 0.028087726048480186, + "flos": 547360561152.0, + "grad_norm": 0.1254453322996171, + "language_loss": 0.99733889, + "learning_rate": 0.0009868809357244854, + "loss": 1.00882256, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 0.16687012, + "step": 146, + "time_per_iteration": 2.66375994682312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113683, + "balance_loss_mlp": 1.11978364, + "epoch": 0.02828010773374375, + "flos": 524789508096.0, + "grad_norm": 0.08248071954181796, + "language_loss": 1.03600287, + "learning_rate": 0.0009882326505180556, + "loss": 1.04737115, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 0.1706543, + "step": 147, + "time_per_iteration": 2.719353437423706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151222, + "balance_loss_mlp": 1.13280392, + "epoch": 0.02847248941900731, + "flos": 772440053760.0, + "grad_norm": 0.12761243433758393, + "language_loss": 1.02101135, + "learning_rate": 0.0009895752010730906, + "loss": 1.03252351, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 0.1842041, + "step": 148, + "time_per_iteration": 2.9704201221466064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141454, + "balance_loss_mlp": 1.12377512, + "epoch": 0.028664871104270875, + "flos": 534413417472.0, + "grad_norm": 0.07962775403881484, + "language_loss": 1.0825479, + "learning_rate": 0.0009909087108150867, + "loss": 1.09396255, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 0.17687988, + "step": 149, + "time_per_iteration": 2.7516071796417236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151554, + "balance_loss_mlp": 1.13330352, + "epoch": 0.028857252789534438, + "flos": 367766396928.0, + "grad_norm": 0.10196194967952074, + "language_loss": 1.09083438, + "learning_rate": 0.0009922333006927371, + "loss": 1.10235, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 0.18249512, + "step": 150, + "time_per_iteration": 2.4685099124908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170515, + "balance_loss_mlp": 1.15218103, + "epoch": 0.029049634474798, + "flos": 515482030080.0, + "grad_norm": 0.13259475383105176, + "language_loss": 1.020684, + "learning_rate": 0.0009935490892437632, + "loss": 1.03238916, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 0.18322754, + "step": 151, + "time_per_iteration": 2.5665087699890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166904, + "balance_loss_mlp": 1.14880824, + "epoch": 0.029242016160061564, + "flos": 588141305856.0, + "grad_norm": 0.10481585745820837, + "language_loss": 1.00390673, + "learning_rate": 0.0009948561926585687, + "loss": 1.01557577, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 0.18103027, + "step": 152, + "time_per_iteration": 2.7641003131866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139325, + "balance_loss_mlp": 1.122576, + "epoch": 0.029434397845325123, + "flos": 552079825920.0, + "grad_norm": 0.09697971136145118, + "language_loss": 1.05073512, + "learning_rate": 0.0009961547248418122, + "loss": 1.06212831, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 0.16760254, + "step": 153, + "time_per_iteration": 2.631476402282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123418, + "balance_loss_mlp": 1.10662186, + "epoch": 0.029626779530588686, + "flos": 603497640960.0, + "grad_norm": 0.05437877185758658, + "language_loss": 1.01441622, + "learning_rate": 0.0009974447974719707, + "loss": 1.0256505, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 0.16809082, + "step": 154, + "time_per_iteration": 2.709644317626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129323, + "balance_loss_mlp": 1.11151338, + "epoch": 0.02981916121585225, + "flos": 621089897472.0, + "grad_norm": 0.09703401576709127, + "language_loss": 1.03478801, + "learning_rate": 0.0009987265200589763, + "loss": 1.0460813, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 0.17810059, + "step": 155, + "time_per_iteration": 2.77809739112854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140894, + "balance_loss_mlp": 1.12376344, + "epoch": 0.030011542901115813, + "flos": 661633505280.0, + "grad_norm": 0.08300490544518559, + "language_loss": 1.02959824, + "learning_rate": 0.001, + "loss": 1.04100728, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 0.17150879, + "step": 156, + "time_per_iteration": 2.845790386199951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144802, + "balance_loss_mlp": 1.12720668, + "epoch": 0.030203924586379376, + "flos": 651569826816.0, + "grad_norm": 0.07590676388764007, + "language_loss": 1.00599122, + "learning_rate": 0.0009999999029413921, + "loss": 1.01743913, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 0.17614746, + "step": 157, + "time_per_iteration": 2.833735227584839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142594, + "balance_loss_mlp": 1.12554669, + "epoch": 0.03039630627164294, + "flos": 531354484224.0, + "grad_norm": 0.06607639809804342, + "language_loss": 1.01453137, + "learning_rate": 0.0009999996117656068, + "loss": 1.02595735, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 0.1706543, + "step": 158, + "time_per_iteration": 2.803636074066162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011301, + "balance_loss_mlp": 1.11345792, + "epoch": 0.030588687956906502, + "flos": 586189509120.0, + "grad_norm": 0.08769352458743468, + "language_loss": 0.94982773, + "learning_rate": 0.0009999991264727564, + "loss": 0.96112871, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 0.16638184, + "step": 159, + "time_per_iteration": 2.7776851654052734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135859, + "balance_loss_mlp": 1.11870432, + "epoch": 0.030781069642170065, + "flos": 513278042112.0, + "grad_norm": 0.05788098803643346, + "language_loss": 1.06247735, + "learning_rate": 0.0009999984470630296, + "loss": 1.07383585, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 0.17163086, + "step": 160, + "time_per_iteration": 2.6311371326446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125321, + "balance_loss_mlp": 1.10836911, + "epoch": 0.030973451327433628, + "flos": 718123719168.0, + "grad_norm": 0.05159431076001957, + "language_loss": 0.94850963, + "learning_rate": 0.0009999975735366902, + "loss": 0.95976287, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 0.16955566, + "step": 161, + "time_per_iteration": 3.0904829502105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148114, + "balance_loss_mlp": 1.13099504, + "epoch": 0.03116583301269719, + "flos": 1109771311104.0, + "grad_norm": 0.0692270455282635, + "language_loss": 0.96706492, + "learning_rate": 0.0009999965058940775, + "loss": 0.97854608, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 0.17138672, + "step": 162, + "time_per_iteration": 3.490063428878784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150632, + "balance_loss_mlp": 1.13323975, + "epoch": 0.031358214697960754, + "flos": 450907098624.0, + "grad_norm": 0.08572766411177644, + "language_loss": 1.03267431, + "learning_rate": 0.0009999952441356057, + "loss": 1.04418063, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 0.17382812, + "step": 163, + "time_per_iteration": 2.497690439224243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130614, + "balance_loss_mlp": 1.11405563, + "epoch": 0.031550596383224314, + "flos": 1255176870912.0, + "grad_norm": 0.05784293330097489, + "language_loss": 1.03805065, + "learning_rate": 0.000999993788261765, + "loss": 1.0493567, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 0.16564941, + "step": 164, + "time_per_iteration": 3.6041390895843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132475, + "balance_loss_mlp": 1.1152972, + "epoch": 0.03174297806848788, + "flos": 668136812544.0, + "grad_norm": 0.05766532368121917, + "language_loss": 1.05311596, + "learning_rate": 0.00099999213827312, + "loss": 1.06444073, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 0.171875, + "step": 165, + "time_per_iteration": 2.806014060974121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142431, + "balance_loss_mlp": 1.12589669, + "epoch": 0.03193535975375144, + "flos": 551299032576.0, + "grad_norm": 0.05992608893057494, + "language_loss": 1.00112009, + "learning_rate": 0.000999990294170312, + "loss": 1.01254439, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 0.16540527, + "step": 166, + "time_per_iteration": 2.6405951976776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113035, + "balance_loss_mlp": 1.11351717, + "epoch": 0.032127741439015006, + "flos": 543649314816.0, + "grad_norm": 0.05363857392651908, + "language_loss": 1.03767109, + "learning_rate": 0.0009999882559540566, + "loss": 1.04897451, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 0.16845703, + "step": 167, + "time_per_iteration": 2.69801664352417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127606, + "balance_loss_mlp": 1.11079764, + "epoch": 0.032320123124278566, + "flos": 548385831936.0, + "grad_norm": 0.03971308084427602, + "language_loss": 1.00767386, + "learning_rate": 0.000999986023625145, + "loss": 1.01894999, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 0.16821289, + "step": 168, + "time_per_iteration": 2.710706949234009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04227602, + "balance_loss_mlp": 3.93005633, + "epoch": 0.03251250480954213, + "flos": 1305886551552.0, + "grad_norm": 0.49669676383753814, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.8315202, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 2.96875, + "step": 169, + "time_per_iteration": 4.921034574508667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178384, + "balance_loss_mlp": 1.15987098, + "epoch": 0.03270488649480569, + "flos": 561132914688.0, + "grad_norm": 0.11256254520903143, + "language_loss": 1.01289928, + "learning_rate": 0.0009999809766328958, + "loss": 1.02468312, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 0.18518066, + "step": 170, + "time_per_iteration": 2.6784250736236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236994, + "balance_loss_mlp": 1.21676469, + "epoch": 0.03289726818006926, + "flos": 482363112960.0, + "grad_norm": 0.13219145589868983, + "language_loss": 1.0357101, + "learning_rate": 0.0009999781619715177, + "loss": 1.04807997, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 0.20227051, + "step": 171, + "time_per_iteration": 2.5412755012512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234758, + "balance_loss_mlp": 1.21518433, + "epoch": 0.03308964986533282, + "flos": 674647460352.0, + "grad_norm": 0.05193788120122226, + "language_loss": 1.03408492, + "learning_rate": 0.000999975153201402, + "loss": 1.0464325, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 0.19567871, + "step": 172, + "time_per_iteration": 2.864586353302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236351, + "balance_loss_mlp": 1.21688426, + "epoch": 0.033282031550596385, + "flos": 609217583616.0, + "grad_norm": 0.0814546252210238, + "language_loss": 1.01345742, + "learning_rate": 0.0009999719503237174, + "loss": 1.02582097, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 0.19470215, + "step": 173, + "time_per_iteration": 2.765923261642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266228, + "balance_loss_mlp": 1.24583161, + "epoch": 0.033474413235859944, + "flos": 468039762432.0, + "grad_norm": 0.11494520888694326, + "language_loss": 1.10141742, + "learning_rate": 0.0009999685533397073, + "loss": 1.11407971, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 0.20410156, + "step": 174, + "time_per_iteration": 2.5439114570617676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264706, + "balance_loss_mlp": 1.24525094, + "epoch": 0.03366679492112351, + "flos": 579634444800.0, + "grad_norm": 0.12313705571337571, + "language_loss": 1.01947784, + "learning_rate": 0.00099996496225069, + "loss": 1.03212488, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 0.19445801, + "step": 175, + "time_per_iteration": 2.6815552711486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01257561, + "balance_loss_mlp": 1.23677111, + "epoch": 0.03385917660638707, + "flos": 637678904832.0, + "grad_norm": 0.07888015485072913, + "language_loss": 1.04929149, + "learning_rate": 0.0009999611770580604, + "loss": 1.06186724, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 0.20788574, + "step": 176, + "time_per_iteration": 2.841484785079956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258013, + "balance_loss_mlp": 1.23668683, + "epoch": 0.03405155829165064, + "flos": 441816933888.0, + "grad_norm": 0.1202186920466195, + "language_loss": 1.03394961, + "learning_rate": 0.0009999571977632876, + "loss": 1.04652977, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 0.21350098, + "step": 177, + "time_per_iteration": 2.567788600921631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01271496, + "balance_loss_mlp": 1.25026441, + "epoch": 0.034243939976914196, + "flos": 466332443136.0, + "grad_norm": 0.09201820914192435, + "language_loss": 1.05765235, + "learning_rate": 0.0009999530243679166, + "loss": 1.07036722, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 0.21240234, + "step": 178, + "time_per_iteration": 2.5753743648529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258548, + "balance_loss_mlp": 1.23935485, + "epoch": 0.03443632166217776, + "flos": 779276671488.0, + "grad_norm": 0.06529189645852858, + "language_loss": 1.00495052, + "learning_rate": 0.0009999486568735675, + "loss": 1.01753592, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 0.19177246, + "step": 179, + "time_per_iteration": 3.0607473850250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251876, + "balance_loss_mlp": 1.23275518, + "epoch": 0.03462870334744132, + "flos": 1263777707520.0, + "grad_norm": 0.07628849485304477, + "language_loss": 1.00889277, + "learning_rate": 0.0009999440952819362, + "loss": 1.02141166, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 0.19116211, + "step": 180, + "time_per_iteration": 3.6515376567840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01248658, + "balance_loss_mlp": 1.22853494, + "epoch": 0.03482108503270489, + "flos": 607179151872.0, + "grad_norm": 0.05983966318213213, + "language_loss": 1.0115366, + "learning_rate": 0.0009999393395947935, + "loss": 1.02402306, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 0.2010498, + "step": 181, + "time_per_iteration": 2.799633502960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01253433, + "balance_loss_mlp": 1.23378766, + "epoch": 0.03501346671796845, + "flos": 538270396416.0, + "grad_norm": 0.0770350968764605, + "language_loss": 1.04747987, + "learning_rate": 0.0009999343898139858, + "loss": 1.06001413, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 0.19641113, + "step": 182, + "time_per_iteration": 2.627434253692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258891, + "balance_loss_mlp": 1.23675334, + "epoch": 0.035205848403232015, + "flos": 518484063744.0, + "grad_norm": 0.06485795323962908, + "language_loss": 1.03381288, + "learning_rate": 0.0009999292459414348, + "loss": 1.04640174, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 0.22131348, + "step": 183, + "time_per_iteration": 2.5552356243133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227697, + "balance_loss_mlp": 1.20765769, + "epoch": 0.035398230088495575, + "flos": 472373586432.0, + "grad_norm": 0.06837915158031915, + "language_loss": 1.07873201, + "learning_rate": 0.0009999239079791374, + "loss": 1.0910089, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 0.20031738, + "step": 184, + "time_per_iteration": 2.5553643703460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225953, + "balance_loss_mlp": 1.20453107, + "epoch": 0.03559061177375914, + "flos": 512074732032.0, + "grad_norm": 0.05538225102727573, + "language_loss": 1.00595856, + "learning_rate": 0.0009999183759291659, + "loss": 1.01821804, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 0.21435547, + "step": 185, + "time_per_iteration": 2.6955769062042236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199054, + "balance_loss_mlp": 1.17938447, + "epoch": 0.0357829934590227, + "flos": 477386887680.0, + "grad_norm": 0.052094207769016576, + "language_loss": 1.02581143, + "learning_rate": 0.0009999126497936682, + "loss": 1.03780198, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 0.1965332, + "step": 186, + "time_per_iteration": 2.5304598808288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198293, + "balance_loss_mlp": 1.1770494, + "epoch": 0.03597537514428627, + "flos": 644656485888.0, + "grad_norm": 0.057723222775786294, + "language_loss": 1.05774581, + "learning_rate": 0.0009999067295748676, + "loss": 1.06972873, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 0.21252441, + "step": 187, + "time_per_iteration": 2.797293186187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225876, + "balance_loss_mlp": 1.20496714, + "epoch": 0.03616775682954983, + "flos": 581186119680.0, + "grad_norm": 0.0756096280824464, + "language_loss": 1.03738201, + "learning_rate": 0.000999900615275062, + "loss": 1.04964077, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 0.20922852, + "step": 188, + "time_per_iteration": 2.677471399307251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211466, + "balance_loss_mlp": 1.18979406, + "epoch": 0.03636013851481339, + "flos": 382420859904.0, + "grad_norm": 0.0898221855427691, + "language_loss": 1.09605587, + "learning_rate": 0.0009998943068966256, + "loss": 1.10817051, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 0.21679688, + "step": 189, + "time_per_iteration": 2.4233202934265137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217638, + "balance_loss_mlp": 1.19651425, + "epoch": 0.03655252020007695, + "flos": 583224551424.0, + "grad_norm": 0.10338446511893212, + "language_loss": 1.03747463, + "learning_rate": 0.0009998878044420072, + "loss": 1.04965115, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 0.21130371, + "step": 190, + "time_per_iteration": 2.6978025436401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177731, + "balance_loss_mlp": 1.15573716, + "epoch": 0.03674490188534051, + "flos": 471619957248.0, + "grad_norm": 0.06881722524262912, + "language_loss": 0.99768066, + "learning_rate": 0.0009998811079137318, + "loss": 1.00945807, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 0.22009277, + "step": 191, + "time_per_iteration": 2.5934321880340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114348, + "balance_loss_mlp": 1.12218916, + "epoch": 0.03693728357060408, + "flos": 528372274176.0, + "grad_norm": 0.0852793637050772, + "language_loss": 1.0086391, + "learning_rate": 0.0009998742173143987, + "loss": 1.02007401, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 0.2130127, + "step": 192, + "time_per_iteration": 2.6706249713897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139307, + "balance_loss_mlp": 1.1180048, + "epoch": 0.03712966525586764, + "flos": 798993994752.0, + "grad_norm": 0.07456835679934387, + "language_loss": 1.01398337, + "learning_rate": 0.0009998671326466833, + "loss": 1.02537644, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 0.21313477, + "step": 193, + "time_per_iteration": 2.992595672607422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126772, + "balance_loss_mlp": 1.10519516, + "epoch": 0.037322046941131205, + "flos": 829973164032.0, + "grad_norm": 0.08171257283174432, + "language_loss": 1.02813613, + "learning_rate": 0.0009998598539133362, + "loss": 1.03940392, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 0.21594238, + "step": 194, + "time_per_iteration": 3.0081543922424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113199, + "balance_loss_mlp": 1.11179638, + "epoch": 0.037514428626394765, + "flos": 437685742080.0, + "grad_norm": 0.05573112518601677, + "language_loss": 1.02892375, + "learning_rate": 0.0009998523811171828, + "loss": 1.04024363, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 0.2019043, + "step": 195, + "time_per_iteration": 2.507708787918091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149122, + "balance_loss_mlp": 1.12843966, + "epoch": 0.03770681031165833, + "flos": 511625051136.0, + "grad_norm": 0.0935188115694547, + "language_loss": 1.0387187, + "learning_rate": 0.0009998447142611248, + "loss": 1.05020976, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 0.20690918, + "step": 196, + "time_per_iteration": 2.6388566493988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160139, + "balance_loss_mlp": 1.13986123, + "epoch": 0.03789919199692189, + "flos": 807449098752.0, + "grad_norm": 0.047444937864230444, + "language_loss": 0.96302813, + "learning_rate": 0.0009998368533481387, + "loss": 0.97462952, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 0.20275879, + "step": 197, + "time_per_iteration": 3.033572196960449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132809, + "balance_loss_mlp": 1.11254394, + "epoch": 0.03809157368218546, + "flos": 690576814080.0, + "grad_norm": 0.08710369828361038, + "language_loss": 0.9995833, + "learning_rate": 0.0009998287983812762, + "loss": 1.01091146, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 0.20263672, + "step": 198, + "time_per_iteration": 2.8421950340270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155397, + "balance_loss_mlp": 1.13373709, + "epoch": 0.03828395536744902, + "flos": 517940407296.0, + "grad_norm": 0.10277508525357126, + "language_loss": 1.05776644, + "learning_rate": 0.0009998205493636646, + "loss": 1.06932044, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 0.2166748, + "step": 199, + "time_per_iteration": 2.6924569606781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141939, + "balance_loss_mlp": 1.12035084, + "epoch": 0.038476337052712584, + "flos": 581662964736.0, + "grad_norm": 0.09429923895154278, + "language_loss": 0.98451054, + "learning_rate": 0.0009998121062985063, + "loss": 0.99592984, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 0.21594238, + "step": 200, + "time_per_iteration": 2.6926732063293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171328, + "balance_loss_mlp": 1.15014482, + "epoch": 0.03866871873797614, + "flos": 577086861312.0, + "grad_norm": 0.08332681767957313, + "language_loss": 1.00419915, + "learning_rate": 0.0009998034691890794, + "loss": 1.01591253, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 0.21203613, + "step": 201, + "time_per_iteration": 2.7643332481384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165409, + "balance_loss_mlp": 1.14516699, + "epoch": 0.03886110042323971, + "flos": 540731344896.0, + "grad_norm": 0.11326578301102472, + "language_loss": 1.05536067, + "learning_rate": 0.0009997946380387369, + "loss": 1.06701469, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 0.20251465, + "step": 202, + "time_per_iteration": 2.630284070968628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157571, + "balance_loss_mlp": 1.13723421, + "epoch": 0.03905348210850327, + "flos": 718002952704.0, + "grad_norm": 0.09790094078320352, + "language_loss": 1.07388449, + "learning_rate": 0.0009997856128509076, + "loss": 1.08546019, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 0.20336914, + "step": 203, + "time_per_iteration": 2.8435540199279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144349, + "balance_loss_mlp": 1.12458408, + "epoch": 0.039245863793766836, + "flos": 427493583360.0, + "grad_norm": 0.1356659453961297, + "language_loss": 1.02559984, + "learning_rate": 0.0009997763936290952, + "loss": 1.03704333, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 0.19750977, + "step": 204, + "time_per_iteration": 2.503309965133667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138207, + "balance_loss_mlp": 1.11642766, + "epoch": 0.039438245479030395, + "flos": 663096347136.0, + "grad_norm": 0.053010676996176516, + "language_loss": 1.07603145, + "learning_rate": 0.0009997669803768789, + "loss": 1.08741355, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 0.21789551, + "step": 205, + "time_per_iteration": 2.7773749828338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011146, + "balance_loss_mlp": 1.09366679, + "epoch": 0.03963062716429396, + "flos": 635349007872.0, + "grad_norm": 0.07785432610828748, + "language_loss": 1.0289582, + "learning_rate": 0.0009997573730979134, + "loss": 1.04010415, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 0.20947266, + "step": 206, + "time_per_iteration": 2.7241222858428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04720912, + "balance_loss_mlp": 3.71993518, + "epoch": 0.03982300884955752, + "flos": 1418565975552.0, + "grad_norm": 0.31672297251450016, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.83914113, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 10.0, + "step": 207, + "time_per_iteration": 4.65311074256897 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160001, + "balance_loss_mlp": 1.13651657, + "epoch": 0.04001539053482109, + "flos": 689118741504.0, + "grad_norm": 0.09244016287770654, + "language_loss": 1.01599813, + "learning_rate": 0.0009997375764747294, + "loss": 1.02759814, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 0.23449707, + "step": 208, + "time_per_iteration": 2.999249219894409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144547, + "balance_loss_mlp": 1.12159967, + "epoch": 0.04020777222008465, + "flos": 533639964672.0, + "grad_norm": 0.10768555369795524, + "language_loss": 0.98886019, + "learning_rate": 0.0009997273871381967, + "loss": 1.00030565, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 0.22949219, + "step": 209, + "time_per_iteration": 2.740895986557007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154635, + "balance_loss_mlp": 1.13075733, + "epoch": 0.040400153905348214, + "flos": 567927687168.0, + "grad_norm": 0.0670178022721504, + "language_loss": 1.03911638, + "learning_rate": 0.0009997170037902862, + "loss": 1.05066276, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 0.23876953, + "step": 210, + "time_per_iteration": 2.7199809551239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161677, + "balance_loss_mlp": 1.13826418, + "epoch": 0.040592535590611774, + "flos": 713439332352.0, + "grad_norm": 0.062356382061819024, + "language_loss": 1.06535935, + "learning_rate": 0.0009997064264350292, + "loss": 1.07697606, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 0.23413086, + "step": 211, + "time_per_iteration": 2.85477614402771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164794, + "balance_loss_mlp": 1.14111865, + "epoch": 0.04078491727587533, + "flos": 578100022272.0, + "grad_norm": 0.11782714892356931, + "language_loss": 1.00570273, + "learning_rate": 0.0009996956550765317, + "loss": 1.01735067, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 0.23657227, + "step": 212, + "time_per_iteration": 2.683258295059204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178964, + "balance_loss_mlp": 1.15452623, + "epoch": 0.0409772989611389, + "flos": 552299710464.0, + "grad_norm": 0.07352585681220185, + "language_loss": 0.95357072, + "learning_rate": 0.0009996846897189762, + "loss": 0.9653604, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 0.24438477, + "step": 213, + "time_per_iteration": 2.64486026763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171157, + "balance_loss_mlp": 1.14665973, + "epoch": 0.04116968064640246, + "flos": 555630285312.0, + "grad_norm": 0.06101080420793073, + "language_loss": 1.01569629, + "learning_rate": 0.0009996735303666193, + "loss": 1.02740788, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 0.24499512, + "step": 214, + "time_per_iteration": 2.719754934310913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189275, + "balance_loss_mlp": 1.16434813, + "epoch": 0.041362062331666026, + "flos": 578492803584.0, + "grad_norm": 0.09805160088916984, + "language_loss": 1.03784573, + "learning_rate": 0.0009996621770237937, + "loss": 1.04973853, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 0.24938965, + "step": 215, + "time_per_iteration": 2.7283520698547363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202725, + "balance_loss_mlp": 1.17728579, + "epoch": 0.041554444016929586, + "flos": 611443593216.0, + "grad_norm": 0.05858333324383458, + "language_loss": 0.99328029, + "learning_rate": 0.0009996506296949073, + "loss": 1.00530756, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 0.25463867, + "step": 216, + "time_per_iteration": 2.8774044513702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175124, + "balance_loss_mlp": 1.14957714, + "epoch": 0.04174682570219315, + "flos": 528115313664.0, + "grad_norm": 0.09898600739692984, + "language_loss": 0.99386859, + "learning_rate": 0.0009996388883844428, + "loss": 1.00561976, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 0.25561523, + "step": 217, + "time_per_iteration": 2.5985324382781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155134, + "balance_loss_mlp": 1.13007665, + "epoch": 0.04193920738745671, + "flos": 511506482688.0, + "grad_norm": 0.06208913439552352, + "language_loss": 1.03500867, + "learning_rate": 0.0009996269530969588, + "loss": 1.04656017, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 0.25048828, + "step": 218, + "time_per_iteration": 2.591993808746338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152332, + "balance_loss_mlp": 1.12778735, + "epoch": 0.04213158907272028, + "flos": 571490629632.0, + "grad_norm": 0.08789931910276294, + "language_loss": 1.02762055, + "learning_rate": 0.0009996148238370888, + "loss": 1.0391438, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 0.24536133, + "step": 219, + "time_per_iteration": 2.7247660160064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146753, + "balance_loss_mlp": 1.12125421, + "epoch": 0.04232397075798384, + "flos": 964222589952.0, + "grad_norm": 0.059765696203788965, + "language_loss": 0.98427057, + "learning_rate": 0.0009996025006095421, + "loss": 0.99573809, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 0.25524902, + "step": 220, + "time_per_iteration": 3.314250946044922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04012538, + "balance_loss_mlp": 3.61886096, + "epoch": 0.042516352443247404, + "flos": 1469595778560.0, + "grad_norm": 0.18322335632445477, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.81795681, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 3.921875, + "step": 221, + "time_per_iteration": 5.397853851318359 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142792, + "balance_loss_mlp": 1.11779404, + "epoch": 0.042708734128510964, + "flos": 654712823808.0, + "grad_norm": 0.10045289138425088, + "language_loss": 0.98726314, + "learning_rate": 0.0009995772722706307, + "loss": 0.99869102, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 0.25, + "step": 222, + "time_per_iteration": 2.8346786499023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168149, + "balance_loss_mlp": 1.14130318, + "epoch": 0.04290111581377453, + "flos": 431827407360.0, + "grad_norm": 0.07395583213906755, + "language_loss": 1.12709904, + "learning_rate": 0.0009995643671690604, + "loss": 1.13878047, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 0.26879883, + "step": 223, + "time_per_iteration": 2.4760169982910156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157966, + "balance_loss_mlp": 1.1317513, + "epoch": 0.04309349749903809, + "flos": 644676309504.0, + "grad_norm": 0.08239055528326475, + "language_loss": 1.00208497, + "learning_rate": 0.0009995512681194023, + "loss": 1.01366448, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 0.26257324, + "step": 224, + "time_per_iteration": 2.833751916885376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151672, + "balance_loss_mlp": 1.12492132, + "epoch": 0.04328587918430166, + "flos": 831267505152.0, + "grad_norm": 0.058356102807926864, + "language_loss": 0.97854793, + "learning_rate": 0.0009995379751267417, + "loss": 0.99006462, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 0.2677002, + "step": 225, + "time_per_iteration": 3.295761823654175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182559, + "balance_loss_mlp": 1.1551652, + "epoch": 0.043478260869565216, + "flos": 525066292224.0, + "grad_norm": 0.09032086206875983, + "language_loss": 0.99067688, + "learning_rate": 0.0009995244881962398, + "loss": 1.00250244, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 0.27416992, + "step": 226, + "time_per_iteration": 2.6147754192352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162924, + "balance_loss_mlp": 1.1352675, + "epoch": 0.04367064255482878, + "flos": 439484465664.0, + "grad_norm": 0.05273235380658081, + "language_loss": 1.00220668, + "learning_rate": 0.0009995108073331323, + "loss": 1.01383591, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 0.27661133, + "step": 227, + "time_per_iteration": 2.575477361679077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165107, + "balance_loss_mlp": 1.13835633, + "epoch": 0.04386302424009234, + "flos": 507380060160.0, + "grad_norm": 0.07222661628022838, + "language_loss": 1.03328192, + "learning_rate": 0.0009994969325427309, + "loss": 1.04493296, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 0.26733398, + "step": 228, + "time_per_iteration": 2.7351901531219482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159475, + "balance_loss_mlp": 1.13215184, + "epoch": 0.04405540592535591, + "flos": 540694268928.0, + "grad_norm": 0.05690950477809338, + "language_loss": 0.99788582, + "learning_rate": 0.0009994828638304218, + "loss": 1.0094806, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 0.2734375, + "step": 229, + "time_per_iteration": 2.6617660522460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160216, + "balance_loss_mlp": 1.13327467, + "epoch": 0.04424778761061947, + "flos": 446370642432.0, + "grad_norm": 0.0671245201901001, + "language_loss": 1.05080867, + "learning_rate": 0.0009994686012016675, + "loss": 1.06241083, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 0.26953125, + "step": 230, + "time_per_iteration": 2.5507686138153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200075, + "balance_loss_mlp": 1.17368245, + "epoch": 0.044440169295883035, + "flos": 700702161408.0, + "grad_norm": 0.08083200993131012, + "language_loss": 1.04836714, + "learning_rate": 0.000999454144662005, + "loss": 1.06036782, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 0.26416016, + "step": 231, + "time_per_iteration": 2.872386932373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177085, + "balance_loss_mlp": 1.15090632, + "epoch": 0.044632550981146595, + "flos": 588329256960.0, + "grad_norm": 0.06521500069668446, + "language_loss": 0.98697901, + "learning_rate": 0.0009994394942170468, + "loss": 0.99874985, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 0.26208496, + "step": 232, + "time_per_iteration": 2.6734542846679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160129, + "balance_loss_mlp": 1.13452244, + "epoch": 0.04482493266641016, + "flos": 554797734912.0, + "grad_norm": 0.06848368332912834, + "language_loss": 0.96340638, + "learning_rate": 0.0009994246498724808, + "loss": 0.97500765, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 0.25598145, + "step": 233, + "time_per_iteration": 2.735145330429077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169344, + "balance_loss_mlp": 1.14341569, + "epoch": 0.04501731435167372, + "flos": 722813621760.0, + "grad_norm": 0.09664881582101635, + "language_loss": 0.99309772, + "learning_rate": 0.00099940961163407, + "loss": 1.00479114, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 0.25964355, + "step": 234, + "time_per_iteration": 2.8988683223724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142226, + "balance_loss_mlp": 1.11722803, + "epoch": 0.04520969603693728, + "flos": 511790607360.0, + "grad_norm": 0.06003753756121682, + "language_loss": 1.01686716, + "learning_rate": 0.0009993943795076528, + "loss": 1.02828944, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 0.25012207, + "step": 235, + "time_per_iteration": 2.6333067417144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132836, + "balance_loss_mlp": 1.10618043, + "epoch": 0.04540207772220085, + "flos": 365058399744.0, + "grad_norm": 0.08170413586498586, + "language_loss": 1.0374043, + "learning_rate": 0.0009993789534991427, + "loss": 1.04873264, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 0.26708984, + "step": 236, + "time_per_iteration": 2.4350106716156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112254, + "balance_loss_mlp": 1.0960753, + "epoch": 0.045594459407464406, + "flos": 522669583872.0, + "grad_norm": 0.0440176634981383, + "language_loss": 0.99063611, + "learning_rate": 0.0009993633336145287, + "loss": 1.00186157, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 0.26513672, + "step": 237, + "time_per_iteration": 2.6414294242858887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134799, + "balance_loss_mlp": 1.10904956, + "epoch": 0.04578684109272797, + "flos": 671776104960.0, + "grad_norm": 0.04213473561248219, + "language_loss": 1.02718055, + "learning_rate": 0.0009993475198598752, + "loss": 1.03852856, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 0.25756836, + "step": 238, + "time_per_iteration": 2.9781904220581055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152995, + "balance_loss_mlp": 1.12614954, + "epoch": 0.04597922277799153, + "flos": 541633277952.0, + "grad_norm": 0.08613106589232603, + "language_loss": 1.00055635, + "learning_rate": 0.0009993315122413212, + "loss": 1.01208627, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 0.26879883, + "step": 239, + "time_per_iteration": 2.6395275592803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162384, + "balance_loss_mlp": 1.13594294, + "epoch": 0.0461716044632551, + "flos": 458984102400.0, + "grad_norm": 0.06839694959482054, + "language_loss": 0.99973977, + "learning_rate": 0.0009993153107650818, + "loss": 1.01136363, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 0.2644043, + "step": 240, + "time_per_iteration": 2.563133716583252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160824, + "balance_loss_mlp": 1.13391829, + "epoch": 0.04636398614851866, + "flos": 455240922624.0, + "grad_norm": 0.06471449859153773, + "language_loss": 0.98970807, + "learning_rate": 0.0009992989154374468, + "loss": 1.00131631, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 0.26928711, + "step": 241, + "time_per_iteration": 2.5339503288269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145463, + "balance_loss_mlp": 1.11914206, + "epoch": 0.046556367833782225, + "flos": 556826254848.0, + "grad_norm": 0.06957696695924716, + "language_loss": 1.05868769, + "learning_rate": 0.0009992823262647817, + "loss": 1.07014227, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 0.26342773, + "step": 242, + "time_per_iteration": 2.6841883659362793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111302, + "balance_loss_mlp": 1.08692503, + "epoch": 0.046748749519045785, + "flos": 592917470208.0, + "grad_norm": 0.0649477492764712, + "language_loss": 0.99848783, + "learning_rate": 0.0009992655432535264, + "loss": 1.00961804, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 0.2611084, + "step": 243, + "time_per_iteration": 2.7613234519958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107198, + "balance_loss_mlp": 1.08162785, + "epoch": 0.04694113120430935, + "flos": 569864802816.0, + "grad_norm": 0.05612685480258275, + "language_loss": 1.00329947, + "learning_rate": 0.0009992485664101973, + "loss": 1.01437151, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 0.25598145, + "step": 244, + "time_per_iteration": 2.717280387878418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122363, + "balance_loss_mlp": 1.09556472, + "epoch": 0.04713351288957291, + "flos": 863768987136.0, + "grad_norm": 0.10316769075352135, + "language_loss": 1.02662849, + "learning_rate": 0.000999231395741385, + "loss": 1.03785205, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 0.26831055, + "step": 245, + "time_per_iteration": 3.095249891281128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144012, + "balance_loss_mlp": 1.11837006, + "epoch": 0.04732589457483648, + "flos": 537215390208.0, + "grad_norm": 0.09647975042234339, + "language_loss": 1.01015186, + "learning_rate": 0.0009992140312537557, + "loss": 1.02159202, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 0.25671387, + "step": 246, + "time_per_iteration": 2.633258819580078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123665, + "balance_loss_mlp": 1.09845233, + "epoch": 0.04751827626010004, + "flos": 761906870784.0, + "grad_norm": 0.09798218580430706, + "language_loss": 0.95550418, + "learning_rate": 0.000999196472954051, + "loss": 0.96674085, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 0.25231934, + "step": 247, + "time_per_iteration": 3.024939775466919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02466762, + "balance_loss_mlp": 2.43700695, + "epoch": 0.0477106579453636, + "flos": 1579791859200.0, + "grad_norm": 0.2831653982047738, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.81891614, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 0.296875, + "step": 248, + "time_per_iteration": 5.486468076705933 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162703, + "balance_loss_mlp": 1.13626289, + "epoch": 0.04790303963062716, + "flos": 457766111232.0, + "grad_norm": 0.12969478117477343, + "language_loss": 1.03178453, + "learning_rate": 0.0009991607749457578, + "loss": 1.04341149, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 0.26464844, + "step": 249, + "time_per_iteration": 2.5253713130950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119774, + "balance_loss_mlp": 1.16941571, + "epoch": 0.04809542131589073, + "flos": 782419668480.0, + "grad_norm": 0.09425507858465235, + "language_loss": 1.01008546, + "learning_rate": 0.0009991426352510286, + "loss": 1.0220629, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 0.28295898, + "step": 250, + "time_per_iteration": 3.0042202472686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204128, + "balance_loss_mlp": 1.174016, + "epoch": 0.04828780300115429, + "flos": 559260039168.0, + "grad_norm": 0.07677732337183582, + "language_loss": 1.0282234, + "learning_rate": 0.0009991243017719422, + "loss": 1.04026473, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 0.30126953, + "step": 251, + "time_per_iteration": 2.709934711456299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206766, + "balance_loss_mlp": 1.17522311, + "epoch": 0.048480184686417856, + "flos": 501929561088.0, + "grad_norm": 0.1103729500964747, + "language_loss": 0.97436613, + "learning_rate": 0.0009991057745156165, + "loss": 0.9864338, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 0.31518555, + "step": 252, + "time_per_iteration": 2.5961716175079346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03348202, + "balance_loss_mlp": 3.30471396, + "epoch": 0.048672566371681415, + "flos": 1536360016896.0, + "grad_norm": 0.3811060337507454, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.85259187, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 0.43554688, + "step": 253, + "time_per_iteration": 5.0377867221832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195312, + "balance_loss_mlp": 1.1623621, + "epoch": 0.04886494805694498, + "flos": 537922031616.0, + "grad_norm": 0.07473951959737497, + "language_loss": 1.05491519, + "learning_rate": 0.0009990681387000943, + "loss": 1.06686831, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 0.3293457, + "step": 254, + "time_per_iteration": 2.7937283515930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121698, + "balance_loss_mlp": 1.18345821, + "epoch": 0.04905732974220854, + "flos": 680169540096.0, + "grad_norm": 0.06898181212790383, + "language_loss": 1.01063621, + "learning_rate": 0.0009990490301555093, + "loss": 1.02280605, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 0.33544922, + "step": 255, + "time_per_iteration": 2.9615726470947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.05252755, + "balance_loss_mlp": 5.12458086, + "epoch": 0.04924971142747211, + "flos": 1421179997184.0, + "grad_norm": 0.5609302024280507, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.84467912, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 1.28125, + "step": 256, + "time_per_iteration": 4.8413920402526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03162439, + "balance_loss_mlp": 3.09758925, + "epoch": 0.04944209311273567, + "flos": 1558006742016.0, + "grad_norm": 0.1723793408951341, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.8240518, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 0.6484375, + "step": 257, + "time_per_iteration": 4.985513687133789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03630928, + "balance_loss_mlp": 3.55844903, + "epoch": 0.04963447479799923, + "flos": 1570820262912.0, + "grad_norm": 0.4079591987734508, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.73606813, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 0.7265625, + "step": 258, + "time_per_iteration": 4.858096361160278 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01403117, + "balance_loss_mlp": 1.35569584, + "epoch": 0.049826856483262794, + "flos": 625349569536.0, + "grad_norm": 0.11330256318865821, + "language_loss": 0.95339322, + "learning_rate": 0.0009989706585723202, + "loss": 0.96742439, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 0.47436523, + "step": 259, + "time_per_iteration": 2.794419765472412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01437412, + "balance_loss_mlp": 1.38651013, + "epoch": 0.05001923816852635, + "flos": 504160713216.0, + "grad_norm": 0.10381773722922016, + "language_loss": 1.0219605, + "learning_rate": 0.0009989505813633442, + "loss": 1.03633475, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 0.50927734, + "step": 260, + "time_per_iteration": 2.6660099029541016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145174, + "balance_loss_mlp": 1.39776254, + "epoch": 0.05021161985378992, + "flos": 587345831424.0, + "grad_norm": 0.12909552841436595, + "language_loss": 1.02080631, + "learning_rate": 0.000998930310444573, + "loss": 1.03532374, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 0.5402832, + "step": 261, + "time_per_iteration": 2.7547266483306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01429363, + "balance_loss_mlp": 1.37698281, + "epoch": 0.05040400153905348, + "flos": 633303235584.0, + "grad_norm": 0.08616818959721087, + "language_loss": 0.99936116, + "learning_rate": 0.0009989098458238765, + "loss": 1.01365471, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 0.52441406, + "step": 262, + "time_per_iteration": 2.804656982421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01431577, + "balance_loss_mlp": 1.38310647, + "epoch": 0.050596383224317046, + "flos": 553636270080.0, + "grad_norm": 0.10103635045761167, + "language_loss": 0.99213421, + "learning_rate": 0.0009988891875091998, + "loss": 1.00644994, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 0.48486328, + "step": 263, + "time_per_iteration": 2.780696392059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01359367, + "balance_loss_mlp": 1.31771505, + "epoch": 0.050788764909580605, + "flos": 549663293952.0, + "grad_norm": 0.09437475228894394, + "language_loss": 0.93793595, + "learning_rate": 0.0009988683355085636, + "loss": 0.95152962, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 0.41625977, + "step": 264, + "time_per_iteration": 2.758275032043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01314446, + "balance_loss_mlp": 1.27684712, + "epoch": 0.05098114659484417, + "flos": 605118325248.0, + "grad_norm": 0.09784246378207673, + "language_loss": 1.02612829, + "learning_rate": 0.000998847289830063, + "loss": 1.03927279, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 0.37524414, + "step": 265, + "time_per_iteration": 2.8752288818359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01289086, + "balance_loss_mlp": 1.25468266, + "epoch": 0.05117352828010773, + "flos": 438548027904.0, + "grad_norm": 0.06973466471853282, + "language_loss": 0.95293748, + "learning_rate": 0.0009988260504818682, + "loss": 0.9658283, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 0.34423828, + "step": 266, + "time_per_iteration": 2.5960564613342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01290407, + "balance_loss_mlp": 1.2563374, + "epoch": 0.0513659099653713, + "flos": 505032910848.0, + "grad_norm": 0.0971565340820806, + "language_loss": 1.02148294, + "learning_rate": 0.000998804617472226, + "loss": 1.03438699, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 0.34082031, + "step": 267, + "time_per_iteration": 2.658709764480591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275377, + "balance_loss_mlp": 1.24085402, + "epoch": 0.05155829165063486, + "flos": 695488799232.0, + "grad_norm": 0.10761719469623075, + "language_loss": 0.96939588, + "learning_rate": 0.0009987829908094568, + "loss": 0.98214972, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 0.34545898, + "step": 268, + "time_per_iteration": 2.8270740509033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01271333, + "balance_loss_mlp": 1.23785877, + "epoch": 0.051750673335898424, + "flos": 1348260111360.0, + "grad_norm": 0.1226169977774822, + "language_loss": 1.04002702, + "learning_rate": 0.0009987611705019569, + "loss": 1.05274034, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 0.33496094, + "step": 269, + "time_per_iteration": 4.483954429626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01277218, + "balance_loss_mlp": 1.24267149, + "epoch": 0.051943055021161984, + "flos": 489607566336.0, + "grad_norm": 0.07374197309260985, + "language_loss": 1.02401245, + "learning_rate": 0.0009987391565581978, + "loss": 1.03678453, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 0.34594727, + "step": 270, + "time_per_iteration": 2.627356767654419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01304636, + "balance_loss_mlp": 1.26977956, + "epoch": 0.05213543670642555, + "flos": 545779150848.0, + "grad_norm": 0.06923057034816653, + "language_loss": 0.94496262, + "learning_rate": 0.000998716948986726, + "loss": 0.95800889, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 0.34887695, + "step": 271, + "time_per_iteration": 2.804185628890991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01322736, + "balance_loss_mlp": 1.28718746, + "epoch": 0.05232781839168911, + "flos": 603561881088.0, + "grad_norm": 0.1173780328671846, + "language_loss": 0.97372609, + "learning_rate": 0.0009986945477961633, + "loss": 0.9869535, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 0.35571289, + "step": 272, + "time_per_iteration": 2.739595890045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01297409, + "balance_loss_mlp": 1.2620039, + "epoch": 0.052520200076952676, + "flos": 538504962048.0, + "grad_norm": 0.07261359465506025, + "language_loss": 1.02136993, + "learning_rate": 0.0009986719529952066, + "loss": 1.03434396, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 0.35424805, + "step": 273, + "time_per_iteration": 2.8717877864837646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239184, + "balance_loss_mlp": 1.20389819, + "epoch": 0.052712581762216236, + "flos": 463384737792.0, + "grad_norm": 0.13624684616705834, + "language_loss": 1.01736569, + "learning_rate": 0.000998649164592628, + "loss": 1.0297575, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 0.35327148, + "step": 274, + "time_per_iteration": 2.590993642807007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206885, + "balance_loss_mlp": 1.16945291, + "epoch": 0.0529049634474798, + "flos": 548020214784.0, + "grad_norm": 0.061304815826305474, + "language_loss": 0.99439085, + "learning_rate": 0.0009986261825972748, + "loss": 1.00645971, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 0.37426758, + "step": 275, + "time_per_iteration": 2.702202081680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183431, + "balance_loss_mlp": 1.14466429, + "epoch": 0.05309734513274336, + "flos": 618021052416.0, + "grad_norm": 0.10486338408500256, + "language_loss": 1.01433325, + "learning_rate": 0.000998603007018069, + "loss": 1.02616751, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 0.38745117, + "step": 276, + "time_per_iteration": 2.876267671585083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190326, + "balance_loss_mlp": 1.15055728, + "epoch": 0.05328972681800693, + "flos": 605498996736.0, + "grad_norm": 0.08719890934761923, + "language_loss": 0.99445826, + "learning_rate": 0.0009985796378640089, + "loss": 1.00636148, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 0.39746094, + "step": 277, + "time_per_iteration": 2.74886155128479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165278, + "balance_loss_mlp": 1.12720275, + "epoch": 0.05348210850327049, + "flos": 604503088128.0, + "grad_norm": 0.06292174667602014, + "language_loss": 0.99806106, + "learning_rate": 0.0009985560751441665, + "loss": 1.00971389, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 0.38061523, + "step": 278, + "time_per_iteration": 2.8894753456115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175743, + "balance_loss_mlp": 1.13790607, + "epoch": 0.053674490188534055, + "flos": 630782816256.0, + "grad_norm": 0.06329003141341145, + "language_loss": 1.01538157, + "learning_rate": 0.00099853231886769, + "loss": 1.02713895, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 0.37792969, + "step": 279, + "time_per_iteration": 2.783085823059082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183406, + "balance_loss_mlp": 1.14633179, + "epoch": 0.053866871873797614, + "flos": 479185611264.0, + "grad_norm": 0.06545769746199957, + "language_loss": 1.01316965, + "learning_rate": 0.0009985083690438024, + "loss": 1.02500367, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 0.37084961, + "step": 280, + "time_per_iteration": 2.707329511642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147788, + "balance_loss_mlp": 1.11245418, + "epoch": 0.054059253559061174, + "flos": 788035723776.0, + "grad_norm": 0.05305898567294309, + "language_loss": 0.9175781, + "learning_rate": 0.0009984842256818016, + "loss": 0.92905599, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 0.35400391, + "step": 281, + "time_per_iteration": 3.1014201641082764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165943, + "balance_loss_mlp": 1.13106215, + "epoch": 0.05425163524432474, + "flos": 628361515008.0, + "grad_norm": 0.05782684737590577, + "language_loss": 1.02446878, + "learning_rate": 0.0009984598887910613, + "loss": 1.03612816, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 0.34912109, + "step": 282, + "time_per_iteration": 2.75343656539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180579, + "balance_loss_mlp": 1.14555514, + "epoch": 0.0544440169295883, + "flos": 615760164864.0, + "grad_norm": 0.0631633618899466, + "language_loss": 0.98333299, + "learning_rate": 0.0009984353583810297, + "loss": 0.99513876, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 0.3503418, + "step": 283, + "time_per_iteration": 2.8092565536499023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186173, + "balance_loss_mlp": 1.15350997, + "epoch": 0.05463639861485187, + "flos": 647762406912.0, + "grad_norm": 0.0821933313576245, + "language_loss": 1.00416183, + "learning_rate": 0.0009984106344612302, + "loss": 1.01602352, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 0.32666016, + "step": 284, + "time_per_iteration": 2.7632908821105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163515, + "balance_loss_mlp": 1.1310904, + "epoch": 0.054828780300115426, + "flos": 797192699904.0, + "grad_norm": 0.06349155766627652, + "language_loss": 0.95740765, + "learning_rate": 0.0009983857170412615, + "loss": 0.96904278, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 0.32421875, + "step": 285, + "time_per_iteration": 2.9946134090423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130334, + "balance_loss_mlp": 1.09912539, + "epoch": 0.05502116198537899, + "flos": 549690458112.0, + "grad_norm": 0.0487694941790178, + "language_loss": 0.95326382, + "learning_rate": 0.000998360606130798, + "loss": 0.96456718, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 0.31176758, + "step": 286, + "time_per_iteration": 2.8205370903015137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.09512836, + "balance_loss_mlp": 7.26674223, + "epoch": 0.05521354367064255, + "flos": 1407753437184.0, + "grad_norm": 0.42812971022266805, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.78585953, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 22.5, + "step": 287, + "time_per_iteration": 4.986966848373413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173658, + "balance_loss_mlp": 1.14278328, + "epoch": 0.05540592535590612, + "flos": 645420026880.0, + "grad_norm": 0.08917023960137904, + "language_loss": 1.01027536, + "learning_rate": 0.0009983098038774552, + "loss": 1.02201188, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 0.30834961, + "step": 288, + "time_per_iteration": 2.8100168704986572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.06110836, + "balance_loss_mlp": 5.25634384, + "epoch": 0.05559830704116968, + "flos": 1511095647744.0, + "grad_norm": 0.4031517895181362, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.84281063, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 8.5625, + "step": 289, + "time_per_iteration": 4.790200233459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126699, + "balance_loss_mlp": 1.23435044, + "epoch": 0.055790688726433245, + "flos": 508328980992.0, + "grad_norm": 0.18275347501036113, + "language_loss": 0.9955281, + "learning_rate": 0.0009982582277800948, + "loss": 1.00819802, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 0.32641602, + "step": 290, + "time_per_iteration": 2.5976333618164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281728, + "balance_loss_mlp": 1.24694288, + "epoch": 0.055983070411696804, + "flos": 657870501888.0, + "grad_norm": 0.14603269886404707, + "language_loss": 1.06751418, + "learning_rate": 0.0009982321495648908, + "loss": 1.08033144, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 0.34838867, + "step": 291, + "time_per_iteration": 2.8513312339782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250537, + "balance_loss_mlp": 1.21348643, + "epoch": 0.05617545209696037, + "flos": 587335919616.0, + "grad_norm": 0.09283742859778188, + "language_loss": 0.97403693, + "learning_rate": 0.0009982058779188115, + "loss": 0.98654234, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 0.37011719, + "step": 292, + "time_per_iteration": 2.728203773498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230786, + "balance_loss_mlp": 1.19170928, + "epoch": 0.05636783378222393, + "flos": 611621632512.0, + "grad_norm": 0.08826519450204054, + "language_loss": 1.05705655, + "learning_rate": 0.0009981794128520567, + "loss": 1.06936455, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 0.39038086, + "step": 293, + "time_per_iteration": 2.79616379737854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01253904, + "balance_loss_mlp": 1.21258569, + "epoch": 0.0565602154674875, + "flos": 668161405440.0, + "grad_norm": 0.08065602932127632, + "language_loss": 1.01724029, + "learning_rate": 0.000998152754374901, + "loss": 1.02977943, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 0.41333008, + "step": 294, + "time_per_iteration": 2.9352946281433105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232141, + "balance_loss_mlp": 1.19132411, + "epoch": 0.05675259715275106, + "flos": 617242830336.0, + "grad_norm": 0.07309017642696977, + "language_loss": 0.9826439, + "learning_rate": 0.0009981259024976943, + "loss": 0.99496531, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 0.40820312, + "step": 295, + "time_per_iteration": 2.7376105785369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244019, + "balance_loss_mlp": 1.20112753, + "epoch": 0.05694497883801462, + "flos": 751769040384.0, + "grad_norm": 0.07769478500482971, + "language_loss": 0.96765345, + "learning_rate": 0.0009980988572308612, + "loss": 0.9800936, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 0.42871094, + "step": 296, + "time_per_iteration": 3.001779556274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226011, + "balance_loss_mlp": 1.18197489, + "epoch": 0.05713736052327818, + "flos": 712010995200.0, + "grad_norm": 0.0588150430335769, + "language_loss": 0.99343681, + "learning_rate": 0.0009980716185849015, + "loss": 1.00569689, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 0.44067383, + "step": 297, + "time_per_iteration": 2.9817121028900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223805, + "balance_loss_mlp": 1.18153381, + "epoch": 0.05732974220854175, + "flos": 468976200192.0, + "grad_norm": 0.06400414638033543, + "language_loss": 0.95616293, + "learning_rate": 0.0009980441865703904, + "loss": 0.96840101, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 0.4230957, + "step": 298, + "time_per_iteration": 2.615875244140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122669, + "balance_loss_mlp": 1.18513405, + "epoch": 0.05752212389380531, + "flos": 601422133248.0, + "grad_norm": 0.09089975305964836, + "language_loss": 1.03662193, + "learning_rate": 0.000998016561197978, + "loss": 1.04888892, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 0.41577148, + "step": 299, + "time_per_iteration": 2.765833854675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219698, + "balance_loss_mlp": 1.17835617, + "epoch": 0.057714505579068875, + "flos": 678664852992.0, + "grad_norm": 0.05662219614280908, + "language_loss": 0.94978034, + "learning_rate": 0.0009979887424783895, + "loss": 0.96197736, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 0.41357422, + "step": 300, + "time_per_iteration": 2.8931760787963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122099, + "balance_loss_mlp": 1.17850339, + "epoch": 0.057906887264332435, + "flos": 595884999168.0, + "grad_norm": 0.05388706690809858, + "language_loss": 0.94851983, + "learning_rate": 0.0009979607304224248, + "loss": 0.96072972, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 0.42504883, + "step": 301, + "time_per_iteration": 2.719282388687134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213648, + "balance_loss_mlp": 1.16951644, + "epoch": 0.058099268949596, + "flos": 552116901888.0, + "grad_norm": 0.0564182452216587, + "language_loss": 1.02312028, + "learning_rate": 0.000997932525040959, + "loss": 1.03525686, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 0.44140625, + "step": 302, + "time_per_iteration": 2.7084572315216064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185834, + "balance_loss_mlp": 1.14165473, + "epoch": 0.05829165063485956, + "flos": 508170765312.0, + "grad_norm": 0.07525794393376325, + "language_loss": 1.04335976, + "learning_rate": 0.000997904126344943, + "loss": 1.05521822, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 0.44165039, + "step": 303, + "time_per_iteration": 2.6271631717681885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121438, + "balance_loss_mlp": 1.17055893, + "epoch": 0.05848403232012313, + "flos": 615231562752.0, + "grad_norm": 0.0664075129682053, + "language_loss": 1.00263453, + "learning_rate": 0.0009978755343454018, + "loss": 1.01477838, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 0.43823242, + "step": 304, + "time_per_iteration": 2.791146993637085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182664, + "balance_loss_mlp": 1.13869941, + "epoch": 0.05867641400538669, + "flos": 500083849728.0, + "grad_norm": 0.07350056034493838, + "language_loss": 1.01461756, + "learning_rate": 0.0009978467490534355, + "loss": 1.0264442, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 0.43969727, + "step": 305, + "time_per_iteration": 2.614455461502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186922, + "balance_loss_mlp": 1.14424467, + "epoch": 0.05886879569065025, + "flos": 531290244096.0, + "grad_norm": 0.056638515612222363, + "language_loss": 0.97774673, + "learning_rate": 0.00099781777048022, + "loss": 0.98961592, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 0.42700195, + "step": 306, + "time_per_iteration": 2.717700481414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011718, + "balance_loss_mlp": 1.12855101, + "epoch": 0.05906117737591381, + "flos": 489056569344.0, + "grad_norm": 0.056560878082468485, + "language_loss": 0.99827361, + "learning_rate": 0.0009977885986370057, + "loss": 1.00999165, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 0.43310547, + "step": 307, + "time_per_iteration": 2.557203531265259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164879, + "balance_loss_mlp": 1.12263095, + "epoch": 0.05925355906117737, + "flos": 591511527936.0, + "grad_norm": 0.05991229640473007, + "language_loss": 0.9525907, + "learning_rate": 0.000997759233535118, + "loss": 0.9642396, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 0.42285156, + "step": 308, + "time_per_iteration": 2.8033511638641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174986, + "balance_loss_mlp": 1.1345737, + "epoch": 0.05944594074644094, + "flos": 563655532032.0, + "grad_norm": 0.06710738832596337, + "language_loss": 1.01122141, + "learning_rate": 0.0009977296751859576, + "loss": 1.02297115, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 0.40405273, + "step": 309, + "time_per_iteration": 2.8259334564208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164837, + "balance_loss_mlp": 1.12487829, + "epoch": 0.0596383224317045, + "flos": 538747241472.0, + "grad_norm": 0.05223481097130428, + "language_loss": 1.03482628, + "learning_rate": 0.0009976999236009998, + "loss": 1.0464747, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 0.39941406, + "step": 310, + "time_per_iteration": 2.769092321395874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164403, + "balance_loss_mlp": 1.1263994, + "epoch": 0.059830704116968066, + "flos": 560957446656.0, + "grad_norm": 0.05685909644716586, + "language_loss": 1.04877043, + "learning_rate": 0.0009976699787917955, + "loss": 1.06041443, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 0.37963867, + "step": 311, + "time_per_iteration": 2.6526851654052734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08775091, + "balance_loss_mlp": 7.79852915, + "epoch": 0.060023085802231625, + "flos": 1570615059456.0, + "grad_norm": 0.2725707199289832, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.82218087, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 9.75, + "step": 312, + "time_per_iteration": 5.006884813308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161181, + "balance_loss_mlp": 1.12172294, + "epoch": 0.06021546748749519, + "flos": 482657149440.0, + "grad_norm": 0.06726838636277511, + "language_loss": 0.96427834, + "learning_rate": 0.0009976095095472243, + "loss": 0.97589004, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 0.39428711, + "step": 313, + "time_per_iteration": 2.5785915851593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166252, + "balance_loss_mlp": 1.12738967, + "epoch": 0.06040784917275875, + "flos": 620195304960.0, + "grad_norm": 0.0761643630364548, + "language_loss": 0.97957367, + "learning_rate": 0.0009975789851353334, + "loss": 0.99123621, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 0.38818359, + "step": 314, + "time_per_iteration": 2.814901828765869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173499, + "balance_loss_mlp": 1.13191843, + "epoch": 0.06060023085802232, + "flos": 483553939968.0, + "grad_norm": 0.07475166161853689, + "language_loss": 1.00319684, + "learning_rate": 0.0009975482675461487, + "loss": 1.0149318, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 0.41601562, + "step": 315, + "time_per_iteration": 2.65468692779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159286, + "balance_loss_mlp": 1.11591756, + "epoch": 0.06079261254328588, + "flos": 581892761088.0, + "grad_norm": 0.08252555003670439, + "language_loss": 0.98425788, + "learning_rate": 0.0009975173567915952, + "loss": 0.99585068, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 0.43383789, + "step": 316, + "time_per_iteration": 2.6916940212249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173408, + "balance_loss_mlp": 1.12767935, + "epoch": 0.060984994228549444, + "flos": 687794664960.0, + "grad_norm": 0.0640207679679256, + "language_loss": 0.91960573, + "learning_rate": 0.000997486252883674, + "loss": 0.93133986, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 0.45727539, + "step": 317, + "time_per_iteration": 2.8535635471343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188261, + "balance_loss_mlp": 1.13979006, + "epoch": 0.061177375913813004, + "flos": 1314775577088.0, + "grad_norm": 0.0671416603225842, + "language_loss": 0.97457695, + "learning_rate": 0.0009974549558344602, + "loss": 0.98645949, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 0.484375, + "step": 318, + "time_per_iteration": 3.6911113262176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189393, + "balance_loss_mlp": 1.14037383, + "epoch": 0.06136975759907657, + "flos": 574337018880.0, + "grad_norm": 0.09268216800999254, + "language_loss": 1.06808639, + "learning_rate": 0.000997423465656105, + "loss": 1.07998025, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 0.49023438, + "step": 319, + "time_per_iteration": 2.727130651473999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147465, + "balance_loss_mlp": 1.096205, + "epoch": 0.06156213928434013, + "flos": 527537152512.0, + "grad_norm": 0.06029287427116143, + "language_loss": 1.04509127, + "learning_rate": 0.0009973917823608335, + "loss": 1.05656588, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 0.51318359, + "step": 320, + "time_per_iteration": 2.654794454574585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148103, + "balance_loss_mlp": 1.09605646, + "epoch": 0.061754520969603696, + "flos": 495507746304.0, + "grad_norm": 0.03213952729051003, + "language_loss": 0.98612553, + "learning_rate": 0.0009973599059609462, + "loss": 0.99760658, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 0.52075195, + "step": 321, + "time_per_iteration": 2.7024786472320557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142475, + "balance_loss_mlp": 1.09133446, + "epoch": 0.061946902654867256, + "flos": 440079879168.0, + "grad_norm": 0.04984356389382333, + "language_loss": 0.97161096, + "learning_rate": 0.000997327836468819, + "loss": 0.9830358, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 0.51147461, + "step": 322, + "time_per_iteration": 2.6242218017578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142913, + "balance_loss_mlp": 1.0917964, + "epoch": 0.06213928434013082, + "flos": 598800397824.0, + "grad_norm": 0.06671524152363617, + "language_loss": 0.99795449, + "learning_rate": 0.000997295573896902, + "loss": 1.00938356, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 0.51171875, + "step": 323, + "time_per_iteration": 2.834237813949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03299168, + "balance_loss_mlp": 3.12445545, + "epoch": 0.06233166602539438, + "flos": 1450135789056.0, + "grad_norm": 0.43556355854402456, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.84495211, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 1.75, + "step": 324, + "time_per_iteration": 4.770992040634155 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02151431, + "balance_loss_mlp": 1.9545927, + "epoch": 0.06252404771065795, + "flos": 1463327036928.0, + "grad_norm": 0.14082611715048204, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.80723369, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 1.9609375, + "step": 325, + "time_per_iteration": 4.8816118240356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196226, + "balance_loss_mlp": 1.14768362, + "epoch": 0.06271642939592151, + "flos": 464294011392.0, + "grad_norm": 0.08367806581965369, + "language_loss": 0.93651855, + "learning_rate": 0.000997197627828043, + "loss": 0.94848073, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 0.4855957, + "step": 326, + "time_per_iteration": 2.5508148670196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215208, + "balance_loss_mlp": 1.16862106, + "epoch": 0.06290881108118507, + "flos": 532374985728.0, + "grad_norm": 0.06635735350324974, + "language_loss": 0.89348811, + "learning_rate": 0.0009971645930629716, + "loss": 0.90564024, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 0.46533203, + "step": 327, + "time_per_iteration": 2.711386203765869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125047, + "balance_loss_mlp": 1.20192814, + "epoch": 0.06310119276644863, + "flos": 673562718720.0, + "grad_norm": 0.08863859510008423, + "language_loss": 1.03147936, + "learning_rate": 0.0009971313652814872, + "loss": 1.04398406, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 0.48486328, + "step": 328, + "time_per_iteration": 2.8484854698181152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225004, + "balance_loss_mlp": 1.17553234, + "epoch": 0.0632935744517122, + "flos": 770732734464.0, + "grad_norm": 0.08503417282278386, + "language_loss": 1.0059731, + "learning_rate": 0.0009970979444964903, + "loss": 1.01822317, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 0.49487305, + "step": 329, + "time_per_iteration": 2.957482099533081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197604, + "balance_loss_mlp": 1.14846587, + "epoch": 0.06348595613697576, + "flos": 561913708032.0, + "grad_norm": 0.06790724972181753, + "language_loss": 1.01849604, + "learning_rate": 0.0009970643307209556, + "loss": 1.03047216, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 0.49121094, + "step": 330, + "time_per_iteration": 2.8220374584198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170349, + "balance_loss_mlp": 1.1215446, + "epoch": 0.06367833782223932, + "flos": 676189223424.0, + "grad_norm": 0.06721894230078661, + "language_loss": 0.98097444, + "learning_rate": 0.0009970305239679334, + "loss": 0.99267793, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 0.48803711, + "step": 331, + "time_per_iteration": 2.8813369274139404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176679, + "balance_loss_mlp": 1.12754059, + "epoch": 0.06387071950750288, + "flos": 495297773568.0, + "grad_norm": 0.056286161373139375, + "language_loss": 1.03013992, + "learning_rate": 0.0009969965242505483, + "loss": 1.04190671, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 0.4909668, + "step": 332, + "time_per_iteration": 2.6662604808807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168774, + "balance_loss_mlp": 1.11932611, + "epoch": 0.06406310119276645, + "flos": 533447244288.0, + "grad_norm": 0.06031850484613652, + "language_loss": 0.99096131, + "learning_rate": 0.0009969623315820007, + "loss": 1.00264907, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 0.49487305, + "step": 333, + "time_per_iteration": 2.6671581268310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153261, + "balance_loss_mlp": 1.10619712, + "epoch": 0.06425548287803001, + "flos": 456184700928.0, + "grad_norm": 0.06229524640691676, + "language_loss": 0.99215055, + "learning_rate": 0.000996927945975565, + "loss": 1.00368309, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 0.47070312, + "step": 334, + "time_per_iteration": 2.568838357925415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115937, + "balance_loss_mlp": 1.1125921, + "epoch": 0.06444786456329357, + "flos": 560077908480.0, + "grad_norm": 0.05620099657237302, + "language_loss": 0.95852566, + "learning_rate": 0.0009968933674445906, + "loss": 0.97011936, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 0.46728516, + "step": 335, + "time_per_iteration": 2.6725666522979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160514, + "balance_loss_mlp": 1.1122818, + "epoch": 0.06464024624855713, + "flos": 666085897728.0, + "grad_norm": 0.05589062806096766, + "language_loss": 0.97974062, + "learning_rate": 0.0009968585960025028, + "loss": 0.99134576, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 0.48217773, + "step": 336, + "time_per_iteration": 2.945194959640503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0396516, + "balance_loss_mlp": 3.85834861, + "epoch": 0.0648326279338207, + "flos": 1521371870208.0, + "grad_norm": 0.42886267506062575, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.81618351, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 1.0703125, + "step": 337, + "time_per_iteration": 4.802944183349609 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215082, + "balance_loss_mlp": 1.16968668, + "epoch": 0.06502500961908426, + "flos": 1143339909120.0, + "grad_norm": 0.09324534870618859, + "language_loss": 0.96021777, + "learning_rate": 0.0009967884744390583, + "loss": 0.9723686, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 0.45361328, + "step": 338, + "time_per_iteration": 3.5247950553894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251582, + "balance_loss_mlp": 1.2060678, + "epoch": 0.06521739130434782, + "flos": 582609314304.0, + "grad_norm": 0.09123718626917265, + "language_loss": 0.97373873, + "learning_rate": 0.0009967531243449256, + "loss": 0.98625457, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 0.45507812, + "step": 339, + "time_per_iteration": 2.681973695755005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211309, + "balance_loss_mlp": 1.163077, + "epoch": 0.06540977298961138, + "flos": 497650065408.0, + "grad_norm": 0.06030156589334856, + "language_loss": 1.04525125, + "learning_rate": 0.000996717581394126, + "loss": 1.05736434, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 0.48242188, + "step": 340, + "time_per_iteration": 2.6031126976013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205107, + "balance_loss_mlp": 1.15630233, + "epoch": 0.06560215467487496, + "flos": 542871092736.0, + "grad_norm": 0.06934362388274598, + "language_loss": 1.05133414, + "learning_rate": 0.000996681845600459, + "loss": 1.06338525, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 0.48803711, + "step": 341, + "time_per_iteration": 2.6689491271972656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190009, + "balance_loss_mlp": 1.1402986, + "epoch": 0.06579453636013852, + "flos": 413454357504.0, + "grad_norm": 0.07929020766121274, + "language_loss": 0.97276402, + "learning_rate": 0.0009966459169777982, + "loss": 0.98466408, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 0.49731445, + "step": 342, + "time_per_iteration": 2.5235347747802734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183772, + "balance_loss_mlp": 1.13444376, + "epoch": 0.06598691804540208, + "flos": 560618993664.0, + "grad_norm": 0.06503113555429127, + "language_loss": 1.05431008, + "learning_rate": 0.0009966097955400924, + "loss": 1.0661478, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 0.4934082, + "step": 343, + "time_per_iteration": 2.6987814903259277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195626, + "balance_loss_mlp": 1.14772749, + "epoch": 0.06617929973066564, + "flos": 572090812416.0, + "grad_norm": 0.05810753199069879, + "language_loss": 0.99792945, + "learning_rate": 0.0009965734813013652, + "loss": 1.00988579, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 0.47924805, + "step": 344, + "time_per_iteration": 2.8092823028564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211149, + "balance_loss_mlp": 1.16191518, + "epoch": 0.06637168141592921, + "flos": 490479763968.0, + "grad_norm": 0.08606224500635251, + "language_loss": 1.02011895, + "learning_rate": 0.0009965369742757151, + "loss": 1.03223062, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 0.49243164, + "step": 345, + "time_per_iteration": 2.5981764793395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193116, + "balance_loss_mlp": 1.14435959, + "epoch": 0.06656406310119277, + "flos": 1079194834944.0, + "grad_norm": 0.0619511290056959, + "language_loss": 0.98293203, + "learning_rate": 0.0009965002744773152, + "loss": 0.99486327, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 0.48730469, + "step": 346, + "time_per_iteration": 3.4968950748443604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178364, + "balance_loss_mlp": 1.13115668, + "epoch": 0.06675644478645633, + "flos": 513680735232.0, + "grad_norm": 0.04856723246232052, + "language_loss": 0.95658922, + "learning_rate": 0.0009964633819204139, + "loss": 0.96837282, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 0.47167969, + "step": 347, + "time_per_iteration": 2.6705336570739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04576048, + "balance_loss_mlp": 4.3029151, + "epoch": 0.06694882647171989, + "flos": 1447192479744.0, + "grad_norm": 0.32603271390487504, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.86377156, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 2.734375, + "step": 348, + "time_per_iteration": 4.961863994598389 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03789769, + "balance_loss_mlp": 3.60590124, + "epoch": 0.06714120815698346, + "flos": 1552061772288.0, + "grad_norm": 0.16497869204612428, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.78943658, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 1.8359375, + "step": 349, + "time_per_iteration": 4.876751184463501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181375, + "balance_loss_mlp": 1.13578987, + "epoch": 0.06733358984224702, + "flos": 880073869824.0, + "grad_norm": 0.07770510755269132, + "language_loss": 0.96067584, + "learning_rate": 0.000996351547842304, + "loss": 0.9724896, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 0.45581055, + "step": 350, + "time_per_iteration": 3.166680097579956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217287, + "balance_loss_mlp": 1.16969919, + "epoch": 0.06752597152751058, + "flos": 518906580480.0, + "grad_norm": 0.06167835917893234, + "language_loss": 0.94333142, + "learning_rate": 0.0009963138843953744, + "loss": 0.9555043, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 0.47558594, + "step": 351, + "time_per_iteration": 2.5784904956817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122803, + "balance_loss_mlp": 1.18005991, + "epoch": 0.06771835321277414, + "flos": 539668624896.0, + "grad_norm": 0.06188972934791396, + "language_loss": 0.98543227, + "learning_rate": 0.000996276028262306, + "loss": 0.99771261, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 0.47924805, + "step": 352, + "time_per_iteration": 2.7985076904296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216963, + "balance_loss_mlp": 1.16760993, + "epoch": 0.0679107348980377, + "flos": 460666828800.0, + "grad_norm": 0.0659402302829914, + "language_loss": 1.04801619, + "learning_rate": 0.0009962379794577964, + "loss": 1.06018579, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 0.49365234, + "step": 353, + "time_per_iteration": 2.608032703399658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123128, + "balance_loss_mlp": 1.18266606, + "epoch": 0.06810311658330127, + "flos": 635922026496.0, + "grad_norm": 0.051231802586423875, + "language_loss": 0.94352609, + "learning_rate": 0.000996199737996617, + "loss": 0.95583886, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 0.48657227, + "step": 354, + "time_per_iteration": 2.903005599975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227436, + "balance_loss_mlp": 1.17770219, + "epoch": 0.06829549826856483, + "flos": 464679452160.0, + "grad_norm": 0.05676190931504088, + "language_loss": 1.03759205, + "learning_rate": 0.0009961613038936149, + "loss": 1.04986644, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 0.49755859, + "step": 355, + "time_per_iteration": 2.617859125137329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216553, + "balance_loss_mlp": 1.16572189, + "epoch": 0.06848787995382839, + "flos": 634647135744.0, + "grad_norm": 0.04878484453506707, + "language_loss": 0.95482612, + "learning_rate": 0.000996122677163711, + "loss": 0.96699166, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 0.50878906, + "step": 356, + "time_per_iteration": 2.8171308040618896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230039, + "balance_loss_mlp": 1.18037653, + "epoch": 0.06868026163909195, + "flos": 806374268928.0, + "grad_norm": 0.06504242786199886, + "language_loss": 1.01527905, + "learning_rate": 0.000996083857821902, + "loss": 1.02757955, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 0.49682617, + "step": 357, + "time_per_iteration": 3.0562636852264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221322, + "balance_loss_mlp": 1.17237508, + "epoch": 0.06887264332435553, + "flos": 439227505152.0, + "grad_norm": 0.043415107047687695, + "language_loss": 0.99947309, + "learning_rate": 0.0009960448458832588, + "loss": 1.01168633, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 0.48925781, + "step": 358, + "time_per_iteration": 2.6778266429901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224961, + "balance_loss_mlp": 1.17675292, + "epoch": 0.06906502500961909, + "flos": 484767161856.0, + "grad_norm": 0.061398357107108094, + "language_loss": 0.99686754, + "learning_rate": 0.000996005641362927, + "loss": 1.00911713, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 0.48193359, + "step": 359, + "time_per_iteration": 2.5839953422546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218039, + "balance_loss_mlp": 1.16792321, + "epoch": 0.06925740669488265, + "flos": 733611105792.0, + "grad_norm": 0.045504813624839685, + "language_loss": 1.02907789, + "learning_rate": 0.0009959662442761274, + "loss": 1.04125834, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 0.5012207, + "step": 360, + "time_per_iteration": 2.9012227058410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225991, + "balance_loss_mlp": 1.17504108, + "epoch": 0.0694497883801462, + "flos": 552415707648.0, + "grad_norm": 0.05242893208235044, + "language_loss": 0.96392268, + "learning_rate": 0.000995926654638155, + "loss": 0.97618258, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 0.50976562, + "step": 361, + "time_per_iteration": 2.7972850799560547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120421, + "balance_loss_mlp": 1.15323579, + "epoch": 0.06964217006540978, + "flos": 678015111168.0, + "grad_norm": 0.0452718414118582, + "language_loss": 0.98678619, + "learning_rate": 0.00099588687246438, + "loss": 0.99882829, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 0.51025391, + "step": 362, + "time_per_iteration": 2.845742702484131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011953, + "balance_loss_mlp": 1.14241886, + "epoch": 0.06983455175067334, + "flos": 524241082368.0, + "grad_norm": 0.06654716127982052, + "language_loss": 1.06146324, + "learning_rate": 0.0009958468977702471, + "loss": 1.07341623, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 0.52978516, + "step": 363, + "time_per_iteration": 2.5876591205596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.05386722, + "balance_loss_mlp": 5.09527922, + "epoch": 0.0700269334359369, + "flos": 1576787254272.0, + "grad_norm": 0.35536528906135745, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.85121429, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 2.921875, + "step": 364, + "time_per_iteration": 4.7958595752716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183221, + "balance_loss_mlp": 1.12800324, + "epoch": 0.07021931512120046, + "flos": 1013248839168.0, + "grad_norm": 0.06493728064972926, + "language_loss": 0.94085538, + "learning_rate": 0.0009957663708830612, + "loss": 0.95268762, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 0.55273438, + "step": 365, + "time_per_iteration": 3.238919258117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188034, + "balance_loss_mlp": 1.13048029, + "epoch": 0.07041169680646403, + "flos": 822983099904.0, + "grad_norm": 0.06418297657416602, + "language_loss": 0.98210049, + "learning_rate": 0.0009957258187212714, + "loss": 0.99398077, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 0.57470703, + "step": 366, + "time_per_iteration": 3.0337131023406982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0292345, + "balance_loss_mlp": 2.78612089, + "epoch": 0.07060407849172759, + "flos": 1414392938496.0, + "grad_norm": 0.09868001986151984, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.82118309, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 1.375, + "step": 367, + "time_per_iteration": 4.825684070587158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118916, + "balance_loss_mlp": 1.12988925, + "epoch": 0.07079646017699115, + "flos": 512909853696.0, + "grad_norm": 0.06345017711900697, + "language_loss": 0.94456601, + "learning_rate": 0.0009956441370400167, + "loss": 0.95645761, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 0.59179688, + "step": 368, + "time_per_iteration": 2.6685595512390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203671, + "balance_loss_mlp": 1.14411354, + "epoch": 0.07098884186225471, + "flos": 540501548544.0, + "grad_norm": 0.07550644934377632, + "language_loss": 1.00098681, + "learning_rate": 0.0009956030075522636, + "loss": 1.0130235, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 0.59472656, + "step": 369, + "time_per_iteration": 2.7824065685272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185298, + "balance_loss_mlp": 1.12555027, + "epoch": 0.07118122354751828, + "flos": 548682439680.0, + "grad_norm": 0.0634963537383221, + "language_loss": 1.00245738, + "learning_rate": 0.0009955616856543587, + "loss": 1.01431036, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 0.59667969, + "step": 370, + "time_per_iteration": 2.6869115829467773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117739, + "balance_loss_mlp": 1.11649847, + "epoch": 0.07137360523278184, + "flos": 620904517632.0, + "grad_norm": 0.04749901473855408, + "language_loss": 0.92605507, + "learning_rate": 0.0009955201713623448, + "loss": 0.93782902, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 0.60791016, + "step": 371, + "time_per_iteration": 2.7894065380096436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03553003, + "balance_loss_mlp": 3.34700894, + "epoch": 0.0715659869180454, + "flos": 1502672477184.0, + "grad_norm": 0.1539254818196356, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.80225718, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 2.0625, + "step": 372, + "time_per_iteration": 5.025646924972534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188763, + "balance_loss_mlp": 1.12739396, + "epoch": 0.07175836860330896, + "flos": 495493065216.0, + "grad_norm": 0.05697389015463885, + "language_loss": 1.05361807, + "learning_rate": 0.0009954365656605333, + "loss": 1.06550562, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 0.61328125, + "step": 373, + "time_per_iteration": 2.5767741203308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203971, + "balance_loss_mlp": 1.13878703, + "epoch": 0.07195075028857253, + "flos": 785725650432.0, + "grad_norm": 0.0561234241567743, + "language_loss": 0.98981488, + "learning_rate": 0.0009953944742831947, + "loss": 1.00185454, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 0.65185547, + "step": 374, + "time_per_iteration": 3.0126912593841553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209318, + "balance_loss_mlp": 1.14351439, + "epoch": 0.0721431319738361, + "flos": 593107619328.0, + "grad_norm": 0.05197007853134015, + "language_loss": 1.02623391, + "learning_rate": 0.0009953521905766642, + "loss": 1.0383271, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 0.65820312, + "step": 375, + "time_per_iteration": 2.9678027629852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207965, + "balance_loss_mlp": 1.14464104, + "epoch": 0.07233551365909965, + "flos": 548250011136.0, + "grad_norm": 0.05250799377029981, + "language_loss": 1.01212132, + "learning_rate": 0.0009953097145573577, + "loss": 1.02420104, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 0.6328125, + "step": 376, + "time_per_iteration": 2.7048561573028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121329, + "balance_loss_mlp": 1.1502521, + "epoch": 0.07252789534436321, + "flos": 957568780800.0, + "grad_norm": 0.050651846587156886, + "language_loss": 0.98499894, + "learning_rate": 0.000995267046241766, + "loss": 0.99713182, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 0.62988281, + "step": 377, + "time_per_iteration": 3.287705421447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225924, + "balance_loss_mlp": 1.16341114, + "epoch": 0.07272027702962677, + "flos": 507649503744.0, + "grad_norm": 0.05776369312695448, + "language_loss": 0.98701203, + "learning_rate": 0.0009952241856464547, + "loss": 0.99927127, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 0.62451172, + "step": 378, + "time_per_iteration": 2.5897629261016846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220563, + "balance_loss_mlp": 1.16010034, + "epoch": 0.07291265871489035, + "flos": 612412337664.0, + "grad_norm": 0.05450855675542614, + "language_loss": 1.05642247, + "learning_rate": 0.0009951811327880632, + "loss": 1.06862807, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 0.60351562, + "step": 379, + "time_per_iteration": 2.7320594787597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220943, + "balance_loss_mlp": 1.15924072, + "epoch": 0.0731050404001539, + "flos": 495750025728.0, + "grad_norm": 0.04947645913164449, + "language_loss": 0.99005401, + "learning_rate": 0.0009951378876833063, + "loss": 1.00226343, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 0.61669922, + "step": 380, + "time_per_iteration": 2.595810651779175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196634, + "balance_loss_mlp": 1.13798296, + "epoch": 0.07329742208541747, + "flos": 639966956544.0, + "grad_norm": 0.058807068798268386, + "language_loss": 1.05567527, + "learning_rate": 0.0009950944503489736, + "loss": 1.06764162, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 0.5859375, + "step": 381, + "time_per_iteration": 2.733560562133789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197608, + "balance_loss_mlp": 1.13914812, + "epoch": 0.07348980377068103, + "flos": 816346543104.0, + "grad_norm": 0.06747680453051412, + "language_loss": 0.99337935, + "learning_rate": 0.0009950508208019285, + "loss": 1.00535548, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 0.58398438, + "step": 382, + "time_per_iteration": 2.9895970821380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176507, + "balance_loss_mlp": 1.12062192, + "epoch": 0.0736821854559446, + "flos": 508640269824.0, + "grad_norm": 0.05827239016363537, + "language_loss": 1.03707182, + "learning_rate": 0.0009950069990591096, + "loss": 1.04883695, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 0.55908203, + "step": 383, + "time_per_iteration": 2.6856980323791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.05393736, + "balance_loss_mlp": 5.19079447, + "epoch": 0.07387456714120816, + "flos": 1554648629760.0, + "grad_norm": 0.38241300139143997, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.81795102, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 2.03125, + "step": 384, + "time_per_iteration": 4.860661268234253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128436, + "balance_loss_mlp": 1.07369518, + "epoch": 0.07406694882647172, + "flos": 525503490048.0, + "grad_norm": 0.06005395599718801, + "language_loss": 0.96679938, + "learning_rate": 0.0009949187790542777, + "loss": 0.97808379, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 0.54760742, + "step": 385, + "time_per_iteration": 2.7245922088623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146737, + "balance_loss_mlp": 1.09042215, + "epoch": 0.07425933051173528, + "flos": 497738898432.0, + "grad_norm": 0.06780842756482337, + "language_loss": 0.9270733, + "learning_rate": 0.0009948743808265148, + "loss": 0.93854064, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 0.56298828, + "step": 386, + "time_per_iteration": 2.6745331287384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187036, + "balance_loss_mlp": 1.13334417, + "epoch": 0.07445171219699885, + "flos": 505003175424.0, + "grad_norm": 0.04295711334598506, + "language_loss": 1.02854586, + "learning_rate": 0.0009948297904714782, + "loss": 1.04041624, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 0.53759766, + "step": 387, + "time_per_iteration": 2.681718111038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202671, + "balance_loss_mlp": 1.15167296, + "epoch": 0.07464409388226241, + "flos": 553977294336.0, + "grad_norm": 0.05564614333293379, + "language_loss": 0.94366896, + "learning_rate": 0.0009947850080064796, + "loss": 0.95569569, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 0.51049805, + "step": 388, + "time_per_iteration": 2.788663148880005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216483, + "balance_loss_mlp": 1.16817975, + "epoch": 0.07483647556752597, + "flos": 776862710784.0, + "grad_norm": 0.07112384111458, + "language_loss": 0.99713415, + "learning_rate": 0.0009947400334489047, + "loss": 1.00929892, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 0.48291016, + "step": 389, + "time_per_iteration": 2.9905049800872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227498, + "balance_loss_mlp": 1.17926562, + "epoch": 0.07502885725278953, + "flos": 612540817920.0, + "grad_norm": 0.06900212518032732, + "language_loss": 0.91264081, + "learning_rate": 0.0009946948668162145, + "loss": 0.92491579, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 0.48168945, + "step": 390, + "time_per_iteration": 2.767531394958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012247, + "balance_loss_mlp": 1.17277205, + "epoch": 0.0752212389380531, + "flos": 688629786624.0, + "grad_norm": 0.052104168644034804, + "language_loss": 0.95126128, + "learning_rate": 0.0009946495081259441, + "loss": 0.96350825, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 0.52001953, + "step": 391, + "time_per_iteration": 2.816908597946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192311, + "balance_loss_mlp": 1.14057434, + "epoch": 0.07541362062331666, + "flos": 765699609600.0, + "grad_norm": 0.051504782312047234, + "language_loss": 0.99421549, + "learning_rate": 0.0009946039573957035, + "loss": 1.00613856, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 0.51782227, + "step": 392, + "time_per_iteration": 2.9265222549438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116666, + "balance_loss_mlp": 1.11478019, + "epoch": 0.07560600230858022, + "flos": 588749202432.0, + "grad_norm": 0.055053573084277836, + "language_loss": 0.95799196, + "learning_rate": 0.000994558214643177, + "loss": 0.96965855, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 0.51928711, + "step": 393, + "time_per_iteration": 2.766477584838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165121, + "balance_loss_mlp": 1.11352682, + "epoch": 0.07579838399384378, + "flos": 749834496000.0, + "grad_norm": 0.05925711706254076, + "language_loss": 0.97585773, + "learning_rate": 0.000994512279886123, + "loss": 0.98750889, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 0.51660156, + "step": 394, + "time_per_iteration": 3.0709142684936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143606, + "balance_loss_mlp": 1.09191656, + "epoch": 0.07599076567910736, + "flos": 523457717760.0, + "grad_norm": 0.04191079383555719, + "language_loss": 0.97239089, + "learning_rate": 0.0009944661531423758, + "loss": 0.98382699, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 0.51757812, + "step": 395, + "time_per_iteration": 2.7044599056243896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134219, + "balance_loss_mlp": 1.08338809, + "epoch": 0.07618314736437092, + "flos": 551086488576.0, + "grad_norm": 0.05545815376917658, + "language_loss": 0.96390671, + "learning_rate": 0.000994419834429843, + "loss": 0.97524893, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 0.50854492, + "step": 396, + "time_per_iteration": 2.6767609119415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135922, + "balance_loss_mlp": 1.08525789, + "epoch": 0.07637552904963447, + "flos": 698206708224.0, + "grad_norm": 0.05307630449121137, + "language_loss": 1.01208472, + "learning_rate": 0.0009943733237665069, + "loss": 1.02344394, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 0.50683594, + "step": 397, + "time_per_iteration": 2.819148302078247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124426, + "balance_loss_mlp": 1.07502615, + "epoch": 0.07656791073489803, + "flos": 579379682304.0, + "grad_norm": 0.049844903289807924, + "language_loss": 0.99488425, + "learning_rate": 0.0009943266211704248, + "loss": 1.00612843, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 0.49389648, + "step": 398, + "time_per_iteration": 2.9555482864379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125466, + "balance_loss_mlp": 1.07675719, + "epoch": 0.0767602924201616, + "flos": 417145780224.0, + "grad_norm": 0.05620775813161816, + "language_loss": 1.01430082, + "learning_rate": 0.000994279726659728, + "loss": 1.02555549, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 0.48706055, + "step": 399, + "time_per_iteration": 2.5138003826141357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127137, + "balance_loss_mlp": 1.07761765, + "epoch": 0.07695267410542517, + "flos": 482914109952.0, + "grad_norm": 0.05674792404596756, + "language_loss": 0.99883693, + "learning_rate": 0.0009942326402526231, + "loss": 1.01010823, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 0.49511719, + "step": 400, + "time_per_iteration": 2.5245604515075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127721, + "balance_loss_mlp": 1.07793891, + "epoch": 0.07714505579068873, + "flos": 530998778880.0, + "grad_norm": 0.036646942736225624, + "language_loss": 0.9767518, + "learning_rate": 0.0009941853619673902, + "loss": 0.98802906, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 0.49804688, + "step": 401, + "time_per_iteration": 2.644771099090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123434, + "balance_loss_mlp": 1.07451057, + "epoch": 0.07733743747595229, + "flos": 804995490816.0, + "grad_norm": 0.057554732491620374, + "language_loss": 1.01884329, + "learning_rate": 0.0009941378918223844, + "loss": 1.0300777, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 0.48876953, + "step": 402, + "time_per_iteration": 3.051617383956909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112416, + "balance_loss_mlp": 1.07618988, + "epoch": 0.07752981916121585, + "flos": 622476016128.0, + "grad_norm": 0.04510164642433069, + "language_loss": 0.94372368, + "learning_rate": 0.0009940902298360354, + "loss": 0.95496523, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 0.47924805, + "step": 403, + "time_per_iteration": 2.7302582263946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118279, + "balance_loss_mlp": 1.0687592, + "epoch": 0.07772220084647942, + "flos": 728276603904.0, + "grad_norm": 0.062376946911402976, + "language_loss": 1.04687834, + "learning_rate": 0.0009940423760268473, + "loss": 1.05806112, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 0.49536133, + "step": 404, + "time_per_iteration": 2.856938600540161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118682, + "balance_loss_mlp": 1.07009196, + "epoch": 0.07791458253174298, + "flos": 555412972032.0, + "grad_norm": 0.046838991637930295, + "language_loss": 0.97888398, + "learning_rate": 0.0009939943304133982, + "loss": 0.99007082, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 0.48608398, + "step": 405, + "time_per_iteration": 2.6161091327667236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115161, + "balance_loss_mlp": 1.06881261, + "epoch": 0.07810696421700654, + "flos": 553181819904.0, + "grad_norm": 0.04496148345425058, + "language_loss": 1.04081011, + "learning_rate": 0.0009939460930143416, + "loss": 1.0519619, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 0.46337891, + "step": 406, + "time_per_iteration": 2.6310677528381348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119218, + "balance_loss_mlp": 1.07332289, + "epoch": 0.0782993459022701, + "flos": 650633389056.0, + "grad_norm": 0.037201804651944344, + "language_loss": 0.98071587, + "learning_rate": 0.0009938976638484043, + "loss": 0.99190807, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 0.45874023, + "step": 407, + "time_per_iteration": 2.8977036476135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112887, + "balance_loss_mlp": 1.06844616, + "epoch": 0.07849172758753367, + "flos": 496172542464.0, + "grad_norm": 0.04629061554837057, + "language_loss": 0.97991359, + "learning_rate": 0.0009938490429343887, + "loss": 0.99104249, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 0.44458008, + "step": 408, + "time_per_iteration": 2.562168836593628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111855, + "balance_loss_mlp": 1.07315516, + "epoch": 0.07868410927279723, + "flos": 577971542016.0, + "grad_norm": 0.04004461216150975, + "language_loss": 0.97974342, + "learning_rate": 0.0009938002302911709, + "loss": 0.99092889, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 0.4543457, + "step": 409, + "time_per_iteration": 2.738518238067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123547, + "balance_loss_mlp": 1.07915401, + "epoch": 0.07887649095806079, + "flos": 522970960896.0, + "grad_norm": 0.07048914756312923, + "language_loss": 1.00401747, + "learning_rate": 0.0009937512259377015, + "loss": 1.01525307, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 0.44384766, + "step": 410, + "time_per_iteration": 2.670149564743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110678, + "balance_loss_mlp": 1.0668565, + "epoch": 0.07906887264332435, + "flos": 557253540864.0, + "grad_norm": 0.049646402233970426, + "language_loss": 0.99659574, + "learning_rate": 0.000993702029893006, + "loss": 1.00770259, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 0.4387207, + "step": 411, + "time_per_iteration": 2.7853777408599854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118473, + "balance_loss_mlp": 1.07200527, + "epoch": 0.07926125432858792, + "flos": 821984993280.0, + "grad_norm": 0.04880092350488667, + "language_loss": 0.98862529, + "learning_rate": 0.0009936526421761838, + "loss": 0.99981004, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 0.46435547, + "step": 412, + "time_per_iteration": 3.030674457550049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114951, + "balance_loss_mlp": 1.07043815, + "epoch": 0.07945363601385148, + "flos": 562336224768.0, + "grad_norm": 0.04383720282943398, + "language_loss": 1.01490402, + "learning_rate": 0.000993603062806409, + "loss": 1.02605367, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 0.4453125, + "step": 413, + "time_per_iteration": 2.7101500034332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109682, + "balance_loss_mlp": 1.0637151, + "epoch": 0.07964601769911504, + "flos": 517868826624.0, + "grad_norm": 0.046157231925668944, + "language_loss": 1.04664707, + "learning_rate": 0.0009935532918029298, + "loss": 1.05774391, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 0.45947266, + "step": 414, + "time_per_iteration": 2.593390941619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118947, + "balance_loss_mlp": 1.07278943, + "epoch": 0.0798383993843786, + "flos": 539224086528.0, + "grad_norm": 0.058468816323775735, + "language_loss": 0.97956645, + "learning_rate": 0.0009935033291850694, + "loss": 0.99075592, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 0.46166992, + "step": 415, + "time_per_iteration": 2.6693851947784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111466, + "balance_loss_mlp": 1.0654031, + "epoch": 0.08003078106964218, + "flos": 485145262080.0, + "grad_norm": 0.061030352209764355, + "language_loss": 1.00225627, + "learning_rate": 0.0009934531749722247, + "loss": 1.01337099, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 0.46044922, + "step": 416, + "time_per_iteration": 2.578746795654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119366, + "balance_loss_mlp": 1.07337523, + "epoch": 0.08022316275490574, + "flos": 518254267392.0, + "grad_norm": 0.05071064772829009, + "language_loss": 0.98778659, + "learning_rate": 0.0009934028291838672, + "loss": 0.99898028, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 0.45996094, + "step": 417, + "time_per_iteration": 2.7096333503723145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106202, + "balance_loss_mlp": 1.06166553, + "epoch": 0.0804155444401693, + "flos": 494012971008.0, + "grad_norm": 0.045680808340910005, + "language_loss": 0.94326293, + "learning_rate": 0.0009933522918395433, + "loss": 0.95432496, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 0.44555664, + "step": 418, + "time_per_iteration": 2.644414186477661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04959176, + "balance_loss_mlp": 4.71808767, + "epoch": 0.08060792612543285, + "flos": 1581422455296.0, + "grad_norm": 0.3214703434406663, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.83210278, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 2.40625, + "step": 419, + "time_per_iteration": 4.868964195251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115626, + "balance_loss_mlp": 1.07108891, + "epoch": 0.08080030781069643, + "flos": 525343076352.0, + "grad_norm": 0.08060687528614664, + "language_loss": 1.13036489, + "learning_rate": 0.000993250642561551, + "loss": 1.14152122, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 0.4453125, + "step": 420, + "time_per_iteration": 2.632162094116211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121548, + "balance_loss_mlp": 1.07538986, + "epoch": 0.08099268949595999, + "flos": 546718159872.0, + "grad_norm": 0.08633853635548816, + "language_loss": 0.9784801, + "learning_rate": 0.0009931995306673466, + "loss": 0.98969555, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 0.46118164, + "step": 421, + "time_per_iteration": 2.7046737670898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134412, + "balance_loss_mlp": 1.08815861, + "epoch": 0.08118507118122355, + "flos": 510367412736.0, + "grad_norm": 0.038770411105538145, + "language_loss": 1.03907061, + "learning_rate": 0.000993148227296103, + "loss": 1.05041468, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 0.4621582, + "step": 422, + "time_per_iteration": 2.669496536254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133253, + "balance_loss_mlp": 1.08707166, + "epoch": 0.08137745286648711, + "flos": 720671302656.0, + "grad_norm": 0.053095831055692516, + "language_loss": 0.9112367, + "learning_rate": 0.000993096732467738, + "loss": 0.92256927, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 0.46166992, + "step": 423, + "time_per_iteration": 2.961660861968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150855, + "balance_loss_mlp": 1.10498345, + "epoch": 0.08156983455175067, + "flos": 679613773824.0, + "grad_norm": 0.08137036582560589, + "language_loss": 0.99760056, + "learning_rate": 0.0009930450462022435, + "loss": 1.00910902, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 0.45874023, + "step": 424, + "time_per_iteration": 2.7952311038970947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03600409, + "balance_loss_mlp": 3.48901963, + "epoch": 0.08176221623701424, + "flos": 1453377157632.0, + "grad_norm": 0.18349806711668631, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.82790214, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 1.1171875, + "step": 425, + "time_per_iteration": 4.8854875564575195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159177, + "balance_loss_mlp": 1.11344862, + "epoch": 0.0819545979222778, + "flos": 1556602292736.0, + "grad_norm": 0.06491953183218531, + "language_loss": 0.9776966, + "learning_rate": 0.0009929410994402065, + "loss": 0.98928833, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 0.45703125, + "step": 426, + "time_per_iteration": 4.275091886520386 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169515, + "balance_loss_mlp": 1.12223697, + "epoch": 0.08214697960754136, + "flos": 512724473856.0, + "grad_norm": 0.07437504582125473, + "language_loss": 1.02033544, + "learning_rate": 0.0009928888389840196, + "loss": 1.03203058, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 0.47241211, + "step": 427, + "time_per_iteration": 2.7036454677581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145234, + "balance_loss_mlp": 1.09941018, + "epoch": 0.08233936129280492, + "flos": 595124029440.0, + "grad_norm": 0.05964472172349544, + "language_loss": 1.03706717, + "learning_rate": 0.0009928363871714147, + "loss": 1.04851961, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 0.45849609, + "step": 428, + "time_per_iteration": 2.6669116020202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115004, + "balance_loss_mlp": 1.10254741, + "epoch": 0.08253174297806849, + "flos": 572039055360.0, + "grad_norm": 0.07530468467255677, + "language_loss": 0.97491598, + "learning_rate": 0.0009927837440227556, + "loss": 0.98641634, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 0.47485352, + "step": 429, + "time_per_iteration": 2.8463807106018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120703, + "balance_loss_mlp": 1.07588065, + "epoch": 0.08272412466333205, + "flos": 623380147200.0, + "grad_norm": 0.04140843961960757, + "language_loss": 0.92054397, + "learning_rate": 0.0009927309095584798, + "loss": 0.93175101, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 0.44824219, + "step": 430, + "time_per_iteration": 2.9767606258392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116415, + "balance_loss_mlp": 1.07278419, + "epoch": 0.08291650634859561, + "flos": 513994595328.0, + "grad_norm": 0.04726827868993605, + "language_loss": 1.04780793, + "learning_rate": 0.0009926778837991, + "loss": 1.05897212, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 0.43652344, + "step": 431, + "time_per_iteration": 2.5883395671844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112082, + "balance_loss_mlp": 1.06749809, + "epoch": 0.08310888803385917, + "flos": 667365931008.0, + "grad_norm": 0.049074519776006666, + "language_loss": 1.0243988, + "learning_rate": 0.000992624666765202, + "loss": 1.0355196, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 0.44604492, + "step": 432, + "time_per_iteration": 2.7943906784057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115829, + "balance_loss_mlp": 1.07200766, + "epoch": 0.08330126971912274, + "flos": 583293560832.0, + "grad_norm": 0.04417562175093811, + "language_loss": 1.00109053, + "learning_rate": 0.000992571258477447, + "loss": 1.01224887, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 0.43823242, + "step": 433, + "time_per_iteration": 2.836127758026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116804, + "balance_loss_mlp": 1.07260084, + "epoch": 0.0834936514043863, + "flos": 561350227968.0, + "grad_norm": 0.04319706549365549, + "language_loss": 0.93695247, + "learning_rate": 0.0009925176589565695, + "loss": 0.94812053, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 0.44213867, + "step": 434, + "time_per_iteration": 2.8157734870910645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131219, + "balance_loss_mlp": 1.08756483, + "epoch": 0.08368603308964986, + "flos": 494519551488.0, + "grad_norm": 0.04172416189060796, + "language_loss": 1.04242814, + "learning_rate": 0.0009924638682233791, + "loss": 1.05374026, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 0.43652344, + "step": 435, + "time_per_iteration": 2.5577316284179688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02503783, + "balance_loss_mlp": 2.3527205, + "epoch": 0.08387841477491342, + "flos": 1389017714688.0, + "grad_norm": 0.06968128915635463, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.82068378, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 1.5078125, + "step": 436, + "time_per_iteration": 4.594938516616821 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129118, + "balance_loss_mlp": 1.08348453, + "epoch": 0.084070796460177, + "flos": 798984082944.0, + "grad_norm": 0.0610737753852808, + "language_loss": 0.94037408, + "learning_rate": 0.0009923557132036668, + "loss": 0.95166528, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 0.45629883, + "step": 437, + "time_per_iteration": 3.0716845989227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118795, + "balance_loss_mlp": 1.07430601, + "epoch": 0.08426317814544056, + "flos": 558963431424.0, + "grad_norm": 0.04662895628051273, + "language_loss": 0.97730738, + "learning_rate": 0.0009923013489591345, + "loss": 0.98849535, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 0.4453125, + "step": 438, + "time_per_iteration": 2.726792812347412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110576, + "balance_loss_mlp": 1.06685066, + "epoch": 0.08445555983070412, + "flos": 810421396992.0, + "grad_norm": 0.04626496214247174, + "language_loss": 0.96079296, + "learning_rate": 0.0009922467935862681, + "loss": 0.97189873, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 0.4375, + "step": 439, + "time_per_iteration": 3.0908052921295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119416, + "balance_loss_mlp": 1.07273376, + "epoch": 0.08464794151596768, + "flos": 510184604160.0, + "grad_norm": 0.048922855388473234, + "language_loss": 0.99432743, + "learning_rate": 0.0009921920471062478, + "loss": 1.00552154, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 0.46655273, + "step": 440, + "time_per_iteration": 2.622451066970825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117818, + "balance_loss_mlp": 1.07342434, + "epoch": 0.08484032320123125, + "flos": 556413649920.0, + "grad_norm": 0.07502031783190574, + "language_loss": 0.9797709, + "learning_rate": 0.0009921371095403281, + "loss": 0.99094903, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 0.44433594, + "step": 441, + "time_per_iteration": 2.705152750015259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011116, + "balance_loss_mlp": 1.06863689, + "epoch": 0.08503270488649481, + "flos": 527354343936.0, + "grad_norm": 0.04941418140969711, + "language_loss": 1.00754833, + "learning_rate": 0.0009920819809098379, + "loss": 1.01866436, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 0.42993164, + "step": 442, + "time_per_iteration": 2.5887317657470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119689, + "balance_loss_mlp": 1.07715499, + "epoch": 0.08522508657175837, + "flos": 614267960832.0, + "grad_norm": 0.06964486535702215, + "language_loss": 0.96275294, + "learning_rate": 0.0009920266612361798, + "loss": 0.97394979, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 0.42578125, + "step": 443, + "time_per_iteration": 2.745222330093384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107696, + "balance_loss_mlp": 1.06587708, + "epoch": 0.08541746825702193, + "flos": 619792611840.0, + "grad_norm": 0.05163049083883061, + "language_loss": 0.96866751, + "learning_rate": 0.0009919711505408308, + "loss": 0.97974443, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 0.41821289, + "step": 444, + "time_per_iteration": 2.780095100402832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106314, + "balance_loss_mlp": 1.0654248, + "epoch": 0.08560984994228549, + "flos": 482914109952.0, + "grad_norm": 0.054748359311131624, + "language_loss": 0.94535226, + "learning_rate": 0.000991915448845342, + "loss": 0.95641541, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 0.40893555, + "step": 445, + "time_per_iteration": 2.5229337215423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110283, + "balance_loss_mlp": 1.06279922, + "epoch": 0.08580223162754906, + "flos": 517152273408.0, + "grad_norm": 0.0575820537988498, + "language_loss": 1.03181779, + "learning_rate": 0.000991859556171339, + "loss": 1.04284596, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 0.40039062, + "step": 446, + "time_per_iteration": 2.5957653522491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105649, + "balance_loss_mlp": 1.06497526, + "epoch": 0.08599461331281262, + "flos": 531475623936.0, + "grad_norm": 0.04289742759235468, + "language_loss": 1.05262291, + "learning_rate": 0.000991803472540521, + "loss": 1.06367946, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 0.40673828, + "step": 447, + "time_per_iteration": 2.6220486164093018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105752, + "balance_loss_mlp": 1.06550729, + "epoch": 0.08618699499807618, + "flos": 790299182592.0, + "grad_norm": 0.04330621576945977, + "language_loss": 1.00096428, + "learning_rate": 0.0009917471979746615, + "loss": 1.01202178, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 0.40234375, + "step": 448, + "time_per_iteration": 2.9767467975616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114799, + "balance_loss_mlp": 1.07379115, + "epoch": 0.08637937668333974, + "flos": 565988000256.0, + "grad_norm": 0.03609686036920932, + "language_loss": 0.98485255, + "learning_rate": 0.0009916907324956086, + "loss": 0.99600053, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 0.41015625, + "step": 449, + "time_per_iteration": 2.701143980026245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117003, + "balance_loss_mlp": 1.07480288, + "epoch": 0.08657175836860331, + "flos": 445167332352.0, + "grad_norm": 0.04834207301210501, + "language_loss": 0.95441091, + "learning_rate": 0.0009916340761252837, + "loss": 0.965581, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 0.42211914, + "step": 450, + "time_per_iteration": 2.6036393642425537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129901, + "balance_loss_mlp": 1.08910751, + "epoch": 0.08676414005386687, + "flos": 844148210688.0, + "grad_norm": 0.07269963588094165, + "language_loss": 0.9243114, + "learning_rate": 0.0009915772288856832, + "loss": 0.93561041, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 0.40820312, + "step": 451, + "time_per_iteration": 3.05719256401062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125865, + "balance_loss_mlp": 1.08359361, + "epoch": 0.08695652173913043, + "flos": 603292437504.0, + "grad_norm": 0.05954656443346509, + "language_loss": 0.93746579, + "learning_rate": 0.000991520190798877, + "loss": 0.94872439, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 0.42285156, + "step": 452, + "time_per_iteration": 2.804128885269165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120027, + "balance_loss_mlp": 1.07723105, + "epoch": 0.08714890342439399, + "flos": 730737552384.0, + "grad_norm": 0.05604676795867647, + "language_loss": 1.04000187, + "learning_rate": 0.0009914629618870089, + "loss": 1.05120206, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 0.42797852, + "step": 453, + "time_per_iteration": 2.8959083557128906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02032313, + "balance_loss_mlp": 1.86675501, + "epoch": 0.08734128510965757, + "flos": 1482303214080.0, + "grad_norm": 0.06678910630402063, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.80708182, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 1.65625, + "step": 454, + "time_per_iteration": 4.753306865692139 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01974747, + "balance_loss_mlp": 1.80537415, + "epoch": 0.08753366679492113, + "flos": 1523022289920.0, + "grad_norm": 0.06350102966569023, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.83402705, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 1.6953125, + "step": 455, + "time_per_iteration": 4.909627914428711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100081, + "balance_loss_mlp": 1.05778539, + "epoch": 0.08772604848018468, + "flos": 721252035072.0, + "grad_norm": 0.07384563339861851, + "language_loss": 0.95938599, + "learning_rate": 0.0009912901304235883, + "loss": 0.97038674, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 0.42333984, + "step": 456, + "time_per_iteration": 3.0303096771240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093114, + "balance_loss_mlp": 1.05112898, + "epoch": 0.08791843016544824, + "flos": 708233310720.0, + "grad_norm": 0.061767025741825826, + "language_loss": 0.93898749, + "learning_rate": 0.000991232138434397, + "loss": 0.94991863, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 0.41992188, + "step": 457, + "time_per_iteration": 2.834221601486206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089137, + "balance_loss_mlp": 1.04824805, + "epoch": 0.08811081185071182, + "flos": 473043151872.0, + "grad_norm": 0.05183647995223567, + "language_loss": 1.00765896, + "learning_rate": 0.000991173955731976, + "loss": 1.0185504, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 0.40869141, + "step": 458, + "time_per_iteration": 2.628783702850342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098492, + "balance_loss_mlp": 1.05569601, + "epoch": 0.08830319353597538, + "flos": 684980209152.0, + "grad_norm": 0.052575936673692925, + "language_loss": 1.04489028, + "learning_rate": 0.0009911155823389137, + "loss": 1.0558753, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 0.42797852, + "step": 459, + "time_per_iteration": 2.964416742324829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106983, + "balance_loss_mlp": 1.06523609, + "epoch": 0.08849557522123894, + "flos": 573509237760.0, + "grad_norm": 0.05270293395412616, + "language_loss": 1.00385904, + "learning_rate": 0.000991057018277873, + "loss": 1.01492882, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 0.41748047, + "step": 460, + "time_per_iteration": 2.6944808959960938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104651, + "balance_loss_mlp": 1.06245136, + "epoch": 0.0886879569065025, + "flos": 564567376896.0, + "grad_norm": 0.04953210926048159, + "language_loss": 1.01399374, + "learning_rate": 0.0009909982635715898, + "loss": 1.02504039, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 0.42236328, + "step": 461, + "time_per_iteration": 2.6137924194335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096015, + "balance_loss_mlp": 1.05374336, + "epoch": 0.08888033859176607, + "flos": 563877987840.0, + "grad_norm": 0.050729417377465176, + "language_loss": 1.00123549, + "learning_rate": 0.0009909393182428751, + "loss": 1.01219559, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 0.42285156, + "step": 462, + "time_per_iteration": 2.6657960414886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109539, + "balance_loss_mlp": 1.06891286, + "epoch": 0.08907272027702963, + "flos": 465761622528.0, + "grad_norm": 0.043715633324142876, + "language_loss": 0.94138575, + "learning_rate": 0.000990880182314614, + "loss": 0.95248115, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 0.40625, + "step": 463, + "time_per_iteration": 2.733408212661743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101315, + "balance_loss_mlp": 1.06121325, + "epoch": 0.08926510196229319, + "flos": 681528494592.0, + "grad_norm": 0.051961844945365605, + "language_loss": 0.94176865, + "learning_rate": 0.0009908208558097643, + "loss": 0.9527818, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 0.40087891, + "step": 464, + "time_per_iteration": 2.9006474018096924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105621, + "balance_loss_mlp": 1.06508923, + "epoch": 0.08945748364755675, + "flos": 596692956672.0, + "grad_norm": 0.04470923680131565, + "language_loss": 0.9716863, + "learning_rate": 0.000990761338751359, + "loss": 0.98274255, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 0.40527344, + "step": 465, + "time_per_iteration": 2.775830030441284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01410893, + "balance_loss_mlp": 1.25296497, + "epoch": 0.08964986533282032, + "flos": 1585931747328.0, + "grad_norm": 0.0425617539044403, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.75070524, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 1.578125, + "step": 466, + "time_per_iteration": 5.023500919342041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100083, + "balance_loss_mlp": 1.05869305, + "epoch": 0.08984224701808388, + "flos": 533523967488.0, + "grad_norm": 0.04007163966797277, + "language_loss": 0.9983623, + "learning_rate": 0.0009906417330663815, + "loss": 1.00936306, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 0.41381836, + "step": 467, + "time_per_iteration": 2.6194305419921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099405, + "balance_loss_mlp": 1.05889773, + "epoch": 0.09003462870334744, + "flos": 478931222016.0, + "grad_norm": 0.03985353179312445, + "language_loss": 0.96447593, + "learning_rate": 0.0009905816444862442, + "loss": 0.97546995, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 0.4050293, + "step": 468, + "time_per_iteration": 2.623267889022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107145, + "balance_loss_mlp": 1.06568456, + "epoch": 0.090227010388611, + "flos": 653625510912.0, + "grad_norm": 0.038840192804800056, + "language_loss": 0.93513083, + "learning_rate": 0.0009905213654454216, + "loss": 0.94620228, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 0.41455078, + "step": 469, + "time_per_iteration": 2.9024641513824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105337, + "balance_loss_mlp": 1.06466317, + "epoch": 0.09041939207387456, + "flos": 618186608640.0, + "grad_norm": 0.04985478927164425, + "language_loss": 1.01848495, + "learning_rate": 0.0009904608959673158, + "loss": 1.02953827, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 0.40649414, + "step": 470, + "time_per_iteration": 2.7711682319641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097659, + "balance_loss_mlp": 1.0588448, + "epoch": 0.09061177375913813, + "flos": 454368724992.0, + "grad_norm": 0.04989175862356038, + "language_loss": 1.02851224, + "learning_rate": 0.000990400236075403, + "loss": 1.03948903, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 0.38793945, + "step": 471, + "time_per_iteration": 2.536189317703247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109095, + "balance_loss_mlp": 1.05113411, + "epoch": 0.0908041554444017, + "flos": 544247299584.0, + "grad_norm": 0.03738902964718639, + "language_loss": 0.98994756, + "learning_rate": 0.0009903393857932338, + "loss": 1.000857, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 0.39794922, + "step": 472, + "time_per_iteration": 2.6588857173919678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097802, + "balance_loss_mlp": 1.05908275, + "epoch": 0.09099653712966525, + "flos": 564335009280.0, + "grad_norm": 0.045733529486957185, + "language_loss": 0.97091877, + "learning_rate": 0.0009902783451444317, + "loss": 0.98189688, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 0.38720703, + "step": 473, + "time_per_iteration": 2.6981122493743896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091239, + "balance_loss_mlp": 1.05406976, + "epoch": 0.09118891881492881, + "flos": 474540498432.0, + "grad_norm": 0.04942472768420212, + "language_loss": 1.00819659, + "learning_rate": 0.0009902171141526956, + "loss": 1.01910901, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 0.37158203, + "step": 474, + "time_per_iteration": 2.527256727218628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099497, + "balance_loss_mlp": 1.06225586, + "epoch": 0.09138130050019239, + "flos": 545860643328.0, + "grad_norm": 0.04275448033987936, + "language_loss": 0.88210893, + "learning_rate": 0.000990155692841797, + "loss": 0.8931039, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 0.37231445, + "step": 475, + "time_per_iteration": 2.989063262939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097088, + "balance_loss_mlp": 1.06084871, + "epoch": 0.09157368218545595, + "flos": 732711744000.0, + "grad_norm": 0.04412440376655801, + "language_loss": 1.00229144, + "learning_rate": 0.0009900940812355818, + "loss": 1.01326227, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 0.36254883, + "step": 476, + "time_per_iteration": 2.8778445720672607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105736, + "balance_loss_mlp": 1.07011676, + "epoch": 0.0917660638707195, + "flos": 610981802496.0, + "grad_norm": 0.06417087981964828, + "language_loss": 0.97168529, + "learning_rate": 0.00099003227935797, + "loss": 0.98274267, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 0.35620117, + "step": 477, + "time_per_iteration": 2.708608627319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101416, + "balance_loss_mlp": 1.06369829, + "epoch": 0.09195844555598306, + "flos": 655851893760.0, + "grad_norm": 0.06707216335576115, + "language_loss": 1.01291215, + "learning_rate": 0.000989970287232955, + "loss": 1.02392626, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 0.37695312, + "step": 478, + "time_per_iteration": 2.783325672149658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090795, + "balance_loss_mlp": 1.05431736, + "epoch": 0.09215082724124664, + "flos": 476578930176.0, + "grad_norm": 0.05564878549890474, + "language_loss": 0.9726451, + "learning_rate": 0.0009899081048846043, + "loss": 0.98355305, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 0.36474609, + "step": 479, + "time_per_iteration": 2.6017916202545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097049, + "balance_loss_mlp": 1.05964088, + "epoch": 0.0923432089265102, + "flos": 524305322496.0, + "grad_norm": 0.06044394784495309, + "language_loss": 1.03484094, + "learning_rate": 0.0009898457323370593, + "loss": 1.04581141, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 0.37402344, + "step": 480, + "time_per_iteration": 2.575676918029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091173, + "balance_loss_mlp": 1.0533123, + "epoch": 0.09253559061177376, + "flos": 545569178112.0, + "grad_norm": 0.05778783373137127, + "language_loss": 0.99753714, + "learning_rate": 0.000989783169614535, + "loss": 1.00844884, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 0.37817383, + "step": 481, + "time_per_iteration": 2.646942615509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283887, + "balance_loss_mlp": 1.15876544, + "epoch": 0.09272797229703732, + "flos": 1538042370048.0, + "grad_norm": 0.01956789957612316, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.80036646, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 1.25, + "step": 482, + "time_per_iteration": 4.860741376876831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098828, + "balance_loss_mlp": 1.06158745, + "epoch": 0.09292035398230089, + "flos": 689813273088.0, + "grad_norm": 0.06801501049369231, + "language_loss": 0.97102278, + "learning_rate": 0.000989657473741779, + "loss": 0.98201108, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 0.37231445, + "step": 483, + "time_per_iteration": 2.819138526916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095911, + "balance_loss_mlp": 1.05979109, + "epoch": 0.09311273566756445, + "flos": 509749604352.0, + "grad_norm": 0.038333848574242754, + "language_loss": 0.98462784, + "learning_rate": 0.0009895943406403465, + "loss": 0.99558693, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 0.36132812, + "step": 484, + "time_per_iteration": 2.7088170051574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103881, + "balance_loss_mlp": 1.06854701, + "epoch": 0.09330511735282801, + "flos": 659404924416.0, + "grad_norm": 0.05828015098596693, + "language_loss": 0.92231822, + "learning_rate": 0.0009895310174615338, + "loss": 0.933357, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 0.35351562, + "step": 485, + "time_per_iteration": 2.760511636734009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265799, + "balance_loss_mlp": 1.14983261, + "epoch": 0.09349749903809157, + "flos": 1452845984256.0, + "grad_norm": 0.018538812380254305, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.76984316, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 1.15625, + "step": 486, + "time_per_iteration": 4.656491994857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105382, + "balance_loss_mlp": 1.0699296, + "epoch": 0.09368988072335514, + "flos": 520870860288.0, + "grad_norm": 0.04721263549483299, + "language_loss": 0.95839012, + "learning_rate": 0.0009894038009701782, + "loss": 0.96944392, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 0.35498047, + "step": 487, + "time_per_iteration": 2.6169629096984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103683, + "balance_loss_mlp": 1.06868315, + "epoch": 0.0938822624086187, + "flos": 497751381504.0, + "grad_norm": 0.05102581257360949, + "language_loss": 0.98848963, + "learning_rate": 0.0009893399077070253, + "loss": 0.99952644, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 0.3503418, + "step": 488, + "time_per_iteration": 2.5845744609832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107076, + "balance_loss_mlp": 1.07193291, + "epoch": 0.09407464409388226, + "flos": 533202766848.0, + "grad_norm": 0.05918319403016569, + "language_loss": 0.92944884, + "learning_rate": 0.0009892758244652718, + "loss": 0.94051951, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 0.35180664, + "step": 489, + "time_per_iteration": 2.660200357437134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091731, + "balance_loss_mlp": 1.05801892, + "epoch": 0.09426702577914582, + "flos": 586006700544.0, + "grad_norm": 0.041386989889926534, + "language_loss": 1.00010514, + "learning_rate": 0.0009892115512697968, + "loss": 1.01102245, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 0.33740234, + "step": 490, + "time_per_iteration": 2.6571907997131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108998, + "balance_loss_mlp": 1.05631554, + "epoch": 0.0944594074644094, + "flos": 503357524992.0, + "grad_norm": 0.04182034264497562, + "language_loss": 1.00108159, + "learning_rate": 0.0009891470881455537, + "loss": 1.01198137, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 0.33666992, + "step": 491, + "time_per_iteration": 2.746169328689575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108736, + "balance_loss_mlp": 1.05319476, + "epoch": 0.09465178914967295, + "flos": 571021125120.0, + "grad_norm": 0.0458284589248403, + "language_loss": 0.98654628, + "learning_rate": 0.0009890824351175692, + "loss": 0.99741989, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 0.34204102, + "step": 492, + "time_per_iteration": 2.665170431137085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090612, + "balance_loss_mlp": 1.05654192, + "epoch": 0.09484417083493651, + "flos": 549361916928.0, + "grad_norm": 0.041327442652051224, + "language_loss": 1.0219661, + "learning_rate": 0.0009890175922109435, + "loss": 1.0328722, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 0.34082031, + "step": 493, + "time_per_iteration": 2.6482973098754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010971, + "balance_loss_mlp": 1.06086028, + "epoch": 0.09503655252020007, + "flos": 823894944768.0, + "grad_norm": 0.06926989533772566, + "language_loss": 1.01090789, + "learning_rate": 0.0009889525594508513, + "loss": 1.02187896, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 0.36254883, + "step": 494, + "time_per_iteration": 3.0095505714416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092013, + "balance_loss_mlp": 1.05596447, + "epoch": 0.09522893420546363, + "flos": 404621153280.0, + "grad_norm": 0.04986765426945594, + "language_loss": 0.94310975, + "learning_rate": 0.0009888873368625404, + "loss": 0.95402986, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 0.3605957, + "step": 495, + "time_per_iteration": 2.5451042652130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089381, + "balance_loss_mlp": 1.05426204, + "epoch": 0.0954213158907272, + "flos": 691016583168.0, + "grad_norm": 0.05650320770937666, + "language_loss": 0.98877072, + "learning_rate": 0.0009888219244713326, + "loss": 0.99966443, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 0.3515625, + "step": 496, + "time_per_iteration": 2.8157310485839844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086342, + "balance_loss_mlp": 1.05100799, + "epoch": 0.09561369757599077, + "flos": 519005325312.0, + "grad_norm": 0.05039739829653265, + "language_loss": 0.99588835, + "learning_rate": 0.0009887563223026229, + "loss": 1.00675178, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 0.35375977, + "step": 497, + "time_per_iteration": 2.6563401222229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244906, + "balance_loss_mlp": 1.14648652, + "epoch": 0.09580607926125433, + "flos": 1385614812672.0, + "grad_norm": 0.01649790273231252, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.80313075, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 0.98046875, + "step": 498, + "time_per_iteration": 4.8689799308776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098776, + "balance_loss_mlp": 1.0630604, + "epoch": 0.09599846094651789, + "flos": 717436901376.0, + "grad_norm": 0.06260101269903841, + "language_loss": 0.97272921, + "learning_rate": 0.0009886245487346482, + "loss": 0.98371696, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 0.35742188, + "step": 499, + "time_per_iteration": 3.0292818546295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117577, + "balance_loss_mlp": 1.08159947, + "epoch": 0.09619084263178146, + "flos": 386038130688.0, + "grad_norm": 0.055723050712230264, + "language_loss": 1.00704551, + "learning_rate": 0.0009885583773865422, + "loss": 1.01822114, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 0.35986328, + "step": 500, + "time_per_iteration": 2.395846366882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117756, + "balance_loss_mlp": 1.08137345, + "epoch": 0.09638322431704502, + "flos": 534129292800.0, + "grad_norm": 0.06268683986847115, + "language_loss": 0.9714855, + "learning_rate": 0.0009884920163632524, + "loss": 0.98266304, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 0.36352539, + "step": 501, + "time_per_iteration": 2.666341781616211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111747, + "balance_loss_mlp": 1.07638931, + "epoch": 0.09657560600230858, + "flos": 500671922688.0, + "grad_norm": 0.04553274405873497, + "language_loss": 1.01245189, + "learning_rate": 0.000988425465690543, + "loss": 1.02356935, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 0.35375977, + "step": 502, + "time_per_iteration": 2.55082106590271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103127, + "balance_loss_mlp": 1.06867552, + "epoch": 0.09676798768757214, + "flos": 529261724160.0, + "grad_norm": 0.04373339165225573, + "language_loss": 0.99427342, + "learning_rate": 0.0009883587253942505, + "loss": 1.00530469, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 0.34472656, + "step": 503, + "time_per_iteration": 2.7674455642700195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108284, + "balance_loss_mlp": 1.07378531, + "epoch": 0.09696036937283571, + "flos": 463614534144.0, + "grad_norm": 0.051161986083573203, + "language_loss": 1.04393589, + "learning_rate": 0.0009882917955002862, + "loss": 1.05501866, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 0.3449707, + "step": 504, + "time_per_iteration": 2.549203872680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105617, + "balance_loss_mlp": 1.07116556, + "epoch": 0.09715275105809927, + "flos": 534974326272.0, + "grad_norm": 0.04840022534917253, + "language_loss": 0.95342839, + "learning_rate": 0.0009882246760346343, + "loss": 0.96448457, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 0.3449707, + "step": 505, + "time_per_iteration": 2.653627872467041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115925, + "balance_loss_mlp": 1.08128262, + "epoch": 0.09734513274336283, + "flos": 454946886144.0, + "grad_norm": 0.08271599518488834, + "language_loss": 1.02799106, + "learning_rate": 0.0009881573670233533, + "loss": 1.03915036, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 0.34692383, + "step": 506, + "time_per_iteration": 2.5279319286346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104761, + "balance_loss_mlp": 1.07061946, + "epoch": 0.09753751442862639, + "flos": 508805826048.0, + "grad_norm": 0.05291653517072512, + "language_loss": 0.96169406, + "learning_rate": 0.0009880898684925747, + "loss": 0.97274166, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 0.34179688, + "step": 507, + "time_per_iteration": 2.648574113845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095827, + "balance_loss_mlp": 1.06039834, + "epoch": 0.09772989611388996, + "flos": 484273064448.0, + "grad_norm": 0.053809005456099755, + "language_loss": 0.94680405, + "learning_rate": 0.0009880221804685037, + "loss": 0.95776224, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 0.35424805, + "step": 508, + "time_per_iteration": 2.529299736022949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245061, + "balance_loss_mlp": 1.15503371, + "epoch": 0.09792227779915352, + "flos": 1566106140672.0, + "grad_norm": 0.024665830319341657, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.80589479, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 0.8984375, + "step": 509, + "time_per_iteration": 4.705655574798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094606, + "balance_loss_mlp": 1.05932045, + "epoch": 0.09811465948441708, + "flos": 587805424128.0, + "grad_norm": 0.06644626598388864, + "language_loss": 1.02131915, + "learning_rate": 0.0009878862360456733, + "loss": 1.03226519, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 0.35327148, + "step": 510, + "time_per_iteration": 2.682035446166992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097961, + "balance_loss_mlp": 1.06336641, + "epoch": 0.09830704116968064, + "flos": 613000410624.0, + "grad_norm": 0.06543943311749917, + "language_loss": 0.9266718, + "learning_rate": 0.0009878179796996922, + "loss": 0.9376514, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 0.34619141, + "step": 511, + "time_per_iteration": 2.6972057819366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105256, + "balance_loss_mlp": 1.07030368, + "epoch": 0.09849942285494422, + "flos": 538808910336.0, + "grad_norm": 0.054213046356477584, + "language_loss": 0.96428764, + "learning_rate": 0.0009877495339659754, + "loss": 0.97534013, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 0.34985352, + "step": 512, + "time_per_iteration": 2.746337413787842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105714, + "balance_loss_mlp": 1.07190621, + "epoch": 0.09869180454020778, + "flos": 620474660352.0, + "grad_norm": 0.0573170093193853, + "language_loss": 0.91841626, + "learning_rate": 0.000987680898871096, + "loss": 0.9294734, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 0.33837891, + "step": 513, + "time_per_iteration": 2.7060482501983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110186, + "balance_loss_mlp": 1.07675993, + "epoch": 0.09888418622547133, + "flos": 811711342080.0, + "grad_norm": 0.0786420176645203, + "language_loss": 0.95400196, + "learning_rate": 0.0009876120744417, + "loss": 0.96510386, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 0.33447266, + "step": 514, + "time_per_iteration": 2.9473536014556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105767, + "balance_loss_mlp": 1.07071972, + "epoch": 0.0990765679107349, + "flos": 535809447936.0, + "grad_norm": 0.04861145683213968, + "language_loss": 1.01586378, + "learning_rate": 0.0009875430607045078, + "loss": 1.02692139, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 0.35058594, + "step": 515, + "time_per_iteration": 2.6745734214782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095325, + "balance_loss_mlp": 1.06044412, + "epoch": 0.09926894959599845, + "flos": 587879576064.0, + "grad_norm": 0.061184004848699555, + "language_loss": 0.96467805, + "learning_rate": 0.000987473857686313, + "loss": 0.97563124, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 0.34887695, + "step": 516, + "time_per_iteration": 2.70771861076355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103454, + "balance_loss_mlp": 1.06909752, + "epoch": 0.09946133128126203, + "flos": 641234506752.0, + "grad_norm": 0.06268031252544905, + "language_loss": 1.01795554, + "learning_rate": 0.0009874044654139824, + "loss": 1.02899015, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 0.34399414, + "step": 517, + "time_per_iteration": 2.7501027584075928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104488, + "balance_loss_mlp": 1.07020378, + "epoch": 0.09965371296652559, + "flos": 465781446144.0, + "grad_norm": 0.05802057466070587, + "language_loss": 1.01047516, + "learning_rate": 0.0009873348839144563, + "loss": 1.02152014, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 0.34301758, + "step": 518, + "time_per_iteration": 2.5247762203216553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125408, + "balance_loss_mlp": 1.09100425, + "epoch": 0.09984609465178915, + "flos": 483603499008.0, + "grad_norm": 0.057276560313135924, + "language_loss": 1.0153054, + "learning_rate": 0.000987265113214749, + "loss": 1.02655947, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 0.34448242, + "step": 519, + "time_per_iteration": 2.569776773452759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151248, + "balance_loss_mlp": 1.11705852, + "epoch": 0.1000384763370527, + "flos": 569029681152.0, + "grad_norm": 0.06886779278024428, + "language_loss": 1.05486548, + "learning_rate": 0.0009871951533419476, + "loss": 1.066378, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 0.34204102, + "step": 520, + "time_per_iteration": 2.646489381790161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155904, + "balance_loss_mlp": 1.12085652, + "epoch": 0.10023085802231628, + "flos": 545796403200.0, + "grad_norm": 0.06947260655531057, + "language_loss": 0.93715644, + "learning_rate": 0.0009871250043232132, + "loss": 0.94871557, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 0.35058594, + "step": 521, + "time_per_iteration": 2.729825258255005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145676, + "balance_loss_mlp": 1.11196363, + "epoch": 0.10042323970757984, + "flos": 503454071808.0, + "grad_norm": 0.05700460680955029, + "language_loss": 0.94319808, + "learning_rate": 0.0009870546661857797, + "loss": 0.95465487, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 0.33740234, + "step": 522, + "time_per_iteration": 2.589205026626587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113939, + "balance_loss_mlp": 1.10572577, + "epoch": 0.1006156213928434, + "flos": 770411533824.0, + "grad_norm": 0.0627280587118585, + "language_loss": 1.04607201, + "learning_rate": 0.0009869841389569553, + "loss": 1.05746591, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 0.33666992, + "step": 523, + "time_per_iteration": 3.007927656173706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112546, + "balance_loss_mlp": 1.07816648, + "epoch": 0.10080800307810696, + "flos": 490030083072.0, + "grad_norm": 0.07025860249961899, + "language_loss": 0.94709289, + "learning_rate": 0.0009869134226641206, + "loss": 0.95821834, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 0.34399414, + "step": 524, + "time_per_iteration": 2.5647661685943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096367, + "balance_loss_mlp": 1.06134343, + "epoch": 0.10100038476337053, + "flos": 454724430336.0, + "grad_norm": 0.0754869647085307, + "language_loss": 0.96719551, + "learning_rate": 0.0009868425173347303, + "loss": 0.97815919, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 0.3503418, + "step": 525, + "time_per_iteration": 2.675762414932251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081637, + "balance_loss_mlp": 1.04816294, + "epoch": 0.10119276644863409, + "flos": 556438242816.0, + "grad_norm": 0.04461045481777941, + "language_loss": 1.01427031, + "learning_rate": 0.0009867714229963125, + "loss": 1.02508664, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 0.3347168, + "step": 526, + "time_per_iteration": 2.7551424503326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101413, + "balance_loss_mlp": 1.06672287, + "epoch": 0.10138514813389765, + "flos": 516235659264.0, + "grad_norm": 0.06519670287778681, + "language_loss": 0.99495387, + "learning_rate": 0.000986700139676468, + "loss": 1.00596797, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 0.34716797, + "step": 527, + "time_per_iteration": 2.5689845085144043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111806, + "balance_loss_mlp": 1.08317983, + "epoch": 0.10157752981916121, + "flos": 500570606592.0, + "grad_norm": 0.055001529425537175, + "language_loss": 0.97175169, + "learning_rate": 0.0009866286674028717, + "loss": 0.98293233, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 0.34936523, + "step": 528, + "time_per_iteration": 2.6308236122131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118149, + "balance_loss_mlp": 1.08307743, + "epoch": 0.10176991150442478, + "flos": 656773277184.0, + "grad_norm": 0.06791274268555884, + "language_loss": 0.93964088, + "learning_rate": 0.0009865570062032717, + "loss": 0.95082229, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 0.35083008, + "step": 529, + "time_per_iteration": 2.931939125061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117806, + "balance_loss_mlp": 1.08104193, + "epoch": 0.10196229318968834, + "flos": 573259617792.0, + "grad_norm": 0.05469252484924326, + "language_loss": 0.97321147, + "learning_rate": 0.0009864851561054893, + "loss": 0.98438954, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 0.36743164, + "step": 530, + "time_per_iteration": 2.75875186920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091703, + "balance_loss_mlp": 1.0567745, + "epoch": 0.1021546748749519, + "flos": 518207279616.0, + "grad_norm": 0.053032092698093954, + "language_loss": 0.97237867, + "learning_rate": 0.0009864131171374191, + "loss": 0.9832958, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 0.34936523, + "step": 531, + "time_per_iteration": 2.671963930130005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109157, + "balance_loss_mlp": 1.05704737, + "epoch": 0.10234705656021546, + "flos": 609766009344.0, + "grad_norm": 0.037042660663456926, + "language_loss": 0.97530323, + "learning_rate": 0.0009863408893270292, + "loss": 0.98621887, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 0.34521484, + "step": 532, + "time_per_iteration": 2.8692965507507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080567, + "balance_loss_mlp": 1.0459249, + "epoch": 0.10253943824547904, + "flos": 601760586240.0, + "grad_norm": 0.045189468397627275, + "language_loss": 0.93818736, + "learning_rate": 0.0009862684727023605, + "loss": 0.94899297, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 0.34692383, + "step": 533, + "time_per_iteration": 2.768873691558838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084569, + "balance_loss_mlp": 1.04978406, + "epoch": 0.1027318199307426, + "flos": 662948043264.0, + "grad_norm": 0.041807858593286534, + "language_loss": 0.94846106, + "learning_rate": 0.0009861958672915283, + "loss": 0.95930672, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 0.34814453, + "step": 534, + "time_per_iteration": 2.7894833087921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088029, + "balance_loss_mlp": 1.05348206, + "epoch": 0.10292420161600616, + "flos": 683275461120.0, + "grad_norm": 0.04113334704287127, + "language_loss": 0.93477535, + "learning_rate": 0.0009861230731227201, + "loss": 0.94565558, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 0.34570312, + "step": 535, + "time_per_iteration": 2.8369100093841553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101084, + "balance_loss_mlp": 1.06589389, + "epoch": 0.10311658330126972, + "flos": 490287043584.0, + "grad_norm": 0.06472741174466715, + "language_loss": 0.9716177, + "learning_rate": 0.0009860500902241973, + "loss": 0.98262858, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 0.35205078, + "step": 536, + "time_per_iteration": 2.6308608055114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100072, + "balance_loss_mlp": 1.06559658, + "epoch": 0.10330896498653329, + "flos": 431743343616.0, + "grad_norm": 0.06015330648509861, + "language_loss": 1.02488375, + "learning_rate": 0.0009859769186242942, + "loss": 1.0358845, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 0.34521484, + "step": 537, + "time_per_iteration": 2.4846572875976562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094076, + "balance_loss_mlp": 1.06188989, + "epoch": 0.10350134667179685, + "flos": 549591713280.0, + "grad_norm": 0.04182272700248836, + "language_loss": 0.96166039, + "learning_rate": 0.0009859035583514187, + "loss": 0.97260106, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 0.32177734, + "step": 538, + "time_per_iteration": 2.665483236312866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107546, + "balance_loss_mlp": 1.07497787, + "epoch": 0.10369372835706041, + "flos": 640626610176.0, + "grad_norm": 0.03728554890083732, + "language_loss": 0.9932602, + "learning_rate": 0.0009858300094340517, + "loss": 1.00433564, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 0.32568359, + "step": 539, + "time_per_iteration": 2.772207021713257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110889, + "balance_loss_mlp": 1.07908368, + "epoch": 0.10388611004232397, + "flos": 521752969728.0, + "grad_norm": 0.05284254114338104, + "language_loss": 0.91679931, + "learning_rate": 0.0009857562719007473, + "loss": 0.92790818, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 0.31787109, + "step": 540, + "time_per_iteration": 2.633002519607544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110116, + "balance_loss_mlp": 1.06964111, + "epoch": 0.10407849172758753, + "flos": 702436644864.0, + "grad_norm": 0.07454941449424961, + "language_loss": 0.93962657, + "learning_rate": 0.0009856823457801331, + "loss": 0.95063812, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 0.31494141, + "step": 541, + "time_per_iteration": 2.888354539871216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098965, + "balance_loss_mlp": 1.06682634, + "epoch": 0.1042708734128511, + "flos": 502910415360.0, + "grad_norm": 0.06016078646373104, + "language_loss": 1.01014686, + "learning_rate": 0.00098560823110091, + "loss": 1.02113652, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 0.32128906, + "step": 542, + "time_per_iteration": 2.612365484237671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088807, + "balance_loss_mlp": 1.05664408, + "epoch": 0.10446325509811466, + "flos": 485592371712.0, + "grad_norm": 0.07331709746631812, + "language_loss": 0.99634022, + "learning_rate": 0.000985533927891851, + "loss": 1.00722837, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 0.3215332, + "step": 543, + "time_per_iteration": 2.6642584800720215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087349, + "balance_loss_mlp": 1.05406535, + "epoch": 0.10465563678337822, + "flos": 568634328576.0, + "grad_norm": 0.07406485241554656, + "language_loss": 0.99318308, + "learning_rate": 0.0009854594361818044, + "loss": 1.00405657, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 0.33300781, + "step": 544, + "time_per_iteration": 2.650541067123413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087044, + "balance_loss_mlp": 1.05357027, + "epoch": 0.10484801846864178, + "flos": 626093286912.0, + "grad_norm": 0.05515562757052397, + "language_loss": 0.98072803, + "learning_rate": 0.0009853847559996897, + "loss": 0.99159849, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 0.3347168, + "step": 545, + "time_per_iteration": 2.7268693447113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098973, + "balance_loss_mlp": 1.0640682, + "epoch": 0.10504040015390535, + "flos": 743412681216.0, + "grad_norm": 0.05014767442192859, + "language_loss": 0.9781934, + "learning_rate": 0.0009853098873745, + "loss": 0.98918307, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 0.34936523, + "step": 546, + "time_per_iteration": 3.001844644546509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094885, + "balance_loss_mlp": 1.06010008, + "epoch": 0.10523278183916891, + "flos": 586673694720.0, + "grad_norm": 0.06665960072991474, + "language_loss": 0.96499509, + "learning_rate": 0.0009852348303353027, + "loss": 0.97594392, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 0.34814453, + "step": 547, + "time_per_iteration": 2.7768120765686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109085, + "balance_loss_mlp": 1.05692363, + "epoch": 0.10542516352443247, + "flos": 869644574208.0, + "grad_norm": 0.04477171592325676, + "language_loss": 0.89746928, + "learning_rate": 0.000985159584911237, + "loss": 0.90837783, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 0.33959961, + "step": 548, + "time_per_iteration": 3.1397063732147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109064, + "balance_loss_mlp": 1.0567131, + "epoch": 0.10561754520969603, + "flos": 505428263424.0, + "grad_norm": 0.057455808878804256, + "language_loss": 0.97617745, + "learning_rate": 0.0009850841511315162, + "loss": 0.98708391, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 0.33959961, + "step": 549, + "time_per_iteration": 2.6143858432769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090343, + "balance_loss_mlp": 1.05660701, + "epoch": 0.1058099268949596, + "flos": 559981361664.0, + "grad_norm": 0.04134640300819554, + "language_loss": 0.97230792, + "learning_rate": 0.0009850085290254256, + "loss": 0.98321134, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 0.33740234, + "step": 550, + "time_per_iteration": 2.784057855606079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108804, + "balance_loss_mlp": 1.05478084, + "epoch": 0.10600230858022316, + "flos": 562049528832.0, + "grad_norm": 0.041486348142279396, + "language_loss": 0.9340632, + "learning_rate": 0.0009849327186223246, + "loss": 0.94494367, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 0.33276367, + "step": 551, + "time_per_iteration": 2.822755813598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086641, + "balance_loss_mlp": 1.0536921, + "epoch": 0.10619469026548672, + "flos": 494326831104.0, + "grad_norm": 0.044652358506572586, + "language_loss": 1.00453854, + "learning_rate": 0.000984856719951646, + "loss": 1.01540482, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 0.32958984, + "step": 552, + "time_per_iteration": 2.561384439468384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088507, + "balance_loss_mlp": 1.05577254, + "epoch": 0.10638707195075028, + "flos": 676166828544.0, + "grad_norm": 0.05595352831954139, + "language_loss": 0.98322356, + "learning_rate": 0.0009847805330428943, + "loss": 0.99410868, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 0.32739258, + "step": 553, + "time_per_iteration": 2.8988356590270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082477, + "balance_loss_mlp": 1.04940784, + "epoch": 0.10657945363601386, + "flos": 488055891456.0, + "grad_norm": 0.05618387686115577, + "language_loss": 1.02895415, + "learning_rate": 0.0009847041579256481, + "loss": 1.03977895, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 0.33081055, + "step": 554, + "time_per_iteration": 2.567885398864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088152, + "balance_loss_mlp": 1.05548859, + "epoch": 0.10677183532127742, + "flos": 482958526464.0, + "grad_norm": 0.04459262579832553, + "language_loss": 0.99802542, + "learning_rate": 0.0009846275946295592, + "loss": 1.00890684, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 0.32641602, + "step": 555, + "time_per_iteration": 2.6283833980560303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108533, + "balance_loss_mlp": 1.05347764, + "epoch": 0.10696421700654098, + "flos": 656249444352.0, + "grad_norm": 0.04108965909817336, + "language_loss": 0.92502242, + "learning_rate": 0.0009845508431843518, + "loss": 0.93587577, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 0.31835938, + "step": 556, + "time_per_iteration": 3.0189473628997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087957, + "balance_loss_mlp": 1.05612838, + "epoch": 0.10715659869180454, + "flos": 567744878592.0, + "grad_norm": 0.05029379164990677, + "language_loss": 0.95060432, + "learning_rate": 0.0009844739036198233, + "loss": 0.96148396, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 0.31811523, + "step": 557, + "time_per_iteration": 2.6461007595062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096142, + "balance_loss_mlp": 1.06340766, + "epoch": 0.10734898037706811, + "flos": 540694268928.0, + "grad_norm": 0.047100661757994676, + "language_loss": 1.0152961, + "learning_rate": 0.0009843967759658448, + "loss": 1.02625763, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 0.32739258, + "step": 558, + "time_per_iteration": 2.6677682399749756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264894, + "balance_loss_mlp": 1.19775486, + "epoch": 0.10754136206233167, + "flos": 1476640171008.0, + "grad_norm": 0.03689581784010691, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.74032652, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 0.671875, + "step": 559, + "time_per_iteration": 4.873044013977051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105652, + "balance_loss_mlp": 1.07234466, + "epoch": 0.10773374374759523, + "flos": 512405844480.0, + "grad_norm": 0.06480790167761245, + "language_loss": 1.01098323, + "learning_rate": 0.000984241956509384, + "loss": 1.02203977, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 0.33325195, + "step": 560, + "time_per_iteration": 2.655430555343628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095265, + "balance_loss_mlp": 1.0617907, + "epoch": 0.10792612543285879, + "flos": 496503654912.0, + "grad_norm": 0.05361377514900226, + "language_loss": 1.00074768, + "learning_rate": 0.0009841642647670078, + "loss": 1.01170027, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 0.33496094, + "step": 561, + "time_per_iteration": 2.5627329349517822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089424, + "balance_loss_mlp": 1.05633116, + "epoch": 0.10811850711812235, + "flos": 735471498240.0, + "grad_norm": 0.04993888185520414, + "language_loss": 0.93071151, + "learning_rate": 0.0009840863850553944, + "loss": 0.94160575, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 0.33105469, + "step": 562, + "time_per_iteration": 3.0020592212677 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108807, + "balance_loss_mlp": 1.05686092, + "epoch": 0.10831088880338592, + "flos": 611540140032.0, + "grad_norm": 0.046287089248472475, + "language_loss": 0.97956204, + "learning_rate": 0.0009840083174047782, + "loss": 0.99044275, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 0.31176758, + "step": 563, + "time_per_iteration": 2.7123258113861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093988, + "balance_loss_mlp": 1.06275535, + "epoch": 0.10850327048864948, + "flos": 556317103104.0, + "grad_norm": 0.036863902598139514, + "language_loss": 0.91394317, + "learning_rate": 0.0009839300618454685, + "loss": 0.92488301, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 0.31176758, + "step": 564, + "time_per_iteration": 2.855482578277588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086433, + "balance_loss_mlp": 1.05386496, + "epoch": 0.10869565217391304, + "flos": 603208373760.0, + "grad_norm": 0.0447892393855046, + "language_loss": 0.97269231, + "learning_rate": 0.0009838516184078466, + "loss": 0.98355657, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 0.32568359, + "step": 565, + "time_per_iteration": 2.8027093410491943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090742, + "balance_loss_mlp": 1.05881739, + "epoch": 0.1088880338591766, + "flos": 526178198016.0, + "grad_norm": 0.039430635834492286, + "language_loss": 0.95326865, + "learning_rate": 0.0009837729871223669, + "loss": 0.964176, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 0.3190918, + "step": 566, + "time_per_iteration": 2.621044158935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097443, + "balance_loss_mlp": 1.06473231, + "epoch": 0.10908041554444017, + "flos": 620272028160.0, + "grad_norm": 0.03524126234366562, + "language_loss": 0.96988255, + "learning_rate": 0.0009836941680195568, + "loss": 0.98085701, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 0.32714844, + "step": 567, + "time_per_iteration": 2.8241846561431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095182, + "balance_loss_mlp": 1.06359148, + "epoch": 0.10927279722970373, + "flos": 898125719040.0, + "grad_norm": 0.05940738915226433, + "language_loss": 0.94011569, + "learning_rate": 0.0009836151611300166, + "loss": 0.95106757, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 0.31567383, + "step": 568, + "time_per_iteration": 3.2259325981140137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093362, + "balance_loss_mlp": 1.06327355, + "epoch": 0.10946517891496729, + "flos": 528666310656.0, + "grad_norm": 0.04952949609465528, + "language_loss": 1.01886261, + "learning_rate": 0.0009835359664844194, + "loss": 1.02979624, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 0.30029297, + "step": 569, + "time_per_iteration": 2.61936616897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235986, + "balance_loss_mlp": 1.17113578, + "epoch": 0.10965756060023085, + "flos": 1560751815168.0, + "grad_norm": 0.02580255803672051, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.82272792, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 0.6484375, + "step": 570, + "time_per_iteration": 4.946800470352173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102328, + "balance_loss_mlp": 1.06947398, + "epoch": 0.10984994228549443, + "flos": 513075409920.0, + "grad_norm": 0.04088785760268294, + "language_loss": 0.98121774, + "learning_rate": 0.0009833770140481118, + "loss": 0.99224108, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 0.32861328, + "step": 571, + "time_per_iteration": 2.6676580905914307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103589, + "balance_loss_mlp": 1.07113993, + "epoch": 0.11004232397075799, + "flos": 954705139200.0, + "grad_norm": 0.04146527084622454, + "language_loss": 0.88084227, + "learning_rate": 0.000983297256319112, + "loss": 0.89187813, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 0.32446289, + "step": 572, + "time_per_iteration": 3.1977450847625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098726, + "balance_loss_mlp": 1.06503749, + "epoch": 0.11023470565602154, + "flos": 488181800448.0, + "grad_norm": 0.11112801331440751, + "language_loss": 0.93675387, + "learning_rate": 0.000983217310957477, + "loss": 0.94774115, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 0.33691406, + "step": 573, + "time_per_iteration": 2.771477222442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118998, + "balance_loss_mlp": 1.08530974, + "epoch": 0.1104270873412851, + "flos": 655814817792.0, + "grad_norm": 0.046936313049011164, + "language_loss": 0.98079342, + "learning_rate": 0.000983137177994244, + "loss": 0.99198341, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 0.3371582, + "step": 574, + "time_per_iteration": 2.842641830444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127826, + "balance_loss_mlp": 1.0945909, + "epoch": 0.11061946902654868, + "flos": 723426287616.0, + "grad_norm": 0.047970587572460185, + "language_loss": 0.91368234, + "learning_rate": 0.0009830568574605235, + "loss": 0.92496061, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 0.33227539, + "step": 575, + "time_per_iteration": 2.9841148853302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136053, + "balance_loss_mlp": 1.10260296, + "epoch": 0.11081185071181224, + "flos": 835463310336.0, + "grad_norm": 0.06212944390612344, + "language_loss": 0.95608473, + "learning_rate": 0.0009829763493874992, + "loss": 0.96744525, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 0.3347168, + "step": 576, + "time_per_iteration": 3.094599485397339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122949, + "balance_loss_mlp": 1.08918953, + "epoch": 0.1110042323970758, + "flos": 609076620288.0, + "grad_norm": 0.040009357062280086, + "language_loss": 1.0022918, + "learning_rate": 0.0009828956538064264, + "loss": 1.01352131, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 0.33764648, + "step": 577, + "time_per_iteration": 2.7913765907287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128001, + "balance_loss_mlp": 1.09428823, + "epoch": 0.11119661408233936, + "flos": 595922075136.0, + "grad_norm": 0.07834189266391174, + "language_loss": 0.97103804, + "learning_rate": 0.0009828147707486344, + "loss": 0.98231804, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 0.33740234, + "step": 578, + "time_per_iteration": 2.6967506408691406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099492, + "balance_loss_mlp": 1.0659467, + "epoch": 0.11138899576760293, + "flos": 555835488768.0, + "grad_norm": 0.066476002167881, + "language_loss": 0.94244707, + "learning_rate": 0.0009827337002455245, + "loss": 0.95344198, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 0.33544922, + "step": 579, + "time_per_iteration": 2.6212143898010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010909, + "balance_loss_mlp": 1.05940461, + "epoch": 0.11158137745286649, + "flos": 689746461696.0, + "grad_norm": 0.0598380025645264, + "language_loss": 0.93403691, + "learning_rate": 0.0009826524423285712, + "loss": 0.94494587, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 0.31469727, + "step": 580, + "time_per_iteration": 2.916363000869751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086732, + "balance_loss_mlp": 1.05466461, + "epoch": 0.11177375913813005, + "flos": 763011436032.0, + "grad_norm": 0.051352596452175936, + "language_loss": 0.95457065, + "learning_rate": 0.0009825709970293218, + "loss": 0.96543789, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 0.32055664, + "step": 581, + "time_per_iteration": 2.975459575653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094975, + "balance_loss_mlp": 1.06414759, + "epoch": 0.11196614082339361, + "flos": 806574329856.0, + "grad_norm": 0.06330579048660655, + "language_loss": 1.01360774, + "learning_rate": 0.0009824893643793956, + "loss": 1.02455735, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 0.30810547, + "step": 582, + "time_per_iteration": 3.0850436687469482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109989, + "balance_loss_mlp": 1.06772757, + "epoch": 0.11215852250865718, + "flos": 558624978432.0, + "grad_norm": 0.05517621871728721, + "language_loss": 0.96568394, + "learning_rate": 0.0009824075444104857, + "loss": 0.9766829, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 0.3215332, + "step": 583, + "time_per_iteration": 2.7017738819122314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104353, + "balance_loss_mlp": 1.07214284, + "epoch": 0.11235090419392074, + "flos": 513572078592.0, + "grad_norm": 0.05273776870459213, + "language_loss": 1.00669086, + "learning_rate": 0.000982325537154357, + "loss": 1.01773441, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 0.32202148, + "step": 584, + "time_per_iteration": 2.566066265106201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109419, + "balance_loss_mlp": 1.07768583, + "epoch": 0.1125432858791843, + "flos": 491453277696.0, + "grad_norm": 0.05755454669423396, + "language_loss": 1.01869726, + "learning_rate": 0.0009822433426428484, + "loss": 1.02979159, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 0.31713867, + "step": 585, + "time_per_iteration": 2.611968994140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122157, + "balance_loss_mlp": 1.08987498, + "epoch": 0.11273566756444786, + "flos": 510725689344.0, + "grad_norm": 0.06034275506000564, + "language_loss": 0.93750811, + "learning_rate": 0.0009821609609078697, + "loss": 0.94872963, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 0.32275391, + "step": 586, + "time_per_iteration": 2.584847927093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104971, + "balance_loss_mlp": 1.0726887, + "epoch": 0.11292804924971142, + "flos": 622446280704.0, + "grad_norm": 0.06416707827025614, + "language_loss": 0.95279968, + "learning_rate": 0.0009820783919814045, + "loss": 0.96384937, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 0.32275391, + "step": 587, + "time_per_iteration": 2.7885184288024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096142, + "balance_loss_mlp": 1.06359744, + "epoch": 0.113120430934975, + "flos": 478056453120.0, + "grad_norm": 0.049104346633589514, + "language_loss": 0.92135406, + "learning_rate": 0.0009819956358955095, + "loss": 0.93231547, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 0.32543945, + "step": 588, + "time_per_iteration": 2.560117483139038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086794, + "balance_loss_mlp": 1.05427432, + "epoch": 0.11331281262023855, + "flos": 467039084544.0, + "grad_norm": 0.05114307144868452, + "language_loss": 0.93675017, + "learning_rate": 0.0009819126926823127, + "loss": 0.94761813, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 0.32519531, + "step": 589, + "time_per_iteration": 2.517035722732544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093229, + "balance_loss_mlp": 1.05966008, + "epoch": 0.11350519430550211, + "flos": 650453151744.0, + "grad_norm": 0.04613241529975588, + "language_loss": 0.94437975, + "learning_rate": 0.000981829562374016, + "loss": 0.95531201, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 0.33569336, + "step": 590, + "time_per_iteration": 2.8174262046813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093153, + "balance_loss_mlp": 1.05913091, + "epoch": 0.11369757599076567, + "flos": 557809680384.0, + "grad_norm": 0.05348492004263644, + "language_loss": 1.04949331, + "learning_rate": 0.0009817462450028933, + "loss": 1.0604248, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 0.34057617, + "step": 591, + "time_per_iteration": 2.6302859783172607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101242, + "balance_loss_mlp": 1.0668143, + "epoch": 0.11388995767602925, + "flos": 571080222720.0, + "grad_norm": 0.2030818500746725, + "language_loss": 0.92329478, + "learning_rate": 0.0009816627406012916, + "loss": 0.93430716, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 0.34472656, + "step": 592, + "time_per_iteration": 2.8384313583374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135985, + "balance_loss_mlp": 1.09943521, + "epoch": 0.1140823393612928, + "flos": 740403307008.0, + "grad_norm": 0.0774704650100976, + "language_loss": 0.91851664, + "learning_rate": 0.0009815790492016295, + "loss": 0.92987645, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 0.36523438, + "step": 593, + "time_per_iteration": 2.9409682750701904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136834, + "balance_loss_mlp": 1.10192943, + "epoch": 0.11427472104655637, + "flos": 699004753920.0, + "grad_norm": 0.09332707993556091, + "language_loss": 0.94690275, + "learning_rate": 0.0009814951708363993, + "loss": 0.95827115, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 0.34912109, + "step": 594, + "time_per_iteration": 2.8599631786346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221657, + "balance_loss_mlp": 1.16023993, + "epoch": 0.11446710273181993, + "flos": 1477178684928.0, + "grad_norm": 0.030934197408724044, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79212642, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 0.61328125, + "step": 595, + "time_per_iteration": 4.801583766937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137315, + "balance_loss_mlp": 1.10138512, + "epoch": 0.1146594844170835, + "flos": 494895080448.0, + "grad_norm": 0.0746127254366864, + "language_loss": 0.94972038, + "learning_rate": 0.0009813268533395648, + "loss": 0.96109354, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 0.359375, + "step": 596, + "time_per_iteration": 2.6236753463745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123077, + "balance_loss_mlp": 1.0882678, + "epoch": 0.11485186610234706, + "flos": 474834534912.0, + "grad_norm": 0.061536990211155544, + "language_loss": 0.95371294, + "learning_rate": 0.0009812424142733073, + "loss": 0.96494377, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 0.34765625, + "step": 597, + "time_per_iteration": 2.5663998126983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108592, + "balance_loss_mlp": 1.07387781, + "epoch": 0.11504424778761062, + "flos": 731209254912.0, + "grad_norm": 0.04795398370622496, + "language_loss": 0.91199464, + "learning_rate": 0.000981157788372175, + "loss": 0.92308056, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 0.34716797, + "step": 598, + "time_per_iteration": 3.004436492919922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110274, + "balance_loss_mlp": 1.06864619, + "epoch": 0.11523662947287418, + "flos": 545823567360.0, + "grad_norm": 0.04762632796488997, + "language_loss": 0.94997883, + "learning_rate": 0.0009810729756690223, + "loss": 0.96100628, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 0.34106445, + "step": 599, + "time_per_iteration": 2.704676628112793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104593, + "balance_loss_mlp": 1.06947374, + "epoch": 0.11542901115813775, + "flos": 775066558464.0, + "grad_norm": 0.06699944809564747, + "language_loss": 0.98224139, + "learning_rate": 0.0009809879761967766, + "loss": 0.99328732, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 0.35107422, + "step": 600, + "time_per_iteration": 2.953348159790039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113963, + "balance_loss_mlp": 1.07922578, + "epoch": 0.11562139284340131, + "flos": 730910449152.0, + "grad_norm": 0.06801646297960097, + "language_loss": 0.96874714, + "learning_rate": 0.0009809027899884378, + "loss": 0.97988677, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 0.34765625, + "step": 601, + "time_per_iteration": 2.896559953689575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104267, + "balance_loss_mlp": 1.07014918, + "epoch": 0.11581377452866487, + "flos": 535878457344.0, + "grad_norm": 0.062436318450634756, + "language_loss": 0.9484992, + "learning_rate": 0.0009808174170770779, + "loss": 0.95954192, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 0.34130859, + "step": 602, + "time_per_iteration": 2.814558982849121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220455, + "balance_loss_mlp": 1.16704941, + "epoch": 0.11600615621392843, + "flos": 1555814863872.0, + "grad_norm": 0.025680107820064087, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.86118698, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 0.53515625, + "step": 603, + "time_per_iteration": 4.897503614425659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118739, + "balance_loss_mlp": 1.08566999, + "epoch": 0.116198537899192, + "flos": 537435274752.0, + "grad_norm": 0.05533944227900463, + "language_loss": 1.0028702, + "learning_rate": 0.0009806461112779462, + "loss": 1.01405764, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 0.33081055, + "step": 604, + "time_per_iteration": 2.6172194480895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115094, + "balance_loss_mlp": 1.08281231, + "epoch": 0.11639091958445556, + "flos": 454203168768.0, + "grad_norm": 0.07231087595972972, + "language_loss": 0.97971618, + "learning_rate": 0.0009805601784566814, + "loss": 0.99086702, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 0.32250977, + "step": 605, + "time_per_iteration": 2.4791650772094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125941, + "balance_loss_mlp": 1.09208584, + "epoch": 0.11658330126971912, + "flos": 555081859584.0, + "grad_norm": 0.06015253149930396, + "language_loss": 1.02430916, + "learning_rate": 0.0009804740590654089, + "loss": 1.03556848, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 0.33862305, + "step": 606, + "time_per_iteration": 2.614476442337036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124787, + "balance_loss_mlp": 1.09229016, + "epoch": 0.11677568295498268, + "flos": 716340049920.0, + "grad_norm": 0.08034134565527169, + "language_loss": 0.97153747, + "learning_rate": 0.0009803877531375635, + "loss": 0.9827854, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 0.32495117, + "step": 607, + "time_per_iteration": 2.851011276245117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128193, + "balance_loss_mlp": 1.09228706, + "epoch": 0.11696806464024626, + "flos": 609758668800.0, + "grad_norm": 0.05400582488055185, + "language_loss": 0.97512484, + "learning_rate": 0.0009803012607066523, + "loss": 0.9864068, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 0.35913086, + "step": 608, + "time_per_iteration": 2.700596570968628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128537, + "balance_loss_mlp": 1.09294093, + "epoch": 0.11716044632550981, + "flos": 520384103424.0, + "grad_norm": 0.15792902837654846, + "language_loss": 0.95375645, + "learning_rate": 0.0009802145818062543, + "loss": 0.96504182, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 0.35620117, + "step": 609, + "time_per_iteration": 2.693417549133301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123637, + "balance_loss_mlp": 1.08742094, + "epoch": 0.11735282801077337, + "flos": 507493859328.0, + "grad_norm": 0.06851059455565046, + "language_loss": 0.99132365, + "learning_rate": 0.0009801277164700212, + "loss": 1.00256002, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 0.36254883, + "step": 610, + "time_per_iteration": 2.5825185775756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131797, + "balance_loss_mlp": 1.09541452, + "epoch": 0.11754520969603693, + "flos": 686638342656.0, + "grad_norm": 0.1113382534985323, + "language_loss": 0.96033651, + "learning_rate": 0.0009800406647316776, + "loss": 0.97165447, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 0.36376953, + "step": 611, + "time_per_iteration": 2.8625166416168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231096, + "balance_loss_mlp": 1.18112373, + "epoch": 0.1177375913813005, + "flos": 1542487421952.0, + "grad_norm": 0.03346184177846584, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.78145558, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 0.49804688, + "step": 612, + "time_per_iteration": 4.748431444168091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137214, + "balance_loss_mlp": 1.09880471, + "epoch": 0.11792997306656407, + "flos": 520522495488.0, + "grad_norm": 0.07612220197102978, + "language_loss": 0.95326376, + "learning_rate": 0.000979866002183916, + "loss": 0.96463591, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 0.38378906, + "step": 613, + "time_per_iteration": 2.6311473846435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155666, + "balance_loss_mlp": 1.11482501, + "epoch": 0.11812235475182763, + "flos": 666281189376.0, + "grad_norm": 0.0832714106614858, + "language_loss": 0.96221644, + "learning_rate": 0.0009797783914423082, + "loss": 0.97377312, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 0.40844727, + "step": 614, + "time_per_iteration": 2.8568782806396484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126933, + "balance_loss_mlp": 1.08721232, + "epoch": 0.11831473643709119, + "flos": 621317122560.0, + "grad_norm": 0.08355321383380138, + "language_loss": 0.91733479, + "learning_rate": 0.0009796905944342094, + "loss": 0.92860413, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 0.3972168, + "step": 615, + "time_per_iteration": 2.8348331451416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113873, + "balance_loss_mlp": 1.07517743, + "epoch": 0.11850711812235475, + "flos": 456688710144.0, + "grad_norm": 0.05175964705030883, + "language_loss": 0.94486296, + "learning_rate": 0.0009796026111937057, + "loss": 0.9560017, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 0.38671875, + "step": 616, + "time_per_iteration": 2.609276056289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111065, + "balance_loss_mlp": 1.07393384, + "epoch": 0.11869949980761832, + "flos": 513863543808.0, + "grad_norm": 0.1779679576065946, + "language_loss": 0.94108498, + "learning_rate": 0.0009795144417549552, + "loss": 0.95219147, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 0.3671875, + "step": 617, + "time_per_iteration": 2.7469558715820312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114581, + "balance_loss_mlp": 1.07760203, + "epoch": 0.11889188149288188, + "flos": 535016171520.0, + "grad_norm": 0.0639893702788804, + "language_loss": 0.95137906, + "learning_rate": 0.0009794260861521883, + "loss": 0.96252483, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 0.36987305, + "step": 618, + "time_per_iteration": 2.779780387878418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125632, + "balance_loss_mlp": 1.08908224, + "epoch": 0.11908426317814544, + "flos": 498603755520.0, + "grad_norm": 0.062080445707157726, + "language_loss": 0.94238096, + "learning_rate": 0.0009793375444197075, + "loss": 0.95363724, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 0.3659668, + "step": 619, + "time_per_iteration": 2.6269500255584717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159762, + "balance_loss_mlp": 1.12132859, + "epoch": 0.119276644863409, + "flos": 659891681280.0, + "grad_norm": 0.05728911446624217, + "language_loss": 0.93181753, + "learning_rate": 0.000979248816591888, + "loss": 0.94341516, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 0.38452148, + "step": 620, + "time_per_iteration": 2.7879464626312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155629, + "balance_loss_mlp": 1.11600351, + "epoch": 0.11946902654867257, + "flos": 758746621440.0, + "grad_norm": 0.05539388103354017, + "language_loss": 0.93241715, + "learning_rate": 0.0009791599027031766, + "loss": 0.94397342, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 0.39624023, + "step": 621, + "time_per_iteration": 3.058497667312622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152711, + "balance_loss_mlp": 1.11439681, + "epoch": 0.11966140823393613, + "flos": 680999892480.0, + "grad_norm": 0.05959109763307043, + "language_loss": 0.93889141, + "learning_rate": 0.0009790708027880932, + "loss": 0.95041847, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 0.38330078, + "step": 622, + "time_per_iteration": 2.857905864715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217773, + "balance_loss_mlp": 1.17447615, + "epoch": 0.11985378991919969, + "flos": 1451071853568.0, + "grad_norm": 0.033264976771994935, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.78645062, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 0.43359375, + "step": 623, + "time_per_iteration": 4.817517518997192 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130228, + "balance_loss_mlp": 1.09372652, + "epoch": 0.12004617160446325, + "flos": 527848441344.0, + "grad_norm": 0.07130736684785184, + "language_loss": 0.99442542, + "learning_rate": 0.0009788920450172487, + "loss": 1.00572777, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 0.36499023, + "step": 624, + "time_per_iteration": 2.6089231967926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134217, + "balance_loss_mlp": 1.0987401, + "epoch": 0.12023855328972682, + "flos": 474219297792.0, + "grad_norm": 0.053387747347518576, + "language_loss": 0.97139525, + "learning_rate": 0.0009788023872308875, + "loss": 0.98273742, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 0.35522461, + "step": 625, + "time_per_iteration": 2.5482659339904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171918, + "balance_loss_mlp": 1.12614214, + "epoch": 0.12043093497499038, + "flos": 1531771430400.0, + "grad_norm": 0.016755812295179123, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.76600921, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 0.45703125, + "step": 626, + "time_per_iteration": 4.767898797988892 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142548, + "balance_loss_mlp": 1.10609388, + "epoch": 0.12062331666025394, + "flos": 539839323648.0, + "grad_norm": 0.053046953839951706, + "language_loss": 0.99526918, + "learning_rate": 0.0009786225140303285, + "loss": 1.00669467, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 0.36425781, + "step": 627, + "time_per_iteration": 2.666975975036621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145866, + "balance_loss_mlp": 1.10974586, + "epoch": 0.1208156983455175, + "flos": 511906604544.0, + "grad_norm": 0.06539343990980159, + "language_loss": 0.97403502, + "learning_rate": 0.0009785322986859634, + "loss": 0.98549366, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 0.36132812, + "step": 628, + "time_per_iteration": 2.6613006591796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116443, + "balance_loss_mlp": 1.12830925, + "epoch": 0.12100808003078108, + "flos": 596473072128.0, + "grad_norm": 0.05337423256033143, + "language_loss": 0.99038112, + "learning_rate": 0.0009784418975588838, + "loss": 1.00202537, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 0.36108398, + "step": 629, + "time_per_iteration": 2.7266693115234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148011, + "balance_loss_mlp": 1.11248696, + "epoch": 0.12120046171604464, + "flos": 522970960896.0, + "grad_norm": 0.06598420413892771, + "language_loss": 0.97636682, + "learning_rate": 0.0009783513106841862, + "loss": 0.98784697, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 0.35522461, + "step": 630, + "time_per_iteration": 2.7734336853027344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122661, + "balance_loss_mlp": 1.17663717, + "epoch": 0.1213928434013082, + "flos": 1554463249920.0, + "grad_norm": 0.0364602282496576, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.77959311, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 0.5, + "step": 631, + "time_per_iteration": 4.955650091171265 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118689, + "balance_loss_mlp": 1.08283055, + "epoch": 0.12158522508657175, + "flos": 495391749120.0, + "grad_norm": 0.061523486228641615, + "language_loss": 0.94419873, + "learning_rate": 0.0009781695798326854, + "loss": 0.95538557, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 0.35888672, + "step": 632, + "time_per_iteration": 2.6072514057159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111815, + "balance_loss_mlp": 1.08319819, + "epoch": 0.12177760677183531, + "flos": 475585592832.0, + "grad_norm": 0.05761126083629287, + "language_loss": 0.93996418, + "learning_rate": 0.0009780784359264365, + "loss": 0.95114571, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 0.34985352, + "step": 633, + "time_per_iteration": 2.6186299324035645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201074, + "balance_loss_mlp": 1.15548825, + "epoch": 0.12196998845709889, + "flos": 1468458906624.0, + "grad_norm": 0.024414945484573326, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.75389773, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 0.45507812, + "step": 634, + "time_per_iteration": 4.757866144180298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090802, + "balance_loss_mlp": 1.05732846, + "epoch": 0.12216237014236245, + "flos": 586572378624.0, + "grad_norm": 0.05071444395915749, + "language_loss": 0.91919303, + "learning_rate": 0.000977895591329867, + "loss": 0.93010104, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 0.3347168, + "step": 635, + "time_per_iteration": 2.7802233695983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094425, + "balance_loss_mlp": 1.06006885, + "epoch": 0.12235475182762601, + "flos": 597997582848.0, + "grad_norm": 0.05652682698430024, + "language_loss": 0.93613631, + "learning_rate": 0.000977803890710533, + "loss": 0.94708061, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 0.34399414, + "step": 636, + "time_per_iteration": 2.719989538192749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109182, + "balance_loss_mlp": 1.0546267, + "epoch": 0.12254713351288957, + "flos": 497741469696.0, + "grad_norm": 0.05019916823038997, + "language_loss": 0.97873759, + "learning_rate": 0.0009777120045912774, + "loss": 0.98965579, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 0.37231445, + "step": 637, + "time_per_iteration": 2.5960683822631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099212, + "balance_loss_mlp": 1.06139851, + "epoch": 0.12273951519815314, + "flos": 605847361536.0, + "grad_norm": 0.05186361253186237, + "language_loss": 0.97095829, + "learning_rate": 0.0009776199330077736, + "loss": 0.9819504, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 0.37841797, + "step": 638, + "time_per_iteration": 2.7152581214904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088192, + "balance_loss_mlp": 1.05121303, + "epoch": 0.1229318968834167, + "flos": 597859190784.0, + "grad_norm": 0.05467339203371928, + "language_loss": 0.99154645, + "learning_rate": 0.0009775276759957667, + "loss": 1.00242841, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 0.37011719, + "step": 639, + "time_per_iteration": 2.6985981464385986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090176, + "balance_loss_mlp": 1.05465198, + "epoch": 0.12312427856868026, + "flos": 678383299584.0, + "grad_norm": 0.06600893718108056, + "language_loss": 0.97933781, + "learning_rate": 0.0009774352335910745, + "loss": 0.99023956, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 0.35546875, + "step": 640, + "time_per_iteration": 2.813744306564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086602, + "balance_loss_mlp": 1.05298471, + "epoch": 0.12331666025394382, + "flos": 608933458944.0, + "grad_norm": 0.05927901471916764, + "language_loss": 0.99468219, + "learning_rate": 0.000977342605829586, + "loss": 1.00554824, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 0.33642578, + "step": 641, + "time_per_iteration": 2.73280668258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110577, + "balance_loss_mlp": 1.07240582, + "epoch": 0.12350904193920739, + "flos": 762504855552.0, + "grad_norm": 0.07046674646118828, + "language_loss": 0.92099506, + "learning_rate": 0.0009772497927472623, + "loss": 0.93210077, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 0.38183594, + "step": 642, + "time_per_iteration": 3.1258397102355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134817, + "balance_loss_mlp": 1.09514427, + "epoch": 0.12370142362447095, + "flos": 540968481792.0, + "grad_norm": 0.07438352262018386, + "language_loss": 0.93366879, + "learning_rate": 0.0009771567943801368, + "loss": 0.94501698, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 0.3972168, + "step": 643, + "time_per_iteration": 2.6720776557922363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149366, + "balance_loss_mlp": 1.10912085, + "epoch": 0.12389380530973451, + "flos": 548128871424.0, + "grad_norm": 0.055730629552303436, + "language_loss": 0.96261084, + "learning_rate": 0.0009770636107643152, + "loss": 0.97410446, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 0.40234375, + "step": 644, + "time_per_iteration": 2.7093722820281982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144915, + "balance_loss_mlp": 1.10734022, + "epoch": 0.12408618699499807, + "flos": 540308828160.0, + "grad_norm": 0.05250459899213186, + "language_loss": 0.92937833, + "learning_rate": 0.0009769702419359738, + "loss": 0.94082749, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 0.37597656, + "step": 645, + "time_per_iteration": 2.661512613296509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173428, + "balance_loss_mlp": 1.13146591, + "epoch": 0.12427856868026164, + "flos": 745792137216.0, + "grad_norm": 0.052890865129340166, + "language_loss": 0.94770992, + "learning_rate": 0.000976876687931362, + "loss": 0.95944417, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 0.41943359, + "step": 646, + "time_per_iteration": 2.972522258758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164317, + "balance_loss_mlp": 1.12555003, + "epoch": 0.1244709503655252, + "flos": 533716687872.0, + "grad_norm": 0.07033761546633982, + "language_loss": 0.91270661, + "learning_rate": 0.0009767829487868005, + "loss": 0.92434984, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 0.38769531, + "step": 647, + "time_per_iteration": 2.6150805950164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164183, + "balance_loss_mlp": 1.12281775, + "epoch": 0.12466333205078876, + "flos": 508099184640.0, + "grad_norm": 0.07269814667774141, + "language_loss": 0.95938772, + "learning_rate": 0.000976689024538682, + "loss": 0.97102952, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 0.41381836, + "step": 648, + "time_per_iteration": 2.6567764282226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154104, + "balance_loss_mlp": 1.11497951, + "epoch": 0.12485571373605232, + "flos": 681345686016.0, + "grad_norm": 0.06659282576896536, + "language_loss": 0.94783676, + "learning_rate": 0.0009765949152234716, + "loss": 0.95937783, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 0.39135742, + "step": 649, + "time_per_iteration": 2.9032628536224365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118823, + "balance_loss_mlp": 1.15084565, + "epoch": 0.1250480954213159, + "flos": 1330159781376.0, + "grad_norm": 0.027365485913225348, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.79874313, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 0.37304688, + "step": 650, + "time_per_iteration": 4.6781816482543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145487, + "balance_loss_mlp": 1.10395491, + "epoch": 0.12524047710657946, + "flos": 938550758400.0, + "grad_norm": 0.07758701561639549, + "language_loss": 0.88880539, + "learning_rate": 0.0009764061415379919, + "loss": 0.90026021, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 0.41552734, + "step": 651, + "time_per_iteration": 3.2588987350463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137454, + "balance_loss_mlp": 1.09766221, + "epoch": 0.12543285879184302, + "flos": 513893279232.0, + "grad_norm": 0.08409279007421946, + "language_loss": 0.94380724, + "learning_rate": 0.0009763114772410109, + "loss": 0.95518184, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 0.39794922, + "step": 652, + "time_per_iteration": 2.5698702335357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112103, + "balance_loss_mlp": 1.08359814, + "epoch": 0.12562524047710658, + "flos": 718328922624.0, + "grad_norm": 0.056536251661147445, + "language_loss": 0.92061114, + "learning_rate": 0.0009762166280235146, + "loss": 0.93182147, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 0.37451172, + "step": 653, + "time_per_iteration": 2.938668966293335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117513, + "balance_loss_mlp": 1.08191729, + "epoch": 0.12581762216237014, + "flos": 563712431616.0, + "grad_norm": 0.0771848817407848, + "language_loss": 0.94092464, + "learning_rate": 0.0009761215939223267, + "loss": 0.95209974, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 0.35644531, + "step": 654, + "time_per_iteration": 2.7028610706329346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102912, + "balance_loss_mlp": 1.06834149, + "epoch": 0.1260100038476337, + "flos": 481893608448.0, + "grad_norm": 0.07424845664771389, + "language_loss": 0.9475044, + "learning_rate": 0.0009760263749743428, + "loss": 0.95853353, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 0.34570312, + "step": 655, + "time_per_iteration": 2.5710902214050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101838, + "balance_loss_mlp": 1.06771994, + "epoch": 0.12620238553289725, + "flos": 575555010048.0, + "grad_norm": 0.053259035011575195, + "language_loss": 0.94285154, + "learning_rate": 0.0009759309712165299, + "loss": 0.95386994, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 0.34130859, + "step": 656, + "time_per_iteration": 2.70626163482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101868, + "balance_loss_mlp": 1.06858444, + "epoch": 0.12639476721816084, + "flos": 531164335104.0, + "grad_norm": 0.0693418830287988, + "language_loss": 0.9812479, + "learning_rate": 0.0009758353826859272, + "loss": 0.99226654, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 0.33300781, + "step": 657, + "time_per_iteration": 2.566787004470825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110312, + "balance_loss_mlp": 1.0663563, + "epoch": 0.1265871489034244, + "flos": 689968917504.0, + "grad_norm": 0.06782991509763603, + "language_loss": 0.96008623, + "learning_rate": 0.0009757396094196456, + "loss": 0.97111744, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 0.36791992, + "step": 658, + "time_per_iteration": 2.8277065753936768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115911, + "balance_loss_mlp": 1.07926583, + "epoch": 0.12677953058868796, + "flos": 537138667008.0, + "grad_norm": 0.053606842709613675, + "language_loss": 0.89398581, + "learning_rate": 0.0009756436514548673, + "loss": 0.90514493, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 0.36645508, + "step": 659, + "time_per_iteration": 2.796175718307495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120986, + "balance_loss_mlp": 1.0811224, + "epoch": 0.12697191227395152, + "flos": 519022577664.0, + "grad_norm": 0.060525818769901533, + "language_loss": 0.92384607, + "learning_rate": 0.0009755475088288466, + "loss": 0.93505597, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 0.39916992, + "step": 660, + "time_per_iteration": 2.678682804107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133341, + "balance_loss_mlp": 1.09271395, + "epoch": 0.12716429395921508, + "flos": 566605808640.0, + "grad_norm": 0.08191197530717065, + "language_loss": 0.958794, + "learning_rate": 0.0009754511815789095, + "loss": 0.97012746, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 0.40600586, + "step": 661, + "time_per_iteration": 2.7371177673339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130898, + "balance_loss_mlp": 1.09093928, + "epoch": 0.12735667564447864, + "flos": 514103251968.0, + "grad_norm": 0.08687138171908054, + "language_loss": 0.92166948, + "learning_rate": 0.0009753546697424533, + "loss": 0.93297845, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 0.39941406, + "step": 662, + "time_per_iteration": 2.704432249069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125889, + "balance_loss_mlp": 1.08700323, + "epoch": 0.1275490573297422, + "flos": 541282341888.0, + "grad_norm": 0.06194581367760624, + "language_loss": 0.95628935, + "learning_rate": 0.0009752579733569475, + "loss": 0.96754825, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 0.38891602, + "step": 663, + "time_per_iteration": 2.682892084121704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165087, + "balance_loss_mlp": 1.1326623, + "epoch": 0.12774143901500576, + "flos": 1558700900352.0, + "grad_norm": 0.0245621431528993, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.76046479, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 0.32421875, + "step": 664, + "time_per_iteration": 4.981603622436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146598, + "balance_loss_mlp": 1.1060189, + "epoch": 0.12793382070026935, + "flos": 613744128000.0, + "grad_norm": 0.07818489478946229, + "language_loss": 0.96962506, + "learning_rate": 0.0009750640270890217, + "loss": 0.98109102, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 0.40576172, + "step": 665, + "time_per_iteration": 2.7139556407928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139516, + "balance_loss_mlp": 1.10115409, + "epoch": 0.1281262023855329, + "flos": 707731499520.0, + "grad_norm": 0.10418725554084544, + "language_loss": 1.02824736, + "learning_rate": 0.0009749667772818983, + "loss": 1.03964257, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 0.38354492, + "step": 666, + "time_per_iteration": 3.000227689743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148195, + "balance_loss_mlp": 1.11481678, + "epoch": 0.12831858407079647, + "flos": 1425034404864.0, + "grad_norm": 0.027847994605201966, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.78084135, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 0.33398438, + "step": 667, + "time_per_iteration": 4.858838319778442 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161858, + "balance_loss_mlp": 1.1255703, + "epoch": 0.12851096575606002, + "flos": 449098463232.0, + "grad_norm": 0.0747922247275706, + "language_loss": 1.00932169, + "learning_rate": 0.0009747717245101093, + "loss": 1.0209403, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 0.36303711, + "step": 668, + "time_per_iteration": 2.4917514324188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172854, + "balance_loss_mlp": 1.13518405, + "epoch": 0.12870334744132358, + "flos": 479939240448.0, + "grad_norm": 0.0795363237311063, + "language_loss": 0.91087645, + "learning_rate": 0.00097467392162117, + "loss": 0.92260504, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 0.37719727, + "step": 669, + "time_per_iteration": 2.601151466369629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196603, + "balance_loss_mlp": 1.15540457, + "epoch": 0.12889572912658714, + "flos": 638936543232.0, + "grad_norm": 0.0744221392925499, + "language_loss": 0.95630497, + "learning_rate": 0.0009745759344474708, + "loss": 0.96827102, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 0.41162109, + "step": 670, + "time_per_iteration": 2.878068447113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200159, + "balance_loss_mlp": 1.16012812, + "epoch": 0.1290881108118507, + "flos": 509944896000.0, + "grad_norm": 0.07162427386273244, + "language_loss": 0.95158428, + "learning_rate": 0.0009744777630270536, + "loss": 0.96358585, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 0.40063477, + "step": 671, + "time_per_iteration": 2.5778517723083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220294, + "balance_loss_mlp": 1.17752171, + "epoch": 0.12928049249711426, + "flos": 671054782464.0, + "grad_norm": 0.07459259564874297, + "language_loss": 0.99775112, + "learning_rate": 0.000974379407398032, + "loss": 1.00995398, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 0.42797852, + "step": 672, + "time_per_iteration": 2.862168073654175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191716, + "balance_loss_mlp": 1.15175724, + "epoch": 0.12947287418237785, + "flos": 793525870080.0, + "grad_norm": 0.05795101219152752, + "language_loss": 0.86696863, + "learning_rate": 0.0009742808675985913, + "loss": 0.87888587, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 0.3996582, + "step": 673, + "time_per_iteration": 3.0987160205841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011832, + "balance_loss_mlp": 1.14142871, + "epoch": 0.1296652558676414, + "flos": 485466462720.0, + "grad_norm": 0.06292984682523013, + "language_loss": 0.96893597, + "learning_rate": 0.0009741821436669876, + "loss": 0.98076797, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 0.41772461, + "step": 674, + "time_per_iteration": 2.565317153930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160814, + "balance_loss_mlp": 1.12123656, + "epoch": 0.12985763755290497, + "flos": 453459451392.0, + "grad_norm": 0.07127578315040689, + "language_loss": 0.99621803, + "learning_rate": 0.0009740832356415492, + "loss": 1.00782621, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 0.39550781, + "step": 675, + "time_per_iteration": 2.4777724742889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144275, + "balance_loss_mlp": 1.10538852, + "epoch": 0.13005001923816853, + "flos": 825061178880.0, + "grad_norm": 0.07563598794059366, + "language_loss": 0.94837546, + "learning_rate": 0.0009739841435606756, + "loss": 0.95981824, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 0.38867188, + "step": 676, + "time_per_iteration": 2.9838767051696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131112, + "balance_loss_mlp": 1.09186864, + "epoch": 0.1302424009234321, + "flos": 531381648384.0, + "grad_norm": 0.06693149578557214, + "language_loss": 0.94293654, + "learning_rate": 0.0009738848674628377, + "loss": 0.95424765, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 0.39233398, + "step": 677, + "time_per_iteration": 2.7052054405212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130656, + "balance_loss_mlp": 1.0923903, + "epoch": 0.13043478260869565, + "flos": 525884161536.0, + "grad_norm": 0.05501746541124835, + "language_loss": 0.94784498, + "learning_rate": 0.000973785407386578, + "loss": 0.95915151, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 0.38232422, + "step": 678, + "time_per_iteration": 2.7535152435302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137257, + "balance_loss_mlp": 1.09727383, + "epoch": 0.1306271642939592, + "flos": 626172208128.0, + "grad_norm": 0.05430769504454563, + "language_loss": 0.91185606, + "learning_rate": 0.0009736857633705103, + "loss": 0.92322862, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 0.3996582, + "step": 679, + "time_per_iteration": 2.8686013221740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135266, + "balance_loss_mlp": 1.09575987, + "epoch": 0.13081954597922277, + "flos": 550718300160.0, + "grad_norm": 0.06387426976514826, + "language_loss": 0.97335434, + "learning_rate": 0.0009735859354533196, + "loss": 0.984707, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 0.39501953, + "step": 680, + "time_per_iteration": 2.6952273845672607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136459, + "balance_loss_mlp": 1.09626174, + "epoch": 0.13101192766448633, + "flos": 536911441920.0, + "grad_norm": 0.07637025474680663, + "language_loss": 0.97434723, + "learning_rate": 0.0009734859236737628, + "loss": 0.98571181, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 0.40185547, + "step": 681, + "time_per_iteration": 2.607431173324585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136827, + "balance_loss_mlp": 1.09720194, + "epoch": 0.13120430934974991, + "flos": 503508400128.0, + "grad_norm": 0.06515090437153119, + "language_loss": 0.9831785, + "learning_rate": 0.0009733857280706678, + "loss": 0.99454683, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 0.39599609, + "step": 682, + "time_per_iteration": 2.5730957984924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140739, + "balance_loss_mlp": 1.1007328, + "epoch": 0.13139669103501347, + "flos": 614295124992.0, + "grad_norm": 0.08408851923922504, + "language_loss": 0.89817083, + "learning_rate": 0.000973285348682934, + "loss": 0.90957826, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 0.39990234, + "step": 683, + "time_per_iteration": 2.7041609287261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120077, + "balance_loss_mlp": 1.08460057, + "epoch": 0.13158907272027703, + "flos": 1484971564032.0, + "grad_norm": 0.021197399820989362, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.7901845, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 0.35546875, + "step": 684, + "time_per_iteration": 4.7803051471710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145399, + "balance_loss_mlp": 1.10579789, + "epoch": 0.1317814544055406, + "flos": 985461852672.0, + "grad_norm": 0.06796914093678033, + "language_loss": 0.90116858, + "learning_rate": 0.0009730840387095046, + "loss": 0.91262257, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 0.39575195, + "step": 685, + "time_per_iteration": 3.289513111114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154301, + "balance_loss_mlp": 1.11412716, + "epoch": 0.13197383609080415, + "flos": 611456076288.0, + "grad_norm": 0.0690044047280534, + "language_loss": 0.95956922, + "learning_rate": 0.0009729831082019642, + "loss": 0.97111225, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 0.40185547, + "step": 686, + "time_per_iteration": 2.8214356899261475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131343, + "balance_loss_mlp": 1.09383941, + "epoch": 0.1321662177760677, + "flos": 494403181056.0, + "grad_norm": 0.08080780289155233, + "language_loss": 0.93596351, + "learning_rate": 0.0009728819940660958, + "loss": 0.94727689, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 0.375, + "step": 687, + "time_per_iteration": 2.749385118484497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011246, + "balance_loss_mlp": 1.08542764, + "epoch": 0.13235859946133127, + "flos": 495841430016.0, + "grad_norm": 0.08853955851107219, + "language_loss": 0.91695315, + "learning_rate": 0.0009727806963411557, + "loss": 0.92819917, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 0.39135742, + "step": 688, + "time_per_iteration": 2.592099666595459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128329, + "balance_loss_mlp": 1.08777368, + "epoch": 0.13255098114659483, + "flos": 511686720000.0, + "grad_norm": 0.06370494383790047, + "language_loss": 0.92130053, + "learning_rate": 0.000972679215066471, + "loss": 0.93258381, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 0.40551758, + "step": 689, + "time_per_iteration": 2.7344043254852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114145, + "balance_loss_mlp": 1.10246885, + "epoch": 0.13274336283185842, + "flos": 547370472960.0, + "grad_norm": 0.08478699193898473, + "language_loss": 1.04583168, + "learning_rate": 0.0009725775502814401, + "loss": 1.05724621, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 0.3894043, + "step": 690, + "time_per_iteration": 2.5881311893463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155383, + "balance_loss_mlp": 1.1147325, + "epoch": 0.13293574451712198, + "flos": 640772342784.0, + "grad_norm": 0.07994389842197654, + "language_loss": 0.90077579, + "learning_rate": 0.0009724757020255327, + "loss": 0.91232961, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 0.40649414, + "step": 691, + "time_per_iteration": 2.8452539443969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164889, + "balance_loss_mlp": 1.12566948, + "epoch": 0.13312812620238554, + "flos": 491480441856.0, + "grad_norm": 0.09039906445052394, + "language_loss": 0.91914684, + "learning_rate": 0.0009723736703382902, + "loss": 0.93079573, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 0.39208984, + "step": 692, + "time_per_iteration": 2.5472824573516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198661, + "balance_loss_mlp": 1.15557849, + "epoch": 0.1333205078876491, + "flos": 508944218112.0, + "grad_norm": 0.07689546631051256, + "language_loss": 0.86461794, + "learning_rate": 0.0009722714552593244, + "loss": 0.87660456, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 0.4309082, + "step": 693, + "time_per_iteration": 2.6273465156555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199876, + "balance_loss_mlp": 1.15560198, + "epoch": 0.13351288957291266, + "flos": 418697455104.0, + "grad_norm": 0.08142665414192346, + "language_loss": 1.00438499, + "learning_rate": 0.000972169056828319, + "loss": 1.01638389, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 0.44262695, + "step": 694, + "time_per_iteration": 2.477491617202759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221806, + "balance_loss_mlp": 1.17741275, + "epoch": 0.13370527125817622, + "flos": 615901128192.0, + "grad_norm": 0.07001491486919184, + "language_loss": 0.90590984, + "learning_rate": 0.0009720664750850283, + "loss": 0.91812789, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 0.4440918, + "step": 695, + "time_per_iteration": 2.7817704677581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209285, + "balance_loss_mlp": 1.16870594, + "epoch": 0.13389765294343978, + "flos": 626038958592.0, + "grad_norm": 0.07077521288835904, + "language_loss": 0.97240067, + "learning_rate": 0.0009719637100692784, + "loss": 0.98449349, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 0.40625, + "step": 696, + "time_per_iteration": 2.7099833488464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214589, + "balance_loss_mlp": 1.17069626, + "epoch": 0.13409003462870334, + "flos": 609691857408.0, + "grad_norm": 0.06395797985697109, + "language_loss": 0.87399805, + "learning_rate": 0.0009718607618209661, + "loss": 0.88614392, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 0.43896484, + "step": 697, + "time_per_iteration": 2.8280160427093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226846, + "balance_loss_mlp": 1.18445516, + "epoch": 0.13428241631396692, + "flos": 683816546304.0, + "grad_norm": 0.08853583224950028, + "language_loss": 0.91527486, + "learning_rate": 0.0009717576303800595, + "loss": 0.92754334, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 0.42382812, + "step": 698, + "time_per_iteration": 3.0102553367614746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206724, + "balance_loss_mlp": 1.16385674, + "epoch": 0.13447479799923048, + "flos": 508815737856.0, + "grad_norm": 0.07140979809376953, + "language_loss": 0.90443981, + "learning_rate": 0.0009716543157865975, + "loss": 0.91650712, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 0.4284668, + "step": 699, + "time_per_iteration": 2.713191509246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192988, + "balance_loss_mlp": 1.15047789, + "epoch": 0.13466717968449404, + "flos": 897510481920.0, + "grad_norm": 0.0971528894423257, + "language_loss": 0.87731719, + "learning_rate": 0.0009715508180806907, + "loss": 0.88924706, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 0.42504883, + "step": 700, + "time_per_iteration": 3.183608055114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164502, + "balance_loss_mlp": 1.12189686, + "epoch": 0.1348595613697576, + "flos": 989938838016.0, + "grad_norm": 0.07253928509691168, + "language_loss": 0.94940412, + "learning_rate": 0.0009714471373025202, + "loss": 0.96104908, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 0.42578125, + "step": 701, + "time_per_iteration": 3.4071736335754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154491, + "balance_loss_mlp": 1.10978746, + "epoch": 0.13505194305502116, + "flos": 487826095104.0, + "grad_norm": 0.07349692890686976, + "language_loss": 0.93387866, + "learning_rate": 0.0009713432734923386, + "loss": 0.94542348, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 0.44702148, + "step": 702, + "time_per_iteration": 2.61545467376709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149917, + "balance_loss_mlp": 1.10523736, + "epoch": 0.13524432474028472, + "flos": 613385851392.0, + "grad_norm": 0.07475145021416552, + "language_loss": 0.90919894, + "learning_rate": 0.0009712392266904696, + "loss": 0.92069811, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 0.44702148, + "step": 703, + "time_per_iteration": 2.739295482635498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156115, + "balance_loss_mlp": 1.11219811, + "epoch": 0.13543670642554828, + "flos": 904794582528.0, + "grad_norm": 0.09690331363255131, + "language_loss": 0.90325272, + "learning_rate": 0.0009711349969373076, + "loss": 0.91481388, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 0.43945312, + "step": 704, + "time_per_iteration": 3.1653053760528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175158, + "balance_loss_mlp": 1.12780786, + "epoch": 0.13562908811081184, + "flos": 550616984064.0, + "grad_norm": 0.09111648779989767, + "language_loss": 0.84997714, + "learning_rate": 0.0009710305842733178, + "loss": 0.86172873, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 0.47314453, + "step": 705, + "time_per_iteration": 2.7402727603912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117117, + "balance_loss_mlp": 1.12737262, + "epoch": 0.1358214697960754, + "flos": 508044856320.0, + "grad_norm": 0.10189351673448747, + "language_loss": 0.9379847, + "learning_rate": 0.0009709259887390373, + "loss": 0.94969636, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 0.43774414, + "step": 706, + "time_per_iteration": 2.5640039443969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147043, + "balance_loss_mlp": 1.10467625, + "epoch": 0.136013851481339, + "flos": 528896107008.0, + "grad_norm": 0.07946562356881365, + "language_loss": 0.95178437, + "learning_rate": 0.0009708212103750737, + "loss": 0.96325481, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 0.42382812, + "step": 707, + "time_per_iteration": 2.6138036251068115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153312, + "balance_loss_mlp": 1.1095618, + "epoch": 0.13620623316660255, + "flos": 659081152512.0, + "grad_norm": 0.07708082078191984, + "language_loss": 0.91549516, + "learning_rate": 0.0009707162492221051, + "loss": 0.9270283, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 0.43725586, + "step": 708, + "time_per_iteration": 2.879612684249878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143626, + "balance_loss_mlp": 1.10121179, + "epoch": 0.1363986148518661, + "flos": 671882563584.0, + "grad_norm": 0.08764140181907645, + "language_loss": 0.92509496, + "learning_rate": 0.0009706111053208815, + "loss": 0.93653119, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 0.42431641, + "step": 709, + "time_per_iteration": 2.804469347000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156089, + "balance_loss_mlp": 1.10947847, + "epoch": 0.13659099653712967, + "flos": 473062975488.0, + "grad_norm": 0.07097269092186763, + "language_loss": 0.89579999, + "learning_rate": 0.0009705057787122232, + "loss": 0.90736091, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 0.46630859, + "step": 710, + "time_per_iteration": 2.568406105041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174212, + "balance_loss_mlp": 1.12874603, + "epoch": 0.13678337822239323, + "flos": 452715734016.0, + "grad_norm": 0.06463299548184855, + "language_loss": 0.94250202, + "learning_rate": 0.0009704002694370216, + "loss": 0.9542442, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 0.45410156, + "step": 711, + "time_per_iteration": 2.525240659713745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116324, + "balance_loss_mlp": 1.11820245, + "epoch": 0.13697575990765679, + "flos": 519623133696.0, + "grad_norm": 0.06677275778781674, + "language_loss": 0.90675253, + "learning_rate": 0.0009702945775362388, + "loss": 0.91838491, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 0.45043945, + "step": 712, + "time_per_iteration": 2.572566270828247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171995, + "balance_loss_mlp": 1.12478852, + "epoch": 0.13716814159292035, + "flos": 480388921344.0, + "grad_norm": 0.06549167744569931, + "language_loss": 0.91151595, + "learning_rate": 0.0009701887030509086, + "loss": 0.92323589, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 0.47167969, + "step": 713, + "time_per_iteration": 2.645202875137329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156874, + "balance_loss_mlp": 1.11450684, + "epoch": 0.1373605232781839, + "flos": 545650670592.0, + "grad_norm": 0.07696267649297317, + "language_loss": 0.95333648, + "learning_rate": 0.0009700826460221346, + "loss": 0.96490526, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 0.42382812, + "step": 714, + "time_per_iteration": 2.649831771850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187773, + "balance_loss_mlp": 1.13980293, + "epoch": 0.1375529049634475, + "flos": 708791648256.0, + "grad_norm": 0.08597126409557068, + "language_loss": 0.96336859, + "learning_rate": 0.0009699764064910921, + "loss": 0.97524625, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 0.47998047, + "step": 715, + "time_per_iteration": 2.8645238876342773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178216, + "balance_loss_mlp": 1.1317718, + "epoch": 0.13774528664871105, + "flos": 486696936960.0, + "grad_norm": 0.08366808602410432, + "language_loss": 0.90892398, + "learning_rate": 0.0009698699844990268, + "loss": 0.92070615, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 0.46435547, + "step": 716, + "time_per_iteration": 2.635460376739502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171183, + "balance_loss_mlp": 1.12731409, + "epoch": 0.1379376683339746, + "flos": 680199275520.0, + "grad_norm": 0.051528021496160425, + "language_loss": 0.91132116, + "learning_rate": 0.0009697633800872555, + "loss": 0.923033, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 0.4387207, + "step": 717, + "time_per_iteration": 2.887854814529419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189757, + "balance_loss_mlp": 1.1432178, + "epoch": 0.13813005001923817, + "flos": 610946924544.0, + "grad_norm": 0.07388540586481528, + "language_loss": 0.94422555, + "learning_rate": 0.0009696565932971655, + "loss": 0.95612311, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 0.46557617, + "step": 718, + "time_per_iteration": 2.8565313816070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171127, + "balance_loss_mlp": 1.12580407, + "epoch": 0.13832243170450173, + "flos": 588729378816.0, + "grad_norm": 0.06166568969162735, + "language_loss": 0.92794299, + "learning_rate": 0.0009695496241702153, + "loss": 0.93965423, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 0.45361328, + "step": 719, + "time_per_iteration": 2.827193021774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178611, + "balance_loss_mlp": 1.13152349, + "epoch": 0.1385148133897653, + "flos": 700002860544.0, + "grad_norm": 0.07046673128739296, + "language_loss": 0.8903814, + "learning_rate": 0.0009694424727479339, + "loss": 0.9021675, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 0.47094727, + "step": 720, + "time_per_iteration": 2.958855628967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011684, + "balance_loss_mlp": 1.12150323, + "epoch": 0.13870719507502885, + "flos": 598254543360.0, + "grad_norm": 0.07332050167219753, + "language_loss": 0.91946507, + "learning_rate": 0.0009693351390719213, + "loss": 0.93114913, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 0.46899414, + "step": 721, + "time_per_iteration": 2.6910197734832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012083, + "balance_loss_mlp": 1.15742183, + "epoch": 0.1388995767602924, + "flos": 586572378624.0, + "grad_norm": 0.06188248769550966, + "language_loss": 0.93531096, + "learning_rate": 0.000969227623183848, + "loss": 0.94739395, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 0.50830078, + "step": 722, + "time_per_iteration": 2.791097640991211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119483, + "balance_loss_mlp": 1.14776587, + "epoch": 0.139091958445556, + "flos": 651120145920.0, + "grad_norm": 0.06666345220966835, + "language_loss": 0.93550557, + "learning_rate": 0.0009691199251254554, + "loss": 0.94745386, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 0.47045898, + "step": 723, + "time_per_iteration": 2.8282151222229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173107, + "balance_loss_mlp": 1.13059711, + "epoch": 0.13928434013081956, + "flos": 575737818624.0, + "grad_norm": 0.07191970231420823, + "language_loss": 0.88703346, + "learning_rate": 0.0009690120449385555, + "loss": 0.89876461, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 0.42504883, + "step": 724, + "time_per_iteration": 2.775456190109253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158197, + "balance_loss_mlp": 1.11332655, + "epoch": 0.13947672181608312, + "flos": 563225674752.0, + "grad_norm": 0.06680700276551169, + "language_loss": 0.95181078, + "learning_rate": 0.0009689039826650312, + "loss": 0.96339279, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 0.44824219, + "step": 725, + "time_per_iteration": 2.7623417377471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164951, + "balance_loss_mlp": 1.12756717, + "epoch": 0.13966910350134668, + "flos": 1521546964992.0, + "grad_norm": 0.03995326528410751, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.77688015, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 0.37304688, + "step": 726, + "time_per_iteration": 4.914167642593384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146003, + "balance_loss_mlp": 1.09567261, + "epoch": 0.13986148518661023, + "flos": 499854053376.0, + "grad_norm": 0.07822541163530779, + "language_loss": 0.90488958, + "learning_rate": 0.0009686873120259941, + "loss": 0.91634959, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 0.50341797, + "step": 727, + "time_per_iteration": 2.563333749771118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132914, + "balance_loss_mlp": 1.09092879, + "epoch": 0.1400538668718738, + "flos": 598674488832.0, + "grad_norm": 0.0725242002086287, + "language_loss": 0.89161742, + "learning_rate": 0.0009685787037446004, + "loss": 0.90294659, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 0.41992188, + "step": 728, + "time_per_iteration": 2.7803192138671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137226, + "balance_loss_mlp": 1.09192598, + "epoch": 0.14024624855713735, + "flos": 594039287808.0, + "grad_norm": 0.10183800223701604, + "language_loss": 0.9064362, + "learning_rate": 0.0009684699135448201, + "loss": 0.91780847, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 0.453125, + "step": 729, + "time_per_iteration": 2.750023603439331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142476, + "balance_loss_mlp": 1.0995841, + "epoch": 0.1404386302424009, + "flos": 506584585728.0, + "grad_norm": 0.06503689668024501, + "language_loss": 0.94054115, + "learning_rate": 0.0009683609414688895, + "loss": 0.95196593, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 0.42895508, + "step": 730, + "time_per_iteration": 2.708470344543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116576, + "balance_loss_mlp": 1.11652613, + "epoch": 0.14063101192766447, + "flos": 573407921664.0, + "grad_norm": 0.07277464462784268, + "language_loss": 0.89072424, + "learning_rate": 0.0009682517875591154, + "loss": 0.9023819, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 0.49243164, + "step": 731, + "time_per_iteration": 2.734145402908325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173563, + "balance_loss_mlp": 1.12640429, + "epoch": 0.14082339361292806, + "flos": 564619133952.0, + "grad_norm": 0.08810260071203486, + "language_loss": 0.88790858, + "learning_rate": 0.0009681424518578749, + "loss": 0.8996442, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 0.47192383, + "step": 732, + "time_per_iteration": 2.7139203548431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166119, + "balance_loss_mlp": 1.11900759, + "epoch": 0.14101577529819162, + "flos": 463584798720.0, + "grad_norm": 0.07053265121681873, + "language_loss": 0.9010576, + "learning_rate": 0.000968032934407616, + "loss": 0.91271877, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 0.47143555, + "step": 733, + "time_per_iteration": 2.625128746032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161975, + "balance_loss_mlp": 1.11514974, + "epoch": 0.14120815698345518, + "flos": 596085060096.0, + "grad_norm": 0.08143861058365946, + "language_loss": 0.84579933, + "learning_rate": 0.0009679232352508571, + "loss": 0.85741913, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 0.46850586, + "step": 734, + "time_per_iteration": 2.7461798191070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145124, + "balance_loss_mlp": 1.10046864, + "epoch": 0.14140053866871874, + "flos": 535137311232.0, + "grad_norm": 0.0788084271092868, + "language_loss": 0.83272535, + "learning_rate": 0.0009678133544301871, + "loss": 0.84417665, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 0.44677734, + "step": 735, + "time_per_iteration": 2.68129301071167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130971, + "balance_loss_mlp": 1.08731616, + "epoch": 0.1415929203539823, + "flos": 520265534976.0, + "grad_norm": 0.05044431767963513, + "language_loss": 0.93706036, + "learning_rate": 0.0009677032919882658, + "loss": 0.94837004, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 0.43652344, + "step": 736, + "time_per_iteration": 2.663874387741089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141167, + "balance_loss_mlp": 1.0970124, + "epoch": 0.14178530203924586, + "flos": 482335948800.0, + "grad_norm": 0.07155994363363784, + "language_loss": 0.94151366, + "learning_rate": 0.000967593047967823, + "loss": 0.95292532, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 0.44116211, + "step": 737, + "time_per_iteration": 2.512871265411377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150426, + "balance_loss_mlp": 1.10376751, + "epoch": 0.14197768372450942, + "flos": 676638904320.0, + "grad_norm": 0.07145762863961741, + "language_loss": 0.89657855, + "learning_rate": 0.0009674826224116593, + "loss": 0.90808284, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 0.46655273, + "step": 738, + "time_per_iteration": 2.797337293624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145365, + "balance_loss_mlp": 1.09865868, + "epoch": 0.14217006540977298, + "flos": 446039529984.0, + "grad_norm": 0.07589062836694223, + "language_loss": 0.89765012, + "learning_rate": 0.0009673720153626455, + "loss": 0.90910375, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 0.46728516, + "step": 739, + "time_per_iteration": 2.5743062496185303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113843, + "balance_loss_mlp": 1.09274864, + "epoch": 0.14236244709503657, + "flos": 496503654912.0, + "grad_norm": 0.07239717331604524, + "language_loss": 0.89863205, + "learning_rate": 0.0009672612268637235, + "loss": 0.9100163, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 0.45678711, + "step": 740, + "time_per_iteration": 2.6074059009552 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125723, + "balance_loss_mlp": 1.08125818, + "epoch": 0.14255482878030012, + "flos": 648313403904.0, + "grad_norm": 0.08552249660547784, + "language_loss": 0.8725301, + "learning_rate": 0.0009671502569579048, + "loss": 0.88378727, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 0.44458008, + "step": 741, + "time_per_iteration": 2.729733467102051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116563, + "balance_loss_mlp": 1.07338512, + "epoch": 0.14274721046556368, + "flos": 536165153280.0, + "grad_norm": 0.05753110737252733, + "language_loss": 0.92330521, + "learning_rate": 0.0009670391056882719, + "loss": 0.93447083, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 0.43188477, + "step": 742, + "time_per_iteration": 2.69399356842041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115871, + "balance_loss_mlp": 1.07367063, + "epoch": 0.14293959215082724, + "flos": 957057431040.0, + "grad_norm": 0.06711892894426404, + "language_loss": 0.91615599, + "learning_rate": 0.0009669277730979776, + "loss": 0.92731464, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 0.421875, + "step": 743, + "time_per_iteration": 3.1732802391052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123454, + "balance_loss_mlp": 1.079561, + "epoch": 0.1431319738360908, + "flos": 693089519616.0, + "grad_norm": 0.07488288596065623, + "language_loss": 0.88249421, + "learning_rate": 0.0009668162592302449, + "loss": 0.89372879, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 0.43896484, + "step": 744, + "time_per_iteration": 2.88962459564209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114179, + "balance_loss_mlp": 1.09551311, + "epoch": 0.14332435552135436, + "flos": 565439574528.0, + "grad_norm": 0.08170086657731683, + "language_loss": 0.8873378, + "learning_rate": 0.0009667045641283676, + "loss": 0.89875567, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 0.46289062, + "step": 745, + "time_per_iteration": 2.6374380588531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136601, + "balance_loss_mlp": 1.09158731, + "epoch": 0.14351673720661792, + "flos": 738374787072.0, + "grad_norm": 0.07376324969806651, + "language_loss": 0.9752661, + "learning_rate": 0.0009665926878357092, + "loss": 0.98663211, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 0.44995117, + "step": 746, + "time_per_iteration": 2.908377170562744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138589, + "balance_loss_mlp": 1.09283662, + "epoch": 0.14370911889188148, + "flos": 549230865408.0, + "grad_norm": 0.055840413500964095, + "language_loss": 0.93229979, + "learning_rate": 0.0009664806303957043, + "loss": 0.94368571, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 0.45751953, + "step": 747, + "time_per_iteration": 2.6940197944641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116452, + "balance_loss_mlp": 1.11397541, + "epoch": 0.14390150057714507, + "flos": 590295734784.0, + "grad_norm": 0.07422855656653271, + "language_loss": 0.89923358, + "learning_rate": 0.0009663683918518571, + "loss": 0.91087878, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 0.50463867, + "step": 748, + "time_per_iteration": 2.8905599117279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162632, + "balance_loss_mlp": 1.10977423, + "epoch": 0.14409388226240863, + "flos": 591047165952.0, + "grad_norm": 0.06951396400432043, + "language_loss": 0.88074797, + "learning_rate": 0.0009662559722477428, + "loss": 0.89237428, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 0.52880859, + "step": 749, + "time_per_iteration": 2.6684882640838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111749, + "balance_loss_mlp": 1.09059644, + "epoch": 0.1442862639476722, + "flos": 1511263401984.0, + "grad_norm": 0.031134761916572575, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77280462, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 0.26953125, + "step": 750, + "time_per_iteration": 4.978729009628296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141518, + "balance_loss_mlp": 1.09359622, + "epoch": 0.14447864563293575, + "flos": 496765384704.0, + "grad_norm": 0.06451546089111031, + "language_loss": 0.9124738, + "learning_rate": 0.0009660305900333632, + "loss": 0.92388898, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 0.47973633, + "step": 751, + "time_per_iteration": 2.6556403636932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145582, + "balance_loss_mlp": 1.09849465, + "epoch": 0.1446710273181993, + "flos": 589678299648.0, + "grad_norm": 0.08083819383046088, + "language_loss": 0.8480792, + "learning_rate": 0.0009659176275105992, + "loss": 0.85953498, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 0.47070312, + "step": 752, + "time_per_iteration": 2.6868016719818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154886, + "balance_loss_mlp": 1.10667825, + "epoch": 0.14486340900346287, + "flos": 585818749440.0, + "grad_norm": 0.0601727082776222, + "language_loss": 0.87400204, + "learning_rate": 0.0009658044841025701, + "loss": 0.88555086, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 0.48217773, + "step": 753, + "time_per_iteration": 2.7701456546783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189813, + "balance_loss_mlp": 1.136765, + "epoch": 0.14505579068872643, + "flos": 504672062976.0, + "grad_norm": 0.0800468655776831, + "language_loss": 0.83957088, + "learning_rate": 0.0009656911598532021, + "loss": 0.85146904, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 0.53051758, + "step": 754, + "time_per_iteration": 2.630211353302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192149, + "balance_loss_mlp": 1.13943434, + "epoch": 0.14524817237399, + "flos": 486815505408.0, + "grad_norm": 0.0631545589319864, + "language_loss": 0.9278729, + "learning_rate": 0.0009655776548064917, + "loss": 0.93979442, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 0.52758789, + "step": 755, + "time_per_iteration": 2.6447510719299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176758, + "balance_loss_mlp": 1.12506902, + "epoch": 0.14544055405925355, + "flos": 728175287808.0, + "grad_norm": 0.06497808848967317, + "language_loss": 0.90460694, + "learning_rate": 0.0009654639690065054, + "loss": 0.91637456, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 0.51708984, + "step": 756, + "time_per_iteration": 2.910578727722168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116602, + "balance_loss_mlp": 1.11785972, + "epoch": 0.14563293574451713, + "flos": 593643935232.0, + "grad_norm": 0.0580393303136577, + "language_loss": 0.90340179, + "learning_rate": 0.00096535010249738, + "loss": 0.91506201, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 0.48120117, + "step": 757, + "time_per_iteration": 2.7232277393341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149847, + "balance_loss_mlp": 1.10092402, + "epoch": 0.1458253174297807, + "flos": 560478030336.0, + "grad_norm": 0.07370663524734816, + "language_loss": 0.8531146, + "learning_rate": 0.0009652360553233224, + "loss": 0.86461306, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 0.48901367, + "step": 758, + "time_per_iteration": 2.7501397132873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064081, + "balance_loss_mlp": 1.03528047, + "epoch": 0.14601769911504425, + "flos": 1557855866880.0, + "grad_norm": 0.02263224740377231, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.74837828, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 0.28710938, + "step": 759, + "time_per_iteration": 4.953639268875122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150341, + "balance_loss_mlp": 1.1019187, + "epoch": 0.1462100808003078, + "flos": 866301516288.0, + "grad_norm": 0.05750780582661247, + "language_loss": 0.83513778, + "learning_rate": 0.0009650074191575883, + "loss": 0.84664118, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 0.48388672, + "step": 760, + "time_per_iteration": 3.202252149581909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152626, + "balance_loss_mlp": 1.10179496, + "epoch": 0.14640246248557137, + "flos": 522943796736.0, + "grad_norm": 0.05303129095981597, + "language_loss": 0.88240772, + "learning_rate": 0.0009648928302546766, + "loss": 0.89393395, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 0.50878906, + "step": 761, + "time_per_iteration": 2.65380859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147286, + "balance_loss_mlp": 1.09960222, + "epoch": 0.14659484417083493, + "flos": 1030544487936.0, + "grad_norm": 0.06114398209353547, + "language_loss": 0.87573165, + "learning_rate": 0.0009647780608643613, + "loss": 0.88720453, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 0.47705078, + "step": 762, + "time_per_iteration": 3.3394339084625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153999, + "balance_loss_mlp": 1.10831833, + "epoch": 0.1467872258560985, + "flos": 500671922688.0, + "grad_norm": 0.09093438426480749, + "language_loss": 0.90765309, + "learning_rate": 0.0009646631110312001, + "loss": 0.91919315, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 0.45678711, + "step": 763, + "time_per_iteration": 2.622671604156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157806, + "balance_loss_mlp": 1.11200595, + "epoch": 0.14697960754136205, + "flos": 547797758976.0, + "grad_norm": 0.047784585244551814, + "language_loss": 0.90468627, + "learning_rate": 0.0009645479807998203, + "loss": 0.91626436, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 0.45751953, + "step": 764, + "time_per_iteration": 2.7322580814361572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156125, + "balance_loss_mlp": 1.11487842, + "epoch": 0.14717198922662564, + "flos": 517849003008.0, + "grad_norm": 0.06523928090243644, + "language_loss": 0.94106412, + "learning_rate": 0.0009644326702149196, + "loss": 0.95262539, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 0.41235352, + "step": 765, + "time_per_iteration": 2.7013158798217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174535, + "balance_loss_mlp": 1.12761474, + "epoch": 0.1473643709118892, + "flos": 732024552960.0, + "grad_norm": 0.08055574364553787, + "language_loss": 0.86730242, + "learning_rate": 0.0009643171793212653, + "loss": 0.87904775, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 0.46923828, + "step": 766, + "time_per_iteration": 3.083709478378296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162662, + "balance_loss_mlp": 1.11473966, + "epoch": 0.14755675259715276, + "flos": 620538900480.0, + "grad_norm": 0.07722330054572468, + "language_loss": 0.92188174, + "learning_rate": 0.0009642015081636952, + "loss": 0.93350834, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 0.47949219, + "step": 767, + "time_per_iteration": 2.6836585998535156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161372, + "balance_loss_mlp": 1.1132586, + "epoch": 0.14774913428241632, + "flos": 452219065344.0, + "grad_norm": 0.07123168873353844, + "language_loss": 0.90995437, + "learning_rate": 0.0009640856567871166, + "loss": 0.9215681, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 0.48168945, + "step": 768, + "time_per_iteration": 2.543670177459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156907, + "balance_loss_mlp": 1.10626745, + "epoch": 0.14794151596767988, + "flos": 837234869760.0, + "grad_norm": 0.07039727350928661, + "language_loss": 0.9123286, + "learning_rate": 0.0009639696252365072, + "loss": 0.92389768, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 0.50634766, + "step": 769, + "time_per_iteration": 3.027188539505005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146634, + "balance_loss_mlp": 1.10326576, + "epoch": 0.14813389765294344, + "flos": 686092114944.0, + "grad_norm": 0.06094559984807647, + "language_loss": 0.83659029, + "learning_rate": 0.0009638534135569144, + "loss": 0.84805667, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 0.43359375, + "step": 770, + "time_per_iteration": 2.9126267433166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140788, + "balance_loss_mlp": 1.09489226, + "epoch": 0.148326279338207, + "flos": 509887996416.0, + "grad_norm": 0.06702358278695762, + "language_loss": 0.92293191, + "learning_rate": 0.0009637370217934554, + "loss": 0.93433982, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 0.45922852, + "step": 771, + "time_per_iteration": 2.6426541805267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129633, + "balance_loss_mlp": 1.08600211, + "epoch": 0.14851866102347056, + "flos": 588161129472.0, + "grad_norm": 0.04968709901212579, + "language_loss": 0.84857935, + "learning_rate": 0.0009636204499913175, + "loss": 0.85987568, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 0.43603516, + "step": 772, + "time_per_iteration": 2.830029010772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122236, + "balance_loss_mlp": 1.08478057, + "epoch": 0.14871104270873411, + "flos": 691026494976.0, + "grad_norm": 0.06444605868824185, + "language_loss": 0.90028566, + "learning_rate": 0.0009635036981957581, + "loss": 0.91150796, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 0.37451172, + "step": 773, + "time_per_iteration": 2.850893259048462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128047, + "balance_loss_mlp": 1.08546507, + "epoch": 0.1489034243939977, + "flos": 655098264576.0, + "grad_norm": 0.07558916443605426, + "language_loss": 0.92137265, + "learning_rate": 0.0009633867664521043, + "loss": 0.93265319, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 0.42553711, + "step": 774, + "time_per_iteration": 2.8405416011810303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154281, + "balance_loss_mlp": 1.10614467, + "epoch": 0.14909580607926126, + "flos": 475835212800.0, + "grad_norm": 0.07793461844194936, + "language_loss": 0.8938297, + "learning_rate": 0.0009632696548057527, + "loss": 0.9053725, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 0.48168945, + "step": 775, + "time_per_iteration": 2.5543088912963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158921, + "balance_loss_mlp": 1.11419404, + "epoch": 0.14928818776452482, + "flos": 611087887872.0, + "grad_norm": 0.07948352168051111, + "language_loss": 0.86982578, + "learning_rate": 0.0009631523633021704, + "loss": 0.88141501, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 0.44702148, + "step": 776, + "time_per_iteration": 2.8373982906341553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151944, + "balance_loss_mlp": 1.10726452, + "epoch": 0.14948056944978838, + "flos": 561772744704.0, + "grad_norm": 0.07613081492567164, + "language_loss": 0.90593684, + "learning_rate": 0.0009630348919868936, + "loss": 0.91745627, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 0.4465332, + "step": 777, + "time_per_iteration": 2.688340187072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164011, + "balance_loss_mlp": 1.1162796, + "epoch": 0.14967295113505194, + "flos": 449199779328.0, + "grad_norm": 0.07284380806791231, + "language_loss": 0.83743048, + "learning_rate": 0.0009629172409055293, + "loss": 0.84907055, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 0.47753906, + "step": 778, + "time_per_iteration": 2.496121406555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173184, + "balance_loss_mlp": 1.13260555, + "epoch": 0.1498653328203155, + "flos": 571285426176.0, + "grad_norm": 0.0582041699055768, + "language_loss": 0.89173234, + "learning_rate": 0.0009627994101037531, + "loss": 0.9034642, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 0.40576172, + "step": 779, + "time_per_iteration": 2.7287445068359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116917, + "balance_loss_mlp": 1.12670779, + "epoch": 0.15005771450557906, + "flos": 631215244800.0, + "grad_norm": 0.06429714570378213, + "language_loss": 0.91374522, + "learning_rate": 0.0009626813996273114, + "loss": 0.92543697, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 0.42431641, + "step": 780, + "time_per_iteration": 2.8357532024383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174237, + "balance_loss_mlp": 1.13258517, + "epoch": 0.15025009619084262, + "flos": 577939235328.0, + "grad_norm": 0.07735356487079731, + "language_loss": 0.90820873, + "learning_rate": 0.0009625632095220198, + "loss": 0.91995108, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 0.41625977, + "step": 781, + "time_per_iteration": 2.8360986709594727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165333, + "balance_loss_mlp": 1.12408686, + "epoch": 0.1504424778761062, + "flos": 483887623680.0, + "grad_norm": 0.07591811383481707, + "language_loss": 0.88784671, + "learning_rate": 0.0009624448398337637, + "loss": 0.89950007, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 0.41259766, + "step": 782, + "time_per_iteration": 2.550873041152954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138762, + "balance_loss_mlp": 1.09920812, + "epoch": 0.15063485956136977, + "flos": 762512196096.0, + "grad_norm": 0.06500535683801296, + "language_loss": 0.90907973, + "learning_rate": 0.0009623262906084984, + "loss": 0.92046738, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 0.39550781, + "step": 783, + "time_per_iteration": 3.002237319946289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127947, + "balance_loss_mlp": 1.08622408, + "epoch": 0.15082724124663333, + "flos": 497630241792.0, + "grad_norm": 0.06722303964642193, + "language_loss": 0.92323947, + "learning_rate": 0.0009622075618922486, + "loss": 0.93451893, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 0.41699219, + "step": 784, + "time_per_iteration": 2.669541120529175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117088, + "balance_loss_mlp": 1.07636571, + "epoch": 0.15101962293189689, + "flos": 509725011456.0, + "grad_norm": 0.06286377137641418, + "language_loss": 0.88948303, + "learning_rate": 0.0009620886537311091, + "loss": 0.90065384, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 0.40722656, + "step": 785, + "time_per_iteration": 2.6505391597747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132411, + "balance_loss_mlp": 1.08563375, + "epoch": 0.15121200461716044, + "flos": 457756199424.0, + "grad_norm": 0.06858268632652799, + "language_loss": 0.87318397, + "learning_rate": 0.000961969566171244, + "loss": 0.88450807, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 0.46777344, + "step": 786, + "time_per_iteration": 2.5492002964019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143812, + "balance_loss_mlp": 1.10037243, + "epoch": 0.151404386302424, + "flos": 537986271744.0, + "grad_norm": 0.06762455123923776, + "language_loss": 0.9226557, + "learning_rate": 0.0009618502992588873, + "loss": 0.93409383, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 0.43481445, + "step": 787, + "time_per_iteration": 2.6596381664276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153043, + "balance_loss_mlp": 1.10714722, + "epoch": 0.15159676798768756, + "flos": 688209467904.0, + "grad_norm": 0.07210135364095939, + "language_loss": 0.90213263, + "learning_rate": 0.0009617308530403424, + "loss": 0.91366303, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 0.45922852, + "step": 788, + "time_per_iteration": 2.9965012073516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133874, + "balance_loss_mlp": 1.09358144, + "epoch": 0.15178914967295112, + "flos": 545319558144.0, + "grad_norm": 0.0646084728999688, + "language_loss": 0.89177096, + "learning_rate": 0.0009616112275619825, + "loss": 0.90310967, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 0.40283203, + "step": 789, + "time_per_iteration": 2.702927350997925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128515, + "balance_loss_mlp": 1.08760214, + "epoch": 0.1519815313582147, + "flos": 511770783744.0, + "grad_norm": 0.04914514873585108, + "language_loss": 0.85434246, + "learning_rate": 0.0009614914228702503, + "loss": 0.86562753, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 0.40917969, + "step": 790, + "time_per_iteration": 2.734309196472168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120439, + "balance_loss_mlp": 1.08031344, + "epoch": 0.15217391304347827, + "flos": 684088187904.0, + "grad_norm": 0.0510031662309952, + "language_loss": 0.90581405, + "learning_rate": 0.0009613714390116581, + "loss": 0.91701841, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 0.40112305, + "step": 791, + "time_per_iteration": 2.9846036434173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119178, + "balance_loss_mlp": 1.07890868, + "epoch": 0.15236629472874183, + "flos": 644186981376.0, + "grad_norm": 0.06466161117660295, + "language_loss": 0.87842512, + "learning_rate": 0.0009612512760327879, + "loss": 0.88961697, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 0.40283203, + "step": 792, + "time_per_iteration": 2.879507303237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112132, + "balance_loss_mlp": 1.0749234, + "epoch": 0.1525586764140054, + "flos": 412876196352.0, + "grad_norm": 0.06761791569724282, + "language_loss": 0.86834276, + "learning_rate": 0.0009611309339802909, + "loss": 0.87955594, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 0.46435547, + "step": 793, + "time_per_iteration": 2.4628419876098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125819, + "balance_loss_mlp": 1.08180666, + "epoch": 0.15275105809926895, + "flos": 802801414656.0, + "grad_norm": 0.06955338926819006, + "language_loss": 0.85776877, + "learning_rate": 0.0009610104129008881, + "loss": 0.86902696, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 0.43994141, + "step": 794, + "time_per_iteration": 3.1157610416412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112048, + "balance_loss_mlp": 1.07751703, + "epoch": 0.1529434397845325, + "flos": 612422249472.0, + "grad_norm": 0.0812849574801687, + "language_loss": 0.89832217, + "learning_rate": 0.0009608897128413701, + "loss": 0.90952694, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 0.4296875, + "step": 795, + "time_per_iteration": 2.7580387592315674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121281, + "balance_loss_mlp": 1.08070254, + "epoch": 0.15313582146979607, + "flos": 615246243840.0, + "grad_norm": 0.07320179377966478, + "language_loss": 0.87414771, + "learning_rate": 0.0009607688338485965, + "loss": 0.88536048, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 0.40576172, + "step": 796, + "time_per_iteration": 2.8428006172180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112803, + "balance_loss_mlp": 1.08358848, + "epoch": 0.15332820315505963, + "flos": 793602593280.0, + "grad_norm": 0.08676784428227541, + "language_loss": 0.92063487, + "learning_rate": 0.0009606477759694969, + "loss": 0.93191516, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 0.4440918, + "step": 797, + "time_per_iteration": 3.0136139392852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129989, + "balance_loss_mlp": 1.08547592, + "epoch": 0.1535205848403232, + "flos": 550206950400.0, + "grad_norm": 0.07379760567815713, + "language_loss": 0.89430279, + "learning_rate": 0.0009605265392510703, + "loss": 0.90560269, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 0.44555664, + "step": 798, + "time_per_iteration": 2.604297161102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114561, + "balance_loss_mlp": 1.10169339, + "epoch": 0.15371296652558677, + "flos": 535947840000.0, + "grad_norm": 0.06797963908333281, + "language_loss": 0.93481082, + "learning_rate": 0.0009604051237403846, + "loss": 0.94626689, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 0.43896484, + "step": 799, + "time_per_iteration": 2.613255262374878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167168, + "balance_loss_mlp": 1.1217972, + "epoch": 0.15390534821085033, + "flos": 395219699712.0, + "grad_norm": 0.06891264186704958, + "language_loss": 0.88271165, + "learning_rate": 0.0009602835294845776, + "loss": 0.89438331, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 0.45361328, + "step": 800, + "time_per_iteration": 2.4739739894866943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011684, + "balance_loss_mlp": 1.12188447, + "epoch": 0.1540977298961139, + "flos": 535846523904.0, + "grad_norm": 0.06820302888180714, + "language_loss": 0.91848779, + "learning_rate": 0.0009601617565308565, + "loss": 0.93017173, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 0.46557617, + "step": 801, + "time_per_iteration": 2.599102020263672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196072, + "balance_loss_mlp": 1.14941311, + "epoch": 0.15429011158137745, + "flos": 723727664640.0, + "grad_norm": 0.08155438121007776, + "language_loss": 0.88506758, + "learning_rate": 0.0009600398049264977, + "loss": 0.89702827, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 0.46679688, + "step": 802, + "time_per_iteration": 2.9645981788635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193217, + "balance_loss_mlp": 1.14574742, + "epoch": 0.154482493266641, + "flos": 620516505600.0, + "grad_norm": 0.10468166660144326, + "language_loss": 0.93512642, + "learning_rate": 0.0009599176747188469, + "loss": 0.94705856, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 0.47485352, + "step": 803, + "time_per_iteration": 2.7997000217437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160191, + "balance_loss_mlp": 1.11856318, + "epoch": 0.15467487495190457, + "flos": 525624629760.0, + "grad_norm": 0.07174757520021151, + "language_loss": 0.84728193, + "learning_rate": 0.0009597953659553196, + "loss": 0.85888386, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 0.41625977, + "step": 804, + "time_per_iteration": 2.700530529022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133685, + "balance_loss_mlp": 1.09408379, + "epoch": 0.15486725663716813, + "flos": 527729872896.0, + "grad_norm": 0.4143347029392257, + "language_loss": 0.9033978, + "learning_rate": 0.0009596728786833997, + "loss": 0.91473466, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 0.39575195, + "step": 805, + "time_per_iteration": 2.6122889518737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150784, + "balance_loss_mlp": 1.10772574, + "epoch": 0.1550596383224317, + "flos": 1048549349376.0, + "grad_norm": 0.061887733402931855, + "language_loss": 0.91321814, + "learning_rate": 0.0009595502129506415, + "loss": 0.92472601, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 0.43066406, + "step": 806, + "time_per_iteration": 3.336061716079712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180508, + "balance_loss_mlp": 1.13694847, + "epoch": 0.15525202000769528, + "flos": 613716963840.0, + "grad_norm": 0.06807019640067784, + "language_loss": 0.84292483, + "learning_rate": 0.0009594273688046678, + "loss": 0.85472989, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 0.43579102, + "step": 807, + "time_per_iteration": 2.709182024002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210101, + "balance_loss_mlp": 1.15960383, + "epoch": 0.15544440169295884, + "flos": 533064374784.0, + "grad_norm": 0.0856522073787927, + "language_loss": 0.8780278, + "learning_rate": 0.000959304346293171, + "loss": 0.89012885, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 0.50512695, + "step": 808, + "time_per_iteration": 2.6307153701782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236008, + "balance_loss_mlp": 1.18305564, + "epoch": 0.1556367833782224, + "flos": 644723297280.0, + "grad_norm": 0.09531038088821206, + "language_loss": 0.90107393, + "learning_rate": 0.0009591811454639125, + "loss": 0.91343403, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 0.52954102, + "step": 809, + "time_per_iteration": 2.742725372314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197955, + "balance_loss_mlp": 1.15184498, + "epoch": 0.15582916506348596, + "flos": 543822211584.0, + "grad_norm": 0.06212883071305714, + "language_loss": 0.902493, + "learning_rate": 0.0009590577663647234, + "loss": 0.91447246, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 0.4609375, + "step": 810, + "time_per_iteration": 2.711411237716675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187108, + "balance_loss_mlp": 1.13837492, + "epoch": 0.15602154674874952, + "flos": 580034566656.0, + "grad_norm": 0.06321996034865444, + "language_loss": 0.88015836, + "learning_rate": 0.0009589342090435036, + "loss": 0.8920294, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 0.48779297, + "step": 811, + "time_per_iteration": 2.763784170150757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173697, + "balance_loss_mlp": 1.12610841, + "epoch": 0.15621392843401308, + "flos": 535248539136.0, + "grad_norm": 0.07315119709604147, + "language_loss": 0.89953744, + "learning_rate": 0.0009588104735482223, + "loss": 0.91127443, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 0.47631836, + "step": 812, + "time_per_iteration": 2.645106077194214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169234, + "balance_loss_mlp": 1.12019134, + "epoch": 0.15640631011927664, + "flos": 550903680000.0, + "grad_norm": 0.06895714089970095, + "language_loss": 0.86002952, + "learning_rate": 0.0009586865599269177, + "loss": 0.87172186, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 0.49047852, + "step": 813, + "time_per_iteration": 2.6313953399658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144128, + "balance_loss_mlp": 1.09851837, + "epoch": 0.1565986918045402, + "flos": 637478843904.0, + "grad_norm": 0.06467027207336487, + "language_loss": 0.90443802, + "learning_rate": 0.0009585624682276977, + "loss": 0.91587937, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 0.45605469, + "step": 814, + "time_per_iteration": 2.7377047538757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144046, + "balance_loss_mlp": 1.09705353, + "epoch": 0.15679107348980378, + "flos": 490810876416.0, + "grad_norm": 0.06824176290368998, + "language_loss": 0.89156437, + "learning_rate": 0.0009584381984987386, + "loss": 0.90300483, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 0.47021484, + "step": 815, + "time_per_iteration": 2.5524120330810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134862, + "balance_loss_mlp": 1.09225655, + "epoch": 0.15698345517506734, + "flos": 529951113216.0, + "grad_norm": 0.061358262400161866, + "language_loss": 0.92449033, + "learning_rate": 0.0009583137507882864, + "loss": 0.93583906, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 0.42626953, + "step": 816, + "time_per_iteration": 2.699207305908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134768, + "balance_loss_mlp": 1.08698916, + "epoch": 0.1571758368603309, + "flos": 546038682624.0, + "grad_norm": 0.06309616730716378, + "language_loss": 0.82620019, + "learning_rate": 0.000958189125144656, + "loss": 0.8375479, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 0.47851562, + "step": 817, + "time_per_iteration": 2.6626293659210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142354, + "balance_loss_mlp": 1.09493256, + "epoch": 0.15736821854559446, + "flos": 565649547264.0, + "grad_norm": 0.08013787804574789, + "language_loss": 0.90297949, + "learning_rate": 0.0009580643216162313, + "loss": 0.91440302, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 0.47436523, + "step": 818, + "time_per_iteration": 2.6708288192749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143086, + "balance_loss_mlp": 1.09368527, + "epoch": 0.15756060023085802, + "flos": 500956047360.0, + "grad_norm": 0.06582812199168771, + "language_loss": 0.82167578, + "learning_rate": 0.0009579393402514652, + "loss": 0.83310658, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 0.49389648, + "step": 819, + "time_per_iteration": 2.577592611312866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142904, + "balance_loss_mlp": 1.09898734, + "epoch": 0.15775298191612158, + "flos": 519264857088.0, + "grad_norm": 0.07647809261390527, + "language_loss": 0.92362559, + "learning_rate": 0.0009578141810988801, + "loss": 0.93505466, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 0.43920898, + "step": 820, + "time_per_iteration": 2.5464515686035156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152369, + "balance_loss_mlp": 1.10678363, + "epoch": 0.15794536360138514, + "flos": 466129810944.0, + "grad_norm": 0.07136182637629812, + "language_loss": 0.92042351, + "learning_rate": 0.0009576888442070668, + "loss": 0.93194717, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 0.45556641, + "step": 821, + "time_per_iteration": 2.5755786895751953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114609, + "balance_loss_mlp": 1.10288835, + "epoch": 0.1581377452866487, + "flos": 517162185216.0, + "grad_norm": 0.08295395391365894, + "language_loss": 0.94583452, + "learning_rate": 0.0009575633296246854, + "loss": 0.95729542, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 0.43212891, + "step": 822, + "time_per_iteration": 2.5701425075531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162298, + "balance_loss_mlp": 1.11821485, + "epoch": 0.15833012697191226, + "flos": 549784433664.0, + "grad_norm": 0.06548151577025092, + "language_loss": 0.85385978, + "learning_rate": 0.0009574376374004652, + "loss": 0.86548281, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 0.44116211, + "step": 823, + "time_per_iteration": 2.622905731201172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169738, + "balance_loss_mlp": 1.12019491, + "epoch": 0.15852250865717585, + "flos": 487457906688.0, + "grad_norm": 0.1009087476503521, + "language_loss": 0.82624936, + "learning_rate": 0.000957311767583204, + "loss": 0.83794677, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 0.49536133, + "step": 824, + "time_per_iteration": 2.5683999061584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196161, + "balance_loss_mlp": 1.1752758, + "epoch": 0.1587148903424394, + "flos": 1309770694656.0, + "grad_norm": 0.05150472419389455, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.83267754, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 0.20898438, + "step": 825, + "time_per_iteration": 4.722898960113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176776, + "balance_loss_mlp": 1.12170124, + "epoch": 0.15890727202770297, + "flos": 466873528320.0, + "grad_norm": 0.10062471557735768, + "language_loss": 0.94017303, + "learning_rate": 0.0009570594953650961, + "loss": 0.95194077, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 0.55029297, + "step": 826, + "time_per_iteration": 2.5394840240478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173437, + "balance_loss_mlp": 1.12091362, + "epoch": 0.15909965371296653, + "flos": 777107188224.0, + "grad_norm": 0.0719939675894647, + "language_loss": 0.8219676, + "learning_rate": 0.00095693309306219, + "loss": 0.83370197, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 0.52612305, + "step": 827, + "time_per_iteration": 3.0926811695098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178421, + "balance_loss_mlp": 1.12434745, + "epoch": 0.1592920353982301, + "flos": 1078273451520.0, + "grad_norm": 0.06038838021195225, + "language_loss": 0.90083122, + "learning_rate": 0.0009568065133621244, + "loss": 0.91261542, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 0.54077148, + "step": 828, + "time_per_iteration": 3.315122604370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164888, + "balance_loss_mlp": 1.12013662, + "epoch": 0.15948441708349365, + "flos": 725622935040.0, + "grad_norm": 0.07025990147709567, + "language_loss": 0.87178355, + "learning_rate": 0.0009566797563140422, + "loss": 0.88343245, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 0.44775391, + "step": 829, + "time_per_iteration": 2.8680243492126465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116659, + "balance_loss_mlp": 1.11912107, + "epoch": 0.1596767987687572, + "flos": 578771785728.0, + "grad_norm": 0.061296828426512996, + "language_loss": 0.89984798, + "learning_rate": 0.0009565528219671547, + "loss": 0.91151381, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 0.47460938, + "step": 830, + "time_per_iteration": 2.9325318336486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160899, + "balance_loss_mlp": 1.1076839, + "epoch": 0.15986918045402077, + "flos": 528987511296.0, + "grad_norm": 0.07652275644998038, + "language_loss": 0.86699682, + "learning_rate": 0.0009564257103707418, + "loss": 0.87860584, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 0.53198242, + "step": 831, + "time_per_iteration": 2.598191976547241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184474, + "balance_loss_mlp": 1.12973261, + "epoch": 0.16006156213928435, + "flos": 574584067584.0, + "grad_norm": 0.08337472663089728, + "language_loss": 0.92543364, + "learning_rate": 0.0009562984215741533, + "loss": 0.93727839, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 0.54736328, + "step": 832, + "time_per_iteration": 2.676666736602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160796, + "balance_loss_mlp": 1.11177731, + "epoch": 0.1602539438245479, + "flos": 515541127680.0, + "grad_norm": 0.05762908483075192, + "language_loss": 0.8408711, + "learning_rate": 0.0009561709556268065, + "loss": 0.85247904, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 0.49047852, + "step": 833, + "time_per_iteration": 2.7075538635253906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162528, + "balance_loss_mlp": 1.11141133, + "epoch": 0.16044632550981147, + "flos": 621015745536.0, + "grad_norm": 0.06044842900072245, + "language_loss": 0.96042889, + "learning_rate": 0.0009560433125781884, + "loss": 0.97205412, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 0.51171875, + "step": 834, + "time_per_iteration": 2.7619521617889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144978, + "balance_loss_mlp": 1.09130979, + "epoch": 0.16063870719507503, + "flos": 561078586368.0, + "grad_norm": 0.06441579465763399, + "language_loss": 0.94159138, + "learning_rate": 0.0009559154924778544, + "loss": 0.95304114, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 0.53686523, + "step": 835, + "time_per_iteration": 2.7467222213745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112951, + "balance_loss_mlp": 1.08218372, + "epoch": 0.1608310888803386, + "flos": 805133882880.0, + "grad_norm": 0.07312538570388089, + "language_loss": 0.86469144, + "learning_rate": 0.0009557874953754284, + "loss": 0.87598646, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 0.47314453, + "step": 836, + "time_per_iteration": 3.0907793045043945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126281, + "balance_loss_mlp": 1.07618928, + "epoch": 0.16102347056560215, + "flos": 600587011584.0, + "grad_norm": 0.08101808751207061, + "language_loss": 0.85894346, + "learning_rate": 0.0009556593213206038, + "loss": 0.87020624, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 0.5012207, + "step": 837, + "time_per_iteration": 2.7060487270355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122479, + "balance_loss_mlp": 1.07765627, + "epoch": 0.1612158522508657, + "flos": 553510361088.0, + "grad_norm": 0.060960398488271, + "language_loss": 0.89031309, + "learning_rate": 0.0009555309703631414, + "loss": 0.9015379, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 0.44848633, + "step": 838, + "time_per_iteration": 2.6838622093200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131245, + "balance_loss_mlp": 1.07853079, + "epoch": 0.16140823393612927, + "flos": 555963969024.0, + "grad_norm": 0.0637381399971671, + "language_loss": 0.88547724, + "learning_rate": 0.0009554024425528722, + "loss": 0.89678967, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 0.52685547, + "step": 839, + "time_per_iteration": 2.7301504611968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124486, + "balance_loss_mlp": 1.07978272, + "epoch": 0.16160061562139286, + "flos": 543871770624.0, + "grad_norm": 0.0692663948027758, + "language_loss": 0.90811443, + "learning_rate": 0.0009552737379396948, + "loss": 0.91935933, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 0.44726562, + "step": 840, + "time_per_iteration": 2.6181893348693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129368, + "balance_loss_mlp": 1.08208978, + "epoch": 0.16179299730665642, + "flos": 603873169920.0, + "grad_norm": 0.06449676765287365, + "language_loss": 0.89640445, + "learning_rate": 0.0009551448565735767, + "loss": 0.90769809, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 0.47265625, + "step": 841, + "time_per_iteration": 2.731602907180786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135502, + "balance_loss_mlp": 1.08555281, + "epoch": 0.16198537899191998, + "flos": 787166097408.0, + "grad_norm": 0.07291825437583387, + "language_loss": 0.86443651, + "learning_rate": 0.0009550157985045543, + "loss": 0.87579155, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 0.49926758, + "step": 842, + "time_per_iteration": 3.0523600578308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113388, + "balance_loss_mlp": 1.08724499, + "epoch": 0.16217776067718354, + "flos": 519805942272.0, + "grad_norm": 0.06222432903322319, + "language_loss": 0.90556312, + "learning_rate": 0.0009548865637827321, + "loss": 0.91690183, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 0.46630859, + "step": 843, + "time_per_iteration": 2.6370396614074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113271, + "balance_loss_mlp": 1.08757734, + "epoch": 0.1623701423624471, + "flos": 505262707200.0, + "grad_norm": 0.07459586377830821, + "language_loss": 0.91347718, + "learning_rate": 0.0009547571524582838, + "loss": 0.92480427, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 0.45141602, + "step": 844, + "time_per_iteration": 2.5717859268188477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142931, + "balance_loss_mlp": 1.09460354, + "epoch": 0.16256252404771065, + "flos": 497183132160.0, + "grad_norm": 0.08463351541898638, + "language_loss": 0.94371468, + "learning_rate": 0.0009546275645814512, + "loss": 0.95514405, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 0.48339844, + "step": 845, + "time_per_iteration": 2.632861375808716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117157, + "balance_loss_mlp": 1.12107265, + "epoch": 0.16275490573297421, + "flos": 502344737280.0, + "grad_norm": 0.08033911629378378, + "language_loss": 0.92129737, + "learning_rate": 0.0009544978002025446, + "loss": 0.93301302, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 0.50561523, + "step": 846, + "time_per_iteration": 2.7044737339019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193795, + "balance_loss_mlp": 1.14096177, + "epoch": 0.16294728741823777, + "flos": 507231756288.0, + "grad_norm": 0.052695226385161484, + "language_loss": 0.88037688, + "learning_rate": 0.0009543678593719434, + "loss": 0.89231491, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 0.52880859, + "step": 847, + "time_per_iteration": 2.798231601715088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208721, + "balance_loss_mlp": 1.15734136, + "epoch": 0.16313966910350133, + "flos": 509685364224.0, + "grad_norm": 0.056853368929671785, + "language_loss": 0.88963962, + "learning_rate": 0.0009542377421400945, + "loss": 0.90172684, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 0.51391602, + "step": 848, + "time_per_iteration": 2.7955727577209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122402, + "balance_loss_mlp": 1.16584587, + "epoch": 0.16333205078876492, + "flos": 543980427264.0, + "grad_norm": 0.06352967983147602, + "language_loss": 0.85259467, + "learning_rate": 0.0009541074485575145, + "loss": 0.86483485, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 0.58154297, + "step": 849, + "time_per_iteration": 2.703871488571167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225002, + "balance_loss_mlp": 1.17088127, + "epoch": 0.16352443247402848, + "flos": 507723655680.0, + "grad_norm": 0.07774946886845908, + "language_loss": 0.93468195, + "learning_rate": 0.0009539769786747874, + "loss": 0.94693196, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 0.54125977, + "step": 850, + "time_per_iteration": 2.6687557697296143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012154, + "balance_loss_mlp": 1.16130245, + "epoch": 0.16371681415929204, + "flos": 542124804096.0, + "grad_norm": 0.057605035940766894, + "language_loss": 0.82393861, + "learning_rate": 0.0009538463325425665, + "loss": 0.83609259, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 0.54101562, + "step": 851, + "time_per_iteration": 2.751335382461548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199288, + "balance_loss_mlp": 1.1491015, + "epoch": 0.1639091958445556, + "flos": 520752291840.0, + "grad_norm": 0.06621147850271279, + "language_loss": 0.87526274, + "learning_rate": 0.0009537155102115728, + "loss": 0.88725561, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 0.50170898, + "step": 852, + "time_per_iteration": 2.568573474884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168884, + "balance_loss_mlp": 1.12236834, + "epoch": 0.16410157752981916, + "flos": 547414889472.0, + "grad_norm": 0.07419725806034035, + "language_loss": 0.85374665, + "learning_rate": 0.0009535845117325961, + "loss": 0.86543554, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 0.46533203, + "step": 853, + "time_per_iteration": 2.628973960876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137862, + "balance_loss_mlp": 1.09511375, + "epoch": 0.16429395921508272, + "flos": 582853791744.0, + "grad_norm": 0.05551255594321189, + "language_loss": 0.94495642, + "learning_rate": 0.0009534533371564946, + "loss": 0.95633507, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 0.42724609, + "step": 854, + "time_per_iteration": 2.780510902404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133546, + "balance_loss_mlp": 1.09003448, + "epoch": 0.16448634090034628, + "flos": 530934538752.0, + "grad_norm": 0.08632067881035285, + "language_loss": 0.90547508, + "learning_rate": 0.0009533219865341949, + "loss": 0.91681051, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 0.43530273, + "step": 855, + "time_per_iteration": 2.583874464035034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116377, + "balance_loss_mlp": 1.07188785, + "epoch": 0.16467872258560984, + "flos": 491890475520.0, + "grad_norm": 0.06082853882497287, + "language_loss": 0.88071746, + "learning_rate": 0.0009531904599166916, + "loss": 0.89188123, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 0.44482422, + "step": 856, + "time_per_iteration": 2.626354217529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107396, + "balance_loss_mlp": 1.06231081, + "epoch": 0.16487110427087343, + "flos": 506263385088.0, + "grad_norm": 0.0709999882269981, + "language_loss": 0.86807954, + "learning_rate": 0.0009530587573550478, + "loss": 0.87915355, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 0.45068359, + "step": 857, + "time_per_iteration": 2.5761454105377197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142125, + "balance_loss_mlp": 1.11237001, + "epoch": 0.16506348595613698, + "flos": 1432824712704.0, + "grad_norm": 0.04095057850479287, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75461513, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 0.296875, + "step": 858, + "time_per_iteration": 5.055138349533081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101038, + "balance_loss_mlp": 1.06165087, + "epoch": 0.16525586764140054, + "flos": 477129927168.0, + "grad_norm": 0.08838989258306214, + "language_loss": 0.91845137, + "learning_rate": 0.0009527948246039337, + "loss": 0.92946172, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 0.39379883, + "step": 859, + "time_per_iteration": 2.582608461380005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111338, + "balance_loss_mlp": 1.0715934, + "epoch": 0.1654482493266641, + "flos": 881096942592.0, + "grad_norm": 0.06489567580347368, + "language_loss": 0.89263308, + "learning_rate": 0.000952662594516931, + "loss": 0.90374649, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 0.39746094, + "step": 860, + "time_per_iteration": 3.067707061767578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110521, + "balance_loss_mlp": 1.07018054, + "epoch": 0.16564063101192766, + "flos": 626841773568.0, + "grad_norm": 0.055059247831062384, + "language_loss": 0.88479781, + "learning_rate": 0.0009525301886907234, + "loss": 0.89590299, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 0.40307617, + "step": 861, + "time_per_iteration": 2.8873865604400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112184, + "balance_loss_mlp": 1.07758975, + "epoch": 0.16583301269719122, + "flos": 561518355456.0, + "grad_norm": 0.06995538812096423, + "language_loss": 0.89499515, + "learning_rate": 0.0009523976071767155, + "loss": 0.90621358, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 0.44262695, + "step": 862, + "time_per_iteration": 2.6588613986968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124987, + "balance_loss_mlp": 1.08183372, + "epoch": 0.16602539438245478, + "flos": 567803976192.0, + "grad_norm": 0.06313062043432274, + "language_loss": 0.89038265, + "learning_rate": 0.00095226485002638, + "loss": 0.90163255, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 0.43115234, + "step": 863, + "time_per_iteration": 2.797896146774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113265, + "balance_loss_mlp": 1.07232881, + "epoch": 0.16621777606771834, + "flos": 574875532800.0, + "grad_norm": 0.054774526957085325, + "language_loss": 0.90381318, + "learning_rate": 0.0009521319172912576, + "loss": 0.91494584, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 0.40917969, + "step": 864, + "time_per_iteration": 2.7238612174987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126771, + "balance_loss_mlp": 1.08132839, + "epoch": 0.16641015775298193, + "flos": 514552932864.0, + "grad_norm": 0.05854649520245602, + "language_loss": 0.96491337, + "learning_rate": 0.0009519988090229579, + "loss": 0.97618109, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 0.4543457, + "step": 865, + "time_per_iteration": 2.683509111404419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123132, + "balance_loss_mlp": 1.07907248, + "epoch": 0.1666025394382455, + "flos": 621685310976.0, + "grad_norm": 0.05699467986566688, + "language_loss": 0.89545953, + "learning_rate": 0.0009518655252731576, + "loss": 0.90669084, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 0.44067383, + "step": 866, + "time_per_iteration": 2.729865550994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131126, + "balance_loss_mlp": 1.08456326, + "epoch": 0.16679492112350905, + "flos": 548808348672.0, + "grad_norm": 0.06482393342324422, + "language_loss": 0.9171015, + "learning_rate": 0.0009517320660936022, + "loss": 0.9284128, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 0.46557617, + "step": 867, + "time_per_iteration": 2.732815742492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133142, + "balance_loss_mlp": 1.08843839, + "epoch": 0.1669873028087726, + "flos": 665675864064.0, + "grad_norm": 0.06614373571764609, + "language_loss": 0.84472704, + "learning_rate": 0.0009515984315361051, + "loss": 0.85605848, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 0.44702148, + "step": 868, + "time_per_iteration": 2.796868085861206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121806, + "balance_loss_mlp": 1.07657838, + "epoch": 0.16717968449403617, + "flos": 538564432896.0, + "grad_norm": 0.08270078218547869, + "language_loss": 0.88773656, + "learning_rate": 0.000951464621652548, + "loss": 0.89895463, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 0.45239258, + "step": 869, + "time_per_iteration": 2.666438341140747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141117, + "balance_loss_mlp": 1.09751046, + "epoch": 0.16737206617929973, + "flos": 530121438720.0, + "grad_norm": 0.06072661062765564, + "language_loss": 0.80103016, + "learning_rate": 0.0009513306364948804, + "loss": 0.81244129, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 0.43579102, + "step": 870, + "time_per_iteration": 2.799009084701538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148868, + "balance_loss_mlp": 1.10373545, + "epoch": 0.1675644478645633, + "flos": 480774362112.0, + "grad_norm": 0.09261319168225486, + "language_loss": 0.90277344, + "learning_rate": 0.0009511964761151197, + "loss": 0.91426206, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 0.45117188, + "step": 871, + "time_per_iteration": 2.5934712886810303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158743, + "balance_loss_mlp": 1.1145407, + "epoch": 0.16775682954982685, + "flos": 494556627456.0, + "grad_norm": 0.06739805293344515, + "language_loss": 0.91524243, + "learning_rate": 0.0009510621405653521, + "loss": 0.92682987, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 0.44213867, + "step": 872, + "time_per_iteration": 2.5557620525360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156856, + "balance_loss_mlp": 1.11627746, + "epoch": 0.1679492112350904, + "flos": 752035912704.0, + "grad_norm": 0.06267535529199315, + "language_loss": 0.85553813, + "learning_rate": 0.0009509276298977309, + "loss": 0.86710668, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 0.40576172, + "step": 873, + "time_per_iteration": 2.9965007305145264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187981, + "balance_loss_mlp": 1.13760364, + "epoch": 0.168141592920354, + "flos": 1135875571200.0, + "grad_norm": 0.07409010972210926, + "language_loss": 0.82916558, + "learning_rate": 0.0009507929441644778, + "loss": 0.84104538, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 0.50415039, + "step": 874, + "time_per_iteration": 3.5573699474334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118449, + "balance_loss_mlp": 1.14097893, + "epoch": 0.16833397460561755, + "flos": 632401302528.0, + "grad_norm": 0.07388150752212762, + "language_loss": 0.8737148, + "learning_rate": 0.0009506580834178826, + "loss": 0.88555974, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 0.43530273, + "step": 875, + "time_per_iteration": 2.7659120559692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215839, + "balance_loss_mlp": 1.16841793, + "epoch": 0.1685263562908811, + "flos": 541445326848.0, + "grad_norm": 0.06935842584614806, + "language_loss": 0.92793226, + "learning_rate": 0.0009505230477103028, + "loss": 0.94009066, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 0.47436523, + "step": 876, + "time_per_iteration": 2.7306137084960938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226641, + "balance_loss_mlp": 1.18224776, + "epoch": 0.16871873797614467, + "flos": 619325678592.0, + "grad_norm": 0.10053146783154573, + "language_loss": 0.82997662, + "learning_rate": 0.0009503878370941641, + "loss": 0.84224302, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 0.44433594, + "step": 877, + "time_per_iteration": 2.7356183528900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211684, + "balance_loss_mlp": 1.16793382, + "epoch": 0.16891111966140823, + "flos": 606344030208.0, + "grad_norm": 0.10508781605450683, + "language_loss": 0.9020679, + "learning_rate": 0.0009502524516219595, + "loss": 0.91418481, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 0.43798828, + "step": 878, + "time_per_iteration": 2.7525370121002197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185028, + "balance_loss_mlp": 1.14232683, + "epoch": 0.1691035013466718, + "flos": 552326874624.0, + "grad_norm": 0.07887273759437702, + "language_loss": 0.91364408, + "learning_rate": 0.0009501168913462506, + "loss": 0.92549431, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 0.42724609, + "step": 879, + "time_per_iteration": 2.7009639739990234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115086, + "balance_loss_mlp": 1.11919844, + "epoch": 0.16929588303193535, + "flos": 1476294377472.0, + "grad_norm": 0.04902821320434346, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.80272782, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 0.31640625, + "step": 880, + "time_per_iteration": 4.812703609466553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116281, + "balance_loss_mlp": 1.11748707, + "epoch": 0.1694882647171989, + "flos": 926248587264.0, + "grad_norm": 0.06555145426806878, + "language_loss": 0.86756283, + "learning_rate": 0.0009498452465949042, + "loss": 0.87919092, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 0.453125, + "step": 881, + "time_per_iteration": 3.230407476425171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159747, + "balance_loss_mlp": 1.1133033, + "epoch": 0.1696806464024625, + "flos": 546093010944.0, + "grad_norm": 0.0753185527775994, + "language_loss": 0.92756218, + "learning_rate": 0.0009497091622247285, + "loss": 0.93915963, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 0.46459961, + "step": 882, + "time_per_iteration": 2.7412030696868896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141259, + "balance_loss_mlp": 1.09734213, + "epoch": 0.16987302808772606, + "flos": 529234560000.0, + "grad_norm": 0.07197762243887564, + "language_loss": 0.94941783, + "learning_rate": 0.0009495729032619723, + "loss": 0.96083045, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 0.43945312, + "step": 883, + "time_per_iteration": 2.6705245971679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141279, + "balance_loss_mlp": 1.09724283, + "epoch": 0.17006540977298962, + "flos": 755178909696.0, + "grad_norm": 0.07033792867334165, + "language_loss": 0.85310471, + "learning_rate": 0.0009494364697595354, + "loss": 0.86451751, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 0.44018555, + "step": 884, + "time_per_iteration": 2.9024457931518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115677, + "balance_loss_mlp": 1.10977769, + "epoch": 0.17025779145825318, + "flos": 558800446464.0, + "grad_norm": 0.0673266035955572, + "language_loss": 0.90739167, + "learning_rate": 0.0009492998617703867, + "loss": 0.91895938, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 0.47045898, + "step": 885, + "time_per_iteration": 2.6497459411621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151127, + "balance_loss_mlp": 1.10813999, + "epoch": 0.17045017314351674, + "flos": 512213124096.0, + "grad_norm": 0.0863252086663651, + "language_loss": 0.89101255, + "learning_rate": 0.0009491630793475619, + "loss": 0.90252388, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 0.42993164, + "step": 886, + "time_per_iteration": 2.6258063316345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159756, + "balance_loss_mlp": 1.11231089, + "epoch": 0.1706425548287803, + "flos": 508941646848.0, + "grad_norm": 0.0686214928272948, + "language_loss": 0.85993534, + "learning_rate": 0.0009490261225441643, + "loss": 0.87153292, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 0.47412109, + "step": 887, + "time_per_iteration": 2.9036519527435303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168173, + "balance_loss_mlp": 1.12370825, + "epoch": 0.17083493651404386, + "flos": 717355408896.0, + "grad_norm": 0.07914830411429463, + "language_loss": 0.91452426, + "learning_rate": 0.0009488889914133656, + "loss": 0.92620599, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 0.44458008, + "step": 888, + "time_per_iteration": 3.0038132667541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155268, + "balance_loss_mlp": 1.10706019, + "epoch": 0.17102731819930742, + "flos": 559121647104.0, + "grad_norm": 0.07300075385020723, + "language_loss": 0.90558064, + "learning_rate": 0.0009487516860084047, + "loss": 0.91713333, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 0.48193359, + "step": 889, + "time_per_iteration": 2.7158679962158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147984, + "balance_loss_mlp": 1.0996089, + "epoch": 0.17121969988457098, + "flos": 494786423808.0, + "grad_norm": 0.09172908653222724, + "language_loss": 0.90068781, + "learning_rate": 0.0009486142063825884, + "loss": 0.91216767, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 0.48364258, + "step": 890, + "time_per_iteration": 2.5330443382263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084927, + "balance_loss_mlp": 1.06175303, + "epoch": 0.17141208156983456, + "flos": 1548889413120.0, + "grad_norm": 0.031797672969882694, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.73511147, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 0.23144531, + "step": 891, + "time_per_iteration": 4.953175783157349 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167611, + "balance_loss_mlp": 1.11835372, + "epoch": 0.17160446325509812, + "flos": 619565386752.0, + "grad_norm": 0.06989736404119995, + "language_loss": 0.91231126, + "learning_rate": 0.0009483387246819542, + "loss": 0.92398739, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 0.49243164, + "step": 892, + "time_per_iteration": 2.7500009536743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010653, + "balance_loss_mlp": 1.0426023, + "epoch": 0.17179684494036168, + "flos": 1381758206976.0, + "grad_norm": 0.022698270048783192, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.83350885, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 0.2265625, + "step": 893, + "time_per_iteration": 4.662828683853149 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166562, + "balance_loss_mlp": 1.12312233, + "epoch": 0.17198922662562524, + "flos": 492636764160.0, + "grad_norm": 0.06047387129149895, + "language_loss": 0.90360647, + "learning_rate": 0.0009480625467392688, + "loss": 0.91527206, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 0.43481445, + "step": 894, + "time_per_iteration": 2.615447521209717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046079, + "balance_loss_mlp": 1.02433491, + "epoch": 0.1721816083108888, + "flos": 1458318878208.0, + "grad_norm": 0.017910617622931155, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79040754, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 0.21777344, + "step": 895, + "time_per_iteration": 4.802469968795776 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196327, + "balance_loss_mlp": 1.15264833, + "epoch": 0.17237398999615236, + "flos": 528122654208.0, + "grad_norm": 0.0591778940977726, + "language_loss": 0.88960874, + "learning_rate": 0.0009477856729834196, + "loss": 0.90157199, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 0.43652344, + "step": 896, + "time_per_iteration": 2.743036985397339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214543, + "balance_loss_mlp": 1.17217648, + "epoch": 0.17256637168141592, + "flos": 603920157696.0, + "grad_norm": 0.09709817551063968, + "language_loss": 0.91585428, + "learning_rate": 0.0009476469753098809, + "loss": 0.92799973, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 0.42358398, + "step": 897, + "time_per_iteration": 2.688457489013672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206917, + "balance_loss_mlp": 1.16080689, + "epoch": 0.17275875336667948, + "flos": 509687935488.0, + "grad_norm": 0.08785360527314089, + "language_loss": 0.87616539, + "learning_rate": 0.0009475081038443738, + "loss": 0.88823456, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 0.46118164, + "step": 898, + "time_per_iteration": 2.5958664417266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178905, + "balance_loss_mlp": 1.13436794, + "epoch": 0.17295113505194307, + "flos": 665260687872.0, + "grad_norm": 0.08099470404026293, + "language_loss": 0.87109447, + "learning_rate": 0.0009473690586408124, + "loss": 0.88288355, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 0.44482422, + "step": 899, + "time_per_iteration": 2.885279417037964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176953, + "balance_loss_mlp": 1.13184392, + "epoch": 0.17314351673720663, + "flos": 555385807872.0, + "grad_norm": 0.060075693842180825, + "language_loss": 0.87349975, + "learning_rate": 0.0009472298397531792, + "loss": 0.88526928, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 0.45141602, + "step": 900, + "time_per_iteration": 2.6987335681915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117213, + "balance_loss_mlp": 1.12244344, + "epoch": 0.17333589842247019, + "flos": 503609716224.0, + "grad_norm": 0.06597136758704356, + "language_loss": 0.87749296, + "learning_rate": 0.0009470904472355235, + "loss": 0.88921428, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 0.49707031, + "step": 901, + "time_per_iteration": 2.6920526027679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133522, + "balance_loss_mlp": 1.08898544, + "epoch": 0.17352828010773375, + "flos": 556208446464.0, + "grad_norm": 0.06929151708835651, + "language_loss": 0.8084361, + "learning_rate": 0.0009469508811419626, + "loss": 0.81977129, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 0.44555664, + "step": 902, + "time_per_iteration": 2.7087764739990234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037611, + "balance_loss_mlp": 1.01825094, + "epoch": 0.1737206617929973, + "flos": 1554525292032.0, + "grad_norm": 0.018918236495105482, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.7265144, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 0.19335938, + "step": 903, + "time_per_iteration": 4.831868648529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130376, + "balance_loss_mlp": 1.08429003, + "epoch": 0.17391304347826086, + "flos": 516662945280.0, + "grad_norm": 0.06904883588321564, + "language_loss": 0.84871197, + "learning_rate": 0.0009466712284439292, + "loss": 0.86001575, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 0.46118164, + "step": 904, + "time_per_iteration": 2.727154493331909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135856, + "balance_loss_mlp": 1.08867335, + "epoch": 0.17410542516352442, + "flos": 541049974272.0, + "grad_norm": 0.0797697294198037, + "language_loss": 0.90077758, + "learning_rate": 0.0009465311419480276, + "loss": 0.9121362, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 0.47216797, + "step": 905, + "time_per_iteration": 2.659696340560913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130539, + "balance_loss_mlp": 1.0859549, + "epoch": 0.17429780684878798, + "flos": 623849651712.0, + "grad_norm": 0.0780460064240459, + "language_loss": 0.89685637, + "learning_rate": 0.0009463908820933622, + "loss": 0.90816176, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 0.44604492, + "step": 906, + "time_per_iteration": 2.845508337020874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153159, + "balance_loss_mlp": 1.10657179, + "epoch": 0.17449018853405157, + "flos": 575663666688.0, + "grad_norm": 0.06621529993663824, + "language_loss": 0.83420271, + "learning_rate": 0.0009462504489343868, + "loss": 0.84573436, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 0.46582031, + "step": 907, + "time_per_iteration": 2.7415342330932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152101, + "balance_loss_mlp": 1.10246193, + "epoch": 0.17468257021931513, + "flos": 533753763840.0, + "grad_norm": 0.0823987818854668, + "language_loss": 0.9018122, + "learning_rate": 0.0009461098425256222, + "loss": 0.91333324, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 0.49633789, + "step": 908, + "time_per_iteration": 2.5904529094696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160396, + "balance_loss_mlp": 1.11457169, + "epoch": 0.1748749519045787, + "flos": 540758509056.0, + "grad_norm": 0.0762262609163865, + "language_loss": 0.87090451, + "learning_rate": 0.0009459690629216567, + "loss": 0.88250846, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 0.45874023, + "step": 909, + "time_per_iteration": 2.61710524559021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155202, + "balance_loss_mlp": 1.10921121, + "epoch": 0.17506733358984225, + "flos": 498623579136.0, + "grad_norm": 0.06657664395828655, + "language_loss": 0.88943893, + "learning_rate": 0.0009458281101771457, + "loss": 0.90099096, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 0.46020508, + "step": 910, + "time_per_iteration": 2.6421282291412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176316, + "balance_loss_mlp": 1.12810779, + "epoch": 0.1752597152751058, + "flos": 622923125760.0, + "grad_norm": 0.08799417436837091, + "language_loss": 0.8354404, + "learning_rate": 0.0009456869843468122, + "loss": 0.84720349, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 0.48217773, + "step": 911, + "time_per_iteration": 2.8633837699890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178078, + "balance_loss_mlp": 1.12688971, + "epoch": 0.17545209696036937, + "flos": 520972176384.0, + "grad_norm": 0.08410877580390771, + "language_loss": 0.79552639, + "learning_rate": 0.0009455456854854459, + "loss": 0.80730712, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 0.51220703, + "step": 912, + "time_per_iteration": 2.661038875579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180916, + "balance_loss_mlp": 1.13564038, + "epoch": 0.17564447864563293, + "flos": 461988707328.0, + "grad_norm": 0.17307911593328887, + "language_loss": 0.85480136, + "learning_rate": 0.0009454042136479039, + "loss": 0.86661053, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 0.45263672, + "step": 913, + "time_per_iteration": 2.561790943145752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198041, + "balance_loss_mlp": 1.15183568, + "epoch": 0.1758368603308965, + "flos": 480655793664.0, + "grad_norm": 0.06959724621682493, + "language_loss": 0.8438077, + "learning_rate": 0.0009452625688891103, + "loss": 0.85578811, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 0.4621582, + "step": 914, + "time_per_iteration": 2.5396227836608887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092507, + "balance_loss_mlp": 1.07600832, + "epoch": 0.17602924201616005, + "flos": 1478942903808.0, + "grad_norm": 0.034614734916794516, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79827243, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 0.16503906, + "step": 915, + "time_per_iteration": 4.550157308578491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264875, + "balance_loss_mlp": 1.21347213, + "epoch": 0.17622162370142364, + "flos": 602301671424.0, + "grad_norm": 0.08235911171958209, + "language_loss": 0.94223297, + "learning_rate": 0.0009449787608278015, + "loss": 0.95488179, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 0.51489258, + "step": 916, + "time_per_iteration": 2.8292665481567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243722, + "balance_loss_mlp": 1.19525158, + "epoch": 0.1764140053866872, + "flos": 442699043328.0, + "grad_norm": 0.08361954447634375, + "language_loss": 0.9338274, + "learning_rate": 0.0009448365976354704, + "loss": 0.94626462, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 0.48461914, + "step": 917, + "time_per_iteration": 2.543883800506592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216482, + "balance_loss_mlp": 1.16622329, + "epoch": 0.17660638707195075, + "flos": 500607682560.0, + "grad_norm": 0.08482517786251102, + "language_loss": 0.91736883, + "learning_rate": 0.0009446942617422558, + "loss": 0.9295336, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 0.50317383, + "step": 918, + "time_per_iteration": 2.6130669116973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118235, + "balance_loss_mlp": 1.13740778, + "epoch": 0.17679876875721431, + "flos": 538892974080.0, + "grad_norm": 0.07957198864097685, + "language_loss": 0.8648746, + "learning_rate": 0.0009445517532034176, + "loss": 0.87669808, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 0.44970703, + "step": 919, + "time_per_iteration": 2.7341010570526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116033, + "balance_loss_mlp": 1.11002386, + "epoch": 0.17699115044247787, + "flos": 497724217344.0, + "grad_norm": 0.08371374964142012, + "language_loss": 0.9020586, + "learning_rate": 0.0009444090720742824, + "loss": 0.9136619, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 0.50341797, + "step": 920, + "time_per_iteration": 2.628169298171997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158892, + "balance_loss_mlp": 1.1083951, + "epoch": 0.17718353212774143, + "flos": 662738070528.0, + "grad_norm": 0.07483188289837522, + "language_loss": 0.89025688, + "learning_rate": 0.0009442662184102439, + "loss": 0.90184581, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 0.50512695, + "step": 921, + "time_per_iteration": 2.7538435459136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154828, + "balance_loss_mlp": 1.11210358, + "epoch": 0.177375913813005, + "flos": 582641247744.0, + "grad_norm": 0.05276545299780942, + "language_loss": 0.88537991, + "learning_rate": 0.000944123192266763, + "loss": 0.89692819, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 0.42724609, + "step": 922, + "time_per_iteration": 2.788759469985962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190709, + "balance_loss_mlp": 1.13887644, + "epoch": 0.17756829549826855, + "flos": 552564011520.0, + "grad_norm": 0.07681776188261369, + "language_loss": 0.84657156, + "learning_rate": 0.0009439799936993671, + "loss": 0.85847867, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 0.51904297, + "step": 923, + "time_per_iteration": 2.7123734951019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196866, + "balance_loss_mlp": 1.14787149, + "epoch": 0.17776067718353214, + "flos": 556322245632.0, + "grad_norm": 0.09732559260361714, + "language_loss": 0.89131558, + "learning_rate": 0.0009438366227636511, + "loss": 0.90328419, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 0.49047852, + "step": 924, + "time_per_iteration": 2.6907341480255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171193, + "balance_loss_mlp": 1.12396216, + "epoch": 0.1779530588687957, + "flos": 658458574848.0, + "grad_norm": 0.07379366042998667, + "language_loss": 0.86971134, + "learning_rate": 0.0009436930795152763, + "loss": 0.88142323, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 0.47241211, + "step": 925, + "time_per_iteration": 2.865673065185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168215, + "balance_loss_mlp": 1.12174773, + "epoch": 0.17814544055405926, + "flos": 644483589120.0, + "grad_norm": 0.07469970420174622, + "language_loss": 0.8767308, + "learning_rate": 0.0009435493640099713, + "loss": 0.88841295, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 0.46411133, + "step": 926, + "time_per_iteration": 2.779188394546509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154616, + "balance_loss_mlp": 1.10388088, + "epoch": 0.17833782223932282, + "flos": 460913877504.0, + "grad_norm": 0.06972760602295516, + "language_loss": 0.85458124, + "learning_rate": 0.0009434054763035314, + "loss": 0.86612737, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 0.50756836, + "step": 927, + "time_per_iteration": 2.5972957611083984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147995, + "balance_loss_mlp": 1.09983397, + "epoch": 0.17853020392458638, + "flos": 759539897856.0, + "grad_norm": 0.05666425765353489, + "language_loss": 0.86302543, + "learning_rate": 0.0009432614164518185, + "loss": 0.8745054, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 0.48168945, + "step": 928, + "time_per_iteration": 3.0064406394958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150828, + "balance_loss_mlp": 1.09780383, + "epoch": 0.17872258560984994, + "flos": 782666717184.0, + "grad_norm": 0.07484249942420804, + "language_loss": 0.85464913, + "learning_rate": 0.000943117184510762, + "loss": 0.86615741, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 0.53027344, + "step": 929, + "time_per_iteration": 2.9855945110321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124448, + "balance_loss_mlp": 1.10556555, + "epoch": 0.1789149672951135, + "flos": 1459880464896.0, + "grad_norm": 0.03465095249088487, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.79914415, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 0.18847656, + "step": 930, + "time_per_iteration": 5.016055583953857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148447, + "balance_loss_mlp": 1.09997642, + "epoch": 0.17910734898037706, + "flos": 503864105472.0, + "grad_norm": 0.07304481613225793, + "language_loss": 0.89790976, + "learning_rate": 0.0009428282045846674, + "loss": 0.90939426, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 0.48461914, + "step": 931, + "time_per_iteration": 2.787473678588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134009, + "balance_loss_mlp": 1.08797026, + "epoch": 0.17929973066564064, + "flos": 746249158656.0, + "grad_norm": 0.05043968313129053, + "language_loss": 0.90432143, + "learning_rate": 0.0009426834567118214, + "loss": 0.91566151, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 0.46044922, + "step": 932, + "time_per_iteration": 3.1106340885162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149699, + "balance_loss_mlp": 1.10091829, + "epoch": 0.1794921123509042, + "flos": 713214305280.0, + "grad_norm": 0.0884624873286247, + "language_loss": 0.81563932, + "learning_rate": 0.0009425385369740155, + "loss": 0.82713628, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 0.48779297, + "step": 933, + "time_per_iteration": 3.056328296661377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164469, + "balance_loss_mlp": 1.1138767, + "epoch": 0.17968449403616776, + "flos": 633142448640.0, + "grad_norm": 0.0672899912264689, + "language_loss": 0.88411558, + "learning_rate": 0.0009423934454275125, + "loss": 0.8957603, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 0.50561523, + "step": 934, + "time_per_iteration": 2.827507495880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162381, + "balance_loss_mlp": 1.11333871, + "epoch": 0.17987687572143132, + "flos": 536323368960.0, + "grad_norm": 0.07880287247644589, + "language_loss": 0.92845738, + "learning_rate": 0.0009422481821286418, + "loss": 0.94008112, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 0.49072266, + "step": 935, + "time_per_iteration": 2.7188265323638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164456, + "balance_loss_mlp": 1.11918044, + "epoch": 0.18006925740669488, + "flos": 538077676032.0, + "grad_norm": 0.07978340192275198, + "language_loss": 0.88968349, + "learning_rate": 0.0009421027471337998, + "loss": 0.90132797, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 0.45239258, + "step": 936, + "time_per_iteration": 2.6140947341918945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176728, + "balance_loss_mlp": 1.1271131, + "epoch": 0.18026163909195844, + "flos": 539510782464.0, + "grad_norm": 0.07049523693926517, + "language_loss": 0.83782339, + "learning_rate": 0.0009419571404994493, + "loss": 0.84959066, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 0.49584961, + "step": 937, + "time_per_iteration": 2.641847610473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162302, + "balance_loss_mlp": 1.11354589, + "epoch": 0.180454020777222, + "flos": 500642187264.0, + "grad_norm": 0.06745021535989586, + "language_loss": 0.91665328, + "learning_rate": 0.00094181136228212, + "loss": 0.92827624, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 0.48803711, + "step": 938, + "time_per_iteration": 2.622314453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146811, + "balance_loss_mlp": 1.10334706, + "epoch": 0.18064640246248556, + "flos": 498952120320.0, + "grad_norm": 0.06209482952821168, + "language_loss": 0.87085009, + "learning_rate": 0.0009416654125384077, + "loss": 0.88231826, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 0.43432617, + "step": 939, + "time_per_iteration": 2.735565423965454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167753, + "balance_loss_mlp": 1.15230346, + "epoch": 0.18083878414774912, + "flos": 1519313988096.0, + "grad_norm": 0.039552666267989665, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.80940127, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 0.15429688, + "step": 940, + "time_per_iteration": 4.9464662075042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147706, + "balance_loss_mlp": 1.10293126, + "epoch": 0.1810311658330127, + "flos": 727337594880.0, + "grad_norm": 0.06405620484007693, + "language_loss": 0.85002685, + "learning_rate": 0.000941372998698552, + "loss": 0.86150396, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 0.44750977, + "step": 941, + "time_per_iteration": 2.9421255588531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152914, + "balance_loss_mlp": 1.10344219, + "epoch": 0.18122354751827627, + "flos": 564923082240.0, + "grad_norm": 0.07883971857950696, + "language_loss": 0.82437575, + "learning_rate": 0.0009412265347159336, + "loss": 0.8359049, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 0.49487305, + "step": 942, + "time_per_iteration": 2.727071762084961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135445, + "balance_loss_mlp": 1.09083664, + "epoch": 0.18141592920353983, + "flos": 519282109440.0, + "grad_norm": 0.10057326993772005, + "language_loss": 0.85614288, + "learning_rate": 0.0009410798994339829, + "loss": 0.86749732, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 0.44604492, + "step": 943, + "time_per_iteration": 2.6305696964263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134993, + "balance_loss_mlp": 1.09248304, + "epoch": 0.1816083108888034, + "flos": 512470084608.0, + "grad_norm": 0.05478952043416941, + "language_loss": 0.88907182, + "learning_rate": 0.000940933092909628, + "loss": 0.90042174, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 0.42529297, + "step": 944, + "time_per_iteration": 2.631101369857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149384, + "balance_loss_mlp": 1.10530019, + "epoch": 0.18180069257406695, + "flos": 492389715456.0, + "grad_norm": 0.06051663433249254, + "language_loss": 0.84961444, + "learning_rate": 0.0009407861151998649, + "loss": 0.8611083, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 0.44067383, + "step": 945, + "time_per_iteration": 2.5717978477478027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116837, + "balance_loss_mlp": 1.12040067, + "epoch": 0.1819930742593305, + "flos": 570158839296.0, + "grad_norm": 0.06666982795430461, + "language_loss": 0.87044382, + "learning_rate": 0.0009406389663617552, + "loss": 0.88212758, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 0.47998047, + "step": 946, + "time_per_iteration": 2.6768407821655273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170796, + "balance_loss_mlp": 1.12757087, + "epoch": 0.18218545594459407, + "flos": 605975841792.0, + "grad_norm": 0.0759743739596538, + "language_loss": 0.87192827, + "learning_rate": 0.000940491646452427, + "loss": 0.88363624, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 0.43212891, + "step": 947, + "time_per_iteration": 2.7174758911132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174804, + "balance_loss_mlp": 1.1271199, + "epoch": 0.18237783762985763, + "flos": 548682439680.0, + "grad_norm": 0.06285362616764655, + "language_loss": 0.91503757, + "learning_rate": 0.000940344155529075, + "loss": 0.92678559, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 0.47680664, + "step": 948, + "time_per_iteration": 2.6130924224853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175811, + "balance_loss_mlp": 1.12643504, + "epoch": 0.1825702193151212, + "flos": 450741542400.0, + "grad_norm": 0.07182633578445446, + "language_loss": 0.88395435, + "learning_rate": 0.0009401964936489605, + "loss": 0.89571244, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 0.4934082, + "step": 949, + "time_per_iteration": 2.518735885620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154077, + "balance_loss_mlp": 1.11173368, + "epoch": 0.18276260100038477, + "flos": 589245871104.0, + "grad_norm": 0.08616214546245322, + "language_loss": 0.86381257, + "learning_rate": 0.0009400486608694108, + "loss": 0.87535334, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 0.42358398, + "step": 950, + "time_per_iteration": 2.7356269359588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147825, + "balance_loss_mlp": 1.10071373, + "epoch": 0.18295498268564833, + "flos": 787331653632.0, + "grad_norm": 0.05684050086710682, + "language_loss": 0.88146299, + "learning_rate": 0.0009399006572478195, + "loss": 0.89294124, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 0.47119141, + "step": 951, + "time_per_iteration": 3.0829784870147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113898, + "balance_loss_mlp": 1.09449124, + "epoch": 0.1831473643709119, + "flos": 578147010048.0, + "grad_norm": 0.06809630737889293, + "language_loss": 0.91594249, + "learning_rate": 0.0009397524828416468, + "loss": 0.92733228, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 0.44482422, + "step": 952, + "time_per_iteration": 2.710500478744507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141316, + "balance_loss_mlp": 1.09339356, + "epoch": 0.18333974605617545, + "flos": 566889933312.0, + "grad_norm": 0.06814185159234107, + "language_loss": 0.97457635, + "learning_rate": 0.0009396041377084192, + "loss": 0.98598951, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 0.47949219, + "step": 953, + "time_per_iteration": 2.6530585289001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011339, + "balance_loss_mlp": 1.08716977, + "epoch": 0.183532127741439, + "flos": 526993496064.0, + "grad_norm": 0.06688505748067412, + "language_loss": 0.88496006, + "learning_rate": 0.0009394556219057295, + "loss": 0.896299, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 0.46704102, + "step": 954, + "time_per_iteration": 2.662543773651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135904, + "balance_loss_mlp": 1.08948374, + "epoch": 0.18372450942670257, + "flos": 594535956480.0, + "grad_norm": 0.08148035498798997, + "language_loss": 0.84775722, + "learning_rate": 0.0009393069354912362, + "loss": 0.85911626, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 0.46386719, + "step": 955, + "time_per_iteration": 2.7262632846832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139168, + "balance_loss_mlp": 1.0954181, + "epoch": 0.18391689111196613, + "flos": 645032014848.0, + "grad_norm": 0.07343823471440349, + "language_loss": 0.83466816, + "learning_rate": 0.0009391580785226649, + "loss": 0.8460598, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 0.43774414, + "step": 956, + "time_per_iteration": 2.8661141395568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066727, + "balance_loss_mlp": 1.04708123, + "epoch": 0.18410927279722972, + "flos": 1457073349632.0, + "grad_norm": 0.029557521366383285, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.80407178, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 0.19628906, + "step": 957, + "time_per_iteration": 4.751030921936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134727, + "balance_loss_mlp": 1.08978534, + "epoch": 0.18430165448249328, + "flos": 658750040064.0, + "grad_norm": 0.06490118531587029, + "language_loss": 0.87677503, + "learning_rate": 0.0009388598531545196, + "loss": 0.88812232, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 0.44946289, + "step": 958, + "time_per_iteration": 2.8378970623016357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143042, + "balance_loss_mlp": 1.09702718, + "epoch": 0.18449403616775684, + "flos": 517933066752.0, + "grad_norm": 0.07391212127287443, + "language_loss": 0.86896807, + "learning_rate": 0.000938710484870727, + "loss": 0.88039851, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 0.46044922, + "step": 959, + "time_per_iteration": 4.31168794631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128823, + "balance_loss_mlp": 1.08416748, + "epoch": 0.1846864178530204, + "flos": 552749391360.0, + "grad_norm": 0.0638837232249089, + "language_loss": 0.86957002, + "learning_rate": 0.0009385609462644189, + "loss": 0.88085824, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 0.44702148, + "step": 960, + "time_per_iteration": 2.6793572902679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118991, + "balance_loss_mlp": 1.07233214, + "epoch": 0.18487879953828396, + "flos": 466166886912.0, + "grad_norm": 0.07248975394705585, + "language_loss": 0.86711299, + "learning_rate": 0.0009384112373936514, + "loss": 0.87830293, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 0.46679688, + "step": 961, + "time_per_iteration": 2.6220860481262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119858, + "balance_loss_mlp": 1.07334304, + "epoch": 0.18507118122354752, + "flos": 648496212480.0, + "grad_norm": 0.06813544125014795, + "language_loss": 0.92053163, + "learning_rate": 0.0009382613583165467, + "loss": 0.93173021, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 0.46533203, + "step": 962, + "time_per_iteration": 2.8032093048095703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108588, + "balance_loss_mlp": 1.06142831, + "epoch": 0.18526356290881107, + "flos": 626772764160.0, + "grad_norm": 0.07296294799157402, + "language_loss": 0.9064188, + "learning_rate": 0.0009381113090912928, + "loss": 0.91750467, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 0.47167969, + "step": 963, + "time_per_iteration": 2.7358789443969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109929, + "balance_loss_mlp": 1.06741881, + "epoch": 0.18545594459407463, + "flos": 432726769152.0, + "grad_norm": 0.07962159601741099, + "language_loss": 0.90353996, + "learning_rate": 0.000937961089776144, + "loss": 0.91463923, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 0.42480469, + "step": 964, + "time_per_iteration": 2.5761237144470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128672, + "balance_loss_mlp": 1.07924736, + "epoch": 0.1856483262793382, + "flos": 749061043200.0, + "grad_norm": 0.09082243760489998, + "language_loss": 0.83673573, + "learning_rate": 0.0009378107004294208, + "loss": 0.84802246, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 0.49438477, + "step": 965, + "time_per_iteration": 2.9681291580200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132442, + "balance_loss_mlp": 1.08542585, + "epoch": 0.18584070796460178, + "flos": 530326642176.0, + "grad_norm": 0.08405098410424734, + "language_loss": 0.92054594, + "learning_rate": 0.0009376601411095096, + "loss": 0.93187034, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 0.4699707, + "step": 966, + "time_per_iteration": 2.696122407913208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138698, + "balance_loss_mlp": 1.09773731, + "epoch": 0.18603308964986534, + "flos": 483106830336.0, + "grad_norm": 0.07104128547690361, + "language_loss": 0.87554526, + "learning_rate": 0.0009375094118748622, + "loss": 0.88693225, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 0.40991211, + "step": 967, + "time_per_iteration": 2.6025850772857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179223, + "balance_loss_mlp": 1.13373268, + "epoch": 0.1862254713351289, + "flos": 801316551168.0, + "grad_norm": 0.0728928893981835, + "language_loss": 0.91626799, + "learning_rate": 0.0009373585127839976, + "loss": 0.92806023, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 0.45507812, + "step": 968, + "time_per_iteration": 2.9854021072387695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212732, + "balance_loss_mlp": 1.16905367, + "epoch": 0.18641785302039246, + "flos": 478323325440.0, + "grad_norm": 0.08777237711590531, + "language_loss": 0.91368866, + "learning_rate": 0.0009372074438954994, + "loss": 0.92581606, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 0.43652344, + "step": 969, + "time_per_iteration": 2.5014536380767822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211792, + "balance_loss_mlp": 1.16539574, + "epoch": 0.18661023470565602, + "flos": 388911684096.0, + "grad_norm": 0.0704882552763471, + "language_loss": 0.92436379, + "learning_rate": 0.0009370562052680181, + "loss": 0.93648171, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 0.46411133, + "step": 970, + "time_per_iteration": 2.453458070755005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120766, + "balance_loss_mlp": 1.16183591, + "epoch": 0.18680261639091958, + "flos": 564676033536.0, + "grad_norm": 0.07372597108689087, + "language_loss": 0.89988613, + "learning_rate": 0.0009369047969602695, + "loss": 0.91196281, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 0.45825195, + "step": 971, + "time_per_iteration": 2.703948497772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192702, + "balance_loss_mlp": 1.14396954, + "epoch": 0.18699499807618314, + "flos": 479259763200.0, + "grad_norm": 0.08557962606734577, + "language_loss": 0.8750906, + "learning_rate": 0.0009367532190310357, + "loss": 0.88701761, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 0.48657227, + "step": 972, + "time_per_iteration": 4.1564977169036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148896, + "balance_loss_mlp": 1.1052649, + "epoch": 0.1871873797614467, + "flos": 553283136000.0, + "grad_norm": 0.06811184838385763, + "language_loss": 0.89467651, + "learning_rate": 0.0009366014715391644, + "loss": 0.90616548, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 0.43603516, + "step": 973, + "time_per_iteration": 2.695730209350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134701, + "balance_loss_mlp": 1.09307301, + "epoch": 0.18737976144671029, + "flos": 552811060224.0, + "grad_norm": 0.054567817192194557, + "language_loss": 0.84347546, + "learning_rate": 0.0009364495545435693, + "loss": 0.85482252, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 0.41625977, + "step": 974, + "time_per_iteration": 2.828831672668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146218, + "balance_loss_mlp": 1.09970224, + "epoch": 0.18757214313197385, + "flos": 502250761728.0, + "grad_norm": 0.08256927623824414, + "language_loss": 0.89333141, + "learning_rate": 0.0009362974681032297, + "loss": 0.90479362, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 0.46484375, + "step": 975, + "time_per_iteration": 2.5982418060302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143654, + "balance_loss_mlp": 1.09909391, + "epoch": 0.1877645248172374, + "flos": 675010506240.0, + "grad_norm": 0.07754570301250979, + "language_loss": 0.89447427, + "learning_rate": 0.0009361452122771907, + "loss": 0.90591079, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 0.44555664, + "step": 976, + "time_per_iteration": 2.881242275238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133689, + "balance_loss_mlp": 1.08834195, + "epoch": 0.18795690650250096, + "flos": 404989341696.0, + "grad_norm": 0.0965092241218366, + "language_loss": 0.84541976, + "learning_rate": 0.0009359927871245635, + "loss": 0.85675669, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 0.45361328, + "step": 977, + "time_per_iteration": 2.4720265865325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113596, + "balance_loss_mlp": 1.09039843, + "epoch": 0.18814928818776452, + "flos": 637891448832.0, + "grad_norm": 0.09227923665031239, + "language_loss": 0.87538362, + "learning_rate": 0.0009358401927045246, + "loss": 0.88674331, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 0.45581055, + "step": 978, + "time_per_iteration": 2.8225297927856445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140746, + "balance_loss_mlp": 1.0945406, + "epoch": 0.18834166987302808, + "flos": 1138282191360.0, + "grad_norm": 0.05953389716062443, + "language_loss": 0.88990903, + "learning_rate": 0.0009356874290763166, + "loss": 0.90131652, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 0.46264648, + "step": 979, + "time_per_iteration": 3.4754927158355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140535, + "balance_loss_mlp": 1.09494936, + "epoch": 0.18853405155829164, + "flos": 504793202688.0, + "grad_norm": 0.06969100284100371, + "language_loss": 0.89955008, + "learning_rate": 0.0009355344962992474, + "loss": 0.91095543, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 0.45581055, + "step": 980, + "time_per_iteration": 2.6008429527282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138568, + "balance_loss_mlp": 1.09291101, + "epoch": 0.1887264332435552, + "flos": 608177258496.0, + "grad_norm": 0.07021551702573088, + "language_loss": 0.88888156, + "learning_rate": 0.0009353813944326908, + "loss": 0.90026724, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 0.45654297, + "step": 981, + "time_per_iteration": 2.9102253913879395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141833, + "balance_loss_mlp": 1.09352899, + "epoch": 0.1889188149288188, + "flos": 552529506816.0, + "grad_norm": 0.0640154196439605, + "language_loss": 0.83560127, + "learning_rate": 0.0009352281235360863, + "loss": 0.84701967, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 0.4831543, + "step": 982, + "time_per_iteration": 2.690695285797119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149234, + "balance_loss_mlp": 1.10627127, + "epoch": 0.18911119661408235, + "flos": 418559063040.0, + "grad_norm": 0.06254433649037737, + "language_loss": 0.85791624, + "learning_rate": 0.0009350746836689389, + "loss": 0.86940861, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 0.4296875, + "step": 983, + "time_per_iteration": 2.524491548538208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104727, + "balance_loss_mlp": 1.02905524, + "epoch": 0.1893035782993459, + "flos": 1481974299648.0, + "grad_norm": 0.024687708549402564, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82486492, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.18261719, + "step": 984, + "time_per_iteration": 5.200335741043091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156154, + "balance_loss_mlp": 1.1069684, + "epoch": 0.18949595998460947, + "flos": 508467373056.0, + "grad_norm": 0.08202626484000469, + "language_loss": 0.84151661, + "learning_rate": 0.0009347672972613634, + "loss": 0.85307819, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.49145508, + "step": 985, + "time_per_iteration": 2.6939473152160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011517, + "balance_loss_mlp": 1.10756862, + "epoch": 0.18968834166987303, + "flos": 531087611904.0, + "grad_norm": 0.061889675774481866, + "language_loss": 0.8651796, + "learning_rate": 0.0009346133508402735, + "loss": 0.87669659, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.44140625, + "step": 986, + "time_per_iteration": 2.695004463195801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146205, + "balance_loss_mlp": 1.1000948, + "epoch": 0.1898807233551366, + "flos": 499762649088.0, + "grad_norm": 0.07730871241699967, + "language_loss": 0.84821075, + "learning_rate": 0.0009344592356873166, + "loss": 0.85967278, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.46118164, + "step": 987, + "time_per_iteration": 2.635143518447876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143677, + "balance_loss_mlp": 1.0975666, + "epoch": 0.19007310504040015, + "flos": 602220178944.0, + "grad_norm": 0.058246004489727894, + "language_loss": 0.79289091, + "learning_rate": 0.0009343049518623255, + "loss": 0.80432773, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.46142578, + "step": 988, + "time_per_iteration": 2.7257165908813477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126709, + "balance_loss_mlp": 1.08503366, + "epoch": 0.1902654867256637, + "flos": 601651929600.0, + "grad_norm": 0.06464318177286693, + "language_loss": 0.83752143, + "learning_rate": 0.0009341504994251985, + "loss": 0.84878862, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.41674805, + "step": 989, + "time_per_iteration": 2.8336057662963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052089, + "balance_loss_mlp": 1.03692603, + "epoch": 0.19045786841092727, + "flos": 1575784005120.0, + "grad_norm": 0.01962059038868396, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74572587, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.15136719, + "step": 990, + "time_per_iteration": 4.980287551879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118455, + "balance_loss_mlp": 1.07682681, + "epoch": 0.19065025009619085, + "flos": 681634579968.0, + "grad_norm": 0.06360467015426281, + "language_loss": 0.82411575, + "learning_rate": 0.0009338410889544574, + "loss": 0.83530033, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 0.41601562, + "step": 991, + "time_per_iteration": 3.0192768573760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123102, + "balance_loss_mlp": 1.0790422, + "epoch": 0.1908426317814544, + "flos": 602264595456.0, + "grad_norm": 0.06107834506241764, + "language_loss": 0.88440853, + "learning_rate": 0.000933686131040967, + "loss": 0.89563954, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 0.44067383, + "step": 992, + "time_per_iteration": 2.795952796936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118187, + "balance_loss_mlp": 1.07479525, + "epoch": 0.19103501346671797, + "flos": 586308077568.0, + "grad_norm": 0.08075044213119366, + "language_loss": 0.91145802, + "learning_rate": 0.0009335310047555883, + "loss": 0.92263985, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 0.43383789, + "step": 993, + "time_per_iteration": 2.6966800689697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144036, + "balance_loss_mlp": 1.10052443, + "epoch": 0.19122739515198153, + "flos": 545761898496.0, + "grad_norm": 0.06789475617385991, + "language_loss": 0.89048505, + "learning_rate": 0.0009333757101585467, + "loss": 0.90192544, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 0.43554688, + "step": 994, + "time_per_iteration": 2.659120559692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159671, + "balance_loss_mlp": 1.11687493, + "epoch": 0.1914197768372451, + "flos": 521446450176.0, + "grad_norm": 0.05475551086737561, + "language_loss": 0.94071913, + "learning_rate": 0.0009332202473101329, + "loss": 0.95231587, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 0.42822266, + "step": 995, + "time_per_iteration": 2.672307014465332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153956, + "balance_loss_mlp": 1.11011088, + "epoch": 0.19161215852250865, + "flos": 611246103552.0, + "grad_norm": 0.060816834986447306, + "language_loss": 0.8370983, + "learning_rate": 0.0009330646162707028, + "loss": 0.84863788, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 0.4387207, + "step": 996, + "time_per_iteration": 2.7483248710632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155376, + "balance_loss_mlp": 1.11274719, + "epoch": 0.1918045402077722, + "flos": 846660916224.0, + "grad_norm": 0.05013127115514869, + "language_loss": 0.85195571, + "learning_rate": 0.0009329088171006779, + "loss": 0.86350954, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.42626953, + "step": 997, + "time_per_iteration": 3.1445202827453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163134, + "balance_loss_mlp": 1.1197654, + "epoch": 0.19199692189303577, + "flos": 465937090560.0, + "grad_norm": 0.07353815647154911, + "language_loss": 0.86074895, + "learning_rate": 0.0009327528498605446, + "loss": 0.87238026, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 0.43383789, + "step": 998, + "time_per_iteration": 2.536146402359009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159094, + "balance_loss_mlp": 1.11844337, + "epoch": 0.19218930357829936, + "flos": 531576940032.0, + "grad_norm": 0.06861677349241169, + "language_loss": 0.9080506, + "learning_rate": 0.0009325967146108548, + "loss": 0.91964149, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 0.40649414, + "step": 999, + "time_per_iteration": 2.634549617767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151834, + "balance_loss_mlp": 1.11049271, + "epoch": 0.19238168526356292, + "flos": 601624765440.0, + "grad_norm": 0.0672850368289366, + "language_loss": 0.88138115, + "learning_rate": 0.0009324404114122258, + "loss": 0.89289951, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 0.41357422, + "step": 1000, + "time_per_iteration": 2.677651882171631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164653, + "balance_loss_mlp": 1.12221444, + "epoch": 0.19257406694882648, + "flos": 571982155776.0, + "grad_norm": 0.06402741154285656, + "language_loss": 0.8710497, + "learning_rate": 0.0009322839403253397, + "loss": 0.88269627, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 0.42431641, + "step": 1001, + "time_per_iteration": 2.7528679370880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169088, + "balance_loss_mlp": 1.12440836, + "epoch": 0.19276644863409004, + "flos": 801813219840.0, + "grad_norm": 0.07104878229054386, + "language_loss": 0.84949791, + "learning_rate": 0.0009321273014109439, + "loss": 0.86118877, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.44702148, + "step": 1002, + "time_per_iteration": 2.9990484714508057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114889, + "balance_loss_mlp": 1.10523582, + "epoch": 0.1929588303193536, + "flos": 563314507776.0, + "grad_norm": 0.0673469195429183, + "language_loss": 0.85240018, + "learning_rate": 0.0009319704947298513, + "loss": 0.8638891, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.43676758, + "step": 1003, + "time_per_iteration": 2.8755459785461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141118, + "balance_loss_mlp": 1.10127831, + "epoch": 0.19315121200461716, + "flos": 626837004288.0, + "grad_norm": 0.0925310675323854, + "language_loss": 0.89122581, + "learning_rate": 0.0009318135203429393, + "loss": 0.902637, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.3984375, + "step": 1004, + "time_per_iteration": 2.771192789077759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127979, + "balance_loss_mlp": 1.0866611, + "epoch": 0.19334359368988072, + "flos": 517451079168.0, + "grad_norm": 0.05779097302789, + "language_loss": 0.88602638, + "learning_rate": 0.0009316563783111511, + "loss": 0.8973062, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.41308594, + "step": 1005, + "time_per_iteration": 2.7739861011505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113092, + "balance_loss_mlp": 1.08638334, + "epoch": 0.19353597537514428, + "flos": 694080285696.0, + "grad_norm": 0.06006842888316194, + "language_loss": 0.83199531, + "learning_rate": 0.0009314990686954943, + "loss": 0.84330451, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.44506836, + "step": 1006, + "time_per_iteration": 2.935081720352173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140843, + "balance_loss_mlp": 1.09561515, + "epoch": 0.19372835706040784, + "flos": 1210170585600.0, + "grad_norm": 0.0666735983489841, + "language_loss": 0.81657201, + "learning_rate": 0.000931341591557042, + "loss": 0.82798046, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.45263672, + "step": 1007, + "time_per_iteration": 3.7212610244750977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155404, + "balance_loss_mlp": 1.1041683, + "epoch": 0.19392073874567142, + "flos": 520631152128.0, + "grad_norm": 0.08115294197805281, + "language_loss": 0.87899536, + "learning_rate": 0.0009311839469569325, + "loss": 0.89054936, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.51171875, + "step": 1008, + "time_per_iteration": 2.6384472846984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150065, + "balance_loss_mlp": 1.10030699, + "epoch": 0.19411312043093498, + "flos": 588816013824.0, + "grad_norm": 0.07776470075981182, + "language_loss": 0.88065994, + "learning_rate": 0.0009310261349563687, + "loss": 0.89216053, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.49804688, + "step": 1009, + "time_per_iteration": 2.703058958053589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157544, + "balance_loss_mlp": 1.11160064, + "epoch": 0.19430550211619854, + "flos": 579382253568.0, + "grad_norm": 0.05519618089274153, + "language_loss": 0.86250293, + "learning_rate": 0.0009308681556166186, + "loss": 0.87407839, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.45922852, + "step": 1010, + "time_per_iteration": 2.8404791355133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177928, + "balance_loss_mlp": 1.12480855, + "epoch": 0.1944978838014621, + "flos": 621126973440.0, + "grad_norm": 0.10323239067467582, + "language_loss": 0.8870275, + "learning_rate": 0.0009307100089990152, + "loss": 0.89880681, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.53100586, + "step": 1011, + "time_per_iteration": 2.7103512287139893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185201, + "balance_loss_mlp": 1.13530004, + "epoch": 0.19469026548672566, + "flos": 598714136064.0, + "grad_norm": 0.08766026563197518, + "language_loss": 0.84582877, + "learning_rate": 0.0009305516951649568, + "loss": 0.8576808, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.49902344, + "step": 1012, + "time_per_iteration": 2.6905276775360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175674, + "balance_loss_mlp": 1.12818122, + "epoch": 0.19488264717198922, + "flos": 552161318400.0, + "grad_norm": 0.07259628373080033, + "language_loss": 0.87723738, + "learning_rate": 0.0009303932141759057, + "loss": 0.8889941, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.47485352, + "step": 1013, + "time_per_iteration": 2.7738490104675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161359, + "balance_loss_mlp": 1.11200666, + "epoch": 0.19507502885725278, + "flos": 666135456768.0, + "grad_norm": 0.07589756885314788, + "language_loss": 0.84698361, + "learning_rate": 0.0009302345660933902, + "loss": 0.85859716, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.49291992, + "step": 1014, + "time_per_iteration": 2.7809414863586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152692, + "balance_loss_mlp": 1.10579538, + "epoch": 0.19526741054251634, + "flos": 671081946624.0, + "grad_norm": 0.06636914889533592, + "language_loss": 0.85938931, + "learning_rate": 0.0009300757509790026, + "loss": 0.87091625, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.46875, + "step": 1015, + "time_per_iteration": 2.886200189590454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151123, + "balance_loss_mlp": 1.10324848, + "epoch": 0.19545979222777993, + "flos": 447215675904.0, + "grad_norm": 0.08384883211824797, + "language_loss": 0.91210115, + "learning_rate": 0.0009299167688944005, + "loss": 0.92361236, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.47827148, + "step": 1016, + "time_per_iteration": 2.5308799743652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135215, + "balance_loss_mlp": 1.09036839, + "epoch": 0.1956521739130435, + "flos": 569084009472.0, + "grad_norm": 0.07612639660839114, + "language_loss": 0.86733758, + "learning_rate": 0.0009297576199013063, + "loss": 0.87868977, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.44873047, + "step": 1017, + "time_per_iteration": 2.699352264404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156475, + "balance_loss_mlp": 1.14159799, + "epoch": 0.19584455559830705, + "flos": 1455749273088.0, + "grad_norm": 0.04987694814110311, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.74158609, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.1484375, + "step": 1018, + "time_per_iteration": 4.927512168884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099249, + "balance_loss_mlp": 1.08494341, + "epoch": 0.1960369372835706, + "flos": 1591150252032.0, + "grad_norm": 0.032347612483235935, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.80525547, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.14257812, + "step": 1019, + "time_per_iteration": 5.494646787643433 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128896, + "balance_loss_mlp": 1.08855522, + "epoch": 0.19622931896883417, + "flos": 616017125376.0, + "grad_norm": 0.06601293097738069, + "language_loss": 0.87223667, + "learning_rate": 0.0009292791720892659, + "loss": 0.88352561, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.40332031, + "step": 1020, + "time_per_iteration": 2.8718464374542236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133682, + "balance_loss_mlp": 1.08823943, + "epoch": 0.19642170065409773, + "flos": 466201391616.0, + "grad_norm": 0.07136038826441608, + "language_loss": 0.89387941, + "learning_rate": 0.0009291193560807218, + "loss": 0.90521628, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.45483398, + "step": 1021, + "time_per_iteration": 2.588604211807251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132851, + "balance_loss_mlp": 1.09141409, + "epoch": 0.19661408233936128, + "flos": 515289309696.0, + "grad_norm": 0.06738480994857221, + "language_loss": 0.87651652, + "learning_rate": 0.0009289593734732688, + "loss": 0.88784504, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.41430664, + "step": 1022, + "time_per_iteration": 2.5915818214416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129633, + "balance_loss_mlp": 1.09036541, + "epoch": 0.19680646402462484, + "flos": 392640182784.0, + "grad_norm": 0.06942729809827348, + "language_loss": 0.94984972, + "learning_rate": 0.0009287992243290175, + "loss": 0.96114612, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.39282227, + "step": 1023, + "time_per_iteration": 2.4477546215057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142342, + "balance_loss_mlp": 1.09880638, + "epoch": 0.19699884570988843, + "flos": 626421828096.0, + "grad_norm": 0.1017247644504036, + "language_loss": 0.91891634, + "learning_rate": 0.0009286389087101435, + "loss": 0.93033981, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.43554688, + "step": 1024, + "time_per_iteration": 2.765334129333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142412, + "balance_loss_mlp": 1.09942544, + "epoch": 0.197191227395152, + "flos": 557982577152.0, + "grad_norm": 0.07195718640229302, + "language_loss": 0.8893857, + "learning_rate": 0.0009284784266788864, + "loss": 0.90080982, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.42993164, + "step": 1025, + "time_per_iteration": 2.7323853969573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141995, + "balance_loss_mlp": 1.10327554, + "epoch": 0.19738360908041555, + "flos": 664993815552.0, + "grad_norm": 0.069193395974369, + "language_loss": 0.93259764, + "learning_rate": 0.0009283177782975512, + "loss": 0.94401753, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.38696289, + "step": 1026, + "time_per_iteration": 2.9729068279266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114551, + "balance_loss_mlp": 1.10142589, + "epoch": 0.1975759907656791, + "flos": 522496687104.0, + "grad_norm": 0.08755988500201482, + "language_loss": 0.88955659, + "learning_rate": 0.000928156963628507, + "loss": 0.90101171, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.44067383, + "step": 1027, + "time_per_iteration": 2.594200849533081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138947, + "balance_loss_mlp": 1.09855926, + "epoch": 0.19776837245094267, + "flos": 462482804736.0, + "grad_norm": 0.07316483198701504, + "language_loss": 0.89277303, + "learning_rate": 0.0009279959827341877, + "loss": 0.90416259, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.40405273, + "step": 1028, + "time_per_iteration": 2.7378368377685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140451, + "balance_loss_mlp": 1.09727335, + "epoch": 0.19796075413620623, + "flos": 503058719232.0, + "grad_norm": 0.059550544329949856, + "language_loss": 0.88526183, + "learning_rate": 0.0009278348356770915, + "loss": 0.89666629, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.43188477, + "step": 1029, + "time_per_iteration": 2.5737922191619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133825, + "balance_loss_mlp": 1.0914098, + "epoch": 0.1981531358214698, + "flos": 507538275840.0, + "grad_norm": 0.06393748023743129, + "language_loss": 0.8587814, + "learning_rate": 0.0009276735225197814, + "loss": 0.87011963, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.42431641, + "step": 1030, + "time_per_iteration": 2.648477077484131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146668, + "balance_loss_mlp": 1.10170269, + "epoch": 0.19834551750673335, + "flos": 531547204608.0, + "grad_norm": 0.06069855374703422, + "language_loss": 0.86812896, + "learning_rate": 0.0009275120433248847, + "loss": 0.87959564, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.44946289, + "step": 1031, + "time_per_iteration": 2.6862802505493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148453, + "balance_loss_mlp": 1.10327268, + "epoch": 0.1985378991919969, + "flos": 775511096832.0, + "grad_norm": 0.06482797348212818, + "language_loss": 0.87033594, + "learning_rate": 0.0009273503981550931, + "loss": 0.8818205, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.45166016, + "step": 1032, + "time_per_iteration": 3.0549416542053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157268, + "balance_loss_mlp": 1.11235023, + "epoch": 0.1987302808772605, + "flos": 434288355840.0, + "grad_norm": 0.07571303407420105, + "language_loss": 0.87661642, + "learning_rate": 0.0009271885870731626, + "loss": 0.88818914, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.44946289, + "step": 1033, + "time_per_iteration": 2.4938008785247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172373, + "balance_loss_mlp": 1.12495148, + "epoch": 0.19892266256252406, + "flos": 553604336640.0, + "grad_norm": 0.07801561202279184, + "language_loss": 0.89466584, + "learning_rate": 0.0009270266101419143, + "loss": 0.90638959, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.47460938, + "step": 1034, + "time_per_iteration": 2.61181378364563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169875, + "balance_loss_mlp": 1.12681675, + "epoch": 0.19911504424778761, + "flos": 549865926144.0, + "grad_norm": 0.07487269237991181, + "language_loss": 0.85762119, + "learning_rate": 0.0009268644674242328, + "loss": 0.86931992, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.43066406, + "step": 1035, + "time_per_iteration": 2.6761085987091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163027, + "balance_loss_mlp": 1.1147716, + "epoch": 0.19930742593305117, + "flos": 518281431552.0, + "grad_norm": 0.06997084642295975, + "language_loss": 0.81697071, + "learning_rate": 0.0009267021589830678, + "loss": 0.828601, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.4831543, + "step": 1036, + "time_per_iteration": 2.6166343688964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162737, + "balance_loss_mlp": 1.14547551, + "epoch": 0.19949980761831473, + "flos": 1509338769408.0, + "grad_norm": 0.04224955266067769, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.78789818, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 0.17285156, + "step": 1037, + "time_per_iteration": 4.932336330413818 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124804, + "balance_loss_mlp": 1.08224678, + "epoch": 0.1996921893035783, + "flos": 698129985024.0, + "grad_norm": 0.07370646472771722, + "language_loss": 0.9354341, + "learning_rate": 0.000926377045182406, + "loss": 0.94668216, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.42553711, + "step": 1038, + "time_per_iteration": 2.89486026763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122228, + "balance_loss_mlp": 1.07704759, + "epoch": 0.19988457098884185, + "flos": 727023734784.0, + "grad_norm": 0.06351485696264159, + "language_loss": 0.88915765, + "learning_rate": 0.0009262142399491296, + "loss": 0.9003799, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.4519043, + "step": 1039, + "time_per_iteration": 3.0843544006347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132553, + "balance_loss_mlp": 1.08784938, + "epoch": 0.2000769526741054, + "flos": 560544841728.0, + "grad_norm": 0.06429886269356283, + "language_loss": 0.89007306, + "learning_rate": 0.0009260512692448105, + "loss": 0.9013986, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.44677734, + "step": 1040, + "time_per_iteration": 2.7221181392669678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143871, + "balance_loss_mlp": 1.10071695, + "epoch": 0.200269334359369, + "flos": 572039055360.0, + "grad_norm": 0.0714265416650486, + "language_loss": 0.85044324, + "learning_rate": 0.000925888133132719, + "loss": 0.86188197, + "num_input_tokens_seen": 86289824, + "router_z_loss_mlp": 0.43164062, + "step": 1041, + "time_per_iteration": 2.7112865447998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113685, + "balance_loss_mlp": 1.09566069, + "epoch": 0.20046171604463256, + "flos": 1486118347776.0, + "grad_norm": 0.0301437897992815, + "language_loss": 0.79610431, + "learning_rate": 0.0009257248316761906, + "loss": 0.8072412, + "num_input_tokens_seen": 86516384, + "router_z_loss_mlp": 0.18066406, + "step": 1042, + "time_per_iteration": 4.913869380950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179762, + "balance_loss_mlp": 1.13338971, + "epoch": 0.20065409772989612, + "flos": 496528247808.0, + "grad_norm": 0.11345429965909062, + "language_loss": 0.82242954, + "learning_rate": 0.0009255613649386244, + "loss": 0.83422714, + "num_input_tokens_seen": 86587296, + "router_z_loss_mlp": 0.46337891, + "step": 1043, + "time_per_iteration": 2.6586339473724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153581, + "balance_loss_mlp": 1.11133325, + "epoch": 0.20084647941515968, + "flos": 579367572480.0, + "grad_norm": 0.07362734504976313, + "language_loss": 0.79954398, + "learning_rate": 0.0009253977329834838, + "loss": 0.81107974, + "num_input_tokens_seen": 86662656, + "router_z_loss_mlp": 0.42236328, + "step": 1044, + "time_per_iteration": 2.7028462886810303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143902, + "balance_loss_mlp": 1.0951457, + "epoch": 0.20103886110042324, + "flos": 642076968960.0, + "grad_norm": 0.07842723007783056, + "language_loss": 0.8753317, + "learning_rate": 0.0009252339358742965, + "loss": 0.88677073, + "num_input_tokens_seen": 86734704, + "router_z_loss_mlp": 0.48779297, + "step": 1045, + "time_per_iteration": 2.8069612979888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139165, + "balance_loss_mlp": 1.0902648, + "epoch": 0.2012312427856868, + "flos": 441970007040.0, + "grad_norm": 0.07197327624603128, + "language_loss": 0.84128577, + "learning_rate": 0.000925069973674654, + "loss": 0.85267735, + "num_input_tokens_seen": 86806512, + "router_z_loss_mlp": 0.48925781, + "step": 1046, + "time_per_iteration": 2.603602409362793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136955, + "balance_loss_mlp": 1.09303868, + "epoch": 0.20142362447095036, + "flos": 554402382336.0, + "grad_norm": 0.06199919012721526, + "language_loss": 0.89849102, + "learning_rate": 0.000924905846448212, + "loss": 0.90986055, + "num_input_tokens_seen": 86883440, + "router_z_loss_mlp": 0.43896484, + "step": 1047, + "time_per_iteration": 2.733009099960327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166193, + "balance_loss_mlp": 1.11726964, + "epoch": 0.20161600615621392, + "flos": 670301153280.0, + "grad_norm": 0.08010189097684783, + "language_loss": 0.86224002, + "learning_rate": 0.0009247415542586906, + "loss": 0.87390196, + "num_input_tokens_seen": 86960208, + "router_z_loss_mlp": 0.48950195, + "step": 1048, + "time_per_iteration": 2.8471555709838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186895, + "balance_loss_mlp": 1.13675559, + "epoch": 0.2018083878414775, + "flos": 573091490304.0, + "grad_norm": 0.050762349186412876, + "language_loss": 0.83535373, + "learning_rate": 0.0009245770971698735, + "loss": 0.84722269, + "num_input_tokens_seen": 87044144, + "router_z_loss_mlp": 0.50170898, + "step": 1049, + "time_per_iteration": 2.889474630355835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183513, + "balance_loss_mlp": 1.13671136, + "epoch": 0.20200076952674106, + "flos": 425857844736.0, + "grad_norm": 0.07506320746734087, + "language_loss": 0.8918047, + "learning_rate": 0.0009244124752456087, + "loss": 0.90363979, + "num_input_tokens_seen": 87109136, + "router_z_loss_mlp": 0.46826172, + "step": 1050, + "time_per_iteration": 2.5762786865234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205722, + "balance_loss_mlp": 1.15453339, + "epoch": 0.20219315121200462, + "flos": 536597581824.0, + "grad_norm": 0.08917577036116058, + "language_loss": 0.86475039, + "learning_rate": 0.0009242476885498081, + "loss": 0.87680757, + "num_input_tokens_seen": 87184320, + "router_z_loss_mlp": 0.51220703, + "step": 1051, + "time_per_iteration": 2.720395565032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193502, + "balance_loss_mlp": 1.14009643, + "epoch": 0.20238553289726818, + "flos": 477873644544.0, + "grad_norm": 0.08090891256677915, + "language_loss": 0.81871718, + "learning_rate": 0.0009240827371464474, + "loss": 0.83065224, + "num_input_tokens_seen": 87248224, + "router_z_loss_mlp": 0.53442383, + "step": 1052, + "time_per_iteration": 2.535388231277466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162702, + "balance_loss_mlp": 1.11833251, + "epoch": 0.20257791458253174, + "flos": 1152057116160.0, + "grad_norm": 0.08177732735855556, + "language_loss": 0.84886205, + "learning_rate": 0.0009239176210995666, + "loss": 0.86048913, + "num_input_tokens_seen": 87333088, + "router_z_loss_mlp": 0.4440918, + "step": 1053, + "time_per_iteration": 3.4955379962921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148392, + "balance_loss_mlp": 1.0973227, + "epoch": 0.2027702962677953, + "flos": 666913678848.0, + "grad_norm": 0.9822109545682867, + "language_loss": 0.94933617, + "learning_rate": 0.0009237523404732695, + "loss": 0.96082008, + "num_input_tokens_seen": 87413840, + "router_z_loss_mlp": 0.51074219, + "step": 1054, + "time_per_iteration": 2.90132737159729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137464, + "balance_loss_mlp": 1.09118664, + "epoch": 0.20296267795305886, + "flos": 641298746880.0, + "grad_norm": 0.09331279688006895, + "language_loss": 0.85504258, + "learning_rate": 0.0009235868953317235, + "loss": 0.86641729, + "num_input_tokens_seen": 87487168, + "router_z_loss_mlp": 0.46264648, + "step": 1055, + "time_per_iteration": 2.813202381134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212355, + "balance_loss_mlp": 1.16388512, + "epoch": 0.20315505963832242, + "flos": 930575070720.0, + "grad_norm": 0.08645469446577787, + "language_loss": 0.86679947, + "learning_rate": 0.0009234212857391602, + "loss": 0.87892294, + "num_input_tokens_seen": 87573184, + "router_z_loss_mlp": 0.48486328, + "step": 1056, + "time_per_iteration": 3.184723377227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01289494, + "balance_loss_mlp": 1.23723245, + "epoch": 0.20334744132358598, + "flos": 562111197696.0, + "grad_norm": 0.11402704661401492, + "language_loss": 0.90548229, + "learning_rate": 0.000923255511759875, + "loss": 0.91837716, + "num_input_tokens_seen": 87651968, + "router_z_loss_mlp": 0.52319336, + "step": 1057, + "time_per_iteration": 2.8404476642608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01374128, + "balance_loss_mlp": 1.3215096, + "epoch": 0.20353982300884957, + "flos": 644206804992.0, + "grad_norm": 0.12448379126392096, + "language_loss": 0.86306804, + "learning_rate": 0.000923089573458227, + "loss": 0.87680936, + "num_input_tokens_seen": 87727792, + "router_z_loss_mlp": 0.52661133, + "step": 1058, + "time_per_iteration": 2.921942949295044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01411943, + "balance_loss_mlp": 1.35701096, + "epoch": 0.20373220469411313, + "flos": 651421522944.0, + "grad_norm": 0.12614323996078466, + "language_loss": 0.84856015, + "learning_rate": 0.0009229234708986392, + "loss": 0.8626796, + "num_input_tokens_seen": 87806048, + "router_z_loss_mlp": 0.54931641, + "step": 1059, + "time_per_iteration": 2.922795057296753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01629047, + "balance_loss_mlp": 1.60253465, + "epoch": 0.2039245863793767, + "flos": 1437628787712.0, + "grad_norm": 0.12493252943786969, + "language_loss": 0.81666899, + "learning_rate": 0.0009227572041455982, + "loss": 0.83295941, + "num_input_tokens_seen": 88018160, + "router_z_loss_mlp": 0.265625, + "step": 1060, + "time_per_iteration": 4.733684062957764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01333622, + "balance_loss_mlp": 1.27976346, + "epoch": 0.20411696806464025, + "flos": 596967169536.0, + "grad_norm": 0.0936460184690869, + "language_loss": 0.86563337, + "learning_rate": 0.0009225907732636548, + "loss": 0.87896961, + "num_input_tokens_seen": 88090864, + "router_z_loss_mlp": 0.53881836, + "step": 1061, + "time_per_iteration": 2.761353015899658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01296883, + "balance_loss_mlp": 1.24183202, + "epoch": 0.2043093497499038, + "flos": 573803274240.0, + "grad_norm": 0.09002543594031559, + "language_loss": 0.87698424, + "learning_rate": 0.0009224241783174227, + "loss": 0.88995302, + "num_input_tokens_seen": 88161360, + "router_z_loss_mlp": 0.55078125, + "step": 1062, + "time_per_iteration": 2.7161052227020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252808, + "balance_loss_mlp": 1.19947362, + "epoch": 0.20450173143516737, + "flos": 630352958976.0, + "grad_norm": 0.08928798499879465, + "language_loss": 0.87254798, + "learning_rate": 0.0009222574193715802, + "loss": 0.88507611, + "num_input_tokens_seen": 88234960, + "router_z_loss_mlp": 0.53369141, + "step": 1063, + "time_per_iteration": 2.779623031616211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122391, + "balance_loss_mlp": 1.16757131, + "epoch": 0.20469411312043093, + "flos": 574003335168.0, + "grad_norm": 0.06606001070927259, + "language_loss": 0.87212694, + "learning_rate": 0.000922090496490869, + "loss": 0.88436604, + "num_input_tokens_seen": 88308176, + "router_z_loss_mlp": 0.56323242, + "step": 1064, + "time_per_iteration": 2.7111196517944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217897, + "balance_loss_mlp": 1.16120076, + "epoch": 0.20488649480569449, + "flos": 637053755904.0, + "grad_norm": 0.3109146854617931, + "language_loss": 0.90918952, + "learning_rate": 0.0009219234097400937, + "loss": 0.92136848, + "num_input_tokens_seen": 88386768, + "router_z_loss_mlp": 0.56665039, + "step": 1065, + "time_per_iteration": 2.804588556289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245438, + "balance_loss_mlp": 1.18359244, + "epoch": 0.20507887649095807, + "flos": 975793526784.0, + "grad_norm": 0.06908392980849179, + "language_loss": 0.84456235, + "learning_rate": 0.0009217561591841237, + "loss": 0.85701674, + "num_input_tokens_seen": 88476576, + "router_z_loss_mlp": 0.61816406, + "step": 1066, + "time_per_iteration": 3.303875207901001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01287048, + "balance_loss_mlp": 1.21867001, + "epoch": 0.20527125817622163, + "flos": 486183015936.0, + "grad_norm": 0.1162597514909173, + "language_loss": 0.82140827, + "learning_rate": 0.0009215887448878913, + "loss": 0.83427876, + "num_input_tokens_seen": 88541968, + "router_z_loss_mlp": 0.68408203, + "step": 1067, + "time_per_iteration": 2.568690776824951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01293452, + "balance_loss_mlp": 1.22288036, + "epoch": 0.2054636398614852, + "flos": 527178875904.0, + "grad_norm": 0.08586469474305494, + "language_loss": 0.85986763, + "learning_rate": 0.0009214211669163922, + "loss": 0.87280214, + "num_input_tokens_seen": 88615296, + "router_z_loss_mlp": 0.70654297, + "step": 1068, + "time_per_iteration": 2.700090169906616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01279646, + "balance_loss_mlp": 1.21408105, + "epoch": 0.20565602154674875, + "flos": 558182638080.0, + "grad_norm": 0.06609725061841937, + "language_loss": 0.94520444, + "learning_rate": 0.0009212534253346862, + "loss": 0.95800096, + "num_input_tokens_seen": 88691584, + "router_z_loss_mlp": 0.65478516, + "step": 1069, + "time_per_iteration": 2.696699857711792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01285979, + "balance_loss_mlp": 1.21912634, + "epoch": 0.2058484032320123, + "flos": 504224953344.0, + "grad_norm": 0.07442061186670905, + "language_loss": 0.85475862, + "learning_rate": 0.0009210855202078964, + "loss": 0.86761844, + "num_input_tokens_seen": 88756592, + "router_z_loss_mlp": 0.66845703, + "step": 1070, + "time_per_iteration": 2.5769481658935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01284239, + "balance_loss_mlp": 1.21771979, + "epoch": 0.20604078491727587, + "flos": 433169109504.0, + "grad_norm": 0.07631989099853977, + "language_loss": 0.88063252, + "learning_rate": 0.0009209174516012091, + "loss": 0.89347488, + "num_input_tokens_seen": 88820928, + "router_z_loss_mlp": 0.66601562, + "step": 1071, + "time_per_iteration": 2.6154239177703857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261362, + "balance_loss_mlp": 1.19317448, + "epoch": 0.20623316660253943, + "flos": 608711003136.0, + "grad_norm": 0.05883273983798781, + "language_loss": 0.90461957, + "learning_rate": 0.0009207492195798747, + "loss": 0.91723317, + "num_input_tokens_seen": 88895440, + "router_z_loss_mlp": 0.68164062, + "step": 1072, + "time_per_iteration": 2.764965534210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261739, + "balance_loss_mlp": 1.18997467, + "epoch": 0.206425548287803, + "flos": 480425997312.0, + "grad_norm": 0.07316980575900926, + "language_loss": 0.86156094, + "learning_rate": 0.0009205808242092061, + "loss": 0.87417829, + "num_input_tokens_seen": 88964400, + "router_z_loss_mlp": 0.71728516, + "step": 1073, + "time_per_iteration": 2.6222856044769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258718, + "balance_loss_mlp": 1.18952858, + "epoch": 0.20661792997306658, + "flos": 949429734912.0, + "grad_norm": 0.06600331144021966, + "language_loss": 0.83598334, + "learning_rate": 0.0009204122655545808, + "loss": 0.84857053, + "num_input_tokens_seen": 89049600, + "router_z_loss_mlp": 0.69189453, + "step": 1074, + "time_per_iteration": 3.313964605331421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252953, + "balance_loss_mlp": 1.18571925, + "epoch": 0.20681031165833014, + "flos": 603487729152.0, + "grad_norm": 0.06834339296378739, + "language_loss": 0.82186073, + "learning_rate": 0.0009202435436814388, + "loss": 0.83439028, + "num_input_tokens_seen": 89119024, + "router_z_loss_mlp": 0.67236328, + "step": 1075, + "time_per_iteration": 2.68725848197937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260409, + "balance_loss_mlp": 1.1926024, + "epoch": 0.2070026933435937, + "flos": 708984368640.0, + "grad_norm": 0.07476886245144747, + "language_loss": 0.91110998, + "learning_rate": 0.0009200746586552836, + "loss": 0.92371404, + "num_input_tokens_seen": 89197344, + "router_z_loss_mlp": 0.67773438, + "step": 1076, + "time_per_iteration": 2.889910936355591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238308, + "balance_loss_mlp": 1.17145491, + "epoch": 0.20719507502885726, + "flos": 829814948352.0, + "grad_norm": 0.06855298516082668, + "language_loss": 0.84957182, + "learning_rate": 0.0009199056105416825, + "loss": 0.86195493, + "num_input_tokens_seen": 89280464, + "router_z_loss_mlp": 0.66894531, + "step": 1077, + "time_per_iteration": 3.0826096534729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242457, + "balance_loss_mlp": 1.17312455, + "epoch": 0.20738745671412082, + "flos": 638294141952.0, + "grad_norm": 0.0732932371665923, + "language_loss": 0.87494361, + "learning_rate": 0.0009197363994062654, + "loss": 0.8873682, + "num_input_tokens_seen": 89353344, + "router_z_loss_mlp": 0.69287109, + "step": 1078, + "time_per_iteration": 2.814481735229492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121373, + "balance_loss_mlp": 1.15455508, + "epoch": 0.20757983839938438, + "flos": 685602786816.0, + "grad_norm": 0.060498447021287705, + "language_loss": 0.85097158, + "learning_rate": 0.0009195670253147262, + "loss": 0.86310887, + "num_input_tokens_seen": 89439328, + "router_z_loss_mlp": 0.59179688, + "step": 1079, + "time_per_iteration": 2.989818572998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216427, + "balance_loss_mlp": 1.15286458, + "epoch": 0.20777222008464794, + "flos": 519282109440.0, + "grad_norm": 0.0563328194871683, + "language_loss": 0.83052152, + "learning_rate": 0.0009193974883328216, + "loss": 0.84268576, + "num_input_tokens_seen": 89510160, + "router_z_loss_mlp": 0.63574219, + "step": 1080, + "time_per_iteration": 2.611929416656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209864, + "balance_loss_mlp": 1.14553857, + "epoch": 0.2079646017699115, + "flos": 511402595328.0, + "grad_norm": 0.06150097183917509, + "language_loss": 0.87932825, + "learning_rate": 0.0009192277885263718, + "loss": 0.89142686, + "num_input_tokens_seen": 89582960, + "router_z_loss_mlp": 0.64306641, + "step": 1081, + "time_per_iteration": 2.65731143951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198356, + "balance_loss_mlp": 1.13264751, + "epoch": 0.20815698345517505, + "flos": 931820226048.0, + "grad_norm": 0.05302154537588453, + "language_loss": 0.86579674, + "learning_rate": 0.0009190579259612602, + "loss": 0.87778032, + "num_input_tokens_seen": 89675488, + "router_z_loss_mlp": 0.65722656, + "step": 1082, + "time_per_iteration": 3.2999303340911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207666, + "balance_loss_mlp": 1.14300656, + "epoch": 0.20834936514043864, + "flos": 632401302528.0, + "grad_norm": 0.07988409962843289, + "language_loss": 0.87673134, + "learning_rate": 0.000918887900703433, + "loss": 0.88880801, + "num_input_tokens_seen": 89747872, + "router_z_loss_mlp": 0.64648438, + "step": 1083, + "time_per_iteration": 2.7956981658935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204411, + "balance_loss_mlp": 1.14361465, + "epoch": 0.2085417468257022, + "flos": 394384578048.0, + "grad_norm": 0.07357181622228276, + "language_loss": 0.91242653, + "learning_rate": 0.0009187177128188999, + "loss": 0.92447066, + "num_input_tokens_seen": 89810176, + "router_z_loss_mlp": 0.60693359, + "step": 1084, + "time_per_iteration": 2.4656450748443604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194774, + "balance_loss_mlp": 1.16902518, + "epoch": 0.20873412851096576, + "flos": 1402147293696.0, + "grad_norm": 0.038082499218869, + "language_loss": 0.77156538, + "learning_rate": 0.0009185473623737339, + "loss": 0.78351313, + "num_input_tokens_seen": 90038432, + "router_z_loss_mlp": 0.2578125, + "step": 1085, + "time_per_iteration": 4.855400323867798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181967, + "balance_loss_mlp": 1.12419796, + "epoch": 0.20892651019622932, + "flos": 447830913024.0, + "grad_norm": 0.07376491342946172, + "language_loss": 0.86747313, + "learning_rate": 0.000918376849434071, + "loss": 0.87929279, + "num_input_tokens_seen": 90101568, + "router_z_loss_mlp": 0.57739258, + "step": 1086, + "time_per_iteration": 2.5493998527526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192292, + "balance_loss_mlp": 1.1305418, + "epoch": 0.20911889188149288, + "flos": 493106268672.0, + "grad_norm": 0.07728027722551846, + "language_loss": 0.9155581, + "learning_rate": 0.0009182061740661098, + "loss": 0.92748106, + "num_input_tokens_seen": 90169344, + "router_z_loss_mlp": 0.61767578, + "step": 1087, + "time_per_iteration": 2.5755503177642822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192286, + "balance_loss_mlp": 1.13144195, + "epoch": 0.20931127356675644, + "flos": 841291909632.0, + "grad_norm": 0.057753656338862314, + "language_loss": 0.85712528, + "learning_rate": 0.0009180353363361127, + "loss": 0.86904812, + "num_input_tokens_seen": 90252416, + "router_z_loss_mlp": 0.60888672, + "step": 1088, + "time_per_iteration": 3.1143646240234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180296, + "balance_loss_mlp": 1.11868906, + "epoch": 0.20950365525202, + "flos": 757140618240.0, + "grad_norm": 0.07221088423930573, + "language_loss": 0.83469599, + "learning_rate": 0.0009178643363104044, + "loss": 0.84649897, + "num_input_tokens_seen": 90337952, + "router_z_loss_mlp": 0.61621094, + "step": 1089, + "time_per_iteration": 3.092656135559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199477, + "balance_loss_mlp": 1.138394, + "epoch": 0.20969603693728356, + "flos": 472539142656.0, + "grad_norm": 0.08745424257973078, + "language_loss": 0.92463166, + "learning_rate": 0.0009176931740553735, + "loss": 0.93662637, + "num_input_tokens_seen": 90401488, + "router_z_loss_mlp": 0.61083984, + "step": 1090, + "time_per_iteration": 2.53558349609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207875, + "balance_loss_mlp": 1.14850855, + "epoch": 0.20988841862254715, + "flos": 976930025472.0, + "grad_norm": 0.07295134358518522, + "language_loss": 0.83623219, + "learning_rate": 0.0009175218496374708, + "loss": 0.84831095, + "num_input_tokens_seen": 90486144, + "router_z_loss_mlp": 0.59277344, + "step": 1091, + "time_per_iteration": 3.3514459133148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226261, + "balance_loss_mlp": 1.16503549, + "epoch": 0.2100808003078107, + "flos": 1093120634880.0, + "grad_norm": 0.0645587086921242, + "language_loss": 0.86590576, + "learning_rate": 0.0009173503631232103, + "loss": 0.87816834, + "num_input_tokens_seen": 90571504, + "router_z_loss_mlp": 0.61181641, + "step": 1092, + "time_per_iteration": 3.3893167972564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122226, + "balance_loss_mlp": 1.16194034, + "epoch": 0.21027318199307427, + "flos": 1012964714496.0, + "grad_norm": 0.12026645314545058, + "language_loss": 0.8245008, + "learning_rate": 0.0009171787145791691, + "loss": 0.83672333, + "num_input_tokens_seen": 90646016, + "router_z_loss_mlp": 0.60351562, + "step": 1093, + "time_per_iteration": 3.251084327697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251584, + "balance_loss_mlp": 1.18854666, + "epoch": 0.21046556367833782, + "flos": 521394693120.0, + "grad_norm": 0.08481501206118727, + "language_loss": 0.8143028, + "learning_rate": 0.000917006904071987, + "loss": 0.82681859, + "num_input_tokens_seen": 90713440, + "router_z_loss_mlp": 0.63037109, + "step": 1094, + "time_per_iteration": 2.613060712814331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272512, + "balance_loss_mlp": 1.20551634, + "epoch": 0.21065794536360138, + "flos": 603717525504.0, + "grad_norm": 0.08143629367900677, + "language_loss": 0.87639427, + "learning_rate": 0.0009168349316683669, + "loss": 0.88911939, + "num_input_tokens_seen": 90788208, + "router_z_loss_mlp": 0.66992188, + "step": 1095, + "time_per_iteration": 2.705172538757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269472, + "balance_loss_mlp": 1.20462179, + "epoch": 0.21085032704886494, + "flos": 603346765824.0, + "grad_norm": 0.05512017255927588, + "language_loss": 0.83512938, + "learning_rate": 0.0009166627974350741, + "loss": 0.8478241, + "num_input_tokens_seen": 90873776, + "router_z_loss_mlp": 0.64746094, + "step": 1096, + "time_per_iteration": 2.8979411125183105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01259233, + "balance_loss_mlp": 1.19390619, + "epoch": 0.2110427087341285, + "flos": 637671564288.0, + "grad_norm": 0.06519728045913388, + "language_loss": 0.90715098, + "learning_rate": 0.0009164905014389373, + "loss": 0.91974336, + "num_input_tokens_seen": 90945872, + "router_z_loss_mlp": 0.65283203, + "step": 1097, + "time_per_iteration": 2.7965359687805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291926, + "balance_loss_mlp": 1.22445381, + "epoch": 0.21123509041939206, + "flos": 522919203840.0, + "grad_norm": 0.07891140172991894, + "language_loss": 0.87571776, + "learning_rate": 0.0009163180437468476, + "loss": 0.88863701, + "num_input_tokens_seen": 91016224, + "router_z_loss_mlp": 0.67480469, + "step": 1098, + "time_per_iteration": 2.678684949874878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012877, + "balance_loss_mlp": 1.22065675, + "epoch": 0.21142747210465565, + "flos": 451188652032.0, + "grad_norm": 0.06282838131309415, + "language_loss": 0.86816525, + "learning_rate": 0.000916145424425759, + "loss": 0.88104224, + "num_input_tokens_seen": 91086752, + "router_z_loss_mlp": 0.67041016, + "step": 1099, + "time_per_iteration": 2.6685678958892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01305165, + "balance_loss_mlp": 1.23554707, + "epoch": 0.2116198537899192, + "flos": 876175045632.0, + "grad_norm": 0.08616648204830919, + "language_loss": 0.916682, + "learning_rate": 0.0009159726435426885, + "loss": 0.92973363, + "num_input_tokens_seen": 91162960, + "router_z_loss_mlp": 0.69628906, + "step": 1100, + "time_per_iteration": 3.0852713584899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01282199, + "balance_loss_mlp": 1.21677744, + "epoch": 0.21181223547518277, + "flos": 523662921216.0, + "grad_norm": 0.07323647205544051, + "language_loss": 0.91053265, + "learning_rate": 0.0009157997011647154, + "loss": 0.92335469, + "num_input_tokens_seen": 91229840, + "router_z_loss_mlp": 0.65380859, + "step": 1101, + "time_per_iteration": 2.6137943267822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01270647, + "balance_loss_mlp": 1.20784807, + "epoch": 0.21200461716044633, + "flos": 572296015872.0, + "grad_norm": 0.05451247925490285, + "language_loss": 0.87014931, + "learning_rate": 0.0009156265973589817, + "loss": 0.88285577, + "num_input_tokens_seen": 91307936, + "router_z_loss_mlp": 0.62792969, + "step": 1102, + "time_per_iteration": 2.7920916080474854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01255362, + "balance_loss_mlp": 1.1928488, + "epoch": 0.2121969988457099, + "flos": 545129409024.0, + "grad_norm": 0.06310879580708054, + "language_loss": 0.90527534, + "learning_rate": 0.0009154533321926926, + "loss": 0.91782892, + "num_input_tokens_seen": 91372848, + "router_z_loss_mlp": 0.62548828, + "step": 1103, + "time_per_iteration": 2.646440029144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234037, + "balance_loss_mlp": 1.17214394, + "epoch": 0.21238938053097345, + "flos": 843861514752.0, + "grad_norm": 0.07831819024350671, + "language_loss": 0.88472342, + "learning_rate": 0.0009152799057331156, + "loss": 0.89706385, + "num_input_tokens_seen": 91452768, + "router_z_loss_mlp": 0.61865234, + "step": 1104, + "time_per_iteration": 3.122450590133667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214804, + "balance_loss_mlp": 1.15462673, + "epoch": 0.212581762216237, + "flos": 446214998016.0, + "grad_norm": 0.06719929320387279, + "language_loss": 0.91964042, + "learning_rate": 0.0009151063180475805, + "loss": 0.9317885, + "num_input_tokens_seen": 91519888, + "router_z_loss_mlp": 0.6015625, + "step": 1105, + "time_per_iteration": 2.5321173667907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181276, + "balance_loss_mlp": 1.12772751, + "epoch": 0.21277414390150057, + "flos": 514380036096.0, + "grad_norm": 0.07726558156265032, + "language_loss": 0.8518455, + "learning_rate": 0.0009149325692034803, + "loss": 0.86365819, + "num_input_tokens_seen": 91585744, + "router_z_loss_mlp": 0.53613281, + "step": 1106, + "time_per_iteration": 2.6019790172576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129115, + "balance_loss_mlp": 1.10660839, + "epoch": 0.21296652558676413, + "flos": 1485532846080.0, + "grad_norm": 0.0458739418309424, + "language_loss": 0.79203427, + "learning_rate": 0.0009147586592682702, + "loss": 0.80332541, + "num_input_tokens_seen": 91805840, + "router_z_loss_mlp": 0.22460938, + "step": 1107, + "time_per_iteration": 4.859830856323242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180766, + "balance_loss_mlp": 1.12478542, + "epoch": 0.21315890727202771, + "flos": 846040909824.0, + "grad_norm": 0.08338906086238376, + "language_loss": 0.88186961, + "learning_rate": 0.0009145845883094678, + "loss": 0.89367729, + "num_input_tokens_seen": 91885936, + "router_z_loss_mlp": 0.56005859, + "step": 1108, + "time_per_iteration": 3.04249906539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153455, + "balance_loss_mlp": 1.10114598, + "epoch": 0.21335128895729127, + "flos": 629379445248.0, + "grad_norm": 0.07708602471843919, + "language_loss": 0.85793281, + "learning_rate": 0.000914410356394654, + "loss": 0.86946738, + "num_input_tokens_seen": 91959888, + "router_z_loss_mlp": 0.5234375, + "step": 1109, + "time_per_iteration": 4.412867307662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163449, + "balance_loss_mlp": 1.10751617, + "epoch": 0.21354367064255483, + "flos": 710975812608.0, + "grad_norm": 0.08187458054057056, + "language_loss": 0.85334879, + "learning_rate": 0.0009142359635914709, + "loss": 0.86498332, + "num_input_tokens_seen": 92043728, + "router_z_loss_mlp": 0.55957031, + "step": 1110, + "time_per_iteration": 3.023928642272949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148781, + "balance_loss_mlp": 1.09570932, + "epoch": 0.2137360523278184, + "flos": 456201953280.0, + "grad_norm": 0.0669404625356857, + "language_loss": 0.85089076, + "learning_rate": 0.0009140614099676245, + "loss": 0.86237848, + "num_input_tokens_seen": 92114096, + "router_z_loss_mlp": 0.53076172, + "step": 1111, + "time_per_iteration": 2.625797748565674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148537, + "balance_loss_mlp": 1.09632301, + "epoch": 0.21392843401308195, + "flos": 666051393024.0, + "grad_norm": 0.06784083874149466, + "language_loss": 0.83744586, + "learning_rate": 0.0009138866955908821, + "loss": 0.84893119, + "num_input_tokens_seen": 92193552, + "router_z_loss_mlp": 0.52246094, + "step": 1112, + "time_per_iteration": 2.9033186435699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152374, + "balance_loss_mlp": 1.10042286, + "epoch": 0.2141208156983455, + "flos": 748996803072.0, + "grad_norm": 0.0756009236441896, + "language_loss": 0.81778276, + "learning_rate": 0.0009137118205290738, + "loss": 0.82930648, + "num_input_tokens_seen": 92279248, + "router_z_loss_mlp": 0.51977539, + "step": 1113, + "time_per_iteration": 3.00955867767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163421, + "balance_loss_mlp": 1.10677314, + "epoch": 0.21431319738360907, + "flos": 419119971840.0, + "grad_norm": 0.07649003777848401, + "language_loss": 0.90946341, + "learning_rate": 0.0009135367848500924, + "loss": 0.92109764, + "num_input_tokens_seen": 92344064, + "router_z_loss_mlp": 0.56591797, + "step": 1114, + "time_per_iteration": 2.50858211517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167845, + "balance_loss_mlp": 1.11472559, + "epoch": 0.21450557906887263, + "flos": 609126179328.0, + "grad_norm": 0.0823134598214501, + "language_loss": 0.87556803, + "learning_rate": 0.0009133615886218927, + "loss": 0.88724649, + "num_input_tokens_seen": 92410544, + "router_z_loss_mlp": 0.53125, + "step": 1115, + "time_per_iteration": 2.717454195022583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178358, + "balance_loss_mlp": 1.11651218, + "epoch": 0.21469796075413622, + "flos": 561913708032.0, + "grad_norm": 0.06887665628973552, + "language_loss": 0.89567351, + "learning_rate": 0.0009131862319124917, + "loss": 0.90745711, + "num_input_tokens_seen": 92480272, + "router_z_loss_mlp": 0.61816406, + "step": 1116, + "time_per_iteration": 2.623767852783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176568, + "balance_loss_mlp": 1.1235671, + "epoch": 0.21489034243939978, + "flos": 594637272576.0, + "grad_norm": 0.08365937432877864, + "language_loss": 0.85244483, + "learning_rate": 0.0009130107147899691, + "loss": 0.86421049, + "num_input_tokens_seen": 92555584, + "router_z_loss_mlp": 0.53051758, + "step": 1117, + "time_per_iteration": 2.795011281967163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178642, + "balance_loss_mlp": 1.12561774, + "epoch": 0.21508272412466334, + "flos": 441898426368.0, + "grad_norm": 0.06665693704910039, + "language_loss": 0.8600654, + "learning_rate": 0.0009128350373224665, + "loss": 0.8718518, + "num_input_tokens_seen": 92623136, + "router_z_loss_mlp": 0.53076172, + "step": 1118, + "time_per_iteration": 2.5644795894622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011837, + "balance_loss_mlp": 1.15928602, + "epoch": 0.2152751058099269, + "flos": 1496162202624.0, + "grad_norm": 0.058896568697900505, + "language_loss": 0.81456429, + "learning_rate": 0.0009126591995781883, + "loss": 0.82640129, + "num_input_tokens_seen": 92842608, + "router_z_loss_mlp": 0.24414062, + "step": 1119, + "time_per_iteration": 4.683669090270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204932, + "balance_loss_mlp": 1.15031052, + "epoch": 0.21546748749519046, + "flos": 494005630464.0, + "grad_norm": 0.07135490421069918, + "language_loss": 0.85804355, + "learning_rate": 0.0009124832016254005, + "loss": 0.87009287, + "num_input_tokens_seen": 92912960, + "router_z_loss_mlp": 0.54663086, + "step": 1120, + "time_per_iteration": 2.6158647537231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206508, + "balance_loss_mlp": 1.14571166, + "epoch": 0.21565986918045402, + "flos": 634531138560.0, + "grad_norm": 0.055578106746994274, + "language_loss": 0.89113355, + "learning_rate": 0.0009123070435324316, + "loss": 0.9031986, + "num_input_tokens_seen": 92982272, + "router_z_loss_mlp": 0.60791016, + "step": 1121, + "time_per_iteration": 2.755823850631714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102102, + "balance_loss_mlp": 1.07988179, + "epoch": 0.21585225086571758, + "flos": 1583359570944.0, + "grad_norm": 0.03051163671975961, + "language_loss": 0.77875781, + "learning_rate": 0.0009121307253676722, + "loss": 0.78977883, + "num_input_tokens_seen": 93218752, + "router_z_loss_mlp": 0.22265625, + "step": 1122, + "time_per_iteration": 4.996071100234985 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211089, + "balance_loss_mlp": 1.15358257, + "epoch": 0.21604463255098114, + "flos": 684103242240.0, + "grad_norm": 0.06035521524280068, + "language_loss": 0.87145722, + "learning_rate": 0.0009119542471995752, + "loss": 0.88356811, + "num_input_tokens_seen": 93293968, + "router_z_loss_mlp": 0.57446289, + "step": 1123, + "time_per_iteration": 2.8323612213134766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204972, + "balance_loss_mlp": 1.14675009, + "epoch": 0.2162370142362447, + "flos": 780989133312.0, + "grad_norm": 0.060035653180353525, + "language_loss": 0.8248235, + "learning_rate": 0.0009117776090966554, + "loss": 0.83687323, + "num_input_tokens_seen": 93367088, + "router_z_loss_mlp": 0.58251953, + "step": 1124, + "time_per_iteration": 2.954216480255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216387, + "balance_loss_mlp": 1.1558764, + "epoch": 0.21642939592150828, + "flos": 1002147406848.0, + "grad_norm": 0.07791040933307145, + "language_loss": 0.876288, + "learning_rate": 0.0009116008111274899, + "loss": 0.88845193, + "num_input_tokens_seen": 93452944, + "router_z_loss_mlp": 0.60498047, + "step": 1125, + "time_per_iteration": 3.2826616764068604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102381, + "balance_loss_mlp": 1.08216333, + "epoch": 0.21662177760677184, + "flos": 1482644238336.0, + "grad_norm": 0.030294405796961115, + "language_loss": 0.79106927, + "learning_rate": 0.0009114238533607176, + "loss": 0.80209303, + "num_input_tokens_seen": 93677328, + "router_z_loss_mlp": 0.20214844, + "step": 1126, + "time_per_iteration": 4.8284173011779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202163, + "balance_loss_mlp": 1.1455152, + "epoch": 0.2168141592920354, + "flos": 887395046400.0, + "grad_norm": 0.10762952047928877, + "language_loss": 0.8553561, + "learning_rate": 0.0009112467358650396, + "loss": 0.86737764, + "num_input_tokens_seen": 93756848, + "router_z_loss_mlp": 0.56640625, + "step": 1127, + "time_per_iteration": 3.1621291637420654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192643, + "balance_loss_mlp": 1.13561273, + "epoch": 0.21700654097729896, + "flos": 545961959424.0, + "grad_norm": 0.06435190440672867, + "language_loss": 0.87181705, + "learning_rate": 0.0009110694587092192, + "loss": 0.88374346, + "num_input_tokens_seen": 93834704, + "router_z_loss_mlp": 0.56982422, + "step": 1128, + "time_per_iteration": 2.7597765922546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194699, + "balance_loss_mlp": 1.13452196, + "epoch": 0.21719892266256252, + "flos": 509522379264.0, + "grad_norm": 0.06894978951163175, + "language_loss": 0.8223331, + "learning_rate": 0.0009108920219620815, + "loss": 0.83428001, + "num_input_tokens_seen": 93904448, + "router_z_loss_mlp": 0.6015625, + "step": 1129, + "time_per_iteration": 2.6658482551574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198898, + "balance_loss_mlp": 1.14072335, + "epoch": 0.21739130434782608, + "flos": 543412177920.0, + "grad_norm": 0.06550313542995663, + "language_loss": 0.90210444, + "learning_rate": 0.0009107144256925133, + "loss": 0.91409343, + "num_input_tokens_seen": 93979312, + "router_z_loss_mlp": 0.58154297, + "step": 1130, + "time_per_iteration": 2.7298777103424072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211101, + "balance_loss_mlp": 1.15464389, + "epoch": 0.21758368603308964, + "flos": 616847477760.0, + "grad_norm": 0.08430456831611369, + "language_loss": 0.82975614, + "learning_rate": 0.0009105366699694638, + "loss": 0.84186715, + "num_input_tokens_seen": 94052032, + "router_z_loss_mlp": 0.56445312, + "step": 1131, + "time_per_iteration": 2.7422807216644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121305, + "balance_loss_mlp": 1.15263498, + "epoch": 0.2177760677183532, + "flos": 635116640256.0, + "grad_norm": 0.05499133039406014, + "language_loss": 0.82219702, + "learning_rate": 0.0009103587548619439, + "loss": 0.83432752, + "num_input_tokens_seen": 94124944, + "router_z_loss_mlp": 0.60400391, + "step": 1132, + "time_per_iteration": 2.8834011554718018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202147, + "balance_loss_mlp": 1.14468873, + "epoch": 0.2179684494036168, + "flos": 532463818752.0, + "grad_norm": 0.12855794167944481, + "language_loss": 0.87174821, + "learning_rate": 0.0009101806804390261, + "loss": 0.88376963, + "num_input_tokens_seen": 94200384, + "router_z_loss_mlp": 0.57421875, + "step": 1133, + "time_per_iteration": 2.8493435382843018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186046, + "balance_loss_mlp": 1.13082814, + "epoch": 0.21816083108888035, + "flos": 475219975680.0, + "grad_norm": 0.07046865468216726, + "language_loss": 0.91345453, + "learning_rate": 0.0009100024467698453, + "loss": 0.92531502, + "num_input_tokens_seen": 94266992, + "router_z_loss_mlp": 0.55175781, + "step": 1134, + "time_per_iteration": 2.6036603450775146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184337, + "balance_loss_mlp": 1.12613893, + "epoch": 0.2183532127741439, + "flos": 577467532800.0, + "grad_norm": 0.07929007457036284, + "language_loss": 0.8353889, + "learning_rate": 0.0009098240539235981, + "loss": 0.84723222, + "num_input_tokens_seen": 94334304, + "router_z_loss_mlp": 0.58227539, + "step": 1135, + "time_per_iteration": 2.6736483573913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176396, + "balance_loss_mlp": 1.12122619, + "epoch": 0.21854559445940747, + "flos": 594120780288.0, + "grad_norm": 0.06661367385494366, + "language_loss": 0.88575935, + "learning_rate": 0.0009096455019695423, + "loss": 0.89752328, + "num_input_tokens_seen": 94413296, + "router_z_loss_mlp": 0.55224609, + "step": 1136, + "time_per_iteration": 2.8438823223114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172318, + "balance_loss_mlp": 1.1156702, + "epoch": 0.21873797614467103, + "flos": 408680764416.0, + "grad_norm": 0.07075177433605506, + "language_loss": 0.90707165, + "learning_rate": 0.000909466790976998, + "loss": 0.91879487, + "num_input_tokens_seen": 94475840, + "router_z_loss_mlp": 0.56616211, + "step": 1137, + "time_per_iteration": 2.4795870780944824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185042, + "balance_loss_mlp": 1.12801182, + "epoch": 0.21893035782993459, + "flos": 894189818880.0, + "grad_norm": 0.07051320604800417, + "language_loss": 0.83409071, + "learning_rate": 0.0009092879210153473, + "loss": 0.84594113, + "num_input_tokens_seen": 94555184, + "router_z_loss_mlp": 0.57080078, + "step": 1138, + "time_per_iteration": 3.1328911781311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186779, + "balance_loss_mlp": 1.13284826, + "epoch": 0.21912273951519814, + "flos": 467627157504.0, + "grad_norm": 0.06458215213012623, + "language_loss": 0.89566886, + "learning_rate": 0.0009091088921540333, + "loss": 0.90753663, + "num_input_tokens_seen": 94622656, + "router_z_loss_mlp": 0.54003906, + "step": 1139, + "time_per_iteration": 2.5608675479888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046887, + "balance_loss_mlp": 1.03115106, + "epoch": 0.2193151212004617, + "flos": 1532043445248.0, + "grad_norm": 0.027642480599540168, + "language_loss": 0.75508678, + "learning_rate": 0.0009089297044625615, + "loss": 0.76555562, + "num_input_tokens_seen": 94856496, + "router_z_loss_mlp": 0.15722656, + "step": 1140, + "time_per_iteration": 4.908522605895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117392, + "balance_loss_mlp": 1.11908412, + "epoch": 0.2195075028857253, + "flos": 591175646208.0, + "grad_norm": 0.0906322081519832, + "language_loss": 0.84775734, + "learning_rate": 0.0009087503580104985, + "loss": 0.85949653, + "num_input_tokens_seen": 94926880, + "router_z_loss_mlp": 0.54882812, + "step": 1141, + "time_per_iteration": 2.696129083633423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181851, + "balance_loss_mlp": 1.12558413, + "epoch": 0.21969988457098885, + "flos": 636329862144.0, + "grad_norm": 0.16226849767110665, + "language_loss": 0.80068243, + "learning_rate": 0.0009085708528674728, + "loss": 0.81250095, + "num_input_tokens_seen": 95000528, + "router_z_loss_mlp": 0.56347656, + "step": 1142, + "time_per_iteration": 2.7995505332946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157793, + "balance_loss_mlp": 1.09985733, + "epoch": 0.2198922662562524, + "flos": 912350324736.0, + "grad_norm": 0.08217329602320493, + "language_loss": 0.874843, + "learning_rate": 0.0009083911891031745, + "loss": 0.88642091, + "num_input_tokens_seen": 95081040, + "router_z_loss_mlp": 0.57958984, + "step": 1143, + "time_per_iteration": 3.1351919174194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115076, + "balance_loss_mlp": 1.09578109, + "epoch": 0.22008464794151597, + "flos": 822980528640.0, + "grad_norm": 0.06169995263224583, + "language_loss": 0.92273706, + "learning_rate": 0.0009082113667873553, + "loss": 0.93424463, + "num_input_tokens_seen": 95167328, + "router_z_loss_mlp": 0.55029297, + "step": 1144, + "time_per_iteration": 3.1171934604644775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153616, + "balance_loss_mlp": 1.10087752, + "epoch": 0.22027702962677953, + "flos": 459656239104.0, + "grad_norm": 0.07183124767141379, + "language_loss": 0.91221762, + "learning_rate": 0.0009080313859898283, + "loss": 0.9237538, + "num_input_tokens_seen": 95230304, + "router_z_loss_mlp": 0.52758789, + "step": 1145, + "time_per_iteration": 2.506591796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153774, + "balance_loss_mlp": 1.09986758, + "epoch": 0.2204694113120431, + "flos": 531255739392.0, + "grad_norm": 0.07077080612529597, + "language_loss": 0.92340779, + "learning_rate": 0.0009078512467804684, + "loss": 0.93494552, + "num_input_tokens_seen": 95299520, + "router_z_loss_mlp": 0.53881836, + "step": 1146, + "time_per_iteration": 2.591327667236328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172392, + "balance_loss_mlp": 1.11800838, + "epoch": 0.22066179299730665, + "flos": 522642419712.0, + "grad_norm": 0.07651793216141736, + "language_loss": 0.91144007, + "learning_rate": 0.0009076709492292119, + "loss": 0.92316401, + "num_input_tokens_seen": 95368912, + "router_z_loss_mlp": 0.54418945, + "step": 1147, + "time_per_iteration": 2.609628438949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169189, + "balance_loss_mlp": 1.11723804, + "epoch": 0.2208541746825702, + "flos": 546451287552.0, + "grad_norm": 0.07920780045429675, + "language_loss": 0.89603102, + "learning_rate": 0.0009074904934060562, + "loss": 0.90772295, + "num_input_tokens_seen": 95440800, + "router_z_loss_mlp": 0.51928711, + "step": 1148, + "time_per_iteration": 2.6755712032318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173959, + "balance_loss_mlp": 1.11697721, + "epoch": 0.22104655636783377, + "flos": 708734748672.0, + "grad_norm": 0.08245317941840166, + "language_loss": 0.8559376, + "learning_rate": 0.0009073098793810607, + "loss": 0.86767721, + "num_input_tokens_seen": 95519904, + "router_z_loss_mlp": 0.57006836, + "step": 1149, + "time_per_iteration": 2.9874348640441895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177675, + "balance_loss_mlp": 1.12293434, + "epoch": 0.22123893805309736, + "flos": 584867630592.0, + "grad_norm": 0.08525751827962168, + "language_loss": 0.88982397, + "learning_rate": 0.000907129107224346, + "loss": 0.90160072, + "num_input_tokens_seen": 95591568, + "router_z_loss_mlp": 0.54785156, + "step": 1150, + "time_per_iteration": 2.739461660385132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180589, + "balance_loss_mlp": 1.12658715, + "epoch": 0.22143131973836092, + "flos": 492251323392.0, + "grad_norm": 0.05205595876874212, + "language_loss": 0.88991034, + "learning_rate": 0.0009069481770060939, + "loss": 0.90171623, + "num_input_tokens_seen": 95664480, + "router_z_loss_mlp": 0.54077148, + "step": 1151, + "time_per_iteration": 2.7024669647216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187248, + "balance_loss_mlp": 1.13212562, + "epoch": 0.22162370142362448, + "flos": 1079674251264.0, + "grad_norm": 0.06739531662392768, + "language_loss": 0.84448045, + "learning_rate": 0.000906767088796548, + "loss": 0.85635293, + "num_input_tokens_seen": 95754400, + "router_z_loss_mlp": 0.55126953, + "step": 1152, + "time_per_iteration": 3.4467508792877197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117836, + "balance_loss_mlp": 1.12571764, + "epoch": 0.22181608310888803, + "flos": 492508283904.0, + "grad_norm": 0.05411857974090042, + "language_loss": 0.8779093, + "learning_rate": 0.0009065858426660127, + "loss": 0.8896929, + "num_input_tokens_seen": 95826944, + "router_z_loss_mlp": 0.52661133, + "step": 1153, + "time_per_iteration": 2.6216752529144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182109, + "balance_loss_mlp": 1.12736845, + "epoch": 0.2220084647941516, + "flos": 724014360576.0, + "grad_norm": 0.07769931213358174, + "language_loss": 0.84979808, + "learning_rate": 0.0009064044386848543, + "loss": 0.86161917, + "num_input_tokens_seen": 95902688, + "router_z_loss_mlp": 0.54833984, + "step": 1154, + "time_per_iteration": 2.91601824760437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172512, + "balance_loss_mlp": 1.11381316, + "epoch": 0.22220084647941515, + "flos": 489239377920.0, + "grad_norm": 0.0711084155390928, + "language_loss": 0.89741302, + "learning_rate": 0.0009062228769234997, + "loss": 0.90913814, + "num_input_tokens_seen": 95969952, + "router_z_loss_mlp": 0.58691406, + "step": 1155, + "time_per_iteration": 2.5972864627838135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116208, + "balance_loss_mlp": 1.10690951, + "epoch": 0.2223932281646787, + "flos": 536278952448.0, + "grad_norm": 0.09100503083112628, + "language_loss": 0.81526613, + "learning_rate": 0.0009060411574524376, + "loss": 0.82688695, + "num_input_tokens_seen": 96037344, + "router_z_loss_mlp": 0.55224609, + "step": 1156, + "time_per_iteration": 2.6763274669647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182591, + "balance_loss_mlp": 1.12684917, + "epoch": 0.22258560984994227, + "flos": 931420104192.0, + "grad_norm": 0.06563385289017937, + "language_loss": 0.88585329, + "learning_rate": 0.0009058592803422178, + "loss": 0.89767921, + "num_input_tokens_seen": 96115616, + "router_z_loss_mlp": 0.55810547, + "step": 1157, + "time_per_iteration": 3.1414153575897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026819, + "balance_loss_mlp": 1.00955701, + "epoch": 0.22277799153520586, + "flos": 1199675930112.0, + "grad_norm": 0.012760142008093896, + "language_loss": 0.78710288, + "learning_rate": 0.0009056772456634512, + "loss": 0.79737109, + "num_input_tokens_seen": 96333600, + "router_z_loss_mlp": 0.17285156, + "step": 1158, + "time_per_iteration": 4.802858352661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171905, + "balance_loss_mlp": 1.12126482, + "epoch": 0.22297037322046942, + "flos": 501304412160.0, + "grad_norm": 0.060083734909452326, + "language_loss": 0.90886426, + "learning_rate": 0.00090549505348681, + "loss": 0.92058331, + "num_input_tokens_seen": 96402544, + "router_z_loss_mlp": 0.50683594, + "step": 1159, + "time_per_iteration": 2.5810928344726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168069, + "balance_loss_mlp": 1.11137354, + "epoch": 0.22316275490573298, + "flos": 752752465920.0, + "grad_norm": 0.07069918091424116, + "language_loss": 0.85149121, + "learning_rate": 0.0009053127038830275, + "loss": 0.86317194, + "num_input_tokens_seen": 96487600, + "router_z_loss_mlp": 0.56689453, + "step": 1160, + "time_per_iteration": 3.009434223175049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162107, + "balance_loss_mlp": 1.1050297, + "epoch": 0.22335513659099654, + "flos": 514802552832.0, + "grad_norm": 0.07200535138488619, + "language_loss": 0.87409687, + "learning_rate": 0.000905130196922898, + "loss": 0.88571799, + "num_input_tokens_seen": 96554912, + "router_z_loss_mlp": 0.57080078, + "step": 1161, + "time_per_iteration": 2.5972068309783936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157457, + "balance_loss_mlp": 1.10223973, + "epoch": 0.2235475182762601, + "flos": 484530024960.0, + "grad_norm": 0.053497533436564174, + "language_loss": 0.8808614, + "learning_rate": 0.0009049475326772769, + "loss": 0.89243597, + "num_input_tokens_seen": 96624192, + "router_z_loss_mlp": 0.55224609, + "step": 1162, + "time_per_iteration": 2.580254316329956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167432, + "balance_loss_mlp": 1.11092722, + "epoch": 0.22373989996152366, + "flos": 469971735552.0, + "grad_norm": 0.105825736895628, + "language_loss": 0.83639884, + "learning_rate": 0.0009047647112170811, + "loss": 0.84807312, + "num_input_tokens_seen": 96701040, + "router_z_loss_mlp": 0.56469727, + "step": 1163, + "time_per_iteration": 2.7509572505950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170262, + "balance_loss_mlp": 1.11041939, + "epoch": 0.22393228164678722, + "flos": 1271012249088.0, + "grad_norm": 0.11729347611284674, + "language_loss": 0.8833853, + "learning_rate": 0.0009045817326132876, + "loss": 0.89508796, + "num_input_tokens_seen": 96791200, + "router_z_loss_mlp": 0.59814453, + "step": 1164, + "time_per_iteration": 3.6648380756378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170775, + "balance_loss_mlp": 1.11226714, + "epoch": 0.22412466333205078, + "flos": 596334680064.0, + "grad_norm": 0.05704665841604838, + "language_loss": 0.83974147, + "learning_rate": 0.0009043985969369357, + "loss": 0.85144925, + "num_input_tokens_seen": 96869360, + "router_z_loss_mlp": 0.58544922, + "step": 1165, + "time_per_iteration": 2.868560314178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176977, + "balance_loss_mlp": 1.11665666, + "epoch": 0.22431704501731436, + "flos": 608434219008.0, + "grad_norm": 0.059940537627208516, + "language_loss": 0.84960037, + "learning_rate": 0.0009042153042591245, + "loss": 0.86137015, + "num_input_tokens_seen": 96945840, + "router_z_loss_mlp": 0.60302734, + "step": 1166, + "time_per_iteration": 2.8023743629455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116839, + "balance_loss_mlp": 1.11271954, + "epoch": 0.22450942670257792, + "flos": 906583394304.0, + "grad_norm": 0.054742371261080745, + "language_loss": 0.85761929, + "learning_rate": 0.0009040318546510146, + "loss": 0.86930317, + "num_input_tokens_seen": 97029296, + "router_z_loss_mlp": 0.55639648, + "step": 1167, + "time_per_iteration": 3.141993999481201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117745, + "balance_loss_mlp": 1.1215651, + "epoch": 0.22470180838784148, + "flos": 565301182464.0, + "grad_norm": 0.07712318573741421, + "language_loss": 0.8582288, + "learning_rate": 0.0009038482481838275, + "loss": 0.87000328, + "num_input_tokens_seen": 97097776, + "router_z_loss_mlp": 0.55957031, + "step": 1168, + "time_per_iteration": 2.675204038619995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116517, + "balance_loss_mlp": 1.1128844, + "epoch": 0.22489419007310504, + "flos": 834469972992.0, + "grad_norm": 0.05640688657343365, + "language_loss": 0.88303328, + "learning_rate": 0.0009036644849288455, + "loss": 0.89468497, + "num_input_tokens_seen": 97181424, + "router_z_loss_mlp": 0.52319336, + "step": 1169, + "time_per_iteration": 3.0777511596679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148681, + "balance_loss_mlp": 1.09441662, + "epoch": 0.2250865717583686, + "flos": 581057639424.0, + "grad_norm": 0.07174166621143864, + "language_loss": 0.86291218, + "learning_rate": 0.0009034805649574118, + "loss": 0.87439895, + "num_input_tokens_seen": 97252128, + "router_z_loss_mlp": 0.54394531, + "step": 1170, + "time_per_iteration": 2.7120091915130615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157496, + "balance_loss_mlp": 1.10513926, + "epoch": 0.22527895344363216, + "flos": 600406401024.0, + "grad_norm": 0.05497638968028837, + "language_loss": 0.85883957, + "learning_rate": 0.0009032964883409308, + "loss": 0.87041461, + "num_input_tokens_seen": 97326640, + "router_z_loss_mlp": 0.52441406, + "step": 1171, + "time_per_iteration": 2.8770556449890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104728, + "balance_loss_mlp": 1.03001809, + "epoch": 0.22547133512889572, + "flos": 1440751587840.0, + "grad_norm": 0.027786176955518046, + "language_loss": 0.73050535, + "learning_rate": 0.000903112255150867, + "loss": 0.74097812, + "num_input_tokens_seen": 97553952, + "router_z_loss_mlp": 0.17285156, + "step": 1172, + "time_per_iteration": 4.997943639755249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150837, + "balance_loss_mlp": 1.0977174, + "epoch": 0.22566371681415928, + "flos": 490618156032.0, + "grad_norm": 0.06380875138992877, + "language_loss": 0.87640917, + "learning_rate": 0.0009029278654587462, + "loss": 0.88791752, + "num_input_tokens_seen": 97623584, + "router_z_loss_mlp": 0.53173828, + "step": 1173, + "time_per_iteration": 2.6070940494537354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148484, + "balance_loss_mlp": 1.09546018, + "epoch": 0.22585609849942284, + "flos": 604616887296.0, + "grad_norm": 0.057211485944593306, + "language_loss": 0.83189976, + "learning_rate": 0.0009027433193361548, + "loss": 0.84338462, + "num_input_tokens_seen": 97695952, + "router_z_loss_mlp": 0.53027344, + "step": 1174, + "time_per_iteration": 2.7072205543518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114932, + "balance_loss_mlp": 1.09708285, + "epoch": 0.22604848018468643, + "flos": 635568892416.0, + "grad_norm": 0.06182212989299174, + "language_loss": 0.86948568, + "learning_rate": 0.00090255861685474, + "loss": 0.88097882, + "num_input_tokens_seen": 97764544, + "router_z_loss_mlp": 0.52246094, + "step": 1175, + "time_per_iteration": 2.7387607097625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146248, + "balance_loss_mlp": 1.09284246, + "epoch": 0.22624086186995, + "flos": 479875000320.0, + "grad_norm": 0.06871471519475823, + "language_loss": 0.92170686, + "learning_rate": 0.0009023737580862095, + "loss": 0.93316931, + "num_input_tokens_seen": 97830976, + "router_z_loss_mlp": 0.53442383, + "step": 1176, + "time_per_iteration": 2.6016342639923096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160546, + "balance_loss_mlp": 1.11035883, + "epoch": 0.22643324355521355, + "flos": 495814265856.0, + "grad_norm": 0.0563237464245993, + "language_loss": 0.83948356, + "learning_rate": 0.0009021887431023321, + "loss": 0.851089, + "num_input_tokens_seen": 97898800, + "router_z_loss_mlp": 0.50219727, + "step": 1177, + "time_per_iteration": 2.5911412239074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161678, + "balance_loss_mlp": 1.11063254, + "epoch": 0.2266256252404771, + "flos": 561552860160.0, + "grad_norm": 0.06510699727290163, + "language_loss": 0.88054293, + "learning_rate": 0.0009020035719749369, + "loss": 0.8921597, + "num_input_tokens_seen": 97974112, + "router_z_loss_mlp": 0.51098633, + "step": 1178, + "time_per_iteration": 2.747715473175049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182179, + "balance_loss_mlp": 1.1255312, + "epoch": 0.22681800692574067, + "flos": 579688399872.0, + "grad_norm": 0.0760827261000747, + "language_loss": 0.78592283, + "learning_rate": 0.0009018182447759136, + "loss": 0.79774463, + "num_input_tokens_seen": 98056640, + "router_z_loss_mlp": 0.56616211, + "step": 1179, + "time_per_iteration": 2.9912376403808594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177946, + "balance_loss_mlp": 1.12287188, + "epoch": 0.22701038861100423, + "flos": 740166170112.0, + "grad_norm": 0.05857060866656224, + "language_loss": 0.80403864, + "learning_rate": 0.0009016327615772126, + "loss": 0.81581813, + "num_input_tokens_seen": 98135952, + "router_z_loss_mlp": 0.55126953, + "step": 1180, + "time_per_iteration": 2.951934337615967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178867, + "balance_loss_mlp": 1.1241498, + "epoch": 0.2272027702962678, + "flos": 577257560064.0, + "grad_norm": 0.07803208794693026, + "language_loss": 0.88654709, + "learning_rate": 0.0009014471224508451, + "loss": 0.8983357, + "num_input_tokens_seen": 98204288, + "router_z_loss_mlp": 0.54711914, + "step": 1181, + "time_per_iteration": 2.6834704875946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175396, + "balance_loss_mlp": 1.12280107, + "epoch": 0.22739515198153135, + "flos": 544267123200.0, + "grad_norm": 0.07891792311297686, + "language_loss": 0.84171915, + "learning_rate": 0.0009012613274688823, + "loss": 0.85347319, + "num_input_tokens_seen": 98269856, + "router_z_loss_mlp": 0.52636719, + "step": 1182, + "time_per_iteration": 2.6773135662078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193932, + "balance_loss_mlp": 1.13711679, + "epoch": 0.22758753366679493, + "flos": 440163942912.0, + "grad_norm": 0.06685387295915801, + "language_loss": 0.88334668, + "learning_rate": 0.0009010753767034565, + "loss": 0.89528602, + "num_input_tokens_seen": 98335632, + "router_z_loss_mlp": 0.56811523, + "step": 1183, + "time_per_iteration": 2.53671932220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192681, + "balance_loss_mlp": 1.13732028, + "epoch": 0.2277799153520585, + "flos": 729447607296.0, + "grad_norm": 0.05676884979808662, + "language_loss": 0.79381895, + "learning_rate": 0.0009008892702267599, + "loss": 0.80574578, + "num_input_tokens_seen": 98420592, + "router_z_loss_mlp": 0.55297852, + "step": 1184, + "time_per_iteration": 2.9609317779541016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218637, + "balance_loss_mlp": 1.16055822, + "epoch": 0.22797229703732205, + "flos": 526894751232.0, + "grad_norm": 0.11080255811352213, + "language_loss": 0.897048, + "learning_rate": 0.0009007030081110457, + "loss": 0.9092344, + "num_input_tokens_seen": 98488096, + "router_z_loss_mlp": 0.58105469, + "step": 1185, + "time_per_iteration": 2.6087658405303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212656, + "balance_loss_mlp": 1.15872598, + "epoch": 0.2281646787225856, + "flos": 535431347712.0, + "grad_norm": 0.06215110995007368, + "language_loss": 0.8510564, + "learning_rate": 0.000900516590428627, + "loss": 0.8631829, + "num_input_tokens_seen": 98561664, + "router_z_loss_mlp": 0.53955078, + "step": 1186, + "time_per_iteration": 2.66469407081604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206955, + "balance_loss_mlp": 1.15416956, + "epoch": 0.22835706040784917, + "flos": 541381086720.0, + "grad_norm": 0.07510292852734143, + "language_loss": 0.90231287, + "learning_rate": 0.0009003300172518778, + "loss": 0.91438246, + "num_input_tokens_seen": 98634336, + "router_z_loss_mlp": 0.52807617, + "step": 1187, + "time_per_iteration": 2.6872987747192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189379, + "balance_loss_mlp": 1.13559163, + "epoch": 0.22854944209311273, + "flos": 790637635584.0, + "grad_norm": 0.06187047573177096, + "language_loss": 0.84854043, + "learning_rate": 0.0009001432886532321, + "loss": 0.86043417, + "num_input_tokens_seen": 98709600, + "router_z_loss_mlp": 0.53808594, + "step": 1188, + "time_per_iteration": 2.961327314376831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185796, + "balance_loss_mlp": 1.13248527, + "epoch": 0.2287418237783763, + "flos": 469280148480.0, + "grad_norm": 0.0670290505569486, + "language_loss": 0.87277937, + "learning_rate": 0.0008999564047051843, + "loss": 0.88463724, + "num_input_tokens_seen": 98775024, + "router_z_loss_mlp": 0.53320312, + "step": 1189, + "time_per_iteration": 2.5120058059692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119823, + "balance_loss_mlp": 1.14773321, + "epoch": 0.22893420546363985, + "flos": 468029850624.0, + "grad_norm": 0.07775817493182749, + "language_loss": 0.85562766, + "learning_rate": 0.0008997693654802894, + "loss": 0.86760998, + "num_input_tokens_seen": 98845248, + "router_z_loss_mlp": 0.50537109, + "step": 1190, + "time_per_iteration": 2.6584115028381348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203195, + "balance_loss_mlp": 1.15343666, + "epoch": 0.22912658714890344, + "flos": 626258843136.0, + "grad_norm": 0.08092173087187808, + "language_loss": 0.87245274, + "learning_rate": 0.0008995821710511625, + "loss": 0.88448465, + "num_input_tokens_seen": 98913584, + "router_z_loss_mlp": 0.49780273, + "step": 1191, + "time_per_iteration": 2.75514817237854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189711, + "balance_loss_mlp": 1.14376771, + "epoch": 0.229318968834167, + "flos": 503031555072.0, + "grad_norm": 0.058050392882622655, + "language_loss": 0.85975361, + "learning_rate": 0.0008993948214904786, + "loss": 0.8716507, + "num_input_tokens_seen": 98978608, + "router_z_loss_mlp": 0.45922852, + "step": 1192, + "time_per_iteration": 2.5808064937591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132885, + "balance_loss_mlp": 1.11629128, + "epoch": 0.22951135051943056, + "flos": 1374827613696.0, + "grad_norm": 0.04438752541684951, + "language_loss": 0.78422213, + "learning_rate": 0.0008992073168709733, + "loss": 0.795551, + "num_input_tokens_seen": 99207424, + "router_z_loss_mlp": 0.16601562, + "step": 1193, + "time_per_iteration": 4.915351629257202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170271, + "balance_loss_mlp": 1.11338401, + "epoch": 0.22970373220469412, + "flos": 644345197056.0, + "grad_norm": 0.06516354982073377, + "language_loss": 0.79226351, + "learning_rate": 0.0008990196572654427, + "loss": 0.80396616, + "num_input_tokens_seen": 99290592, + "router_z_loss_mlp": 0.56933594, + "step": 1194, + "time_per_iteration": 2.914353609085083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159508, + "balance_loss_mlp": 1.10982203, + "epoch": 0.22989611388995768, + "flos": 500209758720.0, + "grad_norm": 0.053033431306196574, + "language_loss": 0.88186455, + "learning_rate": 0.0008988318427467426, + "loss": 0.89345956, + "num_input_tokens_seen": 99366096, + "router_z_loss_mlp": 0.49707031, + "step": 1195, + "time_per_iteration": 2.763303756713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146949, + "balance_loss_mlp": 1.09754825, + "epoch": 0.23008849557522124, + "flos": 1096522790400.0, + "grad_norm": 0.06471781599702997, + "language_loss": 0.87142104, + "learning_rate": 0.0008986438733877887, + "loss": 0.88289052, + "num_input_tokens_seen": 99456768, + "router_z_loss_mlp": 0.49414062, + "step": 1196, + "time_per_iteration": 3.453037738800049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138036, + "balance_loss_mlp": 1.08901691, + "epoch": 0.2302808772604848, + "flos": 683648418816.0, + "grad_norm": 0.05831436273017673, + "language_loss": 0.84795159, + "learning_rate": 0.0008984557492615576, + "loss": 0.85933197, + "num_input_tokens_seen": 99539616, + "router_z_loss_mlp": 0.49023438, + "step": 1197, + "time_per_iteration": 2.9209883213043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147831, + "balance_loss_mlp": 1.09816873, + "epoch": 0.23047325894574835, + "flos": 528923271168.0, + "grad_norm": 0.06183090029168821, + "language_loss": 0.90001792, + "learning_rate": 0.0008982674704410854, + "loss": 0.91149628, + "num_input_tokens_seen": 99612064, + "router_z_loss_mlp": 0.49658203, + "step": 1198, + "time_per_iteration": 2.723980665206909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115357, + "balance_loss_mlp": 1.10364521, + "epoch": 0.23066564063101191, + "flos": 682766309376.0, + "grad_norm": 0.06439147944581719, + "language_loss": 0.78128076, + "learning_rate": 0.0008980790369994682, + "loss": 0.7928164, + "num_input_tokens_seen": 99691040, + "router_z_loss_mlp": 0.49926758, + "step": 1199, + "time_per_iteration": 2.968733787536621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148741, + "balance_loss_mlp": 1.09817219, + "epoch": 0.2308580223162755, + "flos": 558523662336.0, + "grad_norm": 0.060755539801175186, + "language_loss": 0.8790828, + "learning_rate": 0.000897890449009863, + "loss": 0.89057022, + "num_input_tokens_seen": 99762016, + "router_z_loss_mlp": 0.50561523, + "step": 1200, + "time_per_iteration": 2.7373695373535156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159063, + "balance_loss_mlp": 1.11052144, + "epoch": 0.23105040400153906, + "flos": 555669932544.0, + "grad_norm": 0.09508340337221405, + "language_loss": 0.9041636, + "learning_rate": 0.0008977017065454853, + "loss": 0.91575426, + "num_input_tokens_seen": 99835552, + "router_z_loss_mlp": 0.4855957, + "step": 1201, + "time_per_iteration": 2.6561479568481445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172289, + "balance_loss_mlp": 1.12393796, + "epoch": 0.23124278568680262, + "flos": 704788936704.0, + "grad_norm": 0.06896397472633412, + "language_loss": 0.8110497, + "learning_rate": 0.0008975128096796121, + "loss": 0.82277262, + "num_input_tokens_seen": 99910784, + "router_z_loss_mlp": 0.48413086, + "step": 1202, + "time_per_iteration": 2.850882053375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166428, + "balance_loss_mlp": 1.11583591, + "epoch": 0.23143516737206618, + "flos": 612768043008.0, + "grad_norm": 0.07234791297382964, + "language_loss": 0.86751068, + "learning_rate": 0.0008973237584855794, + "loss": 0.87917495, + "num_input_tokens_seen": 99991120, + "router_z_loss_mlp": 0.50610352, + "step": 1203, + "time_per_iteration": 2.898651599884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201199, + "balance_loss_mlp": 1.14912796, + "epoch": 0.23162754905732974, + "flos": 389242796544.0, + "grad_norm": 0.0647782155366788, + "language_loss": 0.82535917, + "learning_rate": 0.0008971345530367832, + "loss": 0.83737111, + "num_input_tokens_seen": 100053888, + "router_z_loss_mlp": 0.52172852, + "step": 1204, + "time_per_iteration": 2.479710102081299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188056, + "balance_loss_mlp": 1.13743997, + "epoch": 0.2318199307425933, + "flos": 667778535936.0, + "grad_norm": 0.07976239468268423, + "language_loss": 0.86050093, + "learning_rate": 0.0008969451934066799, + "loss": 0.87238145, + "num_input_tokens_seen": 100124176, + "router_z_loss_mlp": 0.50561523, + "step": 1205, + "time_per_iteration": 2.7891948223114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190172, + "balance_loss_mlp": 1.13834012, + "epoch": 0.23201231242785686, + "flos": 666399757824.0, + "grad_norm": 0.08603625620414594, + "language_loss": 0.8068459, + "learning_rate": 0.0008967556796687854, + "loss": 0.81874764, + "num_input_tokens_seen": 100205296, + "router_z_loss_mlp": 0.51879883, + "step": 1206, + "time_per_iteration": 2.879742383956909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182453, + "balance_loss_mlp": 1.1313839, + "epoch": 0.23220469411312042, + "flos": 748816565760.0, + "grad_norm": 0.06613018456643845, + "language_loss": 0.8416872, + "learning_rate": 0.0008965660118966752, + "loss": 0.85351169, + "num_input_tokens_seen": 100279440, + "router_z_loss_mlp": 0.51098633, + "step": 1207, + "time_per_iteration": 2.8900513648986816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163905, + "balance_loss_mlp": 1.11610246, + "epoch": 0.232397075798384, + "flos": 667061982720.0, + "grad_norm": 0.06058209183838784, + "language_loss": 0.90754479, + "learning_rate": 0.0008963761901639851, + "loss": 0.91918385, + "num_input_tokens_seen": 100354512, + "router_z_loss_mlp": 0.47802734, + "step": 1208, + "time_per_iteration": 2.805534601211548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176094, + "balance_loss_mlp": 1.12457156, + "epoch": 0.23258945748364757, + "flos": 610218261504.0, + "grad_norm": 0.06993420403149982, + "language_loss": 0.83909518, + "learning_rate": 0.0008961862145444103, + "loss": 0.85085618, + "num_input_tokens_seen": 100426848, + "router_z_loss_mlp": 0.51538086, + "step": 1209, + "time_per_iteration": 2.6882550716400146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197419, + "balance_loss_mlp": 1.14587319, + "epoch": 0.23278183916891113, + "flos": 489651982848.0, + "grad_norm": 0.08646594069324176, + "language_loss": 0.85994279, + "learning_rate": 0.0008959960851117059, + "loss": 0.87191701, + "num_input_tokens_seen": 100496176, + "router_z_loss_mlp": 0.51611328, + "step": 1210, + "time_per_iteration": 2.6176648139953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118703, + "balance_loss_mlp": 1.13340998, + "epoch": 0.23297422085417469, + "flos": 511585403904.0, + "grad_norm": 0.06670419812311852, + "language_loss": 0.84013158, + "learning_rate": 0.0008958058019396868, + "loss": 0.85200191, + "num_input_tokens_seen": 100575072, + "router_z_loss_mlp": 0.53637695, + "step": 1211, + "time_per_iteration": 2.7867624759674072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177443, + "balance_loss_mlp": 1.12754154, + "epoch": 0.23316660253943824, + "flos": 546421552128.0, + "grad_norm": 0.08722593193124767, + "language_loss": 0.87226063, + "learning_rate": 0.0008956153651022274, + "loss": 0.88403505, + "num_input_tokens_seen": 100648304, + "router_z_loss_mlp": 0.49926758, + "step": 1212, + "time_per_iteration": 2.671705961227417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169147, + "balance_loss_mlp": 1.11726665, + "epoch": 0.2333589842247018, + "flos": 510256184832.0, + "grad_norm": 0.06082314874639417, + "language_loss": 0.84296238, + "learning_rate": 0.0008954247746732618, + "loss": 0.85465384, + "num_input_tokens_seen": 100717616, + "router_z_loss_mlp": 0.51904297, + "step": 1213, + "time_per_iteration": 2.58005952835083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163838, + "balance_loss_mlp": 1.1156534, + "epoch": 0.23355136590996536, + "flos": 663148104192.0, + "grad_norm": 0.06006865966510304, + "language_loss": 0.91204965, + "learning_rate": 0.0008952340307267837, + "loss": 0.92368799, + "num_input_tokens_seen": 100797056, + "router_z_loss_mlp": 0.48144531, + "step": 1214, + "time_per_iteration": 2.842824697494507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149903, + "balance_loss_mlp": 1.09983516, + "epoch": 0.23374374759522892, + "flos": 508457461248.0, + "grad_norm": 0.07140080071894721, + "language_loss": 0.84202802, + "learning_rate": 0.0008950431333368468, + "loss": 0.85352707, + "num_input_tokens_seen": 100863632, + "router_z_loss_mlp": 0.50097656, + "step": 1215, + "time_per_iteration": 2.5616672039031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155221, + "balance_loss_mlp": 1.10656011, + "epoch": 0.2339361292804925, + "flos": 1294455499776.0, + "grad_norm": 0.083723319453273, + "language_loss": 0.85366404, + "learning_rate": 0.0008948520825775634, + "loss": 0.86521626, + "num_input_tokens_seen": 100950272, + "router_z_loss_mlp": 0.48657227, + "step": 1216, + "time_per_iteration": 3.652561664581299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114764, + "balance_loss_mlp": 1.09895492, + "epoch": 0.23412851096575607, + "flos": 705928006656.0, + "grad_norm": 0.05781662545039131, + "language_loss": 0.84181142, + "learning_rate": 0.0008946608785231067, + "loss": 0.85328782, + "num_input_tokens_seen": 101031008, + "router_z_loss_mlp": 0.48706055, + "step": 1217, + "time_per_iteration": 2.861449956893921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131497, + "balance_loss_mlp": 1.08352745, + "epoch": 0.23432089265101963, + "flos": 438263903232.0, + "grad_norm": 0.06428977242182035, + "language_loss": 0.85432529, + "learning_rate": 0.0008944695212477084, + "loss": 0.86564028, + "num_input_tokens_seen": 101094688, + "router_z_loss_mlp": 0.47973633, + "step": 1218, + "time_per_iteration": 2.540524959564209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148618, + "balance_loss_mlp": 1.09907508, + "epoch": 0.2345132743362832, + "flos": 480939918336.0, + "grad_norm": 0.060914019840806265, + "language_loss": 0.86493349, + "learning_rate": 0.0008942780108256599, + "loss": 0.87641972, + "num_input_tokens_seen": 101163744, + "router_z_loss_mlp": 0.49560547, + "step": 1219, + "time_per_iteration": 2.613769769668579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142119, + "balance_loss_mlp": 1.09100199, + "epoch": 0.23470565602154675, + "flos": 411453001728.0, + "grad_norm": 0.05108155821019921, + "language_loss": 0.87340164, + "learning_rate": 0.0008940863473313121, + "loss": 0.88482285, + "num_input_tokens_seen": 101226480, + "router_z_loss_mlp": 0.51123047, + "step": 1220, + "time_per_iteration": 2.4549899101257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145999, + "balance_loss_mlp": 1.09702742, + "epoch": 0.2348980377068103, + "flos": 545450609664.0, + "grad_norm": 0.07702998226564757, + "language_loss": 0.8851074, + "learning_rate": 0.0008938945308390756, + "loss": 0.8965674, + "num_input_tokens_seen": 101291824, + "router_z_loss_mlp": 0.48974609, + "step": 1221, + "time_per_iteration": 2.6133854389190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149321, + "balance_loss_mlp": 1.10211444, + "epoch": 0.23509041939207387, + "flos": 575740389888.0, + "grad_norm": 0.057479910137590906, + "language_loss": 0.88199294, + "learning_rate": 0.00089370256142342, + "loss": 0.89348614, + "num_input_tokens_seen": 101367216, + "router_z_loss_mlp": 0.47192383, + "step": 1222, + "time_per_iteration": 2.713489532470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141929, + "balance_loss_mlp": 1.09286284, + "epoch": 0.23528280107733743, + "flos": 588843177984.0, + "grad_norm": 0.05442066188859713, + "language_loss": 0.85879123, + "learning_rate": 0.0008935104391588746, + "loss": 0.87021047, + "num_input_tokens_seen": 101438992, + "router_z_loss_mlp": 0.49121094, + "step": 1223, + "time_per_iteration": 2.7304563522338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145924, + "balance_loss_mlp": 1.09447336, + "epoch": 0.235475182762601, + "flos": 823328893440.0, + "grad_norm": 0.05049406517739995, + "language_loss": 0.8341555, + "learning_rate": 0.0008933181641200276, + "loss": 0.84561473, + "num_input_tokens_seen": 101534464, + "router_z_loss_mlp": 0.51513672, + "step": 1224, + "time_per_iteration": 3.122603416442871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139619, + "balance_loss_mlp": 1.09279394, + "epoch": 0.23566756444786457, + "flos": 680164770816.0, + "grad_norm": 0.0678885239417847, + "language_loss": 0.8627063, + "learning_rate": 0.0008931257363815271, + "loss": 0.87410253, + "num_input_tokens_seen": 101616496, + "router_z_loss_mlp": 0.46826172, + "step": 1225, + "time_per_iteration": 2.86014986038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142208, + "balance_loss_mlp": 1.09490585, + "epoch": 0.23585994613312813, + "flos": 701811495936.0, + "grad_norm": 0.0639396043769501, + "language_loss": 0.90318632, + "learning_rate": 0.0008929331560180798, + "loss": 0.91460842, + "num_input_tokens_seen": 101694496, + "router_z_loss_mlp": 0.47338867, + "step": 1226, + "time_per_iteration": 2.9069020748138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158077, + "balance_loss_mlp": 1.10924876, + "epoch": 0.2360523278183917, + "flos": 524176842240.0, + "grad_norm": 0.05735405278544162, + "language_loss": 0.9124881, + "learning_rate": 0.0008927404231044525, + "loss": 0.92406881, + "num_input_tokens_seen": 101766160, + "router_z_loss_mlp": 0.48828125, + "step": 1227, + "time_per_iteration": 2.745591163635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154284, + "balance_loss_mlp": 1.10571766, + "epoch": 0.23624470950365525, + "flos": 524310091776.0, + "grad_norm": 0.062458312515348655, + "language_loss": 0.8233285, + "learning_rate": 0.0008925475377154703, + "loss": 0.83487129, + "num_input_tokens_seen": 101844160, + "router_z_loss_mlp": 0.48583984, + "step": 1228, + "time_per_iteration": 2.7165796756744385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147881, + "balance_loss_mlp": 1.09664452, + "epoch": 0.2364370911889188, + "flos": 596811525120.0, + "grad_norm": 0.06307879716822463, + "language_loss": 0.82915187, + "learning_rate": 0.0008923544999260183, + "loss": 0.84063065, + "num_input_tokens_seen": 101917968, + "router_z_loss_mlp": 0.51293945, + "step": 1229, + "time_per_iteration": 2.787444829940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156102, + "balance_loss_mlp": 1.10567617, + "epoch": 0.23662947287418237, + "flos": 756849153024.0, + "grad_norm": 0.06236445133400911, + "language_loss": 0.92471206, + "learning_rate": 0.00089216130981104, + "loss": 0.9362731, + "num_input_tokens_seen": 101996880, + "router_z_loss_mlp": 0.50439453, + "step": 1230, + "time_per_iteration": 3.0671463012695312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148349, + "balance_loss_mlp": 1.09816241, + "epoch": 0.23682185455944593, + "flos": 546167162880.0, + "grad_norm": 0.06420697058211047, + "language_loss": 0.82893002, + "learning_rate": 0.000891967967445539, + "loss": 0.84041357, + "num_input_tokens_seen": 102067936, + "router_z_loss_mlp": 0.50195312, + "step": 1231, + "time_per_iteration": 2.692819356918335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147263, + "balance_loss_mlp": 1.09733796, + "epoch": 0.2370142362447095, + "flos": 662285818368.0, + "grad_norm": 0.044472050821074895, + "language_loss": 0.89257467, + "learning_rate": 0.0008917744729045772, + "loss": 0.90404725, + "num_input_tokens_seen": 102147552, + "router_z_loss_mlp": 0.49975586, + "step": 1232, + "time_per_iteration": 2.911123037338257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151473, + "balance_loss_mlp": 1.10190618, + "epoch": 0.23720661792997308, + "flos": 683670813696.0, + "grad_norm": 0.055115174481180494, + "language_loss": 0.84317499, + "learning_rate": 0.0008915808262632757, + "loss": 0.85468972, + "num_input_tokens_seen": 102224480, + "router_z_loss_mlp": 0.49633789, + "step": 1233, + "time_per_iteration": 2.8429055213928223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164245, + "balance_loss_mlp": 1.1117928, + "epoch": 0.23739899961523664, + "flos": 558909103104.0, + "grad_norm": 0.07089823280283834, + "language_loss": 0.93916011, + "learning_rate": 0.0008913870275968148, + "loss": 0.95080256, + "num_input_tokens_seen": 102297392, + "router_z_loss_mlp": 0.52392578, + "step": 1234, + "time_per_iteration": 2.7355082035064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152305, + "balance_loss_mlp": 1.10321498, + "epoch": 0.2375913813005002, + "flos": 889546904064.0, + "grad_norm": 0.06512180670183462, + "language_loss": 0.87916219, + "learning_rate": 0.0008911930769804342, + "loss": 0.8906852, + "num_input_tokens_seen": 102386032, + "router_z_loss_mlp": 0.49145508, + "step": 1235, + "time_per_iteration": 3.320653200149536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115702, + "balance_loss_mlp": 1.10549772, + "epoch": 0.23778376298576376, + "flos": 641120707584.0, + "grad_norm": 0.04926889071384256, + "language_loss": 0.91928077, + "learning_rate": 0.0008909989744894318, + "loss": 0.93085092, + "num_input_tokens_seen": 102463504, + "router_z_loss_mlp": 0.51513672, + "step": 1236, + "time_per_iteration": 2.860095500946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114863, + "balance_loss_mlp": 1.09808517, + "epoch": 0.23797614467102732, + "flos": 616820313600.0, + "grad_norm": 0.06373579401102465, + "language_loss": 0.81724823, + "learning_rate": 0.0008908047201991649, + "loss": 0.82873452, + "num_input_tokens_seen": 102529632, + "router_z_loss_mlp": 0.50512695, + "step": 1237, + "time_per_iteration": 2.7173092365264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146815, + "balance_loss_mlp": 1.10065758, + "epoch": 0.23816852635629088, + "flos": 624245004288.0, + "grad_norm": 0.06973577397583665, + "language_loss": 0.86895192, + "learning_rate": 0.0008906103141850502, + "loss": 0.88042009, + "num_input_tokens_seen": 102610192, + "router_z_loss_mlp": 0.46142578, + "step": 1238, + "time_per_iteration": 2.9070518016815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149112, + "balance_loss_mlp": 1.10068893, + "epoch": 0.23836090804155444, + "flos": 521431769088.0, + "grad_norm": 0.07438040904238923, + "language_loss": 0.88608682, + "learning_rate": 0.0008904157565225621, + "loss": 0.897578, + "num_input_tokens_seen": 102681216, + "router_z_loss_mlp": 0.48461914, + "step": 1239, + "time_per_iteration": 2.598175287246704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114606, + "balance_loss_mlp": 1.09758997, + "epoch": 0.238553289726818, + "flos": 1153991660544.0, + "grad_norm": 0.07265689268382322, + "language_loss": 0.82424903, + "learning_rate": 0.000890221047287235, + "loss": 0.83570957, + "num_input_tokens_seen": 102777184, + "router_z_loss_mlp": 0.48486328, + "step": 1240, + "time_per_iteration": 3.5255463123321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149116, + "balance_loss_mlp": 1.10207629, + "epoch": 0.23874567141208156, + "flos": 499861393920.0, + "grad_norm": 0.07692592831537566, + "language_loss": 0.91524613, + "learning_rate": 0.0008900261865546615, + "loss": 0.92673725, + "num_input_tokens_seen": 102845744, + "router_z_loss_mlp": 0.47021484, + "step": 1241, + "time_per_iteration": 2.626298189163208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150585, + "balance_loss_mlp": 1.10101807, + "epoch": 0.23893805309734514, + "flos": 556934911488.0, + "grad_norm": 0.06193436068824588, + "language_loss": 0.85487348, + "learning_rate": 0.0008898311744004936, + "loss": 0.86637932, + "num_input_tokens_seen": 102918064, + "router_z_loss_mlp": 0.49584961, + "step": 1242, + "time_per_iteration": 2.6845884323120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143159, + "balance_loss_mlp": 1.09638107, + "epoch": 0.2391304347826087, + "flos": 549270512640.0, + "grad_norm": 0.06489370510499948, + "language_loss": 0.87195957, + "learning_rate": 0.0008896360109004414, + "loss": 0.88339114, + "num_input_tokens_seen": 102983920, + "router_z_loss_mlp": 0.46801758, + "step": 1243, + "time_per_iteration": 2.6279244422912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149339, + "balance_loss_mlp": 1.10239482, + "epoch": 0.23932281646787226, + "flos": 516050279424.0, + "grad_norm": 0.05690023470638135, + "language_loss": 0.84913921, + "learning_rate": 0.0008894406961302742, + "loss": 0.8606326, + "num_input_tokens_seen": 103053328, + "router_z_loss_mlp": 0.46948242, + "step": 1244, + "time_per_iteration": 2.5823607444763184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161498, + "balance_loss_mlp": 1.11591244, + "epoch": 0.23951519815313582, + "flos": 743682124800.0, + "grad_norm": 0.06599652790645752, + "language_loss": 0.84225279, + "learning_rate": 0.0008892452301658201, + "loss": 0.85386777, + "num_input_tokens_seen": 103128208, + "router_z_loss_mlp": 0.45581055, + "step": 1245, + "time_per_iteration": 3.0007240772247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153792, + "balance_loss_mlp": 1.1045351, + "epoch": 0.23970757983839938, + "flos": 554118257664.0, + "grad_norm": 0.05569216777143309, + "language_loss": 0.83851659, + "learning_rate": 0.0008890496130829653, + "loss": 0.8500545, + "num_input_tokens_seen": 103197392, + "router_z_loss_mlp": 0.49316406, + "step": 1246, + "time_per_iteration": 2.656524658203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149976, + "balance_loss_mlp": 1.10424757, + "epoch": 0.23989996152366294, + "flos": 480655793664.0, + "grad_norm": 0.0643203237989141, + "language_loss": 0.85808307, + "learning_rate": 0.0008888538449576555, + "loss": 0.86958289, + "num_input_tokens_seen": 103265328, + "router_z_loss_mlp": 0.45751953, + "step": 1247, + "time_per_iteration": 2.5420141220092773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148571, + "balance_loss_mlp": 1.09993315, + "epoch": 0.2400923432089265, + "flos": 485310818304.0, + "grad_norm": 0.07931889136759729, + "language_loss": 0.83083689, + "learning_rate": 0.0008886579258658944, + "loss": 0.84232259, + "num_input_tokens_seen": 103331632, + "router_z_loss_mlp": 0.48632812, + "step": 1248, + "time_per_iteration": 2.574025869369507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136833, + "balance_loss_mlp": 1.08786154, + "epoch": 0.24028472489419006, + "flos": 623555615232.0, + "grad_norm": 0.057547694087262784, + "language_loss": 0.85210383, + "learning_rate": 0.0008884618558837446, + "loss": 0.8634721, + "num_input_tokens_seen": 103405408, + "router_z_loss_mlp": 0.48974609, + "step": 1249, + "time_per_iteration": 2.808790922164917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146334, + "balance_loss_mlp": 1.09407234, + "epoch": 0.24047710657945365, + "flos": 601602370560.0, + "grad_norm": 0.05843363394571656, + "language_loss": 0.87170362, + "learning_rate": 0.0008882656350873273, + "loss": 0.88316691, + "num_input_tokens_seen": 103487216, + "router_z_loss_mlp": 0.52319336, + "step": 1250, + "time_per_iteration": 2.839341163635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139888, + "balance_loss_mlp": 1.08998704, + "epoch": 0.2406694882647172, + "flos": 841558781952.0, + "grad_norm": 0.06920486589868534, + "language_loss": 0.87495792, + "learning_rate": 0.0008880692635528219, + "loss": 0.88635677, + "num_input_tokens_seen": 103568640, + "router_z_loss_mlp": 0.49975586, + "step": 1251, + "time_per_iteration": 3.0422415733337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141134, + "balance_loss_mlp": 1.09404635, + "epoch": 0.24086186994998077, + "flos": 527057736192.0, + "grad_norm": 0.09445201185980338, + "language_loss": 0.89987123, + "learning_rate": 0.0008878727413564669, + "loss": 0.91128266, + "num_input_tokens_seen": 103640784, + "router_z_loss_mlp": 0.47094727, + "step": 1252, + "time_per_iteration": 2.7974343299865723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110917, + "balance_loss_mlp": 1.09066832, + "epoch": 0.24105425163524433, + "flos": 1338261378048.0, + "grad_norm": 0.0270998190046769, + "language_loss": 0.80135596, + "learning_rate": 0.0008876760685745588, + "loss": 0.81244767, + "num_input_tokens_seen": 103865824, + "router_z_loss_mlp": 0.18457031, + "step": 1253, + "time_per_iteration": 4.892668724060059 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150056, + "balance_loss_mlp": 1.09707963, + "epoch": 0.24124663332050789, + "flos": 614102404608.0, + "grad_norm": 0.06472275672686992, + "language_loss": 0.79044139, + "learning_rate": 0.0008874792452834528, + "loss": 0.80194199, + "num_input_tokens_seen": 103939872, + "router_z_loss_mlp": 0.53076172, + "step": 1254, + "time_per_iteration": 2.759533643722534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144784, + "balance_loss_mlp": 1.09397733, + "epoch": 0.24143901500577145, + "flos": 575540328960.0, + "grad_norm": 0.08671647217417044, + "language_loss": 0.87847424, + "learning_rate": 0.0008872822715595626, + "loss": 0.88992208, + "num_input_tokens_seen": 104011120, + "router_z_loss_mlp": 0.50878906, + "step": 1255, + "time_per_iteration": 2.6758921146392822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136115, + "balance_loss_mlp": 1.08731091, + "epoch": 0.241631396691035, + "flos": 495181776384.0, + "grad_norm": 0.07818195128513271, + "language_loss": 0.87750483, + "learning_rate": 0.0008870851474793598, + "loss": 0.88886595, + "num_input_tokens_seen": 104077040, + "router_z_loss_mlp": 0.48803711, + "step": 1256, + "time_per_iteration": 2.5903451442718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140246, + "balance_loss_mlp": 1.09196591, + "epoch": 0.24182377837629856, + "flos": 636191470080.0, + "grad_norm": 0.06462138017812241, + "language_loss": 0.90108514, + "learning_rate": 0.0008868878731193752, + "loss": 0.91248751, + "num_input_tokens_seen": 104150880, + "router_z_loss_mlp": 0.48291016, + "step": 1257, + "time_per_iteration": 2.9156484603881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131411, + "balance_loss_mlp": 1.08611095, + "epoch": 0.24201616006156215, + "flos": 515219927040.0, + "grad_norm": 0.06839520252820154, + "language_loss": 0.89823216, + "learning_rate": 0.0008866904485561973, + "loss": 0.90954626, + "num_input_tokens_seen": 104223696, + "router_z_loss_mlp": 0.45361328, + "step": 1258, + "time_per_iteration": 2.709073066711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128152, + "balance_loss_mlp": 1.07698727, + "epoch": 0.2422085417468257, + "flos": 615144927744.0, + "grad_norm": 0.061516465429869265, + "language_loss": 0.83619797, + "learning_rate": 0.000886492873866473, + "loss": 0.84747952, + "num_input_tokens_seen": 104301728, + "router_z_loss_mlp": 0.51245117, + "step": 1259, + "time_per_iteration": 2.8063783645629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122174, + "balance_loss_mlp": 1.07315516, + "epoch": 0.24240092343208927, + "flos": 585794156544.0, + "grad_norm": 0.07532562043269028, + "language_loss": 0.85057306, + "learning_rate": 0.000886295149126908, + "loss": 0.86179483, + "num_input_tokens_seen": 104374480, + "router_z_loss_mlp": 0.49023438, + "step": 1260, + "time_per_iteration": 2.7702596187591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120309, + "balance_loss_mlp": 1.07291138, + "epoch": 0.24259330511735283, + "flos": 762257806848.0, + "grad_norm": 0.06506459806255929, + "language_loss": 0.86249155, + "learning_rate": 0.0008860972744142655, + "loss": 0.87369466, + "num_input_tokens_seen": 104452384, + "router_z_loss_mlp": 0.47363281, + "step": 1261, + "time_per_iteration": 2.9010353088378906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111356, + "balance_loss_mlp": 1.06575668, + "epoch": 0.2427856868026164, + "flos": 626878849536.0, + "grad_norm": 0.05333874014607912, + "language_loss": 0.82215619, + "learning_rate": 0.0008858992498053671, + "loss": 0.83329183, + "num_input_tokens_seen": 104532576, + "router_z_loss_mlp": 0.47729492, + "step": 1262, + "time_per_iteration": 2.8307647705078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105759, + "balance_loss_mlp": 1.08506405, + "epoch": 0.24297806848787995, + "flos": 1511653985280.0, + "grad_norm": 0.04388178085496151, + "language_loss": 0.7658875, + "learning_rate": 0.0008857010753770934, + "loss": 0.77694511, + "num_input_tokens_seen": 104765216, + "router_z_loss_mlp": 0.20703125, + "step": 1263, + "time_per_iteration": 4.839150428771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113047, + "balance_loss_mlp": 1.06517243, + "epoch": 0.2431704501731435, + "flos": 541949336064.0, + "grad_norm": 0.07576677138650743, + "language_loss": 0.83877796, + "learning_rate": 0.0008855027512063817, + "loss": 0.84990847, + "num_input_tokens_seen": 104836912, + "router_z_loss_mlp": 0.47924805, + "step": 1264, + "time_per_iteration": 2.6955387592315674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116702, + "balance_loss_mlp": 1.06847, + "epoch": 0.24336283185840707, + "flos": 523845729792.0, + "grad_norm": 0.08737911579836782, + "language_loss": 0.86160326, + "learning_rate": 0.0008853042773702292, + "loss": 0.87277025, + "num_input_tokens_seen": 104909280, + "router_z_loss_mlp": 0.48217773, + "step": 1265, + "time_per_iteration": 2.718477725982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123795, + "balance_loss_mlp": 1.07191551, + "epoch": 0.24355521354367063, + "flos": 537111502848.0, + "grad_norm": 0.05410456343654981, + "language_loss": 0.87916005, + "learning_rate": 0.0008851056539456896, + "loss": 0.89039803, + "num_input_tokens_seen": 104982560, + "router_z_loss_mlp": 0.51855469, + "step": 1266, + "time_per_iteration": 2.668398380279541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127352, + "balance_loss_mlp": 1.07792759, + "epoch": 0.24374759522893422, + "flos": 930461271552.0, + "grad_norm": 0.06341671281787149, + "language_loss": 0.82546353, + "learning_rate": 0.0008849068810098755, + "loss": 0.8367371, + "num_input_tokens_seen": 105075056, + "router_z_loss_mlp": 0.49414062, + "step": 1267, + "time_per_iteration": 3.348644971847534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132731, + "balance_loss_mlp": 1.08523834, + "epoch": 0.24393997691419778, + "flos": 427787619840.0, + "grad_norm": 0.08675992555990221, + "language_loss": 0.8333391, + "learning_rate": 0.0008847079586399575, + "loss": 0.84466636, + "num_input_tokens_seen": 105137536, + "router_z_loss_mlp": 0.47509766, + "step": 1268, + "time_per_iteration": 2.549433946609497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126198, + "balance_loss_mlp": 1.07994461, + "epoch": 0.24413235859946134, + "flos": 578853651456.0, + "grad_norm": 0.07249150513377325, + "language_loss": 0.8672694, + "learning_rate": 0.0008845088869131641, + "loss": 0.87853134, + "num_input_tokens_seen": 105204848, + "router_z_loss_mlp": 0.46289062, + "step": 1269, + "time_per_iteration": 2.6586451530456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149975, + "balance_loss_mlp": 1.10145724, + "epoch": 0.2443247402847249, + "flos": 529859708928.0, + "grad_norm": 0.06266770628228314, + "language_loss": 0.89411461, + "learning_rate": 0.0008843096659067818, + "loss": 0.90561438, + "num_input_tokens_seen": 105273456, + "router_z_loss_mlp": 0.48510742, + "step": 1270, + "time_per_iteration": 2.626946210861206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146652, + "balance_loss_mlp": 1.10228229, + "epoch": 0.24451712196998845, + "flos": 696321349632.0, + "grad_norm": 0.056965438466979365, + "language_loss": 0.86992264, + "learning_rate": 0.000884110295698155, + "loss": 0.88138914, + "num_input_tokens_seen": 105355488, + "router_z_loss_mlp": 0.44335938, + "step": 1271, + "time_per_iteration": 2.970078706741333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160922, + "balance_loss_mlp": 1.11080623, + "epoch": 0.24470950365525201, + "flos": 529832544768.0, + "grad_norm": 0.06894839907125858, + "language_loss": 0.86557794, + "learning_rate": 0.0008839107763646861, + "loss": 0.87718713, + "num_input_tokens_seen": 105421568, + "router_z_loss_mlp": 0.5012207, + "step": 1272, + "time_per_iteration": 2.592349052429199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183532, + "balance_loss_mlp": 1.13437057, + "epoch": 0.24490188534051557, + "flos": 491342049792.0, + "grad_norm": 0.06647703149266906, + "language_loss": 0.90856385, + "learning_rate": 0.0008837111079838353, + "loss": 0.92039919, + "num_input_tokens_seen": 105493072, + "router_z_loss_mlp": 0.49194336, + "step": 1273, + "time_per_iteration": 2.7098910808563232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118943, + "balance_loss_mlp": 1.14289117, + "epoch": 0.24509426702577913, + "flos": 474155057664.0, + "grad_norm": 0.05923779703064254, + "language_loss": 0.90316379, + "learning_rate": 0.000883511290633121, + "loss": 0.91505814, + "num_input_tokens_seen": 105559840, + "router_z_loss_mlp": 0.46533203, + "step": 1274, + "time_per_iteration": 2.5714197158813477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186005, + "balance_loss_mlp": 1.13739181, + "epoch": 0.24528664871104272, + "flos": 550592391168.0, + "grad_norm": 0.060927364177961095, + "language_loss": 0.92697686, + "learning_rate": 0.000883311324390119, + "loss": 0.93883693, + "num_input_tokens_seen": 105634448, + "router_z_loss_mlp": 0.48608398, + "step": 1275, + "time_per_iteration": 2.740896224975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189584, + "balance_loss_mlp": 1.13474798, + "epoch": 0.24547903039630628, + "flos": 825903641088.0, + "grad_norm": 0.07775603238406727, + "language_loss": 0.82056022, + "learning_rate": 0.0008831112093324629, + "loss": 0.83245611, + "num_input_tokens_seen": 105711936, + "router_z_loss_mlp": 0.5480957, + "step": 1276, + "time_per_iteration": 3.0821468830108643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190059, + "balance_loss_mlp": 1.13927567, + "epoch": 0.24567141208156984, + "flos": 591598162944.0, + "grad_norm": 0.05600773018776359, + "language_loss": 0.89543378, + "learning_rate": 0.0008829109455378444, + "loss": 0.90733445, + "num_input_tokens_seen": 105780240, + "router_z_loss_mlp": 0.50830078, + "step": 1277, + "time_per_iteration": 2.7299413681030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192458, + "balance_loss_mlp": 1.14241397, + "epoch": 0.2458637937668334, + "flos": 547874482176.0, + "grad_norm": 0.05156937738675093, + "language_loss": 0.87083036, + "learning_rate": 0.000882710533084013, + "loss": 0.88275498, + "num_input_tokens_seen": 105849840, + "router_z_loss_mlp": 0.5, + "step": 1278, + "time_per_iteration": 2.6295228004455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185847, + "balance_loss_mlp": 1.13568354, + "epoch": 0.24605617545209696, + "flos": 515894635008.0, + "grad_norm": 0.05927927368096647, + "language_loss": 0.90088928, + "learning_rate": 0.0008825099720487755, + "loss": 0.91274774, + "num_input_tokens_seen": 105921488, + "router_z_loss_mlp": 0.50195312, + "step": 1279, + "time_per_iteration": 2.630868434906006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149494, + "balance_loss_mlp": 1.13461673, + "epoch": 0.24624855713736052, + "flos": 1511772553728.0, + "grad_norm": 0.04555367127523109, + "language_loss": 0.7526114, + "learning_rate": 0.0008823092625099967, + "loss": 0.76410633, + "num_input_tokens_seen": 106146816, + "router_z_loss_mlp": 0.1484375, + "step": 1280, + "time_per_iteration": 4.843670129776001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118256, + "balance_loss_mlp": 1.10366488, + "epoch": 0.24644093882262408, + "flos": 1527608305152.0, + "grad_norm": 0.038204832859796624, + "language_loss": 0.77944112, + "learning_rate": 0.0008821084045455987, + "loss": 0.79062366, + "num_input_tokens_seen": 106361568, + "router_z_loss_mlp": 0.14550781, + "step": 1281, + "time_per_iteration": 4.784554481506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115452, + "balance_loss_mlp": 1.10547721, + "epoch": 0.24663332050788764, + "flos": 659118228480.0, + "grad_norm": 0.05852441511604794, + "language_loss": 0.89541078, + "learning_rate": 0.0008819073982335619, + "loss": 0.90695602, + "num_input_tokens_seen": 106435296, + "router_z_loss_mlp": 0.49047852, + "step": 1282, + "time_per_iteration": 2.8370161056518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141297, + "balance_loss_mlp": 1.09726083, + "epoch": 0.24682570219315123, + "flos": 541769098752.0, + "grad_norm": 0.07515840278086762, + "language_loss": 0.84908974, + "learning_rate": 0.0008817062436519235, + "loss": 0.86050272, + "num_input_tokens_seen": 106507184, + "router_z_loss_mlp": 0.44042969, + "step": 1283, + "time_per_iteration": 2.6532042026519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114078, + "balance_loss_mlp": 1.09164214, + "epoch": 0.24701808387841478, + "flos": 440695116288.0, + "grad_norm": 0.051214690731677004, + "language_loss": 0.9022612, + "learning_rate": 0.0008815049408787788, + "loss": 0.91366905, + "num_input_tokens_seen": 106571472, + "router_z_loss_mlp": 0.49072266, + "step": 1284, + "time_per_iteration": 2.577040195465088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145806, + "balance_loss_mlp": 1.09857535, + "epoch": 0.24721046556367834, + "flos": 468066926592.0, + "grad_norm": 0.06399849872592922, + "language_loss": 0.86388409, + "learning_rate": 0.0008813034899922805, + "loss": 0.87534213, + "num_input_tokens_seen": 106638368, + "router_z_loss_mlp": 0.47216797, + "step": 1285, + "time_per_iteration": 2.586411476135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153157, + "balance_loss_mlp": 1.10366094, + "epoch": 0.2474028472489419, + "flos": 504427585536.0, + "grad_norm": 0.05962621730359375, + "language_loss": 0.90523338, + "learning_rate": 0.0008811018910706387, + "loss": 0.91676497, + "num_input_tokens_seen": 106705312, + "router_z_loss_mlp": 0.49536133, + "step": 1286, + "time_per_iteration": 2.558340311050415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150564, + "balance_loss_mlp": 1.0996381, + "epoch": 0.24759522893420546, + "flos": 479956492800.0, + "grad_norm": 0.08171747444285254, + "language_loss": 0.82914776, + "learning_rate": 0.0008809001441921211, + "loss": 0.84065336, + "num_input_tokens_seen": 106778624, + "router_z_loss_mlp": 0.50976562, + "step": 1287, + "time_per_iteration": 2.7096829414367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134679, + "balance_loss_mlp": 1.08651865, + "epoch": 0.24778761061946902, + "flos": 533706776064.0, + "grad_norm": 0.061876473909820096, + "language_loss": 0.86037469, + "learning_rate": 0.0008806982494350528, + "loss": 0.87172151, + "num_input_tokens_seen": 106847744, + "router_z_loss_mlp": 0.48144531, + "step": 1288, + "time_per_iteration": 2.6826744079589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138715, + "balance_loss_mlp": 1.0885514, + "epoch": 0.24797999230473258, + "flos": 559798553088.0, + "grad_norm": 0.05818805427718153, + "language_loss": 0.90965348, + "learning_rate": 0.0008804962068778161, + "loss": 0.92104065, + "num_input_tokens_seen": 106927584, + "router_z_loss_mlp": 0.50195312, + "step": 1289, + "time_per_iteration": 2.9314775466918945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137271, + "balance_loss_mlp": 1.08872867, + "epoch": 0.24817237398999614, + "flos": 624225180672.0, + "grad_norm": 0.06661216201088474, + "language_loss": 0.81390089, + "learning_rate": 0.0008802940165988511, + "loss": 0.82527363, + "num_input_tokens_seen": 107006656, + "router_z_loss_mlp": 0.48510742, + "step": 1290, + "time_per_iteration": 2.8629136085510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113117, + "balance_loss_mlp": 1.08389127, + "epoch": 0.2483647556752597, + "flos": 612281286144.0, + "grad_norm": 0.06960392685137955, + "language_loss": 0.89268786, + "learning_rate": 0.000880091678676655, + "loss": 0.90399957, + "num_input_tokens_seen": 107084352, + "router_z_loss_mlp": 0.47265625, + "step": 1291, + "time_per_iteration": 2.8345038890838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136195, + "balance_loss_mlp": 1.08882165, + "epoch": 0.2485571373605233, + "flos": 583553092608.0, + "grad_norm": 0.058047960295431696, + "language_loss": 0.89150697, + "learning_rate": 0.0008798891931897821, + "loss": 0.90286887, + "num_input_tokens_seen": 107158368, + "router_z_loss_mlp": 0.47338867, + "step": 1292, + "time_per_iteration": 2.7299227714538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128008, + "balance_loss_mlp": 1.07949018, + "epoch": 0.24874951904578685, + "flos": 494749347840.0, + "grad_norm": 0.09954343743221296, + "language_loss": 0.84998739, + "learning_rate": 0.0008796865602168447, + "loss": 0.86126745, + "num_input_tokens_seen": 107224256, + "router_z_loss_mlp": 0.48535156, + "step": 1293, + "time_per_iteration": 2.5342278480529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127533, + "balance_loss_mlp": 1.08220935, + "epoch": 0.2489419007310504, + "flos": 456174789120.0, + "grad_norm": 0.05777797953149353, + "language_loss": 0.89527249, + "learning_rate": 0.0008794837798365115, + "loss": 0.90654784, + "num_input_tokens_seen": 107292720, + "router_z_loss_mlp": 0.45361328, + "step": 1294, + "time_per_iteration": 2.6889185905456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136807, + "balance_loss_mlp": 1.08886147, + "epoch": 0.24913428241631397, + "flos": 485471232000.0, + "grad_norm": 0.07754051928079464, + "language_loss": 0.89232659, + "learning_rate": 0.0008792808521275089, + "loss": 0.90369469, + "num_input_tokens_seen": 107368576, + "router_z_loss_mlp": 0.47924805, + "step": 1295, + "time_per_iteration": 2.7635927200317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136837, + "balance_loss_mlp": 1.09027398, + "epoch": 0.24932666410157753, + "flos": 518906580480.0, + "grad_norm": 0.09989296116771008, + "language_loss": 0.87984705, + "learning_rate": 0.0008790777771686206, + "loss": 0.89121538, + "num_input_tokens_seen": 107433856, + "router_z_loss_mlp": 0.46557617, + "step": 1296, + "time_per_iteration": 2.579235076904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124595, + "balance_loss_mlp": 1.07853234, + "epoch": 0.2495190457868411, + "flos": 472603382784.0, + "grad_norm": 0.08251132162328097, + "language_loss": 0.85680348, + "learning_rate": 0.0008788745550386872, + "loss": 0.86804938, + "num_input_tokens_seen": 107500944, + "router_z_loss_mlp": 0.46044922, + "step": 1297, + "time_per_iteration": 2.598031759262085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128617, + "balance_loss_mlp": 1.08152938, + "epoch": 0.24971142747210465, + "flos": 745886112768.0, + "grad_norm": 0.06717402893383145, + "language_loss": 0.80945367, + "learning_rate": 0.0008786711858166063, + "loss": 0.82073987, + "num_input_tokens_seen": 107580000, + "router_z_loss_mlp": 0.47070312, + "step": 1298, + "time_per_iteration": 2.9720141887664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133144, + "balance_loss_mlp": 1.08696246, + "epoch": 0.2499038091573682, + "flos": 749557711872.0, + "grad_norm": 0.058753985131359356, + "language_loss": 0.84356344, + "learning_rate": 0.0008784676695813332, + "loss": 0.85489488, + "num_input_tokens_seen": 107660384, + "router_z_loss_mlp": 0.46166992, + "step": 1299, + "time_per_iteration": 3.003113031387329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154452, + "balance_loss_mlp": 1.10700631, + "epoch": 0.2500961908426318, + "flos": 745060902912.0, + "grad_norm": 0.07081449776085671, + "language_loss": 0.85444576, + "learning_rate": 0.0008782640064118796, + "loss": 0.86599028, + "num_input_tokens_seen": 107736320, + "router_z_loss_mlp": 0.47436523, + "step": 1300, + "time_per_iteration": 2.8769848346710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166343, + "balance_loss_mlp": 1.14946294, + "epoch": 0.2502885725278953, + "flos": 1417424334336.0, + "grad_norm": 0.041859158942630086, + "language_loss": 0.7618475, + "learning_rate": 0.0008780601963873149, + "loss": 0.77351093, + "num_input_tokens_seen": 107972608, + "router_z_loss_mlp": 0.16894531, + "step": 1301, + "time_per_iteration": 4.951652526855469 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191692, + "balance_loss_mlp": 1.14701271, + "epoch": 0.2504809542131589, + "flos": 515215157760.0, + "grad_norm": 0.07273634964220443, + "language_loss": 0.8750245, + "learning_rate": 0.0008778562395867648, + "loss": 0.88694143, + "num_input_tokens_seen": 108043312, + "router_z_loss_mlp": 0.44677734, + "step": 1302, + "time_per_iteration": 2.604402542114258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181408, + "balance_loss_mlp": 1.13629961, + "epoch": 0.25067333589842244, + "flos": 525819921408.0, + "grad_norm": 0.07562070017846675, + "language_loss": 0.84288502, + "learning_rate": 0.0008776521360894127, + "loss": 0.85469913, + "num_input_tokens_seen": 108114144, + "router_z_loss_mlp": 0.45092773, + "step": 1303, + "time_per_iteration": 2.5878565311431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103671, + "balance_loss_mlp": 1.08784008, + "epoch": 0.25086571758368603, + "flos": 1473897295872.0, + "grad_norm": 0.0317480068151838, + "language_loss": 0.78962064, + "learning_rate": 0.0008774478859744984, + "loss": 0.80065739, + "num_input_tokens_seen": 108338720, + "router_z_loss_mlp": 0.15820312, + "step": 1304, + "time_per_iteration": 4.7717835903167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116688, + "balance_loss_mlp": 1.12220049, + "epoch": 0.2510580992689496, + "flos": 528382185984.0, + "grad_norm": 0.05690422496958516, + "language_loss": 0.90951985, + "learning_rate": 0.0008772434893213186, + "loss": 0.92118865, + "num_input_tokens_seen": 108405456, + "router_z_loss_mlp": 0.44702148, + "step": 1305, + "time_per_iteration": 2.604490280151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160948, + "balance_loss_mlp": 1.11405063, + "epoch": 0.25125048095421315, + "flos": 517446309888.0, + "grad_norm": 0.058263181320018995, + "language_loss": 0.85050523, + "learning_rate": 0.0008770389462092276, + "loss": 0.86211473, + "num_input_tokens_seen": 108474368, + "router_z_loss_mlp": 0.46875, + "step": 1306, + "time_per_iteration": 2.6470468044281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011567, + "balance_loss_mlp": 1.1099937, + "epoch": 0.25144286263947674, + "flos": 620462177280.0, + "grad_norm": 0.058464254330546805, + "language_loss": 0.87023067, + "learning_rate": 0.0008768342567176357, + "loss": 0.88179767, + "num_input_tokens_seen": 108548864, + "router_z_loss_mlp": 0.46704102, + "step": 1307, + "time_per_iteration": 2.8168630599975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155047, + "balance_loss_mlp": 1.10753012, + "epoch": 0.25163524432474027, + "flos": 503799865344.0, + "grad_norm": 0.05479935706331158, + "language_loss": 0.90999937, + "learning_rate": 0.0008766294209260107, + "loss": 0.9215498, + "num_input_tokens_seen": 108623072, + "router_z_loss_mlp": 0.4753418, + "step": 1308, + "time_per_iteration": 2.721531629562378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144469, + "balance_loss_mlp": 1.09704781, + "epoch": 0.25182762601000386, + "flos": 509072698368.0, + "grad_norm": 0.06755027454964987, + "language_loss": 0.91936618, + "learning_rate": 0.0008764244389138767, + "loss": 0.93081093, + "num_input_tokens_seen": 108690128, + "router_z_loss_mlp": 0.47436523, + "step": 1309, + "time_per_iteration": 2.574913263320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146846, + "balance_loss_mlp": 1.10061693, + "epoch": 0.2520200076952674, + "flos": 633896077824.0, + "grad_norm": 0.09614568206927013, + "language_loss": 0.82912982, + "learning_rate": 0.000876219310760815, + "loss": 0.84059829, + "num_input_tokens_seen": 108770272, + "router_z_loss_mlp": 0.46240234, + "step": 1310, + "time_per_iteration": 2.8861234188079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140262, + "balance_loss_mlp": 1.09419942, + "epoch": 0.252212389380531, + "flos": 494638119936.0, + "grad_norm": 0.07943381545238665, + "language_loss": 0.82026285, + "learning_rate": 0.0008760140365464631, + "loss": 0.83166546, + "num_input_tokens_seen": 108840592, + "router_z_loss_mlp": 0.46020508, + "step": 1311, + "time_per_iteration": 2.615981340408325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157686, + "balance_loss_mlp": 1.11212397, + "epoch": 0.2524047710657945, + "flos": 490544004096.0, + "grad_norm": 0.0923524312347507, + "language_loss": 0.8768574, + "learning_rate": 0.0008758086163505156, + "loss": 0.88843429, + "num_input_tokens_seen": 108910064, + "router_z_loss_mlp": 0.45532227, + "step": 1312, + "time_per_iteration": 2.6723434925079346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144164, + "balance_loss_mlp": 1.09872115, + "epoch": 0.2525971527510581, + "flos": 647431294464.0, + "grad_norm": 0.06443576206069311, + "language_loss": 0.90026277, + "learning_rate": 0.0008756030502527239, + "loss": 0.91170442, + "num_input_tokens_seen": 108986336, + "router_z_loss_mlp": 0.45458984, + "step": 1313, + "time_per_iteration": 2.841367721557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114771, + "balance_loss_mlp": 1.10291111, + "epoch": 0.2527895344363217, + "flos": 569266818048.0, + "grad_norm": 0.057466156915965357, + "language_loss": 0.90976274, + "learning_rate": 0.0008753973383328954, + "loss": 0.92123979, + "num_input_tokens_seen": 109059712, + "router_z_loss_mlp": 0.44824219, + "step": 1314, + "time_per_iteration": 2.7198092937469482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135642, + "balance_loss_mlp": 1.08912706, + "epoch": 0.2529819161215852, + "flos": 514048923648.0, + "grad_norm": 0.0651730634150067, + "language_loss": 0.84640622, + "learning_rate": 0.0008751914806708952, + "loss": 0.85776269, + "num_input_tokens_seen": 109127504, + "router_z_loss_mlp": 0.46508789, + "step": 1315, + "time_per_iteration": 2.619739532470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138249, + "balance_loss_mlp": 1.0955956, + "epoch": 0.2531742978068488, + "flos": 531253168128.0, + "grad_norm": 0.06535523514746128, + "language_loss": 0.82706141, + "learning_rate": 0.0008749854773466439, + "loss": 0.83844388, + "num_input_tokens_seen": 109198080, + "router_z_loss_mlp": 0.42700195, + "step": 1316, + "time_per_iteration": 2.6750850677490234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126734, + "balance_loss_mlp": 1.08594072, + "epoch": 0.25336667949211233, + "flos": 596638628352.0, + "grad_norm": 0.07438570972797282, + "language_loss": 0.85103095, + "learning_rate": 0.0008747793284401192, + "loss": 0.86229837, + "num_input_tokens_seen": 109268368, + "router_z_loss_mlp": 0.40771484, + "step": 1317, + "time_per_iteration": 2.667684316635132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127851, + "balance_loss_mlp": 1.08231306, + "epoch": 0.2535590611773759, + "flos": 602061963264.0, + "grad_norm": 0.06662830476911753, + "language_loss": 0.8637262, + "learning_rate": 0.0008745730340313551, + "loss": 0.87500465, + "num_input_tokens_seen": 109344112, + "router_z_loss_mlp": 0.45532227, + "step": 1318, + "time_per_iteration": 2.783167839050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126523, + "balance_loss_mlp": 1.08298802, + "epoch": 0.25375144286263945, + "flos": 495327508992.0, + "grad_norm": 0.06014849970215255, + "language_loss": 0.84828806, + "learning_rate": 0.0008743665942004422, + "loss": 0.85955328, + "num_input_tokens_seen": 109414112, + "router_z_loss_mlp": 0.43554688, + "step": 1319, + "time_per_iteration": 2.6454880237579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128022, + "balance_loss_mlp": 1.08334279, + "epoch": 0.25394382454790304, + "flos": 512470084608.0, + "grad_norm": 0.10116204644494126, + "language_loss": 0.93301231, + "learning_rate": 0.0008741600090275277, + "loss": 0.94429255, + "num_input_tokens_seen": 109484336, + "router_z_loss_mlp": 0.44702148, + "step": 1320, + "time_per_iteration": 2.565373182296753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112488, + "balance_loss_mlp": 1.07884121, + "epoch": 0.25413620623316663, + "flos": 959038589952.0, + "grad_norm": 0.06655436432492466, + "language_loss": 0.84446663, + "learning_rate": 0.0008739532785928151, + "loss": 0.85571539, + "num_input_tokens_seen": 109590128, + "router_z_loss_mlp": 0.45996094, + "step": 1321, + "time_per_iteration": 3.479727268218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080328, + "balance_loss_mlp": 1.06325758, + "epoch": 0.25432858791843016, + "flos": 1577283922944.0, + "grad_norm": 0.0281051137535917, + "language_loss": 0.74893582, + "learning_rate": 0.0008737464029765639, + "loss": 0.7597391, + "num_input_tokens_seen": 109816592, + "router_z_loss_mlp": 0.17089844, + "step": 1322, + "time_per_iteration": 4.7930076122283936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136178, + "balance_loss_mlp": 1.08921003, + "epoch": 0.25452096960369375, + "flos": 583802712576.0, + "grad_norm": 0.06285601142266005, + "language_loss": 0.83366752, + "learning_rate": 0.0008735393822590908, + "loss": 0.84502923, + "num_input_tokens_seen": 109890464, + "router_z_loss_mlp": 0.46923828, + "step": 1323, + "time_per_iteration": 2.672137498855591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145864, + "balance_loss_mlp": 1.10192394, + "epoch": 0.2547133512889573, + "flos": 508603193856.0, + "grad_norm": 0.05471127015298985, + "language_loss": 0.8775813, + "learning_rate": 0.0008733322165207681, + "loss": 0.88903993, + "num_input_tokens_seen": 109963408, + "router_z_loss_mlp": 0.43969727, + "step": 1324, + "time_per_iteration": 2.6422736644744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157775, + "balance_loss_mlp": 1.11292815, + "epoch": 0.25490573297422087, + "flos": 782619729408.0, + "grad_norm": 0.058409122955484685, + "language_loss": 0.83687508, + "learning_rate": 0.0008731249058420247, + "loss": 0.84845281, + "num_input_tokens_seen": 110048800, + "router_z_loss_mlp": 0.44824219, + "step": 1325, + "time_per_iteration": 3.02577805519104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165947, + "balance_loss_mlp": 1.11995602, + "epoch": 0.2550981146594844, + "flos": 509878084608.0, + "grad_norm": 0.0843662219595253, + "language_loss": 0.90814316, + "learning_rate": 0.0008729174503033459, + "loss": 0.91980267, + "num_input_tokens_seen": 110118096, + "router_z_loss_mlp": 0.45947266, + "step": 1326, + "time_per_iteration": 2.700956344604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160817, + "balance_loss_mlp": 1.11418188, + "epoch": 0.255290496344748, + "flos": 676673409024.0, + "grad_norm": 0.07395752020353057, + "language_loss": 0.83274329, + "learning_rate": 0.0008727098499852728, + "loss": 0.84435147, + "num_input_tokens_seen": 110190160, + "router_z_loss_mlp": 0.46630859, + "step": 1327, + "time_per_iteration": 2.8289363384246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138805, + "balance_loss_mlp": 1.0946734, + "epoch": 0.2554828780300115, + "flos": 537815572992.0, + "grad_norm": 0.05433597882612883, + "language_loss": 0.90389377, + "learning_rate": 0.0008725021049684034, + "loss": 0.91528177, + "num_input_tokens_seen": 110268000, + "router_z_loss_mlp": 0.44165039, + "step": 1328, + "time_per_iteration": 2.766871452331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134733, + "balance_loss_mlp": 1.09057808, + "epoch": 0.2556752597152751, + "flos": 824186409984.0, + "grad_norm": 0.04999939134312536, + "language_loss": 0.83732843, + "learning_rate": 0.000872294215333391, + "loss": 0.84867573, + "num_input_tokens_seen": 110354816, + "router_z_loss_mlp": 0.44165039, + "step": 1329, + "time_per_iteration": 3.181687116622925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133543, + "balance_loss_mlp": 1.08941174, + "epoch": 0.2558676414005387, + "flos": 570791328768.0, + "grad_norm": 0.053270875218317436, + "language_loss": 0.83338815, + "learning_rate": 0.0008720861811609457, + "loss": 0.84472358, + "num_input_tokens_seen": 110427968, + "router_z_loss_mlp": 0.44140625, + "step": 1330, + "time_per_iteration": 2.753095865249634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139869, + "balance_loss_mlp": 1.09282851, + "epoch": 0.2560600230858022, + "flos": 486684453888.0, + "grad_norm": 0.0744958299593676, + "language_loss": 0.83801699, + "learning_rate": 0.0008718780025318338, + "loss": 0.84941566, + "num_input_tokens_seen": 110501184, + "router_z_loss_mlp": 0.4699707, + "step": 1331, + "time_per_iteration": 2.74076771736145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141571, + "balance_loss_mlp": 1.09913218, + "epoch": 0.2562524047710658, + "flos": 513122397696.0, + "grad_norm": 0.06658506014654758, + "language_loss": 0.84681445, + "learning_rate": 0.0008716696795268771, + "loss": 0.85823017, + "num_input_tokens_seen": 110573008, + "router_z_loss_mlp": 0.42456055, + "step": 1332, + "time_per_iteration": 2.6771953105926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141914, + "balance_loss_mlp": 1.09718704, + "epoch": 0.25644478645632934, + "flos": 634820032512.0, + "grad_norm": 0.06458865940403113, + "language_loss": 0.86108088, + "learning_rate": 0.0008714612122269538, + "loss": 0.87250006, + "num_input_tokens_seen": 110646704, + "router_z_loss_mlp": 0.44750977, + "step": 1333, + "time_per_iteration": 2.872405767440796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145867, + "balance_loss_mlp": 1.09944701, + "epoch": 0.25663716814159293, + "flos": 436591088640.0, + "grad_norm": 0.06078246423813374, + "language_loss": 0.89285004, + "learning_rate": 0.0008712526007129982, + "loss": 0.90430868, + "num_input_tokens_seen": 110712208, + "router_z_loss_mlp": 0.46411133, + "step": 1334, + "time_per_iteration": 2.575467586517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148778, + "balance_loss_mlp": 1.10517156, + "epoch": 0.25682954982685646, + "flos": 498161415168.0, + "grad_norm": 0.06822349657501799, + "language_loss": 0.91275418, + "learning_rate": 0.0008710438450660003, + "loss": 0.92424202, + "num_input_tokens_seen": 110783936, + "router_z_loss_mlp": 0.43603516, + "step": 1335, + "time_per_iteration": 2.6461987495422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149209, + "balance_loss_mlp": 1.10157323, + "epoch": 0.25702193151212005, + "flos": 457701871104.0, + "grad_norm": 0.08158488021096956, + "language_loss": 0.88278055, + "learning_rate": 0.0008708349453670064, + "loss": 0.89427269, + "num_input_tokens_seen": 110848560, + "router_z_loss_mlp": 0.47583008, + "step": 1336, + "time_per_iteration": 2.5001657009124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128588, + "balance_loss_mlp": 1.08297849, + "epoch": 0.2572143131973836, + "flos": 598281707520.0, + "grad_norm": 0.0603403973753485, + "language_loss": 0.91654134, + "learning_rate": 0.0008706259016971185, + "loss": 0.92782724, + "num_input_tokens_seen": 110922672, + "router_z_loss_mlp": 0.45629883, + "step": 1337, + "time_per_iteration": 2.817657947540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127771, + "balance_loss_mlp": 1.07865644, + "epoch": 0.25740669488264717, + "flos": 698308024320.0, + "grad_norm": 0.08421665296665147, + "language_loss": 0.83723027, + "learning_rate": 0.0008704167141374944, + "loss": 0.848508, + "num_input_tokens_seen": 110995456, + "router_z_loss_mlp": 0.49145508, + "step": 1338, + "time_per_iteration": 2.808487892150879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128146, + "balance_loss_mlp": 1.08003271, + "epoch": 0.25759907656791076, + "flos": 502379241984.0, + "grad_norm": 0.05813050369368248, + "language_loss": 0.88781357, + "learning_rate": 0.0008702073827693482, + "loss": 0.89909494, + "num_input_tokens_seen": 111069568, + "router_z_loss_mlp": 0.48144531, + "step": 1339, + "time_per_iteration": 2.687836170196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131918, + "balance_loss_mlp": 1.08711886, + "epoch": 0.2577914582531743, + "flos": 773880500736.0, + "grad_norm": 0.05714278292432699, + "language_loss": 0.89388514, + "learning_rate": 0.0008699979076739494, + "loss": 0.9052043, + "num_input_tokens_seen": 111142608, + "router_z_loss_mlp": 0.44799805, + "step": 1340, + "time_per_iteration": 2.9907524585723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157888, + "balance_loss_mlp": 1.11089551, + "epoch": 0.2579838399384379, + "flos": 459666150912.0, + "grad_norm": 0.06321899043923618, + "language_loss": 0.8949765, + "learning_rate": 0.0008697882889326234, + "loss": 0.90655541, + "num_input_tokens_seen": 111206336, + "router_z_loss_mlp": 0.4699707, + "step": 1341, + "time_per_iteration": 2.5261731147766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182653, + "balance_loss_mlp": 1.13513625, + "epoch": 0.2581762216237014, + "flos": 569185325568.0, + "grad_norm": 0.06545350512623192, + "language_loss": 0.87013066, + "learning_rate": 0.0008695785266267515, + "loss": 0.88195717, + "num_input_tokens_seen": 111276736, + "router_z_loss_mlp": 0.4753418, + "step": 1342, + "time_per_iteration": 2.719949722290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194656, + "balance_loss_mlp": 1.14585173, + "epoch": 0.258368603308965, + "flos": 604201711104.0, + "grad_norm": 0.07227104516109029, + "language_loss": 0.8379634, + "learning_rate": 0.0008693686208377704, + "loss": 0.84991002, + "num_input_tokens_seen": 111353856, + "router_z_loss_mlp": 0.48828125, + "step": 1343, + "time_per_iteration": 2.789046049118042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011909, + "balance_loss_mlp": 1.14572012, + "epoch": 0.2585609849942285, + "flos": 491460618240.0, + "grad_norm": 0.08291144049116697, + "language_loss": 0.89388204, + "learning_rate": 0.0008691585716471733, + "loss": 0.90579104, + "num_input_tokens_seen": 111424960, + "router_z_loss_mlp": 0.45214844, + "step": 1344, + "time_per_iteration": 2.63281512260437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182868, + "balance_loss_mlp": 1.1348505, + "epoch": 0.2587533666794921, + "flos": 640755090432.0, + "grad_norm": 0.05462335243620436, + "language_loss": 0.86349607, + "learning_rate": 0.0008689483791365079, + "loss": 0.87532479, + "num_input_tokens_seen": 111505248, + "router_z_loss_mlp": 0.48022461, + "step": 1345, + "time_per_iteration": 2.8293464183807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165648, + "balance_loss_mlp": 1.11879873, + "epoch": 0.2589457483647557, + "flos": 576849724416.0, + "grad_norm": 0.060641418043912716, + "language_loss": 0.89744675, + "learning_rate": 0.0008687380433873786, + "loss": 0.90910327, + "num_input_tokens_seen": 111581936, + "router_z_loss_mlp": 0.46875, + "step": 1346, + "time_per_iteration": 2.757361650466919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150314, + "balance_loss_mlp": 1.100389, + "epoch": 0.25913813005001923, + "flos": 535424007168.0, + "grad_norm": 0.0738804898683007, + "language_loss": 0.83070856, + "learning_rate": 0.0008685275644814448, + "loss": 0.84221172, + "num_input_tokens_seen": 111651456, + "router_z_loss_mlp": 0.49926758, + "step": 1347, + "time_per_iteration": 2.716006278991699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147842, + "balance_loss_mlp": 1.10087395, + "epoch": 0.2593305117352828, + "flos": 721039491072.0, + "grad_norm": 0.07544817120788133, + "language_loss": 0.85244781, + "learning_rate": 0.0008683169425004216, + "loss": 0.86392623, + "num_input_tokens_seen": 111731712, + "router_z_loss_mlp": 0.46972656, + "step": 1348, + "time_per_iteration": 2.900754451751709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114914, + "balance_loss_mlp": 1.09842825, + "epoch": 0.25952289342054635, + "flos": 710096274432.0, + "grad_norm": 0.08404854247051008, + "language_loss": 0.83688962, + "learning_rate": 0.0008681061775260799, + "loss": 0.84838104, + "num_input_tokens_seen": 111800752, + "router_z_loss_mlp": 0.50708008, + "step": 1349, + "time_per_iteration": 2.8356235027313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140271, + "balance_loss_mlp": 1.09356534, + "epoch": 0.25971527510580994, + "flos": 455920399872.0, + "grad_norm": 0.08196022848482862, + "language_loss": 0.92983842, + "learning_rate": 0.0008678952696402458, + "loss": 0.94124115, + "num_input_tokens_seen": 111866752, + "router_z_loss_mlp": 0.46704102, + "step": 1350, + "time_per_iteration": 2.5051889419555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132188, + "balance_loss_mlp": 1.0865308, + "epoch": 0.25990765679107347, + "flos": 612528334848.0, + "grad_norm": 0.052642437263987304, + "language_loss": 0.86759204, + "learning_rate": 0.000867684218924801, + "loss": 0.87891388, + "num_input_tokens_seen": 111951328, + "router_z_loss_mlp": 0.45629883, + "step": 1351, + "time_per_iteration": 2.8635144233703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089623, + "balance_loss_mlp": 1.0725522, + "epoch": 0.26010003847633706, + "flos": 1537963075584.0, + "grad_norm": 0.04013302579778462, + "language_loss": 0.78947091, + "learning_rate": 0.0008674730254616827, + "loss": 0.80036712, + "num_input_tokens_seen": 112182272, + "router_z_loss_mlp": 0.17089844, + "step": 1352, + "time_per_iteration": 4.89817476272583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121624, + "balance_loss_mlp": 1.07587171, + "epoch": 0.2602924201616006, + "flos": 716265897984.0, + "grad_norm": 0.055845692832442596, + "language_loss": 0.85694808, + "learning_rate": 0.0008672616893328834, + "loss": 0.8681643, + "num_input_tokens_seen": 112261760, + "router_z_loss_mlp": 0.45751953, + "step": 1353, + "time_per_iteration": 2.9335103034973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123767, + "balance_loss_mlp": 1.07877684, + "epoch": 0.2604848018468642, + "flos": 643529899008.0, + "grad_norm": 0.07010977425409264, + "language_loss": 0.9082427, + "learning_rate": 0.0008670502106204512, + "loss": 0.91948032, + "num_input_tokens_seen": 112339136, + "router_z_loss_mlp": 0.44970703, + "step": 1354, + "time_per_iteration": 2.8469178676605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138616, + "balance_loss_mlp": 1.08840501, + "epoch": 0.26067718353212777, + "flos": 517033704960.0, + "grad_norm": 0.056353527093492256, + "language_loss": 0.82360619, + "learning_rate": 0.0008668385894064892, + "loss": 0.83499235, + "num_input_tokens_seen": 112409872, + "router_z_loss_mlp": 0.50195312, + "step": 1355, + "time_per_iteration": 2.672883987426758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149756, + "balance_loss_mlp": 1.10321617, + "epoch": 0.2608695652173913, + "flos": 822733479936.0, + "grad_norm": 0.05383030346289838, + "language_loss": 0.89593899, + "learning_rate": 0.0008666268257731562, + "loss": 0.90743661, + "num_input_tokens_seen": 112495616, + "router_z_loss_mlp": 0.46557617, + "step": 1356, + "time_per_iteration": 3.1050939559936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169178, + "balance_loss_mlp": 1.12127948, + "epoch": 0.2610619469026549, + "flos": 1007850097152.0, + "grad_norm": 0.05849819020383372, + "language_loss": 0.85968256, + "learning_rate": 0.0008664149198026662, + "loss": 0.87137431, + "num_input_tokens_seen": 112575168, + "router_z_loss_mlp": 0.47900391, + "step": 1357, + "time_per_iteration": 3.226966619491577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156465, + "balance_loss_mlp": 1.10932934, + "epoch": 0.2612543285879184, + "flos": 536782961664.0, + "grad_norm": 0.07293583935871151, + "language_loss": 0.89518476, + "learning_rate": 0.0008662028715772883, + "loss": 0.90674949, + "num_input_tokens_seen": 112648480, + "router_z_loss_mlp": 0.47143555, + "step": 1358, + "time_per_iteration": 2.5949370861053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163078, + "balance_loss_mlp": 1.11718237, + "epoch": 0.261446710273182, + "flos": 519420501504.0, + "grad_norm": 0.05890556701012809, + "language_loss": 0.86217821, + "learning_rate": 0.0008659906811793467, + "loss": 0.87380904, + "num_input_tokens_seen": 112719856, + "router_z_loss_mlp": 0.45898438, + "step": 1359, + "time_per_iteration": 2.651193857192993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151481, + "balance_loss_mlp": 1.10699224, + "epoch": 0.26163909195844554, + "flos": 583259056128.0, + "grad_norm": 0.06298146111957026, + "language_loss": 0.90418088, + "learning_rate": 0.0008657783486912215, + "loss": 0.91569573, + "num_input_tokens_seen": 112795088, + "router_z_loss_mlp": 0.44482422, + "step": 1360, + "time_per_iteration": 2.723550319671631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156338, + "balance_loss_mlp": 1.11022782, + "epoch": 0.2618314736437091, + "flos": 958762179072.0, + "grad_norm": 0.055299708084911615, + "language_loss": 0.90110713, + "learning_rate": 0.0008655658741953472, + "loss": 0.91267049, + "num_input_tokens_seen": 112879888, + "router_z_loss_mlp": 0.4609375, + "step": 1361, + "time_per_iteration": 3.216830015182495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139946, + "balance_loss_mlp": 1.09564757, + "epoch": 0.26202385532897265, + "flos": 574803952128.0, + "grad_norm": 0.04868556149108388, + "language_loss": 0.89168048, + "learning_rate": 0.0008653532577742136, + "loss": 0.90307987, + "num_input_tokens_seen": 112952208, + "router_z_loss_mlp": 0.44311523, + "step": 1362, + "time_per_iteration": 2.718886375427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143457, + "balance_loss_mlp": 1.0986346, + "epoch": 0.26221623701423624, + "flos": 445471280640.0, + "grad_norm": 0.058057923999792295, + "language_loss": 0.87558335, + "learning_rate": 0.0008651404995103659, + "loss": 0.88701797, + "num_input_tokens_seen": 113017472, + "router_z_loss_mlp": 0.44824219, + "step": 1363, + "time_per_iteration": 2.594294309616089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137373, + "balance_loss_mlp": 1.09338474, + "epoch": 0.26240861869949983, + "flos": 535718043648.0, + "grad_norm": 0.06330728330165165, + "language_loss": 0.87334514, + "learning_rate": 0.0008649275994864041, + "loss": 0.88471884, + "num_input_tokens_seen": 113090000, + "router_z_loss_mlp": 0.43994141, + "step": 1364, + "time_per_iteration": 2.707449197769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144999, + "balance_loss_mlp": 1.09879303, + "epoch": 0.26260100038476336, + "flos": 565249052160.0, + "grad_norm": 0.05276541609050752, + "language_loss": 0.84391934, + "learning_rate": 0.0008647145577849834, + "loss": 0.85536933, + "num_input_tokens_seen": 113169424, + "router_z_loss_mlp": 0.46191406, + "step": 1365, + "time_per_iteration": 2.8216350078582764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131283, + "balance_loss_mlp": 1.08560157, + "epoch": 0.26279338207002695, + "flos": 613059508224.0, + "grad_norm": 0.05376997595185902, + "language_loss": 0.83317888, + "learning_rate": 0.0008645013744888139, + "loss": 0.84449172, + "num_input_tokens_seen": 113256752, + "router_z_loss_mlp": 0.45678711, + "step": 1366, + "time_per_iteration": 2.866891622543335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149616, + "balance_loss_mlp": 1.10536587, + "epoch": 0.2629857637552905, + "flos": 522832568832.0, + "grad_norm": 0.06316724717597957, + "language_loss": 0.87992281, + "learning_rate": 0.0008642880496806607, + "loss": 0.89141893, + "num_input_tokens_seen": 113330512, + "router_z_loss_mlp": 0.44287109, + "step": 1367, + "time_per_iteration": 2.7763173580169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142909, + "balance_loss_mlp": 1.09772861, + "epoch": 0.26317814544055407, + "flos": 534549238272.0, + "grad_norm": 0.05877759558608074, + "language_loss": 0.84959197, + "learning_rate": 0.0008640745834433437, + "loss": 0.86102104, + "num_input_tokens_seen": 113409088, + "router_z_loss_mlp": 0.4519043, + "step": 1368, + "time_per_iteration": 2.738328218460083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134336, + "balance_loss_mlp": 1.09018087, + "epoch": 0.2633705271258176, + "flos": 555543650304.0, + "grad_norm": 0.05935956886320276, + "language_loss": 0.87054664, + "learning_rate": 0.000863860975859738, + "loss": 0.88189, + "num_input_tokens_seen": 113486624, + "router_z_loss_mlp": 0.44165039, + "step": 1369, + "time_per_iteration": 2.9206831455230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131372, + "balance_loss_mlp": 1.0855242, + "epoch": 0.2635629088110812, + "flos": 552401026560.0, + "grad_norm": 0.06691392922801855, + "language_loss": 0.88684422, + "learning_rate": 0.0008636472270127733, + "loss": 0.89815795, + "num_input_tokens_seen": 113555776, + "router_z_loss_mlp": 0.45825195, + "step": 1370, + "time_per_iteration": 2.6078739166259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116186, + "balance_loss_mlp": 1.07021928, + "epoch": 0.2637552904963448, + "flos": 455984640000.0, + "grad_norm": 0.06515524250359679, + "language_loss": 0.90367895, + "learning_rate": 0.0008634333369854345, + "loss": 0.91484082, + "num_input_tokens_seen": 113624208, + "router_z_loss_mlp": 0.45947266, + "step": 1371, + "time_per_iteration": 2.6001384258270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110327, + "balance_loss_mlp": 1.0667206, + "epoch": 0.2639476721816083, + "flos": 613128890880.0, + "grad_norm": 0.056061894150206536, + "language_loss": 0.87892628, + "learning_rate": 0.0008632193058607608, + "loss": 0.89002955, + "num_input_tokens_seen": 113698544, + "router_z_loss_mlp": 0.43554688, + "step": 1372, + "time_per_iteration": 2.711435317993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113538, + "balance_loss_mlp": 1.06628299, + "epoch": 0.2641400538668719, + "flos": 571920486912.0, + "grad_norm": 0.060513983317086996, + "language_loss": 0.81023312, + "learning_rate": 0.0008630051337218466, + "loss": 0.82136846, + "num_input_tokens_seen": 113769024, + "router_z_loss_mlp": 0.47314453, + "step": 1373, + "time_per_iteration": 2.656416893005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110026, + "balance_loss_mlp": 1.0668484, + "epoch": 0.2643324355521354, + "flos": 582251037696.0, + "grad_norm": 0.0689512550651149, + "language_loss": 0.82808203, + "learning_rate": 0.0008627908206518409, + "loss": 0.83918226, + "num_input_tokens_seen": 113836320, + "router_z_loss_mlp": 0.43188477, + "step": 1374, + "time_per_iteration": 2.673738956451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039854, + "balance_loss_mlp": 1.02716982, + "epoch": 0.264524817237399, + "flos": 1544678926848.0, + "grad_norm": 0.01820003864645097, + "language_loss": 0.75151253, + "learning_rate": 0.0008625763667339472, + "loss": 0.76191109, + "num_input_tokens_seen": 114065040, + "router_z_loss_mlp": 0.12695312, + "step": 1375, + "time_per_iteration": 5.317140817642212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115308, + "balance_loss_mlp": 1.07272696, + "epoch": 0.26471719892266254, + "flos": 518034382848.0, + "grad_norm": 0.062338636090573274, + "language_loss": 0.91769958, + "learning_rate": 0.0008623617720514241, + "loss": 0.92885268, + "num_input_tokens_seen": 114133488, + "router_z_loss_mlp": 0.42578125, + "step": 1376, + "time_per_iteration": 2.666618585586548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117829, + "balance_loss_mlp": 1.07255304, + "epoch": 0.26490958060792613, + "flos": 517189349376.0, + "grad_norm": 0.08321054400070194, + "language_loss": 0.85169828, + "learning_rate": 0.0008621470366875848, + "loss": 0.86287659, + "num_input_tokens_seen": 114200704, + "router_z_loss_mlp": 0.45288086, + "step": 1377, + "time_per_iteration": 2.5939900875091553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011137, + "balance_loss_mlp": 1.0724293, + "epoch": 0.26510196229318966, + "flos": 596574388224.0, + "grad_norm": 0.0756812485553519, + "language_loss": 0.88528687, + "learning_rate": 0.0008619321607257966, + "loss": 0.89642382, + "num_input_tokens_seen": 114272160, + "router_z_loss_mlp": 0.41259766, + "step": 1378, + "time_per_iteration": 2.675719976425171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112322, + "balance_loss_mlp": 1.08109117, + "epoch": 0.26529434397845325, + "flos": 685800649728.0, + "grad_norm": 0.05967522341676015, + "language_loss": 0.8244732, + "learning_rate": 0.000861717144249482, + "loss": 0.8357054, + "num_input_tokens_seen": 114347904, + "router_z_loss_mlp": 0.42138672, + "step": 1379, + "time_per_iteration": 2.8289949893951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132354, + "balance_loss_mlp": 1.09170318, + "epoch": 0.26548672566371684, + "flos": 424353157632.0, + "grad_norm": 0.06486885922060631, + "language_loss": 0.90334523, + "learning_rate": 0.0008615019873421175, + "loss": 0.91466868, + "num_input_tokens_seen": 114409952, + "router_z_loss_mlp": 0.40649414, + "step": 1380, + "time_per_iteration": 2.4665510654449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141805, + "balance_loss_mlp": 1.09798408, + "epoch": 0.26567910734898037, + "flos": 489864526848.0, + "grad_norm": 0.06471812563896691, + "language_loss": 0.86262017, + "learning_rate": 0.0008612866900872349, + "loss": 0.87403822, + "num_input_tokens_seen": 114474832, + "router_z_loss_mlp": 0.43823242, + "step": 1381, + "time_per_iteration": 2.553489923477173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140972, + "balance_loss_mlp": 1.10017824, + "epoch": 0.26587148903424396, + "flos": 534203444736.0, + "grad_norm": 0.07006288293307902, + "language_loss": 0.88817614, + "learning_rate": 0.0008610712525684197, + "loss": 0.89958596, + "num_input_tokens_seen": 114545152, + "router_z_loss_mlp": 0.40771484, + "step": 1382, + "time_per_iteration": 2.623844861984253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156525, + "balance_loss_mlp": 1.11341906, + "epoch": 0.2660638707195075, + "flos": 1017464094720.0, + "grad_norm": 0.06690376769295572, + "language_loss": 0.85084939, + "learning_rate": 0.0008608556748693121, + "loss": 0.8624146, + "num_input_tokens_seen": 114626512, + "router_z_loss_mlp": 0.43115234, + "step": 1383, + "time_per_iteration": 3.248947858810425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149603, + "balance_loss_mlp": 1.10549557, + "epoch": 0.2662562524047711, + "flos": 523981550592.0, + "grad_norm": 0.05893966497122096, + "language_loss": 0.86648834, + "learning_rate": 0.000860639957073607, + "loss": 0.8779844, + "num_input_tokens_seen": 114701008, + "router_z_loss_mlp": 0.44116211, + "step": 1384, + "time_per_iteration": 2.6954376697540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161137, + "balance_loss_mlp": 1.11838901, + "epoch": 0.2664486340900346, + "flos": 552381202944.0, + "grad_norm": 0.05777577847879513, + "language_loss": 0.88325369, + "learning_rate": 0.0008604240992650534, + "loss": 0.8948651, + "num_input_tokens_seen": 114771984, + "router_z_loss_mlp": 0.42749023, + "step": 1385, + "time_per_iteration": 2.6810553073883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116884, + "balance_loss_mlp": 1.12613928, + "epoch": 0.2666410157752982, + "flos": 470157115392.0, + "grad_norm": 0.1266990207417539, + "language_loss": 0.89650941, + "learning_rate": 0.0008602081015274545, + "loss": 0.90819776, + "num_input_tokens_seen": 114844800, + "router_z_loss_mlp": 0.42724609, + "step": 1386, + "time_per_iteration": 2.7079007625579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169207, + "balance_loss_mlp": 1.12602973, + "epoch": 0.2668333974605617, + "flos": 569919131136.0, + "grad_norm": 0.05666517988787923, + "language_loss": 0.83684492, + "learning_rate": 0.0008599919639446684, + "loss": 0.84853697, + "num_input_tokens_seen": 114918544, + "router_z_loss_mlp": 0.43139648, + "step": 1387, + "time_per_iteration": 2.67275333404541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184027, + "balance_loss_mlp": 1.13755894, + "epoch": 0.2670257791458253, + "flos": 398982703104.0, + "grad_norm": 0.06873806966805297, + "language_loss": 0.80686462, + "learning_rate": 0.000859775686600607, + "loss": 0.81870484, + "num_input_tokens_seen": 114984272, + "router_z_loss_mlp": 0.46459961, + "step": 1388, + "time_per_iteration": 2.568384885787964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192065, + "balance_loss_mlp": 1.14676547, + "epoch": 0.2672181608310889, + "flos": 515847647232.0, + "grad_norm": 0.07413400256287127, + "language_loss": 0.85524642, + "learning_rate": 0.0008595592695792367, + "loss": 0.86716712, + "num_input_tokens_seen": 115054800, + "router_z_loss_mlp": 0.453125, + "step": 1389, + "time_per_iteration": 2.6748523712158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182907, + "balance_loss_mlp": 1.13884759, + "epoch": 0.26741054251635243, + "flos": 507521023488.0, + "grad_norm": 0.06676524761439688, + "language_loss": 0.9117986, + "learning_rate": 0.0008593427129645778, + "loss": 0.92362767, + "num_input_tokens_seen": 115120928, + "router_z_loss_mlp": 0.44042969, + "step": 1390, + "time_per_iteration": 2.5506954193115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186114, + "balance_loss_mlp": 1.14205468, + "epoch": 0.267602924201616, + "flos": 576647092224.0, + "grad_norm": 0.056989477345309104, + "language_loss": 0.85532665, + "learning_rate": 0.0008591260168407052, + "loss": 0.86718786, + "num_input_tokens_seen": 115196688, + "router_z_loss_mlp": 0.44067383, + "step": 1391, + "time_per_iteration": 2.759000778198242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181583, + "balance_loss_mlp": 1.13714194, + "epoch": 0.26779530588687955, + "flos": 523984121856.0, + "grad_norm": 0.12230490659722075, + "language_loss": 0.83154678, + "learning_rate": 0.0008589091812917479, + "loss": 0.84336257, + "num_input_tokens_seen": 115264912, + "router_z_loss_mlp": 0.4440918, + "step": 1392, + "time_per_iteration": 2.6213910579681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183464, + "balance_loss_mlp": 1.14030981, + "epoch": 0.26798768757214314, + "flos": 556771926528.0, + "grad_norm": 0.07403824045185783, + "language_loss": 0.8547672, + "learning_rate": 0.0008586922064018887, + "loss": 0.86660182, + "num_input_tokens_seen": 115334672, + "router_z_loss_mlp": 0.43139648, + "step": 1393, + "time_per_iteration": 2.6706490516662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170846, + "balance_loss_mlp": 1.12375855, + "epoch": 0.2681800692574067, + "flos": 930614717952.0, + "grad_norm": 0.06891205333434622, + "language_loss": 0.89827204, + "learning_rate": 0.0008584750922553651, + "loss": 0.90998048, + "num_input_tokens_seen": 115420032, + "router_z_loss_mlp": 0.47021484, + "step": 1394, + "time_per_iteration": 3.1465976238250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164798, + "balance_loss_mlp": 1.1222403, + "epoch": 0.26837245094267026, + "flos": 701080261632.0, + "grad_norm": 0.06253124916771012, + "language_loss": 0.84102368, + "learning_rate": 0.0008582578389364677, + "loss": 0.85267168, + "num_input_tokens_seen": 115492576, + "router_z_loss_mlp": 0.42529297, + "step": 1395, + "time_per_iteration": 2.853278875350952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170721, + "balance_loss_mlp": 1.12573135, + "epoch": 0.26856483262793385, + "flos": 593191683072.0, + "grad_norm": 0.0656545534576685, + "language_loss": 0.92268932, + "learning_rate": 0.0008580404465295422, + "loss": 0.93439656, + "num_input_tokens_seen": 115568368, + "router_z_loss_mlp": 0.44970703, + "step": 1396, + "time_per_iteration": 2.773932695388794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152323, + "balance_loss_mlp": 1.10826349, + "epoch": 0.2687572143131974, + "flos": 714271882752.0, + "grad_norm": 0.07972324646927738, + "language_loss": 0.88789833, + "learning_rate": 0.0008578229151189876, + "loss": 0.89942157, + "num_input_tokens_seen": 115651536, + "router_z_loss_mlp": 0.44067383, + "step": 1397, + "time_per_iteration": 2.934276819229126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144571, + "balance_loss_mlp": 1.10151267, + "epoch": 0.26894959599846097, + "flos": 467718561792.0, + "grad_norm": 0.10010461149900847, + "language_loss": 0.8178823, + "learning_rate": 0.0008576052447892573, + "loss": 0.82932794, + "num_input_tokens_seen": 115715696, + "router_z_loss_mlp": 0.43115234, + "step": 1398, + "time_per_iteration": 2.5337071418762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131122, + "balance_loss_mlp": 1.08768189, + "epoch": 0.2691419776837245, + "flos": 468701987328.0, + "grad_norm": 0.07718983812215899, + "language_loss": 0.86768365, + "learning_rate": 0.000857387435624858, + "loss": 0.87899494, + "num_input_tokens_seen": 115780928, + "router_z_loss_mlp": 0.43457031, + "step": 1399, + "time_per_iteration": 2.5189273357391357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127749, + "balance_loss_mlp": 1.08404672, + "epoch": 0.2693343593689881, + "flos": 937651396608.0, + "grad_norm": 0.0707561541840249, + "language_loss": 0.88852745, + "learning_rate": 0.0008571694877103513, + "loss": 0.89980495, + "num_input_tokens_seen": 115874432, + "router_z_loss_mlp": 0.43701172, + "step": 1400, + "time_per_iteration": 3.287325859069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126684, + "balance_loss_mlp": 1.08372128, + "epoch": 0.2695267410542516, + "flos": 577600782336.0, + "grad_norm": 0.08476375879770352, + "language_loss": 0.88499445, + "learning_rate": 0.0008569514011303515, + "loss": 0.89626133, + "num_input_tokens_seen": 115956608, + "router_z_loss_mlp": 0.4296875, + "step": 1401, + "time_per_iteration": 2.849506378173828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120044, + "balance_loss_mlp": 1.07770109, + "epoch": 0.2697191227395152, + "flos": 556823683584.0, + "grad_norm": 0.12418270059874827, + "language_loss": 0.88531977, + "learning_rate": 0.0008567331759695277, + "loss": 0.89652026, + "num_input_tokens_seen": 116031728, + "router_z_loss_mlp": 0.42358398, + "step": 1402, + "time_per_iteration": 2.7033023834228516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119932, + "balance_loss_mlp": 1.07584798, + "epoch": 0.26991150442477874, + "flos": 529281547776.0, + "grad_norm": 0.09855769315853927, + "language_loss": 0.86756563, + "learning_rate": 0.0008565148123126023, + "loss": 0.87876499, + "num_input_tokens_seen": 116104288, + "router_z_loss_mlp": 0.44091797, + "step": 1403, + "time_per_iteration": 2.645425319671631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119876, + "balance_loss_mlp": 1.07769978, + "epoch": 0.2701038861100423, + "flos": 532006797312.0, + "grad_norm": 0.15226973878739974, + "language_loss": 0.86578166, + "learning_rate": 0.0008562963102443516, + "loss": 0.87698042, + "num_input_tokens_seen": 116177920, + "router_z_loss_mlp": 0.421875, + "step": 1404, + "time_per_iteration": 2.6965179443359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130222, + "balance_loss_mlp": 1.08668637, + "epoch": 0.2702962677953059, + "flos": 735227020800.0, + "grad_norm": 0.09156828725831004, + "language_loss": 0.85926664, + "learning_rate": 0.0008560776698496056, + "loss": 0.87056887, + "num_input_tokens_seen": 116251680, + "router_z_loss_mlp": 0.43530273, + "step": 1405, + "time_per_iteration": 2.868159532546997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141969, + "balance_loss_mlp": 1.09707534, + "epoch": 0.27048864948056944, + "flos": 574761733632.0, + "grad_norm": 0.07226677638641436, + "language_loss": 0.86433703, + "learning_rate": 0.0008558588912132481, + "loss": 0.87575674, + "num_input_tokens_seen": 116327664, + "router_z_loss_mlp": 0.44873047, + "step": 1406, + "time_per_iteration": 2.8309988975524902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066671, + "balance_loss_mlp": 1.05236614, + "epoch": 0.27068103116583303, + "flos": 1423853489664.0, + "grad_norm": 0.03207539465139433, + "language_loss": 0.76458991, + "learning_rate": 0.0008556399744202163, + "loss": 0.77525663, + "num_input_tokens_seen": 116555152, + "router_z_loss_mlp": 0.14257812, + "step": 1407, + "time_per_iteration": 4.926543235778809 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136459, + "balance_loss_mlp": 1.09220862, + "epoch": 0.27087341285109656, + "flos": 531999456768.0, + "grad_norm": 0.06146298960376288, + "language_loss": 0.83448923, + "learning_rate": 0.0008554209195555016, + "loss": 0.84585381, + "num_input_tokens_seen": 116626016, + "router_z_loss_mlp": 0.44287109, + "step": 1408, + "time_per_iteration": 2.6698648929595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136456, + "balance_loss_mlp": 1.08965421, + "epoch": 0.27106579453636015, + "flos": 581378840064.0, + "grad_norm": 0.1627330563817166, + "language_loss": 0.89102834, + "learning_rate": 0.0008552017267041483, + "loss": 0.90239286, + "num_input_tokens_seen": 116699152, + "router_z_loss_mlp": 0.46801758, + "step": 1409, + "time_per_iteration": 2.6957972049713135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127578, + "balance_loss_mlp": 1.08349395, + "epoch": 0.2712581762216237, + "flos": 506801899008.0, + "grad_norm": 0.06560812899143556, + "language_loss": 0.83656335, + "learning_rate": 0.0008549823959512549, + "loss": 0.84783912, + "num_input_tokens_seen": 116770912, + "router_z_loss_mlp": 0.44091797, + "step": 1410, + "time_per_iteration": 2.7068376541137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011101, + "balance_loss_mlp": 1.06708908, + "epoch": 0.27145055790688727, + "flos": 997442823168.0, + "grad_norm": 0.08175260567644033, + "language_loss": 0.87610555, + "learning_rate": 0.0008547629273819728, + "loss": 0.88720655, + "num_input_tokens_seen": 116863088, + "router_z_loss_mlp": 0.43041992, + "step": 1411, + "time_per_iteration": 3.366260290145874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108005, + "balance_loss_mlp": 1.06542349, + "epoch": 0.2716429395921508, + "flos": 546681083904.0, + "grad_norm": 0.10517352924457117, + "language_loss": 0.84009993, + "learning_rate": 0.0008545433210815074, + "loss": 0.85118002, + "num_input_tokens_seen": 116929504, + "router_z_loss_mlp": 0.42578125, + "step": 1412, + "time_per_iteration": 2.630105972290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112252, + "balance_loss_mlp": 1.07931852, + "epoch": 0.2718353212774144, + "flos": 573225113088.0, + "grad_norm": 0.09841738404648297, + "language_loss": 0.87974489, + "learning_rate": 0.0008543235771351176, + "loss": 0.89097011, + "num_input_tokens_seen": 117004064, + "router_z_loss_mlp": 0.43188477, + "step": 1413, + "time_per_iteration": 2.725048065185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129323, + "balance_loss_mlp": 1.08635998, + "epoch": 0.272027702962678, + "flos": 644305549824.0, + "grad_norm": 0.059677420125308425, + "language_loss": 0.84918916, + "learning_rate": 0.0008541036956281154, + "loss": 0.86048239, + "num_input_tokens_seen": 117081328, + "router_z_loss_mlp": 0.42993164, + "step": 1414, + "time_per_iteration": 2.897216796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133545, + "balance_loss_mlp": 1.08898425, + "epoch": 0.2722200846479415, + "flos": 653726827008.0, + "grad_norm": 0.08487151018546404, + "language_loss": 0.82919049, + "learning_rate": 0.0008538836766458665, + "loss": 0.84052598, + "num_input_tokens_seen": 117156544, + "router_z_loss_mlp": 0.44580078, + "step": 1415, + "time_per_iteration": 2.8930981159210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137425, + "balance_loss_mlp": 1.0942955, + "epoch": 0.2724124663332051, + "flos": 579631873536.0, + "grad_norm": 0.09871518143765563, + "language_loss": 0.85738099, + "learning_rate": 0.0008536635202737897, + "loss": 0.86875528, + "num_input_tokens_seen": 117230208, + "router_z_loss_mlp": 0.43164062, + "step": 1416, + "time_per_iteration": 2.7891178131103516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137299, + "balance_loss_mlp": 1.0931915, + "epoch": 0.2726048480184686, + "flos": 537435274752.0, + "grad_norm": 0.10766210404252562, + "language_loss": 0.82790214, + "learning_rate": 0.0008534432265973573, + "loss": 0.83927512, + "num_input_tokens_seen": 117298080, + "router_z_loss_mlp": 0.44091797, + "step": 1417, + "time_per_iteration": 2.6409006118774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141948, + "balance_loss_mlp": 1.09691095, + "epoch": 0.2727972297037322, + "flos": 995797172736.0, + "grad_norm": 0.07824380469589887, + "language_loss": 0.88708508, + "learning_rate": 0.000853222795702095, + "loss": 0.89850456, + "num_input_tokens_seen": 117396256, + "router_z_loss_mlp": 0.45092773, + "step": 1418, + "time_per_iteration": 3.4312241077423096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115343, + "balance_loss_mlp": 1.10767758, + "epoch": 0.27298961138899575, + "flos": 606205638144.0, + "grad_norm": 0.06262628073505326, + "language_loss": 0.84196067, + "learning_rate": 0.0008530022276735813, + "loss": 0.85349494, + "num_input_tokens_seen": 117467936, + "router_z_loss_mlp": 0.45727539, + "step": 1419, + "time_per_iteration": 2.742341995239258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169959, + "balance_loss_mlp": 1.12742519, + "epoch": 0.27318199307425933, + "flos": 529325964288.0, + "grad_norm": 0.07008703106338479, + "language_loss": 0.86301696, + "learning_rate": 0.0008527815225974489, + "loss": 0.87471658, + "num_input_tokens_seen": 117538256, + "router_z_loss_mlp": 0.42529297, + "step": 1420, + "time_per_iteration": 2.643151044845581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172801, + "balance_loss_mlp": 1.12731028, + "epoch": 0.2733743747595229, + "flos": 409029129216.0, + "grad_norm": 0.10800570533054084, + "language_loss": 0.88767672, + "learning_rate": 0.0008525606805593829, + "loss": 0.8994047, + "num_input_tokens_seen": 117599488, + "router_z_loss_mlp": 0.45483398, + "step": 1421, + "time_per_iteration": 2.4374186992645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115892, + "balance_loss_mlp": 1.11283422, + "epoch": 0.27356675644478645, + "flos": 516225747456.0, + "grad_norm": 0.11472023337789067, + "language_loss": 0.83181965, + "learning_rate": 0.0008523397016451213, + "loss": 0.84340894, + "num_input_tokens_seen": 117664240, + "router_z_loss_mlp": 0.46142578, + "step": 1422, + "time_per_iteration": 2.585376739501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152063, + "balance_loss_mlp": 1.10824132, + "epoch": 0.27375913813005004, + "flos": 1052342088192.0, + "grad_norm": 0.08784028487991961, + "language_loss": 0.87910116, + "learning_rate": 0.0008521185859404564, + "loss": 0.89062172, + "num_input_tokens_seen": 117754768, + "router_z_loss_mlp": 0.43847656, + "step": 1423, + "time_per_iteration": 3.399348020553589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150781, + "balance_loss_mlp": 1.10634017, + "epoch": 0.27395151981531357, + "flos": 624805913088.0, + "grad_norm": 0.06323160386311827, + "language_loss": 0.89755672, + "learning_rate": 0.0008518973335312326, + "loss": 0.90906453, + "num_input_tokens_seen": 117832816, + "router_z_loss_mlp": 0.44433594, + "step": 1424, + "time_per_iteration": 2.771397352218628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141797, + "balance_loss_mlp": 1.09628344, + "epoch": 0.27414390150057716, + "flos": 550372506624.0, + "grad_norm": 0.0741893947597381, + "language_loss": 0.83755773, + "learning_rate": 0.0008516759445033477, + "loss": 0.84897572, + "num_input_tokens_seen": 117899168, + "router_z_loss_mlp": 0.45532227, + "step": 1425, + "time_per_iteration": 2.623136520385742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148744, + "balance_loss_mlp": 1.10227656, + "epoch": 0.2743362831858407, + "flos": 539866487808.0, + "grad_norm": 0.08118081060083703, + "language_loss": 0.85448551, + "learning_rate": 0.0008514544189427526, + "loss": 0.865973, + "num_input_tokens_seen": 117972384, + "router_z_loss_mlp": 0.46484375, + "step": 1426, + "time_per_iteration": 2.695749044418335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156426, + "balance_loss_mlp": 1.11208034, + "epoch": 0.2745286648711043, + "flos": 468590759424.0, + "grad_norm": 0.0837156631450272, + "language_loss": 0.86976963, + "learning_rate": 0.0008512327569354511, + "loss": 0.88133389, + "num_input_tokens_seen": 118039584, + "router_z_loss_mlp": 0.44360352, + "step": 1427, + "time_per_iteration": 2.5354061126708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160051, + "balance_loss_mlp": 1.11353528, + "epoch": 0.2747210465563678, + "flos": 472867683840.0, + "grad_norm": 0.09189170382991782, + "language_loss": 0.84034801, + "learning_rate": 0.0008510109585675001, + "loss": 0.8519485, + "num_input_tokens_seen": 118108352, + "router_z_loss_mlp": 0.46508789, + "step": 1428, + "time_per_iteration": 2.5996179580688477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093492, + "balance_loss_mlp": 1.07680273, + "epoch": 0.2749134282416314, + "flos": 1315085372928.0, + "grad_norm": 0.03549776566589832, + "language_loss": 0.81153345, + "learning_rate": 0.0008507890239250093, + "loss": 0.8224684, + "num_input_tokens_seen": 118331120, + "router_z_loss_mlp": 0.16699219, + "step": 1429, + "time_per_iteration": 4.714696407318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172648, + "balance_loss_mlp": 1.1280638, + "epoch": 0.275105809926895, + "flos": 970861718016.0, + "grad_norm": 0.1239425770540774, + "language_loss": 0.81035018, + "learning_rate": 0.0008505669530941415, + "loss": 0.82207668, + "num_input_tokens_seen": 118415872, + "router_z_loss_mlp": 0.44580078, + "step": 1430, + "time_per_iteration": 3.346867322921753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171144, + "balance_loss_mlp": 1.12613082, + "epoch": 0.2752981916121585, + "flos": 527344432128.0, + "grad_norm": 0.0741807723541833, + "language_loss": 0.84519219, + "learning_rate": 0.000850344746161112, + "loss": 0.85690367, + "num_input_tokens_seen": 118483008, + "router_z_loss_mlp": 0.45019531, + "step": 1431, + "time_per_iteration": 2.6365530490875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178527, + "balance_loss_mlp": 1.13418126, + "epoch": 0.2754905732974221, + "flos": 453709071360.0, + "grad_norm": 0.09683250699138053, + "language_loss": 0.88287663, + "learning_rate": 0.0008501224032121894, + "loss": 0.8946619, + "num_input_tokens_seen": 118545840, + "router_z_loss_mlp": 0.44360352, + "step": 1432, + "time_per_iteration": 2.5015640258789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178788, + "balance_loss_mlp": 1.13406062, + "epoch": 0.27568295498268564, + "flos": 497474597376.0, + "grad_norm": 0.06051880699738469, + "language_loss": 0.82098711, + "learning_rate": 0.0008498999243336946, + "loss": 0.832775, + "num_input_tokens_seen": 118615168, + "router_z_loss_mlp": 0.44726562, + "step": 1433, + "time_per_iteration": 2.643663167953491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198526, + "balance_loss_mlp": 1.15129471, + "epoch": 0.2758753366679492, + "flos": 608194510848.0, + "grad_norm": 0.07173936681504893, + "language_loss": 0.87897062, + "learning_rate": 0.0008496773096120021, + "loss": 0.89095587, + "num_input_tokens_seen": 118690384, + "router_z_loss_mlp": 0.47241211, + "step": 1434, + "time_per_iteration": 2.8680803775787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198281, + "balance_loss_mlp": 1.15164685, + "epoch": 0.27606771835321275, + "flos": 740129094144.0, + "grad_norm": 0.07924459326066897, + "language_loss": 0.84949142, + "learning_rate": 0.0008494545591335381, + "loss": 0.86147422, + "num_input_tokens_seen": 118763024, + "router_z_loss_mlp": 0.46630859, + "step": 1435, + "time_per_iteration": 2.9436187744140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197184, + "balance_loss_mlp": 1.15176487, + "epoch": 0.27626010003847634, + "flos": 554572707840.0, + "grad_norm": 0.05338969573395925, + "language_loss": 0.87283278, + "learning_rate": 0.0008492316729847823, + "loss": 0.88480461, + "num_input_tokens_seen": 118845536, + "router_z_loss_mlp": 0.4543457, + "step": 1436, + "time_per_iteration": 2.817201614379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195413, + "balance_loss_mlp": 1.14739525, + "epoch": 0.2764524817237399, + "flos": 542554661376.0, + "grad_norm": 0.08524745340475512, + "language_loss": 0.80082995, + "learning_rate": 0.0008490086512522664, + "loss": 0.81278408, + "num_input_tokens_seen": 118919008, + "router_z_loss_mlp": 0.47998047, + "step": 1437, + "time_per_iteration": 2.7126290798187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196486, + "balance_loss_mlp": 1.14870656, + "epoch": 0.27664486340900346, + "flos": 406246980096.0, + "grad_norm": 0.06867103991167788, + "language_loss": 0.90572739, + "learning_rate": 0.0008487854940225755, + "loss": 0.9176923, + "num_input_tokens_seen": 118981376, + "router_z_loss_mlp": 0.47729492, + "step": 1438, + "time_per_iteration": 2.431755542755127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207177, + "balance_loss_mlp": 1.15858746, + "epoch": 0.27683724509426705, + "flos": 522138410496.0, + "grad_norm": 0.13716227323677116, + "language_loss": 0.90202403, + "learning_rate": 0.0008485622013823466, + "loss": 0.91409582, + "num_input_tokens_seen": 119050560, + "router_z_loss_mlp": 0.48608398, + "step": 1439, + "time_per_iteration": 2.647594451904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198257, + "balance_loss_mlp": 1.15062046, + "epoch": 0.2770296267795306, + "flos": 535349855232.0, + "grad_norm": 0.09985187013126534, + "language_loss": 0.836923, + "learning_rate": 0.00084833877341827, + "loss": 0.84890562, + "num_input_tokens_seen": 119121104, + "router_z_loss_mlp": 0.47680664, + "step": 1440, + "time_per_iteration": 2.652665138244629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215433, + "balance_loss_mlp": 1.16562724, + "epoch": 0.27722200846479417, + "flos": 487991651328.0, + "grad_norm": 0.09777751450797587, + "language_loss": 0.81022394, + "learning_rate": 0.000848115210217088, + "loss": 0.82237822, + "num_input_tokens_seen": 119187712, + "router_z_loss_mlp": 0.49853516, + "step": 1441, + "time_per_iteration": 2.550879955291748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120133, + "balance_loss_mlp": 1.15166724, + "epoch": 0.2774143901500577, + "flos": 618297836544.0, + "grad_norm": 0.06658099231370791, + "language_loss": 0.82249796, + "learning_rate": 0.0008478915118655952, + "loss": 0.83451128, + "num_input_tokens_seen": 119259264, + "router_z_loss_mlp": 0.49658203, + "step": 1442, + "time_per_iteration": 2.7541940212249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209129, + "balance_loss_mlp": 1.16261363, + "epoch": 0.2776067718353213, + "flos": 513819127296.0, + "grad_norm": 0.05385742523937431, + "language_loss": 0.86750221, + "learning_rate": 0.0008476676784506393, + "loss": 0.87959349, + "num_input_tokens_seen": 119328304, + "router_z_loss_mlp": 0.46557617, + "step": 1443, + "time_per_iteration": 2.6595921516418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120895, + "balance_loss_mlp": 1.16083765, + "epoch": 0.2777991535205848, + "flos": 1004395811328.0, + "grad_norm": 0.07541643273231594, + "language_loss": 0.82715142, + "learning_rate": 0.0008474437100591201, + "loss": 0.83924091, + "num_input_tokens_seen": 119412352, + "router_z_loss_mlp": 0.48120117, + "step": 1444, + "time_per_iteration": 3.285985231399536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209577, + "balance_loss_mlp": 1.16258454, + "epoch": 0.2779915352058484, + "flos": 550278531072.0, + "grad_norm": 0.07952238187909891, + "language_loss": 0.8560605, + "learning_rate": 0.0008472196067779898, + "loss": 0.86815625, + "num_input_tokens_seen": 119484464, + "router_z_loss_mlp": 0.47021484, + "step": 1445, + "time_per_iteration": 2.677077293395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204567, + "balance_loss_mlp": 1.15600109, + "epoch": 0.278183916891112, + "flos": 873798160896.0, + "grad_norm": 0.10163023549653756, + "language_loss": 0.86494523, + "learning_rate": 0.0008469953686942531, + "loss": 0.87699091, + "num_input_tokens_seen": 119557280, + "router_z_loss_mlp": 0.48583984, + "step": 1446, + "time_per_iteration": 3.10603928565979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188864, + "balance_loss_mlp": 1.14158559, + "epoch": 0.2783762985763755, + "flos": 624064766976.0, + "grad_norm": 0.0769454608790312, + "language_loss": 0.83537692, + "learning_rate": 0.0008467709958949668, + "loss": 0.84726554, + "num_input_tokens_seen": 119631232, + "router_z_loss_mlp": 0.47265625, + "step": 1447, + "time_per_iteration": 2.7602903842926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116478, + "balance_loss_mlp": 1.11943233, + "epoch": 0.2785686802616391, + "flos": 581838432768.0, + "grad_norm": 0.08244080074007111, + "language_loss": 0.86534739, + "learning_rate": 0.0008465464884672403, + "loss": 0.87699515, + "num_input_tokens_seen": 119700224, + "router_z_loss_mlp": 0.45410156, + "step": 1448, + "time_per_iteration": 2.702974796295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178355, + "balance_loss_mlp": 1.13424778, + "epoch": 0.27876106194690264, + "flos": 587333348352.0, + "grad_norm": 0.061441667483596626, + "language_loss": 0.85982984, + "learning_rate": 0.0008463218464982348, + "loss": 0.87161338, + "num_input_tokens_seen": 119781376, + "router_z_loss_mlp": 0.44091797, + "step": 1449, + "time_per_iteration": 2.832615852355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185601, + "balance_loss_mlp": 1.14058757, + "epoch": 0.27895344363216623, + "flos": 875982325248.0, + "grad_norm": 0.07503412994840371, + "language_loss": 0.88168389, + "learning_rate": 0.0008460970700751645, + "loss": 0.89353991, + "num_input_tokens_seen": 119856672, + "router_z_loss_mlp": 0.45019531, + "step": 1450, + "time_per_iteration": 3.0487136840820312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185626, + "balance_loss_mlp": 1.13977861, + "epoch": 0.27914582531742976, + "flos": 603910245888.0, + "grad_norm": 0.06352945894963989, + "language_loss": 0.88538259, + "learning_rate": 0.000845872159285295, + "loss": 0.89723885, + "num_input_tokens_seen": 119929008, + "router_z_loss_mlp": 0.45849609, + "step": 1451, + "time_per_iteration": 2.715423822402954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067116, + "balance_loss_mlp": 1.04985404, + "epoch": 0.27933820700269335, + "flos": 1497738097152.0, + "grad_norm": 0.02807340123185793, + "language_loss": 0.77766848, + "learning_rate": 0.0008456471142159447, + "loss": 0.78833961, + "num_input_tokens_seen": 120164032, + "router_z_loss_mlp": 0.17285156, + "step": 1452, + "time_per_iteration": 4.906192302703857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197684, + "balance_loss_mlp": 1.15064442, + "epoch": 0.2795305886879569, + "flos": 1031859025920.0, + "grad_norm": 0.06703382456082828, + "language_loss": 0.86617672, + "learning_rate": 0.0008454219349544836, + "loss": 0.87815356, + "num_input_tokens_seen": 120246784, + "router_z_loss_mlp": 0.47045898, + "step": 1453, + "time_per_iteration": 3.3534200191497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198016, + "balance_loss_mlp": 1.15343201, + "epoch": 0.27972297037322047, + "flos": 607058012160.0, + "grad_norm": 0.08552050648295068, + "language_loss": 0.82341981, + "learning_rate": 0.000845196621588334, + "loss": 0.83540004, + "num_input_tokens_seen": 120318208, + "router_z_loss_mlp": 0.44580078, + "step": 1454, + "time_per_iteration": 2.743699073791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204394, + "balance_loss_mlp": 1.1566391, + "epoch": 0.27991535205848406, + "flos": 630380123136.0, + "grad_norm": 0.05325666962256515, + "language_loss": 0.7637955, + "learning_rate": 0.0008449711742049706, + "loss": 0.77583951, + "num_input_tokens_seen": 120393248, + "router_z_loss_mlp": 0.4777832, + "step": 1455, + "time_per_iteration": 2.782561779022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208188, + "balance_loss_mlp": 1.16222095, + "epoch": 0.2801077337437476, + "flos": 549297676800.0, + "grad_norm": 0.09912152167704158, + "language_loss": 0.84447122, + "learning_rate": 0.0008447455928919196, + "loss": 0.85655314, + "num_input_tokens_seen": 120461040, + "router_z_loss_mlp": 0.45996094, + "step": 1456, + "time_per_iteration": 2.597557306289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242882, + "balance_loss_mlp": 1.19460225, + "epoch": 0.2803001154290112, + "flos": 486761177088.0, + "grad_norm": 0.060789109492995964, + "language_loss": 0.87272859, + "learning_rate": 0.0008445198777367595, + "loss": 0.88515741, + "num_input_tokens_seen": 120530400, + "router_z_loss_mlp": 0.48291016, + "step": 1457, + "time_per_iteration": 2.5689990520477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283391, + "balance_loss_mlp": 1.23394287, + "epoch": 0.2804924971142747, + "flos": 522074170368.0, + "grad_norm": 0.0840599244275116, + "language_loss": 0.80820799, + "learning_rate": 0.0008442940288271208, + "loss": 0.82104188, + "num_input_tokens_seen": 120598304, + "router_z_loss_mlp": 0.49365234, + "step": 1458, + "time_per_iteration": 2.674907922744751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01299064, + "balance_loss_mlp": 1.24899602, + "epoch": 0.2806848787995383, + "flos": 527697566208.0, + "grad_norm": 0.06912303271008884, + "language_loss": 0.87410611, + "learning_rate": 0.0008440680462506856, + "loss": 0.88709676, + "num_input_tokens_seen": 120675712, + "router_z_loss_mlp": 0.50073242, + "step": 1459, + "time_per_iteration": 2.73905873298645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01312423, + "balance_loss_mlp": 1.26221192, + "epoch": 0.2808772604848018, + "flos": 485493626880.0, + "grad_norm": 0.11964292138845481, + "language_loss": 0.86650789, + "learning_rate": 0.0008438419300951883, + "loss": 0.87963212, + "num_input_tokens_seen": 120746544, + "router_z_loss_mlp": 0.50219727, + "step": 1460, + "time_per_iteration": 2.6775193214416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01277494, + "balance_loss_mlp": 1.22690177, + "epoch": 0.2810696421700654, + "flos": 618139620864.0, + "grad_norm": 0.08967430845786024, + "language_loss": 0.86711442, + "learning_rate": 0.0008436156804484148, + "loss": 0.87988937, + "num_input_tokens_seen": 120823520, + "router_z_loss_mlp": 0.50610352, + "step": 1461, + "time_per_iteration": 2.8446624279022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225027, + "balance_loss_mlp": 1.17615128, + "epoch": 0.28126202385532895, + "flos": 454754165760.0, + "grad_norm": 0.06778030965882964, + "language_loss": 0.88354933, + "learning_rate": 0.0008433892973982031, + "loss": 0.89579964, + "num_input_tokens_seen": 120889568, + "router_z_loss_mlp": 0.48901367, + "step": 1462, + "time_per_iteration": 2.5101869106292725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212759, + "balance_loss_mlp": 1.16168988, + "epoch": 0.28145440554059253, + "flos": 530704742400.0, + "grad_norm": 0.07940790981700917, + "language_loss": 0.85705763, + "learning_rate": 0.0008431627810324431, + "loss": 0.86918521, + "num_input_tokens_seen": 120958480, + "router_z_loss_mlp": 0.51098633, + "step": 1463, + "time_per_iteration": 2.6701931953430176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208608, + "balance_loss_mlp": 1.15906441, + "epoch": 0.2816467872258561, + "flos": 452228977152.0, + "grad_norm": 0.1112721524597414, + "language_loss": 0.81312853, + "learning_rate": 0.000842936131439076, + "loss": 0.82521462, + "num_input_tokens_seen": 121028032, + "router_z_loss_mlp": 0.49584961, + "step": 1464, + "time_per_iteration": 2.626397132873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182235, + "balance_loss_mlp": 1.13440847, + "epoch": 0.28183916891111965, + "flos": 472712039424.0, + "grad_norm": 0.10805991000078381, + "language_loss": 0.88305855, + "learning_rate": 0.0008427093487060951, + "loss": 0.89488095, + "num_input_tokens_seen": 121099280, + "router_z_loss_mlp": 0.4777832, + "step": 1465, + "time_per_iteration": 2.6287689208984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152025, + "balance_loss_mlp": 1.10815573, + "epoch": 0.28203155059638324, + "flos": 557053479936.0, + "grad_norm": 0.05392746655550109, + "language_loss": 0.85014635, + "learning_rate": 0.000842482432921545, + "loss": 0.86166662, + "num_input_tokens_seen": 121180240, + "router_z_loss_mlp": 0.4387207, + "step": 1466, + "time_per_iteration": 2.843055009841919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140929, + "balance_loss_mlp": 1.09691691, + "epoch": 0.28222393228164677, + "flos": 416980224000.0, + "grad_norm": 0.12216249404138245, + "language_loss": 0.8786549, + "learning_rate": 0.0008422553841735225, + "loss": 0.89006418, + "num_input_tokens_seen": 121242736, + "router_z_loss_mlp": 0.44018555, + "step": 1467, + "time_per_iteration": 2.4870855808258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130953, + "balance_loss_mlp": 1.08686972, + "epoch": 0.28241631396691036, + "flos": 604910923776.0, + "grad_norm": 0.0834179705505054, + "language_loss": 0.85186172, + "learning_rate": 0.0008420282025501757, + "loss": 0.86317128, + "num_input_tokens_seen": 121319248, + "router_z_loss_mlp": 0.44091797, + "step": 1468, + "time_per_iteration": 2.746919631958008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139526, + "balance_loss_mlp": 1.09730196, + "epoch": 0.2826086956521739, + "flos": 572968152576.0, + "grad_norm": 0.07747841896553878, + "language_loss": 0.85862702, + "learning_rate": 0.0008418008881397043, + "loss": 0.8700223, + "num_input_tokens_seen": 121392064, + "router_z_loss_mlp": 0.42236328, + "step": 1469, + "time_per_iteration": 2.7157111167907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011536, + "balance_loss_mlp": 1.11108959, + "epoch": 0.2828010773374375, + "flos": 842756949504.0, + "grad_norm": 0.09196817065592088, + "language_loss": 0.83090472, + "learning_rate": 0.0008415734410303595, + "loss": 0.84244066, + "num_input_tokens_seen": 121475984, + "router_z_loss_mlp": 0.42529297, + "step": 1470, + "time_per_iteration": 3.2546660900115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159701, + "balance_loss_mlp": 1.1166662, + "epoch": 0.28299345902270107, + "flos": 542675801088.0, + "grad_norm": 0.07745609031802311, + "language_loss": 0.91133046, + "learning_rate": 0.0008413458613104444, + "loss": 0.92292744, + "num_input_tokens_seen": 121551024, + "router_z_loss_mlp": 0.43017578, + "step": 1471, + "time_per_iteration": 2.683119773864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124215, + "balance_loss_mlp": 1.08091772, + "epoch": 0.2831858407079646, + "flos": 571606626816.0, + "grad_norm": 0.06716648824100378, + "language_loss": 0.83225214, + "learning_rate": 0.0008411181490683129, + "loss": 0.84349424, + "num_input_tokens_seen": 121624528, + "router_z_loss_mlp": 0.43334961, + "step": 1472, + "time_per_iteration": 2.7247512340545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112102, + "balance_loss_mlp": 1.06692195, + "epoch": 0.2833782223932282, + "flos": 763826734080.0, + "grad_norm": 0.08730853561294576, + "language_loss": 0.83099282, + "learning_rate": 0.0008408903043923707, + "loss": 0.84211385, + "num_input_tokens_seen": 121706736, + "router_z_loss_mlp": 0.45166016, + "step": 1473, + "time_per_iteration": 2.9982750415802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011136, + "balance_loss_mlp": 1.06675041, + "epoch": 0.2835706040784917, + "flos": 539051189760.0, + "grad_norm": 0.09441991509127853, + "language_loss": 0.81456125, + "learning_rate": 0.0008406623273710754, + "loss": 0.82569724, + "num_input_tokens_seen": 121773008, + "router_z_loss_mlp": 0.46826172, + "step": 1474, + "time_per_iteration": 2.6457254886627197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107143, + "balance_loss_mlp": 1.06482363, + "epoch": 0.2837629857637553, + "flos": 530593514496.0, + "grad_norm": 0.08147557265850319, + "language_loss": 0.83874208, + "learning_rate": 0.0008404342180929351, + "loss": 0.84981352, + "num_input_tokens_seen": 121840016, + "router_z_loss_mlp": 0.42358398, + "step": 1475, + "time_per_iteration": 2.6071481704711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110668, + "balance_loss_mlp": 1.06758618, + "epoch": 0.28395536744901884, + "flos": 540032044032.0, + "grad_norm": 0.0682383784230515, + "language_loss": 0.81900609, + "learning_rate": 0.00084020597664651, + "loss": 0.83011281, + "num_input_tokens_seen": 121915008, + "router_z_loss_mlp": 0.43066406, + "step": 1476, + "time_per_iteration": 2.831547260284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118821, + "balance_loss_mlp": 1.07149458, + "epoch": 0.2841477491342824, + "flos": 573635146752.0, + "grad_norm": 0.08199753583087593, + "language_loss": 0.84526181, + "learning_rate": 0.0008399776031204111, + "loss": 0.85645002, + "num_input_tokens_seen": 121987456, + "router_z_loss_mlp": 0.47290039, + "step": 1477, + "time_per_iteration": 2.7336621284484863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112444, + "balance_loss_mlp": 1.07832992, + "epoch": 0.28434013081954596, + "flos": 572068790784.0, + "grad_norm": 0.07183050675580523, + "language_loss": 0.80975109, + "learning_rate": 0.0008397490976033009, + "loss": 0.82099551, + "num_input_tokens_seen": 122058720, + "router_z_loss_mlp": 0.46118164, + "step": 1478, + "time_per_iteration": 2.668337345123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053875, + "balance_loss_mlp": 1.03766239, + "epoch": 0.28453251250480954, + "flos": 1553376310272.0, + "grad_norm": 0.035679392232843235, + "language_loss": 0.77879643, + "learning_rate": 0.000839520460183893, + "loss": 0.78933525, + "num_input_tokens_seen": 122285792, + "router_z_loss_mlp": 0.16210938, + "step": 1479, + "time_per_iteration": 4.813107252120972 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132957, + "balance_loss_mlp": 1.08925462, + "epoch": 0.28472489419007313, + "flos": 749061043200.0, + "grad_norm": 0.06426749014533666, + "language_loss": 0.85708797, + "learning_rate": 0.0008392916909509525, + "loss": 0.86841756, + "num_input_tokens_seen": 122366608, + "router_z_loss_mlp": 0.43725586, + "step": 1480, + "time_per_iteration": 3.105465888977051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145673, + "balance_loss_mlp": 1.10180378, + "epoch": 0.28491727587533666, + "flos": 490158563328.0, + "grad_norm": 0.12099224111333258, + "language_loss": 0.8583495, + "learning_rate": 0.0008390627899932954, + "loss": 0.86980623, + "num_input_tokens_seen": 122435536, + "router_z_loss_mlp": 0.43847656, + "step": 1481, + "time_per_iteration": 2.5961339473724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146403, + "balance_loss_mlp": 1.1041795, + "epoch": 0.28510965756060025, + "flos": 729007838208.0, + "grad_norm": 0.09850404509995118, + "language_loss": 0.88747412, + "learning_rate": 0.000838833757399789, + "loss": 0.89893812, + "num_input_tokens_seen": 122515584, + "router_z_loss_mlp": 0.42211914, + "step": 1482, + "time_per_iteration": 2.9445223808288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160742, + "balance_loss_mlp": 1.11513209, + "epoch": 0.2853020392458638, + "flos": 551573245440.0, + "grad_norm": 0.09258701289693592, + "language_loss": 0.81233478, + "learning_rate": 0.0008386045932593515, + "loss": 0.82394218, + "num_input_tokens_seen": 122585552, + "router_z_loss_mlp": 0.45605469, + "step": 1483, + "time_per_iteration": 2.696171283721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172022, + "balance_loss_mlp": 1.12853456, + "epoch": 0.28549442093112737, + "flos": 754783557120.0, + "grad_norm": 0.07718327666813503, + "language_loss": 0.8687939, + "learning_rate": 0.0008383752976609525, + "loss": 0.88051414, + "num_input_tokens_seen": 122658928, + "router_z_loss_mlp": 0.43481445, + "step": 1484, + "time_per_iteration": 2.948983907699585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159194, + "balance_loss_mlp": 1.11508679, + "epoch": 0.2856868026163909, + "flos": 538589025792.0, + "grad_norm": 0.06564205880415652, + "language_loss": 0.80617285, + "learning_rate": 0.0008381458706936123, + "loss": 0.81776482, + "num_input_tokens_seen": 122729056, + "router_z_loss_mlp": 0.44116211, + "step": 1485, + "time_per_iteration": 2.689715623855591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117177, + "balance_loss_mlp": 1.12740064, + "epoch": 0.2858791843016545, + "flos": 583772977152.0, + "grad_norm": 0.06570872016312425, + "language_loss": 0.87734085, + "learning_rate": 0.0008379163124464025, + "loss": 0.88905853, + "num_input_tokens_seen": 122802832, + "router_z_loss_mlp": 0.44384766, + "step": 1486, + "time_per_iteration": 2.7226197719573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166912, + "balance_loss_mlp": 1.12526059, + "epoch": 0.286071565986918, + "flos": 644812130304.0, + "grad_norm": 0.0915307653224295, + "language_loss": 0.77564812, + "learning_rate": 0.0008376866230084452, + "loss": 0.78731728, + "num_input_tokens_seen": 122881328, + "router_z_loss_mlp": 0.41650391, + "step": 1487, + "time_per_iteration": 2.82708477973938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154293, + "balance_loss_mlp": 1.10901785, + "epoch": 0.2862639476721816, + "flos": 491361873408.0, + "grad_norm": 0.07232162522245564, + "language_loss": 0.86754864, + "learning_rate": 0.000837456802468914, + "loss": 0.87909162, + "num_input_tokens_seen": 122949680, + "router_z_loss_mlp": 0.45239258, + "step": 1488, + "time_per_iteration": 2.6107335090637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115391, + "balance_loss_mlp": 1.1082294, + "epoch": 0.2864563293574452, + "flos": 521639170560.0, + "grad_norm": 0.06580975478488113, + "language_loss": 0.85965604, + "learning_rate": 0.0008372268509170331, + "loss": 0.8711952, + "num_input_tokens_seen": 123024736, + "router_z_loss_mlp": 0.45678711, + "step": 1489, + "time_per_iteration": 2.682190418243408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147981, + "balance_loss_mlp": 1.10554218, + "epoch": 0.2866487110427087, + "flos": 547118281728.0, + "grad_norm": 0.0640942252200205, + "language_loss": 0.85215169, + "learning_rate": 0.0008369967684420779, + "loss": 0.86363149, + "num_input_tokens_seen": 123097344, + "router_z_loss_mlp": 0.42431641, + "step": 1490, + "time_per_iteration": 2.708315372467041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011558, + "balance_loss_mlp": 1.11154985, + "epoch": 0.2868410927279723, + "flos": 482224720896.0, + "grad_norm": 0.07293711729105107, + "language_loss": 0.84566355, + "learning_rate": 0.0008367665551333736, + "loss": 0.85722154, + "num_input_tokens_seen": 123166240, + "router_z_loss_mlp": 0.44262695, + "step": 1491, + "time_per_iteration": 2.605665445327759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159368, + "balance_loss_mlp": 1.11216116, + "epoch": 0.28703347441323585, + "flos": 724889129472.0, + "grad_norm": 0.0802107480821924, + "language_loss": 0.85808468, + "learning_rate": 0.0008365362110802977, + "loss": 0.86967838, + "num_input_tokens_seen": 123238160, + "router_z_loss_mlp": 0.47241211, + "step": 1492, + "time_per_iteration": 2.879655122756958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155437, + "balance_loss_mlp": 1.109303, + "epoch": 0.28722585609849943, + "flos": 634978248192.0, + "grad_norm": 0.06007050516222503, + "language_loss": 0.82957923, + "learning_rate": 0.0008363057363722773, + "loss": 0.84113365, + "num_input_tokens_seen": 123319504, + "router_z_loss_mlp": 0.46142578, + "step": 1493, + "time_per_iteration": 2.8600335121154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154458, + "balance_loss_mlp": 1.11085081, + "epoch": 0.28741823778376296, + "flos": 510229020672.0, + "grad_norm": 0.060904552171674266, + "language_loss": 0.8464222, + "learning_rate": 0.0008360751310987906, + "loss": 0.85796678, + "num_input_tokens_seen": 123387008, + "router_z_loss_mlp": 0.4362793, + "step": 1494, + "time_per_iteration": 2.602029800415039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151781, + "balance_loss_mlp": 1.11160707, + "epoch": 0.28761061946902655, + "flos": 603752030208.0, + "grad_norm": 0.06255193118064963, + "language_loss": 0.86073208, + "learning_rate": 0.0008358443953493666, + "loss": 0.87224984, + "num_input_tokens_seen": 123471056, + "router_z_loss_mlp": 0.40185547, + "step": 1495, + "time_per_iteration": 2.8682689666748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116061, + "balance_loss_mlp": 1.11702669, + "epoch": 0.28780300115429014, + "flos": 407193329664.0, + "grad_norm": 0.06637793594414569, + "language_loss": 0.89093578, + "learning_rate": 0.0008356135292135851, + "loss": 0.90254188, + "num_input_tokens_seen": 123535024, + "router_z_loss_mlp": 0.43579102, + "step": 1496, + "time_per_iteration": 2.519700288772583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162426, + "balance_loss_mlp": 1.11760294, + "epoch": 0.28799538283955367, + "flos": 374929357824.0, + "grad_norm": 0.07926576541007177, + "language_loss": 0.92873323, + "learning_rate": 0.0008353825327810758, + "loss": 0.94035745, + "num_input_tokens_seen": 123596224, + "router_z_loss_mlp": 0.44873047, + "step": 1497, + "time_per_iteration": 2.4195892810821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140417, + "balance_loss_mlp": 1.09852648, + "epoch": 0.28818776452481726, + "flos": 591919363584.0, + "grad_norm": 0.05522330058639147, + "language_loss": 0.81832987, + "learning_rate": 0.00083515140614152, + "loss": 0.82973409, + "num_input_tokens_seen": 123668640, + "router_z_loss_mlp": 0.41894531, + "step": 1498, + "time_per_iteration": 2.6989245414733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151843, + "balance_loss_mlp": 1.10992932, + "epoch": 0.2883801462100808, + "flos": 535075642368.0, + "grad_norm": 0.08112895482541128, + "language_loss": 0.87581354, + "learning_rate": 0.0008349201493846485, + "loss": 0.88733196, + "num_input_tokens_seen": 123740816, + "router_z_loss_mlp": 0.41894531, + "step": 1499, + "time_per_iteration": 2.647165298461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113639, + "balance_loss_mlp": 1.09364128, + "epoch": 0.2885725278953444, + "flos": 480094884864.0, + "grad_norm": 0.06188269799142739, + "language_loss": 0.89485824, + "learning_rate": 0.0008346887626002432, + "loss": 0.90622216, + "num_input_tokens_seen": 123805968, + "router_z_loss_mlp": 0.42724609, + "step": 1500, + "time_per_iteration": 2.546494960784912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138741, + "balance_loss_mlp": 1.09546816, + "epoch": 0.2887649095806079, + "flos": 464044391424.0, + "grad_norm": 0.07756887509348087, + "language_loss": 0.86612689, + "learning_rate": 0.000834457245878137, + "loss": 0.87751424, + "num_input_tokens_seen": 123876576, + "router_z_loss_mlp": 0.43261719, + "step": 1501, + "time_per_iteration": 2.6271145343780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132854, + "balance_loss_mlp": 1.08993816, + "epoch": 0.2889572912658715, + "flos": 931032092160.0, + "grad_norm": 0.07465598629984396, + "language_loss": 0.8176384, + "learning_rate": 0.000834225599308212, + "loss": 0.82896686, + "num_input_tokens_seen": 123967664, + "router_z_loss_mlp": 0.42895508, + "step": 1502, + "time_per_iteration": 3.2550971508026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150677, + "balance_loss_mlp": 1.10580611, + "epoch": 0.28914967295113503, + "flos": 570129103872.0, + "grad_norm": 0.07581203663628927, + "language_loss": 0.85830456, + "learning_rate": 0.0008339938229804016, + "loss": 0.8698113, + "num_input_tokens_seen": 124039680, + "router_z_loss_mlp": 0.44897461, + "step": 1503, + "time_per_iteration": 2.704310417175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132016, + "balance_loss_mlp": 1.11475468, + "epoch": 0.2893420546363986, + "flos": 1486614643200.0, + "grad_norm": 0.04995777902546146, + "language_loss": 0.75434822, + "learning_rate": 0.0008337619169846895, + "loss": 0.76566839, + "num_input_tokens_seen": 124278848, + "router_z_loss_mlp": 0.17285156, + "step": 1504, + "time_per_iteration": 4.959474563598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157244, + "balance_loss_mlp": 1.10965538, + "epoch": 0.2895344363216622, + "flos": 470186850816.0, + "grad_norm": 0.06157445053236475, + "language_loss": 0.84505653, + "learning_rate": 0.0008335298814111094, + "loss": 0.85662901, + "num_input_tokens_seen": 124346736, + "router_z_loss_mlp": 0.47607422, + "step": 1505, + "time_per_iteration": 2.5612986087799072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178976, + "balance_loss_mlp": 1.13374829, + "epoch": 0.28972681800692573, + "flos": 648194835456.0, + "grad_norm": 0.05887296654917154, + "language_loss": 0.88222575, + "learning_rate": 0.0008332977163497455, + "loss": 0.89401549, + "num_input_tokens_seen": 124420816, + "router_z_loss_mlp": 0.4519043, + "step": 1506, + "time_per_iteration": 2.8017849922180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183741, + "balance_loss_mlp": 1.13696313, + "epoch": 0.2899191996921893, + "flos": 572224435200.0, + "grad_norm": 0.07773532252894584, + "language_loss": 0.83964998, + "learning_rate": 0.0008330654218907325, + "loss": 0.8514874, + "num_input_tokens_seen": 124490480, + "router_z_loss_mlp": 0.46801758, + "step": 1507, + "time_per_iteration": 2.6568052768707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167782, + "balance_loss_mlp": 1.12016964, + "epoch": 0.29011158137745285, + "flos": 661356721152.0, + "grad_norm": 0.05364053536005051, + "language_loss": 0.82260346, + "learning_rate": 0.0008328329981242548, + "loss": 0.83428133, + "num_input_tokens_seen": 124564960, + "router_z_loss_mlp": 0.47631836, + "step": 1508, + "time_per_iteration": 2.8732171058654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161954, + "balance_loss_mlp": 1.11479485, + "epoch": 0.29030396306271644, + "flos": 536226822144.0, + "grad_norm": 0.06776855665971031, + "language_loss": 0.88091129, + "learning_rate": 0.0008326004451405475, + "loss": 0.8925308, + "num_input_tokens_seen": 124637424, + "router_z_loss_mlp": 0.47143555, + "step": 1509, + "time_per_iteration": 2.762476921081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156841, + "balance_loss_mlp": 1.11104107, + "epoch": 0.29049634474798, + "flos": 511956163584.0, + "grad_norm": 0.08089915602738365, + "language_loss": 0.82757521, + "learning_rate": 0.0008323677630298957, + "loss": 0.83914363, + "num_input_tokens_seen": 124704832, + "router_z_loss_mlp": 0.45800781, + "step": 1510, + "time_per_iteration": 2.554558753967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152926, + "balance_loss_mlp": 1.1073643, + "epoch": 0.29068872643324356, + "flos": 613758809088.0, + "grad_norm": 0.07106066660777852, + "language_loss": 0.85773015, + "learning_rate": 0.0008321349518826345, + "loss": 0.86925942, + "num_input_tokens_seen": 124779600, + "router_z_loss_mlp": 0.45556641, + "step": 1511, + "time_per_iteration": 2.8341891765594482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144812, + "balance_loss_mlp": 1.09870172, + "epoch": 0.2908811081185071, + "flos": 546424123392.0, + "grad_norm": 0.06994476337169399, + "language_loss": 0.95554525, + "learning_rate": 0.0008319020117891491, + "loss": 0.96699333, + "num_input_tokens_seen": 124844128, + "router_z_loss_mlp": 0.4609375, + "step": 1512, + "time_per_iteration": 2.6152215003967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147304, + "balance_loss_mlp": 1.09902406, + "epoch": 0.2910734898037707, + "flos": 604792355328.0, + "grad_norm": 0.09218377020634298, + "language_loss": 0.87772787, + "learning_rate": 0.0008316689428398751, + "loss": 0.88920093, + "num_input_tokens_seen": 124915376, + "router_z_loss_mlp": 0.4831543, + "step": 1513, + "time_per_iteration": 2.687288522720337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148068, + "balance_loss_mlp": 1.10407972, + "epoch": 0.29126587148903427, + "flos": 574672900608.0, + "grad_norm": 0.05407373665960582, + "language_loss": 0.89050305, + "learning_rate": 0.0008314357451252979, + "loss": 0.90198368, + "num_input_tokens_seen": 124995504, + "router_z_loss_mlp": 0.44018555, + "step": 1514, + "time_per_iteration": 2.7870078086853027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151939, + "balance_loss_mlp": 1.10644853, + "epoch": 0.2914582531742978, + "flos": 571068112896.0, + "grad_norm": 0.11283198751561448, + "language_loss": 0.88657945, + "learning_rate": 0.0008312024187359527, + "loss": 0.89809883, + "num_input_tokens_seen": 125064192, + "router_z_loss_mlp": 0.45483398, + "step": 1515, + "time_per_iteration": 2.6400256156921387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144613, + "balance_loss_mlp": 1.10060108, + "epoch": 0.2916506348595614, + "flos": 730878142464.0, + "grad_norm": 0.08270455580526427, + "language_loss": 0.87534022, + "learning_rate": 0.000830968963762425, + "loss": 0.8867864, + "num_input_tokens_seen": 125150560, + "router_z_loss_mlp": 0.43994141, + "step": 1516, + "time_per_iteration": 3.0442028045654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151597, + "balance_loss_mlp": 1.10617828, + "epoch": 0.2918430165448249, + "flos": 510468728832.0, + "grad_norm": 0.06364079743342543, + "language_loss": 0.84482789, + "learning_rate": 0.0008307353802953497, + "loss": 0.85634387, + "num_input_tokens_seen": 125219264, + "router_z_loss_mlp": 0.45361328, + "step": 1517, + "time_per_iteration": 2.672921895980835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171551, + "balance_loss_mlp": 1.12281811, + "epoch": 0.2920353982300885, + "flos": 630397375488.0, + "grad_norm": 0.060139597091390135, + "language_loss": 0.86612219, + "learning_rate": 0.0008305016684254125, + "loss": 0.87783766, + "num_input_tokens_seen": 125301904, + "router_z_loss_mlp": 0.48803711, + "step": 1518, + "time_per_iteration": 2.7845590114593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174317, + "balance_loss_mlp": 1.12947094, + "epoch": 0.29222777991535204, + "flos": 501662688768.0, + "grad_norm": 0.09151635615922826, + "language_loss": 0.87469971, + "learning_rate": 0.0008302678282433479, + "loss": 0.88644284, + "num_input_tokens_seen": 125367712, + "router_z_loss_mlp": 0.44848633, + "step": 1519, + "time_per_iteration": 2.562605619430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163342, + "balance_loss_mlp": 1.11999798, + "epoch": 0.2924201616006156, + "flos": 486785769984.0, + "grad_norm": 0.07068722957296131, + "language_loss": 0.85016668, + "learning_rate": 0.0008300338598399411, + "loss": 0.86180007, + "num_input_tokens_seen": 125437648, + "router_z_loss_mlp": 0.43359375, + "step": 1520, + "time_per_iteration": 2.61773943901062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155651, + "balance_loss_mlp": 1.11111403, + "epoch": 0.2926125432858792, + "flos": 476450449920.0, + "grad_norm": 0.07704766336953982, + "language_loss": 0.95187533, + "learning_rate": 0.0008297997633060263, + "loss": 0.96343178, + "num_input_tokens_seen": 125502432, + "router_z_loss_mlp": 0.44506836, + "step": 1521, + "time_per_iteration": 2.5206730365753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127672, + "balance_loss_mlp": 1.08468485, + "epoch": 0.29280492497114274, + "flos": 676675980288.0, + "grad_norm": 0.07256926042070597, + "language_loss": 0.85441822, + "learning_rate": 0.0008295655387324883, + "loss": 0.865695, + "num_input_tokens_seen": 125575424, + "router_z_loss_mlp": 0.42993164, + "step": 1522, + "time_per_iteration": 2.8186635971069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126011, + "balance_loss_mlp": 1.08090246, + "epoch": 0.29299730665640633, + "flos": 458408512512.0, + "grad_norm": 0.07210388942873598, + "language_loss": 0.8532753, + "learning_rate": 0.0008293311862102609, + "loss": 0.86453545, + "num_input_tokens_seen": 125639040, + "router_z_loss_mlp": 0.45092773, + "step": 1523, + "time_per_iteration": 2.4982752799987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118339, + "balance_loss_mlp": 1.07334912, + "epoch": 0.29318968834166986, + "flos": 446573274624.0, + "grad_norm": 0.0579845522804068, + "language_loss": 0.89434093, + "learning_rate": 0.0008290967058303275, + "loss": 0.90552431, + "num_input_tokens_seen": 125701712, + "router_z_loss_mlp": 0.44995117, + "step": 1524, + "time_per_iteration": 2.469200611114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116837, + "balance_loss_mlp": 1.07575774, + "epoch": 0.29338207002693345, + "flos": 450319025664.0, + "grad_norm": 0.07735764089304721, + "language_loss": 0.86793721, + "learning_rate": 0.0008288620976837219, + "loss": 0.87910557, + "num_input_tokens_seen": 125765088, + "router_z_loss_mlp": 0.41088867, + "step": 1525, + "time_per_iteration": 2.4877853393554688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112181, + "balance_loss_mlp": 1.06881261, + "epoch": 0.293574451712197, + "flos": 502277925888.0, + "grad_norm": 0.06064034312392981, + "language_loss": 0.83118868, + "learning_rate": 0.000828627361861527, + "loss": 0.84231043, + "num_input_tokens_seen": 125831328, + "router_z_loss_mlp": 0.43383789, + "step": 1526, + "time_per_iteration": 2.567406415939331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109149, + "balance_loss_mlp": 1.06620967, + "epoch": 0.29376683339746057, + "flos": 696462312960.0, + "grad_norm": 0.0729369607745646, + "language_loss": 0.84539104, + "learning_rate": 0.0008283924984548752, + "loss": 0.85648245, + "num_input_tokens_seen": 125903664, + "router_z_loss_mlp": 0.42919922, + "step": 1527, + "time_per_iteration": 2.8396716117858887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117649, + "balance_loss_mlp": 1.07480514, + "epoch": 0.2939592150827241, + "flos": 478590197760.0, + "grad_norm": 0.05516048868040139, + "language_loss": 0.85423326, + "learning_rate": 0.0008281575075549485, + "loss": 0.86540973, + "num_input_tokens_seen": 125971856, + "router_z_loss_mlp": 0.4284668, + "step": 1528, + "time_per_iteration": 2.596402645111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093475, + "balance_loss_mlp": 1.0787884, + "epoch": 0.2941515967679877, + "flos": 1485260831232.0, + "grad_norm": 0.03776357558455706, + "language_loss": 0.77352691, + "learning_rate": 0.000827922389252979, + "loss": 0.78446174, + "num_input_tokens_seen": 126183968, + "router_z_loss_mlp": 0.14648438, + "step": 1529, + "time_per_iteration": 4.641916513442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118614, + "balance_loss_mlp": 1.07436347, + "epoch": 0.2943439784532513, + "flos": 674158132224.0, + "grad_norm": 0.11599739785132454, + "language_loss": 0.90857148, + "learning_rate": 0.0008276871436402469, + "loss": 0.91975754, + "num_input_tokens_seen": 126254448, + "router_z_loss_mlp": 0.44238281, + "step": 1530, + "time_per_iteration": 2.8211593627929688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113901, + "balance_loss_mlp": 1.07239282, + "epoch": 0.2945363601385148, + "flos": 576301298688.0, + "grad_norm": 0.06834093724659761, + "language_loss": 0.87937176, + "learning_rate": 0.000827451770808083, + "loss": 0.8905108, + "num_input_tokens_seen": 126328208, + "router_z_loss_mlp": 0.41503906, + "step": 1531, + "time_per_iteration": 2.7127888202667236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106991, + "balance_loss_mlp": 1.06357539, + "epoch": 0.2947287418237784, + "flos": 480655793664.0, + "grad_norm": 0.06489723039655686, + "language_loss": 0.8385976, + "learning_rate": 0.0008272162708478674, + "loss": 0.84966749, + "num_input_tokens_seen": 126396464, + "router_z_loss_mlp": 0.43457031, + "step": 1532, + "time_per_iteration": 2.580057144165039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119293, + "balance_loss_mlp": 1.07749844, + "epoch": 0.2949211235090419, + "flos": 558185209344.0, + "grad_norm": 0.06938693493012958, + "language_loss": 0.86437017, + "learning_rate": 0.000826980643851029, + "loss": 0.87556309, + "num_input_tokens_seen": 126468960, + "router_z_loss_mlp": 0.41821289, + "step": 1533, + "time_per_iteration": 2.689450740814209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118363, + "balance_loss_mlp": 1.07518554, + "epoch": 0.2951135051943055, + "flos": 483887623680.0, + "grad_norm": 0.057495804655394826, + "language_loss": 0.85101378, + "learning_rate": 0.0008267448899090464, + "loss": 0.8621974, + "num_input_tokens_seen": 126536496, + "router_z_loss_mlp": 0.43188477, + "step": 1534, + "time_per_iteration": 2.5541234016418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139738, + "balance_loss_mlp": 1.09460509, + "epoch": 0.29530588687956905, + "flos": 550295783424.0, + "grad_norm": 0.0763188518859088, + "language_loss": 0.81071836, + "learning_rate": 0.0008265090091134473, + "loss": 0.82211578, + "num_input_tokens_seen": 126614048, + "router_z_loss_mlp": 0.45117188, + "step": 1535, + "time_per_iteration": 2.851494789123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136514, + "balance_loss_mlp": 1.09309804, + "epoch": 0.29549826856483263, + "flos": 673046226432.0, + "grad_norm": 0.06589165398662913, + "language_loss": 0.80565453, + "learning_rate": 0.0008262730015558088, + "loss": 0.8170197, + "num_input_tokens_seen": 126697248, + "router_z_loss_mlp": 0.43432617, + "step": 1536, + "time_per_iteration": 2.8671340942382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113965, + "balance_loss_mlp": 1.09423184, + "epoch": 0.29569065025009617, + "flos": 764666625024.0, + "grad_norm": 0.08099910548300644, + "language_loss": 0.82513618, + "learning_rate": 0.0008260368673277574, + "loss": 0.83653271, + "num_input_tokens_seen": 126782496, + "router_z_loss_mlp": 0.45410156, + "step": 1537, + "time_per_iteration": 3.114685297012329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134888, + "balance_loss_mlp": 1.08973145, + "epoch": 0.29588303193535975, + "flos": 543683819520.0, + "grad_norm": 0.06868209454347093, + "language_loss": 0.84501362, + "learning_rate": 0.0008258006065209682, + "loss": 0.85636258, + "num_input_tokens_seen": 126857328, + "router_z_loss_mlp": 0.45141602, + "step": 1538, + "time_per_iteration": 2.7343428134918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112017, + "balance_loss_mlp": 1.07341647, + "epoch": 0.29607541362062334, + "flos": 596947345920.0, + "grad_norm": 0.07819005704771397, + "language_loss": 0.80795646, + "learning_rate": 0.0008255642192271657, + "loss": 0.8191582, + "num_input_tokens_seen": 126932608, + "router_z_loss_mlp": 0.4675293, + "step": 1539, + "time_per_iteration": 2.7900264263153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123831, + "balance_loss_mlp": 1.0775305, + "epoch": 0.29626779530588687, + "flos": 609877237248.0, + "grad_norm": 0.06984070899888078, + "language_loss": 0.84251219, + "learning_rate": 0.0008253277055381241, + "loss": 0.85375053, + "num_input_tokens_seen": 127008928, + "router_z_loss_mlp": 0.46313477, + "step": 1540, + "time_per_iteration": 2.7936105728149414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126968, + "balance_loss_mlp": 1.08383858, + "epoch": 0.29646017699115046, + "flos": 867430674432.0, + "grad_norm": 0.09213105437911238, + "language_loss": 0.86479163, + "learning_rate": 0.0008250910655456658, + "loss": 0.87606132, + "num_input_tokens_seen": 127097104, + "router_z_loss_mlp": 0.43115234, + "step": 1541, + "time_per_iteration": 3.119706392288208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141818, + "balance_loss_mlp": 1.09723353, + "epoch": 0.296652558676414, + "flos": 495868594176.0, + "grad_norm": 0.06264221574110865, + "language_loss": 0.84348595, + "learning_rate": 0.0008248542993416625, + "loss": 0.85490412, + "num_input_tokens_seen": 127165264, + "router_z_loss_mlp": 0.44628906, + "step": 1542, + "time_per_iteration": 2.6273162364959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136977, + "balance_loss_mlp": 1.09224987, + "epoch": 0.2968449403616776, + "flos": 571544957952.0, + "grad_norm": 0.062187844768518095, + "language_loss": 0.838552, + "learning_rate": 0.0008246174070180352, + "loss": 0.84992176, + "num_input_tokens_seen": 127238992, + "router_z_loss_mlp": 0.44702148, + "step": 1543, + "time_per_iteration": 2.6559441089630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155532, + "balance_loss_mlp": 1.11099529, + "epoch": 0.2970373220469411, + "flos": 794168271360.0, + "grad_norm": 0.09249403217806111, + "language_loss": 0.84424686, + "learning_rate": 0.0008243803886667537, + "loss": 0.85580218, + "num_input_tokens_seen": 127328160, + "router_z_loss_mlp": 0.44506836, + "step": 1544, + "time_per_iteration": 3.161595582962036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155762, + "balance_loss_mlp": 1.11196482, + "epoch": 0.2972297037322047, + "flos": 661038091776.0, + "grad_norm": 0.11473976054569617, + "language_loss": 0.79569989, + "learning_rate": 0.0008241432443798364, + "loss": 0.80725753, + "num_input_tokens_seen": 127407328, + "router_z_loss_mlp": 0.43774414, + "step": 1545, + "time_per_iteration": 2.8056137561798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154045, + "balance_loss_mlp": 1.11160624, + "epoch": 0.29742208541746823, + "flos": 597125385216.0, + "grad_norm": 0.05050947415994233, + "language_loss": 0.86053026, + "learning_rate": 0.0008239059742493512, + "loss": 0.87207067, + "num_input_tokens_seen": 127477136, + "router_z_loss_mlp": 0.42456055, + "step": 1546, + "time_per_iteration": 2.6890687942504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146751, + "balance_loss_mlp": 1.10383546, + "epoch": 0.2976144671027318, + "flos": 769882558464.0, + "grad_norm": 0.060404475813103174, + "language_loss": 0.87675822, + "learning_rate": 0.0008236685783674142, + "loss": 0.88822567, + "num_input_tokens_seen": 127565680, + "router_z_loss_mlp": 0.42944336, + "step": 1547, + "time_per_iteration": 3.0594639778137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176135, + "balance_loss_mlp": 1.15439153, + "epoch": 0.2978068487879954, + "flos": 1484764162560.0, + "grad_norm": 0.05730794129930028, + "language_loss": 0.76221192, + "learning_rate": 0.0008234310568261911, + "loss": 0.77397329, + "num_input_tokens_seen": 127791584, + "router_z_loss_mlp": 0.21777344, + "step": 1548, + "time_per_iteration": 4.907459020614624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115635, + "balance_loss_mlp": 1.11174202, + "epoch": 0.29799923047325894, + "flos": 475328632320.0, + "grad_norm": 0.08902597202075696, + "language_loss": 0.82813615, + "learning_rate": 0.0008231934097178955, + "loss": 0.83969963, + "num_input_tokens_seen": 127860112, + "router_z_loss_mlp": 0.44604492, + "step": 1549, + "time_per_iteration": 2.622082471847534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147675, + "balance_loss_mlp": 1.1013267, + "epoch": 0.2981916121585225, + "flos": 759804198912.0, + "grad_norm": 0.06733871211748228, + "language_loss": 0.85700476, + "learning_rate": 0.0008229556371347903, + "loss": 0.86848152, + "num_input_tokens_seen": 127938752, + "router_z_loss_mlp": 0.46362305, + "step": 1550, + "time_per_iteration": 3.0081942081451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133769, + "balance_loss_mlp": 1.09018564, + "epoch": 0.29838399384378606, + "flos": 875016152064.0, + "grad_norm": 0.09176779567237862, + "language_loss": 0.79384351, + "learning_rate": 0.0008227177391691874, + "loss": 0.80518115, + "num_input_tokens_seen": 128022192, + "router_z_loss_mlp": 0.43554688, + "step": 1551, + "time_per_iteration": 3.1698648929595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126053, + "balance_loss_mlp": 1.08218408, + "epoch": 0.29857637552904964, + "flos": 579661608960.0, + "grad_norm": 0.07033401560901072, + "language_loss": 0.89799201, + "learning_rate": 0.0008224797159134463, + "loss": 0.90925252, + "num_input_tokens_seen": 128097776, + "router_z_loss_mlp": 0.4387207, + "step": 1552, + "time_per_iteration": 2.714494228363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118143, + "balance_loss_mlp": 1.07816052, + "epoch": 0.2987687572143132, + "flos": 836399748096.0, + "grad_norm": 0.05144631995573129, + "language_loss": 0.83942962, + "learning_rate": 0.0008222415674599765, + "loss": 0.85061103, + "num_input_tokens_seen": 128179888, + "router_z_loss_mlp": 0.39990234, + "step": 1553, + "time_per_iteration": 3.0642828941345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130247, + "balance_loss_mlp": 1.08563888, + "epoch": 0.29896113889957676, + "flos": 567072741888.0, + "grad_norm": 0.07574846124683007, + "language_loss": 0.83871847, + "learning_rate": 0.0008220032939012349, + "loss": 0.85002089, + "num_input_tokens_seen": 128251152, + "router_z_loss_mlp": 0.44628906, + "step": 1554, + "time_per_iteration": 2.714172840118408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129637, + "balance_loss_mlp": 1.08810425, + "epoch": 0.29915352058484035, + "flos": 498662853120.0, + "grad_norm": 0.05026836342639273, + "language_loss": 0.8851645, + "learning_rate": 0.0008217648953297277, + "loss": 0.89646089, + "num_input_tokens_seen": 128327600, + "router_z_loss_mlp": 0.41503906, + "step": 1555, + "time_per_iteration": 2.8413305282592773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139651, + "balance_loss_mlp": 1.09692693, + "epoch": 0.2993459022701039, + "flos": 592112083968.0, + "grad_norm": 0.07726233455877282, + "language_loss": 0.78621179, + "learning_rate": 0.0008215263718380095, + "loss": 0.79760832, + "num_input_tokens_seen": 128398432, + "router_z_loss_mlp": 0.42749023, + "step": 1556, + "time_per_iteration": 2.6995439529418945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153013, + "balance_loss_mlp": 1.10766625, + "epoch": 0.29953828395536747, + "flos": 572380079616.0, + "grad_norm": 0.07367356569931041, + "language_loss": 0.8461448, + "learning_rate": 0.0008212877235186833, + "loss": 0.85767496, + "num_input_tokens_seen": 128469696, + "router_z_loss_mlp": 0.45361328, + "step": 1557, + "time_per_iteration": 2.655294895172119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105489, + "balance_loss_mlp": 1.09290004, + "epoch": 0.299730665640631, + "flos": 1504698425856.0, + "grad_norm": 0.039126881386902713, + "language_loss": 0.77737558, + "learning_rate": 0.0008210489504644005, + "loss": 0.78843045, + "num_input_tokens_seen": 128698560, + "router_z_loss_mlp": 0.12597656, + "step": 1558, + "time_per_iteration": 4.953773021697998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148338, + "balance_loss_mlp": 1.10647154, + "epoch": 0.2999230473258946, + "flos": 513791963136.0, + "grad_norm": 0.07045252665170362, + "language_loss": 0.81300378, + "learning_rate": 0.0008208100527678611, + "loss": 0.82448721, + "num_input_tokens_seen": 128765952, + "router_z_loss_mlp": 0.41870117, + "step": 1559, + "time_per_iteration": 2.5706257820129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142691, + "balance_loss_mlp": 1.10223174, + "epoch": 0.3001154290111581, + "flos": 834472544256.0, + "grad_norm": 0.09371754463761041, + "language_loss": 0.79173958, + "learning_rate": 0.0008205710305218135, + "loss": 0.80316657, + "num_input_tokens_seen": 128840048, + "router_z_loss_mlp": 0.40454102, + "step": 1560, + "time_per_iteration": 3.001490354537964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152428, + "balance_loss_mlp": 1.11292171, + "epoch": 0.3003078106964217, + "flos": 556776695808.0, + "grad_norm": 0.06044421333553386, + "language_loss": 0.90459639, + "learning_rate": 0.0008203318838190541, + "loss": 0.91612065, + "num_input_tokens_seen": 128912496, + "router_z_loss_mlp": 0.39501953, + "step": 1561, + "time_per_iteration": 2.753243923187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166566, + "balance_loss_mlp": 1.1229353, + "epoch": 0.30050019238168524, + "flos": 526151033856.0, + "grad_norm": 0.07449479195038491, + "language_loss": 0.85542631, + "learning_rate": 0.0008200926127524281, + "loss": 0.86709195, + "num_input_tokens_seen": 128980624, + "router_z_loss_mlp": 0.43676758, + "step": 1562, + "time_per_iteration": 2.6388282775878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184921, + "balance_loss_mlp": 1.14045644, + "epoch": 0.3006925740669488, + "flos": 577852973568.0, + "grad_norm": 0.07268784417656445, + "language_loss": 0.83160597, + "learning_rate": 0.0008198532174148289, + "loss": 0.8434552, + "num_input_tokens_seen": 129050576, + "router_z_loss_mlp": 0.44482422, + "step": 1563, + "time_per_iteration": 2.71712589263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076623, + "balance_loss_mlp": 1.06308043, + "epoch": 0.3008849557522124, + "flos": 1490246595072.0, + "grad_norm": 0.03416296623034226, + "language_loss": 0.8068617, + "learning_rate": 0.0008196136978991977, + "loss": 0.81762791, + "num_input_tokens_seen": 129278880, + "router_z_loss_mlp": 0.13574219, + "step": 1564, + "time_per_iteration": 4.830719232559204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194058, + "balance_loss_mlp": 1.15185785, + "epoch": 0.30107733743747594, + "flos": 509816415744.0, + "grad_norm": 0.08914748552149089, + "language_loss": 0.88889605, + "learning_rate": 0.0008193740542985244, + "loss": 0.90083665, + "num_input_tokens_seen": 129346560, + "router_z_loss_mlp": 0.421875, + "step": 1565, + "time_per_iteration": 2.6047041416168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199035, + "balance_loss_mlp": 1.15647733, + "epoch": 0.30126971912273953, + "flos": 587704108032.0, + "grad_norm": 0.07863054385005203, + "language_loss": 0.8685202, + "learning_rate": 0.0008191342867058467, + "loss": 0.88051057, + "num_input_tokens_seen": 129420448, + "router_z_loss_mlp": 0.42578125, + "step": 1566, + "time_per_iteration": 2.715708017349243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196816, + "balance_loss_mlp": 1.15280378, + "epoch": 0.30146210080800306, + "flos": 602101610496.0, + "grad_norm": 0.087093537774187, + "language_loss": 0.83839655, + "learning_rate": 0.0008188943952142509, + "loss": 0.85036469, + "num_input_tokens_seen": 129494032, + "router_z_loss_mlp": 0.43994141, + "step": 1567, + "time_per_iteration": 2.831888198852539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118972, + "balance_loss_mlp": 1.14663815, + "epoch": 0.30165448249326665, + "flos": 917796054528.0, + "grad_norm": 0.09637975850341399, + "language_loss": 0.82476509, + "learning_rate": 0.0008186543799168711, + "loss": 0.83666229, + "num_input_tokens_seen": 129569088, + "router_z_loss_mlp": 0.43041992, + "step": 1568, + "time_per_iteration": 3.121755599975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177064, + "balance_loss_mlp": 1.13324285, + "epoch": 0.3018468641785302, + "flos": 777287798784.0, + "grad_norm": 0.08024736909630528, + "language_loss": 0.88665748, + "learning_rate": 0.0008184142409068892, + "loss": 0.89842814, + "num_input_tokens_seen": 129647968, + "router_z_loss_mlp": 0.43847656, + "step": 1569, + "time_per_iteration": 2.990497350692749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163968, + "balance_loss_mlp": 1.12343669, + "epoch": 0.30203924586379377, + "flos": 522358295040.0, + "grad_norm": 0.05684047424393967, + "language_loss": 0.86850333, + "learning_rate": 0.000818173978277536, + "loss": 0.88014305, + "num_input_tokens_seen": 129718928, + "router_z_loss_mlp": 0.40551758, + "step": 1570, + "time_per_iteration": 2.636310338973999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171599, + "balance_loss_mlp": 1.12956595, + "epoch": 0.3022316275490573, + "flos": 524559711744.0, + "grad_norm": 0.07636807389642969, + "language_loss": 0.84349716, + "learning_rate": 0.000817933592122089, + "loss": 0.85521317, + "num_input_tokens_seen": 129790128, + "router_z_loss_mlp": 0.4206543, + "step": 1571, + "time_per_iteration": 2.699178695678711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163998, + "balance_loss_mlp": 1.11984301, + "epoch": 0.3024240092343209, + "flos": 479912076288.0, + "grad_norm": 0.07546742874281152, + "language_loss": 0.83585215, + "learning_rate": 0.0008176930825338749, + "loss": 0.8474921, + "num_input_tokens_seen": 129857536, + "router_z_loss_mlp": 0.44189453, + "step": 1572, + "time_per_iteration": 2.550837516784668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166441, + "balance_loss_mlp": 1.12385964, + "epoch": 0.3026163909195845, + "flos": 687206592000.0, + "grad_norm": 0.07092433148156627, + "language_loss": 0.89086282, + "learning_rate": 0.0008174524496062679, + "loss": 0.90252721, + "num_input_tokens_seen": 129931440, + "router_z_loss_mlp": 0.42578125, + "step": 1573, + "time_per_iteration": 2.883683919906616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116421, + "balance_loss_mlp": 1.11907697, + "epoch": 0.302808772604848, + "flos": 542940102144.0, + "grad_norm": 0.061103918995996154, + "language_loss": 0.8587321, + "learning_rate": 0.0008172116934326894, + "loss": 0.8703742, + "num_input_tokens_seen": 130005200, + "router_z_loss_mlp": 0.45092773, + "step": 1574, + "time_per_iteration": 2.7379467487335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162954, + "balance_loss_mlp": 1.12132585, + "epoch": 0.3030011542901116, + "flos": 475091495424.0, + "grad_norm": 0.07023429776023385, + "language_loss": 0.87709713, + "learning_rate": 0.0008169708141066097, + "loss": 0.88872665, + "num_input_tokens_seen": 130069136, + "router_z_loss_mlp": 0.41625977, + "step": 1575, + "time_per_iteration": 2.571963310241699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154168, + "balance_loss_mlp": 1.11435199, + "epoch": 0.30319353597537513, + "flos": 481481003520.0, + "grad_norm": 0.11601472076904104, + "language_loss": 0.90864658, + "learning_rate": 0.0008167298117215465, + "loss": 0.92018831, + "num_input_tokens_seen": 130135456, + "router_z_loss_mlp": 0.39819336, + "step": 1576, + "time_per_iteration": 2.562636375427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153517, + "balance_loss_mlp": 1.11141217, + "epoch": 0.3033859176606387, + "flos": 704786365440.0, + "grad_norm": 0.08960201833145559, + "language_loss": 0.88355744, + "learning_rate": 0.0008164886863710649, + "loss": 0.89509267, + "num_input_tokens_seen": 130213712, + "router_z_loss_mlp": 0.42138672, + "step": 1577, + "time_per_iteration": 2.921163320541382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151824, + "balance_loss_mlp": 1.11212754, + "epoch": 0.30357829934590225, + "flos": 764696360448.0, + "grad_norm": 0.07034131144929774, + "language_loss": 0.86199445, + "learning_rate": 0.0008162474381487783, + "loss": 0.87351274, + "num_input_tokens_seen": 130290928, + "router_z_loss_mlp": 0.39697266, + "step": 1578, + "time_per_iteration": 3.029076337814331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152316, + "balance_loss_mlp": 1.11016417, + "epoch": 0.30377068103116583, + "flos": 532355162112.0, + "grad_norm": 0.07584256466560314, + "language_loss": 0.85196549, + "learning_rate": 0.0008160060671483475, + "loss": 0.86348867, + "num_input_tokens_seen": 130362672, + "router_z_loss_mlp": 0.42163086, + "step": 1579, + "time_per_iteration": 2.7073986530303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142614, + "balance_loss_mlp": 1.10289371, + "epoch": 0.3039630627164294, + "flos": 510191944704.0, + "grad_norm": 0.08686038732079729, + "language_loss": 0.83729678, + "learning_rate": 0.0008157645734634809, + "loss": 0.84872293, + "num_input_tokens_seen": 130428848, + "router_z_loss_mlp": 0.3972168, + "step": 1580, + "time_per_iteration": 2.6613049507141113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090857, + "balance_loss_mlp": 1.07302368, + "epoch": 0.30415544440169295, + "flos": 1506000854016.0, + "grad_norm": 0.0332286598930082, + "language_loss": 0.76896489, + "learning_rate": 0.000815522957187935, + "loss": 0.77987349, + "num_input_tokens_seen": 130665440, + "router_z_loss_mlp": 0.17871094, + "step": 1581, + "time_per_iteration": 4.915473699569702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074598, + "balance_loss_mlp": 1.05705047, + "epoch": 0.30434782608695654, + "flos": 1458736625664.0, + "grad_norm": 0.028649014265593315, + "language_loss": 0.73214495, + "learning_rate": 0.0008152812184155132, + "loss": 0.74289095, + "num_input_tokens_seen": 130895248, + "router_z_loss_mlp": 0.17578125, + "step": 1582, + "time_per_iteration": 4.889309883117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129405, + "balance_loss_mlp": 1.08827806, + "epoch": 0.3045402077722201, + "flos": 482555833344.0, + "grad_norm": 0.06812522797045092, + "language_loss": 0.84052569, + "learning_rate": 0.000815039357240067, + "loss": 0.85181975, + "num_input_tokens_seen": 130964544, + "router_z_loss_mlp": 0.41113281, + "step": 1583, + "time_per_iteration": 2.6366286277770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138467, + "balance_loss_mlp": 1.09672034, + "epoch": 0.30473258945748366, + "flos": 543501010944.0, + "grad_norm": 0.06492424308297744, + "language_loss": 0.85869169, + "learning_rate": 0.0008147973737554952, + "loss": 0.87007636, + "num_input_tokens_seen": 131041744, + "router_z_loss_mlp": 0.41748047, + "step": 1584, + "time_per_iteration": 2.7854599952697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136804, + "balance_loss_mlp": 1.095963, + "epoch": 0.3049249711427472, + "flos": 567055489536.0, + "grad_norm": 0.08202571879527615, + "language_loss": 0.86834013, + "learning_rate": 0.000814555268055744, + "loss": 0.87970817, + "num_input_tokens_seen": 131108864, + "router_z_loss_mlp": 0.40844727, + "step": 1585, + "time_per_iteration": 2.6199045181274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132861, + "balance_loss_mlp": 1.09130502, + "epoch": 0.3051173528280108, + "flos": 528233882112.0, + "grad_norm": 0.07393752668393892, + "language_loss": 0.87929702, + "learning_rate": 0.0008143130402348073, + "loss": 0.89062566, + "num_input_tokens_seen": 131181104, + "router_z_loss_mlp": 0.41625977, + "step": 1586, + "time_per_iteration": 2.638741970062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129239, + "balance_loss_mlp": 1.08868384, + "epoch": 0.3053097345132743, + "flos": 586396910592.0, + "grad_norm": 0.06849121050203105, + "language_loss": 0.7939502, + "learning_rate": 0.0008140706903867265, + "loss": 0.80524254, + "num_input_tokens_seen": 131258704, + "router_z_loss_mlp": 0.4050293, + "step": 1587, + "time_per_iteration": 2.810335874557495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134042, + "balance_loss_mlp": 1.0908649, + "epoch": 0.3055021161985379, + "flos": 607087747584.0, + "grad_norm": 0.07851663365650921, + "language_loss": 0.91122121, + "learning_rate": 0.0008138282186055897, + "loss": 0.92256165, + "num_input_tokens_seen": 131325712, + "router_z_loss_mlp": 0.43188477, + "step": 1588, + "time_per_iteration": 2.7237448692321777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137411, + "balance_loss_mlp": 1.09661722, + "epoch": 0.3056944978838015, + "flos": 573867514368.0, + "grad_norm": 0.06832590097240848, + "language_loss": 0.8307212, + "learning_rate": 0.0008135856249855331, + "loss": 0.84209532, + "num_input_tokens_seen": 131397568, + "router_z_loss_mlp": 0.40771484, + "step": 1589, + "time_per_iteration": 2.7399301528930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153972, + "balance_loss_mlp": 1.11241579, + "epoch": 0.305886879569065, + "flos": 633925813248.0, + "grad_norm": 0.09162978556143483, + "language_loss": 0.89933717, + "learning_rate": 0.0008133429096207398, + "loss": 0.91087687, + "num_input_tokens_seen": 131467632, + "router_z_loss_mlp": 0.41577148, + "step": 1590, + "time_per_iteration": 2.8074302673339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01029225, + "balance_loss_mlp": 1.0156827, + "epoch": 0.3060792612543286, + "flos": 1369005981696.0, + "grad_norm": 0.025543227678258826, + "language_loss": 0.75312257, + "learning_rate": 0.0008131000726054403, + "loss": 0.76341486, + "num_input_tokens_seen": 131702224, + "router_z_loss_mlp": 0.13574219, + "step": 1591, + "time_per_iteration": 4.961095094680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153411, + "balance_loss_mlp": 1.11330891, + "epoch": 0.30627164293959214, + "flos": 518555644416.0, + "grad_norm": 0.05628096053427355, + "language_loss": 0.87358719, + "learning_rate": 0.0008128571140339123, + "loss": 0.88512129, + "num_input_tokens_seen": 131774608, + "router_z_loss_mlp": 0.40087891, + "step": 1592, + "time_per_iteration": 2.6484899520874023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137482, + "balance_loss_mlp": 1.09497237, + "epoch": 0.3064640246248557, + "flos": 455589287424.0, + "grad_norm": 0.058132540851188214, + "language_loss": 0.87688839, + "learning_rate": 0.0008126140340004805, + "loss": 0.88826323, + "num_input_tokens_seen": 131841216, + "router_z_loss_mlp": 0.42529297, + "step": 1593, + "time_per_iteration": 2.509239912033081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144438, + "balance_loss_mlp": 1.10316801, + "epoch": 0.30665640631011926, + "flos": 850095378432.0, + "grad_norm": 0.06371804566889869, + "language_loss": 0.82466245, + "learning_rate": 0.0008123708325995172, + "loss": 0.83610678, + "num_input_tokens_seen": 131937584, + "router_z_loss_mlp": 0.4128418, + "step": 1594, + "time_per_iteration": 3.1773130893707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133345, + "balance_loss_mlp": 1.09240818, + "epoch": 0.30684878799538284, + "flos": 758319335424.0, + "grad_norm": 0.06060698504548286, + "language_loss": 0.79972136, + "learning_rate": 0.0008121275099254414, + "loss": 0.81105477, + "num_input_tokens_seen": 132012656, + "router_z_loss_mlp": 0.40942383, + "step": 1595, + "time_per_iteration": 2.9426517486572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142693, + "balance_loss_mlp": 1.10244751, + "epoch": 0.3070411696806464, + "flos": 517574790144.0, + "grad_norm": 0.06149446857353131, + "language_loss": 0.88748306, + "learning_rate": 0.0008118840660727194, + "loss": 0.89890993, + "num_input_tokens_seen": 132083728, + "router_z_loss_mlp": 0.40283203, + "step": 1596, + "time_per_iteration": 2.665166139602661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135904, + "balance_loss_mlp": 1.09553957, + "epoch": 0.30723355136590996, + "flos": 844264207872.0, + "grad_norm": 0.15751252363629464, + "language_loss": 0.88104224, + "learning_rate": 0.0008116405011358644, + "loss": 0.89240128, + "num_input_tokens_seen": 132170896, + "router_z_loss_mlp": 0.40380859, + "step": 1597, + "time_per_iteration": 3.1415486335754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145189, + "balance_loss_mlp": 1.10291696, + "epoch": 0.30742593305117355, + "flos": 466139722752.0, + "grad_norm": 0.06428245482632208, + "language_loss": 0.80117774, + "learning_rate": 0.0008113968152094369, + "loss": 0.81262958, + "num_input_tokens_seen": 132234592, + "router_z_loss_mlp": 0.42285156, + "step": 1598, + "time_per_iteration": 2.50484037399292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140725, + "balance_loss_mlp": 1.09781003, + "epoch": 0.3076183147364371, + "flos": 686591354880.0, + "grad_norm": 0.069373282908973, + "language_loss": 0.82692802, + "learning_rate": 0.0008111530083880438, + "loss": 0.83833528, + "num_input_tokens_seen": 132314720, + "router_z_loss_mlp": 0.42895508, + "step": 1599, + "time_per_iteration": 2.9072136878967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155699, + "balance_loss_mlp": 1.11211586, + "epoch": 0.30781069642170067, + "flos": 614018340864.0, + "grad_norm": 0.09326308305844169, + "language_loss": 0.86715603, + "learning_rate": 0.0008109090807663399, + "loss": 0.87871301, + "num_input_tokens_seen": 132388768, + "router_z_loss_mlp": 0.43554688, + "step": 1600, + "time_per_iteration": 2.8556277751922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154517, + "balance_loss_mlp": 1.1142, + "epoch": 0.3080030781069642, + "flos": 590318129664.0, + "grad_norm": 0.07163974647376076, + "language_loss": 0.89029115, + "learning_rate": 0.0008106650324390257, + "loss": 0.90183634, + "num_input_tokens_seen": 132472544, + "router_z_loss_mlp": 0.40307617, + "step": 1601, + "time_per_iteration": 2.8016483783721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115055, + "balance_loss_mlp": 1.10768259, + "epoch": 0.3081954597922278, + "flos": 562620349440.0, + "grad_norm": 0.06437682840273379, + "language_loss": 0.81480461, + "learning_rate": 0.0008104208635008493, + "loss": 0.82631016, + "num_input_tokens_seen": 132541968, + "router_z_loss_mlp": 0.42871094, + "step": 1602, + "time_per_iteration": 2.6587209701538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150496, + "balance_loss_mlp": 1.10631728, + "epoch": 0.3083878414774913, + "flos": 447830913024.0, + "grad_norm": 0.13502170342564263, + "language_loss": 0.8243258, + "learning_rate": 0.0008101765740466058, + "loss": 0.83583081, + "num_input_tokens_seen": 132606976, + "router_z_loss_mlp": 0.44165039, + "step": 1603, + "time_per_iteration": 2.506427049636841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144916, + "balance_loss_mlp": 1.10135674, + "epoch": 0.3085802231627549, + "flos": 493546037760.0, + "grad_norm": 0.0649160929519563, + "language_loss": 0.84340334, + "learning_rate": 0.0008099321641711364, + "loss": 0.85485256, + "num_input_tokens_seen": 132677984, + "router_z_loss_mlp": 0.43579102, + "step": 1604, + "time_per_iteration": 2.6318166255950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151756, + "balance_loss_mlp": 1.10938883, + "epoch": 0.3087726048480185, + "flos": 487687703040.0, + "grad_norm": 0.0523010874933109, + "language_loss": 0.83940029, + "learning_rate": 0.0008096876339693295, + "loss": 0.85091782, + "num_input_tokens_seen": 132749136, + "router_z_loss_mlp": 0.42407227, + "step": 1605, + "time_per_iteration": 2.620199680328369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150228, + "balance_loss_mlp": 1.1086241, + "epoch": 0.308964986533282, + "flos": 730589248512.0, + "grad_norm": 0.07539888612246932, + "language_loss": 0.8184768, + "learning_rate": 0.0008094429835361206, + "loss": 0.82997912, + "num_input_tokens_seen": 132823824, + "router_z_loss_mlp": 0.41625977, + "step": 1606, + "time_per_iteration": 2.9251575469970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147276, + "balance_loss_mlp": 1.10679281, + "epoch": 0.3091573682185456, + "flos": 605407592448.0, + "grad_norm": 0.07700051037162058, + "language_loss": 0.85932112, + "learning_rate": 0.0008091982129664908, + "loss": 0.87079388, + "num_input_tokens_seen": 132895936, + "router_z_loss_mlp": 0.40478516, + "step": 1607, + "time_per_iteration": 2.7032129764556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169169, + "balance_loss_mlp": 1.12427497, + "epoch": 0.30934974990380915, + "flos": 460325804544.0, + "grad_norm": 0.11394505928871175, + "language_loss": 0.83292013, + "learning_rate": 0.0008089533223554687, + "loss": 0.84461182, + "num_input_tokens_seen": 132968960, + "router_z_loss_mlp": 0.44897461, + "step": 1608, + "time_per_iteration": 2.6975207328796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161949, + "balance_loss_mlp": 1.12115526, + "epoch": 0.30954213158907273, + "flos": 553426297344.0, + "grad_norm": 0.06275490202685644, + "language_loss": 0.85402906, + "learning_rate": 0.0008087083117981294, + "loss": 0.86564851, + "num_input_tokens_seen": 133048448, + "router_z_loss_mlp": 0.40795898, + "step": 1609, + "time_per_iteration": 2.8709142208099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158469, + "balance_loss_mlp": 1.11402774, + "epoch": 0.30973451327433627, + "flos": 553043427840.0, + "grad_norm": 0.06357956742359384, + "language_loss": 0.88521934, + "learning_rate": 0.0008084631813895943, + "loss": 0.89680409, + "num_input_tokens_seen": 133121680, + "router_z_loss_mlp": 0.44433594, + "step": 1610, + "time_per_iteration": 2.7704904079437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148821, + "balance_loss_mlp": 1.1059773, + "epoch": 0.30992689495959985, + "flos": 565696535040.0, + "grad_norm": 0.07818022356789546, + "language_loss": 0.84349322, + "learning_rate": 0.0008082179312250315, + "loss": 0.85498142, + "num_input_tokens_seen": 133190176, + "router_z_loss_mlp": 0.42871094, + "step": 1611, + "time_per_iteration": 2.6352171897888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118188, + "balance_loss_mlp": 1.10588562, + "epoch": 0.3101192766448634, + "flos": 1442406776832.0, + "grad_norm": 0.03204939869531237, + "language_loss": 0.79855847, + "learning_rate": 0.0008079725613996555, + "loss": 0.8097403, + "num_input_tokens_seen": 133420512, + "router_z_loss_mlp": 0.12255859, + "step": 1612, + "time_per_iteration": 4.865812301635742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095093, + "balance_loss_mlp": 1.08288634, + "epoch": 0.31031165833012697, + "flos": 1531892570112.0, + "grad_norm": 0.024031397097536, + "language_loss": 0.76629329, + "learning_rate": 0.0008077270720087273, + "loss": 0.77724421, + "num_input_tokens_seen": 133651984, + "router_z_loss_mlp": 0.12207031, + "step": 1613, + "time_per_iteration": 5.057459831237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163336, + "balance_loss_mlp": 1.12020612, + "epoch": 0.31050404001539056, + "flos": 991952676864.0, + "grad_norm": 0.056757119691581794, + "language_loss": 0.82255232, + "learning_rate": 0.0008074814631475545, + "loss": 0.83418566, + "num_input_tokens_seen": 133741648, + "router_z_loss_mlp": 0.43139648, + "step": 1614, + "time_per_iteration": 3.3026204109191895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164621, + "balance_loss_mlp": 1.12153852, + "epoch": 0.3106964217006541, + "flos": 445992542208.0, + "grad_norm": 0.0685570598787085, + "language_loss": 0.79806983, + "learning_rate": 0.0008072357349114907, + "loss": 0.80971605, + "num_input_tokens_seen": 133813344, + "router_z_loss_mlp": 0.4309082, + "step": 1615, + "time_per_iteration": 2.663853645324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187873, + "balance_loss_mlp": 1.14369345, + "epoch": 0.3108888033859177, + "flos": 510505804800.0, + "grad_norm": 0.06371446427292905, + "language_loss": 0.8904891, + "learning_rate": 0.0008069898873959363, + "loss": 0.90236783, + "num_input_tokens_seen": 133884192, + "router_z_loss_mlp": 0.44189453, + "step": 1616, + "time_per_iteration": 2.675607919692993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199097, + "balance_loss_mlp": 1.15773141, + "epoch": 0.3110811850711812, + "flos": 520732468224.0, + "grad_norm": 0.10138062428343411, + "language_loss": 0.8626408, + "learning_rate": 0.0008067439206963375, + "loss": 0.87463176, + "num_input_tokens_seen": 133954848, + "router_z_loss_mlp": 0.41381836, + "step": 1617, + "time_per_iteration": 2.6264841556549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193178, + "balance_loss_mlp": 1.15119278, + "epoch": 0.3112735667564448, + "flos": 686413315584.0, + "grad_norm": 0.06654120721966555, + "language_loss": 0.8650856, + "learning_rate": 0.0008064978349081873, + "loss": 0.87701744, + "num_input_tokens_seen": 134031824, + "router_z_loss_mlp": 0.41967773, + "step": 1618, + "time_per_iteration": 2.9114232063293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180658, + "balance_loss_mlp": 1.13712287, + "epoch": 0.31146594844170833, + "flos": 533061803520.0, + "grad_norm": 0.06279818174684408, + "language_loss": 0.86905777, + "learning_rate": 0.0008062516301270245, + "loss": 0.88086432, + "num_input_tokens_seen": 134104480, + "router_z_loss_mlp": 0.43530273, + "step": 1619, + "time_per_iteration": 2.697016477584839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174783, + "balance_loss_mlp": 1.13341749, + "epoch": 0.3116583301269719, + "flos": 679517227008.0, + "grad_norm": 0.07259268941115717, + "language_loss": 0.89074606, + "learning_rate": 0.0008060053064484343, + "loss": 0.90249389, + "num_input_tokens_seen": 134185632, + "router_z_loss_mlp": 0.41381836, + "step": 1620, + "time_per_iteration": 2.9220941066741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160759, + "balance_loss_mlp": 1.11996579, + "epoch": 0.31185071181223545, + "flos": 586149861888.0, + "grad_norm": 0.054906942105454146, + "language_loss": 0.85286081, + "learning_rate": 0.0008057588639680482, + "loss": 0.8644684, + "num_input_tokens_seen": 134261600, + "router_z_loss_mlp": 0.40795898, + "step": 1621, + "time_per_iteration": 2.7432475090026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161698, + "balance_loss_mlp": 1.11754274, + "epoch": 0.31204309349749904, + "flos": 725403050496.0, + "grad_norm": 0.08428579582226577, + "language_loss": 0.83045304, + "learning_rate": 0.0008055123027815434, + "loss": 0.84207004, + "num_input_tokens_seen": 134334368, + "router_z_loss_mlp": 0.44165039, + "step": 1622, + "time_per_iteration": 2.888124465942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149406, + "balance_loss_mlp": 1.10947073, + "epoch": 0.3122354751827626, + "flos": 576825131520.0, + "grad_norm": 0.06442378780427988, + "language_loss": 0.85635763, + "learning_rate": 0.0008052656229846436, + "loss": 0.86785173, + "num_input_tokens_seen": 134403824, + "router_z_loss_mlp": 0.39916992, + "step": 1623, + "time_per_iteration": 2.7215354442596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154937, + "balance_loss_mlp": 1.11259365, + "epoch": 0.31242785686802615, + "flos": 575943022080.0, + "grad_norm": 0.1013205930173775, + "language_loss": 0.90875685, + "learning_rate": 0.0008050188246731182, + "loss": 0.92030621, + "num_input_tokens_seen": 134471296, + "router_z_loss_mlp": 0.42333984, + "step": 1624, + "time_per_iteration": 2.6636321544647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146822, + "balance_loss_mlp": 1.10655355, + "epoch": 0.31262023855328974, + "flos": 736830452736.0, + "grad_norm": 0.08961406202901398, + "language_loss": 0.82641953, + "learning_rate": 0.0008047719079427834, + "loss": 0.83788776, + "num_input_tokens_seen": 134551360, + "router_z_loss_mlp": 0.40283203, + "step": 1625, + "time_per_iteration": 2.9943442344665527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067888, + "balance_loss_mlp": 1.05425012, + "epoch": 0.3128126202385533, + "flos": 1559232073728.0, + "grad_norm": 0.02225722433359613, + "language_loss": 0.74351704, + "learning_rate": 0.0008045248728895, + "loss": 0.75419593, + "num_input_tokens_seen": 134761328, + "router_z_loss_mlp": 0.13671875, + "step": 1626, + "time_per_iteration": 4.865052700042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124122, + "balance_loss_mlp": 1.0819937, + "epoch": 0.31300500192381686, + "flos": 514921121280.0, + "grad_norm": 0.05828883069087806, + "language_loss": 0.86570215, + "learning_rate": 0.0008042777196091757, + "loss": 0.87694335, + "num_input_tokens_seen": 134833136, + "router_z_loss_mlp": 0.42138672, + "step": 1627, + "time_per_iteration": 2.668349266052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127492, + "balance_loss_mlp": 1.08481538, + "epoch": 0.3131973836090804, + "flos": 526627878912.0, + "grad_norm": 0.08399253674550058, + "language_loss": 0.82332879, + "learning_rate": 0.0008040304481977643, + "loss": 0.83460367, + "num_input_tokens_seen": 134904352, + "router_z_loss_mlp": 0.42675781, + "step": 1628, + "time_per_iteration": 2.6445093154907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130913, + "balance_loss_mlp": 1.09224153, + "epoch": 0.313389765294344, + "flos": 822820114944.0, + "grad_norm": 0.06122809929096989, + "language_loss": 0.86751842, + "learning_rate": 0.0008037830587512649, + "loss": 0.87882763, + "num_input_tokens_seen": 134984880, + "router_z_loss_mlp": 0.38671875, + "step": 1629, + "time_per_iteration": 3.0830209255218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131503, + "balance_loss_mlp": 1.09068549, + "epoch": 0.31358214697960757, + "flos": 393823669248.0, + "grad_norm": 0.06235185724616104, + "language_loss": 0.7940957, + "learning_rate": 0.0008035355513657224, + "loss": 0.80541074, + "num_input_tokens_seen": 135047456, + "router_z_loss_mlp": 0.40820312, + "step": 1630, + "time_per_iteration": 2.4804115295410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135326, + "balance_loss_mlp": 1.09326935, + "epoch": 0.3137745286648711, + "flos": 571908003840.0, + "grad_norm": 0.06249119555349938, + "language_loss": 0.9321425, + "learning_rate": 0.0008032879261372279, + "loss": 0.94349587, + "num_input_tokens_seen": 135124256, + "router_z_loss_mlp": 0.42089844, + "step": 1631, + "time_per_iteration": 2.7995047569274902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0103074, + "balance_loss_mlp": 1.01777005, + "epoch": 0.3139669103501347, + "flos": 1498415376384.0, + "grad_norm": 0.019617221588718974, + "language_loss": 0.79635841, + "learning_rate": 0.0008030401831619178, + "loss": 0.80666578, + "num_input_tokens_seen": 135353024, + "router_z_loss_mlp": 0.12988281, + "step": 1632, + "time_per_iteration": 5.3968565464019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149415, + "balance_loss_mlp": 1.10959888, + "epoch": 0.3141592920353982, + "flos": 525343076352.0, + "grad_norm": 0.05783646939860944, + "language_loss": 0.87576675, + "learning_rate": 0.0008027923225359748, + "loss": 0.88726091, + "num_input_tokens_seen": 135422464, + "router_z_loss_mlp": 0.39819336, + "step": 1633, + "time_per_iteration": 2.5933566093444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153635, + "balance_loss_mlp": 1.11145878, + "epoch": 0.3143516737206618, + "flos": 593268406272.0, + "grad_norm": 0.05944909670445279, + "language_loss": 0.88579285, + "learning_rate": 0.0008025443443556267, + "loss": 0.89732921, + "num_input_tokens_seen": 135490928, + "router_z_loss_mlp": 0.421875, + "step": 1634, + "time_per_iteration": 2.728522777557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149168, + "balance_loss_mlp": 1.109519, + "epoch": 0.31454405540592534, + "flos": 648362589696.0, + "grad_norm": 0.0772983201911997, + "language_loss": 0.88333809, + "learning_rate": 0.000802296248717147, + "loss": 0.89482975, + "num_input_tokens_seen": 135576288, + "router_z_loss_mlp": 0.39648438, + "step": 1635, + "time_per_iteration": 2.9030401706695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140791, + "balance_loss_mlp": 1.0971607, + "epoch": 0.3147364370911889, + "flos": 642847850496.0, + "grad_norm": 0.06629024784700413, + "language_loss": 0.7930302, + "learning_rate": 0.0008020480357168554, + "loss": 0.80443811, + "num_input_tokens_seen": 135652320, + "router_z_loss_mlp": 0.43603516, + "step": 1636, + "time_per_iteration": 2.839134931564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145583, + "balance_loss_mlp": 1.1038121, + "epoch": 0.31492881877645246, + "flos": 471849753600.0, + "grad_norm": 0.06656267016529639, + "language_loss": 0.88396037, + "learning_rate": 0.0008017997054511165, + "loss": 0.89541626, + "num_input_tokens_seen": 135719632, + "router_z_loss_mlp": 0.41796875, + "step": 1637, + "time_per_iteration": 2.5937085151672363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148115, + "balance_loss_mlp": 1.10424566, + "epoch": 0.31512120046171604, + "flos": 629433773568.0, + "grad_norm": 0.06622170213435077, + "language_loss": 0.85649616, + "learning_rate": 0.0008015512580163407, + "loss": 0.86797726, + "num_input_tokens_seen": 135796544, + "router_z_loss_mlp": 0.43896484, + "step": 1638, + "time_per_iteration": 2.8432726860046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138121, + "balance_loss_mlp": 1.09639752, + "epoch": 0.31531358214697963, + "flos": 703778347008.0, + "grad_norm": 0.06676164925493694, + "language_loss": 0.81149763, + "learning_rate": 0.0008013026935089838, + "loss": 0.82287884, + "num_input_tokens_seen": 135871344, + "router_z_loss_mlp": 0.41699219, + "step": 1639, + "time_per_iteration": 2.8703761100769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142857, + "balance_loss_mlp": 1.1031127, + "epoch": 0.31550596383224316, + "flos": 572545635840.0, + "grad_norm": 0.060786667538297263, + "language_loss": 0.84702241, + "learning_rate": 0.0008010540120255472, + "loss": 0.85845095, + "num_input_tokens_seen": 135944320, + "router_z_loss_mlp": 0.3972168, + "step": 1640, + "time_per_iteration": 2.6741273403167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136768, + "balance_loss_mlp": 1.09511614, + "epoch": 0.31569834551750675, + "flos": 658340006400.0, + "grad_norm": 0.06934658167266547, + "language_loss": 0.86723542, + "learning_rate": 0.0008008052136625774, + "loss": 0.8786031, + "num_input_tokens_seen": 136019456, + "router_z_loss_mlp": 0.41650391, + "step": 1641, + "time_per_iteration": 2.8395094871520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135427, + "balance_loss_mlp": 1.09272623, + "epoch": 0.3158907272027703, + "flos": 566282036736.0, + "grad_norm": 0.07613576058544219, + "language_loss": 0.87025082, + "learning_rate": 0.0008005562985166666, + "loss": 0.88160515, + "num_input_tokens_seen": 136091232, + "router_z_loss_mlp": 0.42675781, + "step": 1642, + "time_per_iteration": 2.708812713623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127626, + "balance_loss_mlp": 1.08621287, + "epoch": 0.31608310888803387, + "flos": 536891618304.0, + "grad_norm": 0.05118616143218352, + "language_loss": 0.85440576, + "learning_rate": 0.0008003072666844524, + "loss": 0.86568201, + "num_input_tokens_seen": 136165088, + "router_z_loss_mlp": 0.41430664, + "step": 1643, + "time_per_iteration": 2.74019193649292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127922, + "balance_loss_mlp": 1.08746231, + "epoch": 0.3162754905732974, + "flos": 486669772800.0, + "grad_norm": 0.07457594622010144, + "language_loss": 0.82632107, + "learning_rate": 0.0008000581182626173, + "loss": 0.83760029, + "num_input_tokens_seen": 136230368, + "router_z_loss_mlp": 0.40478516, + "step": 1644, + "time_per_iteration": 2.6125125885009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011327, + "balance_loss_mlp": 1.09159672, + "epoch": 0.316467872258561, + "flos": 530052429312.0, + "grad_norm": 0.0586598658040055, + "language_loss": 0.86714005, + "learning_rate": 0.0007998088533478894, + "loss": 0.87846708, + "num_input_tokens_seen": 136302512, + "router_z_loss_mlp": 0.41137695, + "step": 1645, + "time_per_iteration": 2.674678087234497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130403, + "balance_loss_mlp": 1.08805966, + "epoch": 0.3166602539438245, + "flos": 443440189440.0, + "grad_norm": 0.10428151324619617, + "language_loss": 0.84319067, + "learning_rate": 0.000799559472037042, + "loss": 0.85449469, + "num_input_tokens_seen": 136368064, + "router_z_loss_mlp": 0.4230957, + "step": 1646, + "time_per_iteration": 2.5389983654022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130022, + "balance_loss_mlp": 1.08939528, + "epoch": 0.3168526356290881, + "flos": 645830060544.0, + "grad_norm": 0.05498023868715711, + "language_loss": 0.8798641, + "learning_rate": 0.0007993099744268932, + "loss": 0.8911643, + "num_input_tokens_seen": 136451520, + "router_z_loss_mlp": 0.40625, + "step": 1647, + "time_per_iteration": 2.919410467147827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127448, + "balance_loss_mlp": 1.0858674, + "epoch": 0.3170450173143517, + "flos": 586162344960.0, + "grad_norm": 0.07648109375468225, + "language_loss": 0.88298547, + "learning_rate": 0.000799060360614307, + "loss": 0.89425999, + "num_input_tokens_seen": 136521184, + "router_z_loss_mlp": 0.41577148, + "step": 1648, + "time_per_iteration": 2.679098606109619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132184, + "balance_loss_mlp": 1.09117627, + "epoch": 0.3172373989996152, + "flos": 827124203520.0, + "grad_norm": 0.17676844539598618, + "language_loss": 0.83707428, + "learning_rate": 0.0007988106306961917, + "loss": 0.84839618, + "num_input_tokens_seen": 136612592, + "router_z_loss_mlp": 0.41015625, + "step": 1649, + "time_per_iteration": 3.1304876804351807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139719, + "balance_loss_mlp": 1.09809113, + "epoch": 0.3174297806848788, + "flos": 527408672256.0, + "grad_norm": 0.06731506602110418, + "language_loss": 0.84557772, + "learning_rate": 0.0007985607847695014, + "loss": 0.85697484, + "num_input_tokens_seen": 136684336, + "router_z_loss_mlp": 0.41625977, + "step": 1650, + "time_per_iteration": 2.6152966022491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151307, + "balance_loss_mlp": 1.11087108, + "epoch": 0.31762216237014235, + "flos": 713179800576.0, + "grad_norm": 0.08658277444707524, + "language_loss": 0.83160597, + "learning_rate": 0.0007983108229312345, + "loss": 0.84311903, + "num_input_tokens_seen": 136766400, + "router_z_loss_mlp": 0.40454102, + "step": 1651, + "time_per_iteration": 2.9157605171203613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180085, + "balance_loss_mlp": 1.13864803, + "epoch": 0.31781454405540593, + "flos": 483813471744.0, + "grad_norm": 0.12326743545136284, + "language_loss": 0.86631948, + "learning_rate": 0.0007980607452784351, + "loss": 0.8781203, + "num_input_tokens_seen": 136834016, + "router_z_loss_mlp": 0.4140625, + "step": 1652, + "time_per_iteration": 2.5533528327941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170234, + "balance_loss_mlp": 1.12798643, + "epoch": 0.31800692574066947, + "flos": 548746679808.0, + "grad_norm": 0.07656805667485655, + "language_loss": 0.90550399, + "learning_rate": 0.0007978105519081919, + "loss": 0.91720629, + "num_input_tokens_seen": 136906288, + "router_z_loss_mlp": 0.42236328, + "step": 1653, + "time_per_iteration": 2.683962821960449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162895, + "balance_loss_mlp": 1.12088561, + "epoch": 0.31819930742593305, + "flos": 516895312896.0, + "grad_norm": 0.06859901935764132, + "language_loss": 0.88378012, + "learning_rate": 0.0007975602429176385, + "loss": 0.89540899, + "num_input_tokens_seen": 136972416, + "router_z_loss_mlp": 0.42041016, + "step": 1654, + "time_per_iteration": 2.563507556915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165514, + "balance_loss_mlp": 1.12421989, + "epoch": 0.31839168911119664, + "flos": 455991980544.0, + "grad_norm": 0.07830522948057009, + "language_loss": 0.81779003, + "learning_rate": 0.0007973098184039536, + "loss": 0.82944512, + "num_input_tokens_seen": 137044576, + "router_z_loss_mlp": 0.4128418, + "step": 1655, + "time_per_iteration": 2.6503560543060303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154556, + "balance_loss_mlp": 1.11433494, + "epoch": 0.3185840707964602, + "flos": 626033816064.0, + "grad_norm": 0.07004293098644994, + "language_loss": 0.87212098, + "learning_rate": 0.0007970592784643602, + "loss": 0.88366652, + "num_input_tokens_seen": 137125120, + "router_z_loss_mlp": 0.40185547, + "step": 1656, + "time_per_iteration": 2.8598649501800537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167315, + "balance_loss_mlp": 1.12366056, + "epoch": 0.31877645248172376, + "flos": 567478006272.0, + "grad_norm": 0.08267452239342069, + "language_loss": 0.8563, + "learning_rate": 0.0007968086231961272, + "loss": 0.86797309, + "num_input_tokens_seen": 137195344, + "router_z_loss_mlp": 0.43676758, + "step": 1657, + "time_per_iteration": 2.637216806411743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158327, + "balance_loss_mlp": 1.11343288, + "epoch": 0.3189688341669873, + "flos": 489580402176.0, + "grad_norm": 0.09173012098392071, + "language_loss": 0.83764172, + "learning_rate": 0.0007965578526965671, + "loss": 0.84922498, + "num_input_tokens_seen": 137261040, + "router_z_loss_mlp": 0.44897461, + "step": 1658, + "time_per_iteration": 2.607729911804199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154777, + "balance_loss_mlp": 1.11307764, + "epoch": 0.3191612158522509, + "flos": 576234487296.0, + "grad_norm": 0.08650327787833377, + "language_loss": 0.86397582, + "learning_rate": 0.0007963069670630377, + "loss": 0.87552357, + "num_input_tokens_seen": 137334400, + "router_z_loss_mlp": 0.41723633, + "step": 1659, + "time_per_iteration": 2.7385904788970947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154696, + "balance_loss_mlp": 1.11175728, + "epoch": 0.3193535975375144, + "flos": 538132004352.0, + "grad_norm": 0.06815630012467462, + "language_loss": 0.88107586, + "learning_rate": 0.0007960559663929416, + "loss": 0.89262283, + "num_input_tokens_seen": 137405344, + "router_z_loss_mlp": 0.4296875, + "step": 1660, + "time_per_iteration": 2.696936845779419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155709, + "balance_loss_mlp": 1.11372399, + "epoch": 0.319545979222778, + "flos": 734288011776.0, + "grad_norm": 0.07443207173064395, + "language_loss": 0.8773188, + "learning_rate": 0.0007958048507837259, + "loss": 0.88887584, + "num_input_tokens_seen": 137486016, + "router_z_loss_mlp": 0.41992188, + "step": 1661, + "time_per_iteration": 3.0276992321014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165796, + "balance_loss_mlp": 1.12168884, + "epoch": 0.31973836090804153, + "flos": 764461794816.0, + "grad_norm": 0.07361086812440759, + "language_loss": 0.87900233, + "learning_rate": 0.0007955536203328822, + "loss": 0.89066029, + "num_input_tokens_seen": 137562304, + "router_z_loss_mlp": 0.44116211, + "step": 1662, + "time_per_iteration": 2.9181947708129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167079, + "balance_loss_mlp": 1.12497449, + "epoch": 0.3199307425933051, + "flos": 560549611008.0, + "grad_norm": 0.0536049497981301, + "language_loss": 0.8375597, + "learning_rate": 0.0007953022751379469, + "loss": 0.84923047, + "num_input_tokens_seen": 137639248, + "router_z_loss_mlp": 0.42089844, + "step": 1663, + "time_per_iteration": 2.8502774238586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160364, + "balance_loss_mlp": 1.11749601, + "epoch": 0.3201231242785687, + "flos": 751349094912.0, + "grad_norm": 0.09076105210561375, + "language_loss": 0.82297581, + "learning_rate": 0.000795050815296501, + "loss": 0.83457941, + "num_input_tokens_seen": 137718256, + "router_z_loss_mlp": 0.42871094, + "step": 1664, + "time_per_iteration": 2.990253210067749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149395, + "balance_loss_mlp": 1.10821986, + "epoch": 0.32031550596383224, + "flos": 496402338816.0, + "grad_norm": 0.05392034602485258, + "language_loss": 0.93401325, + "learning_rate": 0.0007947992409061695, + "loss": 0.94550717, + "num_input_tokens_seen": 137785216, + "router_z_loss_mlp": 0.41162109, + "step": 1665, + "time_per_iteration": 2.5734803676605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146101, + "balance_loss_mlp": 1.10456824, + "epoch": 0.3205078876490958, + "flos": 731609750016.0, + "grad_norm": 0.07147454481835314, + "language_loss": 0.86398005, + "learning_rate": 0.0007945475520646226, + "loss": 0.87544107, + "num_input_tokens_seen": 137863424, + "router_z_loss_mlp": 0.4152832, + "step": 1666, + "time_per_iteration": 2.9147067070007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144126, + "balance_loss_mlp": 1.10156846, + "epoch": 0.32070026933435936, + "flos": 549436068864.0, + "grad_norm": 0.08541845552139904, + "language_loss": 0.85159481, + "learning_rate": 0.0007942957488695743, + "loss": 0.8630361, + "num_input_tokens_seen": 137930384, + "router_z_loss_mlp": 0.42578125, + "step": 1667, + "time_per_iteration": 2.6842408180236816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138005, + "balance_loss_mlp": 1.09725952, + "epoch": 0.32089265101962294, + "flos": 745295468544.0, + "grad_norm": 0.06001483498827303, + "language_loss": 0.81309706, + "learning_rate": 0.0007940438314187833, + "loss": 0.82447714, + "num_input_tokens_seen": 138017200, + "router_z_loss_mlp": 0.4074707, + "step": 1668, + "time_per_iteration": 3.0340676307678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128156, + "balance_loss_mlp": 1.08769631, + "epoch": 0.3210850327048865, + "flos": 494188439040.0, + "grad_norm": 0.06998559069767052, + "language_loss": 0.81191337, + "learning_rate": 0.0007937917998100529, + "loss": 0.82319492, + "num_input_tokens_seen": 138084048, + "router_z_loss_mlp": 0.40454102, + "step": 1669, + "time_per_iteration": 2.635629177093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138644, + "balance_loss_mlp": 1.09313023, + "epoch": 0.32127741439015006, + "flos": 530640502272.0, + "grad_norm": 0.08304565240235381, + "language_loss": 0.79254091, + "learning_rate": 0.0007935396541412302, + "loss": 0.80392736, + "num_input_tokens_seen": 138153280, + "router_z_loss_mlp": 0.45532227, + "step": 1670, + "time_per_iteration": 2.6226065158843994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141669, + "balance_loss_mlp": 1.09896851, + "epoch": 0.3214697960754136, + "flos": 501203096064.0, + "grad_norm": 0.07816166477955887, + "language_loss": 0.85914934, + "learning_rate": 0.0007932873945102068, + "loss": 0.87056601, + "num_input_tokens_seen": 138222320, + "router_z_loss_mlp": 0.42724609, + "step": 1671, + "time_per_iteration": 2.559443473815918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047646, + "balance_loss_mlp": 1.03238678, + "epoch": 0.3216621777606772, + "flos": 1383341815296.0, + "grad_norm": 0.025388272809080015, + "language_loss": 0.75761777, + "learning_rate": 0.0007930350210149188, + "loss": 0.76809424, + "num_input_tokens_seen": 138449488, + "router_z_loss_mlp": 0.15234375, + "step": 1672, + "time_per_iteration": 4.8329596519470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176113, + "balance_loss_mlp": 1.13319826, + "epoch": 0.32185455944594077, + "flos": 571535046144.0, + "grad_norm": 0.10680060394368475, + "language_loss": 0.86589128, + "learning_rate": 0.0007927825337533461, + "loss": 0.87765247, + "num_input_tokens_seen": 138522496, + "router_z_loss_mlp": 0.42895508, + "step": 1673, + "time_per_iteration": 2.670067071914673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117015, + "balance_loss_mlp": 1.12651968, + "epoch": 0.3220469411312043, + "flos": 543908846592.0, + "grad_norm": 0.0659920492524482, + "language_loss": 0.84953517, + "learning_rate": 0.0007925299328235131, + "loss": 0.86123669, + "num_input_tokens_seen": 138590096, + "router_z_loss_mlp": 0.43652344, + "step": 1674, + "time_per_iteration": 2.6559884548187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169615, + "balance_loss_mlp": 1.12543643, + "epoch": 0.3222393228164679, + "flos": 491139417600.0, + "grad_norm": 0.10142438885407562, + "language_loss": 0.85307467, + "learning_rate": 0.000792277218323488, + "loss": 0.86477083, + "num_input_tokens_seen": 138658224, + "router_z_loss_mlp": 0.44189453, + "step": 1675, + "time_per_iteration": 2.5843372344970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158765, + "balance_loss_mlp": 1.11673164, + "epoch": 0.3224317045017314, + "flos": 490388359680.0, + "grad_norm": 0.06840501438298492, + "language_loss": 0.85418063, + "learning_rate": 0.0007920243903513833, + "loss": 0.86576831, + "num_input_tokens_seen": 138722864, + "router_z_loss_mlp": 0.4206543, + "step": 1676, + "time_per_iteration": 2.562697649002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138179, + "balance_loss_mlp": 1.09280825, + "epoch": 0.322624086186995, + "flos": 575777465856.0, + "grad_norm": 0.06731593225582447, + "language_loss": 0.84609574, + "learning_rate": 0.0007917714490053556, + "loss": 0.85747755, + "num_input_tokens_seen": 138791472, + "router_z_loss_mlp": 0.45361328, + "step": 1677, + "time_per_iteration": 2.685854434967041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131879, + "balance_loss_mlp": 1.09029913, + "epoch": 0.32281646787225854, + "flos": 629292810240.0, + "grad_norm": 0.06310440112326268, + "language_loss": 0.86562228, + "learning_rate": 0.0007915183943836055, + "loss": 0.87694108, + "num_input_tokens_seen": 138873424, + "router_z_loss_mlp": 0.41601562, + "step": 1678, + "time_per_iteration": 2.8568227291107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128366, + "balance_loss_mlp": 1.08466363, + "epoch": 0.3230088495575221, + "flos": 781389255168.0, + "grad_norm": 0.07690366782162197, + "language_loss": 0.84428912, + "learning_rate": 0.0007912652265843773, + "loss": 0.85557282, + "num_input_tokens_seen": 138956880, + "router_z_loss_mlp": 0.43725586, + "step": 1679, + "time_per_iteration": 3.079998254776001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110817, + "balance_loss_mlp": 1.06930852, + "epoch": 0.3232012312427857, + "flos": 536110824960.0, + "grad_norm": 0.07712564159484636, + "language_loss": 0.8213551, + "learning_rate": 0.0007910119457059597, + "loss": 0.83246326, + "num_input_tokens_seen": 139031296, + "router_z_loss_mlp": 0.4152832, + "step": 1680, + "time_per_iteration": 2.6812973022460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112077, + "balance_loss_mlp": 1.06975782, + "epoch": 0.32339361292804925, + "flos": 704857946112.0, + "grad_norm": 0.10745693955939492, + "language_loss": 0.81109858, + "learning_rate": 0.0007907585518466849, + "loss": 0.82221937, + "num_input_tokens_seen": 139109776, + "router_z_loss_mlp": 0.42333984, + "step": 1681, + "time_per_iteration": 2.9406683444976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115262, + "balance_loss_mlp": 1.07265627, + "epoch": 0.32358599461331283, + "flos": 452330293248.0, + "grad_norm": 0.07157404686533678, + "language_loss": 0.89948541, + "learning_rate": 0.000790505045104929, + "loss": 0.91063797, + "num_input_tokens_seen": 139174736, + "router_z_loss_mlp": 0.42602539, + "step": 1682, + "time_per_iteration": 2.5241646766662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119953, + "balance_loss_mlp": 1.07606041, + "epoch": 0.32377837629857636, + "flos": 600895729152.0, + "grad_norm": 0.06937214564576595, + "language_loss": 0.87034553, + "learning_rate": 0.0007902514255791125, + "loss": 0.88154507, + "num_input_tokens_seen": 139252064, + "router_z_loss_mlp": 0.43896484, + "step": 1683, + "time_per_iteration": 2.8741068840026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111917, + "balance_loss_mlp": 1.076231, + "epoch": 0.32397075798383995, + "flos": 807523250688.0, + "grad_norm": 0.06778435640114842, + "language_loss": 0.87994444, + "learning_rate": 0.0007899976933676986, + "loss": 0.89113617, + "num_input_tokens_seen": 139333328, + "router_z_loss_mlp": 0.42919922, + "step": 1684, + "time_per_iteration": 2.959290027618408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117467, + "balance_loss_mlp": 1.07469463, + "epoch": 0.3241631396691035, + "flos": 601689005568.0, + "grad_norm": 0.06453517439379398, + "language_loss": 0.87573123, + "learning_rate": 0.0007897438485691955, + "loss": 0.88690597, + "num_input_tokens_seen": 139400976, + "router_z_loss_mlp": 0.42773438, + "step": 1685, + "time_per_iteration": 2.6591978073120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131588, + "balance_loss_mlp": 1.08655035, + "epoch": 0.32435552135436707, + "flos": 474219297792.0, + "grad_norm": 0.13512041919643347, + "language_loss": 0.82386112, + "learning_rate": 0.0007894898912821542, + "loss": 0.835177, + "num_input_tokens_seen": 139465664, + "router_z_loss_mlp": 0.45043945, + "step": 1686, + "time_per_iteration": 2.5375750064849854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134689, + "balance_loss_mlp": 1.09201205, + "epoch": 0.3245479030396306, + "flos": 538102268928.0, + "grad_norm": 0.07414292899066016, + "language_loss": 0.8748548, + "learning_rate": 0.0007892358216051695, + "loss": 0.88620168, + "num_input_tokens_seen": 139541984, + "router_z_loss_mlp": 0.42675781, + "step": 1687, + "time_per_iteration": 2.73968243598938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132707, + "balance_loss_mlp": 1.09098339, + "epoch": 0.3247402847248942, + "flos": 547654597632.0, + "grad_norm": 0.06337992950379638, + "language_loss": 0.92269105, + "learning_rate": 0.0007889816396368803, + "loss": 0.93401814, + "num_input_tokens_seen": 139607408, + "router_z_loss_mlp": 0.41699219, + "step": 1688, + "time_per_iteration": 2.6067299842834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131771, + "balance_loss_mlp": 1.08961868, + "epoch": 0.3249326664101578, + "flos": 378151276032.0, + "grad_norm": 0.07885708031147778, + "language_loss": 0.85782814, + "learning_rate": 0.0007887273454759687, + "loss": 0.86914587, + "num_input_tokens_seen": 139670000, + "router_z_loss_mlp": 0.421875, + "step": 1689, + "time_per_iteration": 2.484260320663452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122278, + "balance_loss_mlp": 1.08031607, + "epoch": 0.3251250480954213, + "flos": 528078237696.0, + "grad_norm": 0.06527022407794938, + "language_loss": 0.82859224, + "learning_rate": 0.0007884729392211603, + "loss": 0.83981502, + "num_input_tokens_seen": 139739872, + "router_z_loss_mlp": 0.41943359, + "step": 1690, + "time_per_iteration": 2.642786741256714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129634, + "balance_loss_mlp": 1.08812594, + "epoch": 0.3253174297806849, + "flos": 449659372032.0, + "grad_norm": 0.09568065131307975, + "language_loss": 0.86132944, + "learning_rate": 0.0007882184209712245, + "loss": 0.87262577, + "num_input_tokens_seen": 139802032, + "router_z_loss_mlp": 0.41503906, + "step": 1691, + "time_per_iteration": 2.5199530124664307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123067, + "balance_loss_mlp": 1.08234525, + "epoch": 0.32550981146594843, + "flos": 704181040128.0, + "grad_norm": 0.06282055281729462, + "language_loss": 0.86132228, + "learning_rate": 0.000787963790824974, + "loss": 0.87255299, + "num_input_tokens_seen": 139885648, + "router_z_loss_mlp": 0.40722656, + "step": 1692, + "time_per_iteration": 2.9768075942993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124522, + "balance_loss_mlp": 1.08427668, + "epoch": 0.325702193151212, + "flos": 392704422912.0, + "grad_norm": 0.07612118071262816, + "language_loss": 0.89543802, + "learning_rate": 0.0007877090488812651, + "loss": 0.90668321, + "num_input_tokens_seen": 139947920, + "router_z_loss_mlp": 0.40258789, + "step": 1693, + "time_per_iteration": 2.4604485034942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124012, + "balance_loss_mlp": 1.08207428, + "epoch": 0.32589457483647555, + "flos": 577494696960.0, + "grad_norm": 0.1035661329718289, + "language_loss": 0.83982152, + "learning_rate": 0.0007874541952389973, + "loss": 0.85106164, + "num_input_tokens_seen": 140020048, + "router_z_loss_mlp": 0.41943359, + "step": 1694, + "time_per_iteration": 2.6709587574005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113814, + "balance_loss_mlp": 1.09753752, + "epoch": 0.32608695652173914, + "flos": 498339454464.0, + "grad_norm": 0.08446561178027004, + "language_loss": 0.86949492, + "learning_rate": 0.0007871992299971136, + "loss": 0.8808763, + "num_input_tokens_seen": 140085600, + "router_z_loss_mlp": 0.40625, + "step": 1695, + "time_per_iteration": 2.5585403442382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150743, + "balance_loss_mlp": 1.11023593, + "epoch": 0.32627933820700267, + "flos": 591145910784.0, + "grad_norm": 0.05830689117178756, + "language_loss": 0.84793502, + "learning_rate": 0.0007869441532546001, + "loss": 0.85944247, + "num_input_tokens_seen": 140155152, + "router_z_loss_mlp": 0.4050293, + "step": 1696, + "time_per_iteration": 2.7510788440704346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148317, + "balance_loss_mlp": 1.1100266, + "epoch": 0.32647171989226625, + "flos": 609086532096.0, + "grad_norm": 0.06976949490853021, + "language_loss": 0.79791546, + "learning_rate": 0.0007866889651104867, + "loss": 0.80939865, + "num_input_tokens_seen": 140228560, + "router_z_loss_mlp": 0.38256836, + "step": 1697, + "time_per_iteration": 2.7944459915161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152152, + "balance_loss_mlp": 1.11114383, + "epoch": 0.32666410157752984, + "flos": 477154520064.0, + "grad_norm": 0.06767982610774756, + "language_loss": 0.83777177, + "learning_rate": 0.000786433665663846, + "loss": 0.84929335, + "num_input_tokens_seen": 140297952, + "router_z_loss_mlp": 0.40991211, + "step": 1698, + "time_per_iteration": 2.6864194869995117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167936, + "balance_loss_mlp": 1.12514019, + "epoch": 0.3268564832627934, + "flos": 718385822208.0, + "grad_norm": 0.0725657973515617, + "language_loss": 0.87005848, + "learning_rate": 0.0007861782550137942, + "loss": 0.88173789, + "num_input_tokens_seen": 140373408, + "router_z_loss_mlp": 0.42797852, + "step": 1699, + "time_per_iteration": 2.896897792816162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160393, + "balance_loss_mlp": 1.11986172, + "epoch": 0.32704886494805696, + "flos": 768795618816.0, + "grad_norm": 0.0774952835645251, + "language_loss": 0.86092401, + "learning_rate": 0.0007859227332594901, + "loss": 0.87252796, + "num_input_tokens_seen": 140451840, + "router_z_loss_mlp": 0.40527344, + "step": 1700, + "time_per_iteration": 2.8986380100250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165908, + "balance_loss_mlp": 1.12449527, + "epoch": 0.3272412466333205, + "flos": 849912569856.0, + "grad_norm": 0.09509515836767467, + "language_loss": 0.85007191, + "learning_rate": 0.0007856671005001365, + "loss": 0.86173105, + "num_input_tokens_seen": 140537696, + "router_z_loss_mlp": 0.41381836, + "step": 1701, + "time_per_iteration": 3.148084878921509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168963, + "balance_loss_mlp": 1.12726378, + "epoch": 0.3274336283185841, + "flos": 831586507776.0, + "grad_norm": 0.07560076292899535, + "language_loss": 0.82363045, + "learning_rate": 0.0007854113568349787, + "loss": 0.83532006, + "num_input_tokens_seen": 140623536, + "router_z_loss_mlp": 0.41699219, + "step": 1702, + "time_per_iteration": 3.1411454677581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191314, + "balance_loss_mlp": 1.14882779, + "epoch": 0.3276260100038476, + "flos": 692027172864.0, + "grad_norm": 0.08142047178498793, + "language_loss": 0.81090933, + "learning_rate": 0.0007851555023633052, + "loss": 0.82282251, + "num_input_tokens_seen": 140700688, + "router_z_loss_mlp": 0.42504883, + "step": 1703, + "time_per_iteration": 2.9109766483306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197058, + "balance_loss_mlp": 1.1559788, + "epoch": 0.3278183916891112, + "flos": 436059915264.0, + "grad_norm": 0.07993965020483434, + "language_loss": 0.82561779, + "learning_rate": 0.0007848995371844474, + "loss": 0.83758843, + "num_input_tokens_seen": 140765808, + "router_z_loss_mlp": 0.41088867, + "step": 1704, + "time_per_iteration": 2.531611680984497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197334, + "balance_loss_mlp": 1.15267849, + "epoch": 0.3280107733743748, + "flos": 461109169152.0, + "grad_norm": 0.11293951672356671, + "language_loss": 0.81012988, + "learning_rate": 0.0007846434613977801, + "loss": 0.82210326, + "num_input_tokens_seen": 140830512, + "router_z_loss_mlp": 0.44677734, + "step": 1705, + "time_per_iteration": 2.5413970947265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175519, + "balance_loss_mlp": 1.1340816, + "epoch": 0.3282031550596383, + "flos": 679319737344.0, + "grad_norm": 0.10106481858624654, + "language_loss": 0.78958142, + "learning_rate": 0.0007843872751027203, + "loss": 0.80133665, + "num_input_tokens_seen": 140902816, + "router_z_loss_mlp": 0.41455078, + "step": 1706, + "time_per_iteration": 2.817387580871582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158115, + "balance_loss_mlp": 1.1166296, + "epoch": 0.3283955367449019, + "flos": 545107014144.0, + "grad_norm": 0.06764312208644677, + "language_loss": 0.87366319, + "learning_rate": 0.0007841309783987287, + "loss": 0.88524431, + "num_input_tokens_seen": 140975488, + "router_z_loss_mlp": 0.41503906, + "step": 1707, + "time_per_iteration": 2.7335729598999023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155907, + "balance_loss_mlp": 1.11117959, + "epoch": 0.32858791843016544, + "flos": 481261118976.0, + "grad_norm": 0.06220723681544313, + "language_loss": 0.89445031, + "learning_rate": 0.0007838745713853084, + "loss": 0.90600932, + "num_input_tokens_seen": 141043248, + "router_z_loss_mlp": 0.44702148, + "step": 1708, + "time_per_iteration": 2.6179606914520264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114633, + "balance_loss_mlp": 1.10207939, + "epoch": 0.328780300115429, + "flos": 566805869568.0, + "grad_norm": 0.09473479000062662, + "language_loss": 0.84092307, + "learning_rate": 0.0007836180541620053, + "loss": 0.85238636, + "num_input_tokens_seen": 141119408, + "router_z_loss_mlp": 0.44213867, + "step": 1709, + "time_per_iteration": 2.703660249710083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160153, + "balance_loss_mlp": 1.11723721, + "epoch": 0.32897268180069256, + "flos": 476027933184.0, + "grad_norm": 0.06816782803484764, + "language_loss": 0.86778289, + "learning_rate": 0.0007833614268284082, + "loss": 0.8793844, + "num_input_tokens_seen": 141184112, + "router_z_loss_mlp": 0.42944336, + "step": 1710, + "time_per_iteration": 2.548859119415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077221, + "balance_loss_mlp": 1.06558585, + "epoch": 0.32916506348595614, + "flos": 1577301548544.0, + "grad_norm": 0.029019472878356288, + "language_loss": 0.74109769, + "learning_rate": 0.0007831046894841489, + "loss": 0.75186992, + "num_input_tokens_seen": 141414960, + "router_z_loss_mlp": 0.11621094, + "step": 1711, + "time_per_iteration": 4.9234619140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117236, + "balance_loss_mlp": 1.12934983, + "epoch": 0.3293574451712197, + "flos": 482886945792.0, + "grad_norm": 0.10714861433418864, + "language_loss": 0.78928375, + "learning_rate": 0.0007828478422289016, + "loss": 0.80100739, + "num_input_tokens_seen": 141485744, + "router_z_loss_mlp": 0.43017578, + "step": 1712, + "time_per_iteration": 2.584307909011841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167703, + "balance_loss_mlp": 1.12228465, + "epoch": 0.32954982685648326, + "flos": 622557508608.0, + "grad_norm": 0.08165577234876795, + "language_loss": 0.89409995, + "learning_rate": 0.0007825908851623833, + "loss": 0.90577698, + "num_input_tokens_seen": 141560592, + "router_z_loss_mlp": 0.45410156, + "step": 1713, + "time_per_iteration": 2.7400283813476562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158648, + "balance_loss_mlp": 1.11475515, + "epoch": 0.32974220854174685, + "flos": 544971193344.0, + "grad_norm": 0.08464988169520862, + "language_loss": 0.85764992, + "learning_rate": 0.0007823338183843533, + "loss": 0.86923635, + "num_input_tokens_seen": 141630400, + "router_z_loss_mlp": 0.43896484, + "step": 1714, + "time_per_iteration": 2.671375036239624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157012, + "balance_loss_mlp": 1.11419201, + "epoch": 0.3299345902270104, + "flos": 982155870720.0, + "grad_norm": 0.0730773907324959, + "language_loss": 0.81870985, + "learning_rate": 0.0007820766419946141, + "loss": 0.83028001, + "num_input_tokens_seen": 141721552, + "router_z_loss_mlp": 0.4284668, + "step": 1715, + "time_per_iteration": 3.3361854553222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027512, + "balance_loss_mlp": 1.01473284, + "epoch": 0.33012697191227397, + "flos": 1403664090624.0, + "grad_norm": 0.017749933707714268, + "language_loss": 0.7967248, + "learning_rate": 0.0007818193560930102, + "loss": 0.80699992, + "num_input_tokens_seen": 141956464, + "router_z_loss_mlp": 0.12792969, + "step": 1716, + "time_per_iteration": 4.933880567550659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193401, + "balance_loss_mlp": 1.14895988, + "epoch": 0.3303193535975375, + "flos": 505151479296.0, + "grad_norm": 0.1003306893312863, + "language_loss": 0.76434684, + "learning_rate": 0.0007815619607794288, + "loss": 0.77628088, + "num_input_tokens_seen": 142029552, + "router_z_loss_mlp": 0.4440918, + "step": 1717, + "time_per_iteration": 2.6259148120880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191125, + "balance_loss_mlp": 1.14823365, + "epoch": 0.3305117352828011, + "flos": 937977739776.0, + "grad_norm": 0.07927399877074098, + "language_loss": 0.83156073, + "learning_rate": 0.0007813044561538001, + "loss": 0.84347194, + "num_input_tokens_seen": 142117344, + "router_z_loss_mlp": 0.42895508, + "step": 1718, + "time_per_iteration": 3.1473774909973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190014, + "balance_loss_mlp": 1.145239, + "epoch": 0.3307041169680646, + "flos": 721499083776.0, + "grad_norm": 0.06905487251407855, + "language_loss": 0.88941157, + "learning_rate": 0.0007810468423160958, + "loss": 0.9013117, + "num_input_tokens_seen": 142190096, + "router_z_loss_mlp": 0.44799805, + "step": 1719, + "time_per_iteration": 2.895155906677246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181044, + "balance_loss_mlp": 1.13943982, + "epoch": 0.3308964986533282, + "flos": 583614761472.0, + "grad_norm": 0.06204943336400955, + "language_loss": 0.82643551, + "learning_rate": 0.0007807891193663306, + "loss": 0.83824587, + "num_input_tokens_seen": 142265584, + "router_z_loss_mlp": 0.41625977, + "step": 1720, + "time_per_iteration": 2.7824859619140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165341, + "balance_loss_mlp": 1.12357068, + "epoch": 0.33108888033859174, + "flos": 473576896512.0, + "grad_norm": 0.07732363095630222, + "language_loss": 0.82492876, + "learning_rate": 0.0007805312874045614, + "loss": 0.83658212, + "num_input_tokens_seen": 142330352, + "router_z_loss_mlp": 0.41796875, + "step": 1721, + "time_per_iteration": 2.5710601806640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170989, + "balance_loss_mlp": 1.12807381, + "epoch": 0.3312812620238553, + "flos": 386129534976.0, + "grad_norm": 0.07358039625922873, + "language_loss": 0.86639178, + "learning_rate": 0.0007802733465308874, + "loss": 0.87810171, + "num_input_tokens_seen": 142392208, + "router_z_loss_mlp": 0.42895508, + "step": 1722, + "time_per_iteration": 2.4402778148651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171295, + "balance_loss_mlp": 1.12632966, + "epoch": 0.3314736437091189, + "flos": 494554056192.0, + "grad_norm": 0.06616160911514579, + "language_loss": 0.8424235, + "learning_rate": 0.0007800152968454501, + "loss": 0.85413647, + "num_input_tokens_seen": 142462112, + "router_z_loss_mlp": 0.44970703, + "step": 1723, + "time_per_iteration": 2.689309597015381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115688, + "balance_loss_mlp": 1.11634886, + "epoch": 0.33166602539438245, + "flos": 653662586880.0, + "grad_norm": 0.06191321033146657, + "language_loss": 0.90671206, + "learning_rate": 0.0007797571384484334, + "loss": 0.91828084, + "num_input_tokens_seen": 142539120, + "router_z_loss_mlp": 0.40527344, + "step": 1724, + "time_per_iteration": 2.8473238945007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147699, + "balance_loss_mlp": 1.10421109, + "epoch": 0.33185840707964603, + "flos": 520806620160.0, + "grad_norm": 0.06062690844208358, + "language_loss": 0.92524576, + "learning_rate": 0.0007794988714400633, + "loss": 0.93672276, + "num_input_tokens_seen": 142611520, + "router_z_loss_mlp": 0.43530273, + "step": 1725, + "time_per_iteration": 2.62685227394104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146389, + "balance_loss_mlp": 1.10118532, + "epoch": 0.33205078876490957, + "flos": 436949365248.0, + "grad_norm": 0.09351886782013036, + "language_loss": 0.85586655, + "learning_rate": 0.0007792404959206079, + "loss": 0.86733043, + "num_input_tokens_seen": 142676064, + "router_z_loss_mlp": 0.45214844, + "step": 1726, + "time_per_iteration": 2.487520694732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150134, + "balance_loss_mlp": 1.10707533, + "epoch": 0.33224317045017315, + "flos": 768738719232.0, + "grad_norm": 0.09481341164405561, + "language_loss": 0.81825417, + "learning_rate": 0.0007789820119903774, + "loss": 0.82975549, + "num_input_tokens_seen": 142750944, + "router_z_loss_mlp": 0.4309082, + "step": 1727, + "time_per_iteration": 2.9732954502105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118165, + "balance_loss_mlp": 1.16734493, + "epoch": 0.3324355521354367, + "flos": 1466381574144.0, + "grad_norm": 0.0769954731958624, + "language_loss": 0.78492665, + "learning_rate": 0.0007787234197497242, + "loss": 0.79674315, + "num_input_tokens_seen": 142974032, + "router_z_loss_mlp": 0.14257812, + "step": 1728, + "time_per_iteration": 4.8314409255981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149054, + "balance_loss_mlp": 1.10599601, + "epoch": 0.3326279338207003, + "flos": 496691232768.0, + "grad_norm": 0.06765949793064117, + "language_loss": 0.84123361, + "learning_rate": 0.0007784647192990428, + "loss": 0.85272419, + "num_input_tokens_seen": 143047280, + "router_z_loss_mlp": 0.43041992, + "step": 1729, + "time_per_iteration": 2.715163230895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147649, + "balance_loss_mlp": 1.10799968, + "epoch": 0.33282031550596386, + "flos": 635890093056.0, + "grad_norm": 0.06156065876328187, + "language_loss": 0.80939102, + "learning_rate": 0.0007782059107387696, + "loss": 0.82086754, + "num_input_tokens_seen": 143124224, + "router_z_loss_mlp": 0.39672852, + "step": 1730, + "time_per_iteration": 2.865858554840088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165768, + "balance_loss_mlp": 1.12247074, + "epoch": 0.3330126971912274, + "flos": 689511896064.0, + "grad_norm": 0.07708666526094303, + "language_loss": 0.88668191, + "learning_rate": 0.0007779469941693826, + "loss": 0.89833963, + "num_input_tokens_seen": 143194048, + "router_z_loss_mlp": 0.43261719, + "step": 1731, + "time_per_iteration": 2.8640921115875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166075, + "balance_loss_mlp": 1.12351775, + "epoch": 0.333205078876491, + "flos": 566457504768.0, + "grad_norm": 0.08600344935746515, + "language_loss": 0.76943499, + "learning_rate": 0.0007776879696914029, + "loss": 0.78109574, + "num_input_tokens_seen": 143272976, + "router_z_loss_mlp": 0.42553711, + "step": 1732, + "time_per_iteration": 2.8162899017333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159987, + "balance_loss_mlp": 1.11745262, + "epoch": 0.3333974605617545, + "flos": 640927987200.0, + "grad_norm": 0.07534435583192022, + "language_loss": 0.89131331, + "learning_rate": 0.000777428837405392, + "loss": 0.90291321, + "num_input_tokens_seen": 143346496, + "router_z_loss_mlp": 0.42553711, + "step": 1733, + "time_per_iteration": 2.869436740875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151668, + "balance_loss_mlp": 1.11042213, + "epoch": 0.3335898422470181, + "flos": 461833062912.0, + "grad_norm": 0.0649827105829465, + "language_loss": 0.87220478, + "learning_rate": 0.0007771695974119544, + "loss": 0.88372147, + "num_input_tokens_seen": 143410448, + "router_z_loss_mlp": 0.41259766, + "step": 1734, + "time_per_iteration": 2.5153088569641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138148, + "balance_loss_mlp": 1.0959959, + "epoch": 0.33378222393228163, + "flos": 852870187008.0, + "grad_norm": 0.07614790264044081, + "language_loss": 0.76295686, + "learning_rate": 0.0007769102498117359, + "loss": 0.77433836, + "num_input_tokens_seen": 143492416, + "router_z_loss_mlp": 0.42163086, + "step": 1735, + "time_per_iteration": 3.1105504035949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136381, + "balance_loss_mlp": 1.09430027, + "epoch": 0.3339746056175452, + "flos": 954665491968.0, + "grad_norm": 0.06230250245944302, + "language_loss": 0.80020654, + "learning_rate": 0.000776650794705424, + "loss": 0.81157035, + "num_input_tokens_seen": 143590096, + "router_z_loss_mlp": 0.42089844, + "step": 1736, + "time_per_iteration": 3.269490957260132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141082, + "balance_loss_mlp": 1.09890568, + "epoch": 0.33416698730280875, + "flos": 544825460736.0, + "grad_norm": 0.053956568858798265, + "language_loss": 0.82610357, + "learning_rate": 0.0007763912321937483, + "loss": 0.8375144, + "num_input_tokens_seen": 143663344, + "router_z_loss_mlp": 0.421875, + "step": 1737, + "time_per_iteration": 2.6871769428253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126175, + "balance_loss_mlp": 1.0870508, + "epoch": 0.33435936898807234, + "flos": 1014096070656.0, + "grad_norm": 0.06336651482337263, + "language_loss": 0.82955027, + "learning_rate": 0.0007761315623774799, + "loss": 0.84081209, + "num_input_tokens_seen": 143753072, + "router_z_loss_mlp": 0.39111328, + "step": 1738, + "time_per_iteration": 3.4055540561676025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113225, + "balance_loss_mlp": 1.09088469, + "epoch": 0.3345517506733359, + "flos": 615221650944.0, + "grad_norm": 0.08278309899958468, + "language_loss": 0.88244802, + "learning_rate": 0.0007758717853574313, + "loss": 0.89377058, + "num_input_tokens_seen": 143827280, + "router_z_loss_mlp": 0.41381836, + "step": 1739, + "time_per_iteration": 2.7666313648223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120554, + "balance_loss_mlp": 1.08114362, + "epoch": 0.33474413235859946, + "flos": 494593703424.0, + "grad_norm": 0.0696820530517557, + "language_loss": 0.90798807, + "learning_rate": 0.0007756119012344571, + "loss": 0.91919363, + "num_input_tokens_seen": 143895072, + "router_z_loss_mlp": 0.39404297, + "step": 1740, + "time_per_iteration": 2.5491223335266113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115915, + "balance_loss_mlp": 1.07428706, + "epoch": 0.33493651404386304, + "flos": 628381338624.0, + "grad_norm": 0.06589349032225494, + "language_loss": 0.85103011, + "learning_rate": 0.0007753519101094535, + "loss": 0.86218929, + "num_input_tokens_seen": 143965728, + "router_z_loss_mlp": 0.41625977, + "step": 1741, + "time_per_iteration": 2.765583038330078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112401, + "balance_loss_mlp": 1.0837177, + "epoch": 0.3351288957291266, + "flos": 513727723008.0, + "grad_norm": 0.0662644502369307, + "language_loss": 0.86452365, + "learning_rate": 0.0007750918120833575, + "loss": 0.87576377, + "num_input_tokens_seen": 144030272, + "router_z_loss_mlp": 0.40283203, + "step": 1742, + "time_per_iteration": 2.6085479259490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140409, + "balance_loss_mlp": 1.10240483, + "epoch": 0.33532127741439016, + "flos": 647302814208.0, + "grad_norm": 0.07280628286033199, + "language_loss": 0.87783647, + "learning_rate": 0.0007748316072571485, + "loss": 0.88924056, + "num_input_tokens_seen": 144104048, + "router_z_loss_mlp": 0.37963867, + "step": 1743, + "time_per_iteration": 2.793119192123413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133272, + "balance_loss_mlp": 1.09259784, + "epoch": 0.3355136590996537, + "flos": 768464506368.0, + "grad_norm": 0.0850070564381928, + "language_loss": 0.79522568, + "learning_rate": 0.0007745712957318467, + "loss": 0.80655837, + "num_input_tokens_seen": 144180432, + "router_z_loss_mlp": 0.40698242, + "step": 1744, + "time_per_iteration": 2.943847417831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137205, + "balance_loss_mlp": 1.09700739, + "epoch": 0.3357060407849173, + "flos": 595536634368.0, + "grad_norm": 0.06831295126385283, + "language_loss": 0.86807823, + "learning_rate": 0.0007743108776085141, + "loss": 0.87945032, + "num_input_tokens_seen": 144258704, + "router_z_loss_mlp": 0.40136719, + "step": 1745, + "time_per_iteration": 2.771634101867676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011368, + "balance_loss_mlp": 1.09743714, + "epoch": 0.3358984224701808, + "flos": 598590425088.0, + "grad_norm": 0.05902486087385494, + "language_loss": 0.83364028, + "learning_rate": 0.0007740503529882543, + "loss": 0.84500825, + "num_input_tokens_seen": 144335104, + "router_z_loss_mlp": 0.39331055, + "step": 1746, + "time_per_iteration": 2.7896366119384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139374, + "balance_loss_mlp": 1.09831822, + "epoch": 0.3360908041554444, + "flos": 578329818624.0, + "grad_norm": 0.061665767711377016, + "language_loss": 0.90955931, + "learning_rate": 0.0007737897219722114, + "loss": 0.92095304, + "num_input_tokens_seen": 144402912, + "router_z_loss_mlp": 0.41088867, + "step": 1747, + "time_per_iteration": 2.7088165283203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129332, + "balance_loss_mlp": 1.08725071, + "epoch": 0.336283185840708, + "flos": 513589330944.0, + "grad_norm": 0.08528813851267185, + "language_loss": 0.81553382, + "learning_rate": 0.0007735289846615716, + "loss": 0.82682711, + "num_input_tokens_seen": 144475328, + "router_z_loss_mlp": 0.42089844, + "step": 1748, + "time_per_iteration": 2.635098934173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129982, + "balance_loss_mlp": 1.09119081, + "epoch": 0.3364755675259715, + "flos": 524974887936.0, + "grad_norm": 0.09169024401551043, + "language_loss": 0.82026851, + "learning_rate": 0.0007732681411575621, + "loss": 0.83156836, + "num_input_tokens_seen": 144548288, + "router_z_loss_mlp": 0.38818359, + "step": 1749, + "time_per_iteration": 2.6693224906921387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134785, + "balance_loss_mlp": 1.09437299, + "epoch": 0.3366679492112351, + "flos": 554869315584.0, + "grad_norm": 0.0698579909367107, + "language_loss": 0.88035583, + "learning_rate": 0.0007730071915614514, + "loss": 0.89170372, + "num_input_tokens_seen": 144619488, + "router_z_loss_mlp": 0.40405273, + "step": 1750, + "time_per_iteration": 2.6900789737701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137991, + "balance_loss_mlp": 1.09800839, + "epoch": 0.33686033089649864, + "flos": 427273698816.0, + "grad_norm": 0.09938227861633823, + "language_loss": 0.89158392, + "learning_rate": 0.0007727461359745489, + "loss": 0.90296388, + "num_input_tokens_seen": 144682560, + "router_z_loss_mlp": 0.3996582, + "step": 1751, + "time_per_iteration": 2.5086123943328857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154901, + "balance_loss_mlp": 1.1132257, + "epoch": 0.3370527125817622, + "flos": 541729451520.0, + "grad_norm": 0.06249007419708336, + "language_loss": 0.86569941, + "learning_rate": 0.0007724849744982056, + "loss": 0.87724847, + "num_input_tokens_seen": 144753328, + "router_z_loss_mlp": 0.41674805, + "step": 1752, + "time_per_iteration": 2.700474739074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169913, + "balance_loss_mlp": 1.12737882, + "epoch": 0.33724509426702576, + "flos": 542114892288.0, + "grad_norm": 0.06015013269361517, + "language_loss": 0.8195309, + "learning_rate": 0.0007722237072338131, + "loss": 0.83123004, + "num_input_tokens_seen": 144827312, + "router_z_loss_mlp": 0.42529297, + "step": 1753, + "time_per_iteration": 2.7111313343048096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119284, + "balance_loss_mlp": 1.14816022, + "epoch": 0.33743747595228935, + "flos": 472796103168.0, + "grad_norm": 0.11537307258838475, + "language_loss": 0.85648489, + "learning_rate": 0.0007719623342828046, + "loss": 0.86841327, + "num_input_tokens_seen": 144893488, + "router_z_loss_mlp": 0.44726562, + "step": 1754, + "time_per_iteration": 2.517010450363159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191581, + "balance_loss_mlp": 1.14685392, + "epoch": 0.33762985763755293, + "flos": 469818662400.0, + "grad_norm": 0.06847069318075473, + "language_loss": 0.84535718, + "learning_rate": 0.000771700855746654, + "loss": 0.85727292, + "num_input_tokens_seen": 144961152, + "router_z_loss_mlp": 0.44750977, + "step": 1755, + "time_per_iteration": 2.5961217880249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164795, + "balance_loss_mlp": 1.1231432, + "epoch": 0.33782223932281646, + "flos": 492251323392.0, + "grad_norm": 0.05626734330263072, + "language_loss": 0.8872534, + "learning_rate": 0.0007714392717268763, + "loss": 0.89890134, + "num_input_tokens_seen": 145030576, + "router_z_loss_mlp": 0.41674805, + "step": 1756, + "time_per_iteration": 2.5784223079681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166558, + "balance_loss_mlp": 1.12185431, + "epoch": 0.33801462100808005, + "flos": 465064892928.0, + "grad_norm": 0.07105398160496887, + "language_loss": 0.8649826, + "learning_rate": 0.0007711775823250273, + "loss": 0.87664813, + "num_input_tokens_seen": 145095648, + "router_z_loss_mlp": 0.44702148, + "step": 1757, + "time_per_iteration": 2.5373613834381104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115594, + "balance_loss_mlp": 1.11207056, + "epoch": 0.3382070026933436, + "flos": 795668189184.0, + "grad_norm": 0.06341765106008965, + "language_loss": 0.83797616, + "learning_rate": 0.0007709157876427039, + "loss": 0.84953558, + "num_input_tokens_seen": 145181248, + "router_z_loss_mlp": 0.43896484, + "step": 1758, + "time_per_iteration": 3.1393754482269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144144, + "balance_loss_mlp": 1.10027504, + "epoch": 0.33839938437860717, + "flos": 508430297088.0, + "grad_norm": 0.0573406658982909, + "language_loss": 0.85933769, + "learning_rate": 0.0007706538877815439, + "loss": 0.8707791, + "num_input_tokens_seen": 145252944, + "router_z_loss_mlp": 0.4387207, + "step": 1759, + "time_per_iteration": 2.6080896854400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152987, + "balance_loss_mlp": 1.11054862, + "epoch": 0.3385917660638707, + "flos": 484243329024.0, + "grad_norm": 0.06135171113161323, + "language_loss": 0.83615482, + "learning_rate": 0.0007703918828432259, + "loss": 0.84768468, + "num_input_tokens_seen": 145323168, + "router_z_loss_mlp": 0.42456055, + "step": 1760, + "time_per_iteration": 2.5886309146881104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148897, + "balance_loss_mlp": 1.10464644, + "epoch": 0.3387841477491343, + "flos": 545339381760.0, + "grad_norm": 0.05937499082636783, + "language_loss": 0.88942921, + "learning_rate": 0.000770129772929469, + "loss": 0.90091813, + "num_input_tokens_seen": 145395776, + "router_z_loss_mlp": 0.44238281, + "step": 1761, + "time_per_iteration": 2.645293951034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140708, + "balance_loss_mlp": 1.09629107, + "epoch": 0.3389765294343978, + "flos": 719801676288.0, + "grad_norm": 0.07244625367361128, + "language_loss": 0.88504505, + "learning_rate": 0.0007698675581420334, + "loss": 0.89645213, + "num_input_tokens_seen": 145470576, + "router_z_loss_mlp": 0.4440918, + "step": 1762, + "time_per_iteration": 2.849560022354126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149138, + "balance_loss_mlp": 1.10469711, + "epoch": 0.3391689111196614, + "flos": 699928708608.0, + "grad_norm": 0.06385607916775927, + "language_loss": 0.79163915, + "learning_rate": 0.0007696052385827199, + "loss": 0.80313051, + "num_input_tokens_seen": 145548896, + "router_z_loss_mlp": 0.44458008, + "step": 1763, + "time_per_iteration": 2.9164280891418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138684, + "balance_loss_mlp": 1.09765172, + "epoch": 0.339361292804925, + "flos": 627093964800.0, + "grad_norm": 0.07477333876977248, + "language_loss": 0.78203613, + "learning_rate": 0.00076934281435337, + "loss": 0.79342294, + "num_input_tokens_seen": 145617136, + "router_z_loss_mlp": 0.41040039, + "step": 1764, + "time_per_iteration": 2.7213284969329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131547, + "balance_loss_mlp": 1.08922768, + "epoch": 0.33955367449018853, + "flos": 609600453120.0, + "grad_norm": 0.0661700543843282, + "language_loss": 0.86476332, + "learning_rate": 0.0007690802855558658, + "loss": 0.87607884, + "num_input_tokens_seen": 145696416, + "router_z_loss_mlp": 0.4230957, + "step": 1765, + "time_per_iteration": 2.8648691177368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144213, + "balance_loss_mlp": 1.12981212, + "epoch": 0.3397460561754521, + "flos": 1453310346240.0, + "grad_norm": 0.0393682164062729, + "language_loss": 0.76374954, + "learning_rate": 0.0007688176522921302, + "loss": 0.77519166, + "num_input_tokens_seen": 145919680, + "router_z_loss_mlp": 0.14355469, + "step": 1766, + "time_per_iteration": 4.883134603500366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138855, + "balance_loss_mlp": 1.09441423, + "epoch": 0.33993843786071565, + "flos": 487312174080.0, + "grad_norm": 0.06478844738748038, + "language_loss": 0.89260793, + "learning_rate": 0.0007685549146641262, + "loss": 0.90399647, + "num_input_tokens_seen": 145984272, + "router_z_loss_mlp": 0.44458008, + "step": 1767, + "time_per_iteration": 2.5584475994110107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138308, + "balance_loss_mlp": 1.09780085, + "epoch": 0.34013081954597923, + "flos": 417338500608.0, + "grad_norm": 0.0552886410345199, + "language_loss": 0.8865279, + "learning_rate": 0.0007682920727738579, + "loss": 0.89791095, + "num_input_tokens_seen": 146047248, + "router_z_loss_mlp": 0.4050293, + "step": 1768, + "time_per_iteration": 2.462104558944702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113405, + "balance_loss_mlp": 1.09170651, + "epoch": 0.34032320123124277, + "flos": 437520185856.0, + "grad_norm": 0.07550967393636049, + "language_loss": 0.84987569, + "learning_rate": 0.000768029126723369, + "loss": 0.86121619, + "num_input_tokens_seen": 146111872, + "router_z_loss_mlp": 0.42333984, + "step": 1769, + "time_per_iteration": 2.5362985134124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134777, + "balance_loss_mlp": 1.09360242, + "epoch": 0.34051558291650635, + "flos": 457590643200.0, + "grad_norm": 0.0745429404709064, + "language_loss": 0.82167029, + "learning_rate": 0.0007677660766147447, + "loss": 0.83301806, + "num_input_tokens_seen": 146172608, + "router_z_loss_mlp": 0.41186523, + "step": 1770, + "time_per_iteration": 2.516824960708618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079488, + "balance_loss_mlp": 1.06356168, + "epoch": 0.3407079646017699, + "flos": 1558849204224.0, + "grad_norm": 0.02503514207226814, + "language_loss": 0.72470945, + "learning_rate": 0.0007675029225501102, + "loss": 0.73550433, + "num_input_tokens_seen": 146413584, + "router_z_loss_mlp": 0.15917969, + "step": 1771, + "time_per_iteration": 4.943475008010864 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137395, + "balance_loss_mlp": 1.09543359, + "epoch": 0.3409003462870335, + "flos": 492555271680.0, + "grad_norm": 0.06960763795190199, + "language_loss": 0.80136019, + "learning_rate": 0.0007672396646316306, + "loss": 0.81273413, + "num_input_tokens_seen": 146476992, + "router_z_loss_mlp": 0.41918945, + "step": 1772, + "time_per_iteration": 2.5425803661346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145424, + "balance_loss_mlp": 1.10341442, + "epoch": 0.34109272797229706, + "flos": 808479512064.0, + "grad_norm": 0.05748114386543088, + "language_loss": 0.80760133, + "learning_rate": 0.000766976302961512, + "loss": 0.81905556, + "num_input_tokens_seen": 146552848, + "router_z_loss_mlp": 0.42041016, + "step": 1773, + "time_per_iteration": 2.982287645339966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155937, + "balance_loss_mlp": 1.11330807, + "epoch": 0.3412851096575606, + "flos": 470142434304.0, + "grad_norm": 0.06912006035569716, + "language_loss": 0.81549138, + "learning_rate": 0.0007667128376420003, + "loss": 0.82705075, + "num_input_tokens_seen": 146617504, + "router_z_loss_mlp": 0.42626953, + "step": 1774, + "time_per_iteration": 2.5396063327789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151156, + "balance_loss_mlp": 1.10926604, + "epoch": 0.3414774913428242, + "flos": 595675026432.0, + "grad_norm": 0.07768471353958366, + "language_loss": 0.84963071, + "learning_rate": 0.0007664492687753817, + "loss": 0.86114228, + "num_input_tokens_seen": 146691568, + "router_z_loss_mlp": 0.41894531, + "step": 1775, + "time_per_iteration": 2.7326042652130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139013, + "balance_loss_mlp": 1.09845805, + "epoch": 0.3416698730280877, + "flos": 527463000576.0, + "grad_norm": 0.10552495092435867, + "language_loss": 0.81927752, + "learning_rate": 0.000766185596463983, + "loss": 0.83066773, + "num_input_tokens_seen": 146764208, + "router_z_loss_mlp": 0.40551758, + "step": 1776, + "time_per_iteration": 2.622465133666992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126657, + "balance_loss_mlp": 1.08455205, + "epoch": 0.3418622547133513, + "flos": 874640623104.0, + "grad_norm": 0.06005887645947995, + "language_loss": 0.77224028, + "learning_rate": 0.0007659218208101706, + "loss": 0.78350687, + "num_input_tokens_seen": 146847744, + "router_z_loss_mlp": 0.42114258, + "step": 1777, + "time_per_iteration": 3.099862575531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124902, + "balance_loss_mlp": 1.0852288, + "epoch": 0.34205463639861483, + "flos": 603744689664.0, + "grad_norm": 0.057585659974550854, + "language_loss": 0.85272229, + "learning_rate": 0.0007656579419163515, + "loss": 0.86397129, + "num_input_tokens_seen": 146918336, + "router_z_loss_mlp": 0.39672852, + "step": 1778, + "time_per_iteration": 2.7696709632873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129572, + "balance_loss_mlp": 1.08794475, + "epoch": 0.3422470180838784, + "flos": 463780090368.0, + "grad_norm": 0.07376046533358642, + "language_loss": 0.77272999, + "learning_rate": 0.0007653939598849724, + "loss": 0.78402567, + "num_input_tokens_seen": 146982496, + "router_z_loss_mlp": 0.41650391, + "step": 1779, + "time_per_iteration": 2.5601556301116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131616, + "balance_loss_mlp": 1.11511779, + "epoch": 0.34243939976914195, + "flos": 1586428416000.0, + "grad_norm": 0.05276839393693404, + "language_loss": 0.82880205, + "learning_rate": 0.0007651298748185204, + "loss": 0.84011823, + "num_input_tokens_seen": 147213600, + "router_z_loss_mlp": 0.16503906, + "step": 1780, + "time_per_iteration": 4.96061897277832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112473, + "balance_loss_mlp": 1.08267307, + "epoch": 0.34263178145440554, + "flos": 873017367552.0, + "grad_norm": 0.07129012841004771, + "language_loss": 0.80831903, + "learning_rate": 0.000764865686819522, + "loss": 0.81956631, + "num_input_tokens_seen": 147287664, + "router_z_loss_mlp": 0.4206543, + "step": 1781, + "time_per_iteration": 3.089735507965088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126433, + "balance_loss_mlp": 1.08492422, + "epoch": 0.3428241631396691, + "flos": 506878622208.0, + "grad_norm": 0.0622927262326037, + "language_loss": 0.86375809, + "learning_rate": 0.0007646013959905449, + "loss": 0.87502241, + "num_input_tokens_seen": 147356800, + "router_z_loss_mlp": 0.41503906, + "step": 1782, + "time_per_iteration": 2.6112704277038574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123127, + "balance_loss_mlp": 1.08130884, + "epoch": 0.34301654482493266, + "flos": 880039365120.0, + "grad_norm": 0.10167310682771787, + "language_loss": 0.81018484, + "learning_rate": 0.0007643370024341949, + "loss": 0.82141614, + "num_input_tokens_seen": 147432496, + "router_z_loss_mlp": 0.41821289, + "step": 1783, + "time_per_iteration": 3.1074132919311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115046, + "balance_loss_mlp": 1.07563567, + "epoch": 0.34320892651019624, + "flos": 431763167232.0, + "grad_norm": 0.057781870331099924, + "language_loss": 0.83518296, + "learning_rate": 0.0007640725062531195, + "loss": 0.84633338, + "num_input_tokens_seen": 147495856, + "router_z_loss_mlp": 0.39404297, + "step": 1784, + "time_per_iteration": 2.491313934326172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112121, + "balance_loss_mlp": 1.07228112, + "epoch": 0.3434013081954598, + "flos": 463641698304.0, + "grad_norm": 0.12476428026998775, + "language_loss": 0.86600161, + "learning_rate": 0.0007638079075500047, + "loss": 0.87712288, + "num_input_tokens_seen": 147559632, + "router_z_loss_mlp": 0.39819336, + "step": 1785, + "time_per_iteration": 2.5236706733703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070785, + "balance_loss_mlp": 1.05457258, + "epoch": 0.34359368988072336, + "flos": 1557332034048.0, + "grad_norm": 0.032988320908807454, + "language_loss": 0.75180668, + "learning_rate": 0.0007635432064275772, + "loss": 0.76251453, + "num_input_tokens_seen": 147794576, + "router_z_loss_mlp": 0.16210938, + "step": 1786, + "time_per_iteration": 4.938300609588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011343, + "balance_loss_mlp": 1.09274352, + "epoch": 0.3437860715659869, + "flos": 495527569920.0, + "grad_norm": 0.06899034270313556, + "language_loss": 0.83409935, + "learning_rate": 0.0007632784029886026, + "loss": 0.84544241, + "num_input_tokens_seen": 147866960, + "router_z_loss_mlp": 0.41552734, + "step": 1787, + "time_per_iteration": 2.6218347549438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140121, + "balance_loss_mlp": 1.09968519, + "epoch": 0.3439784532512505, + "flos": 718274594304.0, + "grad_norm": 0.05777013506444436, + "language_loss": 0.85674673, + "learning_rate": 0.0007630134973358873, + "loss": 0.86814797, + "num_input_tokens_seen": 147947808, + "router_z_loss_mlp": 0.40429688, + "step": 1788, + "time_per_iteration": 2.9675180912017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156957, + "balance_loss_mlp": 1.11780846, + "epoch": 0.34417083493651407, + "flos": 565862091264.0, + "grad_norm": 0.11323624876812292, + "language_loss": 0.86969185, + "learning_rate": 0.0007627484895722763, + "loss": 0.88126147, + "num_input_tokens_seen": 148015936, + "router_z_loss_mlp": 0.39160156, + "step": 1789, + "time_per_iteration": 2.6400198936462402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164783, + "balance_loss_mlp": 1.1222018, + "epoch": 0.3443632166217776, + "flos": 796330414080.0, + "grad_norm": 0.06957715435201431, + "language_loss": 0.80509681, + "learning_rate": 0.0007624833798006552, + "loss": 0.81674469, + "num_input_tokens_seen": 148099776, + "router_z_loss_mlp": 0.42602539, + "step": 1790, + "time_per_iteration": 3.042621374130249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162497, + "balance_loss_mlp": 1.11924767, + "epoch": 0.3445555983070412, + "flos": 569313805824.0, + "grad_norm": 0.09367673394256656, + "language_loss": 0.84194326, + "learning_rate": 0.0007622181681239483, + "loss": 0.85356832, + "num_input_tokens_seen": 148169616, + "router_z_loss_mlp": 0.43261719, + "step": 1791, + "time_per_iteration": 2.642648220062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140416, + "balance_loss_mlp": 1.09907472, + "epoch": 0.3447479799923047, + "flos": 568814565888.0, + "grad_norm": 0.07487034842421487, + "language_loss": 0.84962463, + "learning_rate": 0.0007619528546451202, + "loss": 0.86102873, + "num_input_tokens_seen": 148247824, + "router_z_loss_mlp": 0.41333008, + "step": 1792, + "time_per_iteration": 2.8014347553253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130282, + "balance_loss_mlp": 1.08941662, + "epoch": 0.3449403616775683, + "flos": 967723863552.0, + "grad_norm": 0.05771787988130437, + "language_loss": 0.84187096, + "learning_rate": 0.0007616874394671745, + "loss": 0.85317373, + "num_input_tokens_seen": 148333040, + "router_z_loss_mlp": 0.40869141, + "step": 1793, + "time_per_iteration": 3.336076498031616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137422, + "balance_loss_mlp": 1.09276664, + "epoch": 0.34513274336283184, + "flos": 568607164416.0, + "grad_norm": 0.08239177777048284, + "language_loss": 0.85433841, + "learning_rate": 0.0007614219226931547, + "loss": 0.86571258, + "num_input_tokens_seen": 148401840, + "router_z_loss_mlp": 0.44677734, + "step": 1794, + "time_per_iteration": 2.6596035957336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136792, + "balance_loss_mlp": 1.0951401, + "epoch": 0.3453251250480954, + "flos": 460943612928.0, + "grad_norm": 0.06809904369873732, + "language_loss": 0.85092592, + "learning_rate": 0.0007611563044261435, + "loss": 0.86229378, + "num_input_tokens_seen": 148466576, + "router_z_loss_mlp": 0.41674805, + "step": 1795, + "time_per_iteration": 2.545440435409546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140576, + "balance_loss_mlp": 1.09601521, + "epoch": 0.34551750673335896, + "flos": 415621269504.0, + "grad_norm": 0.08865061616635866, + "language_loss": 0.8722235, + "learning_rate": 0.0007608905847692631, + "loss": 0.88362932, + "num_input_tokens_seen": 148530016, + "router_z_loss_mlp": 0.44555664, + "step": 1796, + "time_per_iteration": 2.471306800842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112492, + "balance_loss_mlp": 1.08486605, + "epoch": 0.34570988841862255, + "flos": 587854609920.0, + "grad_norm": 0.07442154430907115, + "language_loss": 0.86828166, + "learning_rate": 0.0007606247638256749, + "loss": 0.87953079, + "num_input_tokens_seen": 148610064, + "router_z_loss_mlp": 0.40039062, + "step": 1797, + "time_per_iteration": 2.8728272914886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045486, + "balance_loss_mlp": 1.03099036, + "epoch": 0.34590227010388613, + "flos": 1567694518272.0, + "grad_norm": 0.022391201486326673, + "language_loss": 0.78170294, + "learning_rate": 0.0007603588416985798, + "loss": 0.79215777, + "num_input_tokens_seen": 148835872, + "router_z_loss_mlp": 0.14453125, + "step": 1798, + "time_per_iteration": 4.99533486366272 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036587, + "balance_loss_mlp": 1.0224725, + "epoch": 0.34609465178914967, + "flos": 1537743564288.0, + "grad_norm": 0.020693498138200886, + "language_loss": 0.79327202, + "learning_rate": 0.0007600928184912179, + "loss": 0.80363786, + "num_input_tokens_seen": 149066864, + "router_z_loss_mlp": 0.14160156, + "step": 1799, + "time_per_iteration": 4.871920347213745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131321, + "balance_loss_mlp": 1.086761, + "epoch": 0.34628703347441325, + "flos": 609363316224.0, + "grad_norm": 0.06425687332848114, + "language_loss": 0.8622126, + "learning_rate": 0.0007598266943068686, + "loss": 0.8735258, + "num_input_tokens_seen": 149141600, + "router_z_loss_mlp": 0.44555664, + "step": 1800, + "time_per_iteration": 2.7352967262268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128705, + "balance_loss_mlp": 1.0892942, + "epoch": 0.3464794151596768, + "flos": 473319936000.0, + "grad_norm": 0.06122285990583016, + "language_loss": 0.84089196, + "learning_rate": 0.0007595604692488507, + "loss": 0.85217899, + "num_input_tokens_seen": 149205888, + "router_z_loss_mlp": 0.39404297, + "step": 1801, + "time_per_iteration": 2.520047664642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145052, + "balance_loss_mlp": 1.10182643, + "epoch": 0.34667179684494037, + "flos": 605681805312.0, + "grad_norm": 0.08959882775364528, + "language_loss": 0.83156121, + "learning_rate": 0.0007592941434205215, + "loss": 0.84301168, + "num_input_tokens_seen": 149281280, + "router_z_loss_mlp": 0.43237305, + "step": 1802, + "time_per_iteration": 2.774533987045288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102388, + "balance_loss_mlp": 1.01191127, + "epoch": 0.3468641785302039, + "flos": 1564912369152.0, + "grad_norm": 0.0173366039721641, + "language_loss": 0.73571062, + "learning_rate": 0.0007590277169252782, + "loss": 0.74594939, + "num_input_tokens_seen": 149525008, + "router_z_loss_mlp": 0.11962891, + "step": 1803, + "time_per_iteration": 5.441190004348755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130945, + "balance_loss_mlp": 1.08481145, + "epoch": 0.3470565602154675, + "flos": 907265442816.0, + "grad_norm": 0.07392614166366455, + "language_loss": 0.80754089, + "learning_rate": 0.0007587611898665566, + "loss": 0.81885034, + "num_input_tokens_seen": 149600624, + "router_z_loss_mlp": 0.4609375, + "step": 1804, + "time_per_iteration": 3.0738565921783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126694, + "balance_loss_mlp": 1.08320653, + "epoch": 0.347248941900731, + "flos": 638902038528.0, + "grad_norm": 0.052717282161679486, + "language_loss": 0.82365519, + "learning_rate": 0.0007584945623478315, + "loss": 0.83492208, + "num_input_tokens_seen": 149674224, + "router_z_loss_mlp": 0.43530273, + "step": 1805, + "time_per_iteration": 2.810065269470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130527, + "balance_loss_mlp": 1.08773112, + "epoch": 0.3474413235859946, + "flos": 847362788352.0, + "grad_norm": 0.0654216117506123, + "language_loss": 0.81839657, + "learning_rate": 0.000758227834472617, + "loss": 0.8297019, + "num_input_tokens_seen": 149758688, + "router_z_loss_mlp": 0.42822266, + "step": 1806, + "time_per_iteration": 3.0400753021240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129234, + "balance_loss_mlp": 1.08631909, + "epoch": 0.3476337052712582, + "flos": 515654926848.0, + "grad_norm": 0.06780310502945991, + "language_loss": 0.77468187, + "learning_rate": 0.0007579610063444664, + "loss": 0.78597426, + "num_input_tokens_seen": 149831648, + "router_z_loss_mlp": 0.42895508, + "step": 1807, + "time_per_iteration": 2.720200538635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123608, + "balance_loss_mlp": 1.0805254, + "epoch": 0.34782608695652173, + "flos": 913551063552.0, + "grad_norm": 0.056464817781099026, + "language_loss": 0.87875664, + "learning_rate": 0.0007576940780669712, + "loss": 0.88999271, + "num_input_tokens_seen": 149919440, + "router_z_loss_mlp": 0.4309082, + "step": 1808, + "time_per_iteration": 3.1972455978393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119319, + "balance_loss_mlp": 1.07723832, + "epoch": 0.3480184686417853, + "flos": 773714944512.0, + "grad_norm": 0.06350201854913072, + "language_loss": 0.84194762, + "learning_rate": 0.0007574270497437624, + "loss": 0.85314083, + "num_input_tokens_seen": 150001632, + "router_z_loss_mlp": 0.42089844, + "step": 1809, + "time_per_iteration": 2.956308364868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112252, + "balance_loss_mlp": 1.08036816, + "epoch": 0.34821085032704885, + "flos": 576839812608.0, + "grad_norm": 0.05949268624371524, + "language_loss": 0.88030243, + "learning_rate": 0.000757159921478509, + "loss": 0.89152765, + "num_input_tokens_seen": 150077552, + "router_z_loss_mlp": 0.42138672, + "step": 1810, + "time_per_iteration": 2.7515318393707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058755, + "balance_loss_mlp": 1.04769194, + "epoch": 0.34840323201231244, + "flos": 1524947295744.0, + "grad_norm": 0.027450813841054106, + "language_loss": 0.74450636, + "learning_rate": 0.0007568926933749201, + "loss": 0.75509393, + "num_input_tokens_seen": 150295328, + "router_z_loss_mlp": 0.11083984, + "step": 1811, + "time_per_iteration": 4.719837427139282 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136641, + "balance_loss_mlp": 1.09272385, + "epoch": 0.34859561369757597, + "flos": 509164102656.0, + "grad_norm": 0.06099509375847796, + "language_loss": 0.87676752, + "learning_rate": 0.0007566253655367423, + "loss": 0.88813394, + "num_input_tokens_seen": 150360496, + "router_z_loss_mlp": 0.43896484, + "step": 1812, + "time_per_iteration": 2.6117310523986816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145498, + "balance_loss_mlp": 1.10196316, + "epoch": 0.34878799538283956, + "flos": 548662616064.0, + "grad_norm": 0.26075237363376164, + "language_loss": 0.90086293, + "learning_rate": 0.000756357938067762, + "loss": 0.91231787, + "num_input_tokens_seen": 150432064, + "router_z_loss_mlp": 0.43554688, + "step": 1813, + "time_per_iteration": 2.6537845134735107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137235, + "balance_loss_mlp": 1.09305573, + "epoch": 0.34898037706810314, + "flos": 983638536192.0, + "grad_norm": 0.07803772738029488, + "language_loss": 0.8299284, + "learning_rate": 0.0007560904110718033, + "loss": 0.84130079, + "num_input_tokens_seen": 150512176, + "router_z_loss_mlp": 0.44165039, + "step": 1814, + "time_per_iteration": 3.2229981422424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131299, + "balance_loss_mlp": 1.08549881, + "epoch": 0.3491727587533667, + "flos": 681605217792.0, + "grad_norm": 0.06602375994559181, + "language_loss": 0.83648008, + "learning_rate": 0.0007558227846527297, + "loss": 0.8477931, + "num_input_tokens_seen": 150586416, + "router_z_loss_mlp": 0.45751953, + "step": 1815, + "time_per_iteration": 2.8217966556549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137186, + "balance_loss_mlp": 1.09300709, + "epoch": 0.34936514043863026, + "flos": 394026301440.0, + "grad_norm": 0.06552880481969095, + "language_loss": 0.83563447, + "learning_rate": 0.0007555550589144429, + "loss": 0.84700632, + "num_input_tokens_seen": 150648944, + "router_z_loss_mlp": 0.44189453, + "step": 1816, + "time_per_iteration": 2.4231276512145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148289, + "balance_loss_mlp": 1.1026082, + "epoch": 0.3495575221238938, + "flos": 461363558400.0, + "grad_norm": 0.05960251663438414, + "language_loss": 0.84705317, + "learning_rate": 0.000755287233960883, + "loss": 0.85853606, + "num_input_tokens_seen": 150717200, + "router_z_loss_mlp": 0.45678711, + "step": 1817, + "time_per_iteration": 2.5598244667053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148754, + "balance_loss_mlp": 1.10297787, + "epoch": 0.3497499038091574, + "flos": 724172576256.0, + "grad_norm": 0.06564730471203778, + "language_loss": 0.78051704, + "learning_rate": 0.0007550193098960292, + "loss": 0.79200459, + "num_input_tokens_seen": 150790368, + "router_z_loss_mlp": 0.45751953, + "step": 1818, + "time_per_iteration": 2.8570642471313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115418, + "balance_loss_mlp": 1.11033523, + "epoch": 0.3499422854944209, + "flos": 827729528832.0, + "grad_norm": 0.05538445579726575, + "language_loss": 0.8654325, + "learning_rate": 0.0007547512868238988, + "loss": 0.87697428, + "num_input_tokens_seen": 150879872, + "router_z_loss_mlp": 0.43847656, + "step": 1819, + "time_per_iteration": 3.1437833309173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170578, + "balance_loss_mlp": 1.12499213, + "epoch": 0.3501346671796845, + "flos": 493479226368.0, + "grad_norm": 0.0822966351911203, + "language_loss": 0.83893883, + "learning_rate": 0.0007544831648485473, + "loss": 0.85064459, + "num_input_tokens_seen": 150953712, + "router_z_loss_mlp": 0.45605469, + "step": 1820, + "time_per_iteration": 2.660233736038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162235, + "balance_loss_mlp": 1.11684048, + "epoch": 0.35032704886494803, + "flos": 578752335360.0, + "grad_norm": 0.06443547558053964, + "language_loss": 0.81439716, + "learning_rate": 0.0007542149440740694, + "loss": 0.82601953, + "num_input_tokens_seen": 151026192, + "router_z_loss_mlp": 0.45385742, + "step": 1821, + "time_per_iteration": 2.6618528366088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154684, + "balance_loss_mlp": 1.10938418, + "epoch": 0.3505194305502116, + "flos": 584672338944.0, + "grad_norm": 0.06960442221541481, + "language_loss": 0.86201102, + "learning_rate": 0.000753946624604597, + "loss": 0.87355781, + "num_input_tokens_seen": 151100720, + "router_z_loss_mlp": 0.45288086, + "step": 1822, + "time_per_iteration": 2.7180583477020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138697, + "balance_loss_mlp": 1.09466076, + "epoch": 0.3507118122354752, + "flos": 526958991360.0, + "grad_norm": 0.11840223630221765, + "language_loss": 0.88456279, + "learning_rate": 0.0007536782065443015, + "loss": 0.89594972, + "num_input_tokens_seen": 151166032, + "router_z_loss_mlp": 0.44042969, + "step": 1823, + "time_per_iteration": 2.6035680770874023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147734, + "balance_loss_mlp": 1.1024822, + "epoch": 0.35090419392073874, + "flos": 511523735040.0, + "grad_norm": 0.08971754998357863, + "language_loss": 0.75357497, + "learning_rate": 0.0007534096899973919, + "loss": 0.76505232, + "num_input_tokens_seen": 151232208, + "router_z_loss_mlp": 0.45263672, + "step": 1824, + "time_per_iteration": 2.592313528060913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136405, + "balance_loss_mlp": 1.095397, + "epoch": 0.3510965756060023, + "flos": 564021149184.0, + "grad_norm": 0.056380284358423516, + "language_loss": 0.8296026, + "learning_rate": 0.0007531410750681154, + "loss": 0.84096658, + "num_input_tokens_seen": 151308128, + "router_z_loss_mlp": 0.41015625, + "step": 1825, + "time_per_iteration": 2.7599031925201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149352, + "balance_loss_mlp": 1.10710466, + "epoch": 0.35128895729126586, + "flos": 1020535137792.0, + "grad_norm": 0.06329210930184016, + "language_loss": 0.8686763, + "learning_rate": 0.0007528723618607575, + "loss": 0.88016987, + "num_input_tokens_seen": 151402560, + "router_z_loss_mlp": 0.42236328, + "step": 1826, + "time_per_iteration": 3.423145055770874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156709, + "balance_loss_mlp": 1.11808527, + "epoch": 0.35148133897652944, + "flos": 588262445568.0, + "grad_norm": 0.05752886424443174, + "language_loss": 0.8293525, + "learning_rate": 0.0007526035504796422, + "loss": 0.84091961, + "num_input_tokens_seen": 151478816, + "router_z_loss_mlp": 0.38598633, + "step": 1827, + "time_per_iteration": 2.774202346801758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164193, + "balance_loss_mlp": 1.12080038, + "epoch": 0.351673720661793, + "flos": 495300344832.0, + "grad_norm": 0.08334994788856638, + "language_loss": 0.87348354, + "learning_rate": 0.0007523346410291312, + "loss": 0.8851254, + "num_input_tokens_seen": 151554528, + "router_z_loss_mlp": 0.43408203, + "step": 1828, + "time_per_iteration": 2.7933921813964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172191, + "balance_loss_mlp": 1.13127816, + "epoch": 0.35186610234705656, + "flos": 762670411776.0, + "grad_norm": 0.05847449829546615, + "language_loss": 0.85163879, + "learning_rate": 0.0007520656336136245, + "loss": 0.86336064, + "num_input_tokens_seen": 151629440, + "router_z_loss_mlp": 0.40942383, + "step": 1829, + "time_per_iteration": 2.9654810428619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167386, + "balance_loss_mlp": 1.12675905, + "epoch": 0.3520584840323201, + "flos": 626135132160.0, + "grad_norm": 0.06508844853371867, + "language_loss": 0.88540596, + "learning_rate": 0.0007517965283375599, + "loss": 0.89707983, + "num_input_tokens_seen": 151708544, + "router_z_loss_mlp": 0.40625, + "step": 1830, + "time_per_iteration": 2.833653211593628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161789, + "balance_loss_mlp": 1.12078059, + "epoch": 0.3522508657175837, + "flos": 537388286976.0, + "grad_norm": 0.05306701185260888, + "language_loss": 0.89636958, + "learning_rate": 0.0007515273253054132, + "loss": 0.90798748, + "num_input_tokens_seen": 151779152, + "router_z_loss_mlp": 0.41015625, + "step": 1831, + "time_per_iteration": 2.648688554763794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162371, + "balance_loss_mlp": 1.11788237, + "epoch": 0.35244324740284727, + "flos": 567384030720.0, + "grad_norm": 0.060637132075448665, + "language_loss": 0.8317945, + "learning_rate": 0.0007512580246216988, + "loss": 0.84341824, + "num_input_tokens_seen": 151853216, + "router_z_loss_mlp": 0.44482422, + "step": 1832, + "time_per_iteration": 2.695558786392212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152178, + "balance_loss_mlp": 1.11288619, + "epoch": 0.3526356290881108, + "flos": 513058157568.0, + "grad_norm": 0.06652239867864222, + "language_loss": 0.8520152, + "learning_rate": 0.000750988626390968, + "loss": 0.86353695, + "num_input_tokens_seen": 151920416, + "router_z_loss_mlp": 0.39306641, + "step": 1833, + "time_per_iteration": 2.5903215408325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114923, + "balance_loss_mlp": 1.10810232, + "epoch": 0.3528280107733744, + "flos": 595791023616.0, + "grad_norm": 0.05520517467567221, + "language_loss": 0.85274744, + "learning_rate": 0.0007507191307178108, + "loss": 0.86423969, + "num_input_tokens_seen": 151990848, + "router_z_loss_mlp": 0.41137695, + "step": 1834, + "time_per_iteration": 2.7567453384399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132557, + "balance_loss_mlp": 1.0890696, + "epoch": 0.3530203924586379, + "flos": 551234792448.0, + "grad_norm": 0.06897138795442613, + "language_loss": 0.75032014, + "learning_rate": 0.0007504495377068543, + "loss": 0.76164567, + "num_input_tokens_seen": 152064864, + "router_z_loss_mlp": 0.43481445, + "step": 1835, + "time_per_iteration": 2.7309370040893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134622, + "balance_loss_mlp": 1.08972788, + "epoch": 0.3532127741439015, + "flos": 652990450176.0, + "grad_norm": 0.09099083327189633, + "language_loss": 0.81936944, + "learning_rate": 0.0007501798474627642, + "loss": 0.8307156, + "num_input_tokens_seen": 152150096, + "router_z_loss_mlp": 0.44873047, + "step": 1836, + "time_per_iteration": 2.9126806259155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113171, + "balance_loss_mlp": 1.08853245, + "epoch": 0.35340515582916504, + "flos": 722791226880.0, + "grad_norm": 0.058808043239055564, + "language_loss": 0.8375026, + "learning_rate": 0.0007499100600902433, + "loss": 0.84881973, + "num_input_tokens_seen": 152232528, + "router_z_loss_mlp": 0.43164062, + "step": 1837, + "time_per_iteration": 2.9810633659362793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124171, + "balance_loss_mlp": 1.08118403, + "epoch": 0.35359753751442863, + "flos": 594894233088.0, + "grad_norm": 0.08552727697149294, + "language_loss": 0.8450433, + "learning_rate": 0.0007496401756940324, + "loss": 0.85628498, + "num_input_tokens_seen": 152299584, + "router_z_loss_mlp": 0.43017578, + "step": 1838, + "time_per_iteration": 2.670412540435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130914, + "balance_loss_mlp": 1.08897638, + "epoch": 0.3537899191996922, + "flos": 632668174848.0, + "grad_norm": 0.06964876492363449, + "language_loss": 0.82608843, + "learning_rate": 0.0007493701943789098, + "loss": 0.83739758, + "num_input_tokens_seen": 152370368, + "router_z_loss_mlp": 0.41967773, + "step": 1839, + "time_per_iteration": 2.772620677947998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113855, + "balance_loss_mlp": 1.09537208, + "epoch": 0.35398230088495575, + "flos": 506364701184.0, + "grad_norm": 0.07045943234490067, + "language_loss": 0.83116889, + "learning_rate": 0.000749100116249692, + "loss": 0.84255433, + "num_input_tokens_seen": 152436928, + "router_z_loss_mlp": 0.43188477, + "step": 1840, + "time_per_iteration": 2.6031582355499268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144616, + "balance_loss_mlp": 1.10110414, + "epoch": 0.35417468257021933, + "flos": 508034944512.0, + "grad_norm": 0.08424265710124153, + "language_loss": 0.86582088, + "learning_rate": 0.0007488299414112321, + "loss": 0.87726706, + "num_input_tokens_seen": 152505952, + "router_z_loss_mlp": 0.43505859, + "step": 1841, + "time_per_iteration": 2.5864784717559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151277, + "balance_loss_mlp": 1.10726476, + "epoch": 0.35436706425548287, + "flos": 656437395456.0, + "grad_norm": 0.058600000923872894, + "language_loss": 0.77847576, + "learning_rate": 0.0007485596699684215, + "loss": 0.78998852, + "num_input_tokens_seen": 152577408, + "router_z_loss_mlp": 0.43994141, + "step": 1842, + "time_per_iteration": 2.8149642944335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156484, + "balance_loss_mlp": 1.11266279, + "epoch": 0.35455944594074645, + "flos": 652634744832.0, + "grad_norm": 0.055073821734726955, + "language_loss": 0.85694617, + "learning_rate": 0.000748289302026189, + "loss": 0.86851102, + "num_input_tokens_seen": 152654480, + "router_z_loss_mlp": 0.43823242, + "step": 1843, + "time_per_iteration": 2.8475751876831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158372, + "balance_loss_mlp": 1.11688685, + "epoch": 0.35475182762601, + "flos": 848593262592.0, + "grad_norm": 0.057565803102883874, + "language_loss": 0.85718876, + "learning_rate": 0.0007480188376895004, + "loss": 0.86877251, + "num_input_tokens_seen": 152732304, + "router_z_loss_mlp": 0.41479492, + "step": 1844, + "time_per_iteration": 3.0344529151916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140478, + "balance_loss_mlp": 1.12693632, + "epoch": 0.3549442093112736, + "flos": 1521468043776.0, + "grad_norm": 0.05127204690943662, + "language_loss": 0.7381134, + "learning_rate": 0.0007477482770633596, + "loss": 0.74951822, + "num_input_tokens_seen": 152965952, + "router_z_loss_mlp": 0.13574219, + "step": 1845, + "time_per_iteration": 4.8589537143707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176931, + "balance_loss_mlp": 1.13518405, + "epoch": 0.3551365909965371, + "flos": 651411611136.0, + "grad_norm": 0.08988090291235612, + "language_loss": 0.78641856, + "learning_rate": 0.0007474776202528074, + "loss": 0.79818785, + "num_input_tokens_seen": 153053088, + "router_z_loss_mlp": 0.41772461, + "step": 1846, + "time_per_iteration": 2.9269866943359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184559, + "balance_loss_mlp": 1.14243031, + "epoch": 0.3553289726818007, + "flos": 897458724864.0, + "grad_norm": 0.08000045078310114, + "language_loss": 0.81513619, + "learning_rate": 0.000747206867362922, + "loss": 0.82698178, + "num_input_tokens_seen": 153129216, + "router_z_loss_mlp": 0.42114258, + "step": 1847, + "time_per_iteration": 3.067870616912842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169442, + "balance_loss_mlp": 1.12573957, + "epoch": 0.3555213543670643, + "flos": 688491394560.0, + "grad_norm": 0.0760432300690223, + "language_loss": 0.84328806, + "learning_rate": 0.0007469360184988194, + "loss": 0.85498255, + "num_input_tokens_seen": 153199360, + "router_z_loss_mlp": 0.43701172, + "step": 1848, + "time_per_iteration": 2.8130369186401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159569, + "balance_loss_mlp": 1.11837053, + "epoch": 0.3557137360523278, + "flos": 538564432896.0, + "grad_norm": 0.08168000095068725, + "language_loss": 0.86707914, + "learning_rate": 0.0007466650737656518, + "loss": 0.87867486, + "num_input_tokens_seen": 153269168, + "router_z_loss_mlp": 0.41162109, + "step": 1849, + "time_per_iteration": 2.592503309249878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115621, + "balance_loss_mlp": 1.11324644, + "epoch": 0.3559061177375914, + "flos": 402261520896.0, + "grad_norm": 0.06757272046168854, + "language_loss": 0.89898217, + "learning_rate": 0.0007463940332686098, + "loss": 0.91054422, + "num_input_tokens_seen": 153333120, + "router_z_loss_mlp": 0.42944336, + "step": 1850, + "time_per_iteration": 2.4776744842529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148398, + "balance_loss_mlp": 1.10607898, + "epoch": 0.35609849942285493, + "flos": 696568398336.0, + "grad_norm": 0.05922624538442341, + "language_loss": 0.84461212, + "learning_rate": 0.0007461228971129205, + "loss": 0.85609609, + "num_input_tokens_seen": 153407600, + "router_z_loss_mlp": 0.42358398, + "step": 1851, + "time_per_iteration": 2.9012656211853027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154694, + "balance_loss_mlp": 1.11387658, + "epoch": 0.3562908811081185, + "flos": 568928365056.0, + "grad_norm": 0.058626739978073765, + "language_loss": 0.85743707, + "learning_rate": 0.0007458516654038483, + "loss": 0.86898398, + "num_input_tokens_seen": 153477408, + "router_z_loss_mlp": 0.40820312, + "step": 1852, + "time_per_iteration": 2.666947603225708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165665, + "balance_loss_mlp": 1.12160563, + "epoch": 0.35648326279338205, + "flos": 682386011136.0, + "grad_norm": 0.06798765543406252, + "language_loss": 0.86475062, + "learning_rate": 0.0007455803382466946, + "loss": 0.87640727, + "num_input_tokens_seen": 153551888, + "router_z_loss_mlp": 0.44042969, + "step": 1853, + "time_per_iteration": 2.804776191711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162987, + "balance_loss_mlp": 1.11985719, + "epoch": 0.35667564447864564, + "flos": 629139737088.0, + "grad_norm": 0.07311152518110202, + "language_loss": 0.87308323, + "learning_rate": 0.0007453089157467979, + "loss": 0.88471317, + "num_input_tokens_seen": 153626912, + "router_z_loss_mlp": 0.43139648, + "step": 1854, + "time_per_iteration": 2.8038864135742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159292, + "balance_loss_mlp": 1.1161381, + "epoch": 0.35686802616390917, + "flos": 814048579584.0, + "grad_norm": 0.06621845487790666, + "language_loss": 0.82129812, + "learning_rate": 0.0007450373980095341, + "loss": 0.83289105, + "num_input_tokens_seen": 153711312, + "router_z_loss_mlp": 0.43164062, + "step": 1855, + "time_per_iteration": 3.0980496406555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154286, + "balance_loss_mlp": 1.11268187, + "epoch": 0.35706040784917276, + "flos": 526178198016.0, + "grad_norm": 0.05908088829108725, + "language_loss": 0.87076378, + "learning_rate": 0.0007447657851403155, + "loss": 0.88230669, + "num_input_tokens_seen": 153780208, + "router_z_loss_mlp": 0.41601562, + "step": 1856, + "time_per_iteration": 2.6393351554870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148054, + "balance_loss_mlp": 1.10609269, + "epoch": 0.35725278953443634, + "flos": 511970844672.0, + "grad_norm": 0.07116077808597938, + "language_loss": 0.79415643, + "learning_rate": 0.0007444940772445915, + "loss": 0.805637, + "num_input_tokens_seen": 153853152, + "router_z_loss_mlp": 0.41943359, + "step": 1857, + "time_per_iteration": 2.7049038410186768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148544, + "balance_loss_mlp": 1.10770321, + "epoch": 0.3574451712196999, + "flos": 487428171264.0, + "grad_norm": 0.06303496934817837, + "language_loss": 0.80443203, + "learning_rate": 0.0007442222744278484, + "loss": 0.81591749, + "num_input_tokens_seen": 153924160, + "router_z_loss_mlp": 0.40844727, + "step": 1858, + "time_per_iteration": 2.6416029930114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140624, + "balance_loss_mlp": 1.10056937, + "epoch": 0.35763755290496346, + "flos": 550671312384.0, + "grad_norm": 0.06290523981550739, + "language_loss": 0.84690839, + "learning_rate": 0.0007439503767956099, + "loss": 0.85831463, + "num_input_tokens_seen": 153998688, + "router_z_loss_mlp": 0.40063477, + "step": 1859, + "time_per_iteration": 2.697295665740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095769, + "balance_loss_mlp": 1.08213139, + "epoch": 0.357829934590227, + "flos": 1504083561984.0, + "grad_norm": 0.02707100394521806, + "language_loss": 0.79671603, + "learning_rate": 0.0007436783844534352, + "loss": 0.80767375, + "num_input_tokens_seen": 154230960, + "router_z_loss_mlp": 0.13671875, + "step": 1860, + "time_per_iteration": 4.896381139755249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157881, + "balance_loss_mlp": 1.11744571, + "epoch": 0.3580223162754906, + "flos": 568695997440.0, + "grad_norm": 0.054355964588402354, + "language_loss": 0.86204398, + "learning_rate": 0.000743406297506922, + "loss": 0.87362283, + "num_input_tokens_seen": 154309104, + "router_z_loss_mlp": 0.40478516, + "step": 1861, + "time_per_iteration": 2.7121450901031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154988, + "balance_loss_mlp": 1.11362243, + "epoch": 0.3582146979607541, + "flos": 626473585152.0, + "grad_norm": 0.056412092641732435, + "language_loss": 0.8442747, + "learning_rate": 0.0007431341160617031, + "loss": 0.85582459, + "num_input_tokens_seen": 154387424, + "router_z_loss_mlp": 0.41381836, + "step": 1862, + "time_per_iteration": 2.902806520462036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172251, + "balance_loss_mlp": 1.13052833, + "epoch": 0.3584070796460177, + "flos": 507271403520.0, + "grad_norm": 0.06986467819319542, + "language_loss": 0.88734752, + "learning_rate": 0.0007428618402234491, + "loss": 0.89907002, + "num_input_tokens_seen": 154459952, + "router_z_loss_mlp": 0.41723633, + "step": 1863, + "time_per_iteration": 2.644352436065674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159657, + "balance_loss_mlp": 1.11831546, + "epoch": 0.3585994613312813, + "flos": 606479851008.0, + "grad_norm": 0.06293448628505635, + "language_loss": 0.8061077, + "learning_rate": 0.0007425894700978668, + "loss": 0.81770432, + "num_input_tokens_seen": 154535456, + "router_z_loss_mlp": 0.41357422, + "step": 1864, + "time_per_iteration": 2.782757043838501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148168, + "balance_loss_mlp": 1.10699308, + "epoch": 0.3587918430165448, + "flos": 1412886799872.0, + "grad_norm": 0.056888458094662434, + "language_loss": 0.79858804, + "learning_rate": 0.0007423170057906996, + "loss": 0.81006974, + "num_input_tokens_seen": 154627568, + "router_z_loss_mlp": 0.41162109, + "step": 1865, + "time_per_iteration": 3.848773956298828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133926, + "balance_loss_mlp": 1.09391952, + "epoch": 0.3589842247018084, + "flos": 478553121792.0, + "grad_norm": 0.06447904861600703, + "language_loss": 0.86500657, + "learning_rate": 0.0007420444474077275, + "loss": 0.87634581, + "num_input_tokens_seen": 154694640, + "router_z_loss_mlp": 0.40014648, + "step": 1866, + "time_per_iteration": 2.542572498321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126566, + "balance_loss_mlp": 1.0855341, + "epoch": 0.35917660638707194, + "flos": 504711710208.0, + "grad_norm": 0.07300351460408123, + "language_loss": 0.8986578, + "learning_rate": 0.0007417717950547671, + "loss": 0.90992349, + "num_input_tokens_seen": 154762048, + "router_z_loss_mlp": 0.41040039, + "step": 1867, + "time_per_iteration": 2.5633254051208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073925, + "balance_loss_mlp": 1.06143153, + "epoch": 0.3593689880723355, + "flos": 1492129382400.0, + "grad_norm": 0.026482390846264015, + "language_loss": 0.75996608, + "learning_rate": 0.0007414990488376713, + "loss": 0.77070534, + "num_input_tokens_seen": 154989952, + "router_z_loss_mlp": 0.125, + "step": 1868, + "time_per_iteration": 4.904905557632446 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111694, + "balance_loss_mlp": 1.07345176, + "epoch": 0.35956136975759906, + "flos": 528629234688.0, + "grad_norm": 0.053992922509511466, + "language_loss": 0.850173, + "learning_rate": 0.0007412262088623299, + "loss": 0.86128998, + "num_input_tokens_seen": 155066992, + "router_z_loss_mlp": 0.38232422, + "step": 1869, + "time_per_iteration": 2.7310874462127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110773, + "balance_loss_mlp": 1.07200575, + "epoch": 0.35975375144286265, + "flos": 534917426688.0, + "grad_norm": 0.08370102618564679, + "language_loss": 0.79675972, + "learning_rate": 0.0007409532752346684, + "loss": 0.80786741, + "num_input_tokens_seen": 155137616, + "router_z_loss_mlp": 0.38769531, + "step": 1870, + "time_per_iteration": 2.6629347801208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110981, + "balance_loss_mlp": 1.07166612, + "epoch": 0.3599461331281262, + "flos": 504941506560.0, + "grad_norm": 0.06403903481871269, + "language_loss": 0.88829064, + "learning_rate": 0.0007406802480606491, + "loss": 0.89940047, + "num_input_tokens_seen": 155209248, + "router_z_loss_mlp": 0.39306641, + "step": 1871, + "time_per_iteration": 2.6200008392333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107121, + "balance_loss_mlp": 1.06835461, + "epoch": 0.36013851481338977, + "flos": 511533646848.0, + "grad_norm": 0.0729370697679506, + "language_loss": 0.90798759, + "learning_rate": 0.0007404071274462707, + "loss": 0.9190588, + "num_input_tokens_seen": 155274176, + "router_z_loss_mlp": 0.38769531, + "step": 1872, + "time_per_iteration": 2.5693628787994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111805, + "balance_loss_mlp": 1.07978415, + "epoch": 0.36033089649865335, + "flos": 547590357504.0, + "grad_norm": 0.06627703814726228, + "language_loss": 0.84024733, + "learning_rate": 0.0007401339134975682, + "loss": 0.85142779, + "num_input_tokens_seen": 155343232, + "router_z_loss_mlp": 0.38208008, + "step": 1873, + "time_per_iteration": 2.7031140327453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127585, + "balance_loss_mlp": 1.08760262, + "epoch": 0.3605232781839169, + "flos": 458655561216.0, + "grad_norm": 0.06845959531373838, + "language_loss": 0.84298885, + "learning_rate": 0.0007398606063206122, + "loss": 0.85426462, + "num_input_tokens_seen": 155410080, + "router_z_loss_mlp": 0.39990234, + "step": 1874, + "time_per_iteration": 2.6090316772460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115185, + "balance_loss_mlp": 1.07598901, + "epoch": 0.36071565986918047, + "flos": 509559455232.0, + "grad_norm": 0.06521397848462201, + "language_loss": 0.78764814, + "learning_rate": 0.0007395872060215101, + "loss": 0.79879999, + "num_input_tokens_seen": 155476240, + "router_z_loss_mlp": 0.3918457, + "step": 1875, + "time_per_iteration": 2.620976448059082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122162, + "balance_loss_mlp": 1.0831089, + "epoch": 0.360908041554444, + "flos": 559195799040.0, + "grad_norm": 0.06345733178575377, + "language_loss": 0.88705117, + "learning_rate": 0.0007393137127064056, + "loss": 0.89827275, + "num_input_tokens_seen": 155543392, + "router_z_loss_mlp": 0.39013672, + "step": 1876, + "time_per_iteration": 2.7320597171783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125006, + "balance_loss_mlp": 1.08511841, + "epoch": 0.3611004232397076, + "flos": 523845729792.0, + "grad_norm": 0.056097062255587686, + "language_loss": 0.84576774, + "learning_rate": 0.0007390401264814779, + "loss": 0.85701776, + "num_input_tokens_seen": 155613264, + "router_z_loss_mlp": 0.39868164, + "step": 1877, + "time_per_iteration": 2.605865478515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123607, + "balance_loss_mlp": 1.08503079, + "epoch": 0.3612928049249711, + "flos": 540988305408.0, + "grad_norm": 0.06159732683880817, + "language_loss": 0.84937686, + "learning_rate": 0.0007387664474529427, + "loss": 0.86061299, + "num_input_tokens_seen": 155683712, + "router_z_loss_mlp": 0.38598633, + "step": 1878, + "time_per_iteration": 2.6548514366149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138034, + "balance_loss_mlp": 1.09750319, + "epoch": 0.3614851866102347, + "flos": 552556670976.0, + "grad_norm": 0.05796680079252983, + "language_loss": 0.91768891, + "learning_rate": 0.0007384926757270518, + "loss": 0.92906928, + "num_input_tokens_seen": 155751760, + "router_z_loss_mlp": 0.40527344, + "step": 1879, + "time_per_iteration": 2.6339149475097656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137039, + "balance_loss_mlp": 1.09791493, + "epoch": 0.36167756829549824, + "flos": 772071865344.0, + "grad_norm": 0.05405313293747941, + "language_loss": 0.79881001, + "learning_rate": 0.0007382188114100924, + "loss": 0.81018037, + "num_input_tokens_seen": 155830464, + "router_z_loss_mlp": 0.39111328, + "step": 1880, + "time_per_iteration": 2.983384132385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139197, + "balance_loss_mlp": 1.09964395, + "epoch": 0.36186994998076183, + "flos": 711885086208.0, + "grad_norm": 0.12141150358978081, + "language_loss": 0.82206392, + "learning_rate": 0.0007379448546083884, + "loss": 0.83345592, + "num_input_tokens_seen": 155906208, + "router_z_loss_mlp": 0.39575195, + "step": 1881, + "time_per_iteration": 2.9186532497406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140707, + "balance_loss_mlp": 1.10127282, + "epoch": 0.3620623316660254, + "flos": 747546444288.0, + "grad_norm": 0.06284373597557333, + "language_loss": 0.88377333, + "learning_rate": 0.0007376708054282992, + "loss": 0.8951804, + "num_input_tokens_seen": 155983584, + "router_z_loss_mlp": 0.39428711, + "step": 1882, + "time_per_iteration": 2.9895970821380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144635, + "balance_loss_mlp": 1.10605919, + "epoch": 0.36225471335128895, + "flos": 482555833344.0, + "grad_norm": 0.05224621202588268, + "language_loss": 0.84316945, + "learning_rate": 0.0007373966639762201, + "loss": 0.85461575, + "num_input_tokens_seen": 156052464, + "router_z_loss_mlp": 0.38574219, + "step": 1883, + "time_per_iteration": 2.623133659362793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147786, + "balance_loss_mlp": 1.10620606, + "epoch": 0.36244709503655254, + "flos": 506905786368.0, + "grad_norm": 0.06751899300287477, + "language_loss": 0.89170045, + "learning_rate": 0.0007371224303585822, + "loss": 0.90317833, + "num_input_tokens_seen": 156121424, + "router_z_loss_mlp": 0.41577148, + "step": 1884, + "time_per_iteration": 2.628394842147827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021984, + "balance_loss_mlp": 1.01154125, + "epoch": 0.36263947672181607, + "flos": 1394050466304.0, + "grad_norm": 0.007236456832270123, + "language_loss": 0.80357069, + "learning_rate": 0.0007368481046818524, + "loss": 0.8137905, + "num_input_tokens_seen": 156346144, + "router_z_loss_mlp": 0.10449219, + "step": 1885, + "time_per_iteration": 4.717620849609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114112, + "balance_loss_mlp": 1.10049307, + "epoch": 0.36283185840707965, + "flos": 653296969728.0, + "grad_norm": 0.057116908748179596, + "language_loss": 0.82560247, + "learning_rate": 0.0007365736870525335, + "loss": 0.83701366, + "num_input_tokens_seen": 156420880, + "router_z_loss_mlp": 0.40625, + "step": 1886, + "time_per_iteration": 2.8198611736297607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132227, + "balance_loss_mlp": 1.09310222, + "epoch": 0.3630242400923432, + "flos": 488863848960.0, + "grad_norm": 0.06530442713985495, + "language_loss": 0.83123338, + "learning_rate": 0.000736299177577164, + "loss": 0.84255564, + "num_input_tokens_seen": 156485616, + "router_z_loss_mlp": 0.39135742, + "step": 1887, + "time_per_iteration": 2.613863945007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128864, + "balance_loss_mlp": 1.08992994, + "epoch": 0.3632166217776068, + "flos": 517159613952.0, + "grad_norm": 0.0666501464088242, + "language_loss": 0.84363097, + "learning_rate": 0.0007360245763623174, + "loss": 0.85491955, + "num_input_tokens_seen": 156557840, + "router_z_loss_mlp": 0.3894043, + "step": 1888, + "time_per_iteration": 2.6378068923950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115221, + "balance_loss_mlp": 1.07702661, + "epoch": 0.36340900346287036, + "flos": 646173656064.0, + "grad_norm": 0.06993226621121658, + "language_loss": 0.90142351, + "learning_rate": 0.0007357498835146039, + "loss": 0.91257572, + "num_input_tokens_seen": 156632496, + "router_z_loss_mlp": 0.38183594, + "step": 1889, + "time_per_iteration": 2.8125081062316895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128167, + "balance_loss_mlp": 1.08878016, + "epoch": 0.3636013851481339, + "flos": 553327552512.0, + "grad_norm": 0.07359030033413445, + "language_loss": 0.87316656, + "learning_rate": 0.0007354750991406684, + "loss": 0.88444823, + "num_input_tokens_seen": 156705296, + "router_z_loss_mlp": 0.39379883, + "step": 1890, + "time_per_iteration": 2.714569568634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121285, + "balance_loss_mlp": 1.0807066, + "epoch": 0.3637937668333975, + "flos": 546653919744.0, + "grad_norm": 0.07836036923074335, + "language_loss": 0.80991101, + "learning_rate": 0.0007352002233471919, + "loss": 0.8211239, + "num_input_tokens_seen": 156773376, + "router_z_loss_mlp": 0.40576172, + "step": 1891, + "time_per_iteration": 2.6287412643432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121974, + "balance_loss_mlp": 1.08180022, + "epoch": 0.363986148518661, + "flos": 538112180736.0, + "grad_norm": 0.058839902089765785, + "language_loss": 0.79524523, + "learning_rate": 0.0007349252562408906, + "loss": 0.80646491, + "num_input_tokens_seen": 156844336, + "router_z_loss_mlp": 0.40161133, + "step": 1892, + "time_per_iteration": 2.669903039932251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125098, + "balance_loss_mlp": 1.08449531, + "epoch": 0.3641785302039246, + "flos": 660217651200.0, + "grad_norm": 0.057079030651025625, + "language_loss": 0.81590033, + "learning_rate": 0.0007346501979285158, + "loss": 0.8271513, + "num_input_tokens_seen": 156918848, + "router_z_loss_mlp": 0.40600586, + "step": 1893, + "time_per_iteration": 2.9146764278411865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083448, + "balance_loss_mlp": 1.07238543, + "epoch": 0.36437091188918813, + "flos": 1468743031296.0, + "grad_norm": 0.036364529291757694, + "language_loss": 0.80539101, + "learning_rate": 0.0007343750485168551, + "loss": 0.81622547, + "num_input_tokens_seen": 157134736, + "router_z_loss_mlp": 0.11083984, + "step": 1894, + "time_per_iteration": 4.784435272216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126267, + "balance_loss_mlp": 1.08444858, + "epoch": 0.3645632935744517, + "flos": 597298281984.0, + "grad_norm": 0.06549610472034906, + "language_loss": 0.86352968, + "learning_rate": 0.0007340998081127308, + "loss": 0.87479234, + "num_input_tokens_seen": 157211920, + "router_z_loss_mlp": 0.41796875, + "step": 1895, + "time_per_iteration": 2.7702367305755615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130662, + "balance_loss_mlp": 1.09113181, + "epoch": 0.36475567525971525, + "flos": 599509610496.0, + "grad_norm": 0.06520113052193731, + "language_loss": 0.91046786, + "learning_rate": 0.0007338244768230007, + "loss": 0.92177445, + "num_input_tokens_seen": 157284224, + "router_z_loss_mlp": 0.39550781, + "step": 1896, + "time_per_iteration": 2.7612760066986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133468, + "balance_loss_mlp": 1.09315181, + "epoch": 0.36494805694497884, + "flos": 798403350528.0, + "grad_norm": 0.058734972315737245, + "language_loss": 0.89108521, + "learning_rate": 0.0007335490547545578, + "loss": 0.90241992, + "num_input_tokens_seen": 157367920, + "router_z_loss_mlp": 0.40307617, + "step": 1897, + "time_per_iteration": 3.024462938308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135084, + "balance_loss_mlp": 1.09343266, + "epoch": 0.3651404386302424, + "flos": 637313287680.0, + "grad_norm": 0.06208128991116815, + "language_loss": 0.82833707, + "learning_rate": 0.0007332735420143308, + "loss": 0.83968788, + "num_input_tokens_seen": 157438672, + "router_z_loss_mlp": 0.41650391, + "step": 1898, + "time_per_iteration": 2.725468158721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112873, + "balance_loss_mlp": 1.08669686, + "epoch": 0.36533282031550596, + "flos": 491581757952.0, + "grad_norm": 0.09645190116324148, + "language_loss": 0.86573303, + "learning_rate": 0.0007329979387092826, + "loss": 0.8770203, + "num_input_tokens_seen": 157505888, + "router_z_loss_mlp": 0.42016602, + "step": 1899, + "time_per_iteration": 2.6357531547546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133626, + "balance_loss_mlp": 1.09259379, + "epoch": 0.36552520200076954, + "flos": 855970965504.0, + "grad_norm": 0.06150604002201611, + "language_loss": 0.84294677, + "learning_rate": 0.0007327222449464124, + "loss": 0.85428298, + "num_input_tokens_seen": 157601568, + "router_z_loss_mlp": 0.41040039, + "step": 1900, + "time_per_iteration": 3.2381174564361572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136855, + "balance_loss_mlp": 1.09382069, + "epoch": 0.3657175836860331, + "flos": 483702243840.0, + "grad_norm": 0.07567830151973255, + "language_loss": 0.89052904, + "learning_rate": 0.0007324464608327538, + "loss": 0.90189761, + "num_input_tokens_seen": 157670992, + "router_z_loss_mlp": 0.4309082, + "step": 1901, + "time_per_iteration": 2.597569227218628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150007, + "balance_loss_mlp": 1.10814035, + "epoch": 0.36590996537129666, + "flos": 434792365056.0, + "grad_norm": 0.07712085030005716, + "language_loss": 0.88794601, + "learning_rate": 0.0007321705864753758, + "loss": 0.89944601, + "num_input_tokens_seen": 157743616, + "router_z_loss_mlp": 0.41870117, + "step": 1902, + "time_per_iteration": 2.6877686977386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151954, + "balance_loss_mlp": 1.11097002, + "epoch": 0.3661023470565602, + "flos": 712206286848.0, + "grad_norm": 0.05591922142148154, + "language_loss": 0.84586883, + "learning_rate": 0.0007318946219813823, + "loss": 0.85738844, + "num_input_tokens_seen": 157823520, + "router_z_loss_mlp": 0.40991211, + "step": 1903, + "time_per_iteration": 3.0283257961273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151611, + "balance_loss_mlp": 1.11341679, + "epoch": 0.3662947287418238, + "flos": 564760097280.0, + "grad_norm": 0.0702623940180467, + "language_loss": 0.90117764, + "learning_rate": 0.000731618567457912, + "loss": 0.91269374, + "num_input_tokens_seen": 157893248, + "router_z_loss_mlp": 0.38208008, + "step": 1904, + "time_per_iteration": 2.651491165161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114788, + "balance_loss_mlp": 1.10522676, + "epoch": 0.3664871104270873, + "flos": 789752954880.0, + "grad_norm": 0.07047012066076976, + "language_loss": 0.87036794, + "learning_rate": 0.000731342423012139, + "loss": 0.88184673, + "num_input_tokens_seen": 157973216, + "router_z_loss_mlp": 0.42700195, + "step": 1905, + "time_per_iteration": 3.0361618995666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143776, + "balance_loss_mlp": 1.10331631, + "epoch": 0.3666794921123509, + "flos": 752557174272.0, + "grad_norm": 0.06969182334255739, + "language_loss": 0.82982039, + "learning_rate": 0.0007310661887512722, + "loss": 0.84125817, + "num_input_tokens_seen": 158051088, + "router_z_loss_mlp": 0.40478516, + "step": 1906, + "time_per_iteration": 3.020333766937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134716, + "balance_loss_mlp": 1.09592557, + "epoch": 0.3668718737976145, + "flos": 523531869696.0, + "grad_norm": 0.056548054453958524, + "language_loss": 0.82503444, + "learning_rate": 0.0007307898647825549, + "loss": 0.83638155, + "num_input_tokens_seen": 158124368, + "router_z_loss_mlp": 0.38793945, + "step": 1907, + "time_per_iteration": 2.6819958686828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128623, + "balance_loss_mlp": 1.08568358, + "epoch": 0.367064255482878, + "flos": 571967474688.0, + "grad_norm": 0.0662764931561561, + "language_loss": 0.89910614, + "learning_rate": 0.0007305134512132659, + "loss": 0.9103924, + "num_input_tokens_seen": 158191472, + "router_z_loss_mlp": 0.42944336, + "step": 1908, + "time_per_iteration": 2.688716411590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120758, + "balance_loss_mlp": 1.08063269, + "epoch": 0.3672566371681416, + "flos": 447114359808.0, + "grad_norm": 0.07972147303822336, + "language_loss": 0.83329952, + "learning_rate": 0.0007302369481507183, + "loss": 0.8445071, + "num_input_tokens_seen": 158254384, + "router_z_loss_mlp": 0.40136719, + "step": 1909, + "time_per_iteration": 2.520551919937134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043123, + "balance_loss_mlp": 1.03272831, + "epoch": 0.36744901885340514, + "flos": 1540090713600.0, + "grad_norm": 0.028970701382128577, + "language_loss": 0.79961759, + "learning_rate": 0.00072996035570226, + "loss": 0.81004882, + "num_input_tokens_seen": 158486160, + "router_z_loss_mlp": 0.10400391, + "step": 1910, + "time_per_iteration": 4.862990140914917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111533, + "balance_loss_mlp": 1.07534695, + "epoch": 0.36764140053866873, + "flos": 563685267456.0, + "grad_norm": 0.0535153553246422, + "language_loss": 0.85860741, + "learning_rate": 0.000729683673975274, + "loss": 0.86976075, + "num_input_tokens_seen": 158555616, + "router_z_loss_mlp": 0.3996582, + "step": 1911, + "time_per_iteration": 2.6834514141082764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117796, + "balance_loss_mlp": 1.07783747, + "epoch": 0.36783378222393226, + "flos": 1216663981056.0, + "grad_norm": 0.07394300555179863, + "language_loss": 0.83108044, + "learning_rate": 0.0007294069030771774, + "loss": 0.84225845, + "num_input_tokens_seen": 158653984, + "router_z_loss_mlp": 0.39941406, + "step": 1912, + "time_per_iteration": 3.6458523273468018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124936, + "balance_loss_mlp": 1.08483398, + "epoch": 0.36802616390919585, + "flos": 498724895232.0, + "grad_norm": 0.05916806609098389, + "language_loss": 0.90897858, + "learning_rate": 0.0007291300431154224, + "loss": 0.920228, + "num_input_tokens_seen": 158719728, + "router_z_loss_mlp": 0.40112305, + "step": 1913, + "time_per_iteration": 2.5737557411193848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0103288, + "balance_loss_mlp": 1.02157927, + "epoch": 0.36821854559445943, + "flos": 1582146349056.0, + "grad_norm": 0.013681752942923219, + "language_loss": 0.70389736, + "learning_rate": 0.0007288530941974955, + "loss": 0.71422619, + "num_input_tokens_seen": 158952544, + "router_z_loss_mlp": 0.11279297, + "step": 1914, + "time_per_iteration": 5.031456232070923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113953, + "balance_loss_mlp": 1.07499564, + "epoch": 0.36841092727972297, + "flos": 835626295296.0, + "grad_norm": 0.06158754254944219, + "language_loss": 0.79961407, + "learning_rate": 0.0007285760564309179, + "loss": 0.81075364, + "num_input_tokens_seen": 159039680, + "router_z_loss_mlp": 0.38964844, + "step": 1915, + "time_per_iteration": 3.152339458465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122924, + "balance_loss_mlp": 1.08346629, + "epoch": 0.36860330896498655, + "flos": 689855118336.0, + "grad_norm": 0.10197178679971165, + "language_loss": 0.85308397, + "learning_rate": 0.0007282989299232448, + "loss": 0.86431319, + "num_input_tokens_seen": 159128128, + "router_z_loss_mlp": 0.39453125, + "step": 1916, + "time_per_iteration": 3.0152268409729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119949, + "balance_loss_mlp": 1.08013296, + "epoch": 0.3687956906502501, + "flos": 554182497792.0, + "grad_norm": 0.05980283450468872, + "language_loss": 0.8385278, + "learning_rate": 0.0007280217147820668, + "loss": 0.84972733, + "num_input_tokens_seen": 159193248, + "router_z_loss_mlp": 0.39794922, + "step": 1917, + "time_per_iteration": 2.625802755355835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114962, + "balance_loss_mlp": 1.07512259, + "epoch": 0.3689880723355137, + "flos": 576703991808.0, + "grad_norm": 0.06755957483710798, + "language_loss": 0.79489267, + "learning_rate": 0.0007277444111150079, + "loss": 0.80604231, + "num_input_tokens_seen": 159265824, + "router_z_loss_mlp": 0.3984375, + "step": 1918, + "time_per_iteration": 2.6753525733947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112846, + "balance_loss_mlp": 1.08785725, + "epoch": 0.3691804540207772, + "flos": 528868942848.0, + "grad_norm": 0.07157808177363079, + "language_loss": 0.84730321, + "learning_rate": 0.0007274670190297272, + "loss": 0.8585878, + "num_input_tokens_seen": 159332992, + "router_z_loss_mlp": 0.40576172, + "step": 1919, + "time_per_iteration": 2.6149959564208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142135, + "balance_loss_mlp": 1.09986341, + "epoch": 0.3693728357060408, + "flos": 561019115520.0, + "grad_norm": 0.05944559747374387, + "language_loss": 0.8264004, + "learning_rate": 0.0007271895386339179, + "loss": 0.83782172, + "num_input_tokens_seen": 159409808, + "router_z_loss_mlp": 0.42285156, + "step": 1920, + "time_per_iteration": 2.7611513137817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140712, + "balance_loss_mlp": 1.09970427, + "epoch": 0.3695652173913043, + "flos": 579770265600.0, + "grad_norm": 0.059089751588204814, + "language_loss": 0.83542717, + "learning_rate": 0.0007269119700353073, + "loss": 0.8468343, + "num_input_tokens_seen": 159486128, + "router_z_loss_mlp": 0.41015625, + "step": 1921, + "time_per_iteration": 2.782167911529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148229, + "balance_loss_mlp": 1.10738814, + "epoch": 0.3697575990765679, + "flos": 512914622976.0, + "grad_norm": 0.06644949508392005, + "language_loss": 0.85268104, + "learning_rate": 0.0007266343133416571, + "loss": 0.8641634, + "num_input_tokens_seen": 159562224, + "router_z_loss_mlp": 0.40844727, + "step": 1922, + "time_per_iteration": 2.7218997478485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076086, + "balance_loss_mlp": 1.06340241, + "epoch": 0.3699499807618315, + "flos": 1570640025600.0, + "grad_norm": 0.03214674667569998, + "language_loss": 0.77116919, + "learning_rate": 0.0007263565686607632, + "loss": 0.78192997, + "num_input_tokens_seen": 159784768, + "router_z_loss_mlp": 0.12695312, + "step": 1923, + "time_per_iteration": 4.837427854537964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145902, + "balance_loss_mlp": 1.1028676, + "epoch": 0.37014236244709503, + "flos": 497338776576.0, + "grad_norm": 0.07518583721861193, + "language_loss": 0.84417462, + "learning_rate": 0.0007260787361004556, + "loss": 0.85563368, + "num_input_tokens_seen": 159848608, + "router_z_loss_mlp": 0.43041992, + "step": 1924, + "time_per_iteration": 2.5874598026275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050683, + "balance_loss_mlp": 1.03880954, + "epoch": 0.3703347441323586, + "flos": 1444368485376.0, + "grad_norm": 0.023888622594867324, + "language_loss": 0.73761505, + "learning_rate": 0.0007258008157685987, + "loss": 0.74812186, + "num_input_tokens_seen": 160080928, + "router_z_loss_mlp": 0.11865234, + "step": 1925, + "time_per_iteration": 4.961286544799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137865, + "balance_loss_mlp": 1.09571242, + "epoch": 0.37052712581762215, + "flos": 563601203712.0, + "grad_norm": 0.05584746966952834, + "language_loss": 0.87657702, + "learning_rate": 0.0007255228077730903, + "loss": 0.88795567, + "num_input_tokens_seen": 160148976, + "router_z_loss_mlp": 0.42163086, + "step": 1926, + "time_per_iteration": 2.663482666015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140781, + "balance_loss_mlp": 1.09786606, + "epoch": 0.37071950750288574, + "flos": 926078261760.0, + "grad_norm": 0.05562014185368244, + "language_loss": 0.81976974, + "learning_rate": 0.0007252447122218632, + "loss": 0.83117759, + "num_input_tokens_seen": 160233504, + "router_z_loss_mlp": 0.42919922, + "step": 1927, + "time_per_iteration": 3.1484758853912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138853, + "balance_loss_mlp": 1.09655809, + "epoch": 0.37091188918814927, + "flos": 418312014336.0, + "grad_norm": 0.06601877155853234, + "language_loss": 0.88791764, + "learning_rate": 0.0007249665292228834, + "loss": 0.89930612, + "num_input_tokens_seen": 160299696, + "router_z_loss_mlp": 0.4230957, + "step": 1928, + "time_per_iteration": 2.5840864181518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140563, + "balance_loss_mlp": 1.09872091, + "epoch": 0.37110427087341286, + "flos": 463182105600.0, + "grad_norm": 0.05314866644458525, + "language_loss": 0.83534646, + "learning_rate": 0.000724688258884151, + "loss": 0.84675211, + "num_input_tokens_seen": 160367904, + "router_z_loss_mlp": 0.41845703, + "step": 1929, + "time_per_iteration": 2.6063482761383057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129765, + "balance_loss_mlp": 1.09166527, + "epoch": 0.3712966525586764, + "flos": 849658180608.0, + "grad_norm": 0.06946275153671234, + "language_loss": 0.86767673, + "learning_rate": 0.0007244099013137002, + "loss": 0.87897444, + "num_input_tokens_seen": 160453600, + "router_z_loss_mlp": 0.38085938, + "step": 1930, + "time_per_iteration": 3.0539071559906006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127594, + "balance_loss_mlp": 1.0873971, + "epoch": 0.37148903424394, + "flos": 925954550784.0, + "grad_norm": 0.05696415350586704, + "language_loss": 0.89040637, + "learning_rate": 0.0007241314566195993, + "loss": 0.90168232, + "num_input_tokens_seen": 160543472, + "router_z_loss_mlp": 0.40185547, + "step": 1931, + "time_per_iteration": 3.2625389099121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111855, + "balance_loss_mlp": 1.07861531, + "epoch": 0.37168141592920356, + "flos": 519815854080.0, + "grad_norm": 0.08463017827171934, + "language_loss": 0.85909784, + "learning_rate": 0.0007238529249099496, + "loss": 0.87028337, + "num_input_tokens_seen": 160614016, + "router_z_loss_mlp": 0.39941406, + "step": 1932, + "time_per_iteration": 2.6740944385528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101582, + "balance_loss_mlp": 1.09080601, + "epoch": 0.3718737976144671, + "flos": 1445895567360.0, + "grad_norm": 0.046016525030599324, + "language_loss": 0.77856874, + "learning_rate": 0.0007235743062928872, + "loss": 0.78958464, + "num_input_tokens_seen": 160828640, + "router_z_loss_mlp": 0.10791016, + "step": 1933, + "time_per_iteration": 4.862685203552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125745, + "balance_loss_mlp": 1.08347321, + "epoch": 0.3720661792997307, + "flos": 759564490752.0, + "grad_norm": 0.10032321862894769, + "language_loss": 0.80747449, + "learning_rate": 0.000723295600876581, + "loss": 0.81873196, + "num_input_tokens_seen": 160913088, + "router_z_loss_mlp": 0.42285156, + "step": 1934, + "time_per_iteration": 2.990391969680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125218, + "balance_loss_mlp": 1.08406699, + "epoch": 0.3722585609849942, + "flos": 516956981760.0, + "grad_norm": 0.057414096803471676, + "language_loss": 0.87956464, + "learning_rate": 0.0007230168087692344, + "loss": 0.89081681, + "num_input_tokens_seen": 160982960, + "router_z_loss_mlp": 0.41162109, + "step": 1935, + "time_per_iteration": 2.656625270843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119009, + "balance_loss_mlp": 1.07924092, + "epoch": 0.3724509426702578, + "flos": 782464084992.0, + "grad_norm": 0.060205825913767164, + "language_loss": 0.82307911, + "learning_rate": 0.0007227379300790839, + "loss": 0.83426917, + "num_input_tokens_seen": 161066000, + "router_z_loss_mlp": 0.39770508, + "step": 1936, + "time_per_iteration": 2.997037649154663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114267, + "balance_loss_mlp": 1.07218599, + "epoch": 0.37264332435552133, + "flos": 391720997376.0, + "grad_norm": 0.06128365804507508, + "language_loss": 0.86067426, + "learning_rate": 0.0007224589649143997, + "loss": 0.87181687, + "num_input_tokens_seen": 161131040, + "router_z_loss_mlp": 0.4206543, + "step": 1937, + "time_per_iteration": 2.5290677547454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124508, + "balance_loss_mlp": 1.08228397, + "epoch": 0.3728357060407849, + "flos": 542861180928.0, + "grad_norm": 0.06605047879793914, + "language_loss": 0.81297445, + "learning_rate": 0.0007221799133834861, + "loss": 0.82421947, + "num_input_tokens_seen": 161201248, + "router_z_loss_mlp": 0.42236328, + "step": 1938, + "time_per_iteration": 2.613140106201172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122203, + "balance_loss_mlp": 1.08195794, + "epoch": 0.3730280877260485, + "flos": 433571802624.0, + "grad_norm": 0.09318016716004435, + "language_loss": 0.8198092, + "learning_rate": 0.00072190077559468, + "loss": 0.83103126, + "num_input_tokens_seen": 161266288, + "router_z_loss_mlp": 0.40209961, + "step": 1939, + "time_per_iteration": 2.517237424850464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115818, + "balance_loss_mlp": 1.07578754, + "epoch": 0.37322046941131204, + "flos": 531485535744.0, + "grad_norm": 0.0553068133661429, + "language_loss": 0.8932575, + "learning_rate": 0.0007216215516563527, + "loss": 0.90441567, + "num_input_tokens_seen": 161335648, + "router_z_loss_mlp": 0.40014648, + "step": 1940, + "time_per_iteration": 2.7175915241241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111901, + "balance_loss_mlp": 1.07089305, + "epoch": 0.3734128510965756, + "flos": 531549775872.0, + "grad_norm": 0.06982995582267476, + "language_loss": 0.83827746, + "learning_rate": 0.0007213422416769083, + "loss": 0.84939647, + "num_input_tokens_seen": 161403440, + "router_z_loss_mlp": 0.41015625, + "step": 1941, + "time_per_iteration": 2.5922279357910156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116664, + "balance_loss_mlp": 1.07684803, + "epoch": 0.37360523278183916, + "flos": 500442126336.0, + "grad_norm": 0.050249137281424494, + "language_loss": 0.75479639, + "learning_rate": 0.0007210628457647849, + "loss": 0.76596296, + "num_input_tokens_seen": 161472864, + "router_z_loss_mlp": 0.39819336, + "step": 1942, + "time_per_iteration": 2.583151340484619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118751, + "balance_loss_mlp": 1.07781446, + "epoch": 0.37379761446710275, + "flos": 547943491584.0, + "grad_norm": 0.0794488438004998, + "language_loss": 0.79022861, + "learning_rate": 0.000720783364028453, + "loss": 0.8014161, + "num_input_tokens_seen": 161548096, + "router_z_loss_mlp": 0.40942383, + "step": 1943, + "time_per_iteration": 2.7737677097320557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114071, + "balance_loss_mlp": 1.07418346, + "epoch": 0.3739899961523663, + "flos": 475761060864.0, + "grad_norm": 0.05694655733140731, + "language_loss": 0.87941283, + "learning_rate": 0.0007205037965764177, + "loss": 0.89055347, + "num_input_tokens_seen": 161615600, + "router_z_loss_mlp": 0.39868164, + "step": 1944, + "time_per_iteration": 2.558089256286621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123121, + "balance_loss_mlp": 1.08430672, + "epoch": 0.37418237783762986, + "flos": 611915668992.0, + "grad_norm": 0.07621334150317126, + "language_loss": 0.85730159, + "learning_rate": 0.0007202241435172161, + "loss": 0.86853278, + "num_input_tokens_seen": 161687408, + "router_z_loss_mlp": 0.38769531, + "step": 1945, + "time_per_iteration": 2.7602779865264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125439, + "balance_loss_mlp": 1.08574176, + "epoch": 0.3743747595228934, + "flos": 766287682560.0, + "grad_norm": 0.07927003262790512, + "language_loss": 0.88465476, + "learning_rate": 0.0007199444049594198, + "loss": 0.89590919, + "num_input_tokens_seen": 161764224, + "router_z_loss_mlp": 0.39697266, + "step": 1946, + "time_per_iteration": 2.9583580493927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119027, + "balance_loss_mlp": 1.07665968, + "epoch": 0.374567141208157, + "flos": 524394155520.0, + "grad_norm": 0.055396154938164174, + "language_loss": 0.8346498, + "learning_rate": 0.0007196645810116322, + "loss": 0.8458401, + "num_input_tokens_seen": 161835520, + "router_z_loss_mlp": 0.42382812, + "step": 1947, + "time_per_iteration": 2.6851320266723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131178, + "balance_loss_mlp": 1.09045637, + "epoch": 0.37475952289342057, + "flos": 681375421440.0, + "grad_norm": 0.05889971918419499, + "language_loss": 0.84302223, + "learning_rate": 0.0007193846717824912, + "loss": 0.854334, + "num_input_tokens_seen": 161912000, + "router_z_loss_mlp": 0.40698242, + "step": 1948, + "time_per_iteration": 2.9035325050354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129423, + "balance_loss_mlp": 1.08848619, + "epoch": 0.3749519045786841, + "flos": 460291299840.0, + "grad_norm": 0.07994215642664601, + "language_loss": 0.88549483, + "learning_rate": 0.0007191046773806669, + "loss": 0.89678907, + "num_input_tokens_seen": 161977296, + "router_z_loss_mlp": 0.40942383, + "step": 1949, + "time_per_iteration": 2.574697256088257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135159, + "balance_loss_mlp": 1.09224343, + "epoch": 0.3751442862639477, + "flos": 954853443072.0, + "grad_norm": 0.07615017139071276, + "language_loss": 0.8356899, + "learning_rate": 0.0007188245979148631, + "loss": 0.84704149, + "num_input_tokens_seen": 162051888, + "router_z_loss_mlp": 0.42919922, + "step": 1950, + "time_per_iteration": 3.216397285461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137761, + "balance_loss_mlp": 1.09475029, + "epoch": 0.3753366679492112, + "flos": 527747125248.0, + "grad_norm": 0.061651705216508604, + "language_loss": 0.87894762, + "learning_rate": 0.0007185444334938157, + "loss": 0.89032525, + "num_input_tokens_seen": 162124384, + "router_z_loss_mlp": 0.43041992, + "step": 1951, + "time_per_iteration": 2.6782584190368652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127424, + "balance_loss_mlp": 1.08879972, + "epoch": 0.3755290496344748, + "flos": 521797386240.0, + "grad_norm": 0.07782676746029546, + "language_loss": 0.84900033, + "learning_rate": 0.0007182641842262947, + "loss": 0.86027455, + "num_input_tokens_seen": 162191440, + "router_z_loss_mlp": 0.38647461, + "step": 1952, + "time_per_iteration": 2.639446258544922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125752, + "balance_loss_mlp": 1.08603168, + "epoch": 0.37572143131973834, + "flos": 621121830912.0, + "grad_norm": 0.05954692469221933, + "language_loss": 0.78027642, + "learning_rate": 0.0007179838502211022, + "loss": 0.79153389, + "num_input_tokens_seen": 162268480, + "router_z_loss_mlp": 0.3972168, + "step": 1953, + "time_per_iteration": 2.84329891204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131364, + "balance_loss_mlp": 1.09028411, + "epoch": 0.37591381300500193, + "flos": 770962530816.0, + "grad_norm": 0.10232430816689406, + "language_loss": 0.86411202, + "learning_rate": 0.0007177034315870738, + "loss": 0.8754257, + "num_input_tokens_seen": 162346752, + "router_z_loss_mlp": 0.41064453, + "step": 1954, + "time_per_iteration": 2.957648992538452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124051, + "balance_loss_mlp": 1.08325803, + "epoch": 0.37610619469026546, + "flos": 520448343552.0, + "grad_norm": 0.06271313302399782, + "language_loss": 0.91398948, + "learning_rate": 0.0007174229284330773, + "loss": 0.92523003, + "num_input_tokens_seen": 162415120, + "router_z_loss_mlp": 0.40795898, + "step": 1955, + "time_per_iteration": 2.5879859924316406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128257, + "balance_loss_mlp": 1.08879828, + "epoch": 0.37629857637552905, + "flos": 598812880896.0, + "grad_norm": 0.06607511431735706, + "language_loss": 0.86850858, + "learning_rate": 0.0007171423408680141, + "loss": 0.87979114, + "num_input_tokens_seen": 162493280, + "router_z_loss_mlp": 0.39453125, + "step": 1956, + "time_per_iteration": 2.7903566360473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123297, + "balance_loss_mlp": 1.08295655, + "epoch": 0.37649095806079264, + "flos": 564952817664.0, + "grad_norm": 0.06886679209235984, + "language_loss": 0.90041375, + "learning_rate": 0.0007168616690008176, + "loss": 0.91164672, + "num_input_tokens_seen": 162560736, + "router_z_loss_mlp": 0.40356445, + "step": 1957, + "time_per_iteration": 2.6327474117279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116463, + "balance_loss_mlp": 1.07705224, + "epoch": 0.37668333974605617, + "flos": 592470360576.0, + "grad_norm": 0.062429689069725576, + "language_loss": 0.85725892, + "learning_rate": 0.0007165809129404545, + "loss": 0.86842352, + "num_input_tokens_seen": 162630688, + "router_z_loss_mlp": 0.39404297, + "step": 1958, + "time_per_iteration": 2.7385900020599365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124898, + "balance_loss_mlp": 1.08527279, + "epoch": 0.37687572143131975, + "flos": 419478248448.0, + "grad_norm": 0.05793527093847313, + "language_loss": 0.85962278, + "learning_rate": 0.0007163000727959239, + "loss": 0.87087178, + "num_input_tokens_seen": 162694304, + "router_z_loss_mlp": 0.39624023, + "step": 1959, + "time_per_iteration": 2.485438585281372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046527, + "balance_loss_mlp": 1.0320313, + "epoch": 0.3770681031165833, + "flos": 1357262148096.0, + "grad_norm": 0.027906108498427614, + "language_loss": 0.77959073, + "learning_rate": 0.0007160191486762575, + "loss": 0.79005599, + "num_input_tokens_seen": 162920336, + "router_z_loss_mlp": 0.14453125, + "step": 1960, + "time_per_iteration": 4.834578275680542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117854, + "balance_loss_mlp": 1.07865775, + "epoch": 0.3772604848018469, + "flos": 644903534592.0, + "grad_norm": 0.05325294699236946, + "language_loss": 0.84349847, + "learning_rate": 0.00071573814069052, + "loss": 0.85467696, + "num_input_tokens_seen": 163000720, + "router_z_loss_mlp": 0.39208984, + "step": 1961, + "time_per_iteration": 2.9086802005767822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120534, + "balance_loss_mlp": 1.08219612, + "epoch": 0.3774528664871104, + "flos": 901651585536.0, + "grad_norm": 0.09498383670658105, + "language_loss": 0.88074362, + "learning_rate": 0.0007154570489478081, + "loss": 0.89194894, + "num_input_tokens_seen": 163085680, + "router_z_loss_mlp": 0.38330078, + "step": 1962, + "time_per_iteration": 3.2217841148376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117183, + "balance_loss_mlp": 1.07889283, + "epoch": 0.377645248172374, + "flos": 788065459200.0, + "grad_norm": 0.05466788938828107, + "language_loss": 0.86278516, + "learning_rate": 0.0007151758735572514, + "loss": 0.87395698, + "num_input_tokens_seen": 163162224, + "router_z_loss_mlp": 0.38232422, + "step": 1963, + "time_per_iteration": 3.01104998588562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130106, + "balance_loss_mlp": 1.08921766, + "epoch": 0.3778376298576376, + "flos": 586718111232.0, + "grad_norm": 0.06218420858169212, + "language_loss": 0.81413925, + "learning_rate": 0.0007148946146280119, + "loss": 0.82544029, + "num_input_tokens_seen": 163237920, + "router_z_loss_mlp": 0.40893555, + "step": 1964, + "time_per_iteration": 2.8039112091064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026607, + "balance_loss_mlp": 1.01440012, + "epoch": 0.3780300115429011, + "flos": 1396743782400.0, + "grad_norm": 0.022738468700431315, + "language_loss": 0.72192144, + "learning_rate": 0.000714613272269284, + "loss": 0.73218751, + "num_input_tokens_seen": 163455760, + "router_z_loss_mlp": 0.12207031, + "step": 1965, + "time_per_iteration": 4.8600172996521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024213, + "balance_loss_mlp": 1.0124352, + "epoch": 0.3782223932281647, + "flos": 1357672555008.0, + "grad_norm": 0.018349030303600054, + "language_loss": 0.75341946, + "learning_rate": 0.0007143318465902943, + "loss": 0.76366156, + "num_input_tokens_seen": 163678064, + "router_z_loss_mlp": 0.11767578, + "step": 1966, + "time_per_iteration": 4.918729782104492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135372, + "balance_loss_mlp": 1.09648633, + "epoch": 0.37841477491342823, + "flos": 704151304704.0, + "grad_norm": 0.2766921299066869, + "language_loss": 0.83812642, + "learning_rate": 0.0007140503377003022, + "loss": 0.84948009, + "num_input_tokens_seen": 163764320, + "router_z_loss_mlp": 0.38891602, + "step": 1967, + "time_per_iteration": 3.015761613845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149326, + "balance_loss_mlp": 1.10862756, + "epoch": 0.3786071565986918, + "flos": 529115991552.0, + "grad_norm": 0.07158509383086724, + "language_loss": 0.8519339, + "learning_rate": 0.000713768745708599, + "loss": 0.8634271, + "num_input_tokens_seen": 163831808, + "router_z_loss_mlp": 0.40698242, + "step": 1968, + "time_per_iteration": 2.6109209060668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140905, + "balance_loss_mlp": 1.09996843, + "epoch": 0.37879953828395535, + "flos": 993277126656.0, + "grad_norm": 0.05954158443482363, + "language_loss": 0.774553, + "learning_rate": 0.0007134870707245085, + "loss": 0.78596205, + "num_input_tokens_seen": 163918128, + "router_z_loss_mlp": 0.40893555, + "step": 1969, + "time_per_iteration": 3.2631757259368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150809, + "balance_loss_mlp": 1.11008716, + "epoch": 0.37899191996921894, + "flos": 626644283904.0, + "grad_norm": 0.05763521765218817, + "language_loss": 0.84313977, + "learning_rate": 0.0007132053128573864, + "loss": 0.85464787, + "num_input_tokens_seen": 163987552, + "router_z_loss_mlp": 0.40698242, + "step": 1970, + "time_per_iteration": 2.7791051864624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143919, + "balance_loss_mlp": 1.10353041, + "epoch": 0.37918430165448247, + "flos": 686307230208.0, + "grad_norm": 0.06905446326925666, + "language_loss": 0.84168518, + "learning_rate": 0.0007129234722166211, + "loss": 0.85312432, + "num_input_tokens_seen": 164063248, + "router_z_loss_mlp": 0.40356445, + "step": 1971, + "time_per_iteration": 2.8210554122924805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149932, + "balance_loss_mlp": 1.11152232, + "epoch": 0.37937668333974606, + "flos": 475622668800.0, + "grad_norm": 0.07023460279096982, + "language_loss": 0.91057038, + "learning_rate": 0.0007126415489116328, + "loss": 0.92206967, + "num_input_tokens_seen": 164133776, + "router_z_loss_mlp": 0.3840332, + "step": 1972, + "time_per_iteration": 2.672755002975464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153021, + "balance_loss_mlp": 1.11210799, + "epoch": 0.37956906502500964, + "flos": 707580997632.0, + "grad_norm": 0.06814261110374484, + "language_loss": 0.81719398, + "learning_rate": 0.0007123595430518736, + "loss": 0.82872415, + "num_input_tokens_seen": 164206672, + "router_z_loss_mlp": 0.40917969, + "step": 1973, + "time_per_iteration": 2.8325109481811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139649, + "balance_loss_mlp": 1.10081029, + "epoch": 0.3797614467102732, + "flos": 426648549888.0, + "grad_norm": 0.06503005991167149, + "language_loss": 0.86840981, + "learning_rate": 0.0007120774547468282, + "loss": 0.87980628, + "num_input_tokens_seen": 164271968, + "router_z_loss_mlp": 0.38793945, + "step": 1974, + "time_per_iteration": 2.6115715503692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148199, + "balance_loss_mlp": 1.10781133, + "epoch": 0.37995382839553676, + "flos": 481846620672.0, + "grad_norm": 0.05441443516000103, + "language_loss": 0.81729043, + "learning_rate": 0.0007117952841060128, + "loss": 0.82877243, + "num_input_tokens_seen": 164342800, + "router_z_loss_mlp": 0.40380859, + "step": 1975, + "time_per_iteration": 2.6378135681152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135454, + "balance_loss_mlp": 1.09389758, + "epoch": 0.3801462100808003, + "flos": 560562094080.0, + "grad_norm": 0.08133175482890537, + "language_loss": 0.83869064, + "learning_rate": 0.0007115130312389756, + "loss": 0.85004514, + "num_input_tokens_seen": 164414928, + "router_z_loss_mlp": 0.41552734, + "step": 1976, + "time_per_iteration": 2.664318084716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139177, + "balance_loss_mlp": 1.0974772, + "epoch": 0.3803385917660639, + "flos": 464936412672.0, + "grad_norm": 0.06620518382871708, + "language_loss": 0.79781663, + "learning_rate": 0.0007112306962552973, + "loss": 0.80920839, + "num_input_tokens_seen": 164483312, + "router_z_loss_mlp": 0.41699219, + "step": 1977, + "time_per_iteration": 2.6198599338531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129586, + "balance_loss_mlp": 1.0891974, + "epoch": 0.3805309734513274, + "flos": 521871538176.0, + "grad_norm": 0.05972767263520316, + "language_loss": 0.85605282, + "learning_rate": 0.0007109482792645896, + "loss": 0.86734867, + "num_input_tokens_seen": 164555760, + "router_z_loss_mlp": 0.40356445, + "step": 1978, + "time_per_iteration": 2.728576898574829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132218, + "balance_loss_mlp": 1.09066188, + "epoch": 0.380723355136591, + "flos": 591412783104.0, + "grad_norm": 0.09572440125940551, + "language_loss": 0.84308225, + "learning_rate": 0.0007106657803764969, + "loss": 0.85440445, + "num_input_tokens_seen": 164626768, + "router_z_loss_mlp": 0.41552734, + "step": 1979, + "time_per_iteration": 2.7279720306396484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126537, + "balance_loss_mlp": 1.08340704, + "epoch": 0.38091573682185453, + "flos": 622685988864.0, + "grad_norm": 0.05862837672704736, + "language_loss": 0.82269728, + "learning_rate": 0.0007103831997006948, + "loss": 0.83396262, + "num_input_tokens_seen": 164698016, + "router_z_loss_mlp": 0.43164062, + "step": 1980, + "time_per_iteration": 2.746915817260742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127621, + "balance_loss_mlp": 1.08663654, + "epoch": 0.3811081185071181, + "flos": 569007286272.0, + "grad_norm": 0.05821983888794681, + "language_loss": 0.85798764, + "learning_rate": 0.0007101005373468908, + "loss": 0.86926389, + "num_input_tokens_seen": 164780320, + "router_z_loss_mlp": 0.40991211, + "step": 1981, + "time_per_iteration": 2.878394365310669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131348, + "balance_loss_mlp": 1.09060264, + "epoch": 0.3813005001923817, + "flos": 584837895168.0, + "grad_norm": 0.057148713710776886, + "language_loss": 0.86977971, + "learning_rate": 0.0007098177934248242, + "loss": 0.88109326, + "num_input_tokens_seen": 164854400, + "router_z_loss_mlp": 0.40771484, + "step": 1982, + "time_per_iteration": 2.7281908988952637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142672, + "balance_loss_mlp": 1.09918451, + "epoch": 0.38149288187764524, + "flos": 621591335424.0, + "grad_norm": 0.07304374640444197, + "language_loss": 0.85583997, + "learning_rate": 0.0007095349680442661, + "loss": 0.86726665, + "num_input_tokens_seen": 164932896, + "router_z_loss_mlp": 0.43505859, + "step": 1983, + "time_per_iteration": 2.831989288330078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132213, + "balance_loss_mlp": 1.09015596, + "epoch": 0.3816852635629088, + "flos": 570690012672.0, + "grad_norm": 0.059661631452858944, + "language_loss": 0.79073238, + "learning_rate": 0.0007092520613150188, + "loss": 0.80205452, + "num_input_tokens_seen": 165002896, + "router_z_loss_mlp": 0.4206543, + "step": 1984, + "time_per_iteration": 2.6566810607910156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136369, + "balance_loss_mlp": 1.09416926, + "epoch": 0.38187764524817236, + "flos": 565585307136.0, + "grad_norm": 0.0624399969319272, + "language_loss": 0.81395054, + "learning_rate": 0.0007089690733469165, + "loss": 0.82531422, + "num_input_tokens_seen": 165074704, + "router_z_loss_mlp": 0.42236328, + "step": 1985, + "time_per_iteration": 2.713041067123413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133128, + "balance_loss_mlp": 1.09023643, + "epoch": 0.38207002693343595, + "flos": 631225156608.0, + "grad_norm": 0.0833415836593691, + "language_loss": 0.83054602, + "learning_rate": 0.000708686004249825, + "loss": 0.84187728, + "num_input_tokens_seen": 165149136, + "router_z_loss_mlp": 0.42895508, + "step": 1986, + "time_per_iteration": 2.7708489894866943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135389, + "balance_loss_mlp": 1.09311724, + "epoch": 0.3822624086186995, + "flos": 548773843968.0, + "grad_norm": 0.050231849807362665, + "language_loss": 0.91983181, + "learning_rate": 0.0007084028541336413, + "loss": 0.93118572, + "num_input_tokens_seen": 165220864, + "router_z_loss_mlp": 0.42260742, + "step": 1987, + "time_per_iteration": 2.7049031257629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135282, + "balance_loss_mlp": 1.09205675, + "epoch": 0.38245479030396307, + "flos": 613870036992.0, + "grad_norm": 0.07987509930436443, + "language_loss": 0.86416399, + "learning_rate": 0.0007081196231082942, + "loss": 0.87551689, + "num_input_tokens_seen": 165301568, + "router_z_loss_mlp": 0.43212891, + "step": 1988, + "time_per_iteration": 2.769559860229492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143055, + "balance_loss_mlp": 1.09949565, + "epoch": 0.38264717198922665, + "flos": 668089824768.0, + "grad_norm": 0.09872496004335095, + "language_loss": 0.80492568, + "learning_rate": 0.0007078363112837436, + "loss": 0.81635618, + "num_input_tokens_seen": 165373152, + "router_z_loss_mlp": 0.43579102, + "step": 1989, + "time_per_iteration": 2.836904525756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144237, + "balance_loss_mlp": 1.10065365, + "epoch": 0.3828395536744902, + "flos": 454754165760.0, + "grad_norm": 0.05755280117815587, + "language_loss": 0.85391158, + "learning_rate": 0.000707552918769981, + "loss": 0.86535394, + "num_input_tokens_seen": 165439136, + "router_z_loss_mlp": 0.43579102, + "step": 1990, + "time_per_iteration": 2.552560806274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114164, + "balance_loss_mlp": 1.09846199, + "epoch": 0.3830319353597538, + "flos": 499448788992.0, + "grad_norm": 0.058237292508227935, + "language_loss": 0.83844453, + "learning_rate": 0.000707269445677029, + "loss": 0.84986091, + "num_input_tokens_seen": 165514624, + "router_z_loss_mlp": 0.43188477, + "step": 1991, + "time_per_iteration": 2.717240571975708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155192, + "balance_loss_mlp": 1.11270583, + "epoch": 0.3832243170450173, + "flos": 744121893888.0, + "grad_norm": 0.08345502818850435, + "language_loss": 0.85774487, + "learning_rate": 0.0007069858921149416, + "loss": 0.86929679, + "num_input_tokens_seen": 165594512, + "router_z_loss_mlp": 0.42480469, + "step": 1992, + "time_per_iteration": 2.937901496887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143498, + "balance_loss_mlp": 1.10120225, + "epoch": 0.3834166987302809, + "flos": 578218590720.0, + "grad_norm": 0.06679457573221616, + "language_loss": 0.86415881, + "learning_rate": 0.0007067022581938043, + "loss": 0.87559378, + "num_input_tokens_seen": 165673968, + "router_z_loss_mlp": 0.4230957, + "step": 1993, + "time_per_iteration": 2.8283159732818604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147458, + "balance_loss_mlp": 1.10614026, + "epoch": 0.3836090804155444, + "flos": 536476442112.0, + "grad_norm": 0.06079929242541683, + "language_loss": 0.83476102, + "learning_rate": 0.0007064185440237334, + "loss": 0.84623557, + "num_input_tokens_seen": 165747664, + "router_z_loss_mlp": 0.41333008, + "step": 1994, + "time_per_iteration": 2.738664150238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148789, + "balance_loss_mlp": 1.10627878, + "epoch": 0.383801462100808, + "flos": 601879154688.0, + "grad_norm": 0.05320553517563596, + "language_loss": 0.8495338, + "learning_rate": 0.0007061347497148764, + "loss": 0.8610217, + "num_input_tokens_seen": 165824624, + "router_z_loss_mlp": 0.42504883, + "step": 1995, + "time_per_iteration": 2.7379775047302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147554, + "balance_loss_mlp": 1.10444832, + "epoch": 0.38399384378607154, + "flos": 572701280256.0, + "grad_norm": 0.059351713178290334, + "language_loss": 0.86747766, + "learning_rate": 0.0007058508753774122, + "loss": 0.87895322, + "num_input_tokens_seen": 165896304, + "router_z_loss_mlp": 0.4309082, + "step": 1996, + "time_per_iteration": 2.6882424354553223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144268, + "balance_loss_mlp": 1.10242534, + "epoch": 0.38418622547133513, + "flos": 536765709312.0, + "grad_norm": 0.08780844300106258, + "language_loss": 0.87086272, + "learning_rate": 0.0007055669211215505, + "loss": 0.88230544, + "num_input_tokens_seen": 165961312, + "router_z_loss_mlp": 0.41870117, + "step": 1997, + "time_per_iteration": 2.5902607440948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136259, + "balance_loss_mlp": 1.09236586, + "epoch": 0.3843786071565987, + "flos": 572940988416.0, + "grad_norm": 0.0743501008638896, + "language_loss": 0.77852333, + "learning_rate": 0.0007052828870575322, + "loss": 0.78988594, + "num_input_tokens_seen": 166028064, + "router_z_loss_mlp": 0.43896484, + "step": 1998, + "time_per_iteration": 2.643887519836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113691, + "balance_loss_mlp": 1.09521055, + "epoch": 0.38457098884186225, + "flos": 728703889920.0, + "grad_norm": 0.05655172042288627, + "language_loss": 0.87035221, + "learning_rate": 0.0007049987732956291, + "loss": 0.88172132, + "num_input_tokens_seen": 166110272, + "router_z_loss_mlp": 0.41723633, + "step": 1999, + "time_per_iteration": 2.9655773639678955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132979, + "balance_loss_mlp": 1.09325886, + "epoch": 0.38476337052712584, + "flos": 583422041088.0, + "grad_norm": 0.061738893850828154, + "language_loss": 0.83046496, + "learning_rate": 0.0007047145799461439, + "loss": 0.84179473, + "num_input_tokens_seen": 166193088, + "router_z_loss_mlp": 0.39746094, + "step": 2000, + "time_per_iteration": 2.8509583473205566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132875, + "balance_loss_mlp": 1.0917958, + "epoch": 0.38495575221238937, + "flos": 553060680192.0, + "grad_norm": 0.06203375299954445, + "language_loss": 0.82530397, + "learning_rate": 0.00070443030711941, + "loss": 0.83663273, + "num_input_tokens_seen": 166271776, + "router_z_loss_mlp": 0.41088867, + "step": 2001, + "time_per_iteration": 2.759324312210083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134639, + "balance_loss_mlp": 1.09386945, + "epoch": 0.38514813389765296, + "flos": 654473115648.0, + "grad_norm": 0.05757301433327453, + "language_loss": 0.83082199, + "learning_rate": 0.0007041459549257924, + "loss": 0.84216839, + "num_input_tokens_seen": 166350000, + "router_z_loss_mlp": 0.40771484, + "step": 2002, + "time_per_iteration": 2.8542449474334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121155, + "balance_loss_mlp": 1.08014655, + "epoch": 0.3853405155829165, + "flos": 868100239872.0, + "grad_norm": 0.07528883527847323, + "language_loss": 0.78547823, + "learning_rate": 0.0007038615234756859, + "loss": 0.79668975, + "num_input_tokens_seen": 166434336, + "router_z_loss_mlp": 0.41015625, + "step": 2003, + "time_per_iteration": 3.211712598800659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125228, + "balance_loss_mlp": 1.08257461, + "epoch": 0.3855328972681801, + "flos": 546424123392.0, + "grad_norm": 0.05751633762771481, + "language_loss": 0.83558142, + "learning_rate": 0.000703577012879517, + "loss": 0.84683371, + "num_input_tokens_seen": 166503952, + "router_z_loss_mlp": 0.42651367, + "step": 2004, + "time_per_iteration": 2.628211498260498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130283, + "balance_loss_mlp": 1.08956099, + "epoch": 0.3857252789534436, + "flos": 534074964480.0, + "grad_norm": 0.08619617913051347, + "language_loss": 0.89379585, + "learning_rate": 0.0007032924232477423, + "loss": 0.90509868, + "num_input_tokens_seen": 166575168, + "router_z_loss_mlp": 0.40722656, + "step": 2005, + "time_per_iteration": 2.631619930267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128848, + "balance_loss_mlp": 1.08743477, + "epoch": 0.3859176606387072, + "flos": 491764566528.0, + "grad_norm": 0.06586843636176778, + "language_loss": 0.80831605, + "learning_rate": 0.0007030077546908493, + "loss": 0.81960452, + "num_input_tokens_seen": 166647552, + "router_z_loss_mlp": 0.4140625, + "step": 2006, + "time_per_iteration": 2.6160101890563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336479, + "balance_loss_mlp": 1.3253212, + "epoch": 0.3861100423239708, + "flos": 1487052214272.0, + "grad_norm": 0.11294410837330418, + "language_loss": 0.83064663, + "learning_rate": 0.0007027230073193561, + "loss": 0.84401143, + "num_input_tokens_seen": 166875088, + "router_z_loss_mlp": 0.11181641, + "step": 2007, + "time_per_iteration": 4.7873475551605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131514, + "balance_loss_mlp": 1.09014845, + "epoch": 0.3863024240092343, + "flos": 473732540928.0, + "grad_norm": 0.06382618687285554, + "language_loss": 0.79329109, + "learning_rate": 0.0007024381812438117, + "loss": 0.8046062, + "num_input_tokens_seen": 166939344, + "router_z_loss_mlp": 0.41381836, + "step": 2008, + "time_per_iteration": 2.5387141704559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152986, + "balance_loss_mlp": 1.11390948, + "epoch": 0.3864948056944979, + "flos": 716601779712.0, + "grad_norm": 0.0811673363837608, + "language_loss": 0.83681285, + "learning_rate": 0.0007021532765747951, + "loss": 0.84834278, + "num_input_tokens_seen": 167014992, + "router_z_loss_mlp": 0.390625, + "step": 2009, + "time_per_iteration": 2.9795420169830322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164171, + "balance_loss_mlp": 1.12082672, + "epoch": 0.38668718737976143, + "flos": 727631631360.0, + "grad_norm": 0.11123688030830275, + "language_loss": 0.7961666, + "learning_rate": 0.0007018682934229162, + "loss": 0.80780828, + "num_input_tokens_seen": 167092096, + "router_z_loss_mlp": 0.43334961, + "step": 2010, + "time_per_iteration": 2.9108352661132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164881, + "balance_loss_mlp": 1.1216315, + "epoch": 0.386879569065025, + "flos": 525471556608.0, + "grad_norm": 0.07913719393788664, + "language_loss": 0.83099723, + "learning_rate": 0.0007015832318988152, + "loss": 0.842646, + "num_input_tokens_seen": 167162144, + "router_z_loss_mlp": 0.43237305, + "step": 2011, + "time_per_iteration": 2.605280637741089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082789, + "balance_loss_mlp": 1.07096386, + "epoch": 0.38707195075028855, + "flos": 1527771663360.0, + "grad_norm": 0.024547203760462325, + "language_loss": 0.73890078, + "learning_rate": 0.000701298092113163, + "loss": 0.74972868, + "num_input_tokens_seen": 167391536, + "router_z_loss_mlp": 0.11816406, + "step": 2012, + "time_per_iteration": 4.955415964126587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161774, + "balance_loss_mlp": 1.12167192, + "epoch": 0.38726433243555214, + "flos": 557313011712.0, + "grad_norm": 0.062010894867637535, + "language_loss": 0.84259552, + "learning_rate": 0.0007010128741766604, + "loss": 0.85421324, + "num_input_tokens_seen": 167466000, + "router_z_loss_mlp": 0.40112305, + "step": 2013, + "time_per_iteration": 2.738905906677246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162675, + "balance_loss_mlp": 1.12080884, + "epoch": 0.38745671412081567, + "flos": 553695740928.0, + "grad_norm": 0.08443522979585812, + "language_loss": 0.84619504, + "learning_rate": 0.0007007275782000391, + "loss": 0.85782182, + "num_input_tokens_seen": 167536144, + "router_z_loss_mlp": 0.41870117, + "step": 2014, + "time_per_iteration": 2.6049582958221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178912, + "balance_loss_mlp": 1.13528132, + "epoch": 0.38764909580607926, + "flos": 458408512512.0, + "grad_norm": 0.05901822901260885, + "language_loss": 0.84836662, + "learning_rate": 0.0007004422042940605, + "loss": 0.8601557, + "num_input_tokens_seen": 167600064, + "router_z_loss_mlp": 0.43603516, + "step": 2015, + "time_per_iteration": 2.5449817180633545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174031, + "balance_loss_mlp": 1.13106763, + "epoch": 0.38784147749134285, + "flos": 522229814784.0, + "grad_norm": 0.07137462797198264, + "language_loss": 0.89881837, + "learning_rate": 0.0007001567525695169, + "loss": 0.9105587, + "num_input_tokens_seen": 167666576, + "router_z_loss_mlp": 0.42993164, + "step": 2016, + "time_per_iteration": 2.5804128646850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191346, + "balance_loss_mlp": 1.14921737, + "epoch": 0.3880338591766064, + "flos": 666036338688.0, + "grad_norm": 0.11416128839824946, + "language_loss": 0.84030014, + "learning_rate": 0.0006998712231372303, + "loss": 0.85221362, + "num_input_tokens_seen": 167753296, + "router_z_loss_mlp": 0.42138672, + "step": 2017, + "time_per_iteration": 2.9779462814331055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182085, + "balance_loss_mlp": 1.13845432, + "epoch": 0.38822624086186996, + "flos": 593962564608.0, + "grad_norm": 0.06300984009010882, + "language_loss": 0.86622429, + "learning_rate": 0.0006995856161080532, + "loss": 0.87804508, + "num_input_tokens_seen": 167834080, + "router_z_loss_mlp": 0.43652344, + "step": 2018, + "time_per_iteration": 2.8405675888061523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160301, + "balance_loss_mlp": 1.11588371, + "epoch": 0.3884186225471335, + "flos": 612540817920.0, + "grad_norm": 0.0764923139512956, + "language_loss": 0.8250891, + "learning_rate": 0.0006992999315928679, + "loss": 0.83669221, + "num_input_tokens_seen": 167912368, + "router_z_loss_mlp": 0.44433594, + "step": 2019, + "time_per_iteration": 2.7929439544677734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146323, + "balance_loss_mlp": 1.10407472, + "epoch": 0.3886110042323971, + "flos": 607038188544.0, + "grad_norm": 0.09156853050649941, + "language_loss": 0.86159158, + "learning_rate": 0.0006990141697025871, + "loss": 0.8730548, + "num_input_tokens_seen": 167991968, + "router_z_loss_mlp": 0.42236328, + "step": 2020, + "time_per_iteration": 2.7913589477539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137863, + "balance_loss_mlp": 1.12422562, + "epoch": 0.3888033859176606, + "flos": 1528067897856.0, + "grad_norm": 0.035838926183426385, + "language_loss": 0.76359642, + "learning_rate": 0.0006987283305481533, + "loss": 0.77497506, + "num_input_tokens_seen": 168212128, + "router_z_loss_mlp": 0.13671875, + "step": 2021, + "time_per_iteration": 4.727250576019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011348, + "balance_loss_mlp": 1.09398317, + "epoch": 0.3889957676029242, + "flos": 692449689600.0, + "grad_norm": 0.0717580829802053, + "language_loss": 0.82676983, + "learning_rate": 0.0006984424142405392, + "loss": 0.8381179, + "num_input_tokens_seen": 168287440, + "router_z_loss_mlp": 0.40771484, + "step": 2022, + "time_per_iteration": 2.810420513153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130527, + "balance_loss_mlp": 1.09006715, + "epoch": 0.3891881492881878, + "flos": 515187993600.0, + "grad_norm": 0.11474151925346394, + "language_loss": 0.8263585, + "learning_rate": 0.0006981564208907474, + "loss": 0.83766377, + "num_input_tokens_seen": 168354704, + "router_z_loss_mlp": 0.40454102, + "step": 2023, + "time_per_iteration": 2.604849100112915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139234, + "balance_loss_mlp": 1.09763026, + "epoch": 0.3893805309734513, + "flos": 629050904064.0, + "grad_norm": 0.05701984367640102, + "language_loss": 0.90312237, + "learning_rate": 0.0006978703506098102, + "loss": 0.91451472, + "num_input_tokens_seen": 168424272, + "router_z_loss_mlp": 0.41601562, + "step": 2024, + "time_per_iteration": 2.7345082759857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115758, + "balance_loss_mlp": 1.11683416, + "epoch": 0.3895729126587149, + "flos": 544155895296.0, + "grad_norm": 0.06830457595999238, + "language_loss": 0.87819719, + "learning_rate": 0.00069758420350879, + "loss": 0.88977301, + "num_input_tokens_seen": 168488912, + "router_z_loss_mlp": 0.40722656, + "step": 2025, + "time_per_iteration": 2.6252336502075195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160672, + "balance_loss_mlp": 1.11689889, + "epoch": 0.38976529434397844, + "flos": 618270672384.0, + "grad_norm": 0.07405760759256953, + "language_loss": 0.8637889, + "learning_rate": 0.000697297979698779, + "loss": 0.87539566, + "num_input_tokens_seen": 168563248, + "router_z_loss_mlp": 0.43774414, + "step": 2026, + "time_per_iteration": 2.709831476211548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151416, + "balance_loss_mlp": 1.11291099, + "epoch": 0.38995767602924203, + "flos": 834882577920.0, + "grad_norm": 0.06812366476721117, + "language_loss": 0.83983821, + "learning_rate": 0.0006970116792908992, + "loss": 0.85135239, + "num_input_tokens_seen": 168648272, + "router_z_loss_mlp": 0.38500977, + "step": 2027, + "time_per_iteration": 3.0651228427886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149777, + "balance_loss_mlp": 1.10976994, + "epoch": 0.39015005771450556, + "flos": 541603542528.0, + "grad_norm": 0.06881031116362346, + "language_loss": 0.82086015, + "learning_rate": 0.000696725302396302, + "loss": 0.832358, + "num_input_tokens_seen": 168721760, + "router_z_loss_mlp": 0.39990234, + "step": 2028, + "time_per_iteration": 2.6441640853881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134129, + "balance_loss_mlp": 1.09400284, + "epoch": 0.39034243939976915, + "flos": 1007509072896.0, + "grad_norm": 0.05768401763088921, + "language_loss": 0.86036873, + "learning_rate": 0.0006964388491261692, + "loss": 0.87171006, + "num_input_tokens_seen": 168803664, + "router_z_loss_mlp": 0.40136719, + "step": 2029, + "time_per_iteration": 3.3004355430603027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129182, + "balance_loss_mlp": 1.08941352, + "epoch": 0.3905348210850327, + "flos": 679025700864.0, + "grad_norm": 0.06928638271863855, + "language_loss": 0.87596297, + "learning_rate": 0.0006961523195917114, + "loss": 0.88725477, + "num_input_tokens_seen": 168879184, + "router_z_loss_mlp": 0.39770508, + "step": 2030, + "time_per_iteration": 2.8312549591064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112269, + "balance_loss_mlp": 1.08041883, + "epoch": 0.39072720277029627, + "flos": 548882500608.0, + "grad_norm": 0.06430070846126967, + "language_loss": 0.78209358, + "learning_rate": 0.0006958657139041696, + "loss": 0.79332048, + "num_input_tokens_seen": 168957808, + "router_z_loss_mlp": 0.4230957, + "step": 2031, + "time_per_iteration": 2.789843797683716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172125, + "balance_loss_mlp": 1.1593461, + "epoch": 0.39091958445555985, + "flos": 1547737860096.0, + "grad_norm": 0.04676690558545683, + "language_loss": 0.76712966, + "learning_rate": 0.0006955790321748136, + "loss": 0.77885091, + "num_input_tokens_seen": 169194416, + "router_z_loss_mlp": 0.12792969, + "step": 2032, + "time_per_iteration": 4.9584527015686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118419, + "balance_loss_mlp": 1.07781672, + "epoch": 0.3911119661408234, + "flos": 504002497536.0, + "grad_norm": 0.06222398192409584, + "language_loss": 0.78433788, + "learning_rate": 0.0006952922745149434, + "loss": 0.79552209, + "num_input_tokens_seen": 169263552, + "router_z_loss_mlp": 0.40600586, + "step": 2033, + "time_per_iteration": 2.6696994304656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125088, + "balance_loss_mlp": 1.08288765, + "epoch": 0.391304347826087, + "flos": 557238859776.0, + "grad_norm": 0.06080690179225973, + "language_loss": 0.88040847, + "learning_rate": 0.000695005441035888, + "loss": 0.89165938, + "num_input_tokens_seen": 169333696, + "router_z_loss_mlp": 0.421875, + "step": 2034, + "time_per_iteration": 2.675685167312622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126781, + "balance_loss_mlp": 1.11333418, + "epoch": 0.3914967295113505, + "flos": 1500034235904.0, + "grad_norm": 0.02489517999219278, + "language_loss": 0.73723435, + "learning_rate": 0.0006947185318490064, + "loss": 0.74850214, + "num_input_tokens_seen": 169556416, + "router_z_loss_mlp": 0.13476562, + "step": 2035, + "time_per_iteration": 4.8780670166015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114086, + "balance_loss_mlp": 1.10006714, + "epoch": 0.3916891111966141, + "flos": 707037341184.0, + "grad_norm": 0.09902005838056731, + "language_loss": 0.81387436, + "learning_rate": 0.0006944315470656863, + "loss": 0.82528299, + "num_input_tokens_seen": 169643312, + "router_z_loss_mlp": 0.40795898, + "step": 2036, + "time_per_iteration": 3.04048228263855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132051, + "balance_loss_mlp": 1.08858752, + "epoch": 0.3918814928818776, + "flos": 556349409792.0, + "grad_norm": 0.07431126960541347, + "language_loss": 0.91352618, + "learning_rate": 0.000694144486797345, + "loss": 0.92484671, + "num_input_tokens_seen": 169712560, + "router_z_loss_mlp": 0.43432617, + "step": 2037, + "time_per_iteration": 2.692013740539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110594, + "balance_loss_mlp": 1.09695601, + "epoch": 0.3920738745671412, + "flos": 1538610992640.0, + "grad_norm": 0.027663679576331687, + "language_loss": 0.79520434, + "learning_rate": 0.0006938573511554296, + "loss": 0.8063103, + "num_input_tokens_seen": 169914912, + "router_z_loss_mlp": 0.13671875, + "step": 2038, + "time_per_iteration": 4.626150369644165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128452, + "balance_loss_mlp": 1.08796859, + "epoch": 0.39226625625240474, + "flos": 498836123136.0, + "grad_norm": 0.06987974305662424, + "language_loss": 0.90060711, + "learning_rate": 0.0006935701402514156, + "loss": 0.91189158, + "num_input_tokens_seen": 169978848, + "router_z_loss_mlp": 0.40454102, + "step": 2039, + "time_per_iteration": 2.5738487243652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099838, + "balance_loss_mlp": 1.0864867, + "epoch": 0.39245863793766833, + "flos": 1347260138496.0, + "grad_norm": 0.03469500580229188, + "language_loss": 0.73034894, + "learning_rate": 0.0006932828541968083, + "loss": 0.74134731, + "num_input_tokens_seen": 170211488, + "router_z_loss_mlp": 0.13378906, + "step": 2040, + "time_per_iteration": 4.957871437072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140825, + "balance_loss_mlp": 1.10112846, + "epoch": 0.3926510196229319, + "flos": 1346113022976.0, + "grad_norm": 0.08036310752647091, + "language_loss": 0.84965599, + "learning_rate": 0.0006929954931031422, + "loss": 0.86106431, + "num_input_tokens_seen": 170298528, + "router_z_loss_mlp": 0.39672852, + "step": 2041, + "time_per_iteration": 4.232867956161499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129686, + "balance_loss_mlp": 1.09039509, + "epoch": 0.39284340130819545, + "flos": 499587181056.0, + "grad_norm": 0.05705738410966496, + "language_loss": 0.8864727, + "learning_rate": 0.0006927080570819805, + "loss": 0.89776957, + "num_input_tokens_seen": 170365680, + "router_z_loss_mlp": 0.39282227, + "step": 2042, + "time_per_iteration": 2.6111459732055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143323, + "balance_loss_mlp": 1.10252953, + "epoch": 0.39303578299345904, + "flos": 520329775104.0, + "grad_norm": 0.08862983476083965, + "language_loss": 0.81371272, + "learning_rate": 0.0006924205462449161, + "loss": 0.82514596, + "num_input_tokens_seen": 170432224, + "router_z_loss_mlp": 0.40795898, + "step": 2043, + "time_per_iteration": 2.6160669326782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128783, + "balance_loss_mlp": 1.08932424, + "epoch": 0.39322816467872257, + "flos": 907929865728.0, + "grad_norm": 0.06601435567751561, + "language_loss": 0.82073617, + "learning_rate": 0.0006921329607035702, + "loss": 0.83202398, + "num_input_tokens_seen": 170517920, + "router_z_loss_mlp": 0.39453125, + "step": 2044, + "time_per_iteration": 3.2338860034942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121467, + "balance_loss_mlp": 1.08441699, + "epoch": 0.39342054636398616, + "flos": 517592042496.0, + "grad_norm": 0.06846789620147704, + "language_loss": 0.88441163, + "learning_rate": 0.0006918453005695938, + "loss": 0.89562631, + "num_input_tokens_seen": 170589072, + "router_z_loss_mlp": 0.37011719, + "step": 2045, + "time_per_iteration": 2.6499555110931396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135104, + "balance_loss_mlp": 1.09426332, + "epoch": 0.3936129280492497, + "flos": 547918898688.0, + "grad_norm": 0.05142411082006327, + "language_loss": 0.84655213, + "learning_rate": 0.0006915575659546662, + "loss": 0.85790318, + "num_input_tokens_seen": 170657856, + "router_z_loss_mlp": 0.40869141, + "step": 2046, + "time_per_iteration": 2.652902364730835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133716, + "balance_loss_mlp": 1.09339929, + "epoch": 0.3938053097345133, + "flos": 526113957888.0, + "grad_norm": 0.08744808643608758, + "language_loss": 0.80837369, + "learning_rate": 0.0006912697569704959, + "loss": 0.81971085, + "num_input_tokens_seen": 170723696, + "router_z_loss_mlp": 0.40307617, + "step": 2047, + "time_per_iteration": 2.6129064559936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131331, + "balance_loss_mlp": 1.09158659, + "epoch": 0.39399769141977686, + "flos": 471629869056.0, + "grad_norm": 0.07468037026935817, + "language_loss": 0.86945641, + "learning_rate": 0.0006909818737288205, + "loss": 0.88076973, + "num_input_tokens_seen": 170789536, + "router_z_loss_mlp": 0.3972168, + "step": 2048, + "time_per_iteration": 2.5576181411743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146235, + "balance_loss_mlp": 1.10632348, + "epoch": 0.3941900731050404, + "flos": 501736840704.0, + "grad_norm": 0.07110132916922086, + "language_loss": 0.81226838, + "learning_rate": 0.000690693916341406, + "loss": 0.82373071, + "num_input_tokens_seen": 170859232, + "router_z_loss_mlp": 0.39916992, + "step": 2049, + "time_per_iteration": 2.5884814262390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154156, + "balance_loss_mlp": 1.11398268, + "epoch": 0.394382454790304, + "flos": 580862347776.0, + "grad_norm": 0.05472880535545416, + "language_loss": 0.82429487, + "learning_rate": 0.0006904058849200475, + "loss": 0.83583641, + "num_input_tokens_seen": 170931568, + "router_z_loss_mlp": 0.40185547, + "step": 2050, + "time_per_iteration": 2.7662599086761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144327, + "balance_loss_mlp": 1.10565519, + "epoch": 0.3945748364755675, + "flos": 513819127296.0, + "grad_norm": 0.06127353443593348, + "language_loss": 0.85204089, + "learning_rate": 0.0006901177795765683, + "loss": 0.86348414, + "num_input_tokens_seen": 170999856, + "router_z_loss_mlp": 0.38647461, + "step": 2051, + "time_per_iteration": 2.577353000640869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011475, + "balance_loss_mlp": 1.10768366, + "epoch": 0.3947672181608311, + "flos": 593957795328.0, + "grad_norm": 0.10882102145067868, + "language_loss": 0.81508064, + "learning_rate": 0.0006898296004228213, + "loss": 0.82655561, + "num_input_tokens_seen": 171072320, + "router_z_loss_mlp": 0.39819336, + "step": 2052, + "time_per_iteration": 2.7242588996887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118361, + "balance_loss_mlp": 1.10605848, + "epoch": 0.39495959984609463, + "flos": 1547671048704.0, + "grad_norm": 0.03880030883121314, + "language_loss": 0.7812674, + "learning_rate": 0.0006895413475706873, + "loss": 0.79245102, + "num_input_tokens_seen": 171304128, + "router_z_loss_mlp": 0.12304688, + "step": 2053, + "time_per_iteration": 4.852335691452026 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160033, + "balance_loss_mlp": 1.1204555, + "epoch": 0.3951519815313582, + "flos": 496520907264.0, + "grad_norm": 0.06533456514809383, + "language_loss": 0.79943091, + "learning_rate": 0.0006892530211320763, + "loss": 0.81103128, + "num_input_tokens_seen": 171377392, + "router_z_loss_mlp": 0.39575195, + "step": 2054, + "time_per_iteration": 2.726592779159546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163981, + "balance_loss_mlp": 1.12528563, + "epoch": 0.39534436321662175, + "flos": 531191499264.0, + "grad_norm": 0.06955061494726521, + "language_loss": 0.8399905, + "learning_rate": 0.000688964621218926, + "loss": 0.85163033, + "num_input_tokens_seen": 171447424, + "router_z_loss_mlp": 0.38696289, + "step": 2055, + "time_per_iteration": 2.6089248657226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156737, + "balance_loss_mlp": 1.11737382, + "epoch": 0.39553674490188534, + "flos": 702523279872.0, + "grad_norm": 0.06754212988294535, + "language_loss": 0.80637926, + "learning_rate": 0.0006886761479432037, + "loss": 0.81794661, + "num_input_tokens_seen": 171519920, + "router_z_loss_mlp": 0.39379883, + "step": 2056, + "time_per_iteration": 2.8334691524505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169364, + "balance_loss_mlp": 1.12866604, + "epoch": 0.3957291265871489, + "flos": 409772846592.0, + "grad_norm": 0.08783588969410645, + "language_loss": 0.85058302, + "learning_rate": 0.0006883876014169045, + "loss": 0.86227667, + "num_input_tokens_seen": 171583856, + "router_z_loss_mlp": 0.40698242, + "step": 2057, + "time_per_iteration": 2.4859981536865234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163618, + "balance_loss_mlp": 1.12344468, + "epoch": 0.39592150827241246, + "flos": 618490556928.0, + "grad_norm": 0.07066278036752763, + "language_loss": 0.90527105, + "learning_rate": 0.000688098981752052, + "loss": 0.91690719, + "num_input_tokens_seen": 171656064, + "router_z_loss_mlp": 0.40161133, + "step": 2058, + "time_per_iteration": 2.737825393676758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169191, + "balance_loss_mlp": 1.12849319, + "epoch": 0.39611388995767605, + "flos": 821332680192.0, + "grad_norm": 0.08574741875980238, + "language_loss": 0.80283022, + "learning_rate": 0.0006878102890606982, + "loss": 0.81452215, + "num_input_tokens_seen": 171738800, + "router_z_loss_mlp": 0.40722656, + "step": 2059, + "time_per_iteration": 3.0589451789855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159966, + "balance_loss_mlp": 1.12034082, + "epoch": 0.3963062716429396, + "flos": 492224159232.0, + "grad_norm": 0.07158976818793618, + "language_loss": 0.81510139, + "learning_rate": 0.0006875215234549239, + "loss": 0.8267011, + "num_input_tokens_seen": 171803664, + "router_z_loss_mlp": 0.39648438, + "step": 2060, + "time_per_iteration": 2.5404529571533203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151611, + "balance_loss_mlp": 1.11150885, + "epoch": 0.39649865332820317, + "flos": 584739150336.0, + "grad_norm": 0.11168111879418678, + "language_loss": 0.86092877, + "learning_rate": 0.0006872326850468376, + "loss": 0.87244487, + "num_input_tokens_seen": 171871968, + "router_z_loss_mlp": 0.40087891, + "step": 2061, + "time_per_iteration": 2.6653215885162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153197, + "balance_loss_mlp": 1.11133087, + "epoch": 0.3966910350134667, + "flos": 458564156928.0, + "grad_norm": 0.0731410886524803, + "language_loss": 0.79433036, + "learning_rate": 0.0006869437739485762, + "loss": 0.80586231, + "num_input_tokens_seen": 171942368, + "router_z_loss_mlp": 0.41870117, + "step": 2062, + "time_per_iteration": 2.6032299995422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147299, + "balance_loss_mlp": 1.1086272, + "epoch": 0.3968834166987303, + "flos": 508632929280.0, + "grad_norm": 0.06685158443863869, + "language_loss": 0.9296748, + "learning_rate": 0.0006866547902723053, + "loss": 0.9411478, + "num_input_tokens_seen": 172012336, + "router_z_loss_mlp": 0.38647461, + "step": 2063, + "time_per_iteration": 2.676166534423828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150184, + "balance_loss_mlp": 1.11148858, + "epoch": 0.3970757983839938, + "flos": 572627128320.0, + "grad_norm": 0.10136223850880095, + "language_loss": 0.80330342, + "learning_rate": 0.000686365734130218, + "loss": 0.81480527, + "num_input_tokens_seen": 172084640, + "router_z_loss_mlp": 0.38696289, + "step": 2064, + "time_per_iteration": 2.6844232082366943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143564, + "balance_loss_mlp": 1.10420108, + "epoch": 0.3972681800692574, + "flos": 481629307392.0, + "grad_norm": 0.06083764513088428, + "language_loss": 0.84282482, + "learning_rate": 0.000686076605634536, + "loss": 0.85426044, + "num_input_tokens_seen": 172152992, + "router_z_loss_mlp": 0.39379883, + "step": 2065, + "time_per_iteration": 2.6315250396728516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156007, + "balance_loss_mlp": 1.11704922, + "epoch": 0.397460561754521, + "flos": 487927411200.0, + "grad_norm": 0.07154960647229537, + "language_loss": 0.84777498, + "learning_rate": 0.0006857874048975088, + "loss": 0.85933506, + "num_input_tokens_seen": 172219312, + "router_z_loss_mlp": 0.38964844, + "step": 2066, + "time_per_iteration": 2.651740074157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144331, + "balance_loss_mlp": 1.10298944, + "epoch": 0.3976529434397845, + "flos": 421993525248.0, + "grad_norm": 0.06215318135177391, + "language_loss": 0.87357152, + "learning_rate": 0.0006854981320314142, + "loss": 0.88501477, + "num_input_tokens_seen": 172282112, + "router_z_loss_mlp": 0.41381836, + "step": 2067, + "time_per_iteration": 2.5062263011932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150736, + "balance_loss_mlp": 1.11089611, + "epoch": 0.3978453251250481, + "flos": 545589001728.0, + "grad_norm": 0.07144157906743025, + "language_loss": 0.87282014, + "learning_rate": 0.0006852087871485579, + "loss": 0.88432747, + "num_input_tokens_seen": 172347872, + "router_z_loss_mlp": 0.3984375, + "step": 2068, + "time_per_iteration": 2.6593010425567627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141379, + "balance_loss_mlp": 1.10206354, + "epoch": 0.39803770681031164, + "flos": 650838592512.0, + "grad_norm": 0.08492249089395289, + "language_loss": 0.82224536, + "learning_rate": 0.0006849193703612735, + "loss": 0.83365911, + "num_input_tokens_seen": 172418560, + "router_z_loss_mlp": 0.39282227, + "step": 2069, + "time_per_iteration": 2.755782127380371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137654, + "balance_loss_mlp": 1.09817159, + "epoch": 0.39823008849557523, + "flos": 740072194560.0, + "grad_norm": 0.07327967142242812, + "language_loss": 0.78054988, + "learning_rate": 0.0006846298817819225, + "loss": 0.79192644, + "num_input_tokens_seen": 172497984, + "router_z_loss_mlp": 0.39477539, + "step": 2070, + "time_per_iteration": 2.987943410873413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148271, + "balance_loss_mlp": 1.10909855, + "epoch": 0.39842247018083876, + "flos": 385037452800.0, + "grad_norm": 0.08050617332568782, + "language_loss": 0.81162381, + "learning_rate": 0.0006843403215228945, + "loss": 0.82310653, + "num_input_tokens_seen": 172560112, + "router_z_loss_mlp": 0.3918457, + "step": 2071, + "time_per_iteration": 2.4827940464019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165055, + "balance_loss_mlp": 1.12585878, + "epoch": 0.39861485186610235, + "flos": 533696864256.0, + "grad_norm": 0.07083437878036915, + "language_loss": 0.80721962, + "learning_rate": 0.0006840506896966065, + "loss": 0.81887019, + "num_input_tokens_seen": 172636192, + "router_z_loss_mlp": 0.3918457, + "step": 2072, + "time_per_iteration": 2.6827309131622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166963, + "balance_loss_mlp": 1.12621748, + "epoch": 0.39880723355136594, + "flos": 643149227520.0, + "grad_norm": 0.06725102297232902, + "language_loss": 0.8278873, + "learning_rate": 0.0006837609864155038, + "loss": 0.83955693, + "num_input_tokens_seen": 172715264, + "router_z_loss_mlp": 0.40771484, + "step": 2073, + "time_per_iteration": 2.9130313396453857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116361, + "balance_loss_mlp": 1.12584436, + "epoch": 0.39899961523662947, + "flos": 515847647232.0, + "grad_norm": 0.07471059517929624, + "language_loss": 0.8375988, + "learning_rate": 0.0006834712117920592, + "loss": 0.84923482, + "num_input_tokens_seen": 172783456, + "router_z_loss_mlp": 0.37768555, + "step": 2074, + "time_per_iteration": 2.61501145362854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162616, + "balance_loss_mlp": 1.12325335, + "epoch": 0.39919199692189306, + "flos": 464385415680.0, + "grad_norm": 0.13245970923224126, + "language_loss": 0.85901093, + "learning_rate": 0.0006831813659387729, + "loss": 0.87063706, + "num_input_tokens_seen": 172848928, + "router_z_loss_mlp": 0.39331055, + "step": 2075, + "time_per_iteration": 2.563549041748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149354, + "balance_loss_mlp": 1.11075377, + "epoch": 0.3993843786071566, + "flos": 531641180160.0, + "grad_norm": 0.06732512968880089, + "language_loss": 0.84738618, + "learning_rate": 0.0006828914489681733, + "loss": 0.85887969, + "num_input_tokens_seen": 172921152, + "router_z_loss_mlp": 0.38574219, + "step": 2076, + "time_per_iteration": 2.7011008262634277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142979, + "balance_loss_mlp": 1.10440326, + "epoch": 0.3995767602924202, + "flos": 503965421568.0, + "grad_norm": 0.050728888200014394, + "language_loss": 0.85780215, + "learning_rate": 0.0006826014609928162, + "loss": 0.86923194, + "num_input_tokens_seen": 172998864, + "router_z_loss_mlp": 0.38598633, + "step": 2077, + "time_per_iteration": 2.699880838394165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026884, + "balance_loss_mlp": 1.01472485, + "epoch": 0.3997691419776837, + "flos": 1454516600832.0, + "grad_norm": 0.012471286598558728, + "language_loss": 0.83199388, + "learning_rate": 0.0006823114021252846, + "loss": 0.84226274, + "num_input_tokens_seen": 173219216, + "router_z_loss_mlp": 0.12158203, + "step": 2078, + "time_per_iteration": 4.819272518157959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112436, + "balance_loss_mlp": 1.08549809, + "epoch": 0.3999615236629473, + "flos": 530684918784.0, + "grad_norm": 0.08765386089658693, + "language_loss": 0.80571902, + "learning_rate": 0.0006820212724781896, + "loss": 0.81696254, + "num_input_tokens_seen": 173292000, + "router_z_loss_mlp": 0.38842773, + "step": 2079, + "time_per_iteration": 2.6927945613861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112693, + "balance_loss_mlp": 1.07526088, + "epoch": 0.4001539053482108, + "flos": 695130522624.0, + "grad_norm": 0.06830833334646268, + "language_loss": 0.84229112, + "learning_rate": 0.0006817310721641694, + "loss": 0.85341799, + "num_input_tokens_seen": 173365568, + "router_z_loss_mlp": 0.37402344, + "step": 2080, + "time_per_iteration": 2.8158507347106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114446, + "balance_loss_mlp": 1.07422495, + "epoch": 0.4003462870334744, + "flos": 520356939264.0, + "grad_norm": 0.0821477508940244, + "language_loss": 0.84532309, + "learning_rate": 0.00068144080129589, + "loss": 0.85646749, + "num_input_tokens_seen": 173430144, + "router_z_loss_mlp": 0.40234375, + "step": 2081, + "time_per_iteration": 2.665823221206665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111145, + "balance_loss_mlp": 1.07206321, + "epoch": 0.400538668718738, + "flos": 492518195712.0, + "grad_norm": 0.06681211266265834, + "language_loss": 0.83178174, + "learning_rate": 0.0006811504599860441, + "loss": 0.84289622, + "num_input_tokens_seen": 173494464, + "router_z_loss_mlp": 0.39379883, + "step": 2082, + "time_per_iteration": 2.517651081085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112401, + "balance_loss_mlp": 1.07382464, + "epoch": 0.40073105040400153, + "flos": 490356052992.0, + "grad_norm": 0.04646658923847655, + "language_loss": 0.86172366, + "learning_rate": 0.0006808600483473526, + "loss": 0.87284768, + "num_input_tokens_seen": 173577168, + "router_z_loss_mlp": 0.38549805, + "step": 2083, + "time_per_iteration": 2.85060715675354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106871, + "balance_loss_mlp": 1.06743646, + "epoch": 0.4009234320892651, + "flos": 562378070016.0, + "grad_norm": 0.05030907040360332, + "language_loss": 0.86459124, + "learning_rate": 0.0006805695664925629, + "loss": 0.87565994, + "num_input_tokens_seen": 173655632, + "router_z_loss_mlp": 0.39379883, + "step": 2084, + "time_per_iteration": 2.775911808013916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117346, + "balance_loss_mlp": 1.07810271, + "epoch": 0.40111581377452865, + "flos": 425998808064.0, + "grad_norm": 0.06453737345570608, + "language_loss": 0.84040797, + "learning_rate": 0.0006802790145344506, + "loss": 0.85158145, + "num_input_tokens_seen": 173719040, + "router_z_loss_mlp": 0.39233398, + "step": 2085, + "time_per_iteration": 2.4470229148864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112227, + "balance_loss_mlp": 1.08459997, + "epoch": 0.40130819545979224, + "flos": 612446842368.0, + "grad_norm": 0.07025741726477988, + "language_loss": 0.87659204, + "learning_rate": 0.0006799883925858176, + "loss": 0.8878147, + "num_input_tokens_seen": 173796704, + "router_z_loss_mlp": 0.37646484, + "step": 2086, + "time_per_iteration": 2.861490249633789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136148, + "balance_loss_mlp": 1.09709549, + "epoch": 0.40150057714505577, + "flos": 523433124864.0, + "grad_norm": 0.06341077230687828, + "language_loss": 0.85575259, + "learning_rate": 0.0006796977007594933, + "loss": 0.86711407, + "num_input_tokens_seen": 173862352, + "router_z_loss_mlp": 0.39038086, + "step": 2087, + "time_per_iteration": 2.619633197784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150049, + "balance_loss_mlp": 1.10920811, + "epoch": 0.40169295883031936, + "flos": 561424379904.0, + "grad_norm": 0.0625455511079972, + "language_loss": 0.86956239, + "learning_rate": 0.0006794069391683345, + "loss": 0.88106287, + "num_input_tokens_seen": 173935408, + "router_z_loss_mlp": 0.40844727, + "step": 2088, + "time_per_iteration": 4.210111618041992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145795, + "balance_loss_mlp": 1.10683715, + "epoch": 0.4018853405155829, + "flos": 518997984768.0, + "grad_norm": 0.0705312667092641, + "language_loss": 0.81334388, + "learning_rate": 0.0006791161079252248, + "loss": 0.8248018, + "num_input_tokens_seen": 174007152, + "router_z_loss_mlp": 0.38916016, + "step": 2089, + "time_per_iteration": 2.614766836166382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141064, + "balance_loss_mlp": 1.10286903, + "epoch": 0.4020777222008465, + "flos": 526222614528.0, + "grad_norm": 0.084499094041807, + "language_loss": 0.82758236, + "learning_rate": 0.0006788252071430747, + "loss": 0.83899295, + "num_input_tokens_seen": 174074976, + "router_z_loss_mlp": 0.38183594, + "step": 2090, + "time_per_iteration": 2.617656707763672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135863, + "balance_loss_mlp": 1.09490228, + "epoch": 0.40227010388611006, + "flos": 525763021824.0, + "grad_norm": 0.0700927477934208, + "language_loss": 0.8703053, + "learning_rate": 0.0006785342369348222, + "loss": 0.88166392, + "num_input_tokens_seen": 174149392, + "router_z_loss_mlp": 0.40942383, + "step": 2091, + "time_per_iteration": 2.7607271671295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122396, + "balance_loss_mlp": 1.08513117, + "epoch": 0.4024624855713736, + "flos": 432304252416.0, + "grad_norm": 0.09140990562062702, + "language_loss": 0.8009733, + "learning_rate": 0.0006782431974134316, + "loss": 0.81219733, + "num_input_tokens_seen": 174214656, + "router_z_loss_mlp": 0.37280273, + "step": 2092, + "time_per_iteration": 2.5610032081604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118064, + "balance_loss_mlp": 1.07889199, + "epoch": 0.4026548672566372, + "flos": 766660640256.0, + "grad_norm": 0.054626907115785994, + "language_loss": 0.89608824, + "learning_rate": 0.0006779520886918949, + "loss": 0.90726894, + "num_input_tokens_seen": 174296064, + "router_z_loss_mlp": 0.3918457, + "step": 2093, + "time_per_iteration": 3.064581871032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110103, + "balance_loss_mlp": 1.07279015, + "epoch": 0.4028472489419007, + "flos": 642931914240.0, + "grad_norm": 0.057101365791561574, + "language_loss": 0.81741238, + "learning_rate": 0.0006776609108832301, + "loss": 0.82851338, + "num_input_tokens_seen": 174370896, + "router_z_loss_mlp": 0.37304688, + "step": 2094, + "time_per_iteration": 2.77875018119812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100298, + "balance_loss_mlp": 1.06403446, + "epoch": 0.4030396306271643, + "flos": 491838718464.0, + "grad_norm": 0.06401566733015203, + "language_loss": 0.85612595, + "learning_rate": 0.0006773696641004828, + "loss": 0.86712897, + "num_input_tokens_seen": 174438448, + "router_z_loss_mlp": 0.36254883, + "step": 2095, + "time_per_iteration": 2.5543506145477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101796, + "balance_loss_mlp": 1.06522298, + "epoch": 0.40323201231242783, + "flos": 901728308736.0, + "grad_norm": 0.06439261414673134, + "language_loss": 0.77821416, + "learning_rate": 0.0006770783484567247, + "loss": 0.78923213, + "num_input_tokens_seen": 174525952, + "router_z_loss_mlp": 0.36572266, + "step": 2096, + "time_per_iteration": 3.14194393157959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114941, + "balance_loss_mlp": 1.07862973, + "epoch": 0.4034243939976914, + "flos": 570558961152.0, + "grad_norm": 0.051673087984505275, + "language_loss": 0.86408114, + "learning_rate": 0.000676786964065055, + "loss": 0.87523055, + "num_input_tokens_seen": 174607200, + "router_z_loss_mlp": 0.36328125, + "step": 2097, + "time_per_iteration": 2.796668529510498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109226, + "balance_loss_mlp": 1.07270014, + "epoch": 0.403616775682955, + "flos": 507456783360.0, + "grad_norm": 0.07558073774647381, + "language_loss": 0.79608446, + "learning_rate": 0.0006764955110385986, + "loss": 0.80717671, + "num_input_tokens_seen": 174680976, + "router_z_loss_mlp": 0.36547852, + "step": 2098, + "time_per_iteration": 2.721588134765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117865, + "balance_loss_mlp": 1.07998002, + "epoch": 0.40380915736821854, + "flos": 519383425536.0, + "grad_norm": 0.06754969850087679, + "language_loss": 0.80409288, + "learning_rate": 0.0006762039894905083, + "loss": 0.8152715, + "num_input_tokens_seen": 174753152, + "router_z_loss_mlp": 0.37890625, + "step": 2099, + "time_per_iteration": 2.6286327838897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126862, + "balance_loss_mlp": 1.08728456, + "epoch": 0.40400153905348213, + "flos": 441925590528.0, + "grad_norm": 0.06639046911061866, + "language_loss": 0.80760598, + "learning_rate": 0.000675912399533962, + "loss": 0.8188746, + "num_input_tokens_seen": 174817184, + "router_z_loss_mlp": 0.39599609, + "step": 2100, + "time_per_iteration": 2.5150249004364014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110396, + "balance_loss_mlp": 1.07420361, + "epoch": 0.40419392073874566, + "flos": 772309002240.0, + "grad_norm": 0.05652757132031041, + "language_loss": 0.85431337, + "learning_rate": 0.0006756207412821656, + "loss": 0.86541736, + "num_input_tokens_seen": 174898128, + "router_z_loss_mlp": 0.36206055, + "step": 2101, + "time_per_iteration": 2.9816384315490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107271, + "balance_loss_mlp": 1.06962454, + "epoch": 0.40438630242400925, + "flos": 766569235968.0, + "grad_norm": 0.08079981189537652, + "language_loss": 0.81269771, + "learning_rate": 0.0006753290148483505, + "loss": 0.8237704, + "num_input_tokens_seen": 174981872, + "router_z_loss_mlp": 0.37670898, + "step": 2102, + "time_per_iteration": 3.0291824340820312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111488, + "balance_loss_mlp": 1.07458103, + "epoch": 0.4045786841092728, + "flos": 415235828736.0, + "grad_norm": 0.07115498960503684, + "language_loss": 0.79040611, + "learning_rate": 0.0006750372203457752, + "loss": 0.80152106, + "num_input_tokens_seen": 175044976, + "router_z_loss_mlp": 0.36914062, + "step": 2103, + "time_per_iteration": 2.5193490982055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111376, + "balance_loss_mlp": 1.07458746, + "epoch": 0.40477106579453637, + "flos": 539214174720.0, + "grad_norm": 0.049732783973711246, + "language_loss": 0.87039417, + "learning_rate": 0.0006747453578877242, + "loss": 0.88150793, + "num_input_tokens_seen": 175121104, + "router_z_loss_mlp": 0.36767578, + "step": 2104, + "time_per_iteration": 2.691030979156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116651, + "balance_loss_mlp": 1.07998228, + "epoch": 0.4049634474797999, + "flos": 826704258048.0, + "grad_norm": 0.06592833756650988, + "language_loss": 0.83420014, + "learning_rate": 0.0006744534275875085, + "loss": 0.8453666, + "num_input_tokens_seen": 175194512, + "router_z_loss_mlp": 0.36669922, + "step": 2105, + "time_per_iteration": 2.9842946529388428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118356, + "balance_loss_mlp": 1.08099532, + "epoch": 0.4051558291650635, + "flos": 572684027904.0, + "grad_norm": 0.07270624559080442, + "language_loss": 0.85434729, + "learning_rate": 0.0006741614295584657, + "loss": 0.86553085, + "num_input_tokens_seen": 175264176, + "router_z_loss_mlp": 0.3737793, + "step": 2106, + "time_per_iteration": 2.63811993598938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117316, + "balance_loss_mlp": 1.08057594, + "epoch": 0.4053482108503271, + "flos": 731881391616.0, + "grad_norm": 0.05922552771988275, + "language_loss": 0.78890157, + "learning_rate": 0.0006738693639139595, + "loss": 0.80007476, + "num_input_tokens_seen": 175347488, + "router_z_loss_mlp": 0.36743164, + "step": 2107, + "time_per_iteration": 2.9618351459503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116371, + "balance_loss_mlp": 1.07746077, + "epoch": 0.4055405925355906, + "flos": 1213059193344.0, + "grad_norm": 0.06915522511623486, + "language_loss": 0.77808583, + "learning_rate": 0.0006735772307673796, + "loss": 0.78924954, + "num_input_tokens_seen": 175438336, + "router_z_loss_mlp": 0.38916016, + "step": 2108, + "time_per_iteration": 3.575981855392456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111094, + "balance_loss_mlp": 1.07380557, + "epoch": 0.4057329742208542, + "flos": 715863204864.0, + "grad_norm": 0.06309901973905298, + "language_loss": 0.83742046, + "learning_rate": 0.0006732850302321421, + "loss": 0.84853137, + "num_input_tokens_seen": 175510912, + "router_z_loss_mlp": 0.37280273, + "step": 2109, + "time_per_iteration": 3.045565605163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114415, + "balance_loss_mlp": 1.0778178, + "epoch": 0.4059253559061177, + "flos": 564888577536.0, + "grad_norm": 0.060704196703692835, + "language_loss": 0.84782875, + "learning_rate": 0.00067299276242169, + "loss": 0.85897285, + "num_input_tokens_seen": 175583040, + "router_z_loss_mlp": 0.3659668, + "step": 2110, + "time_per_iteration": 2.6868693828582764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047442, + "balance_loss_mlp": 1.03666544, + "epoch": 0.4061177375913813, + "flos": 1593744450048.0, + "grad_norm": 0.029253972882140933, + "language_loss": 0.74382168, + "learning_rate": 0.0006727004274494908, + "loss": 0.75429612, + "num_input_tokens_seen": 175817952, + "router_z_loss_mlp": 0.10791016, + "step": 2111, + "time_per_iteration": 4.918604612350464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110434, + "balance_loss_mlp": 1.07281184, + "epoch": 0.40631011927664484, + "flos": 615421711872.0, + "grad_norm": 0.06207465310904933, + "language_loss": 0.78018594, + "learning_rate": 0.0006724080254290395, + "loss": 0.79129028, + "num_input_tokens_seen": 175896352, + "router_z_loss_mlp": 0.37597656, + "step": 2112, + "time_per_iteration": 2.798377752304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116483, + "balance_loss_mlp": 1.08012438, + "epoch": 0.40650250096190843, + "flos": 557661376512.0, + "grad_norm": 0.07195778929743761, + "language_loss": 0.89838338, + "learning_rate": 0.0006721155564738566, + "loss": 0.90954828, + "num_input_tokens_seen": 175967152, + "router_z_loss_mlp": 0.36401367, + "step": 2113, + "time_per_iteration": 2.721280813217163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034826, + "balance_loss_mlp": 1.02395451, + "epoch": 0.40669488264717196, + "flos": 1580147564544.0, + "grad_norm": 0.019551827625956694, + "language_loss": 0.78622639, + "learning_rate": 0.0006718230206974884, + "loss": 0.79657471, + "num_input_tokens_seen": 176205248, + "router_z_loss_mlp": 0.10888672, + "step": 2114, + "time_per_iteration": 4.956322193145752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110507, + "balance_loss_mlp": 1.07269359, + "epoch": 0.40688726433243555, + "flos": 507649503744.0, + "grad_norm": 0.052092004512661015, + "language_loss": 0.85970294, + "learning_rate": 0.0006715304182135078, + "loss": 0.87080801, + "num_input_tokens_seen": 176276208, + "router_z_loss_mlp": 0.37792969, + "step": 2115, + "time_per_iteration": 2.611116647720337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114836, + "balance_loss_mlp": 1.07611692, + "epoch": 0.40707964601769914, + "flos": 589075172352.0, + "grad_norm": 0.051206353593090614, + "language_loss": 0.89130676, + "learning_rate": 0.0006712377491355127, + "loss": 0.90245515, + "num_input_tokens_seen": 176355072, + "router_z_loss_mlp": 0.38696289, + "step": 2116, + "time_per_iteration": 2.8788397312164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120118, + "balance_loss_mlp": 1.0829246, + "epoch": 0.40727202770296267, + "flos": 580437259776.0, + "grad_norm": 0.049235441975469474, + "language_loss": 0.81475073, + "learning_rate": 0.0006709450135771274, + "loss": 0.82595193, + "num_input_tokens_seen": 176444592, + "router_z_loss_mlp": 0.37182617, + "step": 2117, + "time_per_iteration": 2.944436550140381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118262, + "balance_loss_mlp": 1.08233273, + "epoch": 0.40746440938822626, + "flos": 504076649472.0, + "grad_norm": 0.05682697017745506, + "language_loss": 0.86693907, + "learning_rate": 0.0006706522116520023, + "loss": 0.87812167, + "num_input_tokens_seen": 176516144, + "router_z_loss_mlp": 0.35913086, + "step": 2118, + "time_per_iteration": 2.6161422729492188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125881, + "balance_loss_mlp": 1.08766294, + "epoch": 0.4076567910734898, + "flos": 605600312832.0, + "grad_norm": 0.060179733914174166, + "language_loss": 0.83147317, + "learning_rate": 0.0006703593434738127, + "loss": 0.84273201, + "num_input_tokens_seen": 176585712, + "router_z_loss_mlp": 0.38208008, + "step": 2119, + "time_per_iteration": 2.719313383102417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123164, + "balance_loss_mlp": 1.0857563, + "epoch": 0.4078491727587534, + "flos": 479553799680.0, + "grad_norm": 0.06825324786035328, + "language_loss": 0.78421569, + "learning_rate": 0.0006700664091562604, + "loss": 0.79544735, + "num_input_tokens_seen": 176654736, + "router_z_loss_mlp": 0.37402344, + "step": 2120, + "time_per_iteration": 2.569246530532837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125736, + "balance_loss_mlp": 1.09090257, + "epoch": 0.4080415544440169, + "flos": 510384665088.0, + "grad_norm": 0.051920603902655335, + "language_loss": 0.85211694, + "learning_rate": 0.0006697734088130725, + "loss": 0.86337435, + "num_input_tokens_seen": 176722800, + "router_z_loss_mlp": 0.34863281, + "step": 2121, + "time_per_iteration": 2.67394757270813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124636, + "balance_loss_mlp": 1.08732319, + "epoch": 0.4082339361292805, + "flos": 734638947840.0, + "grad_norm": 0.05791753235244458, + "language_loss": 0.85750419, + "learning_rate": 0.0006694803425580018, + "loss": 0.86875051, + "num_input_tokens_seen": 176800320, + "router_z_loss_mlp": 0.37304688, + "step": 2122, + "time_per_iteration": 2.9812121391296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129477, + "balance_loss_mlp": 1.09178257, + "epoch": 0.4084263178145441, + "flos": 457472074752.0, + "grad_norm": 0.06590998571054847, + "language_loss": 0.84986377, + "learning_rate": 0.0006691872105048268, + "loss": 0.86115849, + "num_input_tokens_seen": 176867440, + "router_z_loss_mlp": 0.37646484, + "step": 2123, + "time_per_iteration": 2.56272292137146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137814, + "balance_loss_mlp": 1.10157394, + "epoch": 0.4086186994998076, + "flos": 562931638272.0, + "grad_norm": 0.05742584890727743, + "language_loss": 0.84864831, + "learning_rate": 0.0006688940127673513, + "loss": 0.86002642, + "num_input_tokens_seen": 176942048, + "router_z_loss_mlp": 0.36254883, + "step": 2124, + "time_per_iteration": 2.6935954093933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113197, + "balance_loss_mlp": 1.09642184, + "epoch": 0.4088110811850712, + "flos": 573669651456.0, + "grad_norm": 0.05672589959491125, + "language_loss": 0.85339016, + "learning_rate": 0.0006686007494594049, + "loss": 0.86470985, + "num_input_tokens_seen": 177025104, + "router_z_loss_mlp": 0.35571289, + "step": 2125, + "time_per_iteration": 2.8291172981262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128385, + "balance_loss_mlp": 1.09097719, + "epoch": 0.40900346287033473, + "flos": 456930989568.0, + "grad_norm": 0.06786502616833631, + "language_loss": 0.81025755, + "learning_rate": 0.0006683074206948425, + "loss": 0.82154143, + "num_input_tokens_seen": 177089296, + "router_z_loss_mlp": 0.37402344, + "step": 2126, + "time_per_iteration": 2.5193305015563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126549, + "balance_loss_mlp": 1.09095287, + "epoch": 0.4091958445555983, + "flos": 617395903488.0, + "grad_norm": 0.06065849070073351, + "language_loss": 0.81971312, + "learning_rate": 0.0006680140265875443, + "loss": 0.83097857, + "num_input_tokens_seen": 177163648, + "router_z_loss_mlp": 0.35595703, + "step": 2127, + "time_per_iteration": 2.8254714012145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127015, + "balance_loss_mlp": 1.09184861, + "epoch": 0.40938822624086185, + "flos": 472400750592.0, + "grad_norm": 0.054477830354085016, + "language_loss": 0.95947516, + "learning_rate": 0.0006677205672514162, + "loss": 0.97074527, + "num_input_tokens_seen": 177233856, + "router_z_loss_mlp": 0.35205078, + "step": 2128, + "time_per_iteration": 2.6226608753204346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120165, + "balance_loss_mlp": 1.0867151, + "epoch": 0.40958060792612544, + "flos": 570010535424.0, + "grad_norm": 0.047090391860463696, + "language_loss": 0.88730562, + "learning_rate": 0.000667427042800389, + "loss": 0.8985073, + "num_input_tokens_seen": 177309824, + "router_z_loss_mlp": 0.3347168, + "step": 2129, + "time_per_iteration": 2.7718160152435303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118491, + "balance_loss_mlp": 1.0833478, + "epoch": 0.40977298961138897, + "flos": 609346063872.0, + "grad_norm": 0.05934025192817406, + "language_loss": 0.83200449, + "learning_rate": 0.0006671334533484192, + "loss": 0.84318936, + "num_input_tokens_seen": 177380592, + "router_z_loss_mlp": 0.3515625, + "step": 2130, + "time_per_iteration": 2.7164061069488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126113, + "balance_loss_mlp": 1.09199548, + "epoch": 0.40996537129665256, + "flos": 581744457216.0, + "grad_norm": 0.04849724567471186, + "language_loss": 0.83465552, + "learning_rate": 0.0006668397990094881, + "loss": 0.84591663, + "num_input_tokens_seen": 177454720, + "router_z_loss_mlp": 0.34130859, + "step": 2131, + "time_per_iteration": 2.684115171432495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124098, + "balance_loss_mlp": 1.08738196, + "epoch": 0.41015775298191615, + "flos": 516546948096.0, + "grad_norm": 0.059898700524732326, + "language_loss": 0.84854865, + "learning_rate": 0.0006665460798976027, + "loss": 0.85978961, + "num_input_tokens_seen": 177528224, + "router_z_loss_mlp": 0.3671875, + "step": 2132, + "time_per_iteration": 2.748350143432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114613, + "balance_loss_mlp": 1.07899356, + "epoch": 0.4103501346671797, + "flos": 510354929664.0, + "grad_norm": 0.057665198388541644, + "language_loss": 0.81392014, + "learning_rate": 0.0006662522961267947, + "loss": 0.82506627, + "num_input_tokens_seen": 177598176, + "router_z_loss_mlp": 0.35620117, + "step": 2133, + "time_per_iteration": 2.696699619293213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117281, + "balance_loss_mlp": 1.08192313, + "epoch": 0.41054251635244327, + "flos": 549752126976.0, + "grad_norm": 0.05272213252392562, + "language_loss": 0.87773252, + "learning_rate": 0.0006659584478111211, + "loss": 0.88890535, + "num_input_tokens_seen": 177675840, + "router_z_loss_mlp": 0.35351562, + "step": 2134, + "time_per_iteration": 2.793302536010742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118998, + "balance_loss_mlp": 1.08249605, + "epoch": 0.4107348980377068, + "flos": 839898450432.0, + "grad_norm": 0.06878890228068688, + "language_loss": 0.83315176, + "learning_rate": 0.000665664535064664, + "loss": 0.84434175, + "num_input_tokens_seen": 177751376, + "router_z_loss_mlp": 0.36499023, + "step": 2135, + "time_per_iteration": 3.0627260208129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104736, + "balance_loss_mlp": 1.06987929, + "epoch": 0.4109272797229704, + "flos": 503708461056.0, + "grad_norm": 0.05984370507865806, + "language_loss": 0.83017695, + "learning_rate": 0.0006653705580015303, + "loss": 0.84122425, + "num_input_tokens_seen": 177825264, + "router_z_loss_mlp": 0.34863281, + "step": 2136, + "time_per_iteration": 2.6851253509521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103645, + "balance_loss_mlp": 1.0668807, + "epoch": 0.4111196614082339, + "flos": 610830927360.0, + "grad_norm": 0.07790160743926922, + "language_loss": 0.86554241, + "learning_rate": 0.0006650765167358523, + "loss": 0.87657887, + "num_input_tokens_seen": 177901680, + "router_z_loss_mlp": 0.36743164, + "step": 2137, + "time_per_iteration": 2.7750425338745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111084, + "balance_loss_mlp": 1.07579243, + "epoch": 0.4113120430934975, + "flos": 453165414912.0, + "grad_norm": 0.06074101962252474, + "language_loss": 0.9028185, + "learning_rate": 0.0006647824113817864, + "loss": 0.91392696, + "num_input_tokens_seen": 177965264, + "router_z_loss_mlp": 0.3503418, + "step": 2138, + "time_per_iteration": 2.5466508865356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120271, + "balance_loss_mlp": 1.08348298, + "epoch": 0.41150442477876104, + "flos": 541600971264.0, + "grad_norm": 0.0860402389983067, + "language_loss": 0.81677365, + "learning_rate": 0.000664488242053515, + "loss": 0.82797635, + "num_input_tokens_seen": 178039712, + "router_z_loss_mlp": 0.36767578, + "step": 2139, + "time_per_iteration": 2.7149875164031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114944, + "balance_loss_mlp": 1.08108878, + "epoch": 0.4116968064640246, + "flos": 576291386880.0, + "grad_norm": 0.05168082296105111, + "language_loss": 0.83784723, + "learning_rate": 0.0006641940088652445, + "loss": 0.84899676, + "num_input_tokens_seen": 178114080, + "router_z_loss_mlp": 0.33886719, + "step": 2140, + "time_per_iteration": 2.7871952056884766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118503, + "balance_loss_mlp": 1.08130932, + "epoch": 0.4118891881492882, + "flos": 496115642880.0, + "grad_norm": 0.07036618696819374, + "language_loss": 0.8248812, + "learning_rate": 0.0006638997119312065, + "loss": 0.83606619, + "num_input_tokens_seen": 178188032, + "router_z_loss_mlp": 0.37207031, + "step": 2141, + "time_per_iteration": 2.7391679286956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038617, + "balance_loss_mlp": 1.02841258, + "epoch": 0.41208156983455174, + "flos": 1538395877376.0, + "grad_norm": 0.01970513212166274, + "language_loss": 0.75063306, + "learning_rate": 0.0006636053513656568, + "loss": 0.76101923, + "num_input_tokens_seen": 178395328, + "router_z_loss_mlp": 0.10205078, + "step": 2142, + "time_per_iteration": 4.920190095901489 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113096, + "balance_loss_mlp": 1.09562647, + "epoch": 0.41227395151981533, + "flos": 584968946688.0, + "grad_norm": 0.07114532863779677, + "language_loss": 0.8524918, + "learning_rate": 0.000663310927282877, + "loss": 0.86380136, + "num_input_tokens_seen": 178471952, + "router_z_loss_mlp": 0.35327148, + "step": 2143, + "time_per_iteration": 2.762634515762329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126888, + "balance_loss_mlp": 1.09098172, + "epoch": 0.41246633320507886, + "flos": 442926268416.0, + "grad_norm": 0.06302616573108136, + "language_loss": 0.86451441, + "learning_rate": 0.000663016439797172, + "loss": 0.87578332, + "num_input_tokens_seen": 178542192, + "router_z_loss_mlp": 0.35913086, + "step": 2144, + "time_per_iteration": 2.623093366622925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117577, + "balance_loss_mlp": 1.082816, + "epoch": 0.41265871489034245, + "flos": 579962985984.0, + "grad_norm": 0.054034946771414454, + "language_loss": 0.80777407, + "learning_rate": 0.0006627218890228724, + "loss": 0.81894982, + "num_input_tokens_seen": 178622736, + "router_z_loss_mlp": 0.34765625, + "step": 2145, + "time_per_iteration": 2.79042911529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118351, + "balance_loss_mlp": 1.08373237, + "epoch": 0.412851096575606, + "flos": 761229964800.0, + "grad_norm": 0.06837741268569841, + "language_loss": 0.83587825, + "learning_rate": 0.0006624272750743326, + "loss": 0.84706175, + "num_input_tokens_seen": 178705808, + "router_z_loss_mlp": 0.34643555, + "step": 2146, + "time_per_iteration": 3.0066065788269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110844, + "balance_loss_mlp": 1.07591534, + "epoch": 0.41304347826086957, + "flos": 555353501184.0, + "grad_norm": 0.052525216454956766, + "language_loss": 0.83126348, + "learning_rate": 0.0006621325980659322, + "loss": 0.84237194, + "num_input_tokens_seen": 178781200, + "router_z_loss_mlp": 0.34912109, + "step": 2147, + "time_per_iteration": 2.77634334564209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110797, + "balance_loss_mlp": 1.07429504, + "epoch": 0.41323585994613315, + "flos": 665712940032.0, + "grad_norm": 0.06743799661442922, + "language_loss": 0.82004929, + "learning_rate": 0.000661837858112075, + "loss": 0.83115721, + "num_input_tokens_seen": 178855072, + "router_z_loss_mlp": 0.36499023, + "step": 2148, + "time_per_iteration": 2.8309879302978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108806, + "balance_loss_mlp": 1.07156515, + "epoch": 0.4134282416313967, + "flos": 548699692032.0, + "grad_norm": 0.060878143567582824, + "language_loss": 0.88845801, + "learning_rate": 0.0006615430553271888, + "loss": 0.89954603, + "num_input_tokens_seen": 178927936, + "router_z_loss_mlp": 0.37231445, + "step": 2149, + "time_per_iteration": 2.7831413745880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110838, + "balance_loss_mlp": 1.0737617, + "epoch": 0.4136206233166603, + "flos": 646262489088.0, + "grad_norm": 0.05890657915946428, + "language_loss": 0.85358977, + "learning_rate": 0.0006612481898257264, + "loss": 0.86467361, + "num_input_tokens_seen": 179007792, + "router_z_loss_mlp": 0.34619141, + "step": 2150, + "time_per_iteration": 2.8594231605529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116144, + "balance_loss_mlp": 1.08021438, + "epoch": 0.4138130050019238, + "flos": 517354905600.0, + "grad_norm": 0.0708787663681645, + "language_loss": 0.85383213, + "learning_rate": 0.000660953261722165, + "loss": 0.86499357, + "num_input_tokens_seen": 179075200, + "router_z_loss_mlp": 0.359375, + "step": 2151, + "time_per_iteration": 2.616218090057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110269, + "balance_loss_mlp": 1.07512605, + "epoch": 0.4140053866871874, + "flos": 609254659584.0, + "grad_norm": 0.05740780888166335, + "language_loss": 0.82834315, + "learning_rate": 0.0006606582711310055, + "loss": 0.83944577, + "num_input_tokens_seen": 179144448, + "router_z_loss_mlp": 0.3515625, + "step": 2152, + "time_per_iteration": 2.752922773361206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116071, + "balance_loss_mlp": 1.07918727, + "epoch": 0.4141977683724509, + "flos": 579762925056.0, + "grad_norm": 0.062483875204726216, + "language_loss": 0.83428371, + "learning_rate": 0.0006603632181667736, + "loss": 0.84544444, + "num_input_tokens_seen": 179215776, + "router_z_loss_mlp": 0.36865234, + "step": 2153, + "time_per_iteration": 2.6699299812316895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093007, + "balance_loss_mlp": 1.0828501, + "epoch": 0.4143901500577145, + "flos": 1307312317440.0, + "grad_norm": 0.03944020407638644, + "language_loss": 0.78943324, + "learning_rate": 0.0006600681029440187, + "loss": 0.8003633, + "num_input_tokens_seen": 179436688, + "router_z_loss_mlp": 0.1015625, + "step": 2154, + "time_per_iteration": 4.931839227676392 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117724, + "balance_loss_mlp": 1.0825572, + "epoch": 0.41458253174297804, + "flos": 460189983744.0, + "grad_norm": 0.08977793466970029, + "language_loss": 0.82004881, + "learning_rate": 0.0006597729255773153, + "loss": 0.83122605, + "num_input_tokens_seen": 179503264, + "router_z_loss_mlp": 0.3515625, + "step": 2155, + "time_per_iteration": 2.5100300312042236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114058, + "balance_loss_mlp": 1.07769895, + "epoch": 0.41477491342824163, + "flos": 553364628480.0, + "grad_norm": 0.24704033930451297, + "language_loss": 0.82534748, + "learning_rate": 0.0006594776861812608, + "loss": 0.83648813, + "num_input_tokens_seen": 179574864, + "router_z_loss_mlp": 0.36328125, + "step": 2156, + "time_per_iteration": 2.652275562286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124067, + "balance_loss_mlp": 1.0867784, + "epoch": 0.4149672951135052, + "flos": 697771708416.0, + "grad_norm": 0.053182178449683815, + "language_loss": 0.86615425, + "learning_rate": 0.0006591823848704776, + "loss": 0.87739491, + "num_input_tokens_seen": 179658208, + "router_z_loss_mlp": 0.37280273, + "step": 2157, + "time_per_iteration": 2.958137273788452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123111, + "balance_loss_mlp": 1.08653796, + "epoch": 0.41515967679876875, + "flos": 565750863360.0, + "grad_norm": 0.05319975052329094, + "language_loss": 0.81529272, + "learning_rate": 0.0006588870217596117, + "loss": 0.82652378, + "num_input_tokens_seen": 179732320, + "router_z_loss_mlp": 0.36572266, + "step": 2158, + "time_per_iteration": 2.739755392074585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136072, + "balance_loss_mlp": 1.09847283, + "epoch": 0.41535205848403234, + "flos": 501185843712.0, + "grad_norm": 0.06859141393857857, + "language_loss": 0.85955006, + "learning_rate": 0.0006585915969633334, + "loss": 0.87091076, + "num_input_tokens_seen": 179801616, + "router_z_loss_mlp": 0.37573242, + "step": 2159, + "time_per_iteration": 2.561397075653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138332, + "balance_loss_mlp": 1.1019969, + "epoch": 0.41554444016929587, + "flos": 607554680832.0, + "grad_norm": 0.06079365960323944, + "language_loss": 0.89314306, + "learning_rate": 0.0006582961105963366, + "loss": 0.90452635, + "num_input_tokens_seen": 179876112, + "router_z_loss_mlp": 0.36328125, + "step": 2160, + "time_per_iteration": 2.791609287261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141319, + "balance_loss_mlp": 1.10546052, + "epoch": 0.41573682185455946, + "flos": 529115991552.0, + "grad_norm": 0.06462553372591408, + "language_loss": 0.77976739, + "learning_rate": 0.0006580005627733395, + "loss": 0.79118055, + "num_input_tokens_seen": 179949936, + "router_z_loss_mlp": 0.35913086, + "step": 2161, + "time_per_iteration": 2.6615841388702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152655, + "balance_loss_mlp": 1.11536634, + "epoch": 0.415929203539823, + "flos": 504956187648.0, + "grad_norm": 0.06713934338553489, + "language_loss": 0.82142949, + "learning_rate": 0.0006577049536090838, + "loss": 0.83295602, + "num_input_tokens_seen": 180023184, + "router_z_loss_mlp": 0.37280273, + "step": 2162, + "time_per_iteration": 2.7025601863861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114864, + "balance_loss_mlp": 1.11163712, + "epoch": 0.4161215852250866, + "flos": 582737794560.0, + "grad_norm": 0.06389110494138472, + "language_loss": 0.8567937, + "learning_rate": 0.000657409283218335, + "loss": 0.86828005, + "num_input_tokens_seen": 180091728, + "router_z_loss_mlp": 0.37011719, + "step": 2163, + "time_per_iteration": 2.6993329524993896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160123, + "balance_loss_mlp": 1.12352586, + "epoch": 0.4163139669103501, + "flos": 490697077248.0, + "grad_norm": 0.058431004936437055, + "language_loss": 0.81466627, + "learning_rate": 0.0006571135517158829, + "loss": 0.82626748, + "num_input_tokens_seen": 180162096, + "router_z_loss_mlp": 0.3659668, + "step": 2164, + "time_per_iteration": 2.6519243717193604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114432, + "balance_loss_mlp": 1.13316202, + "epoch": 0.4165063485956137, + "flos": 1288158474240.0, + "grad_norm": 0.04824937130362004, + "language_loss": 0.76764059, + "learning_rate": 0.0006568177592165404, + "loss": 0.77908379, + "num_input_tokens_seen": 180380912, + "router_z_loss_mlp": 0.11181641, + "step": 2165, + "time_per_iteration": 4.770123481750488 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155561, + "balance_loss_mlp": 1.11765289, + "epoch": 0.4166987302808773, + "flos": 495263268864.0, + "grad_norm": 0.07363984603082524, + "language_loss": 0.83210087, + "learning_rate": 0.0006565219058351444, + "loss": 0.84365654, + "num_input_tokens_seen": 180447424, + "router_z_loss_mlp": 0.37866211, + "step": 2166, + "time_per_iteration": 2.6601247787475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144571, + "balance_loss_mlp": 1.10470724, + "epoch": 0.4168911119661408, + "flos": 464071555584.0, + "grad_norm": 0.06568932383648114, + "language_loss": 0.83008349, + "learning_rate": 0.0006562259916865553, + "loss": 0.84152913, + "num_input_tokens_seen": 180516336, + "router_z_loss_mlp": 0.39868164, + "step": 2167, + "time_per_iteration": 2.5785412788391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137223, + "balance_loss_mlp": 1.0999341, + "epoch": 0.4170834936514044, + "flos": 536787730944.0, + "grad_norm": 0.06458514122378838, + "language_loss": 0.79966152, + "learning_rate": 0.0006559300168856573, + "loss": 0.81103373, + "num_input_tokens_seen": 180589824, + "router_z_loss_mlp": 0.37255859, + "step": 2168, + "time_per_iteration": 2.7237303256988525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140045, + "balance_loss_mlp": 1.10316169, + "epoch": 0.41727587533666793, + "flos": 550683795456.0, + "grad_norm": 0.050633821406227124, + "language_loss": 0.86603534, + "learning_rate": 0.0006556339815473577, + "loss": 0.8774358, + "num_input_tokens_seen": 180661296, + "router_z_loss_mlp": 0.36889648, + "step": 2169, + "time_per_iteration": 2.6403653621673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140714, + "balance_loss_mlp": 1.10254359, + "epoch": 0.4174682570219315, + "flos": 631111357440.0, + "grad_norm": 0.05999280354484277, + "language_loss": 0.86559451, + "learning_rate": 0.000655337885786588, + "loss": 0.87700164, + "num_input_tokens_seen": 180744896, + "router_z_loss_mlp": 0.3815918, + "step": 2170, + "time_per_iteration": 2.927175283432007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144977, + "balance_loss_mlp": 1.10737872, + "epoch": 0.41766063870719505, + "flos": 519751613952.0, + "grad_norm": 0.06541761902088469, + "language_loss": 0.85292417, + "learning_rate": 0.0006550417297183025, + "loss": 0.86437398, + "num_input_tokens_seen": 180813008, + "router_z_loss_mlp": 0.37597656, + "step": 2171, + "time_per_iteration": 2.617203950881958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139774, + "balance_loss_mlp": 1.10174668, + "epoch": 0.41785302039245864, + "flos": 557935589376.0, + "grad_norm": 0.06470887192105082, + "language_loss": 0.81668884, + "learning_rate": 0.0006547455134574793, + "loss": 0.82808661, + "num_input_tokens_seen": 180886480, + "router_z_loss_mlp": 0.37988281, + "step": 2172, + "time_per_iteration": 2.6800732612609863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145151, + "balance_loss_mlp": 1.10817289, + "epoch": 0.41804540207772223, + "flos": 788529821184.0, + "grad_norm": 0.06060457888036509, + "language_loss": 0.84434199, + "learning_rate": 0.0006544492371191198, + "loss": 0.85579354, + "num_input_tokens_seen": 180973776, + "router_z_loss_mlp": 0.36962891, + "step": 2173, + "time_per_iteration": 3.134876251220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140851, + "balance_loss_mlp": 1.10113096, + "epoch": 0.41823778376298576, + "flos": 904332418560.0, + "grad_norm": 0.09700819760133231, + "language_loss": 0.83721489, + "learning_rate": 0.0006541529008182485, + "loss": 0.84862345, + "num_input_tokens_seen": 181062768, + "router_z_loss_mlp": 0.39697266, + "step": 2174, + "time_per_iteration": 3.1724131107330322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113477, + "balance_loss_mlp": 1.09893537, + "epoch": 0.41843016544824935, + "flos": 511560811008.0, + "grad_norm": 0.060160949925642034, + "language_loss": 0.87700981, + "learning_rate": 0.0006538565046699136, + "loss": 0.88835752, + "num_input_tokens_seen": 181129872, + "router_z_loss_mlp": 0.3581543, + "step": 2175, + "time_per_iteration": 2.5730292797088623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133428, + "balance_loss_mlp": 1.09683084, + "epoch": 0.4186225471335129, + "flos": 653077085184.0, + "grad_norm": 0.06692113802371265, + "language_loss": 0.81824857, + "learning_rate": 0.0006535600487891862, + "loss": 0.82958287, + "num_input_tokens_seen": 181208112, + "router_z_loss_mlp": 0.3659668, + "step": 2176, + "time_per_iteration": 2.7692394256591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121709, + "balance_loss_mlp": 1.08651876, + "epoch": 0.41881492881877647, + "flos": 569158161408.0, + "grad_norm": 0.07459509047969586, + "language_loss": 0.89445305, + "learning_rate": 0.0006532635332911603, + "loss": 0.90567011, + "num_input_tokens_seen": 181278736, + "router_z_loss_mlp": 0.3515625, + "step": 2177, + "time_per_iteration": 2.668281078338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122309, + "balance_loss_mlp": 1.08449602, + "epoch": 0.41900731050404, + "flos": 911878248960.0, + "grad_norm": 0.054056674099833946, + "language_loss": 0.80669487, + "learning_rate": 0.0006529669582909541, + "loss": 0.81791794, + "num_input_tokens_seen": 181362512, + "router_z_loss_mlp": 0.37744141, + "step": 2178, + "time_per_iteration": 3.234210729598999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132134, + "balance_loss_mlp": 1.0946312, + "epoch": 0.4191996921893036, + "flos": 535755119616.0, + "grad_norm": 0.13706718234639897, + "language_loss": 0.85650241, + "learning_rate": 0.0006526703239037077, + "loss": 0.86782372, + "num_input_tokens_seen": 181432080, + "router_z_loss_mlp": 0.37475586, + "step": 2179, + "time_per_iteration": 2.6495871543884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129835, + "balance_loss_mlp": 1.09094954, + "epoch": 0.4193920738745671, + "flos": 582636478464.0, + "grad_norm": 0.09871097727336539, + "language_loss": 0.86649984, + "learning_rate": 0.0006523736302445851, + "loss": 0.8777982, + "num_input_tokens_seen": 181507296, + "router_z_loss_mlp": 0.38891602, + "step": 2180, + "time_per_iteration": 2.7558817863464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133221, + "balance_loss_mlp": 1.09390545, + "epoch": 0.4195844555598307, + "flos": 1335782472192.0, + "grad_norm": 0.05706426412838818, + "language_loss": 0.77595234, + "learning_rate": 0.0006520768774287728, + "loss": 0.78728461, + "num_input_tokens_seen": 181599408, + "router_z_loss_mlp": 0.39306641, + "step": 2181, + "time_per_iteration": 3.7205944061279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143371, + "balance_loss_mlp": 1.10436535, + "epoch": 0.4197768372450943, + "flos": 598783145472.0, + "grad_norm": 0.06053658357019196, + "language_loss": 0.85689628, + "learning_rate": 0.0006517800655714806, + "loss": 0.86832994, + "num_input_tokens_seen": 181674944, + "router_z_loss_mlp": 0.39013672, + "step": 2182, + "time_per_iteration": 2.8325769901275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140717, + "balance_loss_mlp": 1.10218823, + "epoch": 0.4199692189303578, + "flos": 735261525504.0, + "grad_norm": 0.07751994631636654, + "language_loss": 0.85342467, + "learning_rate": 0.0006514831947879407, + "loss": 0.86483186, + "num_input_tokens_seen": 181756704, + "router_z_loss_mlp": 0.38500977, + "step": 2183, + "time_per_iteration": 2.930466890335083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154155, + "balance_loss_mlp": 1.11531675, + "epoch": 0.4201616006156214, + "flos": 750214794240.0, + "grad_norm": 0.061313063449444025, + "language_loss": 0.78360265, + "learning_rate": 0.0006511862651934091, + "loss": 0.7951442, + "num_input_tokens_seen": 181837952, + "router_z_loss_mlp": 0.38842773, + "step": 2184, + "time_per_iteration": 3.0874462127685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168235, + "balance_loss_mlp": 1.1299212, + "epoch": 0.42035398230088494, + "flos": 547029448704.0, + "grad_norm": 0.07362784353092817, + "language_loss": 0.820894, + "learning_rate": 0.0006508892769031638, + "loss": 0.83257627, + "num_input_tokens_seen": 181906896, + "router_z_loss_mlp": 0.3828125, + "step": 2185, + "time_per_iteration": 2.6239352226257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152043, + "balance_loss_mlp": 1.11551726, + "epoch": 0.42054636398614853, + "flos": 616911717888.0, + "grad_norm": 0.06908564705964859, + "language_loss": 0.87278891, + "learning_rate": 0.000650592230032506, + "loss": 0.88430935, + "num_input_tokens_seen": 181974976, + "router_z_loss_mlp": 0.36523438, + "step": 2186, + "time_per_iteration": 2.7282140254974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149658, + "balance_loss_mlp": 1.11079597, + "epoch": 0.42073874567141206, + "flos": 640394242560.0, + "grad_norm": 0.0823679101553184, + "language_loss": 0.85327846, + "learning_rate": 0.0006502951246967595, + "loss": 0.86477506, + "num_input_tokens_seen": 182054704, + "router_z_loss_mlp": 0.38891602, + "step": 2187, + "time_per_iteration": 2.8729426860809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154068, + "balance_loss_mlp": 1.1164453, + "epoch": 0.42093112735667565, + "flos": 493783174656.0, + "grad_norm": 0.05336445965116177, + "language_loss": 0.86749196, + "learning_rate": 0.0006499979610112706, + "loss": 0.87903261, + "num_input_tokens_seen": 182129696, + "router_z_loss_mlp": 0.3762207, + "step": 2188, + "time_per_iteration": 2.7119579315185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151369, + "balance_loss_mlp": 1.1127454, + "epoch": 0.4211235090419392, + "flos": 542364512256.0, + "grad_norm": 0.055701229884667774, + "language_loss": 0.84561181, + "learning_rate": 0.000649700739091409, + "loss": 0.85712552, + "num_input_tokens_seen": 182203792, + "router_z_loss_mlp": 0.38623047, + "step": 2189, + "time_per_iteration": 2.7023189067840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108128, + "balance_loss_mlp": 1.07126629, + "epoch": 0.42131589072720277, + "flos": 1532149530624.0, + "grad_norm": 0.037864476589066096, + "language_loss": 0.73836273, + "learning_rate": 0.0006494034590525657, + "loss": 0.74917555, + "num_input_tokens_seen": 182432080, + "router_z_loss_mlp": 0.10009766, + "step": 2190, + "time_per_iteration": 4.808679103851318 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142515, + "balance_loss_mlp": 1.10751486, + "epoch": 0.42150827241246636, + "flos": 566852857344.0, + "grad_norm": 0.07155258064415941, + "language_loss": 0.85762346, + "learning_rate": 0.0006491061210101557, + "loss": 0.8690486, + "num_input_tokens_seen": 182500256, + "router_z_loss_mlp": 0.35009766, + "step": 2191, + "time_per_iteration": 2.7315032482147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146027, + "balance_loss_mlp": 1.10880995, + "epoch": 0.4217006540977299, + "flos": 707561174016.0, + "grad_norm": 0.05430057095490736, + "language_loss": 0.84269011, + "learning_rate": 0.0006488087250796157, + "loss": 0.85415035, + "num_input_tokens_seen": 182582912, + "router_z_loss_mlp": 0.37231445, + "step": 2192, + "time_per_iteration": 2.91867995262146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140476, + "balance_loss_mlp": 1.10154223, + "epoch": 0.4218930357829935, + "flos": 627291454464.0, + "grad_norm": 0.05336306174245454, + "language_loss": 0.81998622, + "learning_rate": 0.0006485112713764049, + "loss": 0.83139098, + "num_input_tokens_seen": 182670304, + "router_z_loss_mlp": 0.38916016, + "step": 2193, + "time_per_iteration": 2.954740047454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139123, + "balance_loss_mlp": 1.10178626, + "epoch": 0.422085417468257, + "flos": 460345628160.0, + "grad_norm": 0.05416843930927548, + "language_loss": 0.83712393, + "learning_rate": 0.0006482137600160051, + "loss": 0.84851515, + "num_input_tokens_seen": 182735024, + "router_z_loss_mlp": 0.3737793, + "step": 2194, + "time_per_iteration": 2.4989676475524902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144403, + "balance_loss_mlp": 1.10573113, + "epoch": 0.4222777991535206, + "flos": 474026577408.0, + "grad_norm": 0.05184002865736912, + "language_loss": 0.8501671, + "learning_rate": 0.0006479161911139206, + "loss": 0.86161113, + "num_input_tokens_seen": 182805024, + "router_z_loss_mlp": 0.38671875, + "step": 2195, + "time_per_iteration": 2.5739963054656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135341, + "balance_loss_mlp": 1.09721804, + "epoch": 0.4224701808387841, + "flos": 470886151680.0, + "grad_norm": 0.08063840338659255, + "language_loss": 0.85617948, + "learning_rate": 0.0006476185647856778, + "loss": 0.86753291, + "num_input_tokens_seen": 182871360, + "router_z_loss_mlp": 0.38134766, + "step": 2196, + "time_per_iteration": 2.578218936920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124656, + "balance_loss_mlp": 1.08808231, + "epoch": 0.4226625625240477, + "flos": 677525783040.0, + "grad_norm": 0.05804099842364966, + "language_loss": 0.82180464, + "learning_rate": 0.0006473208811468255, + "loss": 0.8330512, + "num_input_tokens_seen": 182952912, + "router_z_loss_mlp": 0.36547852, + "step": 2197, + "time_per_iteration": 2.8833000659942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123675, + "balance_loss_mlp": 1.08707809, + "epoch": 0.4228549442093113, + "flos": 503525652480.0, + "grad_norm": 0.058050592535879256, + "language_loss": 0.84475237, + "learning_rate": 0.0006470231403129347, + "loss": 0.8559891, + "num_input_tokens_seen": 183022016, + "router_z_loss_mlp": 0.36621094, + "step": 2198, + "time_per_iteration": 2.590959072113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124319, + "balance_loss_mlp": 1.08781683, + "epoch": 0.42304732589457483, + "flos": 611848857600.0, + "grad_norm": 0.05086119187590394, + "language_loss": 0.82252729, + "learning_rate": 0.0006467253423995988, + "loss": 0.83377045, + "num_input_tokens_seen": 183101776, + "router_z_loss_mlp": 0.36499023, + "step": 2199, + "time_per_iteration": 2.8386192321777344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128105, + "balance_loss_mlp": 1.0917697, + "epoch": 0.4232397075798384, + "flos": 515570863104.0, + "grad_norm": 0.06679650853448169, + "language_loss": 0.79627949, + "learning_rate": 0.000646427487522433, + "loss": 0.8075605, + "num_input_tokens_seen": 183171392, + "router_z_loss_mlp": 0.36352539, + "step": 2200, + "time_per_iteration": 2.635103464126587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120926, + "balance_loss_mlp": 1.08423305, + "epoch": 0.42343208926510195, + "flos": 589796868096.0, + "grad_norm": 0.0831390189187338, + "language_loss": 0.83172977, + "learning_rate": 0.0006461295757970749, + "loss": 0.84293896, + "num_input_tokens_seen": 183253936, + "router_z_loss_mlp": 0.36669922, + "step": 2201, + "time_per_iteration": 2.819474697113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118378, + "balance_loss_mlp": 1.07891917, + "epoch": 0.42362447095036554, + "flos": 640636521984.0, + "grad_norm": 0.062417347947693186, + "language_loss": 0.81792694, + "learning_rate": 0.0006458316073391839, + "loss": 0.82911074, + "num_input_tokens_seen": 183333744, + "router_z_loss_mlp": 0.39428711, + "step": 2202, + "time_per_iteration": 2.871166229248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122162, + "balance_loss_mlp": 1.0872103, + "epoch": 0.42381685263562907, + "flos": 512680057344.0, + "grad_norm": 0.05500893378921445, + "language_loss": 0.88072616, + "learning_rate": 0.0006455335822644422, + "loss": 0.89194781, + "num_input_tokens_seen": 183401904, + "router_z_loss_mlp": 0.34936523, + "step": 2203, + "time_per_iteration": 2.6111316680908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123624, + "balance_loss_mlp": 1.08683574, + "epoch": 0.42400923432089266, + "flos": 546782400000.0, + "grad_norm": 0.06843699867702463, + "language_loss": 0.78204858, + "learning_rate": 0.0006452355006885527, + "loss": 0.79328489, + "num_input_tokens_seen": 183471312, + "router_z_loss_mlp": 0.36791992, + "step": 2204, + "time_per_iteration": 2.6248953342437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119975, + "balance_loss_mlp": 1.08209014, + "epoch": 0.4242016160061562, + "flos": 622154815488.0, + "grad_norm": 0.07209183527246785, + "language_loss": 0.87310261, + "learning_rate": 0.0006449373627272412, + "loss": 0.88430238, + "num_input_tokens_seen": 183539184, + "router_z_loss_mlp": 0.37866211, + "step": 2205, + "time_per_iteration": 2.703838348388672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119116, + "balance_loss_mlp": 1.08197045, + "epoch": 0.4243939976914198, + "flos": 571913146368.0, + "grad_norm": 0.07114514004539872, + "language_loss": 0.82698691, + "learning_rate": 0.0006446391684962553, + "loss": 0.8381781, + "num_input_tokens_seen": 183607504, + "router_z_loss_mlp": 0.37158203, + "step": 2206, + "time_per_iteration": 2.6619176864624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115514, + "balance_loss_mlp": 1.08022797, + "epoch": 0.42458637937668336, + "flos": 448740186624.0, + "grad_norm": 0.05684297237550015, + "language_loss": 0.83361518, + "learning_rate": 0.000644340918111364, + "loss": 0.84477031, + "num_input_tokens_seen": 183674720, + "router_z_loss_mlp": 0.3527832, + "step": 2207, + "time_per_iteration": 2.5367140769958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126169, + "balance_loss_mlp": 1.09016824, + "epoch": 0.4247787610619469, + "flos": 435407602176.0, + "grad_norm": 0.07504639835111325, + "language_loss": 0.8513602, + "learning_rate": 0.0006440426116883585, + "loss": 0.8626219, + "num_input_tokens_seen": 183740448, + "router_z_loss_mlp": 0.36010742, + "step": 2208, + "time_per_iteration": 2.5879015922546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118301, + "balance_loss_mlp": 1.08129835, + "epoch": 0.4249711427472105, + "flos": 496078566912.0, + "grad_norm": 0.06421639244231503, + "language_loss": 0.86279738, + "learning_rate": 0.0006437442493430519, + "loss": 0.8739804, + "num_input_tokens_seen": 183812640, + "router_z_loss_mlp": 0.37011719, + "step": 2209, + "time_per_iteration": 2.6396701335906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114919, + "balance_loss_mlp": 1.07741535, + "epoch": 0.425163524432474, + "flos": 655819587072.0, + "grad_norm": 0.06478280605491378, + "language_loss": 0.87082028, + "learning_rate": 0.000643445831191278, + "loss": 0.88196945, + "num_input_tokens_seen": 183895312, + "router_z_loss_mlp": 0.37524414, + "step": 2210, + "time_per_iteration": 2.902726173400879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109225, + "balance_loss_mlp": 1.07265139, + "epoch": 0.4253559061177376, + "flos": 650608796160.0, + "grad_norm": 0.0604627940505335, + "language_loss": 0.81718135, + "learning_rate": 0.0006431473573488937, + "loss": 0.82827359, + "num_input_tokens_seen": 183966384, + "router_z_loss_mlp": 0.3659668, + "step": 2211, + "time_per_iteration": 2.756131887435913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101882, + "balance_loss_mlp": 1.06492758, + "epoch": 0.42554828780300114, + "flos": 554155333632.0, + "grad_norm": 0.0751061946408966, + "language_loss": 0.84961367, + "learning_rate": 0.0006428488279317765, + "loss": 0.86063254, + "num_input_tokens_seen": 184031728, + "router_z_loss_mlp": 0.36938477, + "step": 2212, + "time_per_iteration": 2.6532113552093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100031, + "balance_loss_mlp": 1.06541276, + "epoch": 0.4257406694882647, + "flos": 514407200256.0, + "grad_norm": 0.06889274289933833, + "language_loss": 0.87372804, + "learning_rate": 0.0006425502430558259, + "loss": 0.88472843, + "num_input_tokens_seen": 184096160, + "router_z_loss_mlp": 0.34619141, + "step": 2213, + "time_per_iteration": 2.6332669258117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105988, + "balance_loss_mlp": 1.06874728, + "epoch": 0.42593305117352825, + "flos": 515645015040.0, + "grad_norm": 0.08165118310272598, + "language_loss": 0.84992635, + "learning_rate": 0.0006422516028369628, + "loss": 0.86098623, + "num_input_tokens_seen": 184169664, + "router_z_loss_mlp": 0.37231445, + "step": 2214, + "time_per_iteration": 2.618557929992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098802, + "balance_loss_mlp": 1.06237185, + "epoch": 0.42612543285879184, + "flos": 588059813376.0, + "grad_norm": 0.05512742279801928, + "language_loss": 0.8369562, + "learning_rate": 0.0006419529073911296, + "loss": 0.84794426, + "num_input_tokens_seen": 184249152, + "router_z_loss_mlp": 0.36425781, + "step": 2215, + "time_per_iteration": 2.833543062210083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095611, + "balance_loss_mlp": 1.06166017, + "epoch": 0.42631781454405543, + "flos": 635472345600.0, + "grad_norm": 0.0818108199754697, + "language_loss": 0.85651129, + "learning_rate": 0.0006416541568342901, + "loss": 0.8674674, + "num_input_tokens_seen": 184326816, + "router_z_loss_mlp": 0.33935547, + "step": 2216, + "time_per_iteration": 2.8430728912353516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097148, + "balance_loss_mlp": 1.0622437, + "epoch": 0.42651019622931896, + "flos": 541161202176.0, + "grad_norm": 0.05864229124252446, + "language_loss": 0.84272695, + "learning_rate": 0.0006413553512824297, + "loss": 0.85369843, + "num_input_tokens_seen": 184404336, + "router_z_loss_mlp": 0.34912109, + "step": 2217, + "time_per_iteration": 2.7368276119232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095005, + "balance_loss_mlp": 1.05943322, + "epoch": 0.42670257791458255, + "flos": 558158045184.0, + "grad_norm": 0.06419705252846208, + "language_loss": 0.84589773, + "learning_rate": 0.0006410564908515549, + "loss": 0.85684776, + "num_input_tokens_seen": 184472320, + "router_z_loss_mlp": 0.35595703, + "step": 2218, + "time_per_iteration": 2.650841236114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096054, + "balance_loss_mlp": 1.06052935, + "epoch": 0.4268949595998461, + "flos": 621309782016.0, + "grad_norm": 0.06892642653628764, + "language_loss": 0.85406113, + "learning_rate": 0.0006407575756576935, + "loss": 0.86502165, + "num_input_tokens_seen": 184544704, + "router_z_loss_mlp": 0.35546875, + "step": 2219, + "time_per_iteration": 2.7199461460113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103421, + "balance_loss_mlp": 1.0681113, + "epoch": 0.42708734128510967, + "flos": 537919460352.0, + "grad_norm": 0.055123892223664483, + "language_loss": 0.88112384, + "learning_rate": 0.0006404586058168951, + "loss": 0.89215803, + "num_input_tokens_seen": 184622544, + "router_z_loss_mlp": 0.35327148, + "step": 2220, + "time_per_iteration": 2.7125062942504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106703, + "balance_loss_mlp": 1.07129836, + "epoch": 0.4272797229703732, + "flos": 502865998848.0, + "grad_norm": 0.06740071030395202, + "language_loss": 0.86848915, + "learning_rate": 0.0006401595814452296, + "loss": 0.87955624, + "num_input_tokens_seen": 184692544, + "router_z_loss_mlp": 0.35424805, + "step": 2221, + "time_per_iteration": 2.6037752628326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108589, + "balance_loss_mlp": 1.07349372, + "epoch": 0.4274721046556368, + "flos": 492453955584.0, + "grad_norm": 0.06763062065124635, + "language_loss": 0.81391692, + "learning_rate": 0.000639860502658789, + "loss": 0.82500279, + "num_input_tokens_seen": 184760480, + "router_z_loss_mlp": 0.35131836, + "step": 2222, + "time_per_iteration": 2.620530366897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110206, + "balance_loss_mlp": 1.07475281, + "epoch": 0.4276644863409004, + "flos": 568367456256.0, + "grad_norm": 0.07514934658842116, + "language_loss": 0.85168004, + "learning_rate": 0.0006395613695736853, + "loss": 0.86278212, + "num_input_tokens_seen": 184834080, + "router_z_loss_mlp": 0.35449219, + "step": 2223, + "time_per_iteration": 2.67494797706604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106777, + "balance_loss_mlp": 1.07015634, + "epoch": 0.4278568680261639, + "flos": 607436112384.0, + "grad_norm": 0.06258659729032073, + "language_loss": 0.81998539, + "learning_rate": 0.0006392621823060529, + "loss": 0.83105314, + "num_input_tokens_seen": 184905872, + "router_z_loss_mlp": 0.36621094, + "step": 2224, + "time_per_iteration": 2.729048490524292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107838, + "balance_loss_mlp": 1.07197976, + "epoch": 0.4280492497114275, + "flos": 560527589376.0, + "grad_norm": 0.07791132694448914, + "language_loss": 0.85259843, + "learning_rate": 0.0006389629409720465, + "loss": 0.86367679, + "num_input_tokens_seen": 184972320, + "router_z_loss_mlp": 0.35839844, + "step": 2225, + "time_per_iteration": 2.6461989879608154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102526, + "balance_loss_mlp": 1.06836081, + "epoch": 0.428241631396691, + "flos": 720646709760.0, + "grad_norm": 0.06694393428490365, + "language_loss": 0.88831687, + "learning_rate": 0.0006386636456878417, + "loss": 0.89934212, + "num_input_tokens_seen": 185051040, + "router_z_loss_mlp": 0.34155273, + "step": 2226, + "time_per_iteration": 2.8701326847076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106393, + "balance_loss_mlp": 1.07091641, + "epoch": 0.4284340130819546, + "flos": 429467774976.0, + "grad_norm": 0.07990341915486338, + "language_loss": 0.92087269, + "learning_rate": 0.0006383642965696353, + "loss": 0.93193656, + "num_input_tokens_seen": 185113552, + "router_z_loss_mlp": 0.35473633, + "step": 2227, + "time_per_iteration": 2.4640464782714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102589, + "balance_loss_mlp": 1.06544292, + "epoch": 0.42862639476721814, + "flos": 525016733184.0, + "grad_norm": 0.053395147694407376, + "language_loss": 0.82962096, + "learning_rate": 0.000638064893733645, + "loss": 0.84064686, + "num_input_tokens_seen": 185185056, + "router_z_loss_mlp": 0.37158203, + "step": 2228, + "time_per_iteration": 2.783597946166992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117505, + "balance_loss_mlp": 1.08198094, + "epoch": 0.42881877645248173, + "flos": 465346446336.0, + "grad_norm": 0.07356604001224937, + "language_loss": 0.89838171, + "learning_rate": 0.000637765437296109, + "loss": 0.90955675, + "num_input_tokens_seen": 185257248, + "router_z_loss_mlp": 0.35522461, + "step": 2229, + "time_per_iteration": 2.6639621257781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112327, + "balance_loss_mlp": 1.07644475, + "epoch": 0.42901115813774526, + "flos": 560297793024.0, + "grad_norm": 0.05563237387214821, + "language_loss": 0.85128897, + "learning_rate": 0.000637465927373287, + "loss": 0.86241227, + "num_input_tokens_seen": 185324800, + "router_z_loss_mlp": 0.35913086, + "step": 2230, + "time_per_iteration": 2.6883277893066406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107316, + "balance_loss_mlp": 1.07253075, + "epoch": 0.42920353982300885, + "flos": 561454115328.0, + "grad_norm": 0.06522010118943229, + "language_loss": 0.78980476, + "learning_rate": 0.000637166364081459, + "loss": 0.80087787, + "num_input_tokens_seen": 185393408, + "router_z_loss_mlp": 0.34790039, + "step": 2231, + "time_per_iteration": 2.711379051208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111807, + "balance_loss_mlp": 1.07814288, + "epoch": 0.42939592150827244, + "flos": 556248093696.0, + "grad_norm": 0.06512604260411947, + "language_loss": 0.84616333, + "learning_rate": 0.0006368667475369256, + "loss": 0.85728139, + "num_input_tokens_seen": 185467968, + "router_z_loss_mlp": 0.33666992, + "step": 2232, + "time_per_iteration": 2.7521519660949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083825, + "balance_loss_mlp": 1.07271492, + "epoch": 0.42958830319353597, + "flos": 1521623688192.0, + "grad_norm": 0.03367734377341464, + "language_loss": 0.78527778, + "learning_rate": 0.0006365670778560084, + "loss": 0.79611605, + "num_input_tokens_seen": 185705232, + "router_z_loss_mlp": 0.11132812, + "step": 2233, + "time_per_iteration": 4.941352605819702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106502, + "balance_loss_mlp": 1.05414832, + "epoch": 0.42978068487879956, + "flos": 1495813837824.0, + "grad_norm": 0.027928692850204096, + "language_loss": 0.78895426, + "learning_rate": 0.0006362673551550494, + "loss": 0.79960448, + "num_input_tokens_seen": 185932672, + "router_z_loss_mlp": 0.10888672, + "step": 2234, + "time_per_iteration": 4.825460910797119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117593, + "balance_loss_mlp": 1.08302259, + "epoch": 0.4299730665640631, + "flos": 546992372736.0, + "grad_norm": 0.05150642259295507, + "language_loss": 0.86345804, + "learning_rate": 0.0006359675795504112, + "loss": 0.87463403, + "num_input_tokens_seen": 186006288, + "router_z_loss_mlp": 0.34619141, + "step": 2235, + "time_per_iteration": 2.662977695465088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127167, + "balance_loss_mlp": 1.09099901, + "epoch": 0.4301654482493267, + "flos": 1129293342720.0, + "grad_norm": 0.07348370683515035, + "language_loss": 0.74711537, + "learning_rate": 0.0006356677511584775, + "loss": 0.75838703, + "num_input_tokens_seen": 186097168, + "router_z_loss_mlp": 0.36181641, + "step": 2236, + "time_per_iteration": 3.51220965385437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127234, + "balance_loss_mlp": 1.09337878, + "epoch": 0.4303578299345902, + "flos": 495750025728.0, + "grad_norm": 0.061045373266899905, + "language_loss": 0.86476523, + "learning_rate": 0.0006353678700956511, + "loss": 0.8760376, + "num_input_tokens_seen": 186163904, + "router_z_loss_mlp": 0.33886719, + "step": 2237, + "time_per_iteration": 2.60677170753479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125139, + "balance_loss_mlp": 1.09085464, + "epoch": 0.4305502116198538, + "flos": 615762736128.0, + "grad_norm": 0.06413862374233106, + "language_loss": 0.83745819, + "learning_rate": 0.0006350679364783569, + "loss": 0.84870958, + "num_input_tokens_seen": 186233888, + "router_z_loss_mlp": 0.34326172, + "step": 2238, + "time_per_iteration": 2.7771050930023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117438, + "balance_loss_mlp": 1.08212781, + "epoch": 0.4307425933051173, + "flos": 559260039168.0, + "grad_norm": 0.057478588831895126, + "language_loss": 0.85746336, + "learning_rate": 0.0006347679504230393, + "loss": 0.86863768, + "num_input_tokens_seen": 186301168, + "router_z_loss_mlp": 0.35351562, + "step": 2239, + "time_per_iteration": 2.6826984882354736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120648, + "balance_loss_mlp": 1.08405077, + "epoch": 0.4309349749903809, + "flos": 972166344192.0, + "grad_norm": 0.0566935574955873, + "language_loss": 0.76113296, + "learning_rate": 0.0006344679120461632, + "loss": 0.7723394, + "num_input_tokens_seen": 186392096, + "router_z_loss_mlp": 0.36621094, + "step": 2240, + "time_per_iteration": 3.3756330013275146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122418, + "balance_loss_mlp": 1.0843904, + "epoch": 0.4311273566756445, + "flos": 541924743168.0, + "grad_norm": 0.06383187448999712, + "language_loss": 0.80362582, + "learning_rate": 0.0006341678214642134, + "loss": 0.81484997, + "num_input_tokens_seen": 186458000, + "router_z_loss_mlp": 0.38037109, + "step": 2241, + "time_per_iteration": 2.6837639808654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121026, + "balance_loss_mlp": 1.08633661, + "epoch": 0.43131973836090803, + "flos": 761674503168.0, + "grad_norm": 0.06213603676301435, + "language_loss": 0.82894886, + "learning_rate": 0.0006338676787936963, + "loss": 0.84015912, + "num_input_tokens_seen": 186544992, + "router_z_loss_mlp": 0.34716797, + "step": 2242, + "time_per_iteration": 3.0835442543029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126247, + "balance_loss_mlp": 1.09019864, + "epoch": 0.4315121200461716, + "flos": 554530862592.0, + "grad_norm": 0.06026893794725229, + "language_loss": 0.83885002, + "learning_rate": 0.0006335674841511367, + "loss": 0.85011244, + "num_input_tokens_seen": 186614960, + "router_z_loss_mlp": 0.36035156, + "step": 2243, + "time_per_iteration": 2.6649861335754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054007, + "balance_loss_mlp": 1.04466057, + "epoch": 0.43170450173143515, + "flos": 1485334609920.0, + "grad_norm": 0.029651379922801115, + "language_loss": 0.7918117, + "learning_rate": 0.000633267237653081, + "loss": 0.80235171, + "num_input_tokens_seen": 186854288, + "router_z_loss_mlp": 0.09326172, + "step": 2244, + "time_per_iteration": 5.015843868255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043427, + "balance_loss_mlp": 1.03412855, + "epoch": 0.43189688341669874, + "flos": 1473697234944.0, + "grad_norm": 0.025217175998849217, + "language_loss": 0.77365553, + "learning_rate": 0.0006329669394160953, + "loss": 0.7840898, + "num_input_tokens_seen": 187090272, + "router_z_loss_mlp": 0.09277344, + "step": 2245, + "time_per_iteration": 4.923234939575195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118877, + "balance_loss_mlp": 1.08282828, + "epoch": 0.43208926510196227, + "flos": 492938141184.0, + "grad_norm": 0.05723795681410829, + "language_loss": 0.83027297, + "learning_rate": 0.0006326665895567652, + "loss": 0.84146178, + "num_input_tokens_seen": 187157584, + "router_z_loss_mlp": 0.3605957, + "step": 2246, + "time_per_iteration": 2.6065175533294678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112241, + "balance_loss_mlp": 1.08652771, + "epoch": 0.43228164678722586, + "flos": 520235799552.0, + "grad_norm": 0.06570844887047847, + "language_loss": 0.87358153, + "learning_rate": 0.0006323661881916976, + "loss": 0.88480568, + "num_input_tokens_seen": 187229408, + "router_z_loss_mlp": 0.35864258, + "step": 2247, + "time_per_iteration": 2.682924509048462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124487, + "balance_loss_mlp": 1.08996427, + "epoch": 0.4324740284724894, + "flos": 796056201216.0, + "grad_norm": 0.05864327339271887, + "language_loss": 0.8139447, + "learning_rate": 0.0006320657354375179, + "loss": 0.82518953, + "num_input_tokens_seen": 187304384, + "router_z_loss_mlp": 0.34521484, + "step": 2248, + "time_per_iteration": 2.9315433502197266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112819, + "balance_loss_mlp": 1.09125865, + "epoch": 0.432666410157753, + "flos": 482153140224.0, + "grad_norm": 0.05550733837968219, + "language_loss": 0.87244421, + "learning_rate": 0.0006317652314108726, + "loss": 0.88372612, + "num_input_tokens_seen": 187368064, + "router_z_loss_mlp": 0.36938477, + "step": 2249, + "time_per_iteration": 2.5357820987701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125911, + "balance_loss_mlp": 1.09186506, + "epoch": 0.43285879184301657, + "flos": 500212329984.0, + "grad_norm": 0.06944226399680122, + "language_loss": 0.91745955, + "learning_rate": 0.0006314646762284277, + "loss": 0.92871869, + "num_input_tokens_seen": 187436320, + "router_z_loss_mlp": 0.34057617, + "step": 2250, + "time_per_iteration": 2.650629997253418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010844, + "balance_loss_mlp": 1.00116396, + "epoch": 0.4330511735282801, + "flos": 1510448103936.0, + "grad_norm": 0.012503035455709091, + "language_loss": 0.75425828, + "learning_rate": 0.0006311640700068691, + "loss": 0.76436675, + "num_input_tokens_seen": 187670912, + "router_z_loss_mlp": 0.09667969, + "step": 2251, + "time_per_iteration": 4.895758867263794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118011, + "balance_loss_mlp": 1.08341658, + "epoch": 0.4332435552135437, + "flos": 699582915072.0, + "grad_norm": 0.05843208138947643, + "language_loss": 0.77784407, + "learning_rate": 0.0006308634128629022, + "loss": 0.78902417, + "num_input_tokens_seen": 187746432, + "router_z_loss_mlp": 0.34570312, + "step": 2252, + "time_per_iteration": 2.916623592376709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112901, + "balance_loss_mlp": 1.09289002, + "epoch": 0.4334359368988072, + "flos": 592292321280.0, + "grad_norm": 0.0729174620046303, + "language_loss": 0.87908506, + "learning_rate": 0.0006305627049132531, + "loss": 0.89037514, + "num_input_tokens_seen": 187820032, + "router_z_loss_mlp": 0.36132812, + "step": 2253, + "time_per_iteration": 2.741239070892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121412, + "balance_loss_mlp": 1.08660293, + "epoch": 0.4336283185840708, + "flos": 842806508544.0, + "grad_norm": 0.05583951255628595, + "language_loss": 0.8599245, + "learning_rate": 0.0006302619462746662, + "loss": 0.87113857, + "num_input_tokens_seen": 187904400, + "router_z_loss_mlp": 0.34814453, + "step": 2254, + "time_per_iteration": 3.1628546714782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123282, + "balance_loss_mlp": 1.08966494, + "epoch": 0.43382070026933434, + "flos": 626258843136.0, + "grad_norm": 0.05704174545577272, + "language_loss": 0.90291667, + "learning_rate": 0.0006299611370639069, + "loss": 0.91414952, + "num_input_tokens_seen": 187973264, + "router_z_loss_mlp": 0.33618164, + "step": 2255, + "time_per_iteration": 2.7106690406799316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125975, + "balance_loss_mlp": 1.09157157, + "epoch": 0.4340130819545979, + "flos": 591111406080.0, + "grad_norm": 0.06008787734976465, + "language_loss": 0.79589838, + "learning_rate": 0.0006296602773977593, + "loss": 0.80715805, + "num_input_tokens_seen": 188039984, + "router_z_loss_mlp": 0.34423828, + "step": 2256, + "time_per_iteration": 2.673064947128296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121887, + "balance_loss_mlp": 1.08652973, + "epoch": 0.4342054636398615, + "flos": 490889797632.0, + "grad_norm": 0.05906133720876415, + "language_loss": 0.87730187, + "learning_rate": 0.0006293593673930277, + "loss": 0.88852072, + "num_input_tokens_seen": 188113456, + "router_z_loss_mlp": 0.35400391, + "step": 2257, + "time_per_iteration": 2.6278131008148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115203, + "balance_loss_mlp": 1.08010745, + "epoch": 0.43439784532512504, + "flos": 698994842112.0, + "grad_norm": 0.07846710421999975, + "language_loss": 0.7888447, + "learning_rate": 0.0006290584071665358, + "loss": 0.79999673, + "num_input_tokens_seen": 188192480, + "router_z_loss_mlp": 0.35107422, + "step": 2258, + "time_per_iteration": 2.8708009719848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112425, + "balance_loss_mlp": 1.07709181, + "epoch": 0.43459022701038863, + "flos": 485824739328.0, + "grad_norm": 0.06520269446334741, + "language_loss": 0.82244253, + "learning_rate": 0.0006287573968351266, + "loss": 0.83356678, + "num_input_tokens_seen": 188258784, + "router_z_loss_mlp": 0.35351562, + "step": 2259, + "time_per_iteration": 2.5682222843170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113367, + "balance_loss_mlp": 1.07729471, + "epoch": 0.43478260869565216, + "flos": 643107382272.0, + "grad_norm": 0.07246583855612315, + "language_loss": 0.82777989, + "learning_rate": 0.0006284563365156626, + "loss": 0.83891356, + "num_input_tokens_seen": 188331312, + "router_z_loss_mlp": 0.3605957, + "step": 2260, + "time_per_iteration": 2.827087879180908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108747, + "balance_loss_mlp": 1.07148242, + "epoch": 0.43497499038091575, + "flos": 426097552896.0, + "grad_norm": 0.12125557864683041, + "language_loss": 0.87600839, + "learning_rate": 0.0006281552263250261, + "loss": 0.88709581, + "num_input_tokens_seen": 188393712, + "router_z_loss_mlp": 0.37255859, + "step": 2261, + "time_per_iteration": 2.479753017425537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01031263, + "balance_loss_mlp": 1.02072453, + "epoch": 0.4351673720661793, + "flos": 1538378625024.0, + "grad_norm": 0.029168664611412945, + "language_loss": 0.80691534, + "learning_rate": 0.000627854066380118, + "loss": 0.81722796, + "num_input_tokens_seen": 188621152, + "router_z_loss_mlp": 0.10546875, + "step": 2262, + "time_per_iteration": 4.812009334564209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106503, + "balance_loss_mlp": 1.07104969, + "epoch": 0.43535975375144287, + "flos": 749155018752.0, + "grad_norm": 0.06614620097740347, + "language_loss": 0.81361771, + "learning_rate": 0.0006275528567978593, + "loss": 0.82468277, + "num_input_tokens_seen": 188697120, + "router_z_loss_mlp": 0.35449219, + "step": 2263, + "time_per_iteration": 2.903029203414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115264, + "balance_loss_mlp": 1.07923913, + "epoch": 0.4355521354367064, + "flos": 861280874496.0, + "grad_norm": 0.07895665669601973, + "language_loss": 0.82951373, + "learning_rate": 0.0006272515976951898, + "loss": 0.84066635, + "num_input_tokens_seen": 188778480, + "router_z_loss_mlp": 0.3605957, + "step": 2264, + "time_per_iteration": 3.066096544265747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109411, + "balance_loss_mlp": 1.07300496, + "epoch": 0.43574451712197, + "flos": 734527719936.0, + "grad_norm": 0.06560373300441709, + "language_loss": 0.79299462, + "learning_rate": 0.0006269502891890687, + "loss": 0.80408877, + "num_input_tokens_seen": 188863616, + "router_z_loss_mlp": 0.36425781, + "step": 2265, + "time_per_iteration": 3.036302089691162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098467, + "balance_loss_mlp": 1.06504071, + "epoch": 0.4359368988072336, + "flos": 570578784768.0, + "grad_norm": 0.05296436812265497, + "language_loss": 0.88411891, + "learning_rate": 0.0006266489313964743, + "loss": 0.89510357, + "num_input_tokens_seen": 188933984, + "router_z_loss_mlp": 0.33447266, + "step": 2266, + "time_per_iteration": 2.766963481903076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105293, + "balance_loss_mlp": 1.06907725, + "epoch": 0.4361292804924971, + "flos": 555528969216.0, + "grad_norm": 0.057339134399699385, + "language_loss": 0.85443783, + "learning_rate": 0.0006263475244344041, + "loss": 0.86549073, + "num_input_tokens_seen": 189012976, + "router_z_loss_mlp": 0.36230469, + "step": 2267, + "time_per_iteration": 2.8397552967071533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104848, + "balance_loss_mlp": 1.0681076, + "epoch": 0.4363216621777607, + "flos": 557285847552.0, + "grad_norm": 0.06097162500725226, + "language_loss": 0.84725475, + "learning_rate": 0.0006260460684198746, + "loss": 0.85830331, + "num_input_tokens_seen": 189079664, + "router_z_loss_mlp": 0.36743164, + "step": 2268, + "time_per_iteration": 2.725037097930908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105446, + "balance_loss_mlp": 1.06901538, + "epoch": 0.4365140438630242, + "flos": 478222009344.0, + "grad_norm": 0.07238177879654556, + "language_loss": 0.84404624, + "learning_rate": 0.0006257445634699213, + "loss": 0.85510075, + "num_input_tokens_seen": 189144688, + "router_z_loss_mlp": 0.36425781, + "step": 2269, + "time_per_iteration": 2.623194456100464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105034, + "balance_loss_mlp": 1.06855631, + "epoch": 0.4367064255482878, + "flos": 578917891584.0, + "grad_norm": 0.060050482587473634, + "language_loss": 0.83212304, + "learning_rate": 0.0006254430097015993, + "loss": 0.84317344, + "num_input_tokens_seen": 189213984, + "router_z_loss_mlp": 0.36499023, + "step": 2270, + "time_per_iteration": 2.6570417881011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037679, + "balance_loss_mlp": 1.02752221, + "epoch": 0.43689880723355135, + "flos": 1458946225152.0, + "grad_norm": 0.021802814945167073, + "language_loss": 0.76479089, + "learning_rate": 0.0006251414072319815, + "loss": 0.7751677, + "num_input_tokens_seen": 189434416, + "router_z_loss_mlp": 0.1015625, + "step": 2271, + "time_per_iteration": 4.800662517547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109594, + "balance_loss_mlp": 1.07299662, + "epoch": 0.43709118891881493, + "flos": 667610408448.0, + "grad_norm": 0.08079345415457889, + "language_loss": 0.85730046, + "learning_rate": 0.0006248397561781609, + "loss": 0.8683964, + "num_input_tokens_seen": 189513248, + "router_z_loss_mlp": 0.3659668, + "step": 2272, + "time_per_iteration": 2.879779815673828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110506, + "balance_loss_mlp": 1.07312167, + "epoch": 0.43728357060407846, + "flos": 544872448512.0, + "grad_norm": 0.06456885574264018, + "language_loss": 0.86308181, + "learning_rate": 0.0006245380566572482, + "loss": 0.87418681, + "num_input_tokens_seen": 189585392, + "router_z_loss_mlp": 0.37402344, + "step": 2273, + "time_per_iteration": 2.671515703201294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108969, + "balance_loss_mlp": 1.07227635, + "epoch": 0.43747595228934205, + "flos": 746839802880.0, + "grad_norm": 0.07977356034675265, + "language_loss": 0.76295209, + "learning_rate": 0.0006242363087863744, + "loss": 0.77404177, + "num_input_tokens_seen": 189667552, + "router_z_loss_mlp": 0.36669922, + "step": 2274, + "time_per_iteration": 3.0036468505859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110424, + "balance_loss_mlp": 1.07430363, + "epoch": 0.43766833397460564, + "flos": 631353636864.0, + "grad_norm": 0.06387432282930158, + "language_loss": 0.86488557, + "learning_rate": 0.0006239345126826878, + "loss": 0.87598979, + "num_input_tokens_seen": 189742048, + "router_z_loss_mlp": 0.36132812, + "step": 2275, + "time_per_iteration": 2.8046963214874268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113113, + "balance_loss_mlp": 1.07551455, + "epoch": 0.43786071565986917, + "flos": 530986295808.0, + "grad_norm": 0.06304446482372832, + "language_loss": 0.84217036, + "learning_rate": 0.0006236326684633561, + "loss": 0.85330147, + "num_input_tokens_seen": 189817968, + "router_z_loss_mlp": 0.37597656, + "step": 2276, + "time_per_iteration": 2.8238136768341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113871, + "balance_loss_mlp": 1.07725024, + "epoch": 0.43805309734513276, + "flos": 538547180544.0, + "grad_norm": 0.07202298424456109, + "language_loss": 0.75335848, + "learning_rate": 0.0006233307762455658, + "loss": 0.76449716, + "num_input_tokens_seen": 189882608, + "router_z_loss_mlp": 0.36645508, + "step": 2277, + "time_per_iteration": 2.6191978454589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121381, + "balance_loss_mlp": 1.08576083, + "epoch": 0.4382454790303963, + "flos": 864542439936.0, + "grad_norm": 0.053405108271766075, + "language_loss": 0.8389169, + "learning_rate": 0.0006230288361465216, + "loss": 0.85013068, + "num_input_tokens_seen": 189960608, + "router_z_loss_mlp": 0.35644531, + "step": 2278, + "time_per_iteration": 3.0405595302581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113147, + "balance_loss_mlp": 1.09399056, + "epoch": 0.4384378607156599, + "flos": 765499548672.0, + "grad_norm": 0.06317085407877503, + "language_loss": 0.85187429, + "learning_rate": 0.0006227268482834473, + "loss": 0.86318898, + "num_input_tokens_seen": 190035472, + "router_z_loss_mlp": 0.37475586, + "step": 2279, + "time_per_iteration": 2.884791135787964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140517, + "balance_loss_mlp": 1.10272789, + "epoch": 0.4386302424009234, + "flos": 668566669824.0, + "grad_norm": 0.08374351035766264, + "language_loss": 0.87551039, + "learning_rate": 0.000622424812773585, + "loss": 0.88691556, + "num_input_tokens_seen": 190109312, + "router_z_loss_mlp": 0.37768555, + "step": 2280, + "time_per_iteration": 2.790846824645996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129266, + "balance_loss_mlp": 1.09150028, + "epoch": 0.438822624086187, + "flos": 485182338048.0, + "grad_norm": 0.07881944372222376, + "language_loss": 0.79747838, + "learning_rate": 0.000622122729734195, + "loss": 0.80877101, + "num_input_tokens_seen": 190174176, + "router_z_loss_mlp": 0.37744141, + "step": 2281, + "time_per_iteration": 2.5392401218414307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130236, + "balance_loss_mlp": 1.09404397, + "epoch": 0.4390150057714506, + "flos": 499218992640.0, + "grad_norm": 0.06512890224106707, + "language_loss": 0.87574816, + "learning_rate": 0.0006218205992825566, + "loss": 0.88705051, + "num_input_tokens_seen": 190243888, + "router_z_loss_mlp": 0.36206055, + "step": 2282, + "time_per_iteration": 2.6409003734588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130264, + "balance_loss_mlp": 1.09304714, + "epoch": 0.4392073874567141, + "flos": 558219714048.0, + "grad_norm": 0.058092029820517505, + "language_loss": 0.82094592, + "learning_rate": 0.0006215184215359671, + "loss": 0.83224851, + "num_input_tokens_seen": 190317504, + "router_z_loss_mlp": 0.37207031, + "step": 2283, + "time_per_iteration": 2.798405647277832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112171, + "balance_loss_mlp": 1.08506513, + "epoch": 0.4393997691419777, + "flos": 605306276352.0, + "grad_norm": 0.06799742884418125, + "language_loss": 0.86864793, + "learning_rate": 0.0006212161966117425, + "loss": 0.87986505, + "num_input_tokens_seen": 190390160, + "router_z_loss_mlp": 0.36669922, + "step": 2284, + "time_per_iteration": 2.7305543422698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120327, + "balance_loss_mlp": 1.0823704, + "epoch": 0.43959215082724123, + "flos": 804145688064.0, + "grad_norm": 0.0718064317498989, + "language_loss": 0.81899178, + "learning_rate": 0.0006209139246272164, + "loss": 0.83019507, + "num_input_tokens_seen": 190467600, + "router_z_loss_mlp": 0.37939453, + "step": 2285, + "time_per_iteration": 2.9496707916259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114672, + "balance_loss_mlp": 1.07569027, + "epoch": 0.4397845325125048, + "flos": 487643286528.0, + "grad_norm": 0.0666339573323591, + "language_loss": 0.81558084, + "learning_rate": 0.0006206116056997421, + "loss": 0.82672757, + "num_input_tokens_seen": 190534192, + "router_z_loss_mlp": 0.38964844, + "step": 2286, + "time_per_iteration": 2.56559681892395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112208, + "balance_loss_mlp": 1.08414793, + "epoch": 0.43997691419776835, + "flos": 480811438080.0, + "grad_norm": 0.07939984369379535, + "language_loss": 0.82495737, + "learning_rate": 0.0006203092399466892, + "loss": 0.83617818, + "num_input_tokens_seen": 190601440, + "router_z_loss_mlp": 0.37915039, + "step": 2287, + "time_per_iteration": 2.614211082458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119992, + "balance_loss_mlp": 1.08368051, + "epoch": 0.44016929588303194, + "flos": 483124082688.0, + "grad_norm": 0.05953237575059506, + "language_loss": 0.85318255, + "learning_rate": 0.0006200068274854473, + "loss": 0.86438239, + "num_input_tokens_seen": 190672528, + "router_z_loss_mlp": 0.36303711, + "step": 2288, + "time_per_iteration": 2.6718688011169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123012, + "balance_loss_mlp": 1.08679628, + "epoch": 0.4403616775682955, + "flos": 571853675520.0, + "grad_norm": 0.0828196201385275, + "language_loss": 0.86406159, + "learning_rate": 0.0006197043684334229, + "loss": 0.87529171, + "num_input_tokens_seen": 190750704, + "router_z_loss_mlp": 0.36230469, + "step": 2289, + "time_per_iteration": 2.7540907859802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128468, + "balance_loss_mlp": 1.09158421, + "epoch": 0.44055405925355906, + "flos": 630849627648.0, + "grad_norm": 0.11266642339430595, + "language_loss": 0.79650962, + "learning_rate": 0.0006194018629080411, + "loss": 0.80779433, + "num_input_tokens_seen": 190821664, + "router_z_loss_mlp": 0.36865234, + "step": 2290, + "time_per_iteration": 2.7200653553009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127999, + "balance_loss_mlp": 1.09099627, + "epoch": 0.44074644093882265, + "flos": 536782961664.0, + "grad_norm": 0.0658560511601545, + "language_loss": 0.81793892, + "learning_rate": 0.0006190993110267451, + "loss": 0.82921886, + "num_input_tokens_seen": 190893888, + "router_z_loss_mlp": 0.36987305, + "step": 2291, + "time_per_iteration": 2.709512233734131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130933, + "balance_loss_mlp": 1.09311938, + "epoch": 0.4409388226240862, + "flos": 463229093376.0, + "grad_norm": 0.0787223425712205, + "language_loss": 0.84518313, + "learning_rate": 0.0006187967129069958, + "loss": 0.85649246, + "num_input_tokens_seen": 190956800, + "router_z_loss_mlp": 0.37792969, + "step": 2292, + "time_per_iteration": 2.4924299716949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124784, + "balance_loss_mlp": 1.08935523, + "epoch": 0.44113120430934977, + "flos": 566005252608.0, + "grad_norm": 0.07162475848736369, + "language_loss": 0.87490463, + "learning_rate": 0.0006184940686662722, + "loss": 0.88615251, + "num_input_tokens_seen": 191032048, + "router_z_loss_mlp": 0.35449219, + "step": 2293, + "time_per_iteration": 2.751055955886841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119268, + "balance_loss_mlp": 1.08445859, + "epoch": 0.4413235859946133, + "flos": 543585074688.0, + "grad_norm": 0.06340812224100711, + "language_loss": 0.9041853, + "learning_rate": 0.0006181913784220714, + "loss": 0.91537791, + "num_input_tokens_seen": 191099952, + "router_z_loss_mlp": 0.34838867, + "step": 2294, + "time_per_iteration": 2.64821457862854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037423, + "balance_loss_mlp": 1.0290786, + "epoch": 0.4415159676798769, + "flos": 1569871342080.0, + "grad_norm": 0.025861242717412188, + "language_loss": 0.80553782, + "learning_rate": 0.0006178886422919078, + "loss": 0.81591213, + "num_input_tokens_seen": 191335968, + "router_z_loss_mlp": 0.08349609, + "step": 2295, + "time_per_iteration": 4.885660171508789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119684, + "balance_loss_mlp": 1.08537531, + "epoch": 0.4417083493651404, + "flos": 658740128256.0, + "grad_norm": 0.10155164806079009, + "language_loss": 0.80041152, + "learning_rate": 0.0006175858603933146, + "loss": 0.81160837, + "num_input_tokens_seen": 191410112, + "router_z_loss_mlp": 0.34326172, + "step": 2296, + "time_per_iteration": 2.881615400314331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129433, + "balance_loss_mlp": 1.09393275, + "epoch": 0.441900731050404, + "flos": 740457635328.0, + "grad_norm": 0.0685445546464461, + "language_loss": 0.81208229, + "learning_rate": 0.0006172830328438416, + "loss": 0.82337666, + "num_input_tokens_seen": 191491552, + "router_z_loss_mlp": 0.35498047, + "step": 2297, + "time_per_iteration": 2.940401315689087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123665, + "balance_loss_mlp": 1.08680558, + "epoch": 0.44209311273566754, + "flos": 539441399808.0, + "grad_norm": 0.09103818832157724, + "language_loss": 0.87286425, + "learning_rate": 0.0006169801597610572, + "loss": 0.88410091, + "num_input_tokens_seen": 191567872, + "router_z_loss_mlp": 0.36889648, + "step": 2298, + "time_per_iteration": 2.7739408016204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115616, + "balance_loss_mlp": 1.08195138, + "epoch": 0.4422854944209311, + "flos": 621613730304.0, + "grad_norm": 0.1052787532551667, + "language_loss": 0.9040001, + "learning_rate": 0.0006166772412625469, + "loss": 0.91515625, + "num_input_tokens_seen": 191638032, + "router_z_loss_mlp": 0.33666992, + "step": 2299, + "time_per_iteration": 2.734384298324585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112173, + "balance_loss_mlp": 1.07710147, + "epoch": 0.4424778761061947, + "flos": 658824192000.0, + "grad_norm": 0.07592361192988976, + "language_loss": 0.81779516, + "learning_rate": 0.0006163742774659141, + "loss": 0.82891691, + "num_input_tokens_seen": 191709104, + "router_z_loss_mlp": 0.35107422, + "step": 2300, + "time_per_iteration": 2.8436357975006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107602, + "balance_loss_mlp": 1.07410455, + "epoch": 0.44267025779145824, + "flos": 568577428992.0, + "grad_norm": 0.0790889900730028, + "language_loss": 0.86033177, + "learning_rate": 0.0006160712684887801, + "loss": 0.87140775, + "num_input_tokens_seen": 191787072, + "router_z_loss_mlp": 0.33496094, + "step": 2301, + "time_per_iteration": 2.816479206085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104306, + "balance_loss_mlp": 1.07118952, + "epoch": 0.44286263947672183, + "flos": 496738220544.0, + "grad_norm": 0.0554513610730849, + "language_loss": 0.82599401, + "learning_rate": 0.0006157682144487832, + "loss": 0.83703709, + "num_input_tokens_seen": 191863040, + "router_z_loss_mlp": 0.33129883, + "step": 2302, + "time_per_iteration": 2.7371127605438232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112573, + "balance_loss_mlp": 1.07776368, + "epoch": 0.44305502116198536, + "flos": 609397820928.0, + "grad_norm": 0.08617173815320239, + "language_loss": 0.83484352, + "learning_rate": 0.0006154651154635793, + "loss": 0.84596926, + "num_input_tokens_seen": 191940352, + "router_z_loss_mlp": 0.34838867, + "step": 2303, + "time_per_iteration": 2.822388172149658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122213, + "balance_loss_mlp": 1.08664048, + "epoch": 0.44324740284724895, + "flos": 470794747392.0, + "grad_norm": 0.06891313471916412, + "language_loss": 0.85087454, + "learning_rate": 0.0006151619716508421, + "loss": 0.86209667, + "num_input_tokens_seen": 192006896, + "router_z_loss_mlp": 0.35571289, + "step": 2304, + "time_per_iteration": 2.5669307708740234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113601, + "balance_loss_mlp": 1.07905424, + "epoch": 0.4434397845325125, + "flos": 578725171200.0, + "grad_norm": 0.0676174746334525, + "language_loss": 0.87354678, + "learning_rate": 0.0006148587831282625, + "loss": 0.88468277, + "num_input_tokens_seen": 192075312, + "router_z_loss_mlp": 0.34545898, + "step": 2305, + "time_per_iteration": 2.7296478748321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042108, + "balance_loss_mlp": 1.03257155, + "epoch": 0.44363216621777607, + "flos": 1496608939008.0, + "grad_norm": 0.03035679683037383, + "language_loss": 0.79176068, + "learning_rate": 0.0006145555500135483, + "loss": 0.80218178, + "num_input_tokens_seen": 192304816, + "router_z_loss_mlp": 0.09521484, + "step": 2306, + "time_per_iteration": 4.932115077972412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132739, + "balance_loss_mlp": 1.09490204, + "epoch": 0.44382454790303966, + "flos": 477322647552.0, + "grad_norm": 0.0708853860960667, + "language_loss": 0.87972111, + "learning_rate": 0.0006142522724244255, + "loss": 0.89104849, + "num_input_tokens_seen": 192369232, + "router_z_loss_mlp": 0.37817383, + "step": 2307, + "time_per_iteration": 2.5106770992279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027391, + "balance_loss_mlp": 1.01785433, + "epoch": 0.4440169295883032, + "flos": 1544115820032.0, + "grad_norm": 0.02287011405410123, + "language_loss": 0.76484716, + "learning_rate": 0.0006139489504786368, + "loss": 0.77512109, + "num_input_tokens_seen": 192600176, + "router_z_loss_mlp": 0.09521484, + "step": 2308, + "time_per_iteration": 4.842617034912109 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120011, + "balance_loss_mlp": 1.08405757, + "epoch": 0.4442093112735668, + "flos": 591089011200.0, + "grad_norm": 0.07624843376245131, + "language_loss": 0.77539825, + "learning_rate": 0.000613645584293942, + "loss": 0.78659838, + "num_input_tokens_seen": 192675424, + "router_z_loss_mlp": 0.35986328, + "step": 2309, + "time_per_iteration": 2.8661446571350098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125005, + "balance_loss_mlp": 1.08933806, + "epoch": 0.4444016929588303, + "flos": 530272313856.0, + "grad_norm": 0.0700550632478262, + "language_loss": 0.83505249, + "learning_rate": 0.0006133421739881185, + "loss": 0.84630251, + "num_input_tokens_seen": 192747552, + "router_z_loss_mlp": 0.35693359, + "step": 2310, + "time_per_iteration": 2.6644127368927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118668, + "balance_loss_mlp": 1.08319092, + "epoch": 0.4445940746440939, + "flos": 620234952192.0, + "grad_norm": 0.11928760190169391, + "language_loss": 0.83116257, + "learning_rate": 0.0006130387196789605, + "loss": 0.84234929, + "num_input_tokens_seen": 192819984, + "router_z_loss_mlp": 0.35522461, + "step": 2311, + "time_per_iteration": 2.7157018184661865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111828, + "balance_loss_mlp": 1.07699549, + "epoch": 0.4447864563293574, + "flos": 629100089856.0, + "grad_norm": 0.05741887786628051, + "language_loss": 0.84819949, + "learning_rate": 0.0006127352214842795, + "loss": 0.85931778, + "num_input_tokens_seen": 192906080, + "router_z_loss_mlp": 0.34838867, + "step": 2312, + "time_per_iteration": 2.9459052085876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118701, + "balance_loss_mlp": 1.08293796, + "epoch": 0.444978838014621, + "flos": 650838592512.0, + "grad_norm": 0.07350143541661519, + "language_loss": 0.85691726, + "learning_rate": 0.0006124316795219041, + "loss": 0.86810434, + "num_input_tokens_seen": 192972336, + "router_z_loss_mlp": 0.35742188, + "step": 2313, + "time_per_iteration": 2.772299289703369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131037, + "balance_loss_mlp": 1.0956552, + "epoch": 0.44517121969988455, + "flos": 612439501824.0, + "grad_norm": 0.06263706285199609, + "language_loss": 0.82505524, + "learning_rate": 0.0006121280939096794, + "loss": 0.83636558, + "num_input_tokens_seen": 193045744, + "router_z_loss_mlp": 0.35424805, + "step": 2314, + "time_per_iteration": 2.7951674461364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114668, + "balance_loss_mlp": 1.11020195, + "epoch": 0.44536360138514813, + "flos": 488735368704.0, + "grad_norm": 0.0720052818844606, + "language_loss": 0.88360798, + "learning_rate": 0.000611824464765468, + "loss": 0.89507478, + "num_input_tokens_seen": 193115248, + "router_z_loss_mlp": 0.36499023, + "step": 2315, + "time_per_iteration": 2.5895602703094482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067507, + "balance_loss_mlp": 1.05682635, + "epoch": 0.4455559830704117, + "flos": 1516148969472.0, + "grad_norm": 0.0344692196546668, + "language_loss": 0.78594941, + "learning_rate": 0.0006115207922071492, + "loss": 0.79662448, + "num_input_tokens_seen": 193330816, + "router_z_loss_mlp": 0.10693359, + "step": 2316, + "time_per_iteration": 4.6560447216033936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137929, + "balance_loss_mlp": 1.1022377, + "epoch": 0.44574836475567525, + "flos": 615614432256.0, + "grad_norm": 0.06826351361083724, + "language_loss": 0.85665047, + "learning_rate": 0.000611217076352619, + "loss": 0.86802971, + "num_input_tokens_seen": 193407616, + "router_z_loss_mlp": 0.35693359, + "step": 2317, + "time_per_iteration": 2.7965078353881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132041, + "balance_loss_mlp": 1.09835279, + "epoch": 0.44594074644093884, + "flos": 506342306304.0, + "grad_norm": 0.06652231411845559, + "language_loss": 0.83542907, + "learning_rate": 0.0006109133173197905, + "loss": 0.84674948, + "num_input_tokens_seen": 193482624, + "router_z_loss_mlp": 0.33691406, + "step": 2318, + "time_per_iteration": 2.678832769393921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124047, + "balance_loss_mlp": 1.08897519, + "epoch": 0.44613312812620237, + "flos": 726979318272.0, + "grad_norm": 0.06811942389724822, + "language_loss": 0.85992062, + "learning_rate": 0.0006106095152265935, + "loss": 0.8711611, + "num_input_tokens_seen": 193555952, + "router_z_loss_mlp": 0.35107422, + "step": 2319, + "time_per_iteration": 2.9018518924713135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111639, + "balance_loss_mlp": 1.08060324, + "epoch": 0.44632550981146596, + "flos": 635746558464.0, + "grad_norm": 0.06308491230142964, + "language_loss": 0.85126555, + "learning_rate": 0.0006103056701909739, + "loss": 0.8624295, + "num_input_tokens_seen": 193636672, + "router_z_loss_mlp": 0.3581543, + "step": 2320, + "time_per_iteration": 2.927619218826294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111434, + "balance_loss_mlp": 1.07869673, + "epoch": 0.4465178914967295, + "flos": 827074644480.0, + "grad_norm": 0.08034132862269446, + "language_loss": 0.83192152, + "learning_rate": 0.0006100017823308956, + "loss": 0.8430649, + "num_input_tokens_seen": 193721728, + "router_z_loss_mlp": 0.35644531, + "step": 2321, + "time_per_iteration": 3.1759355068206787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111575, + "balance_loss_mlp": 1.07645655, + "epoch": 0.4467102731819931, + "flos": 665831508480.0, + "grad_norm": 0.0688182521177716, + "language_loss": 0.79684091, + "learning_rate": 0.0006096978517643377, + "loss": 0.8079567, + "num_input_tokens_seen": 193795456, + "router_z_loss_mlp": 0.35131836, + "step": 2322, + "time_per_iteration": 2.791020154953003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109472, + "balance_loss_mlp": 1.07337499, + "epoch": 0.4469026548672566, + "flos": 512946929664.0, + "grad_norm": 0.08831218810897808, + "language_loss": 0.83671057, + "learning_rate": 0.0006093938786092968, + "loss": 0.84780538, + "num_input_tokens_seen": 193865520, + "router_z_loss_mlp": 0.36108398, + "step": 2323, + "time_per_iteration": 2.614248037338257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107948, + "balance_loss_mlp": 1.0734967, + "epoch": 0.4470950365525202, + "flos": 684076078080.0, + "grad_norm": 0.06554008035854059, + "language_loss": 0.90401232, + "learning_rate": 0.0006090898629837857, + "loss": 0.91509175, + "num_input_tokens_seen": 193935040, + "router_z_loss_mlp": 0.34448242, + "step": 2324, + "time_per_iteration": 2.7988476753234863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114598, + "balance_loss_mlp": 1.07950234, + "epoch": 0.4472874182377838, + "flos": 627321189888.0, + "grad_norm": 0.05596676685861875, + "language_loss": 0.87779921, + "learning_rate": 0.0006087858050058337, + "loss": 0.88894522, + "num_input_tokens_seen": 194009120, + "router_z_loss_mlp": 0.35083008, + "step": 2325, + "time_per_iteration": 2.8598742485046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106952, + "balance_loss_mlp": 1.07309675, + "epoch": 0.4474797999230473, + "flos": 547204916736.0, + "grad_norm": 0.08404177014968839, + "language_loss": 0.82489681, + "learning_rate": 0.0006084817047934866, + "loss": 0.83596623, + "num_input_tokens_seen": 194076672, + "router_z_loss_mlp": 0.33886719, + "step": 2326, + "time_per_iteration": 2.6458888053894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112181, + "balance_loss_mlp": 1.07780075, + "epoch": 0.4476721816083109, + "flos": 455819083776.0, + "grad_norm": 0.07155239810176077, + "language_loss": 0.89966661, + "learning_rate": 0.0006081775624648066, + "loss": 0.91078842, + "num_input_tokens_seen": 194142320, + "router_z_loss_mlp": 0.34399414, + "step": 2327, + "time_per_iteration": 2.580366373062134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120962, + "balance_loss_mlp": 1.08689189, + "epoch": 0.44786456329357444, + "flos": 481518079488.0, + "grad_norm": 0.06301539333332261, + "language_loss": 0.83119273, + "learning_rate": 0.0006078733781378721, + "loss": 0.8424024, + "num_input_tokens_seen": 194208560, + "router_z_loss_mlp": 0.34082031, + "step": 2328, + "time_per_iteration": 2.54127836227417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110302, + "balance_loss_mlp": 1.07594562, + "epoch": 0.448056944978838, + "flos": 552104418816.0, + "grad_norm": 0.057204005558127505, + "language_loss": 0.82213807, + "learning_rate": 0.0006075691519307781, + "loss": 0.83324105, + "num_input_tokens_seen": 194288080, + "router_z_loss_mlp": 0.34375, + "step": 2329, + "time_per_iteration": 2.8602964878082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117341, + "balance_loss_mlp": 1.08193612, + "epoch": 0.44824932666410156, + "flos": 550839439872.0, + "grad_norm": 0.055534005363494426, + "language_loss": 0.81606597, + "learning_rate": 0.0006072648839616356, + "loss": 0.82723939, + "num_input_tokens_seen": 194358464, + "router_z_loss_mlp": 0.35400391, + "step": 2330, + "time_per_iteration": 2.662810802459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119722, + "balance_loss_mlp": 1.08565211, + "epoch": 0.44844170834936514, + "flos": 988582454784.0, + "grad_norm": 0.050779766652796585, + "language_loss": 0.82901573, + "learning_rate": 0.0006069605743485718, + "loss": 0.84021294, + "num_input_tokens_seen": 194456112, + "router_z_loss_mlp": 0.34057617, + "step": 2331, + "time_per_iteration": 3.3678483963012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128598, + "balance_loss_mlp": 1.0950762, + "epoch": 0.44863409003462873, + "flos": 591321378816.0, + "grad_norm": 0.04918059080846435, + "language_loss": 0.83280981, + "learning_rate": 0.0006066562232097303, + "loss": 0.84409571, + "num_input_tokens_seen": 194526880, + "router_z_loss_mlp": 0.33544922, + "step": 2332, + "time_per_iteration": 2.7449440956115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123187, + "balance_loss_mlp": 1.08785367, + "epoch": 0.44882647171989226, + "flos": 724646850048.0, + "grad_norm": 0.052836841401222294, + "language_loss": 0.86161315, + "learning_rate": 0.0006063518306632708, + "loss": 0.87284505, + "num_input_tokens_seen": 194606800, + "router_z_loss_mlp": 0.35375977, + "step": 2333, + "time_per_iteration": 2.9690473079681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127444, + "balance_loss_mlp": 1.09220576, + "epoch": 0.44901885340515585, + "flos": 534927338496.0, + "grad_norm": 0.06707958703687776, + "language_loss": 0.82286978, + "learning_rate": 0.0006060473968273688, + "loss": 0.83414423, + "num_input_tokens_seen": 194679856, + "router_z_loss_mlp": 0.35229492, + "step": 2334, + "time_per_iteration": 2.665539026260376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142277, + "balance_loss_mlp": 1.13331211, + "epoch": 0.4492112350904194, + "flos": 1555300942848.0, + "grad_norm": 0.036352477885187424, + "language_loss": 0.77879542, + "learning_rate": 0.000605742921820216, + "loss": 0.79021817, + "num_input_tokens_seen": 194906320, + "router_z_loss_mlp": 0.08984375, + "step": 2335, + "time_per_iteration": 4.888899326324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115384, + "balance_loss_mlp": 1.10641909, + "epoch": 0.44940361677568297, + "flos": 1523358171648.0, + "grad_norm": 0.027581232823365703, + "language_loss": 0.81005216, + "learning_rate": 0.0006054384057600202, + "loss": 0.82120597, + "num_input_tokens_seen": 195129152, + "router_z_loss_mlp": 0.08984375, + "step": 2336, + "time_per_iteration": 4.835580348968506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126219, + "balance_loss_mlp": 1.09155297, + "epoch": 0.4495959984609465, + "flos": 382495011840.0, + "grad_norm": 0.06484007747623576, + "language_loss": 0.88115394, + "learning_rate": 0.0006051338487650047, + "loss": 0.89241612, + "num_input_tokens_seen": 195189792, + "router_z_loss_mlp": 0.34667969, + "step": 2337, + "time_per_iteration": 2.4327657222747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125299, + "balance_loss_mlp": 1.08846319, + "epoch": 0.4497883801462101, + "flos": 497879861760.0, + "grad_norm": 0.06762371666749806, + "language_loss": 0.82857472, + "learning_rate": 0.0006048292509534095, + "loss": 0.83982766, + "num_input_tokens_seen": 195258640, + "router_z_loss_mlp": 0.3684082, + "step": 2338, + "time_per_iteration": 2.583315372467041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122898, + "balance_loss_mlp": 1.08851767, + "epoch": 0.4499807618314736, + "flos": 614450769408.0, + "grad_norm": 0.06288042140328122, + "language_loss": 0.78114402, + "learning_rate": 0.0006045246124434895, + "loss": 0.792373, + "num_input_tokens_seen": 195327984, + "router_z_loss_mlp": 0.34350586, + "step": 2339, + "time_per_iteration": 2.718944787979126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111671, + "balance_loss_mlp": 1.08223438, + "epoch": 0.4501731435167372, + "flos": 1005510288384.0, + "grad_norm": 0.06455240115792976, + "language_loss": 0.86995041, + "learning_rate": 0.0006042199333535162, + "loss": 0.88111752, + "num_input_tokens_seen": 195409504, + "router_z_loss_mlp": 0.3449707, + "step": 2340, + "time_per_iteration": 3.280731439590454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120556, + "balance_loss_mlp": 1.08803582, + "epoch": 0.4503655252020008, + "flos": 820880428032.0, + "grad_norm": 0.06119421780994794, + "language_loss": 0.83960807, + "learning_rate": 0.0006039152138017763, + "loss": 0.85081363, + "num_input_tokens_seen": 195489424, + "router_z_loss_mlp": 0.32519531, + "step": 2341, + "time_per_iteration": 3.042808771133423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116997, + "balance_loss_mlp": 1.08285511, + "epoch": 0.4505579068872643, + "flos": 486373165056.0, + "grad_norm": 0.06181422787629511, + "language_loss": 0.83835328, + "learning_rate": 0.0006036104539065726, + "loss": 0.84952325, + "num_input_tokens_seen": 195562128, + "router_z_loss_mlp": 0.34155273, + "step": 2342, + "time_per_iteration": 2.671872138977051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117893, + "balance_loss_mlp": 1.08208227, + "epoch": 0.4507502885725279, + "flos": 884803046400.0, + "grad_norm": 0.05413998463628708, + "language_loss": 0.84596831, + "learning_rate": 0.000603305653786223, + "loss": 0.85714728, + "num_input_tokens_seen": 195646800, + "router_z_loss_mlp": 0.3581543, + "step": 2343, + "time_per_iteration": 3.153627395629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116847, + "balance_loss_mlp": 1.08182287, + "epoch": 0.45094267025779144, + "flos": 578339730432.0, + "grad_norm": 0.06019885466307642, + "language_loss": 0.84242773, + "learning_rate": 0.0006030008135590622, + "loss": 0.85359621, + "num_input_tokens_seen": 195719648, + "router_z_loss_mlp": 0.35058594, + "step": 2344, + "time_per_iteration": 2.724281072616577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109974, + "balance_loss_mlp": 1.07564187, + "epoch": 0.45113505194305503, + "flos": 525387492864.0, + "grad_norm": 0.06173385406680834, + "language_loss": 0.80783409, + "learning_rate": 0.0006026959333434387, + "loss": 0.81893378, + "num_input_tokens_seen": 195794800, + "router_z_loss_mlp": 0.34350586, + "step": 2345, + "time_per_iteration": 2.7752277851104736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107914, + "balance_loss_mlp": 1.07336736, + "epoch": 0.45132743362831856, + "flos": 502055470080.0, + "grad_norm": 0.04677974400708639, + "language_loss": 0.77811158, + "learning_rate": 0.0006023910132577181, + "loss": 0.78919077, + "num_input_tokens_seen": 195866848, + "router_z_loss_mlp": 0.34545898, + "step": 2346, + "time_per_iteration": 2.663447141647339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101638, + "balance_loss_mlp": 1.06802082, + "epoch": 0.45151981531358215, + "flos": 431918811648.0, + "grad_norm": 0.060558646022808645, + "language_loss": 0.85310882, + "learning_rate": 0.0006020860534202806, + "loss": 0.86412525, + "num_input_tokens_seen": 195930640, + "router_z_loss_mlp": 0.33618164, + "step": 2347, + "time_per_iteration": 2.480811595916748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108815, + "balance_loss_mlp": 1.07388651, + "epoch": 0.4517121969988457, + "flos": 712159299072.0, + "grad_norm": 0.06606096221098971, + "language_loss": 0.81316173, + "learning_rate": 0.0006017810539495224, + "loss": 0.82424992, + "num_input_tokens_seen": 196014240, + "router_z_loss_mlp": 0.34960938, + "step": 2348, + "time_per_iteration": 2.9476070404052734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098582, + "balance_loss_mlp": 1.06415427, + "epoch": 0.45190457868410927, + "flos": 579468888576.0, + "grad_norm": 0.0571113923067653, + "language_loss": 0.82774842, + "learning_rate": 0.0006014760149638547, + "loss": 0.83873427, + "num_input_tokens_seen": 196083296, + "router_z_loss_mlp": 0.34423828, + "step": 2349, + "time_per_iteration": 2.6655263900756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103718, + "balance_loss_mlp": 1.07005334, + "epoch": 0.45209696036937286, + "flos": 482657149440.0, + "grad_norm": 0.06475243948679671, + "language_loss": 0.88831103, + "learning_rate": 0.000601170936581704, + "loss": 0.89934826, + "num_input_tokens_seen": 196147840, + "router_z_loss_mlp": 0.33666992, + "step": 2350, + "time_per_iteration": 2.5269417762756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097788, + "balance_loss_mlp": 1.06343222, + "epoch": 0.4522893420546364, + "flos": 540207512064.0, + "grad_norm": 0.06432650174878703, + "language_loss": 0.84562814, + "learning_rate": 0.0006008658189215121, + "loss": 0.85660601, + "num_input_tokens_seen": 196219008, + "router_z_loss_mlp": 0.34399414, + "step": 2351, + "time_per_iteration": 2.621596097946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110179, + "balance_loss_mlp": 1.07267594, + "epoch": 0.4524817237399, + "flos": 496676551680.0, + "grad_norm": 0.3016755485520666, + "language_loss": 0.8046757, + "learning_rate": 0.0006005606621017366, + "loss": 0.81577748, + "num_input_tokens_seen": 196287792, + "router_z_loss_mlp": 0.375, + "step": 2352, + "time_per_iteration": 2.561138153076172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111286, + "balance_loss_mlp": 1.07564211, + "epoch": 0.4526741054251635, + "flos": 652550681088.0, + "grad_norm": 0.055264843638134026, + "language_loss": 0.80770934, + "learning_rate": 0.0006002554662408496, + "loss": 0.81882215, + "num_input_tokens_seen": 196371776, + "router_z_loss_mlp": 0.35644531, + "step": 2353, + "time_per_iteration": 2.87947940826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118454, + "balance_loss_mlp": 1.08180928, + "epoch": 0.4528664871104271, + "flos": 570939632640.0, + "grad_norm": 0.06003231312298175, + "language_loss": 0.91710508, + "learning_rate": 0.0005999502314573388, + "loss": 0.92828965, + "num_input_tokens_seen": 196441840, + "router_z_loss_mlp": 0.36645508, + "step": 2354, + "time_per_iteration": 2.703589916229248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127767, + "balance_loss_mlp": 1.09119391, + "epoch": 0.45305886879569063, + "flos": 458719801344.0, + "grad_norm": 0.06522748471040672, + "language_loss": 0.86741221, + "learning_rate": 0.0005996449578697066, + "loss": 0.87868989, + "num_input_tokens_seen": 196510464, + "router_z_loss_mlp": 0.36547852, + "step": 2355, + "time_per_iteration": 2.6407227516174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114254, + "balance_loss_mlp": 1.10627651, + "epoch": 0.4532512504809542, + "flos": 505178643456.0, + "grad_norm": 0.05645244306136207, + "language_loss": 0.81587362, + "learning_rate": 0.0005993396455964709, + "loss": 0.827299, + "num_input_tokens_seen": 196583888, + "router_z_loss_mlp": 0.36279297, + "step": 2356, + "time_per_iteration": 2.7260916233062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159041, + "balance_loss_mlp": 1.12263405, + "epoch": 0.4534436321662178, + "flos": 582213961728.0, + "grad_norm": 0.0574643396084849, + "language_loss": 0.81904489, + "learning_rate": 0.0005990342947561647, + "loss": 0.83063525, + "num_input_tokens_seen": 196652816, + "router_z_loss_mlp": 0.36401367, + "step": 2357, + "time_per_iteration": 2.763461112976074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158122, + "balance_loss_mlp": 1.12109542, + "epoch": 0.45363601385148133, + "flos": 549720193536.0, + "grad_norm": 0.06627350558163068, + "language_loss": 0.78124607, + "learning_rate": 0.0005987289054673351, + "loss": 0.79282725, + "num_input_tokens_seen": 196720208, + "router_z_loss_mlp": 0.37011719, + "step": 2358, + "time_per_iteration": 2.7317159175872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172658, + "balance_loss_mlp": 1.16121387, + "epoch": 0.4538283955367449, + "flos": 1474559520768.0, + "grad_norm": 0.05600708096364228, + "language_loss": 0.76575738, + "learning_rate": 0.0005984234778485451, + "loss": 0.77748394, + "num_input_tokens_seen": 196947696, + "router_z_loss_mlp": 0.11425781, + "step": 2359, + "time_per_iteration": 4.815205335617065 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168714, + "balance_loss_mlp": 1.13257003, + "epoch": 0.45402077722200845, + "flos": 584711986176.0, + "grad_norm": 0.0832511333205401, + "language_loss": 0.91429126, + "learning_rate": 0.0005981180120183722, + "loss": 0.92597842, + "num_input_tokens_seen": 197015712, + "router_z_loss_mlp": 0.36206055, + "step": 2360, + "time_per_iteration": 2.675994873046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154784, + "balance_loss_mlp": 1.11825836, + "epoch": 0.45421315890727204, + "flos": 531747265536.0, + "grad_norm": 0.06456101952662723, + "language_loss": 0.85450256, + "learning_rate": 0.0005978125080954089, + "loss": 0.86605042, + "num_input_tokens_seen": 197094880, + "router_z_loss_mlp": 0.36523438, + "step": 2361, + "time_per_iteration": 2.844592332839966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134938, + "balance_loss_mlp": 1.0997715, + "epoch": 0.4544055405925356, + "flos": 785221641216.0, + "grad_norm": 0.06943573222196867, + "language_loss": 0.77225572, + "learning_rate": 0.000597506966198262, + "loss": 0.7836051, + "num_input_tokens_seen": 197176448, + "router_z_loss_mlp": 0.35180664, + "step": 2362, + "time_per_iteration": 2.990652322769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127189, + "balance_loss_mlp": 1.09216547, + "epoch": 0.45459792227779916, + "flos": 518199939072.0, + "grad_norm": 0.07387250459530678, + "language_loss": 0.84014916, + "learning_rate": 0.0005972013864455536, + "loss": 0.85142106, + "num_input_tokens_seen": 197243520, + "router_z_loss_mlp": 0.3503418, + "step": 2363, + "time_per_iteration": 2.589594841003418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124933, + "balance_loss_mlp": 1.09141088, + "epoch": 0.4547903039630627, + "flos": 537563755008.0, + "grad_norm": 0.06451639193106218, + "language_loss": 0.85711533, + "learning_rate": 0.0005968957689559203, + "loss": 0.86836469, + "num_input_tokens_seen": 197311536, + "router_z_loss_mlp": 0.33544922, + "step": 2364, + "time_per_iteration": 2.6682167053222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119758, + "balance_loss_mlp": 1.08585453, + "epoch": 0.4549826856483263, + "flos": 528676222464.0, + "grad_norm": 0.06239355206550831, + "language_loss": 0.89508158, + "learning_rate": 0.0005965901138480131, + "loss": 0.90627909, + "num_input_tokens_seen": 197382752, + "router_z_loss_mlp": 0.33911133, + "step": 2365, + "time_per_iteration": 2.6365487575531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125783, + "balance_loss_mlp": 1.08816087, + "epoch": 0.45517506733358987, + "flos": 520915276800.0, + "grad_norm": 0.07086256306792152, + "language_loss": 0.87331104, + "learning_rate": 0.0005962844212404982, + "loss": 0.88456881, + "num_input_tokens_seen": 197456592, + "router_z_loss_mlp": 0.37597656, + "step": 2366, + "time_per_iteration": 2.6617612838745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123043, + "balance_loss_mlp": 1.08763838, + "epoch": 0.4553674490188534, + "flos": 451052831232.0, + "grad_norm": 0.05743086206543283, + "language_loss": 0.87604624, + "learning_rate": 0.0005959786912520558, + "loss": 0.88727665, + "num_input_tokens_seen": 197525408, + "router_z_loss_mlp": 0.35400391, + "step": 2367, + "time_per_iteration": 2.5842456817626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112429, + "balance_loss_mlp": 1.08878994, + "epoch": 0.455559830704117, + "flos": 546594448896.0, + "grad_norm": 0.05541530908978363, + "language_loss": 0.84261698, + "learning_rate": 0.0005956729240013806, + "loss": 0.8538599, + "num_input_tokens_seen": 197608480, + "router_z_loss_mlp": 0.35522461, + "step": 2368, + "time_per_iteration": 2.8338305950164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131752, + "balance_loss_mlp": 1.09880257, + "epoch": 0.4557522123893805, + "flos": 583765636608.0, + "grad_norm": 0.06117437276065272, + "language_loss": 0.91673207, + "learning_rate": 0.0005953671196071824, + "loss": 0.92804956, + "num_input_tokens_seen": 197678416, + "router_z_loss_mlp": 0.32958984, + "step": 2369, + "time_per_iteration": 2.6954920291900635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140492, + "balance_loss_mlp": 1.10089099, + "epoch": 0.4559445940746441, + "flos": 526415334912.0, + "grad_norm": 0.05874804832244865, + "language_loss": 0.80540514, + "learning_rate": 0.0005950612781881846, + "loss": 0.81681007, + "num_input_tokens_seen": 197753424, + "router_z_loss_mlp": 0.39575195, + "step": 2370, + "time_per_iteration": 2.695518732070923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133052, + "balance_loss_mlp": 1.09526241, + "epoch": 0.45613697575990764, + "flos": 652120823808.0, + "grad_norm": 0.054922750315337415, + "language_loss": 0.76194978, + "learning_rate": 0.0005947553998631259, + "loss": 0.77328038, + "num_input_tokens_seen": 197832080, + "router_z_loss_mlp": 0.37792969, + "step": 2371, + "time_per_iteration": 2.854757070541382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133988, + "balance_loss_mlp": 1.09777188, + "epoch": 0.4563293574451712, + "flos": 867119385600.0, + "grad_norm": 0.04850294692755014, + "language_loss": 0.79227567, + "learning_rate": 0.000594449484750758, + "loss": 0.80361551, + "num_input_tokens_seen": 197919536, + "router_z_loss_mlp": 0.36206055, + "step": 2372, + "time_per_iteration": 3.2277348041534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128775, + "balance_loss_mlp": 1.09263051, + "epoch": 0.45652173913043476, + "flos": 498079922688.0, + "grad_norm": 0.06286219474212958, + "language_loss": 0.83387208, + "learning_rate": 0.0005941435329698484, + "loss": 0.84515989, + "num_input_tokens_seen": 197991872, + "router_z_loss_mlp": 0.36132812, + "step": 2373, + "time_per_iteration": 2.676492929458618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126424, + "balance_loss_mlp": 1.09025562, + "epoch": 0.45671412081569834, + "flos": 560856130560.0, + "grad_norm": 0.05768590484176838, + "language_loss": 0.83615124, + "learning_rate": 0.0005938375446391778, + "loss": 0.84741557, + "num_input_tokens_seen": 198063392, + "router_z_loss_mlp": 0.36181641, + "step": 2374, + "time_per_iteration": 2.7465567588806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137671, + "balance_loss_mlp": 1.09969115, + "epoch": 0.45690650250096193, + "flos": 503122959360.0, + "grad_norm": 0.05745321957635053, + "language_loss": 0.89048398, + "learning_rate": 0.0005935315198775415, + "loss": 0.90186071, + "num_input_tokens_seen": 198131232, + "router_z_loss_mlp": 0.38012695, + "step": 2375, + "time_per_iteration": 2.6580095291137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128206, + "balance_loss_mlp": 1.09320593, + "epoch": 0.45709888418622546, + "flos": 430698249216.0, + "grad_norm": 0.06107240600749233, + "language_loss": 0.87175268, + "learning_rate": 0.0005932254588037486, + "loss": 0.88303471, + "num_input_tokens_seen": 198194944, + "router_z_loss_mlp": 0.35009766, + "step": 2376, + "time_per_iteration": 2.488588333129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121963, + "balance_loss_mlp": 1.08600903, + "epoch": 0.45729126587148905, + "flos": 525654365184.0, + "grad_norm": 0.05478508122440065, + "language_loss": 0.86331463, + "learning_rate": 0.000592919361536623, + "loss": 0.87453431, + "num_input_tokens_seen": 198265728, + "router_z_loss_mlp": 0.35961914, + "step": 2377, + "time_per_iteration": 2.644374132156372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127537, + "balance_loss_mlp": 1.09196472, + "epoch": 0.4574836475567526, + "flos": 638002676736.0, + "grad_norm": 0.05713052679154174, + "language_loss": 0.89246452, + "learning_rate": 0.0005926132281950017, + "loss": 0.90373993, + "num_input_tokens_seen": 198336640, + "router_z_loss_mlp": 0.35571289, + "step": 2378, + "time_per_iteration": 2.7563676834106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121403, + "balance_loss_mlp": 1.08406663, + "epoch": 0.45767602924201617, + "flos": 649588294656.0, + "grad_norm": 0.05503863795363348, + "language_loss": 0.85310149, + "learning_rate": 0.0005923070588977367, + "loss": 0.86431557, + "num_input_tokens_seen": 198413552, + "router_z_loss_mlp": 0.37329102, + "step": 2379, + "time_per_iteration": 2.8923282623291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123743, + "balance_loss_mlp": 1.087098, + "epoch": 0.4578684109272797, + "flos": 746676817920.0, + "grad_norm": 0.05441682742417314, + "language_loss": 0.86308765, + "learning_rate": 0.0005920008537636931, + "loss": 0.8743251, + "num_input_tokens_seen": 198490864, + "router_z_loss_mlp": 0.3659668, + "step": 2380, + "time_per_iteration": 2.8928191661834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121741, + "balance_loss_mlp": 1.0852387, + "epoch": 0.4580607926125433, + "flos": 641469072384.0, + "grad_norm": 0.0522540937039379, + "language_loss": 0.86756825, + "learning_rate": 0.0005916946129117504, + "loss": 0.87878567, + "num_input_tokens_seen": 198571200, + "router_z_loss_mlp": 0.36523438, + "step": 2381, + "time_per_iteration": 2.9031155109405518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129507, + "balance_loss_mlp": 1.09281409, + "epoch": 0.4582531742978069, + "flos": 801857636352.0, + "grad_norm": 0.055637229661903514, + "language_loss": 0.80852348, + "learning_rate": 0.0005913883364608017, + "loss": 0.8198185, + "num_input_tokens_seen": 198658624, + "router_z_loss_mlp": 0.36694336, + "step": 2382, + "time_per_iteration": 3.0779874324798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123213, + "balance_loss_mlp": 1.088094, + "epoch": 0.4584455559830704, + "flos": 684295962624.0, + "grad_norm": 0.05906328885450196, + "language_loss": 0.88737094, + "learning_rate": 0.0005910820245297542, + "loss": 0.89860308, + "num_input_tokens_seen": 198731312, + "router_z_loss_mlp": 0.35131836, + "step": 2383, + "time_per_iteration": 2.889805555343628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119246, + "balance_loss_mlp": 1.0824585, + "epoch": 0.458637937668334, + "flos": 518177544192.0, + "grad_norm": 0.06990697064707628, + "language_loss": 0.80825961, + "learning_rate": 0.000590775677237529, + "loss": 0.81945217, + "num_input_tokens_seen": 198805296, + "router_z_loss_mlp": 0.36791992, + "step": 2384, + "time_per_iteration": 2.7286477088928223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011127, + "balance_loss_mlp": 1.07562566, + "epoch": 0.4588303193535975, + "flos": 505499844096.0, + "grad_norm": 0.06044507930671915, + "language_loss": 0.80186594, + "learning_rate": 0.0005904692947030601, + "loss": 0.81299293, + "num_input_tokens_seen": 198872112, + "router_z_loss_mlp": 0.37084961, + "step": 2385, + "time_per_iteration": 2.6249661445617676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112584, + "balance_loss_mlp": 1.07446146, + "epoch": 0.4590227010388611, + "flos": 495905670144.0, + "grad_norm": 0.06266023003425206, + "language_loss": 0.89858609, + "learning_rate": 0.0005901628770452963, + "loss": 0.90971196, + "num_input_tokens_seen": 198938480, + "router_z_loss_mlp": 0.38110352, + "step": 2386, + "time_per_iteration": 2.628052234649658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106229, + "balance_loss_mlp": 1.06925035, + "epoch": 0.45921508272412465, + "flos": 493620189696.0, + "grad_norm": 0.05741151930163357, + "language_loss": 0.87304425, + "learning_rate": 0.000589856424383199, + "loss": 0.88410658, + "num_input_tokens_seen": 199008608, + "router_z_loss_mlp": 0.36987305, + "step": 2387, + "time_per_iteration": 2.6852517127990723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116091, + "balance_loss_mlp": 1.07863569, + "epoch": 0.45940746440938823, + "flos": 691394683392.0, + "grad_norm": 0.06606538590283985, + "language_loss": 0.83553612, + "learning_rate": 0.000589549936835744, + "loss": 0.84669703, + "num_input_tokens_seen": 199084592, + "router_z_loss_mlp": 0.37451172, + "step": 2388, + "time_per_iteration": 2.8861043453216553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106236, + "balance_loss_mlp": 1.07135534, + "epoch": 0.45959984609465176, + "flos": 503738196480.0, + "grad_norm": 0.06160096974470471, + "language_loss": 0.79523546, + "learning_rate": 0.0005892434145219202, + "loss": 0.80629778, + "num_input_tokens_seen": 199151504, + "router_z_loss_mlp": 0.34912109, + "step": 2389, + "time_per_iteration": 2.6016130447387695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097243, + "balance_loss_mlp": 1.06219506, + "epoch": 0.45979222777991535, + "flos": 676638904320.0, + "grad_norm": 0.07218042116864783, + "language_loss": 0.82768381, + "learning_rate": 0.0005889368575607303, + "loss": 0.83865625, + "num_input_tokens_seen": 199224528, + "router_z_loss_mlp": 0.35058594, + "step": 2390, + "time_per_iteration": 2.806382894515991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102796, + "balance_loss_mlp": 1.06791568, + "epoch": 0.45998460946517894, + "flos": 777653415936.0, + "grad_norm": 0.06321076421250729, + "language_loss": 0.78347147, + "learning_rate": 0.00058863026607119, + "loss": 0.7944994, + "num_input_tokens_seen": 199312512, + "router_z_loss_mlp": 0.34912109, + "step": 2391, + "time_per_iteration": 3.0679373741149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102101, + "balance_loss_mlp": 1.06800711, + "epoch": 0.46017699115044247, + "flos": 851461673472.0, + "grad_norm": 0.07981135891264553, + "language_loss": 0.80153728, + "learning_rate": 0.0005883236401723287, + "loss": 0.81255829, + "num_input_tokens_seen": 199397216, + "router_z_loss_mlp": 0.34130859, + "step": 2392, + "time_per_iteration": 3.178016185760498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102518, + "balance_loss_mlp": 1.06830466, + "epoch": 0.46036937283570606, + "flos": 575878781952.0, + "grad_norm": 0.05809686694512272, + "language_loss": 0.8436439, + "learning_rate": 0.0005880169799831893, + "loss": 0.85466909, + "num_input_tokens_seen": 199464288, + "router_z_loss_mlp": 0.34204102, + "step": 2393, + "time_per_iteration": 2.7394168376922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099974, + "balance_loss_mlp": 1.06537914, + "epoch": 0.4605617545209696, + "flos": 611866109952.0, + "grad_norm": 0.05496993027151255, + "language_loss": 0.81652063, + "learning_rate": 0.0005877102856228278, + "loss": 0.82752037, + "num_input_tokens_seen": 199538096, + "router_z_loss_mlp": 0.34594727, + "step": 2394, + "time_per_iteration": 2.857044219970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107902, + "balance_loss_mlp": 1.07225823, + "epoch": 0.4607541362062332, + "flos": 533138526720.0, + "grad_norm": 0.0685378240754912, + "language_loss": 0.84987622, + "learning_rate": 0.0005874035572103133, + "loss": 0.86095524, + "num_input_tokens_seen": 199609504, + "router_z_loss_mlp": 0.35644531, + "step": 2395, + "time_per_iteration": 2.6805660724639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102808, + "balance_loss_mlp": 1.06699777, + "epoch": 0.4609465178914967, + "flos": 647312726016.0, + "grad_norm": 0.07818612590839771, + "language_loss": 0.82504952, + "learning_rate": 0.0005870967948647288, + "loss": 0.83607757, + "num_input_tokens_seen": 199678960, + "router_z_loss_mlp": 0.35839844, + "step": 2396, + "time_per_iteration": 2.7740094661712646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114998, + "balance_loss_mlp": 1.13801181, + "epoch": 0.4611388995767603, + "flos": 1466287225344.0, + "grad_norm": 0.06620078890509219, + "language_loss": 0.743083, + "learning_rate": 0.0005867899987051693, + "loss": 0.75458288, + "num_input_tokens_seen": 199903568, + "router_z_loss_mlp": 0.11962891, + "step": 2397, + "time_per_iteration": 5.407956838607788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106991, + "balance_loss_mlp": 1.07158542, + "epoch": 0.46133128126202383, + "flos": 723112427520.0, + "grad_norm": 0.05578291602549768, + "language_loss": 0.85959148, + "learning_rate": 0.0005864831688507443, + "loss": 0.87066138, + "num_input_tokens_seen": 199988672, + "router_z_loss_mlp": 0.35424805, + "step": 2398, + "time_per_iteration": 3.000498056411743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108167, + "balance_loss_mlp": 1.07342887, + "epoch": 0.4615236629472874, + "flos": 548010302976.0, + "grad_norm": 0.0567470157783756, + "language_loss": 0.7555595, + "learning_rate": 0.0005861763054205754, + "loss": 0.7666412, + "num_input_tokens_seen": 200062304, + "router_z_loss_mlp": 0.34765625, + "step": 2399, + "time_per_iteration": 2.7206692695617676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108701, + "balance_loss_mlp": 1.07303381, + "epoch": 0.461716044632551, + "flos": 602244771840.0, + "grad_norm": 0.054446102099669776, + "language_loss": 0.80056608, + "learning_rate": 0.0005858694085337976, + "loss": 0.81165302, + "num_input_tokens_seen": 200138464, + "router_z_loss_mlp": 0.35668945, + "step": 2400, + "time_per_iteration": 2.8272197246551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107355, + "balance_loss_mlp": 1.07090116, + "epoch": 0.46190842631781454, + "flos": 474476258304.0, + "grad_norm": 0.06783884534527172, + "language_loss": 0.83774948, + "learning_rate": 0.0005855624783095589, + "loss": 0.84882307, + "num_input_tokens_seen": 200205728, + "router_z_loss_mlp": 0.36425781, + "step": 2401, + "time_per_iteration": 2.6019625663757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102889, + "balance_loss_mlp": 1.06812799, + "epoch": 0.4621008080030781, + "flos": 437483109888.0, + "grad_norm": 0.05559222161472476, + "language_loss": 0.8541491, + "learning_rate": 0.00058525551486702, + "loss": 0.86517805, + "num_input_tokens_seen": 200269824, + "router_z_loss_mlp": 0.34790039, + "step": 2402, + "time_per_iteration": 2.5166754722595215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106757, + "balance_loss_mlp": 1.07058895, + "epoch": 0.46229318968834165, + "flos": 525461644800.0, + "grad_norm": 0.07030933336499708, + "language_loss": 0.80856764, + "learning_rate": 0.0005849485183253548, + "loss": 0.81963521, + "num_input_tokens_seen": 200341264, + "router_z_loss_mlp": 0.36206055, + "step": 2403, + "time_per_iteration": 2.6906049251556396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110586, + "balance_loss_mlp": 1.07090759, + "epoch": 0.46248557137360524, + "flos": 439622857728.0, + "grad_norm": 0.057304610397081915, + "language_loss": 0.87811077, + "learning_rate": 0.0005846414888037501, + "loss": 0.88916934, + "num_input_tokens_seen": 200405632, + "router_z_loss_mlp": 0.34960938, + "step": 2404, + "time_per_iteration": 2.488797426223755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102205, + "balance_loss_mlp": 1.06899309, + "epoch": 0.4626779530588688, + "flos": 617608447488.0, + "grad_norm": 0.05034114049250231, + "language_loss": 0.82261539, + "learning_rate": 0.0005843344264214049, + "loss": 0.83363742, + "num_input_tokens_seen": 200479312, + "router_z_loss_mlp": 0.33203125, + "step": 2405, + "time_per_iteration": 2.746372938156128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110347, + "balance_loss_mlp": 1.07068777, + "epoch": 0.46287033474413236, + "flos": 670108432896.0, + "grad_norm": 0.10060755415937467, + "language_loss": 0.85092008, + "learning_rate": 0.0005840273312975317, + "loss": 0.86195481, + "num_input_tokens_seen": 200552976, + "router_z_loss_mlp": 0.32788086, + "step": 2406, + "time_per_iteration": 2.834230661392212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112626, + "balance_loss_mlp": 1.07829416, + "epoch": 0.46306271642939595, + "flos": 480233276928.0, + "grad_norm": 0.06610522075480575, + "language_loss": 0.90376371, + "learning_rate": 0.0005837202035513555, + "loss": 0.91489005, + "num_input_tokens_seen": 200621088, + "router_z_loss_mlp": 0.34326172, + "step": 2407, + "time_per_iteration": 2.577099084854126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112101, + "balance_loss_mlp": 1.07693422, + "epoch": 0.4632550981146595, + "flos": 580686879744.0, + "grad_norm": 0.06799718927162632, + "language_loss": 0.81987119, + "learning_rate": 0.0005834130433021136, + "loss": 0.83099222, + "num_input_tokens_seen": 200698400, + "router_z_loss_mlp": 0.3515625, + "step": 2408, + "time_per_iteration": 2.751481771469116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111259, + "balance_loss_mlp": 1.07537687, + "epoch": 0.46344747979992307, + "flos": 523964298240.0, + "grad_norm": 0.07576984187058394, + "language_loss": 0.73707795, + "learning_rate": 0.0005831058506690563, + "loss": 0.74819058, + "num_input_tokens_seen": 200767264, + "router_z_loss_mlp": 0.359375, + "step": 2409, + "time_per_iteration": 2.6351587772369385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104428, + "balance_loss_mlp": 1.0719074, + "epoch": 0.4636398614851866, + "flos": 746501349888.0, + "grad_norm": 0.06066453040155937, + "language_loss": 0.86246306, + "learning_rate": 0.0005827986257714464, + "loss": 0.87350732, + "num_input_tokens_seen": 200841440, + "router_z_loss_mlp": 0.32519531, + "step": 2410, + "time_per_iteration": 2.9171712398529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106385, + "balance_loss_mlp": 1.07334006, + "epoch": 0.4638322431704502, + "flos": 596547224064.0, + "grad_norm": 0.05632663018450853, + "language_loss": 0.8897202, + "learning_rate": 0.0005824913687285591, + "loss": 0.90078408, + "num_input_tokens_seen": 200911296, + "router_z_loss_mlp": 0.33032227, + "step": 2411, + "time_per_iteration": 2.6863625049591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104253, + "balance_loss_mlp": 1.07056427, + "epoch": 0.4640246248557137, + "flos": 539443971072.0, + "grad_norm": 0.09102731097831396, + "language_loss": 0.81903768, + "learning_rate": 0.0005821840796596821, + "loss": 0.83008015, + "num_input_tokens_seen": 200981920, + "router_z_loss_mlp": 0.3371582, + "step": 2412, + "time_per_iteration": 2.658602714538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108642, + "balance_loss_mlp": 1.07605052, + "epoch": 0.4642170065409773, + "flos": 562625118720.0, + "grad_norm": 0.04905521047169809, + "language_loss": 0.8043226, + "learning_rate": 0.0005818767586841158, + "loss": 0.81540906, + "num_input_tokens_seen": 201059392, + "router_z_loss_mlp": 0.32592773, + "step": 2413, + "time_per_iteration": 2.7577285766601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108976, + "balance_loss_mlp": 1.07655096, + "epoch": 0.46440938822624084, + "flos": 530959131648.0, + "grad_norm": 0.06302213894221746, + "language_loss": 0.865412, + "learning_rate": 0.0005815694059211726, + "loss": 0.8765018, + "num_input_tokens_seen": 201130192, + "router_z_loss_mlp": 0.32421875, + "step": 2414, + "time_per_iteration": 2.6655328273773193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174358, + "balance_loss_mlp": 1.16362953, + "epoch": 0.4646017699115044, + "flos": 1526325700608.0, + "grad_norm": 0.06384975588330166, + "language_loss": 0.80873632, + "learning_rate": 0.0005812620214901778, + "loss": 0.82047987, + "num_input_tokens_seen": 201354720, + "router_z_loss_mlp": 0.10742188, + "step": 2415, + "time_per_iteration": 4.795905828475952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101653, + "balance_loss_mlp": 1.09135294, + "epoch": 0.464794151596768, + "flos": 1540831859712.0, + "grad_norm": 0.035706806463564576, + "language_loss": 0.7694506, + "learning_rate": 0.000580954605510468, + "loss": 0.78046715, + "num_input_tokens_seen": 201592096, + "router_z_loss_mlp": 0.10302734, + "step": 2416, + "time_per_iteration": 4.964730978012085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100793, + "balance_loss_mlp": 1.06910706, + "epoch": 0.46498653328203154, + "flos": 501467397120.0, + "grad_norm": 0.054161288123553565, + "language_loss": 0.8669647, + "learning_rate": 0.0005806471581013931, + "loss": 0.8779726, + "num_input_tokens_seen": 201666160, + "router_z_loss_mlp": 0.31640625, + "step": 2417, + "time_per_iteration": 2.7034828662872314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106596, + "balance_loss_mlp": 1.07221591, + "epoch": 0.46517891496729513, + "flos": 676144806912.0, + "grad_norm": 0.05684649238509572, + "language_loss": 0.78830767, + "learning_rate": 0.0005803396793823146, + "loss": 0.79937363, + "num_input_tokens_seen": 201733552, + "router_z_loss_mlp": 0.34375, + "step": 2418, + "time_per_iteration": 2.810929536819458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112917, + "balance_loss_mlp": 1.07848907, + "epoch": 0.46537129665255866, + "flos": 585351816192.0, + "grad_norm": 0.07858966703970842, + "language_loss": 0.86256903, + "learning_rate": 0.0005800321694726065, + "loss": 0.87369823, + "num_input_tokens_seen": 201806128, + "router_z_loss_mlp": 0.34423828, + "step": 2419, + "time_per_iteration": 2.797091484069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113087, + "balance_loss_mlp": 1.07880187, + "epoch": 0.46556367833782225, + "flos": 587704108032.0, + "grad_norm": 0.06627504844203173, + "language_loss": 0.86954433, + "learning_rate": 0.0005797246284916545, + "loss": 0.8806752, + "num_input_tokens_seen": 201874224, + "router_z_loss_mlp": 0.34277344, + "step": 2420, + "time_per_iteration": 2.689190149307251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103612, + "balance_loss_mlp": 1.09355068, + "epoch": 0.4657560600230858, + "flos": 1485453551616.0, + "grad_norm": 0.047662019725998206, + "language_loss": 0.77505189, + "learning_rate": 0.0005794170565588569, + "loss": 0.78608793, + "num_input_tokens_seen": 202111648, + "router_z_loss_mlp": 0.10058594, + "step": 2421, + "time_per_iteration": 6.38897705078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112649, + "balance_loss_mlp": 1.09318316, + "epoch": 0.46594844170834937, + "flos": 580247110656.0, + "grad_norm": 0.06710074217369558, + "language_loss": 0.88096154, + "learning_rate": 0.0005791094537936233, + "loss": 0.8922264, + "num_input_tokens_seen": 202183344, + "router_z_loss_mlp": 0.33325195, + "step": 2422, + "time_per_iteration": 4.209144353866577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126187, + "balance_loss_mlp": 1.09340453, + "epoch": 0.4661408233936129, + "flos": 512571400704.0, + "grad_norm": 0.0626199173608307, + "language_loss": 0.82125473, + "learning_rate": 0.0005788018203153762, + "loss": 0.83251661, + "num_input_tokens_seen": 202252512, + "router_z_loss_mlp": 0.32788086, + "step": 2423, + "time_per_iteration": 2.583918333053589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138161, + "balance_loss_mlp": 1.10540235, + "epoch": 0.4663332050788765, + "flos": 491077748736.0, + "grad_norm": 0.07666207610831233, + "language_loss": 0.85944337, + "learning_rate": 0.000578494156243549, + "loss": 0.87082505, + "num_input_tokens_seen": 202320096, + "router_z_loss_mlp": 0.32763672, + "step": 2424, + "time_per_iteration": 2.582838296890259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142028, + "balance_loss_mlp": 1.10779119, + "epoch": 0.4665255867641401, + "flos": 512623157760.0, + "grad_norm": 0.11745148991984863, + "language_loss": 0.89446878, + "learning_rate": 0.0005781864616975878, + "loss": 0.90588903, + "num_input_tokens_seen": 202391552, + "router_z_loss_mlp": 0.3425293, + "step": 2425, + "time_per_iteration": 2.6464650630950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135149, + "balance_loss_mlp": 1.10081649, + "epoch": 0.4667179684494036, + "flos": 424812750336.0, + "grad_norm": 0.07242740344873133, + "language_loss": 0.84278369, + "learning_rate": 0.0005778787367969502, + "loss": 0.85413516, + "num_input_tokens_seen": 202457328, + "router_z_loss_mlp": 0.34375, + "step": 2426, + "time_per_iteration": 2.5785605907440186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131581, + "balance_loss_mlp": 1.09822595, + "epoch": 0.4669103501346672, + "flos": 707956526592.0, + "grad_norm": 0.06251358549871673, + "language_loss": 0.81181312, + "learning_rate": 0.0005775709816611053, + "loss": 0.82312894, + "num_input_tokens_seen": 202535888, + "router_z_loss_mlp": 0.33374023, + "step": 2427, + "time_per_iteration": 2.9622879028320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125428, + "balance_loss_mlp": 1.09100056, + "epoch": 0.4671027318199307, + "flos": 554832239616.0, + "grad_norm": 0.06013841542134278, + "language_loss": 0.83607411, + "learning_rate": 0.0005772631964095346, + "loss": 0.84732836, + "num_input_tokens_seen": 202608400, + "router_z_loss_mlp": 0.34448242, + "step": 2428, + "time_per_iteration": 2.681161403656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123616, + "balance_loss_mlp": 1.08990407, + "epoch": 0.4672951135051943, + "flos": 567109817856.0, + "grad_norm": 0.05815575913312505, + "language_loss": 0.85975552, + "learning_rate": 0.000576955381161731, + "loss": 0.87099165, + "num_input_tokens_seen": 202677712, + "router_z_loss_mlp": 0.3371582, + "step": 2429, + "time_per_iteration": 2.670814275741577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122337, + "balance_loss_mlp": 1.08843446, + "epoch": 0.46748749519045785, + "flos": 424518713856.0, + "grad_norm": 0.07250877112671852, + "language_loss": 0.86541677, + "learning_rate": 0.0005766475360371985, + "loss": 0.8766402, + "num_input_tokens_seen": 202743824, + "router_z_loss_mlp": 0.33935547, + "step": 2430, + "time_per_iteration": 2.5907814502716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118824, + "balance_loss_mlp": 1.08368063, + "epoch": 0.46767987687572143, + "flos": 538344548352.0, + "grad_norm": 0.0946745942266809, + "language_loss": 0.84659714, + "learning_rate": 0.0005763396611554536, + "loss": 0.85778534, + "num_input_tokens_seen": 202813072, + "router_z_loss_mlp": 0.3515625, + "step": 2431, + "time_per_iteration": 2.679352045059204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123862, + "balance_loss_mlp": 1.0890286, + "epoch": 0.467872258560985, + "flos": 823702224384.0, + "grad_norm": 0.06880905442669231, + "language_loss": 0.80567783, + "learning_rate": 0.0005760317566360237, + "loss": 0.81691647, + "num_input_tokens_seen": 202886576, + "router_z_loss_mlp": 0.34838867, + "step": 2432, + "time_per_iteration": 3.0134341716766357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116239, + "balance_loss_mlp": 1.08090591, + "epoch": 0.46806464024624855, + "flos": 661663240704.0, + "grad_norm": 0.09211359876128772, + "language_loss": 0.85498667, + "learning_rate": 0.000575723822598448, + "loss": 0.86614907, + "num_input_tokens_seen": 202956736, + "router_z_loss_mlp": 0.35375977, + "step": 2433, + "time_per_iteration": 2.807387351989746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113775, + "balance_loss_mlp": 1.07882285, + "epoch": 0.46825702193151214, + "flos": 755700171264.0, + "grad_norm": 0.07984033993726149, + "language_loss": 0.81515086, + "learning_rate": 0.0005754158591622773, + "loss": 0.82628858, + "num_input_tokens_seen": 203036432, + "router_z_loss_mlp": 0.35009766, + "step": 2434, + "time_per_iteration": 2.9610190391540527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108856, + "balance_loss_mlp": 1.07335579, + "epoch": 0.4684494036167757, + "flos": 439393061376.0, + "grad_norm": 0.08173781127815187, + "language_loss": 0.83058012, + "learning_rate": 0.0005751078664470732, + "loss": 0.84166867, + "num_input_tokens_seen": 203101904, + "router_z_loss_mlp": 0.35522461, + "step": 2435, + "time_per_iteration": 2.5381393432617188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105873, + "balance_loss_mlp": 1.07125473, + "epoch": 0.46864178530203926, + "flos": 532706098176.0, + "grad_norm": 0.06625067078188727, + "language_loss": 0.86156499, + "learning_rate": 0.0005747998445724094, + "loss": 0.87262368, + "num_input_tokens_seen": 203170272, + "router_z_loss_mlp": 0.34643555, + "step": 2436, + "time_per_iteration": 2.5991244316101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110631, + "balance_loss_mlp": 1.0730263, + "epoch": 0.4688341669873028, + "flos": 576627268608.0, + "grad_norm": 0.06922366477490534, + "language_loss": 0.8967731, + "learning_rate": 0.0005744917936578707, + "loss": 0.90783614, + "num_input_tokens_seen": 203243920, + "router_z_loss_mlp": 0.33276367, + "step": 2437, + "time_per_iteration": 2.7876076698303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110478, + "balance_loss_mlp": 1.07087731, + "epoch": 0.4690265486725664, + "flos": 539579791872.0, + "grad_norm": 0.05346939801811538, + "language_loss": 0.83987176, + "learning_rate": 0.0005741837138230526, + "loss": 0.8509196, + "num_input_tokens_seen": 203321760, + "router_z_loss_mlp": 0.33911133, + "step": 2438, + "time_per_iteration": 2.7089829444885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110533, + "balance_loss_mlp": 1.07063985, + "epoch": 0.4692189303578299, + "flos": 770510278656.0, + "grad_norm": 0.06113216144822436, + "language_loss": 0.8632471, + "learning_rate": 0.0005738756051875627, + "loss": 0.87430036, + "num_input_tokens_seen": 203409088, + "router_z_loss_mlp": 0.34692383, + "step": 2439, + "time_per_iteration": 3.10072922706604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106031, + "balance_loss_mlp": 1.07031631, + "epoch": 0.4694113120430935, + "flos": 571396654080.0, + "grad_norm": 0.054040954813727636, + "language_loss": 0.83196378, + "learning_rate": 0.0005735674678710192, + "loss": 0.84302408, + "num_input_tokens_seen": 203481680, + "router_z_loss_mlp": 0.35668945, + "step": 2440, + "time_per_iteration": 2.6844449043273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101989, + "balance_loss_mlp": 1.06644058, + "epoch": 0.4696036937283571, + "flos": 748816565760.0, + "grad_norm": 0.06378034204188901, + "language_loss": 0.81315678, + "learning_rate": 0.0005732593019930517, + "loss": 0.82417667, + "num_input_tokens_seen": 203554848, + "router_z_loss_mlp": 0.35571289, + "step": 2441, + "time_per_iteration": 2.8945391178131104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113581, + "balance_loss_mlp": 1.0766257, + "epoch": 0.4697960754136206, + "flos": 493454633472.0, + "grad_norm": 0.0589509513637404, + "language_loss": 0.88047123, + "learning_rate": 0.0005729511076733008, + "loss": 0.89160711, + "num_input_tokens_seen": 203624816, + "router_z_loss_mlp": 0.36962891, + "step": 2442, + "time_per_iteration": 2.6688244342803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119163, + "balance_loss_mlp": 1.08199334, + "epoch": 0.4699884570988842, + "flos": 725118925824.0, + "grad_norm": 0.06849073497169517, + "language_loss": 0.84747314, + "learning_rate": 0.000572642885031418, + "loss": 0.85866475, + "num_input_tokens_seen": 203698256, + "router_z_loss_mlp": 0.37207031, + "step": 2443, + "time_per_iteration": 2.9179134368896484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108245, + "balance_loss_mlp": 1.07276881, + "epoch": 0.47018083878414774, + "flos": 555427653120.0, + "grad_norm": 0.0584848920178353, + "language_loss": 0.80748844, + "learning_rate": 0.0005723346341870662, + "loss": 0.81857085, + "num_input_tokens_seen": 203772672, + "router_z_loss_mlp": 0.35522461, + "step": 2444, + "time_per_iteration": 2.701399087905884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129757, + "balance_loss_mlp": 1.09277797, + "epoch": 0.4703732204694113, + "flos": 424069032960.0, + "grad_norm": 0.11865712100152984, + "language_loss": 0.86692929, + "learning_rate": 0.0005720263552599188, + "loss": 0.87822688, + "num_input_tokens_seen": 203835904, + "router_z_loss_mlp": 0.36962891, + "step": 2445, + "time_per_iteration": 2.4486730098724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121458, + "balance_loss_mlp": 1.08500421, + "epoch": 0.47056560215467486, + "flos": 703494222336.0, + "grad_norm": 0.08366602087356424, + "language_loss": 0.79955238, + "learning_rate": 0.0005717180483696604, + "loss": 0.81076699, + "num_input_tokens_seen": 203914704, + "router_z_loss_mlp": 0.36499023, + "step": 2446, + "time_per_iteration": 2.8785839080810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120985, + "balance_loss_mlp": 1.08486462, + "epoch": 0.47075798383993844, + "flos": 554963291136.0, + "grad_norm": 0.0682417361382486, + "language_loss": 0.83352333, + "learning_rate": 0.0005714097136359862, + "loss": 0.84473318, + "num_input_tokens_seen": 203985072, + "router_z_loss_mlp": 0.36157227, + "step": 2447, + "time_per_iteration": 2.6363351345062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118201, + "balance_loss_mlp": 1.08296275, + "epoch": 0.470950365525202, + "flos": 564305273856.0, + "grad_norm": 0.051381339811927676, + "language_loss": 0.86498094, + "learning_rate": 0.0005711013511786027, + "loss": 0.87616301, + "num_input_tokens_seen": 204061904, + "router_z_loss_mlp": 0.35253906, + "step": 2448, + "time_per_iteration": 2.762845993041992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111685, + "balance_loss_mlp": 1.08170676, + "epoch": 0.47114274721046556, + "flos": 534450493440.0, + "grad_norm": 0.058854412729412026, + "language_loss": 0.84082228, + "learning_rate": 0.0005707929611172263, + "loss": 0.85199082, + "num_input_tokens_seen": 204137392, + "router_z_loss_mlp": 0.3515625, + "step": 2449, + "time_per_iteration": 2.7246243953704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115864, + "balance_loss_mlp": 1.08007717, + "epoch": 0.47133512889572915, + "flos": 473117303808.0, + "grad_norm": 0.11039935923903105, + "language_loss": 0.84227139, + "learning_rate": 0.000570484543571585, + "loss": 0.85343003, + "num_input_tokens_seen": 204202752, + "router_z_loss_mlp": 0.35791016, + "step": 2450, + "time_per_iteration": 2.610919237136841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113904, + "balance_loss_mlp": 1.0777123, + "epoch": 0.4715275105809927, + "flos": 459013837824.0, + "grad_norm": 0.0667594391398321, + "language_loss": 0.82813287, + "learning_rate": 0.0005701760986614171, + "loss": 0.8392719, + "num_input_tokens_seen": 204266960, + "router_z_loss_mlp": 0.36181641, + "step": 2451, + "time_per_iteration": 2.5151522159576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120641, + "balance_loss_mlp": 1.08590317, + "epoch": 0.47171989226625627, + "flos": 422003437056.0, + "grad_norm": 0.0603467987943219, + "language_loss": 0.87650943, + "learning_rate": 0.0005698676265064714, + "loss": 0.88771582, + "num_input_tokens_seen": 204331216, + "router_z_loss_mlp": 0.34765625, + "step": 2452, + "time_per_iteration": 2.5722150802612305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114487, + "balance_loss_mlp": 1.07920074, + "epoch": 0.4719122739515198, + "flos": 457434998784.0, + "grad_norm": 0.07549274937771847, + "language_loss": 0.89053345, + "learning_rate": 0.0005695591272265074, + "loss": 0.90167832, + "num_input_tokens_seen": 204397216, + "router_z_loss_mlp": 0.35327148, + "step": 2453, + "time_per_iteration": 2.5431923866271973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109778, + "balance_loss_mlp": 1.07384801, + "epoch": 0.4721046556367834, + "flos": 514975449600.0, + "grad_norm": 0.05406998074400625, + "language_loss": 0.82143486, + "learning_rate": 0.0005692506009412954, + "loss": 0.83253264, + "num_input_tokens_seen": 204469952, + "router_z_loss_mlp": 0.359375, + "step": 2454, + "time_per_iteration": 2.6976101398468018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176153, + "balance_loss_mlp": 1.16375494, + "epoch": 0.4722970373220469, + "flos": 1572258138624.0, + "grad_norm": 0.047894752053778404, + "language_loss": 0.7755127, + "learning_rate": 0.0005689420477706156, + "loss": 0.78727424, + "num_input_tokens_seen": 204701152, + "router_z_loss_mlp": 0.12402344, + "step": 2455, + "time_per_iteration": 5.006427049636841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103739, + "balance_loss_mlp": 1.07000232, + "epoch": 0.4724894190073105, + "flos": 586214102016.0, + "grad_norm": 0.07748007609747588, + "language_loss": 0.89838475, + "learning_rate": 0.0005686334678342593, + "loss": 0.90942216, + "num_input_tokens_seen": 204778144, + "router_z_loss_mlp": 0.3371582, + "step": 2456, + "time_per_iteration": 2.88089919090271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110083, + "balance_loss_mlp": 1.07586968, + "epoch": 0.4726818006925741, + "flos": 867645789696.0, + "grad_norm": 0.053591450648947214, + "language_loss": 0.81747675, + "learning_rate": 0.0005683248612520274, + "loss": 0.82857764, + "num_input_tokens_seen": 204853376, + "router_z_loss_mlp": 0.34204102, + "step": 2457, + "time_per_iteration": 3.0411272048950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111421, + "balance_loss_mlp": 1.07811391, + "epoch": 0.4728741823778376, + "flos": 752967581184.0, + "grad_norm": 0.10239407628225645, + "language_loss": 0.84273934, + "learning_rate": 0.0005680162281437321, + "loss": 0.85388148, + "num_input_tokens_seen": 204925280, + "router_z_loss_mlp": 0.36083984, + "step": 2458, + "time_per_iteration": 2.8898301124572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120752, + "balance_loss_mlp": 1.08608592, + "epoch": 0.4730665640631012, + "flos": 538571773440.0, + "grad_norm": 0.0555075738071769, + "language_loss": 0.85104299, + "learning_rate": 0.000567707568629195, + "loss": 0.86225057, + "num_input_tokens_seen": 205000592, + "router_z_loss_mlp": 0.34692383, + "step": 2459, + "time_per_iteration": 2.706040143966675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122742, + "balance_loss_mlp": 1.08778977, + "epoch": 0.47325894574836475, + "flos": 491653338624.0, + "grad_norm": 0.06127780861136823, + "language_loss": 0.82619834, + "learning_rate": 0.0005673988828282486, + "loss": 0.83742571, + "num_input_tokens_seen": 205073968, + "router_z_loss_mlp": 0.34985352, + "step": 2460, + "time_per_iteration": 2.674525499343872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111568, + "balance_loss_mlp": 1.07668757, + "epoch": 0.47345132743362833, + "flos": 764459223552.0, + "grad_norm": 0.05574274236604154, + "language_loss": 0.81308633, + "learning_rate": 0.0005670901708607352, + "loss": 0.82420194, + "num_input_tokens_seen": 205153536, + "router_z_loss_mlp": 0.34912109, + "step": 2461, + "time_per_iteration": 2.982827663421631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109707, + "balance_loss_mlp": 1.0753746, + "epoch": 0.47364370911889186, + "flos": 540173007360.0, + "grad_norm": 0.15434207723638854, + "language_loss": 0.84411561, + "learning_rate": 0.0005667814328465076, + "loss": 0.85521269, + "num_input_tokens_seen": 205220944, + "router_z_loss_mlp": 0.34350586, + "step": 2462, + "time_per_iteration": 2.639051914215088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106499, + "balance_loss_mlp": 1.07245243, + "epoch": 0.47383609080415545, + "flos": 406219815936.0, + "grad_norm": 0.07072772635633937, + "language_loss": 0.81988347, + "learning_rate": 0.0005664726689054285, + "loss": 0.83094847, + "num_input_tokens_seen": 205282688, + "router_z_loss_mlp": 0.34033203, + "step": 2463, + "time_per_iteration": 2.4655356407165527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112663, + "balance_loss_mlp": 1.07973766, + "epoch": 0.474028472489419, + "flos": 453476703744.0, + "grad_norm": 0.06987107232693553, + "language_loss": 0.8107388, + "learning_rate": 0.0005661638791573704, + "loss": 0.82186544, + "num_input_tokens_seen": 205357360, + "router_z_loss_mlp": 0.32958984, + "step": 2464, + "time_per_iteration": 2.7433135509490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111686, + "balance_loss_mlp": 1.07742512, + "epoch": 0.47422085417468257, + "flos": 492177171456.0, + "grad_norm": 0.060845328276789123, + "language_loss": 0.87247777, + "learning_rate": 0.0005658550637222164, + "loss": 0.88359463, + "num_input_tokens_seen": 205424352, + "router_z_loss_mlp": 0.34277344, + "step": 2465, + "time_per_iteration": 2.615755558013916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113074, + "balance_loss_mlp": 1.07762074, + "epoch": 0.47441323585994616, + "flos": 738854203392.0, + "grad_norm": 0.05153784391367151, + "language_loss": 0.82349539, + "learning_rate": 0.0005655462227198592, + "loss": 0.83462608, + "num_input_tokens_seen": 205502912, + "router_z_loss_mlp": 0.35473633, + "step": 2466, + "time_per_iteration": 2.91003680229187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109891, + "balance_loss_mlp": 1.07460487, + "epoch": 0.4746056175452097, + "flos": 484685669376.0, + "grad_norm": 0.055186067432112955, + "language_loss": 0.84493053, + "learning_rate": 0.0005652373562702016, + "loss": 0.85602945, + "num_input_tokens_seen": 205571168, + "router_z_loss_mlp": 0.3527832, + "step": 2467, + "time_per_iteration": 2.6209630966186523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117567, + "balance_loss_mlp": 1.07982516, + "epoch": 0.4747979992304733, + "flos": 461052269568.0, + "grad_norm": 0.06952200013405305, + "language_loss": 0.88642848, + "learning_rate": 0.000564928464493156, + "loss": 0.89760423, + "num_input_tokens_seen": 205639648, + "router_z_loss_mlp": 0.37744141, + "step": 2468, + "time_per_iteration": 2.609154224395752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117938, + "balance_loss_mlp": 1.0807451, + "epoch": 0.4749903809157368, + "flos": 864431212032.0, + "grad_norm": 0.05705018138682977, + "language_loss": 0.81856024, + "learning_rate": 0.000564619547508645, + "loss": 0.82973957, + "num_input_tokens_seen": 205721536, + "router_z_loss_mlp": 0.37158203, + "step": 2469, + "time_per_iteration": 3.041351556777954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117314, + "balance_loss_mlp": 1.07849944, + "epoch": 0.4751827626010004, + "flos": 505546831872.0, + "grad_norm": 0.08036472839994792, + "language_loss": 0.83256048, + "learning_rate": 0.0005643106054366008, + "loss": 0.84373355, + "num_input_tokens_seen": 205788512, + "router_z_loss_mlp": 0.38818359, + "step": 2470, + "time_per_iteration": 2.5631182193756104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108966, + "balance_loss_mlp": 1.07258332, + "epoch": 0.47537514428626393, + "flos": 559388519424.0, + "grad_norm": 0.05805518051262763, + "language_loss": 0.79916292, + "learning_rate": 0.000564001638396965, + "loss": 0.81025255, + "num_input_tokens_seen": 205863104, + "router_z_loss_mlp": 0.36376953, + "step": 2471, + "time_per_iteration": 2.7381579875946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110653, + "balance_loss_mlp": 1.0717926, + "epoch": 0.4755675259715275, + "flos": 834260000256.0, + "grad_norm": 0.0665112346682766, + "language_loss": 0.82313401, + "learning_rate": 0.0005636926465096897, + "loss": 0.83419931, + "num_input_tokens_seen": 205940688, + "router_z_loss_mlp": 0.34741211, + "step": 2472, + "time_per_iteration": 3.0346837043762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111103, + "balance_loss_mlp": 1.07622218, + "epoch": 0.47575990765679105, + "flos": 508237576704.0, + "grad_norm": 0.06532220540392095, + "language_loss": 0.87808621, + "learning_rate": 0.0005633836298947363, + "loss": 0.88919711, + "num_input_tokens_seen": 206008352, + "router_z_loss_mlp": 0.34912109, + "step": 2473, + "time_per_iteration": 2.587581157684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122307, + "balance_loss_mlp": 1.08716393, + "epoch": 0.47595228934205464, + "flos": 591845211648.0, + "grad_norm": 0.09099011339346055, + "language_loss": 0.70947754, + "learning_rate": 0.000563074588672075, + "loss": 0.72070062, + "num_input_tokens_seen": 206078240, + "router_z_loss_mlp": 0.3515625, + "step": 2474, + "time_per_iteration": 2.7112982273101807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125507, + "balance_loss_mlp": 1.09012604, + "epoch": 0.4761446710273182, + "flos": 580607958528.0, + "grad_norm": 0.06360669353624634, + "language_loss": 0.85420531, + "learning_rate": 0.0005627655229616868, + "loss": 0.8654604, + "num_input_tokens_seen": 206148896, + "router_z_loss_mlp": 0.35400391, + "step": 2475, + "time_per_iteration": 2.7166192531585693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131445, + "balance_loss_mlp": 1.09532499, + "epoch": 0.47633705271258175, + "flos": 672893153280.0, + "grad_norm": 0.05566651470752815, + "language_loss": 0.90219474, + "learning_rate": 0.0005624564328835616, + "loss": 0.91350919, + "num_input_tokens_seen": 206223792, + "router_z_loss_mlp": 0.36132812, + "step": 2476, + "time_per_iteration": 2.8342158794403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119281, + "balance_loss_mlp": 1.08339906, + "epoch": 0.47652943439784534, + "flos": 541857931776.0, + "grad_norm": 0.06751222051526788, + "language_loss": 0.8450973, + "learning_rate": 0.0005621473185576986, + "loss": 0.85629016, + "num_input_tokens_seen": 206299376, + "router_z_loss_mlp": 0.35913086, + "step": 2477, + "time_per_iteration": 2.727320432662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126891, + "balance_loss_mlp": 1.0915097, + "epoch": 0.4767218160831089, + "flos": 524819243520.0, + "grad_norm": 0.06498777385437565, + "language_loss": 0.87181318, + "learning_rate": 0.0005618381801041068, + "loss": 0.88308215, + "num_input_tokens_seen": 206367936, + "router_z_loss_mlp": 0.35400391, + "step": 2478, + "time_per_iteration": 2.622197389602661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136775, + "balance_loss_mlp": 1.09965336, + "epoch": 0.47691419776837246, + "flos": 568056167424.0, + "grad_norm": 0.0693017023966873, + "language_loss": 0.83176625, + "learning_rate": 0.0005615290176428044, + "loss": 0.84313405, + "num_input_tokens_seen": 206438864, + "router_z_loss_mlp": 0.37084961, + "step": 2479, + "time_per_iteration": 2.6874895095825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137347, + "balance_loss_mlp": 1.10275292, + "epoch": 0.477106579453636, + "flos": 530931967488.0, + "grad_norm": 0.06633902685884922, + "language_loss": 0.85015559, + "learning_rate": 0.0005612198312938187, + "loss": 0.86152905, + "num_input_tokens_seen": 206516656, + "router_z_loss_mlp": 0.34619141, + "step": 2480, + "time_per_iteration": 2.7283356189727783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143197, + "balance_loss_mlp": 1.10717165, + "epoch": 0.4772989611388996, + "flos": 594283765248.0, + "grad_norm": 0.08700724997250119, + "language_loss": 0.79558903, + "learning_rate": 0.0005609106211771868, + "loss": 0.80702102, + "num_input_tokens_seen": 206595040, + "router_z_loss_mlp": 0.36035156, + "step": 2481, + "time_per_iteration": 2.8008668422698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155105, + "balance_loss_mlp": 1.11857891, + "epoch": 0.4774913428241631, + "flos": 544622828544.0, + "grad_norm": 0.07115217474866456, + "language_loss": 0.89249581, + "learning_rate": 0.0005606013874129543, + "loss": 0.90404689, + "num_input_tokens_seen": 206670192, + "router_z_loss_mlp": 0.36523438, + "step": 2482, + "time_per_iteration": 2.746906280517578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146439, + "balance_loss_mlp": 1.11027122, + "epoch": 0.4776837245094267, + "flos": 540079031808.0, + "grad_norm": 0.052135079835272054, + "language_loss": 0.80459106, + "learning_rate": 0.0005602921301211768, + "loss": 0.81605548, + "num_input_tokens_seen": 206746992, + "router_z_loss_mlp": 0.36181641, + "step": 2483, + "time_per_iteration": 2.760091543197632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133668, + "balance_loss_mlp": 1.09895456, + "epoch": 0.4778761061946903, + "flos": 471785513472.0, + "grad_norm": 0.06775745953777351, + "language_loss": 0.82220864, + "learning_rate": 0.0005599828494219185, + "loss": 0.83354533, + "num_input_tokens_seen": 206813584, + "router_z_loss_mlp": 0.34716797, + "step": 2484, + "time_per_iteration": 2.5458662509918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113545, + "balance_loss_mlp": 1.10004473, + "epoch": 0.4780684878799538, + "flos": 726082527744.0, + "grad_norm": 0.08200141457856946, + "language_loss": 0.89550984, + "learning_rate": 0.0005596735454352527, + "loss": 0.90686429, + "num_input_tokens_seen": 206885840, + "router_z_loss_mlp": 0.35424805, + "step": 2485, + "time_per_iteration": 2.8570785522460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143886, + "balance_loss_mlp": 1.1075511, + "epoch": 0.4782608695652174, + "flos": 548922147840.0, + "grad_norm": 0.07792091337932193, + "language_loss": 0.85635722, + "learning_rate": 0.0005593642182812619, + "loss": 0.86779606, + "num_input_tokens_seen": 206955104, + "router_z_loss_mlp": 0.36352539, + "step": 2486, + "time_per_iteration": 2.630213975906372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139867, + "balance_loss_mlp": 1.10369921, + "epoch": 0.47845325125048094, + "flos": 829923604992.0, + "grad_norm": 0.06102595686098437, + "language_loss": 0.83692348, + "learning_rate": 0.0005590548680800378, + "loss": 0.84832209, + "num_input_tokens_seen": 207039792, + "router_z_loss_mlp": 0.36206055, + "step": 2487, + "time_per_iteration": 3.1342179775238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139371, + "balance_loss_mlp": 1.10389483, + "epoch": 0.4786456329357445, + "flos": 514164920832.0, + "grad_norm": 0.0657277256500081, + "language_loss": 0.76383913, + "learning_rate": 0.0005587454949516804, + "loss": 0.77523285, + "num_input_tokens_seen": 207115632, + "router_z_loss_mlp": 0.35498047, + "step": 2488, + "time_per_iteration": 2.6958112716674805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145548, + "balance_loss_mlp": 1.10833097, + "epoch": 0.47883801462100806, + "flos": 564658781184.0, + "grad_norm": 0.061160167550216256, + "language_loss": 0.88185161, + "learning_rate": 0.0005584360990162993, + "loss": 0.89330709, + "num_input_tokens_seen": 207184336, + "router_z_loss_mlp": 0.37255859, + "step": 2489, + "time_per_iteration": 2.61667537689209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133811, + "balance_loss_mlp": 1.09881115, + "epoch": 0.47903039630627164, + "flos": 579577545216.0, + "grad_norm": 0.0507120714137282, + "language_loss": 0.85551566, + "learning_rate": 0.0005581266803940124, + "loss": 0.86685371, + "num_input_tokens_seen": 207258720, + "router_z_loss_mlp": 0.35009766, + "step": 2490, + "time_per_iteration": 2.7139766216278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133471, + "balance_loss_mlp": 1.09649253, + "epoch": 0.47922277799153523, + "flos": 618950149632.0, + "grad_norm": 0.0583035541715914, + "language_loss": 0.87154239, + "learning_rate": 0.0005578172392049471, + "loss": 0.88287711, + "num_input_tokens_seen": 207329216, + "router_z_loss_mlp": 0.36987305, + "step": 2491, + "time_per_iteration": 2.7481577396392822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134521, + "balance_loss_mlp": 1.09918737, + "epoch": 0.47941515967679876, + "flos": 639653096448.0, + "grad_norm": 0.08141144255217014, + "language_loss": 0.84311044, + "learning_rate": 0.0005575077755692386, + "loss": 0.85445559, + "num_input_tokens_seen": 207403712, + "router_z_loss_mlp": 0.35351562, + "step": 2492, + "time_per_iteration": 2.7962934970855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132576, + "balance_loss_mlp": 1.09793389, + "epoch": 0.47960754136206235, + "flos": 519823194624.0, + "grad_norm": 0.053456927876726165, + "language_loss": 0.86199152, + "learning_rate": 0.0005571982896070316, + "loss": 0.87331724, + "num_input_tokens_seen": 207477120, + "router_z_loss_mlp": 0.34692383, + "step": 2493, + "time_per_iteration": 2.6755988597869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131371, + "balance_loss_mlp": 1.09534633, + "epoch": 0.4797999230473259, + "flos": 475044507648.0, + "grad_norm": 0.059320296473078654, + "language_loss": 0.89793247, + "learning_rate": 0.0005568887814384792, + "loss": 0.90924621, + "num_input_tokens_seen": 207544592, + "router_z_loss_mlp": 0.36035156, + "step": 2494, + "time_per_iteration": 2.5790021419525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139931, + "balance_loss_mlp": 1.1042639, + "epoch": 0.47999230473258947, + "flos": 532026620928.0, + "grad_norm": 0.061123462827233396, + "language_loss": 0.87048668, + "learning_rate": 0.000556579251183743, + "loss": 0.88188601, + "num_input_tokens_seen": 207613808, + "router_z_loss_mlp": 0.35693359, + "step": 2495, + "time_per_iteration": 2.6916205883026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134229, + "balance_loss_mlp": 1.0992769, + "epoch": 0.480184686417853, + "flos": 601486373376.0, + "grad_norm": 0.05789705924573782, + "language_loss": 0.80256224, + "learning_rate": 0.0005562696989629936, + "loss": 0.81390452, + "num_input_tokens_seen": 207684464, + "router_z_loss_mlp": 0.34960938, + "step": 2496, + "time_per_iteration": 2.690638542175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133544, + "balance_loss_mlp": 1.0990684, + "epoch": 0.4803770681031166, + "flos": 528196806144.0, + "grad_norm": 0.06114364023526716, + "language_loss": 0.82642174, + "learning_rate": 0.0005559601248964095, + "loss": 0.83775711, + "num_input_tokens_seen": 207754016, + "router_z_loss_mlp": 0.34521484, + "step": 2497, + "time_per_iteration": 2.6249618530273438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135944, + "balance_loss_mlp": 1.10249412, + "epoch": 0.4805694497883801, + "flos": 511192622592.0, + "grad_norm": 0.06899971908711858, + "language_loss": 0.85956562, + "learning_rate": 0.0005556505291041783, + "loss": 0.87092507, + "num_input_tokens_seen": 207827104, + "router_z_loss_mlp": 0.33447266, + "step": 2498, + "time_per_iteration": 2.7098748683929443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135161, + "balance_loss_mlp": 1.10097158, + "epoch": 0.4807618314736437, + "flos": 600342160896.0, + "grad_norm": 0.055207166893370456, + "language_loss": 0.84689957, + "learning_rate": 0.0005553409117064954, + "loss": 0.85825121, + "num_input_tokens_seen": 207907824, + "router_z_loss_mlp": 0.34228516, + "step": 2499, + "time_per_iteration": 2.8708267211914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136874, + "balance_loss_mlp": 1.10242295, + "epoch": 0.4809542131589073, + "flos": 568965441024.0, + "grad_norm": 0.06687134527330599, + "language_loss": 0.8476308, + "learning_rate": 0.0005550312728235654, + "loss": 0.85899949, + "num_input_tokens_seen": 207975632, + "router_z_loss_mlp": 0.34448242, + "step": 2500, + "time_per_iteration": 2.6980721950531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128863, + "balance_loss_mlp": 1.09500802, + "epoch": 0.4811465948441708, + "flos": 575994779136.0, + "grad_norm": 0.07829313389837793, + "language_loss": 0.83860761, + "learning_rate": 0.0005547216125756003, + "loss": 0.84989619, + "num_input_tokens_seen": 208048000, + "router_z_loss_mlp": 0.33862305, + "step": 2501, + "time_per_iteration": 2.737539291381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140174, + "balance_loss_mlp": 1.10729611, + "epoch": 0.4813389765294344, + "flos": 823865209344.0, + "grad_norm": 0.06644553638954338, + "language_loss": 0.82266629, + "learning_rate": 0.0005544119310828211, + "loss": 0.83406806, + "num_input_tokens_seen": 208132592, + "router_z_loss_mlp": 0.32885742, + "step": 2502, + "time_per_iteration": 3.082392930984497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125836, + "balance_loss_mlp": 1.09245706, + "epoch": 0.48153135821469795, + "flos": 635531816448.0, + "grad_norm": 0.061244964440333595, + "language_loss": 0.85365945, + "learning_rate": 0.0005541022284654568, + "loss": 0.86491781, + "num_input_tokens_seen": 208215824, + "router_z_loss_mlp": 0.33398438, + "step": 2503, + "time_per_iteration": 2.9372761249542236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125034, + "balance_loss_mlp": 1.09189391, + "epoch": 0.48172373989996153, + "flos": 503701120512.0, + "grad_norm": 0.06168262746563105, + "language_loss": 0.84156048, + "learning_rate": 0.0005537925048437446, + "loss": 0.8528108, + "num_input_tokens_seen": 208284304, + "router_z_loss_mlp": 0.33154297, + "step": 2504, + "time_per_iteration": 2.589538097381592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052506, + "balance_loss_mlp": 1.04296899, + "epoch": 0.48191612158522507, + "flos": 1532362074624.0, + "grad_norm": 0.02361726537833674, + "language_loss": 0.75751472, + "learning_rate": 0.00055348276033793, + "loss": 0.7680397, + "num_input_tokens_seen": 208510224, + "router_z_loss_mlp": 0.09521484, + "step": 2505, + "time_per_iteration": 4.908772230148315 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111219, + "balance_loss_mlp": 1.07819104, + "epoch": 0.48210850327048865, + "flos": 702424161792.0, + "grad_norm": 0.056356974386017084, + "language_loss": 0.88423991, + "learning_rate": 0.0005531729950682664, + "loss": 0.89536178, + "num_input_tokens_seen": 208596816, + "router_z_loss_mlp": 0.34008789, + "step": 2506, + "time_per_iteration": 3.003096580505371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108061, + "balance_loss_mlp": 1.0739913, + "epoch": 0.4823008849557522, + "flos": 439778502144.0, + "grad_norm": 0.08388532833554185, + "language_loss": 0.85083711, + "learning_rate": 0.000552863209155015, + "loss": 0.86191773, + "num_input_tokens_seen": 208659616, + "router_z_loss_mlp": 0.34082031, + "step": 2507, + "time_per_iteration": 2.511463165283203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106022, + "balance_loss_mlp": 1.07331145, + "epoch": 0.48249326664101577, + "flos": 471859665408.0, + "grad_norm": 0.05856414722035687, + "language_loss": 0.82348502, + "learning_rate": 0.0005525534027184461, + "loss": 0.83454525, + "num_input_tokens_seen": 208728080, + "router_z_loss_mlp": 0.32714844, + "step": 2508, + "time_per_iteration": 2.6477487087249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102119, + "balance_loss_mlp": 1.06993294, + "epoch": 0.48268564832627936, + "flos": 563225674752.0, + "grad_norm": 0.054304228935087996, + "language_loss": 0.83357495, + "learning_rate": 0.0005522435758788365, + "loss": 0.84459615, + "num_input_tokens_seen": 208803376, + "router_z_loss_mlp": 0.32177734, + "step": 2509, + "time_per_iteration": 2.715082883834839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102008, + "balance_loss_mlp": 1.06741309, + "epoch": 0.4828780300115429, + "flos": 629606670336.0, + "grad_norm": 0.07316920081788965, + "language_loss": 0.80354846, + "learning_rate": 0.0005519337287564721, + "loss": 0.81456852, + "num_input_tokens_seen": 208876656, + "router_z_loss_mlp": 0.34594727, + "step": 2510, + "time_per_iteration": 2.8216280937194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103913, + "balance_loss_mlp": 1.07225132, + "epoch": 0.4830704116968065, + "flos": 631850305536.0, + "grad_norm": 0.07052632360826482, + "language_loss": 0.83703697, + "learning_rate": 0.000551623861471646, + "loss": 0.84807611, + "num_input_tokens_seen": 208950224, + "router_z_loss_mlp": 0.31640625, + "step": 2511, + "time_per_iteration": 2.7521867752075195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028829, + "balance_loss_mlp": 1.01886296, + "epoch": 0.48326279338207, + "flos": 1569268588032.0, + "grad_norm": 0.02307493847576384, + "language_loss": 0.78818834, + "learning_rate": 0.0005513139741446594, + "loss": 0.79847658, + "num_input_tokens_seen": 209173984, + "router_z_loss_mlp": 0.09960938, + "step": 2512, + "time_per_iteration": 4.850410461425781 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110192, + "balance_loss_mlp": 1.06877947, + "epoch": 0.4834551750673336, + "flos": 509238254592.0, + "grad_norm": 0.060960940408773784, + "language_loss": 0.86943817, + "learning_rate": 0.0005510040668958211, + "loss": 0.88045734, + "num_input_tokens_seen": 209242832, + "router_z_loss_mlp": 0.33154297, + "step": 2513, + "time_per_iteration": 2.5581674575805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011471, + "balance_loss_mlp": 1.00145698, + "epoch": 0.48364755675259713, + "flos": 1528663311360.0, + "grad_norm": 0.01573295897448314, + "language_loss": 0.77760583, + "learning_rate": 0.0005506941398454483, + "loss": 0.78772056, + "num_input_tokens_seen": 209473520, + "router_z_loss_mlp": 0.10009766, + "step": 2514, + "time_per_iteration": 4.821207523345947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101449, + "balance_loss_mlp": 1.06876206, + "epoch": 0.4838399384378607, + "flos": 564989893632.0, + "grad_norm": 0.06635931409503217, + "language_loss": 0.8316704, + "learning_rate": 0.0005503841931138645, + "loss": 0.84268492, + "num_input_tokens_seen": 209544208, + "router_z_loss_mlp": 0.3269043, + "step": 2515, + "time_per_iteration": 2.6826930046081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109492, + "balance_loss_mlp": 1.06247151, + "epoch": 0.4840323201231243, + "flos": 387691121664.0, + "grad_norm": 0.07963111819885421, + "language_loss": 0.81975293, + "learning_rate": 0.0005500742268214025, + "loss": 0.83070219, + "num_input_tokens_seen": 209607408, + "router_z_loss_mlp": 0.32446289, + "step": 2516, + "time_per_iteration": 2.4913811683654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109084, + "balance_loss_mlp": 1.07763672, + "epoch": 0.48422470180838784, + "flos": 630995360256.0, + "grad_norm": 0.057140457991015275, + "language_loss": 0.85559756, + "learning_rate": 0.0005497642410884014, + "loss": 0.86668837, + "num_input_tokens_seen": 209683392, + "router_z_loss_mlp": 0.31420898, + "step": 2517, + "time_per_iteration": 2.7807135581970215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101598, + "balance_loss_mlp": 1.06855321, + "epoch": 0.4844170834936514, + "flos": 499226333184.0, + "grad_norm": 0.05176470538316484, + "language_loss": 0.85257566, + "learning_rate": 0.0005494542360352085, + "loss": 0.86359167, + "num_input_tokens_seen": 209753184, + "router_z_loss_mlp": 0.33056641, + "step": 2518, + "time_per_iteration": 2.653507947921753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115114, + "balance_loss_mlp": 1.08285642, + "epoch": 0.48460946517891496, + "flos": 551076576768.0, + "grad_norm": 0.0599313447084905, + "language_loss": 0.85512084, + "learning_rate": 0.0005491442117821783, + "loss": 0.86627203, + "num_input_tokens_seen": 209829568, + "router_z_loss_mlp": 0.32226562, + "step": 2519, + "time_per_iteration": 2.717984676361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116939, + "balance_loss_mlp": 1.08325005, + "epoch": 0.48480184686417854, + "flos": 529390204416.0, + "grad_norm": 0.0649010079315795, + "language_loss": 0.87622237, + "learning_rate": 0.0005488341684496732, + "loss": 0.88739175, + "num_input_tokens_seen": 209902176, + "router_z_loss_mlp": 0.33691406, + "step": 2520, + "time_per_iteration": 2.652135133743286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108566, + "balance_loss_mlp": 1.07606971, + "epoch": 0.4849942285494421, + "flos": 531912821760.0, + "grad_norm": 0.06559854904132026, + "language_loss": 0.92200404, + "learning_rate": 0.0005485241061580624, + "loss": 0.93308973, + "num_input_tokens_seen": 209969168, + "router_z_loss_mlp": 0.32495117, + "step": 2521, + "time_per_iteration": 2.7108826637268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102755, + "balance_loss_mlp": 1.07037747, + "epoch": 0.48518661023470566, + "flos": 722578682880.0, + "grad_norm": 0.055876909605250345, + "language_loss": 0.84836948, + "learning_rate": 0.0005482140250277228, + "loss": 0.85939705, + "num_input_tokens_seen": 210049616, + "router_z_loss_mlp": 0.32373047, + "step": 2522, + "time_per_iteration": 2.997586965560913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105629, + "balance_loss_mlp": 1.07408667, + "epoch": 0.4853789919199692, + "flos": 506105169408.0, + "grad_norm": 0.07027884549034326, + "language_loss": 0.87641776, + "learning_rate": 0.0005479039251790387, + "loss": 0.88747412, + "num_input_tokens_seen": 210118512, + "router_z_loss_mlp": 0.31518555, + "step": 2523, + "time_per_iteration": 2.611929416656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096851, + "balance_loss_mlp": 1.06478369, + "epoch": 0.4855713736052328, + "flos": 660487094784.0, + "grad_norm": 0.061509725516535926, + "language_loss": 0.8502717, + "learning_rate": 0.0005475938067324014, + "loss": 0.86124021, + "num_input_tokens_seen": 210193728, + "router_z_loss_mlp": 0.32055664, + "step": 2524, + "time_per_iteration": 2.8200628757476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105888, + "balance_loss_mlp": 1.07339168, + "epoch": 0.48576375529049637, + "flos": 436959277056.0, + "grad_norm": 0.064836171654712, + "language_loss": 0.83736813, + "learning_rate": 0.0005472836698082098, + "loss": 0.84842694, + "num_input_tokens_seen": 210258832, + "router_z_loss_mlp": 0.32495117, + "step": 2525, + "time_per_iteration": 2.4986329078674316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100608, + "balance_loss_mlp": 1.06763458, + "epoch": 0.4859561369757599, + "flos": 581707381248.0, + "grad_norm": 0.05406459595211624, + "language_loss": 0.8394289, + "learning_rate": 0.0005469735145268694, + "loss": 0.8504349, + "num_input_tokens_seen": 210335280, + "router_z_loss_mlp": 0.32983398, + "step": 2526, + "time_per_iteration": 2.7246296405792236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107934, + "balance_loss_mlp": 1.07455492, + "epoch": 0.4861485186610235, + "flos": 487964487168.0, + "grad_norm": 0.0623071528474554, + "language_loss": 0.8099308, + "learning_rate": 0.0005466633410087933, + "loss": 0.82101017, + "num_input_tokens_seen": 210407072, + "router_z_loss_mlp": 0.33398438, + "step": 2527, + "time_per_iteration": 2.660274028778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049375, + "balance_loss_mlp": 1.03955197, + "epoch": 0.486340900346287, + "flos": 1557734727168.0, + "grad_norm": 0.029762737629489368, + "language_loss": 0.77260822, + "learning_rate": 0.0005463531493744017, + "loss": 0.78310198, + "num_input_tokens_seen": 210644544, + "router_z_loss_mlp": 0.09814453, + "step": 2528, + "time_per_iteration": 4.886114835739136 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098205, + "balance_loss_mlp": 1.06663859, + "epoch": 0.4865332820315506, + "flos": 483005514240.0, + "grad_norm": 0.05348067523581763, + "language_loss": 0.88341582, + "learning_rate": 0.0005460429397441214, + "loss": 0.89439785, + "num_input_tokens_seen": 210711760, + "router_z_loss_mlp": 0.31542969, + "step": 2529, + "time_per_iteration": 2.556168794631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096764, + "balance_loss_mlp": 1.06572175, + "epoch": 0.48672566371681414, + "flos": 535809447936.0, + "grad_norm": 0.07361694113297405, + "language_loss": 0.86787206, + "learning_rate": 0.0005457327122383866, + "loss": 0.87883973, + "num_input_tokens_seen": 210783040, + "router_z_loss_mlp": 0.31030273, + "step": 2530, + "time_per_iteration": 2.6101198196411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032017, + "balance_loss_mlp": 1.02248013, + "epoch": 0.4869180454020777, + "flos": 1412665422336.0, + "grad_norm": 0.016416545513431694, + "language_loss": 0.74636483, + "learning_rate": 0.0005454224669776385, + "loss": 0.75668502, + "num_input_tokens_seen": 211002128, + "router_z_loss_mlp": 0.09521484, + "step": 2531, + "time_per_iteration": 4.807017803192139 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102878, + "balance_loss_mlp": 1.071383, + "epoch": 0.48711042708734126, + "flos": 573113885184.0, + "grad_norm": 0.061169122564006716, + "language_loss": 0.75803703, + "learning_rate": 0.0005451122040823244, + "loss": 0.7690658, + "num_input_tokens_seen": 211080080, + "router_z_loss_mlp": 0.31469727, + "step": 2532, + "time_per_iteration": 2.778230667114258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110046, + "balance_loss_mlp": 1.07611895, + "epoch": 0.48730280877260485, + "flos": 626547737088.0, + "grad_norm": 0.05283553568044795, + "language_loss": 0.77404439, + "learning_rate": 0.0005448019236728997, + "loss": 0.78514493, + "num_input_tokens_seen": 211162944, + "router_z_loss_mlp": 0.33959961, + "step": 2533, + "time_per_iteration": 2.8531336784362793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106047, + "balance_loss_mlp": 1.07521987, + "epoch": 0.48749519045786843, + "flos": 512479996416.0, + "grad_norm": 0.06480756266699016, + "language_loss": 0.84952033, + "learning_rate": 0.0005444916258698255, + "loss": 0.8605808, + "num_input_tokens_seen": 211230448, + "router_z_loss_mlp": 0.30810547, + "step": 2534, + "time_per_iteration": 2.5989930629730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108181, + "balance_loss_mlp": 1.07701969, + "epoch": 0.48768757214313196, + "flos": 525414657024.0, + "grad_norm": 0.058540646847924545, + "language_loss": 0.8623631, + "learning_rate": 0.0005441813107935704, + "loss": 0.87344491, + "num_input_tokens_seen": 211301248, + "router_z_loss_mlp": 0.31152344, + "step": 2535, + "time_per_iteration": 2.6970572471618652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115883, + "balance_loss_mlp": 1.0836966, + "epoch": 0.48787995382839555, + "flos": 505032910848.0, + "grad_norm": 0.06249509461195645, + "language_loss": 0.85908329, + "learning_rate": 0.0005438709785646091, + "loss": 0.87024212, + "num_input_tokens_seen": 211369888, + "router_z_loss_mlp": 0.32177734, + "step": 2536, + "time_per_iteration": 2.5461835861206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109072, + "balance_loss_mlp": 1.07688498, + "epoch": 0.4880723355136591, + "flos": 575172140544.0, + "grad_norm": 0.06859245202813889, + "language_loss": 0.87149572, + "learning_rate": 0.0005435606293034234, + "loss": 0.88258648, + "num_input_tokens_seen": 211441808, + "router_z_loss_mlp": 0.32177734, + "step": 2537, + "time_per_iteration": 2.6585540771484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100316, + "balance_loss_mlp": 1.07018018, + "epoch": 0.48826471719892267, + "flos": 561444203520.0, + "grad_norm": 0.07107602922960535, + "language_loss": 0.84916604, + "learning_rate": 0.0005432502631305016, + "loss": 0.86016917, + "num_input_tokens_seen": 211511216, + "router_z_loss_mlp": 0.30126953, + "step": 2538, + "time_per_iteration": 2.6976583003997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103363, + "balance_loss_mlp": 1.07055688, + "epoch": 0.4884570988841862, + "flos": 726188613120.0, + "grad_norm": 0.04961852862161645, + "language_loss": 0.83663356, + "learning_rate": 0.0005429398801663386, + "loss": 0.84766722, + "num_input_tokens_seen": 211589264, + "router_z_loss_mlp": 0.32788086, + "step": 2539, + "time_per_iteration": 2.9294815063476562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101134, + "balance_loss_mlp": 1.06916165, + "epoch": 0.4886494805694498, + "flos": 431019449856.0, + "grad_norm": 0.06193008336457455, + "language_loss": 0.83023834, + "learning_rate": 0.0005426294805314355, + "loss": 0.84124964, + "num_input_tokens_seen": 211652928, + "router_z_loss_mlp": 0.31958008, + "step": 2540, + "time_per_iteration": 2.5207223892211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099167, + "balance_loss_mlp": 1.06593108, + "epoch": 0.4888418622547134, + "flos": 673006579200.0, + "grad_norm": 0.0603925409034683, + "language_loss": 0.80357647, + "learning_rate": 0.0005423190643463003, + "loss": 0.8145681, + "num_input_tokens_seen": 211741664, + "router_z_loss_mlp": 0.33251953, + "step": 2541, + "time_per_iteration": 3.0720365047454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101012, + "balance_loss_mlp": 1.06915879, + "epoch": 0.4890342439399769, + "flos": 541897579008.0, + "grad_norm": 0.0609118032347285, + "language_loss": 0.83149743, + "learning_rate": 0.0005420086317314473, + "loss": 0.84250748, + "num_input_tokens_seen": 211809136, + "router_z_loss_mlp": 0.31835938, + "step": 2542, + "time_per_iteration": 2.7291080951690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098222, + "balance_loss_mlp": 1.06470084, + "epoch": 0.4892266256252405, + "flos": 590676406272.0, + "grad_norm": 0.056070719415307675, + "language_loss": 0.81426919, + "learning_rate": 0.0005416981828073971, + "loss": 0.8252514, + "num_input_tokens_seen": 211883136, + "router_z_loss_mlp": 0.33544922, + "step": 2543, + "time_per_iteration": 2.7784368991851807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033858, + "balance_loss_mlp": 1.02441669, + "epoch": 0.48941900731050403, + "flos": 1516296526848.0, + "grad_norm": 0.02316516352555082, + "language_loss": 0.77115011, + "learning_rate": 0.0005413877176946765, + "loss": 0.78148878, + "num_input_tokens_seen": 212117488, + "router_z_loss_mlp": 0.09423828, + "step": 2544, + "time_per_iteration": 4.838131666183472 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102685, + "balance_loss_mlp": 1.07023609, + "epoch": 0.4896113889957676, + "flos": 470564951040.0, + "grad_norm": 0.07449943721079477, + "language_loss": 0.85317016, + "learning_rate": 0.000541077236513819, + "loss": 0.86419702, + "num_input_tokens_seen": 212181952, + "router_z_loss_mlp": 0.32446289, + "step": 2545, + "time_per_iteration": 2.5264503955841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101803, + "balance_loss_mlp": 1.07071328, + "epoch": 0.48980377068103115, + "flos": 496557983232.0, + "grad_norm": 0.056060473734182076, + "language_loss": 0.82499588, + "learning_rate": 0.0005407667393853638, + "loss": 0.83601391, + "num_input_tokens_seen": 212252608, + "router_z_loss_mlp": 0.31054688, + "step": 2546, + "time_per_iteration": 2.66180157661438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099582, + "balance_loss_mlp": 1.06699038, + "epoch": 0.48999615236629473, + "flos": 692852382720.0, + "grad_norm": 0.06590134685442105, + "language_loss": 0.83337891, + "learning_rate": 0.0005404562264298569, + "loss": 0.84437472, + "num_input_tokens_seen": 212328560, + "router_z_loss_mlp": 0.32592773, + "step": 2547, + "time_per_iteration": 2.8525304794311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098759, + "balance_loss_mlp": 1.06390238, + "epoch": 0.49018853405155827, + "flos": 541694946816.0, + "grad_norm": 0.05425762766620139, + "language_loss": 0.83855128, + "learning_rate": 0.0005401456977678498, + "loss": 0.84953886, + "num_input_tokens_seen": 212399616, + "router_z_loss_mlp": 0.34838867, + "step": 2548, + "time_per_iteration": 2.6519198417663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098656, + "balance_loss_mlp": 1.06561112, + "epoch": 0.49038091573682185, + "flos": 695663894016.0, + "grad_norm": 0.06384769679028596, + "language_loss": 0.77718782, + "learning_rate": 0.0005398351535199008, + "loss": 0.78817439, + "num_input_tokens_seen": 212482352, + "router_z_loss_mlp": 0.33056641, + "step": 2549, + "time_per_iteration": 3.0877339839935303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096351, + "balance_loss_mlp": 1.06499887, + "epoch": 0.49057329742208544, + "flos": 596902929408.0, + "grad_norm": 0.053089286344054805, + "language_loss": 0.83930391, + "learning_rate": 0.0005395245938065735, + "loss": 0.85026741, + "num_input_tokens_seen": 212559504, + "router_z_loss_mlp": 0.31347656, + "step": 2550, + "time_per_iteration": 2.8241264820098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099597, + "balance_loss_mlp": 1.06669557, + "epoch": 0.490765679107349, + "flos": 513406522368.0, + "grad_norm": 0.0641036113016549, + "language_loss": 0.82636213, + "learning_rate": 0.0005392140187484379, + "loss": 0.83735812, + "num_input_tokens_seen": 212625664, + "router_z_loss_mlp": 0.32885742, + "step": 2551, + "time_per_iteration": 2.593710422515869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105531, + "balance_loss_mlp": 1.07332087, + "epoch": 0.49095806079261256, + "flos": 629606670336.0, + "grad_norm": 0.06156906510059403, + "language_loss": 0.89866853, + "learning_rate": 0.0005389034284660701, + "loss": 0.90972388, + "num_input_tokens_seen": 212702000, + "router_z_loss_mlp": 0.32202148, + "step": 2552, + "time_per_iteration": 2.8167800903320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112217, + "balance_loss_mlp": 1.07957709, + "epoch": 0.4911504424778761, + "flos": 915307941888.0, + "grad_norm": 0.06543971253041776, + "language_loss": 0.82440078, + "learning_rate": 0.000538592823080052, + "loss": 0.83552289, + "num_input_tokens_seen": 212785376, + "router_z_loss_mlp": 0.32641602, + "step": 2553, + "time_per_iteration": 3.190459966659546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110985, + "balance_loss_mlp": 1.07817876, + "epoch": 0.4913428241631397, + "flos": 438943380480.0, + "grad_norm": 0.061393832790464745, + "language_loss": 0.85407627, + "learning_rate": 0.000538282202710971, + "loss": 0.8651861, + "num_input_tokens_seen": 212848176, + "router_z_loss_mlp": 0.328125, + "step": 2554, + "time_per_iteration": 2.5911953449249268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111409, + "balance_loss_mlp": 1.07907963, + "epoch": 0.4915352058484032, + "flos": 636092725248.0, + "grad_norm": 0.06886607309109279, + "language_loss": 0.82350785, + "learning_rate": 0.000537971567479421, + "loss": 0.83462197, + "num_input_tokens_seen": 212917888, + "router_z_loss_mlp": 0.32324219, + "step": 2555, + "time_per_iteration": 2.7882654666900635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110188, + "balance_loss_mlp": 1.07783484, + "epoch": 0.4917275875336668, + "flos": 504518989824.0, + "grad_norm": 0.07781814230506547, + "language_loss": 0.87956369, + "learning_rate": 0.0005376609175060011, + "loss": 0.89066565, + "num_input_tokens_seen": 212986288, + "router_z_loss_mlp": 0.32348633, + "step": 2556, + "time_per_iteration": 2.6131739616394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121597, + "balance_loss_mlp": 1.08850408, + "epoch": 0.49191996921893033, + "flos": 654547267584.0, + "grad_norm": 0.07736545907619681, + "language_loss": 0.80871999, + "learning_rate": 0.0005373502529113162, + "loss": 0.81993598, + "num_input_tokens_seen": 213059504, + "router_z_loss_mlp": 0.33105469, + "step": 2557, + "time_per_iteration": 2.8115434646606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125925, + "balance_loss_mlp": 1.09154499, + "epoch": 0.4921123509041939, + "flos": 492359980032.0, + "grad_norm": 0.06369400741363575, + "language_loss": 0.81534445, + "learning_rate": 0.0005370395738159773, + "loss": 0.82660365, + "num_input_tokens_seen": 213129984, + "router_z_loss_mlp": 0.34375, + "step": 2558, + "time_per_iteration": 2.645482063293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134081, + "balance_loss_mlp": 1.10120285, + "epoch": 0.4923047325894575, + "flos": 546167162880.0, + "grad_norm": 0.06840745530844954, + "language_loss": 0.83582544, + "learning_rate": 0.0005367288803406003, + "loss": 0.84716624, + "num_input_tokens_seen": 213199184, + "router_z_loss_mlp": 0.32885742, + "step": 2559, + "time_per_iteration": 2.6290056705474854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113348, + "balance_loss_mlp": 1.09895754, + "epoch": 0.49249711427472104, + "flos": 596473072128.0, + "grad_norm": 0.06026988921967747, + "language_loss": 0.81393933, + "learning_rate": 0.0005364181726058073, + "loss": 0.82527417, + "num_input_tokens_seen": 213272480, + "router_z_loss_mlp": 0.34545898, + "step": 2560, + "time_per_iteration": 2.683072805404663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113164, + "balance_loss_mlp": 1.09771323, + "epoch": 0.4926894959599846, + "flos": 497825533440.0, + "grad_norm": 0.10364093826622443, + "language_loss": 0.8257041, + "learning_rate": 0.0005361074507322261, + "loss": 0.83702052, + "num_input_tokens_seen": 213338704, + "router_z_loss_mlp": 0.33935547, + "step": 2561, + "time_per_iteration": 2.5988388061523438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127176, + "balance_loss_mlp": 1.09420276, + "epoch": 0.49288187764524816, + "flos": 536130648576.0, + "grad_norm": 0.08607714934124724, + "language_loss": 0.81995922, + "learning_rate": 0.000535796714840489, + "loss": 0.831231, + "num_input_tokens_seen": 213406016, + "router_z_loss_mlp": 0.32983398, + "step": 2562, + "time_per_iteration": 2.617560625076294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124157, + "balance_loss_mlp": 1.09099317, + "epoch": 0.49307425933051174, + "flos": 641555707392.0, + "grad_norm": 0.06602924000575079, + "language_loss": 0.84137893, + "learning_rate": 0.0005354859650512348, + "loss": 0.85262048, + "num_input_tokens_seen": 213474016, + "router_z_loss_mlp": 0.33154297, + "step": 2563, + "time_per_iteration": 2.7547245025634766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118883, + "balance_loss_mlp": 1.08707833, + "epoch": 0.4932666410157753, + "flos": 516252911616.0, + "grad_norm": 0.060127327089604984, + "language_loss": 0.87529951, + "learning_rate": 0.0005351752014851074, + "loss": 0.88648832, + "num_input_tokens_seen": 213539696, + "router_z_loss_mlp": 0.31787109, + "step": 2564, + "time_per_iteration": 2.5543923377990723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115017, + "balance_loss_mlp": 1.08199644, + "epoch": 0.49345902270103886, + "flos": 601503625728.0, + "grad_norm": 0.057267908508465526, + "language_loss": 0.83867848, + "learning_rate": 0.0005348644242627553, + "loss": 0.84982872, + "num_input_tokens_seen": 213609504, + "router_z_loss_mlp": 0.33032227, + "step": 2565, + "time_per_iteration": 2.7361738681793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074248, + "balance_loss_mlp": 1.06585574, + "epoch": 0.49365140438630245, + "flos": 1493673716736.0, + "grad_norm": 0.028047824457769776, + "language_loss": 0.75286627, + "learning_rate": 0.0005345536335048336, + "loss": 0.76360869, + "num_input_tokens_seen": 213846064, + "router_z_loss_mlp": 0.08398438, + "step": 2566, + "time_per_iteration": 4.955476760864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126385, + "balance_loss_mlp": 1.09605825, + "epoch": 0.493843786071566, + "flos": 629599329792.0, + "grad_norm": 0.0818104780923525, + "language_loss": 0.81442422, + "learning_rate": 0.0005342428293320013, + "loss": 0.825688, + "num_input_tokens_seen": 213923216, + "router_z_loss_mlp": 0.30297852, + "step": 2567, + "time_per_iteration": 2.7417242527008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133289, + "balance_loss_mlp": 1.10160363, + "epoch": 0.49403616775682957, + "flos": 617564030976.0, + "grad_norm": 0.06602747501781048, + "language_loss": 0.83786738, + "learning_rate": 0.0005339320118649238, + "loss": 0.84920025, + "num_input_tokens_seen": 213994096, + "router_z_loss_mlp": 0.31665039, + "step": 2568, + "time_per_iteration": 2.6943705081939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141867, + "balance_loss_mlp": 1.11111128, + "epoch": 0.4942285494420931, + "flos": 577647770112.0, + "grad_norm": 0.08080827100230976, + "language_loss": 0.86562729, + "learning_rate": 0.000533621181224271, + "loss": 0.87704599, + "num_input_tokens_seen": 214069104, + "router_z_loss_mlp": 0.30737305, + "step": 2569, + "time_per_iteration": 2.7706520557403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140818, + "balance_loss_mlp": 1.10748696, + "epoch": 0.4944209311273567, + "flos": 630211995648.0, + "grad_norm": 0.0686138609954652, + "language_loss": 0.81810164, + "learning_rate": 0.0005333103375307182, + "loss": 0.82950985, + "num_input_tokens_seen": 214150368, + "router_z_loss_mlp": 0.33349609, + "step": 2570, + "time_per_iteration": 2.86440372467041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114456, + "balance_loss_mlp": 1.11196864, + "epoch": 0.4946133128126202, + "flos": 587612703744.0, + "grad_norm": 0.06689740684779927, + "language_loss": 0.86211395, + "learning_rate": 0.0005329994809049451, + "loss": 0.87355959, + "num_input_tokens_seen": 214220112, + "router_z_loss_mlp": 0.32592773, + "step": 2571, + "time_per_iteration": 2.693652629852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137339, + "balance_loss_mlp": 1.10243487, + "epoch": 0.4948056944978838, + "flos": 583718648832.0, + "grad_norm": 0.10119095251173513, + "language_loss": 0.87867194, + "learning_rate": 0.0005326886114676375, + "loss": 0.89004534, + "num_input_tokens_seen": 214294480, + "router_z_loss_mlp": 0.34936523, + "step": 2572, + "time_per_iteration": 2.7414114475250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122524, + "balance_loss_mlp": 1.09017086, + "epoch": 0.49499807618314734, + "flos": 481822027776.0, + "grad_norm": 0.06560191593845013, + "language_loss": 0.8820219, + "learning_rate": 0.0005323777293394854, + "loss": 0.89324713, + "num_input_tokens_seen": 214359568, + "router_z_loss_mlp": 0.32348633, + "step": 2573, + "time_per_iteration": 2.5354294776916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120926, + "balance_loss_mlp": 1.08838177, + "epoch": 0.4951904578684109, + "flos": 518978161152.0, + "grad_norm": 0.057507807941180766, + "language_loss": 0.8235743, + "learning_rate": 0.000532066834641184, + "loss": 0.83478361, + "num_input_tokens_seen": 214432032, + "router_z_loss_mlp": 0.32543945, + "step": 2574, + "time_per_iteration": 2.6555819511413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110618, + "balance_loss_mlp": 1.07401729, + "epoch": 0.4953828395536745, + "flos": 535505499648.0, + "grad_norm": 0.06325814646706406, + "language_loss": 0.85261214, + "learning_rate": 0.0005317559274934334, + "loss": 0.86367393, + "num_input_tokens_seen": 214504096, + "router_z_loss_mlp": 0.3215332, + "step": 2575, + "time_per_iteration": 2.7056500911712646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109208, + "balance_loss_mlp": 1.07559085, + "epoch": 0.49557522123893805, + "flos": 528564994560.0, + "grad_norm": 0.06593319291759459, + "language_loss": 0.81090045, + "learning_rate": 0.0005314450080169382, + "loss": 0.82199252, + "num_input_tokens_seen": 214575920, + "router_z_loss_mlp": 0.33642578, + "step": 2576, + "time_per_iteration": 2.6029012203216553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096802, + "balance_loss_mlp": 1.06544995, + "epoch": 0.49576760292420163, + "flos": 428007504384.0, + "grad_norm": 0.07692863745295915, + "language_loss": 0.80917549, + "learning_rate": 0.0005311340763324083, + "loss": 0.82014352, + "num_input_tokens_seen": 214641664, + "router_z_loss_mlp": 0.31323242, + "step": 2577, + "time_per_iteration": 2.5615715980529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092477, + "balance_loss_mlp": 1.06081462, + "epoch": 0.49595998460946517, + "flos": 565236942336.0, + "grad_norm": 0.06627487899009786, + "language_loss": 0.82433712, + "learning_rate": 0.0005308231325605578, + "loss": 0.83526182, + "num_input_tokens_seen": 214711744, + "router_z_loss_mlp": 0.31665039, + "step": 2578, + "time_per_iteration": 2.7060065269470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096329, + "balance_loss_mlp": 1.06473827, + "epoch": 0.49615236629472875, + "flos": 702490973184.0, + "grad_norm": 0.053568999050238396, + "language_loss": 0.77453893, + "learning_rate": 0.0005305121768221061, + "loss": 0.7855022, + "num_input_tokens_seen": 214802256, + "router_z_loss_mlp": 0.31542969, + "step": 2579, + "time_per_iteration": 3.0817010402679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008957, + "balance_loss_mlp": 1.00046897, + "epoch": 0.4963447479799923, + "flos": 1441665630720.0, + "grad_norm": 0.016247003132607515, + "language_loss": 0.75038326, + "learning_rate": 0.000530201209237777, + "loss": 0.76047277, + "num_input_tokens_seen": 215023648, + "router_z_loss_mlp": 0.08496094, + "step": 2580, + "time_per_iteration": 4.813999176025391 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082898, + "balance_loss_mlp": 1.05099821, + "epoch": 0.49653712966525587, + "flos": 537627995136.0, + "grad_norm": 0.06693938938040958, + "language_loss": 0.92087269, + "learning_rate": 0.0005298902299282984, + "loss": 0.93170166, + "num_input_tokens_seen": 215094080, + "router_z_loss_mlp": 0.3190918, + "step": 2581, + "time_per_iteration": 2.622823715209961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096273, + "balance_loss_mlp": 1.0638243, + "epoch": 0.4967295113505194, + "flos": 607280467968.0, + "grad_norm": 0.06032323910602905, + "language_loss": 0.84543586, + "learning_rate": 0.0005295792390144033, + "loss": 0.85639858, + "num_input_tokens_seen": 215165456, + "router_z_loss_mlp": 0.32446289, + "step": 2582, + "time_per_iteration": 2.68511962890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110589, + "balance_loss_mlp": 1.07236862, + "epoch": 0.496921893035783, + "flos": 474577574400.0, + "grad_norm": 0.06277392630469315, + "language_loss": 0.84023589, + "learning_rate": 0.0005292682366168294, + "loss": 0.85129476, + "num_input_tokens_seen": 215229344, + "router_z_loss_mlp": 0.33544922, + "step": 2583, + "time_per_iteration": 2.5309059619903564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095632, + "balance_loss_mlp": 1.06256378, + "epoch": 0.4971142747210466, + "flos": 597463838208.0, + "grad_norm": 0.06727867389441711, + "language_loss": 0.79973817, + "learning_rate": 0.0005289572228563181, + "loss": 0.81069446, + "num_input_tokens_seen": 215305616, + "router_z_loss_mlp": 0.33081055, + "step": 2584, + "time_per_iteration": 4.178269386291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095977, + "balance_loss_mlp": 1.06362402, + "epoch": 0.4973066564063101, + "flos": 599603586048.0, + "grad_norm": 0.05530053735156927, + "language_loss": 0.83410299, + "learning_rate": 0.000528646197853616, + "loss": 0.84506273, + "num_input_tokens_seen": 215378128, + "router_z_loss_mlp": 0.32373047, + "step": 2585, + "time_per_iteration": 2.706878900527954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101894, + "balance_loss_mlp": 1.07032776, + "epoch": 0.4974990380915737, + "flos": 649474495488.0, + "grad_norm": 0.05706454291548468, + "language_loss": 0.86111611, + "learning_rate": 0.0005283351617294735, + "loss": 0.87213504, + "num_input_tokens_seen": 215453536, + "router_z_loss_mlp": 0.31567383, + "step": 2586, + "time_per_iteration": 2.9042582511901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017241, + "balance_loss_mlp": 1.00732255, + "epoch": 0.49769141977683723, + "flos": 1529278548480.0, + "grad_norm": 0.020630801148902787, + "language_loss": 0.7663666, + "learning_rate": 0.0005280241146046456, + "loss": 0.77653909, + "num_input_tokens_seen": 215689440, + "router_z_loss_mlp": 0.09912109, + "step": 2587, + "time_per_iteration": 4.9974682331085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099928, + "balance_loss_mlp": 1.06676388, + "epoch": 0.4978838014621008, + "flos": 536370356736.0, + "grad_norm": 0.07253805127360792, + "language_loss": 0.86542678, + "learning_rate": 0.0005277130565998916, + "loss": 0.87642598, + "num_input_tokens_seen": 215759600, + "router_z_loss_mlp": 0.33178711, + "step": 2588, + "time_per_iteration": 2.7639453411102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092536, + "balance_loss_mlp": 1.06144667, + "epoch": 0.49807618314736435, + "flos": 539616867840.0, + "grad_norm": 0.05247127963424023, + "language_loss": 0.82351577, + "learning_rate": 0.0005274019878359748, + "loss": 0.83444113, + "num_input_tokens_seen": 215833920, + "router_z_loss_mlp": 0.31054688, + "step": 2589, + "time_per_iteration": 2.706843137741089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109391, + "balance_loss_mlp": 1.05943429, + "epoch": 0.49826856483262794, + "flos": 542475740160.0, + "grad_norm": 0.06499700543891603, + "language_loss": 0.87299156, + "learning_rate": 0.0005270909084336628, + "loss": 0.88393074, + "num_input_tokens_seen": 215903616, + "router_z_loss_mlp": 0.34472656, + "step": 2590, + "time_per_iteration": 2.627092123031616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095191, + "balance_loss_mlp": 1.06174052, + "epoch": 0.4984609465178915, + "flos": 522321219072.0, + "grad_norm": 0.06358626343280155, + "language_loss": 0.89192379, + "learning_rate": 0.0005267798185137276, + "loss": 0.90287566, + "num_input_tokens_seen": 215974832, + "router_z_loss_mlp": 0.33447266, + "step": 2591, + "time_per_iteration": 2.6053519248962402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098275, + "balance_loss_mlp": 1.06434834, + "epoch": 0.49865332820315506, + "flos": 574544420352.0, + "grad_norm": 0.06851868017892651, + "language_loss": 0.89230084, + "learning_rate": 0.0005264687181969444, + "loss": 0.9032836, + "num_input_tokens_seen": 216045024, + "router_z_loss_mlp": 0.33959961, + "step": 2592, + "time_per_iteration": 2.7227771282196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097456, + "balance_loss_mlp": 1.06255198, + "epoch": 0.49884570988841864, + "flos": 1013607115776.0, + "grad_norm": 0.06920907227035335, + "language_loss": 0.75419706, + "learning_rate": 0.0005261576076040937, + "loss": 0.76517165, + "num_input_tokens_seen": 216129024, + "router_z_loss_mlp": 0.34936523, + "step": 2593, + "time_per_iteration": 3.2559545040130615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096802, + "balance_loss_mlp": 1.06430554, + "epoch": 0.4990380915736822, + "flos": 559581239808.0, + "grad_norm": 0.06727068797895107, + "language_loss": 0.84462249, + "learning_rate": 0.0005258464868559591, + "loss": 0.85559052, + "num_input_tokens_seen": 216197648, + "router_z_loss_mlp": 0.32519531, + "step": 2594, + "time_per_iteration": 2.6402478218078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096341, + "balance_loss_mlp": 1.06432104, + "epoch": 0.49923047325894576, + "flos": 498954691584.0, + "grad_norm": 0.05920105575352037, + "language_loss": 0.88943779, + "learning_rate": 0.0005255353560733284, + "loss": 0.90040118, + "num_input_tokens_seen": 216263904, + "router_z_loss_mlp": 0.32006836, + "step": 2595, + "time_per_iteration": 2.5696520805358887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038056, + "balance_loss_mlp": 1.02894819, + "epoch": 0.4994228549442093, + "flos": 1496636476416.0, + "grad_norm": 0.021649763717819466, + "language_loss": 0.75578642, + "learning_rate": 0.0005252242153769931, + "loss": 0.76616704, + "num_input_tokens_seen": 216493152, + "router_z_loss_mlp": 0.09130859, + "step": 2596, + "time_per_iteration": 4.785402059555054 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096305, + "balance_loss_mlp": 1.06354642, + "epoch": 0.4996152366294729, + "flos": 557374680576.0, + "grad_norm": 0.055871474183400556, + "language_loss": 0.83429074, + "learning_rate": 0.0005249130648877492, + "loss": 0.84525383, + "num_input_tokens_seen": 216567216, + "router_z_loss_mlp": 0.32763672, + "step": 2597, + "time_per_iteration": 2.768077850341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096166, + "balance_loss_mlp": 1.0628823, + "epoch": 0.4998076183147364, + "flos": 415594105344.0, + "grad_norm": 0.06479225622172138, + "language_loss": 0.85305572, + "learning_rate": 0.0005246019047263953, + "loss": 0.86401737, + "num_input_tokens_seen": 216630624, + "router_z_loss_mlp": 0.33300781, + "step": 2598, + "time_per_iteration": 2.4575202465057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109757, + "balance_loss_mlp": 1.06471562, + "epoch": 0.5, + "flos": 467350373376.0, + "grad_norm": 0.06552285864087816, + "language_loss": 0.82716942, + "learning_rate": 0.0005242907350137353, + "loss": 0.83814514, + "num_input_tokens_seen": 216696576, + "router_z_loss_mlp": 0.32836914, + "step": 2599, + "time_per_iteration": 2.545402765274048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109995, + "balance_loss_mlp": 1.06773996, + "epoch": 0.5001923816852636, + "flos": 482718818304.0, + "grad_norm": 0.060184934170799446, + "language_loss": 0.79316103, + "learning_rate": 0.0005239795558705754, + "loss": 0.80416048, + "num_input_tokens_seen": 216767584, + "router_z_loss_mlp": 0.32202148, + "step": 2600, + "time_per_iteration": 2.6259560585021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094505, + "balance_loss_mlp": 1.06279588, + "epoch": 0.5003847633705272, + "flos": 533798180352.0, + "grad_norm": 0.07180292739942261, + "language_loss": 0.89506614, + "learning_rate": 0.0005236683674177264, + "loss": 0.90601116, + "num_input_tokens_seen": 216834320, + "router_z_loss_mlp": 0.31713867, + "step": 2601, + "time_per_iteration": 2.6216633319854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098766, + "balance_loss_mlp": 1.06531632, + "epoch": 0.5005771450557907, + "flos": 737789285376.0, + "grad_norm": 0.05820446715743302, + "language_loss": 0.82377899, + "learning_rate": 0.0005233571697760021, + "loss": 0.83476663, + "num_input_tokens_seen": 216907312, + "router_z_loss_mlp": 0.3347168, + "step": 2602, + "time_per_iteration": 2.8286540508270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107785, + "balance_loss_mlp": 1.07447851, + "epoch": 0.5007695267410542, + "flos": 778977865728.0, + "grad_norm": 0.06262770013006937, + "language_loss": 0.83391154, + "learning_rate": 0.0005230459630662203, + "loss": 0.84498942, + "num_input_tokens_seen": 216979872, + "router_z_loss_mlp": 0.33325195, + "step": 2603, + "time_per_iteration": 2.9667811393737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107928, + "balance_loss_mlp": 1.07562184, + "epoch": 0.5009619084263178, + "flos": 623476694016.0, + "grad_norm": 0.06520686758548196, + "language_loss": 0.81425881, + "learning_rate": 0.0005227347474092022, + "loss": 0.82533813, + "num_input_tokens_seen": 217054000, + "router_z_loss_mlp": 0.32250977, + "step": 2604, + "time_per_iteration": 2.7840375900268555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109186, + "balance_loss_mlp": 1.07616544, + "epoch": 0.5011542901115814, + "flos": 531087611904.0, + "grad_norm": 0.04693444517106987, + "language_loss": 0.83613992, + "learning_rate": 0.0005224235229257724, + "loss": 0.84723175, + "num_input_tokens_seen": 217126784, + "router_z_loss_mlp": 0.33032227, + "step": 2605, + "time_per_iteration": 2.6730735301971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101481, + "balance_loss_mlp": 1.06970012, + "epoch": 0.5013466717968449, + "flos": 527534581248.0, + "grad_norm": 0.05305580167320912, + "language_loss": 0.87095463, + "learning_rate": 0.0005221122897367589, + "loss": 0.88196945, + "num_input_tokens_seen": 217203056, + "router_z_loss_mlp": 0.31762695, + "step": 2606, + "time_per_iteration": 2.804161310195923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106275, + "balance_loss_mlp": 1.07384968, + "epoch": 0.5015390534821085, + "flos": 566017735680.0, + "grad_norm": 0.07402045106641765, + "language_loss": 0.81512845, + "learning_rate": 0.0005218010479629932, + "loss": 0.82619125, + "num_input_tokens_seen": 217273280, + "router_z_loss_mlp": 0.32421875, + "step": 2607, + "time_per_iteration": 2.6673223972320557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111463, + "balance_loss_mlp": 1.0777508, + "epoch": 0.5017314351673721, + "flos": 566697212928.0, + "grad_norm": 0.06695708261577327, + "language_loss": 0.82331049, + "learning_rate": 0.0005214897977253102, + "loss": 0.83442515, + "num_input_tokens_seen": 217345568, + "router_z_loss_mlp": 0.33740234, + "step": 2608, + "time_per_iteration": 2.641615390777588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109683, + "balance_loss_mlp": 1.06538224, + "epoch": 0.5019238168526357, + "flos": 522291483648.0, + "grad_norm": 0.057424183285493445, + "language_loss": 0.84299719, + "learning_rate": 0.0005211785391445473, + "loss": 0.85396552, + "num_input_tokens_seen": 217422848, + "router_z_loss_mlp": 0.31445312, + "step": 2609, + "time_per_iteration": 2.736565589904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098049, + "balance_loss_mlp": 1.06381226, + "epoch": 0.5021161985378992, + "flos": 641434567680.0, + "grad_norm": 0.15505754048194495, + "language_loss": 0.79028511, + "learning_rate": 0.0005208672723415467, + "loss": 0.8012656, + "num_input_tokens_seen": 217502896, + "router_z_loss_mlp": 0.3425293, + "step": 2610, + "time_per_iteration": 2.7740700244903564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098403, + "balance_loss_mlp": 1.06371355, + "epoch": 0.5023085802231627, + "flos": 591284302848.0, + "grad_norm": 0.06293902841757802, + "language_loss": 0.79232705, + "learning_rate": 0.0005205559974371525, + "loss": 0.80331105, + "num_input_tokens_seen": 217575072, + "router_z_loss_mlp": 0.34716797, + "step": 2611, + "time_per_iteration": 2.7674527168273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096957, + "balance_loss_mlp": 1.06455564, + "epoch": 0.5025009619084263, + "flos": 472373586432.0, + "grad_norm": 0.06270244311506845, + "language_loss": 0.82445353, + "learning_rate": 0.0005202447145522123, + "loss": 0.83542311, + "num_input_tokens_seen": 217644976, + "router_z_loss_mlp": 0.32397461, + "step": 2612, + "time_per_iteration": 2.6602847576141357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100141, + "balance_loss_mlp": 1.06700087, + "epoch": 0.5026933435936899, + "flos": 455139606528.0, + "grad_norm": 0.1708463718003921, + "language_loss": 0.79453385, + "learning_rate": 0.0005199334238075769, + "loss": 0.80553526, + "num_input_tokens_seen": 217712816, + "router_z_loss_mlp": 0.33154297, + "step": 2613, + "time_per_iteration": 2.5568900108337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101548, + "balance_loss_mlp": 1.06802678, + "epoch": 0.5028857252789535, + "flos": 491747314176.0, + "grad_norm": 0.0528689770317124, + "language_loss": 0.92217171, + "learning_rate": 0.0005196221253241, + "loss": 0.93318725, + "num_input_tokens_seen": 217780256, + "router_z_loss_mlp": 0.3347168, + "step": 2614, + "time_per_iteration": 2.6126556396484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099044, + "balance_loss_mlp": 1.06490254, + "epoch": 0.503078106964217, + "flos": 625569454080.0, + "grad_norm": 0.060608661488991786, + "language_loss": 0.83149332, + "learning_rate": 0.0005193108192226383, + "loss": 0.84248376, + "num_input_tokens_seen": 217848496, + "router_z_loss_mlp": 0.34155273, + "step": 2615, + "time_per_iteration": 2.74265456199646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099568, + "balance_loss_mlp": 1.06599879, + "epoch": 0.5032704886494805, + "flos": 579046371840.0, + "grad_norm": 0.05036532075051116, + "language_loss": 0.87427437, + "learning_rate": 0.000518999505624052, + "loss": 0.88527, + "num_input_tokens_seen": 217919216, + "router_z_loss_mlp": 0.33569336, + "step": 2616, + "time_per_iteration": 2.6870973110198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098357, + "balance_loss_mlp": 1.06483543, + "epoch": 0.5034628703347441, + "flos": 471753206784.0, + "grad_norm": 0.047696485592571475, + "language_loss": 0.83320528, + "learning_rate": 0.000518688184649203, + "loss": 0.84418881, + "num_input_tokens_seen": 217996096, + "router_z_loss_mlp": 0.33544922, + "step": 2617, + "time_per_iteration": 2.8016743659973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097701, + "balance_loss_mlp": 1.06434643, + "epoch": 0.5036552520200077, + "flos": 489837362688.0, + "grad_norm": 0.046578345586746416, + "language_loss": 0.83902323, + "learning_rate": 0.0005183768564189577, + "loss": 0.85000026, + "num_input_tokens_seen": 218063072, + "router_z_loss_mlp": 0.33374023, + "step": 2618, + "time_per_iteration": 2.5473384857177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103149, + "balance_loss_mlp": 1.07158208, + "epoch": 0.5038476337052713, + "flos": 494235426816.0, + "grad_norm": 0.06435350107251939, + "language_loss": 0.81610096, + "learning_rate": 0.0005180655210541838, + "loss": 0.82713246, + "num_input_tokens_seen": 218131056, + "router_z_loss_mlp": 0.31542969, + "step": 2619, + "time_per_iteration": 2.6063601970672607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109319, + "balance_loss_mlp": 1.07362747, + "epoch": 0.5040400153905348, + "flos": 600604263936.0, + "grad_norm": 0.07554849641883571, + "language_loss": 0.83428431, + "learning_rate": 0.0005177541786757527, + "loss": 0.8453775, + "num_input_tokens_seen": 218203536, + "router_z_loss_mlp": 0.35693359, + "step": 2620, + "time_per_iteration": 2.7651278972625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109868, + "balance_loss_mlp": 1.07589293, + "epoch": 0.5042323970757984, + "flos": 811525962240.0, + "grad_norm": 0.07801269652965341, + "language_loss": 0.8344717, + "learning_rate": 0.000517442829404538, + "loss": 0.84557039, + "num_input_tokens_seen": 218283008, + "router_z_loss_mlp": 0.33959961, + "step": 2621, + "time_per_iteration": 2.991288661956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110723, + "balance_loss_mlp": 1.07401848, + "epoch": 0.504424778761062, + "flos": 627308706816.0, + "grad_norm": 0.07509105999805234, + "language_loss": 0.87522292, + "learning_rate": 0.0005171314733614166, + "loss": 0.8862952, + "num_input_tokens_seen": 218362096, + "router_z_loss_mlp": 0.33227539, + "step": 2622, + "time_per_iteration": 2.8980941772460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107621, + "balance_loss_mlp": 1.07357442, + "epoch": 0.5046171604463255, + "flos": 515911887360.0, + "grad_norm": 0.05402993794527385, + "language_loss": 0.78464615, + "learning_rate": 0.0005168201106672671, + "loss": 0.79572237, + "num_input_tokens_seen": 218439440, + "router_z_loss_mlp": 0.34057617, + "step": 2623, + "time_per_iteration": 2.7572929859161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106537, + "balance_loss_mlp": 1.07394505, + "epoch": 0.504809542131589, + "flos": 527831188992.0, + "grad_norm": 0.0666138467605724, + "language_loss": 0.85413206, + "learning_rate": 0.0005165087414429717, + "loss": 0.86519742, + "num_input_tokens_seen": 218505936, + "router_z_loss_mlp": 0.32592773, + "step": 2624, + "time_per_iteration": 2.6197690963745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104325, + "balance_loss_mlp": 1.07178128, + "epoch": 0.5050019238168526, + "flos": 554118257664.0, + "grad_norm": 0.0890371890087988, + "language_loss": 0.83553296, + "learning_rate": 0.0005161973658094144, + "loss": 0.84657621, + "num_input_tokens_seen": 218573824, + "router_z_loss_mlp": 0.32543945, + "step": 2625, + "time_per_iteration": 2.688664436340332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114503, + "balance_loss_mlp": 1.08188796, + "epoch": 0.5051943055021162, + "flos": 574774216704.0, + "grad_norm": 0.10293664596100507, + "language_loss": 0.82534152, + "learning_rate": 0.000515885983887482, + "loss": 0.83648658, + "num_input_tokens_seen": 218648016, + "router_z_loss_mlp": 0.32592773, + "step": 2626, + "time_per_iteration": 2.7382290363311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117287, + "balance_loss_mlp": 1.08467126, + "epoch": 0.5053866871873798, + "flos": 496686463488.0, + "grad_norm": 0.06112991005583596, + "language_loss": 0.84654796, + "learning_rate": 0.0005155745957980636, + "loss": 0.85772085, + "num_input_tokens_seen": 218714128, + "router_z_loss_mlp": 0.32617188, + "step": 2627, + "time_per_iteration": 2.5833873748779297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117288, + "balance_loss_mlp": 1.0852921, + "epoch": 0.5055790688726434, + "flos": 502213685760.0, + "grad_norm": 0.05493055898841465, + "language_loss": 0.88454115, + "learning_rate": 0.000515263201662051, + "loss": 0.89571404, + "num_input_tokens_seen": 218784800, + "router_z_loss_mlp": 0.31982422, + "step": 2628, + "time_per_iteration": 2.6362485885620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112533, + "balance_loss_mlp": 1.09264278, + "epoch": 0.5057714505579068, + "flos": 845227809792.0, + "grad_norm": 0.05313724215790835, + "language_loss": 0.8271699, + "learning_rate": 0.0005149518016003378, + "loss": 0.83842319, + "num_input_tokens_seen": 218868256, + "router_z_loss_mlp": 0.3269043, + "step": 2629, + "time_per_iteration": 3.1579666137695312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121904, + "balance_loss_mlp": 1.09109998, + "epoch": 0.5059638322431704, + "flos": 497825533440.0, + "grad_norm": 0.05858406869857789, + "language_loss": 0.82627785, + "learning_rate": 0.0005146403957338206, + "loss": 0.83749688, + "num_input_tokens_seen": 218932496, + "router_z_loss_mlp": 0.30786133, + "step": 2630, + "time_per_iteration": 2.5554275512695312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128543, + "balance_loss_mlp": 1.09664297, + "epoch": 0.506156213928434, + "flos": 617843013120.0, + "grad_norm": 0.05139775445636508, + "language_loss": 0.82087231, + "learning_rate": 0.0005143289841833975, + "loss": 0.83215779, + "num_input_tokens_seen": 219010672, + "router_z_loss_mlp": 0.31884766, + "step": 2631, + "time_per_iteration": 2.866208076477051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136223, + "balance_loss_mlp": 1.10332084, + "epoch": 0.5063485956136976, + "flos": 424857166848.0, + "grad_norm": 0.07049680225310351, + "language_loss": 0.82485932, + "learning_rate": 0.0005140175670699696, + "loss": 0.83622158, + "num_input_tokens_seen": 219077104, + "router_z_loss_mlp": 0.32885742, + "step": 2632, + "time_per_iteration": 2.589662551879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136954, + "balance_loss_mlp": 1.10464883, + "epoch": 0.5065409772989612, + "flos": 569926471680.0, + "grad_norm": 0.04937719013853961, + "language_loss": 0.83023763, + "learning_rate": 0.0005137061445144395, + "loss": 0.84160721, + "num_input_tokens_seen": 219164880, + "router_z_loss_mlp": 0.32299805, + "step": 2633, + "time_per_iteration": 2.907914161682129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145225, + "balance_loss_mlp": 1.11308646, + "epoch": 0.5067333589842247, + "flos": 628801284096.0, + "grad_norm": 0.06298038708728138, + "language_loss": 0.87351924, + "learning_rate": 0.000513394716637712, + "loss": 0.8849715, + "num_input_tokens_seen": 219237376, + "router_z_loss_mlp": 0.32128906, + "step": 2634, + "time_per_iteration": 2.7392778396606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064259, + "balance_loss_mlp": 1.05677319, + "epoch": 0.5069257406694883, + "flos": 1447867187712.0, + "grad_norm": 0.03015814476855984, + "language_loss": 0.79191709, + "learning_rate": 0.0005130832835606946, + "loss": 0.80255967, + "num_input_tokens_seen": 219467632, + "router_z_loss_mlp": 0.07470703, + "step": 2635, + "time_per_iteration": 4.8476762771606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138418, + "balance_loss_mlp": 1.10549188, + "epoch": 0.5071181223547518, + "flos": 638835227136.0, + "grad_norm": 0.0660835824728649, + "language_loss": 0.80952996, + "learning_rate": 0.0005127718454042958, + "loss": 0.82091409, + "num_input_tokens_seen": 219545392, + "router_z_loss_mlp": 0.3293457, + "step": 2636, + "time_per_iteration": 2.801945447921753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122948, + "balance_loss_mlp": 1.09083319, + "epoch": 0.5073105040400154, + "flos": 713565241344.0, + "grad_norm": 0.06804864770708682, + "language_loss": 0.8454951, + "learning_rate": 0.0005124604022894269, + "loss": 0.85672456, + "num_input_tokens_seen": 219623104, + "router_z_loss_mlp": 0.32104492, + "step": 2637, + "time_per_iteration": 2.9412965774536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039176, + "balance_loss_mlp": 1.0316422, + "epoch": 0.5075028857252789, + "flos": 1436447126016.0, + "grad_norm": 0.020904454547095577, + "language_loss": 0.77188224, + "learning_rate": 0.000512148954337001, + "loss": 0.78227401, + "num_input_tokens_seen": 219853328, + "router_z_loss_mlp": 0.07519531, + "step": 2638, + "time_per_iteration": 4.857941389083862 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127594, + "balance_loss_mlp": 1.09507418, + "epoch": 0.5076952674105425, + "flos": 571147034112.0, + "grad_norm": 0.058859738864391845, + "language_loss": 0.83504963, + "learning_rate": 0.0005118375016679325, + "loss": 0.84632552, + "num_input_tokens_seen": 219925024, + "router_z_loss_mlp": 0.32495117, + "step": 2639, + "time_per_iteration": 2.7467126846313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115219, + "balance_loss_mlp": 1.08169687, + "epoch": 0.5078876490958061, + "flos": 516712504320.0, + "grad_norm": 0.06748446003243579, + "language_loss": 0.80393875, + "learning_rate": 0.0005115260444031382, + "loss": 0.81509095, + "num_input_tokens_seen": 219992752, + "router_z_loss_mlp": 0.33544922, + "step": 2640, + "time_per_iteration": 2.5831897258758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021123, + "balance_loss_mlp": 1.01354098, + "epoch": 0.5080800307810697, + "flos": 1584224428032.0, + "grad_norm": 0.011909310640322752, + "language_loss": 0.78731823, + "learning_rate": 0.000511214582663537, + "loss": 0.79752946, + "num_input_tokens_seen": 220224160, + "router_z_loss_mlp": 0.07568359, + "step": 2641, + "time_per_iteration": 4.96182656288147 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118257, + "balance_loss_mlp": 1.08506942, + "epoch": 0.5082724124663333, + "flos": 485209502208.0, + "grad_norm": 0.06566448453374539, + "language_loss": 0.87279713, + "learning_rate": 0.0005109031165700483, + "loss": 0.88397968, + "num_input_tokens_seen": 220289504, + "router_z_loss_mlp": 0.33178711, + "step": 2642, + "time_per_iteration": 2.5608396530151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114089, + "balance_loss_mlp": 1.08228409, + "epoch": 0.5084647941515967, + "flos": 682230366720.0, + "grad_norm": 0.07470030174236865, + "language_loss": 0.83423924, + "learning_rate": 0.0005105916462435945, + "loss": 0.84538019, + "num_input_tokens_seen": 220361376, + "router_z_loss_mlp": 0.31787109, + "step": 2643, + "time_per_iteration": 2.840092420578003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114248, + "balance_loss_mlp": 1.08272934, + "epoch": 0.5086571758368603, + "flos": 548736768000.0, + "grad_norm": 0.0540496938056118, + "language_loss": 0.8565858, + "learning_rate": 0.0005102801718050989, + "loss": 0.86772823, + "num_input_tokens_seen": 220434720, + "router_z_loss_mlp": 0.31494141, + "step": 2644, + "time_per_iteration": 2.687993288040161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111024, + "balance_loss_mlp": 1.08024383, + "epoch": 0.5088495575221239, + "flos": 564016379904.0, + "grad_norm": 0.0657522571772089, + "language_loss": 0.89181781, + "learning_rate": 0.0005099686933754867, + "loss": 0.90292799, + "num_input_tokens_seen": 220506208, + "router_z_loss_mlp": 0.30737305, + "step": 2645, + "time_per_iteration": 2.676555633544922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110963, + "balance_loss_mlp": 1.07589364, + "epoch": 0.5090419392073875, + "flos": 551407689216.0, + "grad_norm": 0.06525501329559952, + "language_loss": 0.84646904, + "learning_rate": 0.0005096572110756845, + "loss": 0.85756534, + "num_input_tokens_seen": 220577456, + "router_z_loss_mlp": 0.33740234, + "step": 2646, + "time_per_iteration": 2.722046136856079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098497, + "balance_loss_mlp": 1.06502318, + "epoch": 0.509234320892651, + "flos": 567779383296.0, + "grad_norm": 0.055343813231999515, + "language_loss": 0.85733652, + "learning_rate": 0.0005093457250266205, + "loss": 0.86832154, + "num_input_tokens_seen": 220649648, + "router_z_loss_mlp": 0.33496094, + "step": 2647, + "time_per_iteration": 2.726637363433838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105884, + "balance_loss_mlp": 1.07260132, + "epoch": 0.5094267025779146, + "flos": 582609314304.0, + "grad_norm": 0.07566246752622155, + "language_loss": 0.83174831, + "learning_rate": 0.000509034235349224, + "loss": 0.84280717, + "num_input_tokens_seen": 220721168, + "router_z_loss_mlp": 0.33276367, + "step": 2648, + "time_per_iteration": 2.7163400650024414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098588, + "balance_loss_mlp": 1.06480372, + "epoch": 0.5096190842631781, + "flos": 591990944256.0, + "grad_norm": 0.05726246002698667, + "language_loss": 0.81403017, + "learning_rate": 0.0005087227421644266, + "loss": 0.82501602, + "num_input_tokens_seen": 220796464, + "router_z_loss_mlp": 0.33813477, + "step": 2649, + "time_per_iteration": 2.753593683242798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090769, + "balance_loss_mlp": 1.05836821, + "epoch": 0.5098114659484417, + "flos": 513562166784.0, + "grad_norm": 0.062073163804743356, + "language_loss": 0.86567879, + "learning_rate": 0.0005084112455931602, + "loss": 0.87658644, + "num_input_tokens_seen": 220862976, + "router_z_loss_mlp": 0.32397461, + "step": 2650, + "time_per_iteration": 2.6115548610687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109227, + "balance_loss_mlp": 1.05986929, + "epoch": 0.5100038476337053, + "flos": 484631341056.0, + "grad_norm": 0.07224314043681272, + "language_loss": 0.85185993, + "learning_rate": 0.0005080997457563586, + "loss": 0.8627826, + "num_input_tokens_seen": 220926432, + "router_z_loss_mlp": 0.32397461, + "step": 2651, + "time_per_iteration": 2.562626361846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091424, + "balance_loss_mlp": 1.05797434, + "epoch": 0.5101962293189688, + "flos": 461603266560.0, + "grad_norm": 0.12059659360832554, + "language_loss": 0.79420835, + "learning_rate": 0.0005077882427749569, + "loss": 0.80512255, + "num_input_tokens_seen": 220993008, + "router_z_loss_mlp": 0.3347168, + "step": 2652, + "time_per_iteration": 2.532801866531372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094958, + "balance_loss_mlp": 1.06072092, + "epoch": 0.5103886110042324, + "flos": 587034542592.0, + "grad_norm": 0.09167141678281196, + "language_loss": 0.85065627, + "learning_rate": 0.0005074767367698913, + "loss": 0.86160588, + "num_input_tokens_seen": 221059248, + "router_z_loss_mlp": 0.34277344, + "step": 2653, + "time_per_iteration": 2.718952178955078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094006, + "balance_loss_mlp": 1.06184387, + "epoch": 0.510580992689496, + "flos": 845260116480.0, + "grad_norm": 0.05265423612140712, + "language_loss": 0.83726275, + "learning_rate": 0.0005071652278620988, + "loss": 0.84820282, + "num_input_tokens_seen": 221133712, + "router_z_loss_mlp": 0.3215332, + "step": 2654, + "time_per_iteration": 3.0578973293304443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093541, + "balance_loss_mlp": 1.06082976, + "epoch": 0.5107733743747596, + "flos": 658624131072.0, + "grad_norm": 0.057781922950613636, + "language_loss": 0.8368457, + "learning_rate": 0.0005068537161725186, + "loss": 0.84778106, + "num_input_tokens_seen": 221202192, + "router_z_loss_mlp": 0.32714844, + "step": 2655, + "time_per_iteration": 2.763050079345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109333, + "balance_loss_mlp": 1.06035662, + "epoch": 0.510965756060023, + "flos": 701732574720.0, + "grad_norm": 0.06748478853261292, + "language_loss": 0.84411526, + "learning_rate": 0.0005065422018220893, + "loss": 0.85504854, + "num_input_tokens_seen": 221277104, + "router_z_loss_mlp": 0.32983398, + "step": 2656, + "time_per_iteration": 2.8346335887908936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099653, + "balance_loss_mlp": 1.06744266, + "epoch": 0.5111581377452866, + "flos": 559731741696.0, + "grad_norm": 0.05948045399752535, + "language_loss": 0.80220234, + "learning_rate": 0.0005062306849317521, + "loss": 0.8131988, + "num_input_tokens_seen": 221352320, + "router_z_loss_mlp": 0.32226562, + "step": 2657, + "time_per_iteration": 2.8443868160247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011092, + "balance_loss_mlp": 1.07832527, + "epoch": 0.5113505194305502, + "flos": 609024863232.0, + "grad_norm": 0.06625791562361402, + "language_loss": 0.83381897, + "learning_rate": 0.0005059191656224487, + "loss": 0.84491098, + "num_input_tokens_seen": 221421056, + "router_z_loss_mlp": 0.30859375, + "step": 2658, + "time_per_iteration": 2.7093002796173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110636, + "balance_loss_mlp": 1.07883072, + "epoch": 0.5115429011158138, + "flos": 534477657600.0, + "grad_norm": 0.06672155578926672, + "language_loss": 0.88962573, + "learning_rate": 0.0005056076440151212, + "loss": 0.90073204, + "num_input_tokens_seen": 221492064, + "router_z_loss_mlp": 0.31787109, + "step": 2659, + "time_per_iteration": 2.6441903114318848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072549, + "balance_loss_mlp": 1.06272602, + "epoch": 0.5117352828010774, + "flos": 1362213780480.0, + "grad_norm": 0.032966871601824974, + "language_loss": 0.76288116, + "learning_rate": 0.0005052961202307133, + "loss": 0.77360666, + "num_input_tokens_seen": 221724672, + "router_z_loss_mlp": 0.09814453, + "step": 2660, + "time_per_iteration": 4.922346353530884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124135, + "balance_loss_mlp": 1.09111381, + "epoch": 0.5119276644863409, + "flos": 633740433408.0, + "grad_norm": 0.06875691586697516, + "language_loss": 0.87086922, + "learning_rate": 0.0005049845943901691, + "loss": 0.8821106, + "num_input_tokens_seen": 221800144, + "router_z_loss_mlp": 0.33032227, + "step": 2661, + "time_per_iteration": 2.8344130516052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107104, + "balance_loss_mlp": 1.07703924, + "epoch": 0.5121200461716044, + "flos": 585598864896.0, + "grad_norm": 0.06167047048505293, + "language_loss": 0.86829108, + "learning_rate": 0.0005046730666144338, + "loss": 0.87936211, + "num_input_tokens_seen": 221877168, + "router_z_loss_mlp": 0.30078125, + "step": 2662, + "time_per_iteration": 2.7832746505737305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110467, + "balance_loss_mlp": 1.07780349, + "epoch": 0.512312427856868, + "flos": 1032508767744.0, + "grad_norm": 0.05618387348962469, + "language_loss": 0.8811537, + "learning_rate": 0.0005043615370244532, + "loss": 0.89225835, + "num_input_tokens_seen": 221964208, + "router_z_loss_mlp": 0.32666016, + "step": 2663, + "time_per_iteration": 3.3585264682769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035614, + "balance_loss_mlp": 1.02664995, + "epoch": 0.5125048095421316, + "flos": 1537983645696.0, + "grad_norm": 0.02051261915929333, + "language_loss": 0.78244388, + "learning_rate": 0.0005040500057411736, + "loss": 0.79279995, + "num_input_tokens_seen": 222179264, + "router_z_loss_mlp": 0.08984375, + "step": 2664, + "time_per_iteration": 4.639116048812866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106628, + "balance_loss_mlp": 1.07670689, + "epoch": 0.5126971912273951, + "flos": 591116175360.0, + "grad_norm": 0.057959232824292994, + "language_loss": 0.85514903, + "learning_rate": 0.0005037384728855425, + "loss": 0.86621535, + "num_input_tokens_seen": 222259504, + "router_z_loss_mlp": 0.29882812, + "step": 2665, + "time_per_iteration": 2.7972493171691895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106635, + "balance_loss_mlp": 1.07456732, + "epoch": 0.5128895729126587, + "flos": 551657309184.0, + "grad_norm": 0.08985416920229425, + "language_loss": 0.84974313, + "learning_rate": 0.0005034269385785075, + "loss": 0.86080956, + "num_input_tokens_seen": 222330512, + "router_z_loss_mlp": 0.3203125, + "step": 2666, + "time_per_iteration": 2.6164255142211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113422, + "balance_loss_mlp": 1.08135498, + "epoch": 0.5130819545979223, + "flos": 481271030784.0, + "grad_norm": 0.09072509808708462, + "language_loss": 0.85031348, + "learning_rate": 0.0005031154029410168, + "loss": 0.86144769, + "num_input_tokens_seen": 222394000, + "router_z_loss_mlp": 0.32055664, + "step": 2667, + "time_per_iteration": 2.5188395977020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112022, + "balance_loss_mlp": 1.07873833, + "epoch": 0.5132743362831859, + "flos": 475798136832.0, + "grad_norm": 0.08345403251216076, + "language_loss": 0.86623496, + "learning_rate": 0.0005028038660940197, + "loss": 0.87735522, + "num_input_tokens_seen": 222459344, + "router_z_loss_mlp": 0.33300781, + "step": 2668, + "time_per_iteration": 2.5099217891693115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104597, + "balance_loss_mlp": 1.07360303, + "epoch": 0.5134667179684494, + "flos": 503827029504.0, + "grad_norm": 0.051835294009996306, + "language_loss": 0.8459934, + "learning_rate": 0.0005024923281584648, + "loss": 0.85703939, + "num_input_tokens_seen": 222528912, + "router_z_loss_mlp": 0.30981445, + "step": 2669, + "time_per_iteration": 2.6409177780151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113407, + "balance_loss_mlp": 1.08103013, + "epoch": 0.5136590996537129, + "flos": 503918433792.0, + "grad_norm": 0.05618222104131465, + "language_loss": 0.82660598, + "learning_rate": 0.0005021807892553026, + "loss": 0.83774006, + "num_input_tokens_seen": 222604704, + "router_z_loss_mlp": 0.32397461, + "step": 2670, + "time_per_iteration": 2.7168080806732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105439, + "balance_loss_mlp": 1.07458735, + "epoch": 0.5138514813389765, + "flos": 624623104512.0, + "grad_norm": 0.052268384876698444, + "language_loss": 0.84909296, + "learning_rate": 0.0005018692495054828, + "loss": 0.86014736, + "num_input_tokens_seen": 222677888, + "router_z_loss_mlp": 0.30834961, + "step": 2671, + "time_per_iteration": 2.769845485687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100504, + "balance_loss_mlp": 1.07063007, + "epoch": 0.5140438630242401, + "flos": 583545752064.0, + "grad_norm": 0.059994941655344296, + "language_loss": 0.80935681, + "learning_rate": 0.0005015577090299561, + "loss": 0.82036185, + "num_input_tokens_seen": 222751936, + "router_z_loss_mlp": 0.29833984, + "step": 2672, + "time_per_iteration": 2.681316375732422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110216, + "balance_loss_mlp": 1.07245326, + "epoch": 0.5142362447095037, + "flos": 487927411200.0, + "grad_norm": 0.05683100055240327, + "language_loss": 0.86631596, + "learning_rate": 0.0005012461679496729, + "loss": 0.87733757, + "num_input_tokens_seen": 222819616, + "router_z_loss_mlp": 0.29711914, + "step": 2673, + "time_per_iteration": 2.5961544513702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100162, + "balance_loss_mlp": 1.06883335, + "epoch": 0.5144286263947672, + "flos": 526857675264.0, + "grad_norm": 0.05638845856922635, + "language_loss": 0.88303345, + "learning_rate": 0.0005009346263855848, + "loss": 0.8940351, + "num_input_tokens_seen": 222888448, + "router_z_loss_mlp": 0.31323242, + "step": 2674, + "time_per_iteration": 2.607531785964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100725, + "balance_loss_mlp": 1.06903887, + "epoch": 0.5146210080800308, + "flos": 486518897664.0, + "grad_norm": 0.05523698149533188, + "language_loss": 0.84251857, + "learning_rate": 0.0005006230844586422, + "loss": 0.85352582, + "num_input_tokens_seen": 222964736, + "router_z_loss_mlp": 0.31665039, + "step": 2675, + "time_per_iteration": 2.766676664352417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106245, + "balance_loss_mlp": 1.07384396, + "epoch": 0.5148133897652943, + "flos": 515892063744.0, + "grad_norm": 0.054179282011379754, + "language_loss": 0.79421759, + "learning_rate": 0.0005003115422897968, + "loss": 0.80528009, + "num_input_tokens_seen": 223040944, + "router_z_loss_mlp": 0.32397461, + "step": 2676, + "time_per_iteration": 2.7511518001556396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101702, + "balance_loss_mlp": 1.0696342, + "epoch": 0.5150057714505579, + "flos": 511212446208.0, + "grad_norm": 0.06371145669365144, + "language_loss": 0.86998433, + "learning_rate": 0.0005, + "loss": 0.88100135, + "num_input_tokens_seen": 223109632, + "router_z_loss_mlp": 0.32055664, + "step": 2677, + "time_per_iteration": 2.6361911296844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096983, + "balance_loss_mlp": 1.06508231, + "epoch": 0.5151981531358215, + "flos": 910909877760.0, + "grad_norm": 0.06720272484805691, + "language_loss": 0.79773581, + "learning_rate": 0.0004996884577102033, + "loss": 0.80870569, + "num_input_tokens_seen": 223191648, + "router_z_loss_mlp": 0.3190918, + "step": 2678, + "time_per_iteration": 3.078381299972534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101165, + "balance_loss_mlp": 1.06726193, + "epoch": 0.515390534821085, + "flos": 471864434688.0, + "grad_norm": 0.05338815308435362, + "language_loss": 0.84963048, + "learning_rate": 0.000499376915541358, + "loss": 0.86064208, + "num_input_tokens_seen": 223265920, + "router_z_loss_mlp": 0.33911133, + "step": 2679, + "time_per_iteration": 2.6979198455810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096582, + "balance_loss_mlp": 1.06506324, + "epoch": 0.5155829165063486, + "flos": 650119468032.0, + "grad_norm": 0.0530977146452018, + "language_loss": 0.8140825, + "learning_rate": 0.0004990653736144155, + "loss": 0.82504833, + "num_input_tokens_seen": 223340688, + "router_z_loss_mlp": 0.31494141, + "step": 2680, + "time_per_iteration": 2.8514578342437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098157, + "balance_loss_mlp": 1.06547022, + "epoch": 0.5157752981916122, + "flos": 414262315008.0, + "grad_norm": 0.091547983778046, + "language_loss": 0.86229038, + "learning_rate": 0.0004987538320503271, + "loss": 0.87327194, + "num_input_tokens_seen": 223404064, + "router_z_loss_mlp": 0.3269043, + "step": 2681, + "time_per_iteration": 2.478638172149658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100486, + "balance_loss_mlp": 1.06798983, + "epoch": 0.5159676798768758, + "flos": 553841473536.0, + "grad_norm": 0.07018643811750969, + "language_loss": 0.83312553, + "learning_rate": 0.0004984422909700442, + "loss": 0.8441304, + "num_input_tokens_seen": 223476784, + "router_z_loss_mlp": 0.32495117, + "step": 2682, + "time_per_iteration": 2.6546084880828857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099092, + "balance_loss_mlp": 1.06783557, + "epoch": 0.5161600615621393, + "flos": 586510709760.0, + "grad_norm": 0.15069020701750013, + "language_loss": 0.84435642, + "learning_rate": 0.0004981307504945173, + "loss": 0.85534728, + "num_input_tokens_seen": 223542832, + "router_z_loss_mlp": 0.31225586, + "step": 2683, + "time_per_iteration": 2.71260929107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110147, + "balance_loss_mlp": 1.06914032, + "epoch": 0.5163524432474028, + "flos": 588843177984.0, + "grad_norm": 0.0559262102608404, + "language_loss": 0.89665949, + "learning_rate": 0.0004978192107446976, + "loss": 0.90767419, + "num_input_tokens_seen": 223617968, + "router_z_loss_mlp": 0.32324219, + "step": 2684, + "time_per_iteration": 2.767662763595581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097385, + "balance_loss_mlp": 1.06650972, + "epoch": 0.5165448249326664, + "flos": 503893840896.0, + "grad_norm": 0.06901755479732997, + "language_loss": 0.87345654, + "learning_rate": 0.0004975076718415353, + "loss": 0.88443041, + "num_input_tokens_seen": 223689504, + "router_z_loss_mlp": 0.30834961, + "step": 2685, + "time_per_iteration": 2.6287574768066406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110088, + "balance_loss_mlp": 1.06988525, + "epoch": 0.51673720661793, + "flos": 416760339456.0, + "grad_norm": 0.05502113672837593, + "language_loss": 0.91023147, + "learning_rate": 0.0004971961339059806, + "loss": 0.92124021, + "num_input_tokens_seen": 223752288, + "router_z_loss_mlp": 0.30957031, + "step": 2686, + "time_per_iteration": 2.4755725860595703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105178, + "balance_loss_mlp": 1.07256198, + "epoch": 0.5169295883031936, + "flos": 598971096576.0, + "grad_norm": 0.06476684801011888, + "language_loss": 0.84195554, + "learning_rate": 0.0004968845970589832, + "loss": 0.85300732, + "num_input_tokens_seen": 223822304, + "router_z_loss_mlp": 0.32617188, + "step": 2687, + "time_per_iteration": 2.6715877056121826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102414, + "balance_loss_mlp": 1.06896389, + "epoch": 0.517121969988457, + "flos": 556816343040.0, + "grad_norm": 0.0648303600022088, + "language_loss": 0.84613401, + "learning_rate": 0.0004965730614214926, + "loss": 0.85715812, + "num_input_tokens_seen": 223888592, + "router_z_loss_mlp": 0.3347168, + "step": 2688, + "time_per_iteration": 2.6734025478363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099959, + "balance_loss_mlp": 1.06720066, + "epoch": 0.5173143516737206, + "flos": 469445704704.0, + "grad_norm": 0.05675548235902804, + "language_loss": 0.85410345, + "learning_rate": 0.0004962615271144576, + "loss": 0.86510307, + "num_input_tokens_seen": 223952880, + "router_z_loss_mlp": 0.32739258, + "step": 2689, + "time_per_iteration": 2.4930050373077393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101842, + "balance_loss_mlp": 1.0703702, + "epoch": 0.5175067333589842, + "flos": 720065977344.0, + "grad_norm": 0.06418610502647971, + "language_loss": 0.82956815, + "learning_rate": 0.0004959499942588264, + "loss": 0.8405866, + "num_input_tokens_seen": 224030000, + "router_z_loss_mlp": 0.31469727, + "step": 2690, + "time_per_iteration": 2.904674768447876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078772, + "balance_loss_mlp": 1.070189, + "epoch": 0.5176991150442478, + "flos": 1466188480512.0, + "grad_norm": 0.04799778536167862, + "language_loss": 0.78200024, + "learning_rate": 0.0004956384629755469, + "loss": 0.79278797, + "num_input_tokens_seen": 224252384, + "router_z_loss_mlp": 0.0859375, + "step": 2691, + "time_per_iteration": 4.761531591415405 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105601, + "balance_loss_mlp": 1.07255602, + "epoch": 0.5178914967295114, + "flos": 612632222208.0, + "grad_norm": 0.051278898576550616, + "language_loss": 0.85877872, + "learning_rate": 0.0004953269333855661, + "loss": 0.86983472, + "num_input_tokens_seen": 224324640, + "router_z_loss_mlp": 0.33032227, + "step": 2692, + "time_per_iteration": 2.729318857192993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104257, + "balance_loss_mlp": 1.07328665, + "epoch": 0.5180838784147749, + "flos": 500926311936.0, + "grad_norm": 0.05911618517599564, + "language_loss": 0.84474307, + "learning_rate": 0.0004950154056098309, + "loss": 0.85578561, + "num_input_tokens_seen": 224398368, + "router_z_loss_mlp": 0.30932617, + "step": 2693, + "time_per_iteration": 2.6833436489105225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124437, + "balance_loss_mlp": 1.09158325, + "epoch": 0.5182762601000385, + "flos": 688832418816.0, + "grad_norm": 0.059128614865360495, + "language_loss": 0.83972096, + "learning_rate": 0.0004947038797692867, + "loss": 0.85096538, + "num_input_tokens_seen": 224465456, + "router_z_loss_mlp": 0.32861328, + "step": 2694, + "time_per_iteration": 2.82362961769104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119523, + "balance_loss_mlp": 1.08635902, + "epoch": 0.518468641785302, + "flos": 665611623936.0, + "grad_norm": 0.05692767589933962, + "language_loss": 0.77609885, + "learning_rate": 0.0004943923559848789, + "loss": 0.78729415, + "num_input_tokens_seen": 224540960, + "router_z_loss_mlp": 0.33178711, + "step": 2695, + "time_per_iteration": 2.7919468879699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123482, + "balance_loss_mlp": 1.09112859, + "epoch": 0.5186610234705656, + "flos": 566714465280.0, + "grad_norm": 0.06299979408052762, + "language_loss": 0.90267843, + "learning_rate": 0.0004940808343775515, + "loss": 0.91391325, + "num_input_tokens_seen": 224613200, + "router_z_loss_mlp": 0.32348633, + "step": 2696, + "time_per_iteration": 2.6863224506378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112015, + "balance_loss_mlp": 1.08748627, + "epoch": 0.5188534051558291, + "flos": 428879702016.0, + "grad_norm": 0.06289973384355804, + "language_loss": 0.82184958, + "learning_rate": 0.0004937693150682479, + "loss": 0.83305109, + "num_input_tokens_seen": 224677456, + "router_z_loss_mlp": 0.32666016, + "step": 2697, + "time_per_iteration": 2.5169589519500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124428, + "balance_loss_mlp": 1.09109747, + "epoch": 0.5190457868410927, + "flos": 546349971456.0, + "grad_norm": 0.0748090565006246, + "language_loss": 0.76575571, + "learning_rate": 0.0004934577981779107, + "loss": 0.77699995, + "num_input_tokens_seen": 224745600, + "router_z_loss_mlp": 0.33325195, + "step": 2698, + "time_per_iteration": 2.65891432762146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111403, + "balance_loss_mlp": 1.08103275, + "epoch": 0.5192381685263563, + "flos": 548605716480.0, + "grad_norm": 0.34709701447359415, + "language_loss": 0.81575179, + "learning_rate": 0.0004931462838274817, + "loss": 0.82689214, + "num_input_tokens_seen": 224826944, + "router_z_loss_mlp": 0.33007812, + "step": 2699, + "time_per_iteration": 2.829094648361206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113032, + "balance_loss_mlp": 1.09694147, + "epoch": 0.5194305502116199, + "flos": 575263544832.0, + "grad_norm": 0.06002024337813523, + "language_loss": 0.84538823, + "learning_rate": 0.0004928347721379011, + "loss": 0.85669148, + "num_input_tokens_seen": 224895280, + "router_z_loss_mlp": 0.33398438, + "step": 2700, + "time_per_iteration": 2.685887098312378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128853, + "balance_loss_mlp": 1.09499812, + "epoch": 0.5196229318968835, + "flos": 434258620416.0, + "grad_norm": 0.07280089907997458, + "language_loss": 0.82063133, + "learning_rate": 0.0004925232632301089, + "loss": 0.83191985, + "num_input_tokens_seen": 224961632, + "router_z_loss_mlp": 0.33886719, + "step": 2701, + "time_per_iteration": 2.5586745738983154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139592, + "balance_loss_mlp": 1.10711944, + "epoch": 0.5198153135821469, + "flos": 558881938944.0, + "grad_norm": 0.05869071142497867, + "language_loss": 0.7981168, + "learning_rate": 0.0004922117572250431, + "loss": 0.80951279, + "num_input_tokens_seen": 225032816, + "router_z_loss_mlp": 0.32495117, + "step": 2702, + "time_per_iteration": 2.652883768081665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154601, + "balance_loss_mlp": 1.12041199, + "epoch": 0.5200076952674105, + "flos": 565684051968.0, + "grad_norm": 0.08372395695209851, + "language_loss": 0.80792272, + "learning_rate": 0.0004919002542436414, + "loss": 0.8194688, + "num_input_tokens_seen": 225112736, + "router_z_loss_mlp": 0.34155273, + "step": 2703, + "time_per_iteration": 2.8069591522216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156311, + "balance_loss_mlp": 1.12131107, + "epoch": 0.5202000769526741, + "flos": 571186681344.0, + "grad_norm": 0.06918407740604555, + "language_loss": 0.81692028, + "learning_rate": 0.0004915887544068399, + "loss": 0.82848334, + "num_input_tokens_seen": 225182672, + "router_z_loss_mlp": 0.35009766, + "step": 2704, + "time_per_iteration": 2.6484997272491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159384, + "balance_loss_mlp": 1.12228656, + "epoch": 0.5203924586379377, + "flos": 694211337216.0, + "grad_norm": 0.0754612517988151, + "language_loss": 0.78528553, + "learning_rate": 0.0004912772578355736, + "loss": 0.79687935, + "num_input_tokens_seen": 225260272, + "router_z_loss_mlp": 0.37084961, + "step": 2705, + "time_per_iteration": 2.889177083969116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115407, + "balance_loss_mlp": 1.11825967, + "epoch": 0.5205848403232012, + "flos": 566509261824.0, + "grad_norm": 0.06509959827239385, + "language_loss": 0.83146906, + "learning_rate": 0.000490965764650776, + "loss": 0.84300983, + "num_input_tokens_seen": 225337120, + "router_z_loss_mlp": 0.3581543, + "step": 2706, + "time_per_iteration": 2.885923385620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115916, + "balance_loss_mlp": 1.12346911, + "epoch": 0.5207772220084648, + "flos": 1214259932160.0, + "grad_norm": 0.06296986612889613, + "language_loss": 0.82775491, + "learning_rate": 0.0004906542749733798, + "loss": 0.83934653, + "num_input_tokens_seen": 225433984, + "router_z_loss_mlp": 0.35693359, + "step": 2707, + "time_per_iteration": 3.6151185035705566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152032, + "balance_loss_mlp": 1.11653161, + "epoch": 0.5209696036937284, + "flos": 592843318272.0, + "grad_norm": 0.046885737032271585, + "language_loss": 0.85312223, + "learning_rate": 0.0004903427889243156, + "loss": 0.86464256, + "num_input_tokens_seen": 225512112, + "router_z_loss_mlp": 0.35498047, + "step": 2708, + "time_per_iteration": 2.8592212200164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169903, + "balance_loss_mlp": 1.13335371, + "epoch": 0.5211619853789919, + "flos": 522889468416.0, + "grad_norm": 0.07702072033180815, + "language_loss": 0.85470927, + "learning_rate": 0.0004900313066245134, + "loss": 0.86640829, + "num_input_tokens_seen": 225586944, + "router_z_loss_mlp": 0.36547852, + "step": 2709, + "time_per_iteration": 2.7046992778778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155719, + "balance_loss_mlp": 1.12145817, + "epoch": 0.5213543670642555, + "flos": 502799187456.0, + "grad_norm": 0.049948125939344834, + "language_loss": 0.80970949, + "learning_rate": 0.0004897198281949012, + "loss": 0.82126665, + "num_input_tokens_seen": 225657184, + "router_z_loss_mlp": 0.34277344, + "step": 2710, + "time_per_iteration": 2.728750228881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164738, + "balance_loss_mlp": 1.12837923, + "epoch": 0.521546748749519, + "flos": 585959712768.0, + "grad_norm": 0.06520397862885238, + "language_loss": 0.77818954, + "learning_rate": 0.0004894083537564057, + "loss": 0.78983688, + "num_input_tokens_seen": 225729968, + "router_z_loss_mlp": 0.36352539, + "step": 2711, + "time_per_iteration": 2.7362277507781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163972, + "balance_loss_mlp": 1.12913883, + "epoch": 0.5217391304347826, + "flos": 570119192064.0, + "grad_norm": 0.051241123094768644, + "language_loss": 0.81174654, + "learning_rate": 0.0004890968834299519, + "loss": 0.82338625, + "num_input_tokens_seen": 225801808, + "router_z_loss_mlp": 0.34838867, + "step": 2712, + "time_per_iteration": 2.768146514892578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156146, + "balance_loss_mlp": 1.12026405, + "epoch": 0.5219315121200462, + "flos": 542784457728.0, + "grad_norm": 0.05945211160457726, + "language_loss": 0.78877795, + "learning_rate": 0.0004887854173364633, + "loss": 0.80033934, + "num_input_tokens_seen": 225878576, + "router_z_loss_mlp": 0.35913086, + "step": 2713, + "time_per_iteration": 2.8356804847717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149792, + "balance_loss_mlp": 1.1157217, + "epoch": 0.5221238938053098, + "flos": 550310464512.0, + "grad_norm": 0.05274159181021226, + "language_loss": 0.81621301, + "learning_rate": 0.0004884739555968617, + "loss": 0.82771093, + "num_input_tokens_seen": 225960096, + "router_z_loss_mlp": 0.34057617, + "step": 2714, + "time_per_iteration": 2.831137180328369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102187, + "balance_loss_mlp": 1.09369898, + "epoch": 0.5223162754905732, + "flos": 1355174157312.0, + "grad_norm": 0.02923312307597506, + "language_loss": 0.78977054, + "learning_rate": 0.0004881624983320676, + "loss": 0.8007924, + "num_input_tokens_seen": 226184960, + "router_z_loss_mlp": 0.08496094, + "step": 2715, + "time_per_iteration": 4.95891547203064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149677, + "balance_loss_mlp": 1.11534512, + "epoch": 0.5225086571758368, + "flos": 567747076608.0, + "grad_norm": 0.06614932878153669, + "language_loss": 0.86865598, + "learning_rate": 0.0004878510456629992, + "loss": 0.88015276, + "num_input_tokens_seen": 226271328, + "router_z_loss_mlp": 0.34326172, + "step": 2716, + "time_per_iteration": 2.968658924102783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145792, + "balance_loss_mlp": 1.1120801, + "epoch": 0.5227010388611004, + "flos": 500158001664.0, + "grad_norm": 0.05224698034347332, + "language_loss": 0.8526777, + "learning_rate": 0.00048753959771057314, + "loss": 0.86413562, + "num_input_tokens_seen": 226340080, + "router_z_loss_mlp": 0.33740234, + "step": 2717, + "time_per_iteration": 2.6395833492279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140286, + "balance_loss_mlp": 1.10736012, + "epoch": 0.522893420546364, + "flos": 597656558592.0, + "grad_norm": 0.0584811227693513, + "language_loss": 0.83152837, + "learning_rate": 0.0004872281545957044, + "loss": 0.84293115, + "num_input_tokens_seen": 226415120, + "router_z_loss_mlp": 0.3293457, + "step": 2718, + "time_per_iteration": 2.7039849758148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135383, + "balance_loss_mlp": 1.10069275, + "epoch": 0.5230858022316276, + "flos": 664605803520.0, + "grad_norm": 0.050310473622198856, + "language_loss": 0.85946554, + "learning_rate": 0.0004869167164393055, + "loss": 0.87081933, + "num_input_tokens_seen": 226501200, + "router_z_loss_mlp": 0.34692383, + "step": 2719, + "time_per_iteration": 2.91475510597229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132518, + "balance_loss_mlp": 1.10028338, + "epoch": 0.5232781839168911, + "flos": 603843434496.0, + "grad_norm": 0.0697291023285212, + "language_loss": 0.89398658, + "learning_rate": 0.00048660528336228793, + "loss": 0.90531176, + "num_input_tokens_seen": 226582064, + "router_z_loss_mlp": 0.32226562, + "step": 2720, + "time_per_iteration": 2.792276620864868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124032, + "balance_loss_mlp": 1.09115386, + "epoch": 0.5234705656021547, + "flos": 550718300160.0, + "grad_norm": 0.05026677719306565, + "language_loss": 0.90367562, + "learning_rate": 0.0004862938554855606, + "loss": 0.91491592, + "num_input_tokens_seen": 226656448, + "router_z_loss_mlp": 0.32885742, + "step": 2721, + "time_per_iteration": 2.7964749336242676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129388, + "balance_loss_mlp": 1.09643817, + "epoch": 0.5236629472874182, + "flos": 504279281664.0, + "grad_norm": 0.0663768296863652, + "language_loss": 0.86310339, + "learning_rate": 0.0004859824329300304, + "loss": 0.87439728, + "num_input_tokens_seen": 226725568, + "router_z_loss_mlp": 0.32958984, + "step": 2722, + "time_per_iteration": 2.6039419174194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128053, + "balance_loss_mlp": 1.09403062, + "epoch": 0.5238553289726818, + "flos": 547654597632.0, + "grad_norm": 0.0581387375581185, + "language_loss": 0.84092689, + "learning_rate": 0.00048567101581660244, + "loss": 0.85220736, + "num_input_tokens_seen": 226795728, + "router_z_loss_mlp": 0.34033203, + "step": 2723, + "time_per_iteration": 2.5987517833709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125005, + "balance_loss_mlp": 1.09227037, + "epoch": 0.5240477106579453, + "flos": 531962380800.0, + "grad_norm": 0.06184026942262611, + "language_loss": 0.87479013, + "learning_rate": 0.00048535960426617956, + "loss": 0.88604021, + "num_input_tokens_seen": 226865344, + "router_z_loss_mlp": 0.32739258, + "step": 2724, + "time_per_iteration": 2.6038565635681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121549, + "balance_loss_mlp": 1.08724082, + "epoch": 0.5242400923432089, + "flos": 617939559936.0, + "grad_norm": 0.05825945935903347, + "language_loss": 0.81925243, + "learning_rate": 0.0004850481983996621, + "loss": 0.83046794, + "num_input_tokens_seen": 226936800, + "router_z_loss_mlp": 0.34350586, + "step": 2725, + "time_per_iteration": 2.7633490562438965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122144, + "balance_loss_mlp": 1.08907521, + "epoch": 0.5244324740284725, + "flos": 416686187520.0, + "grad_norm": 0.06367267368201004, + "language_loss": 0.88101065, + "learning_rate": 0.0004847367983379492, + "loss": 0.89223206, + "num_input_tokens_seen": 226998448, + "router_z_loss_mlp": 0.33081055, + "step": 2726, + "time_per_iteration": 2.520050287246704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119938, + "balance_loss_mlp": 1.08837104, + "epoch": 0.5246248557137361, + "flos": 626436509184.0, + "grad_norm": 0.059069616726974465, + "language_loss": 0.79169118, + "learning_rate": 0.00048442540420193643, + "loss": 0.80289054, + "num_input_tokens_seen": 227081872, + "router_z_loss_mlp": 0.31567383, + "step": 2727, + "time_per_iteration": 2.9363925457000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125304, + "balance_loss_mlp": 1.09278345, + "epoch": 0.5248172373989997, + "flos": 1248463590912.0, + "grad_norm": 0.06091521023817234, + "language_loss": 0.7936945, + "learning_rate": 0.0004841140161125182, + "loss": 0.8049475, + "num_input_tokens_seen": 227167744, + "router_z_loss_mlp": 0.32543945, + "step": 2728, + "time_per_iteration": 3.5786640644073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127178, + "balance_loss_mlp": 1.09666038, + "epoch": 0.5250096190842631, + "flos": 506868710400.0, + "grad_norm": 0.054648351094499156, + "language_loss": 0.85262787, + "learning_rate": 0.0004838026341905857, + "loss": 0.86389971, + "num_input_tokens_seen": 227239136, + "router_z_loss_mlp": 0.30517578, + "step": 2729, + "time_per_iteration": 2.7021641731262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113035, + "balance_loss_mlp": 1.09909368, + "epoch": 0.5252020007695267, + "flos": 611317684224.0, + "grad_norm": 0.06068419443661206, + "language_loss": 0.85131037, + "learning_rate": 0.00048349125855702844, + "loss": 0.8626138, + "num_input_tokens_seen": 227311968, + "router_z_loss_mlp": 0.3125, + "step": 2730, + "time_per_iteration": 2.794691562652588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129298, + "balance_loss_mlp": 1.09754109, + "epoch": 0.5253943824547903, + "flos": 539233998336.0, + "grad_norm": 0.0500444792759443, + "language_loss": 0.81508827, + "learning_rate": 0.00048317988933273287, + "loss": 0.82638121, + "num_input_tokens_seen": 227385248, + "router_z_loss_mlp": 0.31738281, + "step": 2731, + "time_per_iteration": 2.7251734733581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124341, + "balance_loss_mlp": 1.09291768, + "epoch": 0.5255867641400539, + "flos": 698038580736.0, + "grad_norm": 0.06596294225314246, + "language_loss": 0.82520533, + "learning_rate": 0.00048286852663858367, + "loss": 0.83644867, + "num_input_tokens_seen": 227464640, + "router_z_loss_mlp": 0.31420898, + "step": 2732, + "time_per_iteration": 2.972963571548462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120516, + "balance_loss_mlp": 1.0889498, + "epoch": 0.5257791458253175, + "flos": 667289207808.0, + "grad_norm": 0.055500139325311094, + "language_loss": 0.84107697, + "learning_rate": 0.000482557170595462, + "loss": 0.85228211, + "num_input_tokens_seen": 227542192, + "router_z_loss_mlp": 0.31542969, + "step": 2733, + "time_per_iteration": 2.858245849609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112503, + "balance_loss_mlp": 1.09401202, + "epoch": 0.525971527510581, + "flos": 483620751360.0, + "grad_norm": 0.13743136293517658, + "language_loss": 0.87933344, + "learning_rate": 0.0004822458213242475, + "loss": 0.89058375, + "num_input_tokens_seen": 227606096, + "router_z_loss_mlp": 0.31005859, + "step": 2734, + "time_per_iteration": 2.522383213043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112386, + "balance_loss_mlp": 1.08115363, + "epoch": 0.5261639091958445, + "flos": 829916264448.0, + "grad_norm": 0.05651199089550523, + "language_loss": 0.86197513, + "learning_rate": 0.00048193447894581627, + "loss": 0.87309897, + "num_input_tokens_seen": 227689552, + "router_z_loss_mlp": 0.31201172, + "step": 2735, + "time_per_iteration": 3.0866682529449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111368, + "balance_loss_mlp": 1.08235216, + "epoch": 0.5263562908811081, + "flos": 520715215872.0, + "grad_norm": 0.06879211849592783, + "language_loss": 0.88187921, + "learning_rate": 0.00048162314358104243, + "loss": 0.89301598, + "num_input_tokens_seen": 227760784, + "router_z_loss_mlp": 0.31298828, + "step": 2736, + "time_per_iteration": 2.5985138416290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108841, + "balance_loss_mlp": 1.07713127, + "epoch": 0.5265486725663717, + "flos": 574996672512.0, + "grad_norm": 0.05778047820427569, + "language_loss": 0.83687961, + "learning_rate": 0.0004813118153507969, + "loss": 0.84796798, + "num_input_tokens_seen": 227834304, + "router_z_loss_mlp": 0.31713867, + "step": 2737, + "time_per_iteration": 2.73371958732605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022846, + "balance_loss_mlp": 1.01416731, + "epoch": 0.5267410542516352, + "flos": 1547261015040.0, + "grad_norm": 0.01810308130118829, + "language_loss": 0.82447124, + "learning_rate": 0.0004810004943759482, + "loss": 0.83469975, + "num_input_tokens_seen": 228057232, + "router_z_loss_mlp": 0.08691406, + "step": 2738, + "time_per_iteration": 4.790890216827393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097772, + "balance_loss_mlp": 1.06670594, + "epoch": 0.5269334359368988, + "flos": 929952493056.0, + "grad_norm": 0.05745954748436515, + "language_loss": 0.83672923, + "learning_rate": 0.00048068918077736163, + "loss": 0.84770691, + "num_input_tokens_seen": 228140816, + "router_z_loss_mlp": 0.31030273, + "step": 2739, + "time_per_iteration": 3.239821195602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094061, + "balance_loss_mlp": 1.06309009, + "epoch": 0.5271258176221624, + "flos": 655389729792.0, + "grad_norm": 0.06477195420820829, + "language_loss": 0.81728363, + "learning_rate": 0.0004803778746759001, + "loss": 0.82822424, + "num_input_tokens_seen": 228216208, + "router_z_loss_mlp": 0.30932617, + "step": 2740, + "time_per_iteration": 2.942760944366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096968, + "balance_loss_mlp": 1.06614065, + "epoch": 0.527318199307426, + "flos": 543036648960.0, + "grad_norm": 0.05799868370730736, + "language_loss": 0.81935298, + "learning_rate": 0.00048006657619242317, + "loss": 0.83032262, + "num_input_tokens_seen": 228283184, + "router_z_loss_mlp": 0.30810547, + "step": 2741, + "time_per_iteration": 2.6397995948791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098549, + "balance_loss_mlp": 1.06550419, + "epoch": 0.5275105809926895, + "flos": 447882670080.0, + "grad_norm": 0.07558439368734231, + "language_loss": 0.78591353, + "learning_rate": 0.00047975528544778775, + "loss": 0.79689896, + "num_input_tokens_seen": 228351328, + "router_z_loss_mlp": 0.33056641, + "step": 2742, + "time_per_iteration": 2.694324493408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091173, + "balance_loss_mlp": 1.06058371, + "epoch": 0.527702962677953, + "flos": 578935143936.0, + "grad_norm": 0.06405052151098177, + "language_loss": 0.88749677, + "learning_rate": 0.00047944400256284754, + "loss": 0.89840853, + "num_input_tokens_seen": 228423632, + "router_z_loss_mlp": 0.30566406, + "step": 2743, + "time_per_iteration": 2.6816787719726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098065, + "balance_loss_mlp": 1.06809616, + "epoch": 0.5278953443632166, + "flos": 652773136896.0, + "grad_norm": 0.07088810283562207, + "language_loss": 0.80461031, + "learning_rate": 0.0004791327276584532, + "loss": 0.81559092, + "num_input_tokens_seen": 228498736, + "router_z_loss_mlp": 0.29956055, + "step": 2744, + "time_per_iteration": 2.8708317279815674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098246, + "balance_loss_mlp": 1.06596446, + "epoch": 0.5280877260484802, + "flos": 514001935872.0, + "grad_norm": 0.06685009455993486, + "language_loss": 0.8087393, + "learning_rate": 0.00047882146085545264, + "loss": 0.81972182, + "num_input_tokens_seen": 228569056, + "router_z_loss_mlp": 0.32250977, + "step": 2745, + "time_per_iteration": 2.610027551651001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021006, + "balance_loss_mlp": 1.01204121, + "epoch": 0.5282801077337438, + "flos": 1445460567552.0, + "grad_norm": 0.008936429220158798, + "language_loss": 0.75402379, + "learning_rate": 0.00047851020227469, + "loss": 0.76423383, + "num_input_tokens_seen": 228800560, + "router_z_loss_mlp": 0.08984375, + "step": 2746, + "time_per_iteration": 5.000555038452148 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097643, + "balance_loss_mlp": 1.06767416, + "epoch": 0.5284724894190073, + "flos": 604856595456.0, + "grad_norm": 0.06348628312729114, + "language_loss": 0.79553157, + "learning_rate": 0.00047819895203700684, + "loss": 0.806508, + "num_input_tokens_seen": 228869216, + "router_z_loss_mlp": 0.29907227, + "step": 2747, + "time_per_iteration": 2.7115635871887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017614, + "balance_loss_mlp": 1.0085541, + "epoch": 0.5286648711042709, + "flos": 1494956321280.0, + "grad_norm": 0.007776557121409109, + "language_loss": 0.75512433, + "learning_rate": 0.0004778877102632412, + "loss": 0.76530045, + "num_input_tokens_seen": 229085520, + "router_z_loss_mlp": 0.09082031, + "step": 2748, + "time_per_iteration": 4.672155141830444 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092605, + "balance_loss_mlp": 1.06263614, + "epoch": 0.5288572527895344, + "flos": 597616911360.0, + "grad_norm": 0.06781776650114792, + "language_loss": 0.8852309, + "learning_rate": 0.0004775764770742277, + "loss": 0.89615691, + "num_input_tokens_seen": 229160912, + "router_z_loss_mlp": 0.29931641, + "step": 2749, + "time_per_iteration": 2.8029801845550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097703, + "balance_loss_mlp": 1.06542146, + "epoch": 0.529049634474798, + "flos": 557320352256.0, + "grad_norm": 0.07126893850665976, + "language_loss": 0.86776084, + "learning_rate": 0.00047726525259079777, + "loss": 0.87873781, + "num_input_tokens_seen": 229235792, + "router_z_loss_mlp": 0.32299805, + "step": 2750, + "time_per_iteration": 2.7803709506988525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097184, + "balance_loss_mlp": 1.06568849, + "epoch": 0.5292420161600616, + "flos": 581274952704.0, + "grad_norm": 0.07487878206236488, + "language_loss": 0.88641649, + "learning_rate": 0.0004769540369337798, + "loss": 0.89738834, + "num_input_tokens_seen": 229309984, + "router_z_loss_mlp": 0.31469727, + "step": 2751, + "time_per_iteration": 2.7477662563323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103352, + "balance_loss_mlp": 1.07166588, + "epoch": 0.5294343978453251, + "flos": 608303167488.0, + "grad_norm": 0.06303354467879724, + "language_loss": 0.86111081, + "learning_rate": 0.00047664283022399794, + "loss": 0.87214434, + "num_input_tokens_seen": 229394000, + "router_z_loss_mlp": 0.31665039, + "step": 2752, + "time_per_iteration": 2.8321616649627686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111513, + "balance_loss_mlp": 1.08142424, + "epoch": 0.5296267795305887, + "flos": 646522020864.0, + "grad_norm": 0.1009265551294561, + "language_loss": 0.81372654, + "learning_rate": 0.00047633163258227376, + "loss": 0.82484162, + "num_input_tokens_seen": 229474320, + "router_z_loss_mlp": 0.30053711, + "step": 2753, + "time_per_iteration": 2.866710662841797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107072, + "balance_loss_mlp": 1.07536244, + "epoch": 0.5298191612158523, + "flos": 559746796032.0, + "grad_norm": 0.06597410250171662, + "language_loss": 0.85720521, + "learning_rate": 0.0004760204441294247, + "loss": 0.86827588, + "num_input_tokens_seen": 229543072, + "router_z_loss_mlp": 0.31689453, + "step": 2754, + "time_per_iteration": 2.635411500930786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123028, + "balance_loss_mlp": 1.09172344, + "epoch": 0.5300115429011159, + "flos": 514046352384.0, + "grad_norm": 0.06814428712155127, + "language_loss": 0.86859232, + "learning_rate": 0.00047570926498626486, + "loss": 0.87982261, + "num_input_tokens_seen": 229615296, + "router_z_loss_mlp": 0.31274414, + "step": 2755, + "time_per_iteration": 2.678027629852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139797, + "balance_loss_mlp": 1.10846841, + "epoch": 0.5302039245863793, + "flos": 672789265920.0, + "grad_norm": 0.05259166917973927, + "language_loss": 0.8179211, + "learning_rate": 0.00047539809527360474, + "loss": 0.82931906, + "num_input_tokens_seen": 229693728, + "router_z_loss_mlp": 0.31298828, + "step": 2756, + "time_per_iteration": 2.8505630493164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139868, + "balance_loss_mlp": 1.1087544, + "epoch": 0.5303963062716429, + "flos": 730836297216.0, + "grad_norm": 0.23589307030508885, + "language_loss": 0.82282543, + "learning_rate": 0.0004750869351122511, + "loss": 0.83422416, + "num_input_tokens_seen": 229772144, + "router_z_loss_mlp": 0.31079102, + "step": 2757, + "time_per_iteration": 3.007599353790283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114789, + "balance_loss_mlp": 1.11598992, + "epoch": 0.5305886879569065, + "flos": 573435085824.0, + "grad_norm": 0.06932827369161218, + "language_loss": 0.81883401, + "learning_rate": 0.00047477578462300685, + "loss": 0.83031291, + "num_input_tokens_seen": 229847024, + "router_z_loss_mlp": 0.31884766, + "step": 2758, + "time_per_iteration": 2.7112765312194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144814, + "balance_loss_mlp": 1.11215043, + "epoch": 0.5307810696421701, + "flos": 695335352832.0, + "grad_norm": 0.060390901611552056, + "language_loss": 0.79751188, + "learning_rate": 0.0004744646439266718, + "loss": 0.80895996, + "num_input_tokens_seen": 229932416, + "router_z_loss_mlp": 0.32641602, + "step": 2759, + "time_per_iteration": 2.9956624507904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141525, + "balance_loss_mlp": 1.10905194, + "epoch": 0.5309734513274337, + "flos": 648943322112.0, + "grad_norm": 0.0692957942514688, + "language_loss": 0.92371601, + "learning_rate": 0.000474153513144041, + "loss": 0.93513119, + "num_input_tokens_seen": 230010976, + "router_z_loss_mlp": 0.32470703, + "step": 2760, + "time_per_iteration": 2.902304172515869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114025, + "balance_loss_mlp": 1.10756326, + "epoch": 0.5311658330126972, + "flos": 604824288768.0, + "grad_norm": 0.06953135792158749, + "language_loss": 0.87208283, + "learning_rate": 0.00047384239239590633, + "loss": 0.88348538, + "num_input_tokens_seen": 230093344, + "router_z_loss_mlp": 0.3269043, + "step": 2761, + "time_per_iteration": 2.9197542667388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127002, + "balance_loss_mlp": 1.09414792, + "epoch": 0.5313582146979607, + "flos": 558259361280.0, + "grad_norm": 0.06520154041113266, + "language_loss": 0.89041948, + "learning_rate": 0.0004735312818030556, + "loss": 0.90168953, + "num_input_tokens_seen": 230165520, + "router_z_loss_mlp": 0.32861328, + "step": 2762, + "time_per_iteration": 2.699882745742798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128123, + "balance_loss_mlp": 1.0964613, + "epoch": 0.5315505963832243, + "flos": 508410473472.0, + "grad_norm": 0.0963196289257929, + "language_loss": 0.83125454, + "learning_rate": 0.0004732201814862727, + "loss": 0.84253573, + "num_input_tokens_seen": 230237808, + "router_z_loss_mlp": 0.31640625, + "step": 2763, + "time_per_iteration": 2.7439024448394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113884, + "balance_loss_mlp": 1.08155453, + "epoch": 0.5317429780684879, + "flos": 626439080448.0, + "grad_norm": 0.058489246415432364, + "language_loss": 0.81845987, + "learning_rate": 0.0004729090915663373, + "loss": 0.82959872, + "num_input_tokens_seen": 230321568, + "router_z_loss_mlp": 0.32324219, + "step": 2764, + "time_per_iteration": 2.880218029022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112044, + "balance_loss_mlp": 1.07930923, + "epoch": 0.5319353597537514, + "flos": 476744486400.0, + "grad_norm": 0.08176902294326427, + "language_loss": 0.85593212, + "learning_rate": 0.00047259801216402534, + "loss": 0.86705256, + "num_input_tokens_seen": 230385376, + "router_z_loss_mlp": 0.32739258, + "step": 2765, + "time_per_iteration": 2.5215423107147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113516, + "balance_loss_mlp": 1.0809716, + "epoch": 0.532127741439015, + "flos": 501635524608.0, + "grad_norm": 0.0984419544464696, + "language_loss": 0.86589384, + "learning_rate": 0.00047228694340010845, + "loss": 0.87702894, + "num_input_tokens_seen": 230449760, + "router_z_loss_mlp": 0.32543945, + "step": 2766, + "time_per_iteration": 2.615323781967163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106832, + "balance_loss_mlp": 1.07288122, + "epoch": 0.5323201231242786, + "flos": 1164586512384.0, + "grad_norm": 0.06857994992356635, + "language_loss": 0.85894436, + "learning_rate": 0.0004719758853953544, + "loss": 0.87001264, + "num_input_tokens_seen": 230536592, + "router_z_loss_mlp": 0.33984375, + "step": 2767, + "time_per_iteration": 3.580965042114258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109799, + "balance_loss_mlp": 1.07475162, + "epoch": 0.5325125048095422, + "flos": 378702273024.0, + "grad_norm": 0.07966941077078553, + "language_loss": 0.84403044, + "learning_rate": 0.00047166483827052645, + "loss": 0.85512847, + "num_input_tokens_seen": 230596688, + "router_z_loss_mlp": 0.35083008, + "step": 2768, + "time_per_iteration": 2.3937976360321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112761, + "balance_loss_mlp": 1.09797895, + "epoch": 0.5327048864948057, + "flos": 1541353121280.0, + "grad_norm": 0.05218838233145069, + "language_loss": 0.77078491, + "learning_rate": 0.00047135380214638413, + "loss": 0.78191251, + "num_input_tokens_seen": 230829408, + "router_z_loss_mlp": 0.14746094, + "step": 2769, + "time_per_iteration": 4.980372905731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112083, + "balance_loss_mlp": 1.07910919, + "epoch": 0.5328972681800692, + "flos": 911272923648.0, + "grad_norm": 0.05422451751257763, + "language_loss": 0.8393681, + "learning_rate": 0.000471042777143682, + "loss": 0.8504889, + "num_input_tokens_seen": 230912528, + "router_z_loss_mlp": 0.32958984, + "step": 2770, + "time_per_iteration": 3.2559990882873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109219, + "balance_loss_mlp": 1.07576907, + "epoch": 0.5330896498653328, + "flos": 473898097152.0, + "grad_norm": 0.05619534531580183, + "language_loss": 0.79500479, + "learning_rate": 0.0004707317633831707, + "loss": 0.80609697, + "num_input_tokens_seen": 230979424, + "router_z_loss_mlp": 0.3347168, + "step": 2771, + "time_per_iteration": 2.580369472503662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113861, + "balance_loss_mlp": 1.07976723, + "epoch": 0.5332820315505964, + "flos": 501635524608.0, + "grad_norm": 0.07426752742264173, + "language_loss": 0.78140616, + "learning_rate": 0.00047042076098559673, + "loss": 0.79254484, + "num_input_tokens_seen": 231046416, + "router_z_loss_mlp": 0.34130859, + "step": 2772, + "time_per_iteration": 2.656357765197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115026, + "balance_loss_mlp": 1.08131373, + "epoch": 0.53347441323586, + "flos": 924439951872.0, + "grad_norm": 0.07148667655520102, + "language_loss": 0.74185407, + "learning_rate": 0.00047010977007170174, + "loss": 0.75300431, + "num_input_tokens_seen": 231136064, + "router_z_loss_mlp": 0.3371582, + "step": 2773, + "time_per_iteration": 3.2167580127716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103553, + "balance_loss_mlp": 1.07079434, + "epoch": 0.5336667949211235, + "flos": 574455587328.0, + "grad_norm": 0.05649801417476766, + "language_loss": 0.82702589, + "learning_rate": 0.00046979879076222334, + "loss": 0.83806139, + "num_input_tokens_seen": 231203616, + "router_z_loss_mlp": 0.32763672, + "step": 2774, + "time_per_iteration": 2.6618025302886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109156, + "balance_loss_mlp": 1.07689798, + "epoch": 0.533859176606387, + "flos": 1064664082944.0, + "grad_norm": 0.05944272870619304, + "language_loss": 0.85247773, + "learning_rate": 0.0004694878231778939, + "loss": 0.86356932, + "num_input_tokens_seen": 231287008, + "router_z_loss_mlp": 0.32250977, + "step": 2775, + "time_per_iteration": 3.381577968597412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105801, + "balance_loss_mlp": 1.07459164, + "epoch": 0.5340515582916506, + "flos": 746602665984.0, + "grad_norm": 0.05869389504796052, + "language_loss": 0.84721255, + "learning_rate": 0.0004691768674394423, + "loss": 0.85827059, + "num_input_tokens_seen": 231365296, + "router_z_loss_mlp": 0.31176758, + "step": 2776, + "time_per_iteration": 2.9549882411956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041745, + "balance_loss_mlp": 1.03230345, + "epoch": 0.5342439399769142, + "flos": 1445685594624.0, + "grad_norm": 0.020468065913813137, + "language_loss": 0.84484011, + "learning_rate": 0.0004688659236675918, + "loss": 0.85525757, + "num_input_tokens_seen": 231579040, + "router_z_loss_mlp": 0.09423828, + "step": 2777, + "time_per_iteration": 4.780264139175415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039247, + "balance_loss_mlp": 1.03013933, + "epoch": 0.5344363216621778, + "flos": 1427569505280.0, + "grad_norm": 0.02045845897293101, + "language_loss": 0.76653534, + "learning_rate": 0.00046855499198306187, + "loss": 0.77692783, + "num_input_tokens_seen": 231812736, + "router_z_loss_mlp": 0.09130859, + "step": 2778, + "time_per_iteration": 5.030272960662842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101269, + "balance_loss_mlp": 1.06870127, + "epoch": 0.5346287033474413, + "flos": 527618644992.0, + "grad_norm": 0.06089610481991967, + "language_loss": 0.7961477, + "learning_rate": 0.00046824407250656676, + "loss": 0.80716044, + "num_input_tokens_seen": 231883840, + "router_z_loss_mlp": 0.32568359, + "step": 2779, + "time_per_iteration": 2.6681063175201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096395, + "balance_loss_mlp": 1.06537652, + "epoch": 0.5348210850327049, + "flos": 510762765312.0, + "grad_norm": 0.04990324067280663, + "language_loss": 0.83819127, + "learning_rate": 0.0004679331653588161, + "loss": 0.84915525, + "num_input_tokens_seen": 231955360, + "router_z_loss_mlp": 0.30981445, + "step": 2780, + "time_per_iteration": 2.635774612426758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092346, + "balance_loss_mlp": 1.05999231, + "epoch": 0.5350134667179685, + "flos": 462668184576.0, + "grad_norm": 0.06684745885443293, + "language_loss": 0.85806221, + "learning_rate": 0.0004676222706605147, + "loss": 0.86898565, + "num_input_tokens_seen": 232027088, + "router_z_loss_mlp": 0.32348633, + "step": 2781, + "time_per_iteration": 2.6137733459472656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092057, + "balance_loss_mlp": 1.05886936, + "epoch": 0.535205848403232, + "flos": 708875712000.0, + "grad_norm": 0.08426708268962642, + "language_loss": 0.85464495, + "learning_rate": 0.0004673113885323626, + "loss": 0.86556554, + "num_input_tokens_seen": 232099472, + "router_z_loss_mlp": 0.33203125, + "step": 2782, + "time_per_iteration": 2.861581802368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083804, + "balance_loss_mlp": 1.05083072, + "epoch": 0.5353982300884956, + "flos": 894241575936.0, + "grad_norm": 0.060311716473253056, + "language_loss": 0.78792584, + "learning_rate": 0.00046700051909505494, + "loss": 0.79876387, + "num_input_tokens_seen": 232182528, + "router_z_loss_mlp": 0.32983398, + "step": 2783, + "time_per_iteration": 3.182298183441162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089723, + "balance_loss_mlp": 1.05407953, + "epoch": 0.5355906117737591, + "flos": 535965092352.0, + "grad_norm": 0.06678042842361867, + "language_loss": 0.84239137, + "learning_rate": 0.000466689662469282, + "loss": 0.85328859, + "num_input_tokens_seen": 232253344, + "router_z_loss_mlp": 0.35644531, + "step": 2784, + "time_per_iteration": 2.6519503593444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082012, + "balance_loss_mlp": 1.04891968, + "epoch": 0.5357829934590227, + "flos": 868846528512.0, + "grad_norm": 0.06002174049054728, + "language_loss": 0.83905756, + "learning_rate": 0.00046637881877572917, + "loss": 0.84987772, + "num_input_tokens_seen": 232337232, + "router_z_loss_mlp": 0.33105469, + "step": 2785, + "time_per_iteration": 3.1058127880096436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084703, + "balance_loss_mlp": 1.051754, + "epoch": 0.5359753751442863, + "flos": 553287905280.0, + "grad_norm": 0.0580195679012457, + "language_loss": 0.8490684, + "learning_rate": 0.0004660679881350764, + "loss": 0.85991538, + "num_input_tokens_seen": 232412864, + "router_z_loss_mlp": 0.32958984, + "step": 2786, + "time_per_iteration": 2.77021861076355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053559, + "balance_loss_mlp": 1.0447371, + "epoch": 0.5361677568295499, + "flos": 1480499347968.0, + "grad_norm": 0.032864625150969516, + "language_loss": 0.75608146, + "learning_rate": 0.0004657571706679988, + "loss": 0.76661706, + "num_input_tokens_seen": 232639888, + "router_z_loss_mlp": 0.08837891, + "step": 2787, + "time_per_iteration": 5.029211044311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087215, + "balance_loss_mlp": 1.05335903, + "epoch": 0.5363601385148133, + "flos": 806255700480.0, + "grad_norm": 0.07679411484967892, + "language_loss": 0.77928644, + "learning_rate": 0.0004654463664951667, + "loss": 0.79015857, + "num_input_tokens_seen": 232719248, + "router_z_loss_mlp": 0.33886719, + "step": 2788, + "time_per_iteration": 2.9762089252471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088019, + "balance_loss_mlp": 1.05464029, + "epoch": 0.5365525202000769, + "flos": 507879300096.0, + "grad_norm": 0.06025701653165108, + "language_loss": 0.83150423, + "learning_rate": 0.0004651355757372447, + "loss": 0.84238434, + "num_input_tokens_seen": 232788464, + "router_z_loss_mlp": 0.33398438, + "step": 2789, + "time_per_iteration": 2.5971946716308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089252, + "balance_loss_mlp": 1.05604005, + "epoch": 0.5367449018853405, + "flos": 528930611712.0, + "grad_norm": 0.08338964083328992, + "language_loss": 0.8607617, + "learning_rate": 0.00046482479851489274, + "loss": 0.87165421, + "num_input_tokens_seen": 232859792, + "router_z_loss_mlp": 0.33227539, + "step": 2790, + "time_per_iteration": 2.6431193351745605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109405, + "balance_loss_mlp": 1.06119633, + "epoch": 0.5369372835706041, + "flos": 649934088192.0, + "grad_norm": 0.07763218475438792, + "language_loss": 0.77609432, + "learning_rate": 0.00046451403494876525, + "loss": 0.78703481, + "num_input_tokens_seen": 232941472, + "router_z_loss_mlp": 0.32861328, + "step": 2791, + "time_per_iteration": 2.860164165496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092942, + "balance_loss_mlp": 1.05918157, + "epoch": 0.5371296652558677, + "flos": 584489530368.0, + "grad_norm": 0.06279789357775317, + "language_loss": 0.84532517, + "learning_rate": 0.0004642032851595111, + "loss": 0.85625458, + "num_input_tokens_seen": 233017120, + "router_z_loss_mlp": 0.33789062, + "step": 2792, + "time_per_iteration": 2.7511003017425537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106074, + "balance_loss_mlp": 1.07081246, + "epoch": 0.5373220469411312, + "flos": 595872516096.0, + "grad_norm": 0.05029896863334896, + "language_loss": 0.85103881, + "learning_rate": 0.00046389254926777404, + "loss": 0.86209953, + "num_input_tokens_seen": 233095408, + "router_z_loss_mlp": 0.35253906, + "step": 2793, + "time_per_iteration": 2.7946324348449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105034, + "balance_loss_mlp": 1.07229924, + "epoch": 0.5375144286263948, + "flos": 1114426335744.0, + "grad_norm": 0.05473465194283574, + "language_loss": 0.78127646, + "learning_rate": 0.0004635818273941926, + "loss": 0.79232681, + "num_input_tokens_seen": 233191056, + "router_z_loss_mlp": 0.32739258, + "step": 2794, + "time_per_iteration": 3.5742921829223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109863, + "balance_loss_mlp": 1.07641304, + "epoch": 0.5377068103116583, + "flos": 595608215040.0, + "grad_norm": 0.07615315185796866, + "language_loss": 0.82079315, + "learning_rate": 0.0004632711196593997, + "loss": 0.83189178, + "num_input_tokens_seen": 233265536, + "router_z_loss_mlp": 0.3347168, + "step": 2795, + "time_per_iteration": 2.7694544792175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110741, + "balance_loss_mlp": 1.07907939, + "epoch": 0.5378991919969219, + "flos": 884200292352.0, + "grad_norm": 0.07020702036152926, + "language_loss": 0.85457337, + "learning_rate": 0.00046296042618402297, + "loss": 0.86568069, + "num_input_tokens_seen": 233348224, + "router_z_loss_mlp": 0.31640625, + "step": 2796, + "time_per_iteration": 3.0587034225463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109738, + "balance_loss_mlp": 1.07883883, + "epoch": 0.5380915736821854, + "flos": 710664523776.0, + "grad_norm": 0.06759922925686453, + "language_loss": 0.7969842, + "learning_rate": 0.0004626497470886839, + "loss": 0.80808163, + "num_input_tokens_seen": 233429344, + "router_z_loss_mlp": 0.30883789, + "step": 2797, + "time_per_iteration": 3.002824068069458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105945, + "balance_loss_mlp": 1.07299602, + "epoch": 0.538283955367449, + "flos": 556999151616.0, + "grad_norm": 0.07466819588637175, + "language_loss": 0.82158947, + "learning_rate": 0.00046233908249399897, + "loss": 0.83264899, + "num_input_tokens_seen": 233504944, + "router_z_loss_mlp": 0.32958984, + "step": 2798, + "time_per_iteration": 2.7746241092681885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097876, + "balance_loss_mlp": 1.06559372, + "epoch": 0.5384763370527126, + "flos": 513470762496.0, + "grad_norm": 0.05453000178981586, + "language_loss": 0.78238356, + "learning_rate": 0.00046202843252057905, + "loss": 0.79336226, + "num_input_tokens_seen": 233573072, + "router_z_loss_mlp": 0.32275391, + "step": 2799, + "time_per_iteration": 2.581350803375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097308, + "balance_loss_mlp": 1.06478727, + "epoch": 0.5386687187379762, + "flos": 489736046592.0, + "grad_norm": 0.06584834464031906, + "language_loss": 0.84020996, + "learning_rate": 0.00046171779728902896, + "loss": 0.85118306, + "num_input_tokens_seen": 233640896, + "router_z_loss_mlp": 0.32495117, + "step": 2800, + "time_per_iteration": 2.577760934829712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092142, + "balance_loss_mlp": 1.05988431, + "epoch": 0.5388611004232398, + "flos": 482657149440.0, + "grad_norm": 0.0769580423168035, + "language_loss": 0.85918987, + "learning_rate": 0.000461407176919948, + "loss": 0.87011129, + "num_input_tokens_seen": 233703904, + "router_z_loss_mlp": 0.32250977, + "step": 2801, + "time_per_iteration": 2.5490942001342773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093913, + "balance_loss_mlp": 1.06189322, + "epoch": 0.5390534821085032, + "flos": 560984610816.0, + "grad_norm": 0.05361052263899676, + "language_loss": 0.85168314, + "learning_rate": 0.00046109657153392997, + "loss": 0.86262226, + "num_input_tokens_seen": 233779248, + "router_z_loss_mlp": 0.32006836, + "step": 2802, + "time_per_iteration": 2.7699196338653564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095158, + "balance_loss_mlp": 1.06132686, + "epoch": 0.5392458637937668, + "flos": 488377092096.0, + "grad_norm": 0.07003946535384918, + "language_loss": 0.82877356, + "learning_rate": 0.0004607859812515622, + "loss": 0.83972514, + "num_input_tokens_seen": 233847520, + "router_z_loss_mlp": 0.33862305, + "step": 2803, + "time_per_iteration": 2.6007485389709473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093716, + "balance_loss_mlp": 1.06198251, + "epoch": 0.5394382454790304, + "flos": 512057479680.0, + "grad_norm": 0.06322278970979951, + "language_loss": 0.88066649, + "learning_rate": 0.00046047540619342667, + "loss": 0.89160359, + "num_input_tokens_seen": 233911328, + "router_z_loss_mlp": 0.31713867, + "step": 2804, + "time_per_iteration": 2.5943124294281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092603, + "balance_loss_mlp": 1.06163239, + "epoch": 0.539630627164294, + "flos": 567586662912.0, + "grad_norm": 0.060964528711389604, + "language_loss": 0.80115181, + "learning_rate": 0.00046016484648009933, + "loss": 0.81207782, + "num_input_tokens_seen": 233987104, + "router_z_loss_mlp": 0.30957031, + "step": 2805, + "time_per_iteration": 2.707387924194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096878, + "balance_loss_mlp": 1.0659312, + "epoch": 0.5398230088495575, + "flos": 526462322688.0, + "grad_norm": 0.05960799154457967, + "language_loss": 0.80838758, + "learning_rate": 0.0004598543022321501, + "loss": 0.81935638, + "num_input_tokens_seen": 234057216, + "router_z_loss_mlp": 0.30908203, + "step": 2806, + "time_per_iteration": 2.606360673904419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103257, + "balance_loss_mlp": 1.07080865, + "epoch": 0.5400153905348211, + "flos": 538764493824.0, + "grad_norm": 0.059370042319646085, + "language_loss": 0.80030453, + "learning_rate": 0.0004595437735701433, + "loss": 0.81133705, + "num_input_tokens_seen": 234129984, + "router_z_loss_mlp": 0.32446289, + "step": 2807, + "time_per_iteration": 2.674914836883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096509, + "balance_loss_mlp": 1.06448901, + "epoch": 0.5402077722200846, + "flos": 513539771904.0, + "grad_norm": 0.07129928038264445, + "language_loss": 0.83467567, + "learning_rate": 0.00045923326061463623, + "loss": 0.84564078, + "num_input_tokens_seen": 234203920, + "router_z_loss_mlp": 0.32006836, + "step": 2808, + "time_per_iteration": 2.7732136249542236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093728, + "balance_loss_mlp": 1.0615654, + "epoch": 0.5404001539053482, + "flos": 676258232832.0, + "grad_norm": 0.061183409959599915, + "language_loss": 0.81861985, + "learning_rate": 0.00045892276348618113, + "loss": 0.82955706, + "num_input_tokens_seen": 234285440, + "router_z_loss_mlp": 0.3215332, + "step": 2809, + "time_per_iteration": 2.9496963024139404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041005, + "balance_loss_mlp": 1.03318524, + "epoch": 0.5405925355906118, + "flos": 1554834009600.0, + "grad_norm": 0.03295349175272743, + "language_loss": 0.78260827, + "learning_rate": 0.0004586122823053235, + "loss": 0.79301834, + "num_input_tokens_seen": 234521424, + "router_z_loss_mlp": 0.078125, + "step": 2810, + "time_per_iteration": 4.980771064758301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095175, + "balance_loss_mlp": 1.06375122, + "epoch": 0.5407849172758753, + "flos": 647310154752.0, + "grad_norm": 0.048089637178950914, + "language_loss": 0.80807102, + "learning_rate": 0.000458301817192603, + "loss": 0.81902277, + "num_input_tokens_seen": 234601632, + "router_z_loss_mlp": 0.31396484, + "step": 2811, + "time_per_iteration": 2.819394111633301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014174, + "balance_loss_mlp": 1.00659227, + "epoch": 0.5409772989611389, + "flos": 1407407643648.0, + "grad_norm": 0.018125943247431338, + "language_loss": 0.8084178, + "learning_rate": 0.00045799136826855263, + "loss": 0.81855953, + "num_input_tokens_seen": 234825776, + "router_z_loss_mlp": 0.07568359, + "step": 2812, + "time_per_iteration": 4.830869197845459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094625, + "balance_loss_mlp": 1.06312966, + "epoch": 0.5411696806464025, + "flos": 554389899264.0, + "grad_norm": 0.07142535441885249, + "language_loss": 0.8774603, + "learning_rate": 0.00045768093565369983, + "loss": 0.88840652, + "num_input_tokens_seen": 234901504, + "router_z_loss_mlp": 0.31494141, + "step": 2813, + "time_per_iteration": 2.7351324558258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101251, + "balance_loss_mlp": 1.06911242, + "epoch": 0.5413620623316661, + "flos": 528122654208.0, + "grad_norm": 0.0566212514723048, + "language_loss": 0.82215679, + "learning_rate": 0.0004573705194685646, + "loss": 0.83316934, + "num_input_tokens_seen": 234970288, + "router_z_loss_mlp": 0.32128906, + "step": 2814, + "time_per_iteration": 2.6945576667785645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100716, + "balance_loss_mlp": 1.06860089, + "epoch": 0.5415544440169295, + "flos": 598741300224.0, + "grad_norm": 0.06333436634677812, + "language_loss": 0.85428321, + "learning_rate": 0.00045706011983366157, + "loss": 0.86529034, + "num_input_tokens_seen": 235039984, + "router_z_loss_mlp": 0.32080078, + "step": 2815, + "time_per_iteration": 2.681619882583618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108066, + "balance_loss_mlp": 1.07623768, + "epoch": 0.5417468257021931, + "flos": 470757671424.0, + "grad_norm": 0.068256039366798, + "language_loss": 0.8269453, + "learning_rate": 0.00045674973686949847, + "loss": 0.83802599, + "num_input_tokens_seen": 235105232, + "router_z_loss_mlp": 0.31835938, + "step": 2816, + "time_per_iteration": 2.5405073165893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109346, + "balance_loss_mlp": 1.07830381, + "epoch": 0.5419392073874567, + "flos": 680819281920.0, + "grad_norm": 0.0555657817841838, + "language_loss": 0.85590029, + "learning_rate": 0.0004564393706965766, + "loss": 0.86699367, + "num_input_tokens_seen": 235192560, + "router_z_loss_mlp": 0.31005859, + "step": 2817, + "time_per_iteration": 2.9834089279174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102481, + "balance_loss_mlp": 1.07079506, + "epoch": 0.5421315890727203, + "flos": 462374148096.0, + "grad_norm": 0.052731051534337416, + "language_loss": 0.81111342, + "learning_rate": 0.00045612902143539116, + "loss": 0.82213825, + "num_input_tokens_seen": 235258448, + "router_z_loss_mlp": 0.31665039, + "step": 2818, + "time_per_iteration": 2.5867249965667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099209, + "balance_loss_mlp": 1.06935942, + "epoch": 0.5423239707579839, + "flos": 436959277056.0, + "grad_norm": 0.08027643777933474, + "language_loss": 0.82169372, + "learning_rate": 0.00045581868920642986, + "loss": 0.83268583, + "num_input_tokens_seen": 235322176, + "router_z_loss_mlp": 0.29833984, + "step": 2819, + "time_per_iteration": 2.538219928741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100605, + "balance_loss_mlp": 1.06968212, + "epoch": 0.5425163524432474, + "flos": 458314536960.0, + "grad_norm": 0.056746529630016036, + "language_loss": 0.79290533, + "learning_rate": 0.00045550837413017457, + "loss": 0.80391139, + "num_input_tokens_seen": 235390960, + "router_z_loss_mlp": 0.30883789, + "step": 2820, + "time_per_iteration": 2.6461877822875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100089, + "balance_loss_mlp": 1.06995249, + "epoch": 0.542708734128511, + "flos": 419495500800.0, + "grad_norm": 0.06471497165860861, + "language_loss": 0.85196662, + "learning_rate": 0.0004551980763271005, + "loss": 0.86296749, + "num_input_tokens_seen": 235460976, + "router_z_loss_mlp": 0.30102539, + "step": 2821, + "time_per_iteration": 2.6883745193481445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102076, + "balance_loss_mlp": 1.07015133, + "epoch": 0.5429011158137745, + "flos": 678454880256.0, + "grad_norm": 0.058885459141671155, + "language_loss": 0.84080005, + "learning_rate": 0.0004548877959176756, + "loss": 0.85182083, + "num_input_tokens_seen": 235540912, + "router_z_loss_mlp": 0.3190918, + "step": 2822, + "time_per_iteration": 2.861867666244507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096714, + "balance_loss_mlp": 1.06595802, + "epoch": 0.5430934974990381, + "flos": 540924065280.0, + "grad_norm": 0.06589540393120931, + "language_loss": 0.86233151, + "learning_rate": 0.00045457753302236166, + "loss": 0.87329865, + "num_input_tokens_seen": 235608736, + "router_z_loss_mlp": 0.30737305, + "step": 2823, + "time_per_iteration": 2.687164068222046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097063, + "balance_loss_mlp": 1.06685555, + "epoch": 0.5432858791843016, + "flos": 658468486656.0, + "grad_norm": 0.07425338305054356, + "language_loss": 0.87034917, + "learning_rate": 0.00045426728776161353, + "loss": 0.88131976, + "num_input_tokens_seen": 235678720, + "router_z_loss_mlp": 0.30175781, + "step": 2824, + "time_per_iteration": 2.7938835620880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104052, + "balance_loss_mlp": 1.07224679, + "epoch": 0.5434782608695652, + "flos": 531935216640.0, + "grad_norm": 0.05711338707468448, + "language_loss": 0.81608665, + "learning_rate": 0.00045395706025587863, + "loss": 0.82712722, + "num_input_tokens_seen": 235748704, + "router_z_loss_mlp": 0.31787109, + "step": 2825, + "time_per_iteration": 2.6212074756622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099159, + "balance_loss_mlp": 1.06907105, + "epoch": 0.5436706425548288, + "flos": 608501030400.0, + "grad_norm": 0.07865669635555295, + "language_loss": 0.8299852, + "learning_rate": 0.00045364685062559843, + "loss": 0.84097683, + "num_input_tokens_seen": 235828224, + "router_z_loss_mlp": 0.30078125, + "step": 2826, + "time_per_iteration": 2.8868184089660645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104022, + "balance_loss_mlp": 1.07505381, + "epoch": 0.5438630242400924, + "flos": 705418854912.0, + "grad_norm": 0.06023434626032839, + "language_loss": 0.91765273, + "learning_rate": 0.0004533366589912067, + "loss": 0.92869294, + "num_input_tokens_seen": 235909392, + "router_z_loss_mlp": 0.28955078, + "step": 2827, + "time_per_iteration": 2.9981062412261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105445, + "balance_loss_mlp": 1.07557106, + "epoch": 0.544055405925356, + "flos": 856425788928.0, + "grad_norm": 0.06990968055660145, + "language_loss": 0.78070033, + "learning_rate": 0.0004530264854731306, + "loss": 0.79175478, + "num_input_tokens_seen": 235983888, + "router_z_loss_mlp": 0.29858398, + "step": 2828, + "time_per_iteration": 3.0054330825805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107215, + "balance_loss_mlp": 1.07605386, + "epoch": 0.5442477876106194, + "flos": 571779523584.0, + "grad_norm": 0.05020371190449787, + "language_loss": 0.84383601, + "learning_rate": 0.00045271633019179034, + "loss": 0.85490811, + "num_input_tokens_seen": 236063056, + "router_z_loss_mlp": 0.3112793, + "step": 2829, + "time_per_iteration": 2.775956630706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107068, + "balance_loss_mlp": 1.07605028, + "epoch": 0.544440169295883, + "flos": 625556971008.0, + "grad_norm": 0.05805566098722391, + "language_loss": 0.88203323, + "learning_rate": 0.0004524061932675986, + "loss": 0.8931039, + "num_input_tokens_seen": 236141104, + "router_z_loss_mlp": 0.30981445, + "step": 2830, + "time_per_iteration": 2.8221793174743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106595, + "balance_loss_mlp": 1.07555294, + "epoch": 0.5446325509811466, + "flos": 836244103680.0, + "grad_norm": 0.0740029895366448, + "language_loss": 0.87459874, + "learning_rate": 0.00045209607482096125, + "loss": 0.8856647, + "num_input_tokens_seen": 236220320, + "router_z_loss_mlp": 0.31005859, + "step": 2831, + "time_per_iteration": 3.0393142700195312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102187, + "balance_loss_mlp": 1.0710969, + "epoch": 0.5448249326664102, + "flos": 483381043200.0, + "grad_norm": 0.08209208283258153, + "language_loss": 0.84651136, + "learning_rate": 0.0004517859749722772, + "loss": 0.85753322, + "num_input_tokens_seen": 236288208, + "router_z_loss_mlp": 0.31054688, + "step": 2832, + "time_per_iteration": 2.6821095943450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105232, + "balance_loss_mlp": 1.07368898, + "epoch": 0.5450173143516738, + "flos": 561107948544.0, + "grad_norm": 0.07359331276456935, + "language_loss": 0.79572821, + "learning_rate": 0.0004514758938419376, + "loss": 0.80678058, + "num_input_tokens_seen": 236366864, + "router_z_loss_mlp": 0.31518555, + "step": 2833, + "time_per_iteration": 2.8375093936920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080375, + "balance_loss_mlp": 1.07288861, + "epoch": 0.5452096960369373, + "flos": 1470420988416.0, + "grad_norm": 0.03314547284214794, + "language_loss": 0.76920587, + "learning_rate": 0.0004511658315503268, + "loss": 0.78000963, + "num_input_tokens_seen": 236597120, + "router_z_loss_mlp": 0.07470703, + "step": 2834, + "time_per_iteration": 4.963228225708008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099746, + "balance_loss_mlp": 1.06930006, + "epoch": 0.5454020777222008, + "flos": 465064892928.0, + "grad_norm": 0.057187543491433894, + "language_loss": 0.83827722, + "learning_rate": 0.00045085578821782175, + "loss": 0.84927469, + "num_input_tokens_seen": 236664192, + "router_z_loss_mlp": 0.3046875, + "step": 2835, + "time_per_iteration": 2.5562217235565186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054355, + "balance_loss_mlp": 1.04696393, + "epoch": 0.5455944594074644, + "flos": 1469657820672.0, + "grad_norm": 0.02358753311446476, + "language_loss": 0.76134741, + "learning_rate": 0.0004505457639647917, + "loss": 0.77189088, + "num_input_tokens_seen": 236888784, + "router_z_loss_mlp": 0.07373047, + "step": 2836, + "time_per_iteration": 4.959676742553711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100094, + "balance_loss_mlp": 1.06983829, + "epoch": 0.545786841092728, + "flos": 533180371968.0, + "grad_norm": 0.0408398110042356, + "language_loss": 0.80949795, + "learning_rate": 0.00045023575891159866, + "loss": 0.82049894, + "num_input_tokens_seen": 236962528, + "router_z_loss_mlp": 0.30200195, + "step": 2837, + "time_per_iteration": 2.74700665473938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102638, + "balance_loss_mlp": 1.01894093, + "epoch": 0.5459792227779915, + "flos": 1352389810176.0, + "grad_norm": 0.01524116386105569, + "language_loss": 0.74763811, + "learning_rate": 0.00044992577317859764, + "loss": 0.75790191, + "num_input_tokens_seen": 237179360, + "router_z_loss_mlp": 0.07421875, + "step": 2838, + "time_per_iteration": 4.9733850955963135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103445, + "balance_loss_mlp": 1.07366681, + "epoch": 0.5461716044632551, + "flos": 637881537024.0, + "grad_norm": 0.05292635351535042, + "language_loss": 0.78244042, + "learning_rate": 0.0004496158068861354, + "loss": 0.79347491, + "num_input_tokens_seen": 237256240, + "router_z_loss_mlp": 0.29760742, + "step": 2839, + "time_per_iteration": 2.8023805618286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110962, + "balance_loss_mlp": 1.08010423, + "epoch": 0.5463639861485187, + "flos": 602751352320.0, + "grad_norm": 0.0535580092110964, + "language_loss": 0.80844593, + "learning_rate": 0.00044930586015455207, + "loss": 0.81954211, + "num_input_tokens_seen": 237334272, + "router_z_loss_mlp": 0.29492188, + "step": 2840, + "time_per_iteration": 2.816567897796631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118684, + "balance_loss_mlp": 1.08804703, + "epoch": 0.5465563678337823, + "flos": 642516738048.0, + "grad_norm": 0.06541969342762931, + "language_loss": 0.89212978, + "learning_rate": 0.000448995933104179, + "loss": 0.90331668, + "num_input_tokens_seen": 237415408, + "router_z_loss_mlp": 0.3059082, + "step": 2841, + "time_per_iteration": 2.903371810913086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115887, + "balance_loss_mlp": 1.08601356, + "epoch": 0.5467487495190458, + "flos": 614154161664.0, + "grad_norm": 0.06848140377985366, + "language_loss": 0.80388117, + "learning_rate": 0.00044868602585534077, + "loss": 0.81504011, + "num_input_tokens_seen": 237493232, + "router_z_loss_mlp": 0.29833984, + "step": 2842, + "time_per_iteration": 2.870833396911621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104882, + "balance_loss_mlp": 1.07519853, + "epoch": 0.5469411312043093, + "flos": 461190661632.0, + "grad_norm": 0.06871095275450309, + "language_loss": 0.89058006, + "learning_rate": 0.0004483761385283541, + "loss": 0.90162885, + "num_input_tokens_seen": 237556624, + "router_z_loss_mlp": 0.29663086, + "step": 2843, + "time_per_iteration": 2.5367324352264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099916, + "balance_loss_mlp": 1.06863523, + "epoch": 0.5471335128895729, + "flos": 561197154816.0, + "grad_norm": 0.05633892340966096, + "language_loss": 0.81610817, + "learning_rate": 0.0004480662712435281, + "loss": 0.82710731, + "num_input_tokens_seen": 237632048, + "router_z_loss_mlp": 0.3125, + "step": 2844, + "time_per_iteration": 2.8301496505737305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092142, + "balance_loss_mlp": 1.0627687, + "epoch": 0.5473258945748365, + "flos": 518686695936.0, + "grad_norm": 0.05986468354955699, + "language_loss": 0.88694894, + "learning_rate": 0.0004477564241211635, + "loss": 0.89787042, + "num_input_tokens_seen": 237699840, + "router_z_loss_mlp": 0.2935791, + "step": 2845, + "time_per_iteration": 2.5813820362091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086916, + "balance_loss_mlp": 1.05787718, + "epoch": 0.5475182762601001, + "flos": 433828763136.0, + "grad_norm": 0.059098326299960216, + "language_loss": 0.87329561, + "learning_rate": 0.0004474465972815541, + "loss": 0.88416475, + "num_input_tokens_seen": 237762560, + "router_z_loss_mlp": 0.2902832, + "step": 2846, + "time_per_iteration": 2.494866132736206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085322, + "balance_loss_mlp": 1.05730796, + "epoch": 0.5477106579453636, + "flos": 511560811008.0, + "grad_norm": 0.05595262091427783, + "language_loss": 0.87812984, + "learning_rate": 0.000447136790844985, + "loss": 0.88898313, + "num_input_tokens_seen": 237837152, + "router_z_loss_mlp": 0.28027344, + "step": 2847, + "time_per_iteration": 2.698451042175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086514, + "balance_loss_mlp": 1.05726016, + "epoch": 0.5479030396306271, + "flos": 675912439296.0, + "grad_norm": 0.06538513229207209, + "language_loss": 0.81294727, + "learning_rate": 0.00044682700493173385, + "loss": 0.82381248, + "num_input_tokens_seen": 237909488, + "router_z_loss_mlp": 0.29223633, + "step": 2848, + "time_per_iteration": 2.8252742290496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085079, + "balance_loss_mlp": 1.05441868, + "epoch": 0.5480954213158907, + "flos": 876090981888.0, + "grad_norm": 0.06259253721450928, + "language_loss": 0.80796725, + "learning_rate": 0.00044651723966207004, + "loss": 0.81881809, + "num_input_tokens_seen": 237991056, + "router_z_loss_mlp": 0.30639648, + "step": 2849, + "time_per_iteration": 3.093806505203247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083876, + "balance_loss_mlp": 1.05424023, + "epoch": 0.5482878030011543, + "flos": 622006511616.0, + "grad_norm": 0.05680096345280931, + "language_loss": 0.78538483, + "learning_rate": 0.00044620749515625536, + "loss": 0.79622364, + "num_input_tokens_seen": 238064576, + "router_z_loss_mlp": 0.29614258, + "step": 2850, + "time_per_iteration": 2.759477376937866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083552, + "balance_loss_mlp": 1.0532248, + "epoch": 0.5484801846864179, + "flos": 497207725056.0, + "grad_norm": 0.054672764420471885, + "language_loss": 0.85281622, + "learning_rate": 0.00044589777153454334, + "loss": 0.86365175, + "num_input_tokens_seen": 238136464, + "router_z_loss_mlp": 0.30297852, + "step": 2851, + "time_per_iteration": 2.7247886657714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082527, + "balance_loss_mlp": 1.0519855, + "epoch": 0.5486725663716814, + "flos": 442432171008.0, + "grad_norm": 0.05927586181396917, + "language_loss": 0.83792317, + "learning_rate": 0.00044558806891717895, + "loss": 0.84874845, + "num_input_tokens_seen": 238198912, + "router_z_loss_mlp": 0.30493164, + "step": 2852, + "time_per_iteration": 2.480499267578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078563, + "balance_loss_mlp": 1.04847419, + "epoch": 0.548864948056945, + "flos": 655162504704.0, + "grad_norm": 0.06995220773511122, + "language_loss": 0.79820019, + "learning_rate": 0.0004452783874243998, + "loss": 0.80898583, + "num_input_tokens_seen": 238275184, + "router_z_loss_mlp": 0.30053711, + "step": 2853, + "time_per_iteration": 2.815159559249878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083773, + "balance_loss_mlp": 1.05354142, + "epoch": 0.5490573297422086, + "flos": 546036111360.0, + "grad_norm": 0.0871194319773747, + "language_loss": 0.8473509, + "learning_rate": 0.00044496872717643475, + "loss": 0.85818863, + "num_input_tokens_seen": 238348496, + "router_z_loss_mlp": 0.30200195, + "step": 2854, + "time_per_iteration": 2.671760320663452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01029437, + "balance_loss_mlp": 1.02099681, + "epoch": 0.5492497114274721, + "flos": 1590309987840.0, + "grad_norm": 0.022692984636718958, + "language_loss": 0.77089292, + "learning_rate": 0.00044465908829350453, + "loss": 0.7811873, + "num_input_tokens_seen": 238578464, + "router_z_loss_mlp": 0.08447266, + "step": 2855, + "time_per_iteration": 4.943760633468628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080585, + "balance_loss_mlp": 1.05152166, + "epoch": 0.5494420931127356, + "flos": 750906754560.0, + "grad_norm": 0.08481580298187671, + "language_loss": 0.82385266, + "learning_rate": 0.0004443494708958217, + "loss": 0.83465844, + "num_input_tokens_seen": 238660256, + "router_z_loss_mlp": 0.2902832, + "step": 2856, + "time_per_iteration": 2.9592692852020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081194, + "balance_loss_mlp": 1.05131996, + "epoch": 0.5496344747979992, + "flos": 626023904256.0, + "grad_norm": 0.054737825906261944, + "language_loss": 0.81019336, + "learning_rate": 0.0004440398751035906, + "loss": 0.82100528, + "num_input_tokens_seen": 238745856, + "router_z_loss_mlp": 0.29858398, + "step": 2857, + "time_per_iteration": 2.8660449981689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086545, + "balance_loss_mlp": 1.05612314, + "epoch": 0.5498268564832628, + "flos": 523111924224.0, + "grad_norm": 0.07506614425197558, + "language_loss": 0.84203708, + "learning_rate": 0.00044373030103700645, + "loss": 0.85290253, + "num_input_tokens_seen": 238813888, + "router_z_loss_mlp": 0.30395508, + "step": 2858, + "time_per_iteration": 2.589571475982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086143, + "balance_loss_mlp": 1.05769968, + "epoch": 0.5500192381685264, + "flos": 604587151872.0, + "grad_norm": 0.06400511299844665, + "language_loss": 0.80211353, + "learning_rate": 0.000443420748816257, + "loss": 0.81297493, + "num_input_tokens_seen": 238885440, + "router_z_loss_mlp": 0.28442383, + "step": 2859, + "time_per_iteration": 2.775573492050171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089751, + "balance_loss_mlp": 1.05894732, + "epoch": 0.55021161985379, + "flos": 520527264768.0, + "grad_norm": 0.05990515883462961, + "language_loss": 0.78525764, + "learning_rate": 0.0004431112185615208, + "loss": 0.7961551, + "num_input_tokens_seen": 238960944, + "router_z_loss_mlp": 0.30786133, + "step": 2860, + "time_per_iteration": 2.79428768157959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099065, + "balance_loss_mlp": 1.06942964, + "epoch": 0.5504040015390534, + "flos": 489671806464.0, + "grad_norm": 0.08012396807897051, + "language_loss": 0.80142951, + "learning_rate": 0.00044280171039296845, + "loss": 0.81242013, + "num_input_tokens_seen": 239030592, + "router_z_loss_mlp": 0.29589844, + "step": 2861, + "time_per_iteration": 2.6075713634490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097414, + "balance_loss_mlp": 1.06808829, + "epoch": 0.550596383224317, + "flos": 575787377664.0, + "grad_norm": 0.055527438655266555, + "language_loss": 0.88317382, + "learning_rate": 0.0004424922244307616, + "loss": 0.89414799, + "num_input_tokens_seen": 239097440, + "router_z_loss_mlp": 0.29321289, + "step": 2862, + "time_per_iteration": 2.6453704833984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093253, + "balance_loss_mlp": 1.06306958, + "epoch": 0.5507887649095806, + "flos": 642445157376.0, + "grad_norm": 0.0988044596240084, + "language_loss": 0.82273299, + "learning_rate": 0.00044218276079505315, + "loss": 0.83366549, + "num_input_tokens_seen": 239179872, + "router_z_loss_mlp": 0.30151367, + "step": 2863, + "time_per_iteration": 2.8583548069000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093494, + "balance_loss_mlp": 1.0636915, + "epoch": 0.5509811465948442, + "flos": 531843812352.0, + "grad_norm": 0.15366013773450377, + "language_loss": 0.74783754, + "learning_rate": 0.0004418733196059876, + "loss": 0.75877243, + "num_input_tokens_seen": 239251264, + "router_z_loss_mlp": 0.29760742, + "step": 2864, + "time_per_iteration": 2.6593546867370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092739, + "balance_loss_mlp": 1.06398571, + "epoch": 0.5511735282801077, + "flos": 654747328512.0, + "grad_norm": 0.05392307081741782, + "language_loss": 0.80104017, + "learning_rate": 0.0004415639009837008, + "loss": 0.81196761, + "num_input_tokens_seen": 239326688, + "router_z_loss_mlp": 0.28759766, + "step": 2865, + "time_per_iteration": 2.8184585571289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096337, + "balance_loss_mlp": 1.06660628, + "epoch": 0.5513659099653713, + "flos": 529498861056.0, + "grad_norm": 0.0621710106813525, + "language_loss": 0.8235333, + "learning_rate": 0.00044125450504831955, + "loss": 0.83449662, + "num_input_tokens_seen": 239401248, + "router_z_loss_mlp": 0.29711914, + "step": 2866, + "time_per_iteration": 2.734349489212036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086542, + "balance_loss_mlp": 1.05592918, + "epoch": 0.5515582916506349, + "flos": 554869315584.0, + "grad_norm": 0.06271512147953057, + "language_loss": 0.82752901, + "learning_rate": 0.0004409451319199622, + "loss": 0.83839446, + "num_input_tokens_seen": 239471600, + "router_z_loss_mlp": 0.30566406, + "step": 2867, + "time_per_iteration": 2.683742046356201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095264, + "balance_loss_mlp": 1.06417394, + "epoch": 0.5517506733358984, + "flos": 735407258112.0, + "grad_norm": 0.07258101504897169, + "language_loss": 0.84457368, + "learning_rate": 0.0004406357817187381, + "loss": 0.85552633, + "num_input_tokens_seen": 239548592, + "router_z_loss_mlp": 0.31054688, + "step": 2868, + "time_per_iteration": 3.0147883892059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103312, + "balance_loss_mlp": 1.07379591, + "epoch": 0.551943055021162, + "flos": 1115325697536.0, + "grad_norm": 0.05164398294731223, + "language_loss": 0.81673765, + "learning_rate": 0.0004403264545647474, + "loss": 0.82777071, + "num_input_tokens_seen": 239644432, + "router_z_loss_mlp": 0.29492188, + "step": 2869, + "time_per_iteration": 3.5095975399017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107006, + "balance_loss_mlp": 1.07603574, + "epoch": 0.5521354367064255, + "flos": 544373208576.0, + "grad_norm": 0.04919714399659635, + "language_loss": 0.85006267, + "learning_rate": 0.00044001715057808154, + "loss": 0.86113274, + "num_input_tokens_seen": 239723392, + "router_z_loss_mlp": 0.30932617, + "step": 2870, + "time_per_iteration": 2.759791851043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114514, + "balance_loss_mlp": 1.08330536, + "epoch": 0.5523278183916891, + "flos": 936285101568.0, + "grad_norm": 0.06727866309699267, + "language_loss": 0.81942332, + "learning_rate": 0.0004397078698788232, + "loss": 0.83056843, + "num_input_tokens_seen": 239806896, + "router_z_loss_mlp": 0.31176758, + "step": 2871, + "time_per_iteration": 3.21431040763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104908, + "balance_loss_mlp": 1.09441757, + "epoch": 0.5525202000769527, + "flos": 1465911696384.0, + "grad_norm": 0.04310408533027141, + "language_loss": 0.80442369, + "learning_rate": 0.0004393986125870456, + "loss": 0.81547272, + "num_input_tokens_seen": 240037824, + "router_z_loss_mlp": 0.10498047, + "step": 2872, + "time_per_iteration": 4.941087484359741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114234, + "balance_loss_mlp": 1.082739, + "epoch": 0.5527125817622163, + "flos": 489800286720.0, + "grad_norm": 0.05898932962157328, + "language_loss": 0.78340954, + "learning_rate": 0.00043908937882281343, + "loss": 0.79455185, + "num_input_tokens_seen": 240107952, + "router_z_loss_mlp": 0.31469727, + "step": 2873, + "time_per_iteration": 2.577866554260254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116939, + "balance_loss_mlp": 1.08501506, + "epoch": 0.5529049634474797, + "flos": 634914008064.0, + "grad_norm": 0.05969066171006231, + "language_loss": 0.82846034, + "learning_rate": 0.0004387801687061814, + "loss": 0.83962971, + "num_input_tokens_seen": 240183824, + "router_z_loss_mlp": 0.3190918, + "step": 2874, + "time_per_iteration": 2.8184196949005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117603, + "balance_loss_mlp": 1.08489251, + "epoch": 0.5530973451327433, + "flos": 581274952704.0, + "grad_norm": 0.05481480847886404, + "language_loss": 0.80685902, + "learning_rate": 0.0004384709823571958, + "loss": 0.81803501, + "num_input_tokens_seen": 240259296, + "router_z_loss_mlp": 0.32714844, + "step": 2875, + "time_per_iteration": 2.7496426105499268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105595, + "balance_loss_mlp": 1.07519674, + "epoch": 0.5532897268180069, + "flos": 1122488658432.0, + "grad_norm": 0.0703745986604158, + "language_loss": 0.83230788, + "learning_rate": 0.0004381618198958932, + "loss": 0.84336388, + "num_input_tokens_seen": 240346768, + "router_z_loss_mlp": 0.30371094, + "step": 2876, + "time_per_iteration": 3.4905495643615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110669, + "balance_loss_mlp": 1.07662511, + "epoch": 0.5534821085032705, + "flos": 637273640448.0, + "grad_norm": 0.06448913307816859, + "language_loss": 0.84021735, + "learning_rate": 0.00043785268144230137, + "loss": 0.85128427, + "num_input_tokens_seen": 240429344, + "router_z_loss_mlp": 0.30029297, + "step": 2877, + "time_per_iteration": 2.907133102416992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102032, + "balance_loss_mlp": 1.07203865, + "epoch": 0.5536744901885341, + "flos": 571112529408.0, + "grad_norm": 0.0731230974557418, + "language_loss": 0.82496381, + "learning_rate": 0.00043754356711643837, + "loss": 0.83598411, + "num_input_tokens_seen": 240497008, + "router_z_loss_mlp": 0.29980469, + "step": 2878, + "time_per_iteration": 2.715023994445801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097061, + "balance_loss_mlp": 1.06609011, + "epoch": 0.5538668718737976, + "flos": 595716871680.0, + "grad_norm": 0.0760081782140183, + "language_loss": 0.83909559, + "learning_rate": 0.0004372344770383132, + "loss": 0.85006618, + "num_input_tokens_seen": 240578432, + "router_z_loss_mlp": 0.30932617, + "step": 2879, + "time_per_iteration": 2.822368621826172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097203, + "balance_loss_mlp": 1.06756735, + "epoch": 0.5540592535590612, + "flos": 532602210816.0, + "grad_norm": 0.06372253861737541, + "language_loss": 0.83305293, + "learning_rate": 0.00043692541132792507, + "loss": 0.84402496, + "num_input_tokens_seen": 240649136, + "router_z_loss_mlp": 0.29614258, + "step": 2880, + "time_per_iteration": 2.7154414653778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093507, + "balance_loss_mlp": 1.06349051, + "epoch": 0.5542516352443247, + "flos": 412619235840.0, + "grad_norm": 0.057885594640944824, + "language_loss": 0.83464789, + "learning_rate": 0.00043661637010526384, + "loss": 0.84558296, + "num_input_tokens_seen": 240714240, + "router_z_loss_mlp": 0.30004883, + "step": 2881, + "time_per_iteration": 2.507059097290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092859, + "balance_loss_mlp": 1.06255555, + "epoch": 0.5544440169295883, + "flos": 547607609856.0, + "grad_norm": 0.08329174894233551, + "language_loss": 0.83249325, + "learning_rate": 0.00043630735349031025, + "loss": 0.84342188, + "num_input_tokens_seen": 240786928, + "router_z_loss_mlp": 0.30273438, + "step": 2882, + "time_per_iteration": 2.644418478012085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090659, + "balance_loss_mlp": 1.06216836, + "epoch": 0.5546363986148518, + "flos": 621821131776.0, + "grad_norm": 0.047753182436236, + "language_loss": 0.81861913, + "learning_rate": 0.00043599836160303495, + "loss": 0.82952571, + "num_input_tokens_seen": 240865328, + "router_z_loss_mlp": 0.28491211, + "step": 2883, + "time_per_iteration": 2.8971407413482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090292, + "balance_loss_mlp": 1.06160986, + "epoch": 0.5548287803001154, + "flos": 705292945920.0, + "grad_norm": 0.057456379562134556, + "language_loss": 0.77759755, + "learning_rate": 0.0004356893945633995, + "loss": 0.78850043, + "num_input_tokens_seen": 240945680, + "router_z_loss_mlp": 0.28649902, + "step": 2884, + "time_per_iteration": 2.937133312225342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094576, + "balance_loss_mlp": 1.06620383, + "epoch": 0.555021161985379, + "flos": 504197789184.0, + "grad_norm": 0.05754228747661135, + "language_loss": 0.81617516, + "learning_rate": 0.0004353804524913551, + "loss": 0.82712096, + "num_input_tokens_seen": 241010800, + "router_z_loss_mlp": 0.28344727, + "step": 2885, + "time_per_iteration": 2.579535722732544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109656, + "balance_loss_mlp": 1.08028293, + "epoch": 0.5552135436706426, + "flos": 616066684416.0, + "grad_norm": 0.06485446309889223, + "language_loss": 0.81926423, + "learning_rate": 0.0004350715355068441, + "loss": 0.83036083, + "num_input_tokens_seen": 241085328, + "router_z_loss_mlp": 0.29345703, + "step": 2886, + "time_per_iteration": 2.709717273712158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111013, + "balance_loss_mlp": 1.08142567, + "epoch": 0.5554059253559062, + "flos": 463871494656.0, + "grad_norm": 0.066893347852213, + "language_loss": 0.7961694, + "learning_rate": 0.00043476264372979847, + "loss": 0.80727959, + "num_input_tokens_seen": 241149600, + "router_z_loss_mlp": 0.2956543, + "step": 2887, + "time_per_iteration": 2.5216078758239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113, + "balance_loss_mlp": 1.08441329, + "epoch": 0.5555983070411696, + "flos": 1562512384512.0, + "grad_norm": 0.0640996529430707, + "language_loss": 0.78604692, + "learning_rate": 0.0004344537772801408, + "loss": 0.7971769, + "num_input_tokens_seen": 241244832, + "router_z_loss_mlp": 0.28540039, + "step": 2888, + "time_per_iteration": 3.8132436275482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065874, + "balance_loss_mlp": 1.05838752, + "epoch": 0.5557906887264332, + "flos": 1467917821440.0, + "grad_norm": 0.028482200170008867, + "language_loss": 0.73422456, + "learning_rate": 0.0004341449362777836, + "loss": 0.7448833, + "num_input_tokens_seen": 241479728, + "router_z_loss_mlp": 0.07470703, + "step": 2889, + "time_per_iteration": 4.947216987609863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117814, + "balance_loss_mlp": 1.08801198, + "epoch": 0.5559830704116968, + "flos": 529832544768.0, + "grad_norm": 0.06792095354006551, + "language_loss": 0.83771884, + "learning_rate": 0.0004338361208426298, + "loss": 0.84889698, + "num_input_tokens_seen": 241545616, + "router_z_loss_mlp": 0.29760742, + "step": 2890, + "time_per_iteration": 2.631476879119873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113406, + "balance_loss_mlp": 1.08350825, + "epoch": 0.5561754520969604, + "flos": 651218890752.0, + "grad_norm": 0.05967481099781226, + "language_loss": 0.81602627, + "learning_rate": 0.00043352733109457164, + "loss": 0.82716036, + "num_input_tokens_seen": 241629040, + "router_z_loss_mlp": 0.29858398, + "step": 2891, + "time_per_iteration": 2.907500743865967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111722, + "balance_loss_mlp": 1.08194315, + "epoch": 0.556367833782224, + "flos": 734297923584.0, + "grad_norm": 0.04670195587242621, + "language_loss": 0.84789026, + "learning_rate": 0.00043321856715349244, + "loss": 0.85900748, + "num_input_tokens_seen": 241706272, + "router_z_loss_mlp": 0.29760742, + "step": 2892, + "time_per_iteration": 2.9401984214782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110696, + "balance_loss_mlp": 1.0810132, + "epoch": 0.5565602154674875, + "flos": 672423648768.0, + "grad_norm": 0.05439165995621742, + "language_loss": 0.80422115, + "learning_rate": 0.00043290982913926466, + "loss": 0.81532812, + "num_input_tokens_seen": 241782304, + "router_z_loss_mlp": 0.29614258, + "step": 2893, + "time_per_iteration": 2.7956430912017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113402, + "balance_loss_mlp": 1.08312285, + "epoch": 0.556752597152751, + "flos": 586228783104.0, + "grad_norm": 0.09922355360532673, + "language_loss": 0.8448714, + "learning_rate": 0.0004326011171717514, + "loss": 0.85600543, + "num_input_tokens_seen": 241868576, + "router_z_loss_mlp": 0.30297852, + "step": 2894, + "time_per_iteration": 2.8997769355773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108454, + "balance_loss_mlp": 1.07676816, + "epoch": 0.5569449788380146, + "flos": 437777146368.0, + "grad_norm": 0.06224988402754836, + "language_loss": 0.81240308, + "learning_rate": 0.0004322924313708051, + "loss": 0.82348764, + "num_input_tokens_seen": 241933696, + "router_z_loss_mlp": 0.31689453, + "step": 2895, + "time_per_iteration": 2.511643648147583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107604, + "balance_loss_mlp": 1.07825518, + "epoch": 0.5571373605232782, + "flos": 502250761728.0, + "grad_norm": 0.0621054083596477, + "language_loss": 0.84500259, + "learning_rate": 0.0004319837718562681, + "loss": 0.85607862, + "num_input_tokens_seen": 242003056, + "router_z_loss_mlp": 0.29321289, + "step": 2896, + "time_per_iteration": 2.580003023147583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106887, + "balance_loss_mlp": 1.07667959, + "epoch": 0.5573297422085417, + "flos": 577417973760.0, + "grad_norm": 0.05844968671659234, + "language_loss": 0.83570629, + "learning_rate": 0.0004316751387479726, + "loss": 0.84677517, + "num_input_tokens_seen": 242076368, + "router_z_loss_mlp": 0.30175781, + "step": 2897, + "time_per_iteration": 2.7676987648010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122549, + "balance_loss_mlp": 1.0925082, + "epoch": 0.5575221238938053, + "flos": 1344037515264.0, + "grad_norm": 0.06543352873326957, + "language_loss": 0.82800293, + "learning_rate": 0.0004313665321657409, + "loss": 0.83922845, + "num_input_tokens_seen": 242161600, + "router_z_loss_mlp": 0.30004883, + "step": 2898, + "time_per_iteration": 3.7584402561187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120576, + "balance_loss_mlp": 1.08917618, + "epoch": 0.5577145055790689, + "flos": 601963218432.0, + "grad_norm": 0.06787906742385669, + "language_loss": 0.80272007, + "learning_rate": 0.00043105795222938436, + "loss": 0.81392586, + "num_input_tokens_seen": 242237904, + "router_z_loss_mlp": 0.31396484, + "step": 2899, + "time_per_iteration": 2.718045711517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109661, + "balance_loss_mlp": 1.07795143, + "epoch": 0.5579068872643325, + "flos": 562620349440.0, + "grad_norm": 0.06698960298708169, + "language_loss": 0.78827435, + "learning_rate": 0.00043074939905870467, + "loss": 0.79937094, + "num_input_tokens_seen": 242306736, + "router_z_loss_mlp": 0.31713867, + "step": 2900, + "time_per_iteration": 2.639775514602661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111441, + "balance_loss_mlp": 1.08230579, + "epoch": 0.558099268949596, + "flos": 544551247872.0, + "grad_norm": 0.09759490745534659, + "language_loss": 0.80356312, + "learning_rate": 0.0004304408727734927, + "loss": 0.81467754, + "num_input_tokens_seen": 242376000, + "router_z_loss_mlp": 0.29125977, + "step": 2901, + "time_per_iteration": 2.6272940635681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107959, + "balance_loss_mlp": 1.07889545, + "epoch": 0.5582916506348595, + "flos": 552786467328.0, + "grad_norm": 0.06875313821095587, + "language_loss": 0.89200485, + "learning_rate": 0.0004301323734935288, + "loss": 0.9030844, + "num_input_tokens_seen": 242447056, + "router_z_loss_mlp": 0.29052734, + "step": 2902, + "time_per_iteration": 2.652219533920288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100657, + "balance_loss_mlp": 1.07164121, + "epoch": 0.5584840323201231, + "flos": 543385013760.0, + "grad_norm": 0.05706751216847301, + "language_loss": 0.87298477, + "learning_rate": 0.000429823901338583, + "loss": 0.8839913, + "num_input_tokens_seen": 242514400, + "router_z_loss_mlp": 0.2902832, + "step": 2903, + "time_per_iteration": 2.611798048019409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099623, + "balance_loss_mlp": 1.06872356, + "epoch": 0.5586764140053867, + "flos": 815573090304.0, + "grad_norm": 0.053536411753063035, + "language_loss": 0.87032712, + "learning_rate": 0.00042951545642841513, + "loss": 0.88132328, + "num_input_tokens_seen": 242601616, + "router_z_loss_mlp": 0.30883789, + "step": 2904, + "time_per_iteration": 3.067237377166748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099073, + "balance_loss_mlp": 1.06979561, + "epoch": 0.5588687956906503, + "flos": 486439976448.0, + "grad_norm": 0.04987560026618122, + "language_loss": 0.86746645, + "learning_rate": 0.0004292070388827737, + "loss": 0.87845719, + "num_input_tokens_seen": 242669648, + "router_z_loss_mlp": 0.29272461, + "step": 2905, + "time_per_iteration": 2.5981948375701904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093183, + "balance_loss_mlp": 1.06426287, + "epoch": 0.5590611773759138, + "flos": 452060849664.0, + "grad_norm": 0.06265536693897518, + "language_loss": 0.81292248, + "learning_rate": 0.00042889864882139753, + "loss": 0.82385433, + "num_input_tokens_seen": 242737456, + "router_z_loss_mlp": 0.2890625, + "step": 2906, + "time_per_iteration": 2.581113338470459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107989, + "balance_loss_mlp": 1.07811511, + "epoch": 0.5592535590611774, + "flos": 520945012224.0, + "grad_norm": 0.06493240221059006, + "language_loss": 0.81897962, + "learning_rate": 0.0004285902863640139, + "loss": 0.83005953, + "num_input_tokens_seen": 242807008, + "router_z_loss_mlp": 0.29858398, + "step": 2907, + "time_per_iteration": 2.6115305423736572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109856, + "balance_loss_mlp": 1.06973481, + "epoch": 0.5594459407464409, + "flos": 552519595008.0, + "grad_norm": 0.07849480564056018, + "language_loss": 0.8626982, + "learning_rate": 0.00042828195163033966, + "loss": 0.87368375, + "num_input_tokens_seen": 242877328, + "router_z_loss_mlp": 0.28833008, + "step": 2908, + "time_per_iteration": 2.6564390659332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110723, + "balance_loss_mlp": 1.07654572, + "epoch": 0.5596383224317045, + "flos": 484833973248.0, + "grad_norm": 0.07707498056388652, + "language_loss": 0.79454792, + "learning_rate": 0.0004279736447400812, + "loss": 0.80562025, + "num_input_tokens_seen": 242943152, + "router_z_loss_mlp": 0.30664062, + "step": 2909, + "time_per_iteration": 2.580448627471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102668, + "balance_loss_mlp": 1.07343817, + "epoch": 0.5598307041169681, + "flos": 611256015360.0, + "grad_norm": 0.055339920225342294, + "language_loss": 0.78979003, + "learning_rate": 0.00042766536581293385, + "loss": 0.80081677, + "num_input_tokens_seen": 243014656, + "router_z_loss_mlp": 0.29223633, + "step": 2910, + "time_per_iteration": 2.714306116104126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112106, + "balance_loss_mlp": 1.09004188, + "epoch": 0.5600230858022316, + "flos": 488851365888.0, + "grad_norm": 0.06660982321180627, + "language_loss": 0.79863673, + "learning_rate": 0.0004273571149685819, + "loss": 0.80984735, + "num_input_tokens_seen": 243089040, + "router_z_loss_mlp": 0.30981445, + "step": 2911, + "time_per_iteration": 2.738189220428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117749, + "balance_loss_mlp": 1.08794653, + "epoch": 0.5602154674874952, + "flos": 598869780480.0, + "grad_norm": 0.07453286241806684, + "language_loss": 0.83875954, + "learning_rate": 0.00042704889232669937, + "loss": 0.84993702, + "num_input_tokens_seen": 243162480, + "router_z_loss_mlp": 0.29785156, + "step": 2912, + "time_per_iteration": 2.7153878211975098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119265, + "balance_loss_mlp": 1.09003508, + "epoch": 0.5604078491727588, + "flos": 585969624576.0, + "grad_norm": 0.06505374842280261, + "language_loss": 0.85808718, + "learning_rate": 0.0004267406980069484, + "loss": 0.8692798, + "num_input_tokens_seen": 243232880, + "router_z_loss_mlp": 0.29248047, + "step": 2913, + "time_per_iteration": 2.7438042163848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105259, + "balance_loss_mlp": 1.07490873, + "epoch": 0.5606002308580224, + "flos": 541205618688.0, + "grad_norm": 0.045730944132966495, + "language_loss": 0.79707301, + "learning_rate": 0.0004264325321289808, + "loss": 0.80812562, + "num_input_tokens_seen": 243309168, + "router_z_loss_mlp": 0.30322266, + "step": 2914, + "time_per_iteration": 2.787429094314575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101375, + "balance_loss_mlp": 1.07131052, + "epoch": 0.5607926125432858, + "flos": 583938533376.0, + "grad_norm": 0.05941371213730478, + "language_loss": 0.8624413, + "learning_rate": 0.00042612439481243736, + "loss": 0.87345505, + "num_input_tokens_seen": 243382064, + "router_z_loss_mlp": 0.30078125, + "step": 2915, + "time_per_iteration": 2.7993295192718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090837, + "balance_loss_mlp": 1.06113064, + "epoch": 0.5609849942285494, + "flos": 627489317376.0, + "grad_norm": 0.06435914288601326, + "language_loss": 0.90124059, + "learning_rate": 0.00042581628617694735, + "loss": 0.91214895, + "num_input_tokens_seen": 243452064, + "router_z_loss_mlp": 0.296875, + "step": 2916, + "time_per_iteration": 2.744046449661255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089958, + "balance_loss_mlp": 1.06032228, + "epoch": 0.561177375913813, + "flos": 588366332928.0, + "grad_norm": 0.05771140503361017, + "language_loss": 0.81953394, + "learning_rate": 0.0004255082063421296, + "loss": 0.83043355, + "num_input_tokens_seen": 243525600, + "router_z_loss_mlp": 0.29638672, + "step": 2917, + "time_per_iteration": 2.705963134765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095529, + "balance_loss_mlp": 1.0655117, + "epoch": 0.5613697575990766, + "flos": 527047824384.0, + "grad_norm": 0.0764674514791775, + "language_loss": 0.84947777, + "learning_rate": 0.00042520015542759065, + "loss": 0.86043298, + "num_input_tokens_seen": 243605536, + "router_z_loss_mlp": 0.29980469, + "step": 2918, + "time_per_iteration": 2.9078075885772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085954, + "balance_loss_mlp": 1.05662882, + "epoch": 0.5615621392843402, + "flos": 642655130112.0, + "grad_norm": 0.049198929687541054, + "language_loss": 0.88353539, + "learning_rate": 0.00042489213355292687, + "loss": 0.89439487, + "num_input_tokens_seen": 243684208, + "router_z_loss_mlp": 0.29296875, + "step": 2919, + "time_per_iteration": 2.862194776535034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093265, + "balance_loss_mlp": 1.06300998, + "epoch": 0.5617545209696037, + "flos": 427750543872.0, + "grad_norm": 0.0619251317266344, + "language_loss": 0.81301886, + "learning_rate": 0.00042458414083772276, + "loss": 0.82395148, + "num_input_tokens_seen": 243749376, + "router_z_loss_mlp": 0.30224609, + "step": 2920, + "time_per_iteration": 2.5329933166503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095136, + "balance_loss_mlp": 1.0651195, + "epoch": 0.5619469026548672, + "flos": 568429125120.0, + "grad_norm": 0.05517349890350355, + "language_loss": 0.8525691, + "learning_rate": 0.000424276177401552, + "loss": 0.86352038, + "num_input_tokens_seen": 243828096, + "router_z_loss_mlp": 0.29956055, + "step": 2921, + "time_per_iteration": 2.787318468093872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092355, + "balance_loss_mlp": 1.06200445, + "epoch": 0.5621392843401308, + "flos": 505205807616.0, + "grad_norm": 0.06500569481536145, + "language_loss": 0.85831988, + "learning_rate": 0.0004239682433639763, + "loss": 0.86924338, + "num_input_tokens_seen": 243896752, + "router_z_loss_mlp": 0.3034668, + "step": 2922, + "time_per_iteration": 2.697091817855835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093283, + "balance_loss_mlp": 1.06386256, + "epoch": 0.5623316660253944, + "flos": 516996628992.0, + "grad_norm": 0.08309086608315261, + "language_loss": 0.85596514, + "learning_rate": 0.0004236603388445467, + "loss": 0.86689794, + "num_input_tokens_seen": 243964592, + "router_z_loss_mlp": 0.29394531, + "step": 2923, + "time_per_iteration": 2.5720105171203613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097102, + "balance_loss_mlp": 1.0683012, + "epoch": 0.5625240477106579, + "flos": 606012917760.0, + "grad_norm": 0.07246274776201297, + "language_loss": 0.82229364, + "learning_rate": 0.00042335246396280166, + "loss": 0.83326471, + "num_input_tokens_seen": 244036656, + "router_z_loss_mlp": 0.28808594, + "step": 2924, + "time_per_iteration": 2.7669975757598877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093178, + "balance_loss_mlp": 1.06320906, + "epoch": 0.5627164293959215, + "flos": 450430253568.0, + "grad_norm": 0.06414999121973448, + "language_loss": 0.90646857, + "learning_rate": 0.0004230446188382693, + "loss": 0.91740036, + "num_input_tokens_seen": 244102704, + "router_z_loss_mlp": 0.29956055, + "step": 2925, + "time_per_iteration": 2.5662741661071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088889, + "balance_loss_mlp": 1.0595876, + "epoch": 0.5629088110811851, + "flos": 742073550336.0, + "grad_norm": 0.05389869275215176, + "language_loss": 0.80918074, + "learning_rate": 0.0004227368035904654, + "loss": 0.82006967, + "num_input_tokens_seen": 244186640, + "router_z_loss_mlp": 0.29296875, + "step": 2926, + "time_per_iteration": 2.964599370956421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092675, + "balance_loss_mlp": 1.06249142, + "epoch": 0.5631011927664487, + "flos": 496970588160.0, + "grad_norm": 0.06261422618617216, + "language_loss": 0.82895541, + "learning_rate": 0.00042242901833889474, + "loss": 0.83988214, + "num_input_tokens_seen": 244257680, + "router_z_loss_mlp": 0.30151367, + "step": 2927, + "time_per_iteration": 2.6312665939331055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093424, + "balance_loss_mlp": 1.06376481, + "epoch": 0.5632935744517122, + "flos": 886137408000.0, + "grad_norm": 0.06041695665754469, + "language_loss": 0.86030155, + "learning_rate": 0.0004221212632030501, + "loss": 0.87123579, + "num_input_tokens_seen": 244331248, + "router_z_loss_mlp": 0.29614258, + "step": 2928, + "time_per_iteration": 3.0977063179016113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094061, + "balance_loss_mlp": 1.06351972, + "epoch": 0.5634859561369757, + "flos": 604792355328.0, + "grad_norm": 0.06366283736150324, + "language_loss": 0.80857551, + "learning_rate": 0.0004218135383024124, + "loss": 0.81951618, + "num_input_tokens_seen": 244403920, + "router_z_loss_mlp": 0.30541992, + "step": 2929, + "time_per_iteration": 2.749244213104248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088519, + "balance_loss_mlp": 1.0590266, + "epoch": 0.5636783378222393, + "flos": 453916472832.0, + "grad_norm": 0.12143433952472552, + "language_loss": 0.85715157, + "learning_rate": 0.0004215058437564511, + "loss": 0.86803675, + "num_input_tokens_seen": 244470464, + "router_z_loss_mlp": 0.29467773, + "step": 2930, + "time_per_iteration": 2.593238115310669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084288, + "balance_loss_mlp": 1.05512953, + "epoch": 0.5638707195075029, + "flos": 518456899584.0, + "grad_norm": 0.056033125460513485, + "language_loss": 0.82132083, + "learning_rate": 0.00042119817968462397, + "loss": 0.83216375, + "num_input_tokens_seen": 244536864, + "router_z_loss_mlp": 0.29125977, + "step": 2931, + "time_per_iteration": 2.591958522796631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092676, + "balance_loss_mlp": 1.06275427, + "epoch": 0.5640631011927665, + "flos": 564873896448.0, + "grad_norm": 0.07812059497351068, + "language_loss": 0.87152535, + "learning_rate": 0.0004208905462063766, + "loss": 0.88245207, + "num_input_tokens_seen": 244603344, + "router_z_loss_mlp": 0.29907227, + "step": 2932, + "time_per_iteration": 2.6288535594940186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086522, + "balance_loss_mlp": 1.0571723, + "epoch": 0.56425548287803, + "flos": 517033704960.0, + "grad_norm": 0.06389283518633071, + "language_loss": 0.84869772, + "learning_rate": 0.00042058294344114315, + "loss": 0.85956293, + "num_input_tokens_seen": 244671984, + "router_z_loss_mlp": 0.29345703, + "step": 2933, + "time_per_iteration": 2.6064674854278564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086595, + "balance_loss_mlp": 1.05672109, + "epoch": 0.5644478645632935, + "flos": 854258876928.0, + "grad_norm": 0.05718901807458546, + "language_loss": 0.77749109, + "learning_rate": 0.0004202753715083456, + "loss": 0.78835702, + "num_input_tokens_seen": 244754000, + "router_z_loss_mlp": 0.29858398, + "step": 2934, + "time_per_iteration": 3.075186014175415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093891, + "balance_loss_mlp": 1.0630157, + "epoch": 0.5646402462485571, + "flos": 553438780416.0, + "grad_norm": 0.07168087831316133, + "language_loss": 0.81911719, + "learning_rate": 0.0004199678305273936, + "loss": 0.83005607, + "num_input_tokens_seen": 244820896, + "router_z_loss_mlp": 0.30883789, + "step": 2935, + "time_per_iteration": 2.6289923191070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091967, + "balance_loss_mlp": 1.06316626, + "epoch": 0.5648326279338207, + "flos": 685990798848.0, + "grad_norm": 0.0664481148229904, + "language_loss": 0.81315005, + "learning_rate": 0.0004196603206176854, + "loss": 0.82406974, + "num_input_tokens_seen": 244904464, + "router_z_loss_mlp": 0.28808594, + "step": 2936, + "time_per_iteration": 2.941150426864624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093274, + "balance_loss_mlp": 1.06404424, + "epoch": 0.5650250096190843, + "flos": 803327818752.0, + "grad_norm": 0.07427925135014142, + "language_loss": 0.83779049, + "learning_rate": 0.000419352841898607, + "loss": 0.84872323, + "num_input_tokens_seen": 244983760, + "router_z_loss_mlp": 0.29199219, + "step": 2937, + "time_per_iteration": 2.977189302444458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092016, + "balance_loss_mlp": 1.06273842, + "epoch": 0.5652173913043478, + "flos": 582058317312.0, + "grad_norm": 0.061049572757767595, + "language_loss": 0.77780819, + "learning_rate": 0.000419045394489532, + "loss": 0.78872836, + "num_input_tokens_seen": 245053184, + "router_z_loss_mlp": 0.29296875, + "step": 2938, + "time_per_iteration": 2.6722819805145264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086194, + "balance_loss_mlp": 1.05622458, + "epoch": 0.5654097729896114, + "flos": 820648060416.0, + "grad_norm": 0.05727154642915785, + "language_loss": 0.77326584, + "learning_rate": 0.0004187379785098224, + "loss": 0.78412783, + "num_input_tokens_seen": 245137408, + "router_z_loss_mlp": 0.29931641, + "step": 2939, + "time_per_iteration": 3.100283622741699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086461, + "balance_loss_mlp": 1.05665886, + "epoch": 0.565602154674875, + "flos": 784156723200.0, + "grad_norm": 0.06949350877551969, + "language_loss": 0.83849806, + "learning_rate": 0.00041843059407882744, + "loss": 0.84936267, + "num_input_tokens_seen": 245215504, + "router_z_loss_mlp": 0.29785156, + "step": 2940, + "time_per_iteration": 2.9837162494659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082258, + "balance_loss_mlp": 1.05257499, + "epoch": 0.5657945363601385, + "flos": 549683117568.0, + "grad_norm": 0.068553917777786, + "language_loss": 0.82768112, + "learning_rate": 0.0004181232413158842, + "loss": 0.83850372, + "num_input_tokens_seen": 245286032, + "router_z_loss_mlp": 0.29638672, + "step": 2941, + "time_per_iteration": 2.636819839477539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083664, + "balance_loss_mlp": 1.05371857, + "epoch": 0.5659869180454021, + "flos": 668126900736.0, + "grad_norm": 0.06960240931548377, + "language_loss": 0.82932127, + "learning_rate": 0.0004178159203403179, + "loss": 0.84015793, + "num_input_tokens_seen": 245359040, + "router_z_loss_mlp": 0.29931641, + "step": 2942, + "time_per_iteration": 2.822134494781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083227, + "balance_loss_mlp": 1.0547837, + "epoch": 0.5661792997306656, + "flos": 499955369472.0, + "grad_norm": 0.05318601865014104, + "language_loss": 0.81807715, + "learning_rate": 0.0004175086312714409, + "loss": 0.8289094, + "num_input_tokens_seen": 245426384, + "router_z_loss_mlp": 0.28442383, + "step": 2943, + "time_per_iteration": 2.571985960006714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086509, + "balance_loss_mlp": 1.05625343, + "epoch": 0.5663716814159292, + "flos": 601209589248.0, + "grad_norm": 0.05713331418457596, + "language_loss": 0.84120524, + "learning_rate": 0.00041720137422855366, + "loss": 0.85207033, + "num_input_tokens_seen": 245501216, + "router_z_loss_mlp": 0.30224609, + "step": 2944, + "time_per_iteration": 2.7213711738586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086525, + "balance_loss_mlp": 1.05758142, + "epoch": 0.5665640631011928, + "flos": 540988305408.0, + "grad_norm": 0.1661240742061477, + "language_loss": 0.79230917, + "learning_rate": 0.00041689414933094383, + "loss": 0.80317438, + "num_input_tokens_seen": 245571600, + "router_z_loss_mlp": 0.28930664, + "step": 2945, + "time_per_iteration": 2.628525733947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088063, + "balance_loss_mlp": 1.05954862, + "epoch": 0.5667564447864564, + "flos": 601936054272.0, + "grad_norm": 0.06338169436240754, + "language_loss": 0.81427538, + "learning_rate": 0.00041658695669788653, + "loss": 0.82515597, + "num_input_tokens_seen": 245645632, + "router_z_loss_mlp": 0.28515625, + "step": 2946, + "time_per_iteration": 2.736955404281616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084859, + "balance_loss_mlp": 1.0541029, + "epoch": 0.5669488264717198, + "flos": 659523492864.0, + "grad_norm": 0.0612697940531113, + "language_loss": 0.81293368, + "learning_rate": 0.00041627979644864453, + "loss": 0.82378221, + "num_input_tokens_seen": 245715776, + "router_z_loss_mlp": 0.30712891, + "step": 2947, + "time_per_iteration": 2.780796766281128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083023, + "balance_loss_mlp": 1.05436563, + "epoch": 0.5671412081569834, + "flos": 485402222592.0, + "grad_norm": 0.06710047446863547, + "language_loss": 0.81410027, + "learning_rate": 0.0004159726687024683, + "loss": 0.82493049, + "num_input_tokens_seen": 245785328, + "router_z_loss_mlp": 0.28662109, + "step": 2948, + "time_per_iteration": 2.6072115898132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108621, + "balance_loss_mlp": 1.05757558, + "epoch": 0.567333589842247, + "flos": 729801114624.0, + "grad_norm": 0.06141378811636639, + "language_loss": 0.79485345, + "learning_rate": 0.00041566557357859506, + "loss": 0.80571556, + "num_input_tokens_seen": 245858000, + "router_z_loss_mlp": 0.28613281, + "step": 2949, + "time_per_iteration": 2.911865234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085023, + "balance_loss_mlp": 1.05443358, + "epoch": 0.5675259715275106, + "flos": 968887526400.0, + "grad_norm": 0.052257384193144164, + "language_loss": 0.79611081, + "learning_rate": 0.0004153585111962502, + "loss": 0.806961, + "num_input_tokens_seen": 245950640, + "router_z_loss_mlp": 0.30566406, + "step": 2950, + "time_per_iteration": 3.2808187007904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086724, + "balance_loss_mlp": 1.05606341, + "epoch": 0.5677183532127742, + "flos": 565145538048.0, + "grad_norm": 0.06672147261233864, + "language_loss": 0.84739614, + "learning_rate": 0.0004150514816746453, + "loss": 0.85826337, + "num_input_tokens_seen": 246019568, + "router_z_loss_mlp": 0.30639648, + "step": 2951, + "time_per_iteration": 2.680326461791992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089698, + "balance_loss_mlp": 1.0602051, + "epoch": 0.5679107348980377, + "flos": 551694385152.0, + "grad_norm": 0.06944116544696582, + "language_loss": 0.85944223, + "learning_rate": 0.0004147444851329802, + "loss": 0.87033927, + "num_input_tokens_seen": 246089520, + "router_z_loss_mlp": 0.29443359, + "step": 2952, + "time_per_iteration": 2.6477670669555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086586, + "balance_loss_mlp": 1.05680704, + "epoch": 0.5681031165833013, + "flos": 819459804672.0, + "grad_norm": 0.054427499920313586, + "language_loss": 0.86026949, + "learning_rate": 0.00041443752169044126, + "loss": 0.87113535, + "num_input_tokens_seen": 246165920, + "router_z_loss_mlp": 0.29736328, + "step": 2953, + "time_per_iteration": 2.997781276702881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092153, + "balance_loss_mlp": 1.061993, + "epoch": 0.5682954982685648, + "flos": 618013711872.0, + "grad_norm": 0.055407826164880256, + "language_loss": 0.84948021, + "learning_rate": 0.0004141305914662025, + "loss": 0.86040175, + "num_input_tokens_seen": 246238672, + "router_z_loss_mlp": 0.30126953, + "step": 2954, + "time_per_iteration": 2.704019069671631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087398, + "balance_loss_mlp": 1.05688024, + "epoch": 0.5684878799538284, + "flos": 647949984768.0, + "grad_norm": 0.0673052072270573, + "language_loss": 0.80911326, + "learning_rate": 0.0004138236945794246, + "loss": 0.81998718, + "num_input_tokens_seen": 246320208, + "router_z_loss_mlp": 0.30493164, + "step": 2955, + "time_per_iteration": 2.88403058052063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108918, + "balance_loss_mlp": 1.05911565, + "epoch": 0.5686802616390919, + "flos": 805961664000.0, + "grad_norm": 0.06799730214965168, + "language_loss": 0.8379457, + "learning_rate": 0.00041351683114925576, + "loss": 0.84883749, + "num_input_tokens_seen": 246406464, + "router_z_loss_mlp": 0.30053711, + "step": 2956, + "time_per_iteration": 3.0439462661743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087169, + "balance_loss_mlp": 1.0562458, + "epoch": 0.5688726433243555, + "flos": 547140676608.0, + "grad_norm": 0.06948923214023794, + "language_loss": 0.86469889, + "learning_rate": 0.0004132100012948308, + "loss": 0.87557054, + "num_input_tokens_seen": 246477456, + "router_z_loss_mlp": 0.30883789, + "step": 2957, + "time_per_iteration": 2.6431198120117188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090494, + "balance_loss_mlp": 1.05959463, + "epoch": 0.5690650250096191, + "flos": 486568456704.0, + "grad_norm": 0.0655655158566539, + "language_loss": 0.84699452, + "learning_rate": 0.00041290320513527145, + "loss": 0.85789943, + "num_input_tokens_seen": 246541744, + "router_z_loss_mlp": 0.30883789, + "step": 2958, + "time_per_iteration": 2.5978519916534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085083, + "balance_loss_mlp": 1.05528057, + "epoch": 0.5692574066948827, + "flos": 577457620992.0, + "grad_norm": 0.05333030562061355, + "language_loss": 0.8519215, + "learning_rate": 0.0004125964427896867, + "loss": 0.86277229, + "num_input_tokens_seen": 246611440, + "router_z_loss_mlp": 0.29760742, + "step": 2959, + "time_per_iteration": 2.671543836593628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084341, + "balance_loss_mlp": 1.05468178, + "epoch": 0.5694497883801463, + "flos": 454247585280.0, + "grad_norm": 0.06459683266000829, + "language_loss": 0.79222417, + "learning_rate": 0.0004122897143771723, + "loss": 0.80306756, + "num_input_tokens_seen": 246676496, + "router_z_loss_mlp": 0.29663086, + "step": 2960, + "time_per_iteration": 2.5457372665405273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087464, + "balance_loss_mlp": 1.05713725, + "epoch": 0.5696421700654097, + "flos": 559516999680.0, + "grad_norm": 0.057309213891239566, + "language_loss": 0.81961918, + "learning_rate": 0.0004119830200168109, + "loss": 0.83049381, + "num_input_tokens_seen": 246746464, + "router_z_loss_mlp": 0.30297852, + "step": 2961, + "time_per_iteration": 2.66658091545105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080369, + "balance_loss_mlp": 1.05180621, + "epoch": 0.5698345517506733, + "flos": 465551649792.0, + "grad_norm": 0.0578679247611712, + "language_loss": 0.88614476, + "learning_rate": 0.0004116763598276714, + "loss": 0.89694846, + "num_input_tokens_seen": 246811808, + "router_z_loss_mlp": 0.28564453, + "step": 2962, + "time_per_iteration": 2.5355417728424072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083269, + "balance_loss_mlp": 1.05394387, + "epoch": 0.5700269334359369, + "flos": 605953446912.0, + "grad_norm": 0.05524318032555551, + "language_loss": 0.81030452, + "learning_rate": 0.00041136973392881017, + "loss": 0.82113719, + "num_input_tokens_seen": 246890432, + "router_z_loss_mlp": 0.29345703, + "step": 2963, + "time_per_iteration": 2.8497612476348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085542, + "balance_loss_mlp": 1.05540633, + "epoch": 0.5702193151212005, + "flos": 562709182464.0, + "grad_norm": 0.06477225886122127, + "language_loss": 0.82179135, + "learning_rate": 0.00041106314243926983, + "loss": 0.83264679, + "num_input_tokens_seen": 246959616, + "router_z_loss_mlp": 0.30102539, + "step": 2964, + "time_per_iteration": 2.735269069671631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080389, + "balance_loss_mlp": 1.05103993, + "epoch": 0.570411696806464, + "flos": 523247745024.0, + "grad_norm": 0.05516182837620622, + "language_loss": 0.87329233, + "learning_rate": 0.0004107565854780798, + "loss": 0.88409621, + "num_input_tokens_seen": 247030656, + "router_z_loss_mlp": 0.29296875, + "step": 2965, + "time_per_iteration": 2.6157355308532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085364, + "balance_loss_mlp": 1.05596685, + "epoch": 0.5706040784917276, + "flos": 718222837248.0, + "grad_norm": 0.07414316825555053, + "language_loss": 0.81466991, + "learning_rate": 0.000410450063164256, + "loss": 0.82552361, + "num_input_tokens_seen": 247105872, + "router_z_loss_mlp": 0.29370117, + "step": 2966, + "time_per_iteration": 2.8378820419311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083362, + "balance_loss_mlp": 1.05291581, + "epoch": 0.5707964601769911, + "flos": 476707410432.0, + "grad_norm": 0.06746080357230834, + "language_loss": 0.82004952, + "learning_rate": 0.00041014357561680115, + "loss": 0.83088315, + "num_input_tokens_seen": 247170448, + "router_z_loss_mlp": 0.30395508, + "step": 2967, + "time_per_iteration": 2.51119065284729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085478, + "balance_loss_mlp": 1.05519855, + "epoch": 0.5709888418622547, + "flos": 580101378048.0, + "grad_norm": 0.053142332405834165, + "language_loss": 0.86128843, + "learning_rate": 0.0004098371229547039, + "loss": 0.87214315, + "num_input_tokens_seen": 247240400, + "router_z_loss_mlp": 0.30249023, + "step": 2968, + "time_per_iteration": 2.6994621753692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022253, + "balance_loss_mlp": 1.01390862, + "epoch": 0.5711812235475183, + "flos": 1579922910720.0, + "grad_norm": 0.025900339106917806, + "language_loss": 0.80010808, + "learning_rate": 0.0004095307052969399, + "loss": 0.81033063, + "num_input_tokens_seen": 247469136, + "router_z_loss_mlp": 0.08349609, + "step": 2969, + "time_per_iteration": 4.718291997909546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092241, + "balance_loss_mlp": 1.06179523, + "epoch": 0.5713736052327818, + "flos": 468506695680.0, + "grad_norm": 0.05366083523781242, + "language_loss": 0.80585647, + "learning_rate": 0.00040922432276247107, + "loss": 0.8167789, + "num_input_tokens_seen": 247537712, + "router_z_loss_mlp": 0.30419922, + "step": 2970, + "time_per_iteration": 2.55259108543396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091948, + "balance_loss_mlp": 1.0609777, + "epoch": 0.5715659869180454, + "flos": 537662499840.0, + "grad_norm": 0.049420251796361614, + "language_loss": 0.84874177, + "learning_rate": 0.0004089179754702457, + "loss": 0.85966122, + "num_input_tokens_seen": 247613872, + "router_z_loss_mlp": 0.30932617, + "step": 2971, + "time_per_iteration": 2.771068572998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109211, + "balance_loss_mlp": 1.06090152, + "epoch": 0.571758368603309, + "flos": 656071778304.0, + "grad_norm": 0.06283275659801735, + "language_loss": 0.7981565, + "learning_rate": 0.00040861166353919843, + "loss": 0.80907762, + "num_input_tokens_seen": 247686064, + "router_z_loss_mlp": 0.31176758, + "step": 2972, + "time_per_iteration": 2.7827725410461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091117, + "balance_loss_mlp": 1.06069493, + "epoch": 0.5719507502885726, + "flos": 667907016192.0, + "grad_norm": 0.06507135137823726, + "language_loss": 0.818784, + "learning_rate": 0.00040830538708824983, + "loss": 0.82969517, + "num_input_tokens_seen": 247760384, + "router_z_loss_mlp": 0.30395508, + "step": 2973, + "time_per_iteration": 2.845456600189209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108959, + "balance_loss_mlp": 1.05966854, + "epoch": 0.572143131973836, + "flos": 476321969664.0, + "grad_norm": 0.07493195148818688, + "language_loss": 0.81968939, + "learning_rate": 0.000407999146236307, + "loss": 0.8305853, + "num_input_tokens_seen": 247824768, + "router_z_loss_mlp": 0.29882812, + "step": 2974, + "time_per_iteration": 2.531430244445801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093173, + "balance_loss_mlp": 1.06284618, + "epoch": 0.5723355136590996, + "flos": 539510782464.0, + "grad_norm": 0.06121365308687838, + "language_loss": 0.8362776, + "learning_rate": 0.0004076929411022634, + "loss": 0.84720927, + "num_input_tokens_seen": 247894448, + "router_z_loss_mlp": 0.30322266, + "step": 2975, + "time_per_iteration": 2.645341634750366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096437, + "balance_loss_mlp": 1.06591964, + "epoch": 0.5725278953443632, + "flos": 824156674560.0, + "grad_norm": 0.05509159729755976, + "language_loss": 0.79606473, + "learning_rate": 0.0004073867718049982, + "loss": 0.80702913, + "num_input_tokens_seen": 247976432, + "router_z_loss_mlp": 0.30493164, + "step": 2976, + "time_per_iteration": 3.085145950317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102446, + "balance_loss_mlp": 1.07137978, + "epoch": 0.5727202770296268, + "flos": 587437235712.0, + "grad_norm": 0.06232705756749319, + "language_loss": 0.82691067, + "learning_rate": 0.00040708063846337704, + "loss": 0.83793509, + "num_input_tokens_seen": 248048800, + "router_z_loss_mlp": 0.31054688, + "step": 2977, + "time_per_iteration": 2.738443613052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099181, + "balance_loss_mlp": 1.06813931, + "epoch": 0.5729126587148904, + "flos": 446966055936.0, + "grad_norm": 0.061703964741206326, + "language_loss": 0.81214464, + "learning_rate": 0.00040677454119625143, + "loss": 0.82313639, + "num_input_tokens_seen": 248116496, + "router_z_loss_mlp": 0.31005859, + "step": 2978, + "time_per_iteration": 2.6232175827026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108887, + "balance_loss_mlp": 1.07758296, + "epoch": 0.5731050404001539, + "flos": 519457577472.0, + "grad_norm": 0.07073355508195153, + "language_loss": 0.83247018, + "learning_rate": 0.0004064684801224587, + "loss": 0.84355903, + "num_input_tokens_seen": 248184960, + "router_z_loss_mlp": 0.31274414, + "step": 2979, + "time_per_iteration": 2.577918767929077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101163, + "balance_loss_mlp": 1.07085991, + "epoch": 0.5732974220854175, + "flos": 504775950336.0, + "grad_norm": 0.05699497583041508, + "language_loss": 0.80492741, + "learning_rate": 0.00040616245536082224, + "loss": 0.81593907, + "num_input_tokens_seen": 248252208, + "router_z_loss_mlp": 0.30273438, + "step": 2980, + "time_per_iteration": 2.6298904418945312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101523, + "balance_loss_mlp": 1.07167256, + "epoch": 0.573489803770681, + "flos": 592485041664.0, + "grad_norm": 0.04979780559516064, + "language_loss": 0.81357765, + "learning_rate": 0.00040585646703015165, + "loss": 0.82459289, + "num_input_tokens_seen": 248333312, + "router_z_loss_mlp": 0.29833984, + "step": 2981, + "time_per_iteration": 2.8170647621154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102826, + "balance_loss_mlp": 1.07118809, + "epoch": 0.5736821854559446, + "flos": 489911514624.0, + "grad_norm": 0.07486213422343042, + "language_loss": 0.78689104, + "learning_rate": 0.0004055505152492419, + "loss": 0.79791927, + "num_input_tokens_seen": 248403808, + "router_z_loss_mlp": 0.31616211, + "step": 2982, + "time_per_iteration": 2.6379241943359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100032, + "balance_loss_mlp": 1.06825066, + "epoch": 0.5738745671412081, + "flos": 458156321280.0, + "grad_norm": 0.05681665302183781, + "language_loss": 0.74231875, + "learning_rate": 0.00040524460013687425, + "loss": 0.75331908, + "num_input_tokens_seen": 248477184, + "router_z_loss_mlp": 0.31762695, + "step": 2983, + "time_per_iteration": 2.7545318603515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097699, + "balance_loss_mlp": 1.0663712, + "epoch": 0.5740669488264717, + "flos": 580333372416.0, + "grad_norm": 0.04476617807489617, + "language_loss": 0.81250238, + "learning_rate": 0.0004049387218118155, + "loss": 0.82347941, + "num_input_tokens_seen": 248565552, + "router_z_loss_mlp": 0.31298828, + "step": 2984, + "time_per_iteration": 2.9756665229797363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108902, + "balance_loss_mlp": 1.05816841, + "epoch": 0.5742593305117353, + "flos": 524438572032.0, + "grad_norm": 0.07928255171477795, + "language_loss": 0.85245347, + "learning_rate": 0.00040463288039281777, + "loss": 0.8633436, + "num_input_tokens_seen": 248635456, + "router_z_loss_mlp": 0.30810547, + "step": 2985, + "time_per_iteration": 2.706669807434082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034493, + "balance_loss_mlp": 1.02681565, + "epoch": 0.5744517121969989, + "flos": 1553877748224.0, + "grad_norm": 0.02538869827055974, + "language_loss": 0.77876419, + "learning_rate": 0.0004043270759986194, + "loss": 0.78910911, + "num_input_tokens_seen": 248870160, + "router_z_loss_mlp": 0.07666016, + "step": 2986, + "time_per_iteration": 4.949368953704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108476, + "balance_loss_mlp": 1.05462396, + "epoch": 0.5746440938822625, + "flos": 751919915520.0, + "grad_norm": 0.060127228305881374, + "language_loss": 0.82366645, + "learning_rate": 0.0004040213087479444, + "loss": 0.83451408, + "num_input_tokens_seen": 248946960, + "router_z_loss_mlp": 0.30102539, + "step": 2987, + "time_per_iteration": 2.9205455780029297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086178, + "balance_loss_mlp": 1.05723405, + "epoch": 0.5748364755675259, + "flos": 501865320960.0, + "grad_norm": 0.05965622667733625, + "language_loss": 0.85328299, + "learning_rate": 0.0004037155787595018, + "loss": 0.86414474, + "num_input_tokens_seen": 249014128, + "router_z_loss_mlp": 0.2890625, + "step": 2988, + "time_per_iteration": 2.574509859085083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088474, + "balance_loss_mlp": 1.0593158, + "epoch": 0.5750288572527895, + "flos": 504044342784.0, + "grad_norm": 0.05784717048255493, + "language_loss": 0.80869853, + "learning_rate": 0.000403409886151987, + "loss": 0.8195833, + "num_input_tokens_seen": 249090016, + "router_z_loss_mlp": 0.29125977, + "step": 2989, + "time_per_iteration": 2.945080041885376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016432, + "balance_loss_mlp": 1.00894582, + "epoch": 0.5752212389380531, + "flos": 1541365604352.0, + "grad_norm": 0.009946927491071988, + "language_loss": 0.81999105, + "learning_rate": 0.0004031042310440799, + "loss": 0.83015537, + "num_input_tokens_seen": 249305552, + "router_z_loss_mlp": 0.07470703, + "step": 2990, + "time_per_iteration": 4.807205677032471 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015586, + "balance_loss_mlp": 1.00809932, + "epoch": 0.5754136206233167, + "flos": 1567331472384.0, + "grad_norm": 0.009078458393910433, + "language_loss": 0.781986, + "learning_rate": 0.00040279861355444656, + "loss": 0.79214191, + "num_input_tokens_seen": 249523408, + "router_z_loss_mlp": 0.07470703, + "step": 2991, + "time_per_iteration": 4.805190563201904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084133, + "balance_loss_mlp": 1.05380619, + "epoch": 0.5756060023085803, + "flos": 798156301824.0, + "grad_norm": 0.05637441563568418, + "language_loss": 0.76644433, + "learning_rate": 0.00040249303380173807, + "loss": 0.77728564, + "num_input_tokens_seen": 249616624, + "router_z_loss_mlp": 0.30322266, + "step": 2992, + "time_per_iteration": 3.049729108810425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090631, + "balance_loss_mlp": 1.05780125, + "epoch": 0.5757983839938438, + "flos": 587877004800.0, + "grad_norm": 0.06616333205601678, + "language_loss": 0.79290402, + "learning_rate": 0.00040218749190459126, + "loss": 0.80381036, + "num_input_tokens_seen": 249689936, + "router_z_loss_mlp": 0.32836914, + "step": 2993, + "time_per_iteration": 2.7314000129699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087925, + "balance_loss_mlp": 1.05795622, + "epoch": 0.5759907656791073, + "flos": 516831072768.0, + "grad_norm": 0.06422497492556134, + "language_loss": 0.82827115, + "learning_rate": 0.00040188198798162775, + "loss": 0.83915043, + "num_input_tokens_seen": 249759984, + "router_z_loss_mlp": 0.29956055, + "step": 2994, + "time_per_iteration": 2.605794668197632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089955, + "balance_loss_mlp": 1.06022453, + "epoch": 0.5761831473643709, + "flos": 587133287424.0, + "grad_norm": 0.05264744908201922, + "language_loss": 0.85358101, + "learning_rate": 0.000401576522151455, + "loss": 0.8644805, + "num_input_tokens_seen": 249837888, + "router_z_loss_mlp": 0.29711914, + "step": 2995, + "time_per_iteration": 2.8504650592803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085925, + "balance_loss_mlp": 1.05664682, + "epoch": 0.5763755290496345, + "flos": 543896363520.0, + "grad_norm": 0.05051873290535222, + "language_loss": 0.83133811, + "learning_rate": 0.0004012710945326651, + "loss": 0.8421973, + "num_input_tokens_seen": 249913584, + "router_z_loss_mlp": 0.29248047, + "step": 2996, + "time_per_iteration": 2.7823193073272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094348, + "balance_loss_mlp": 1.06545174, + "epoch": 0.576567910734898, + "flos": 626229107712.0, + "grad_norm": 0.0711371625716349, + "language_loss": 0.81093514, + "learning_rate": 0.0004009657052438355, + "loss": 0.82187867, + "num_input_tokens_seen": 249992144, + "router_z_loss_mlp": 0.28881836, + "step": 2997, + "time_per_iteration": 2.7743020057678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091289, + "balance_loss_mlp": 1.06184435, + "epoch": 0.5767602924201616, + "flos": 538243232256.0, + "grad_norm": 0.06367440987852575, + "language_loss": 0.85650682, + "learning_rate": 0.00040066035440352904, + "loss": 0.86741972, + "num_input_tokens_seen": 250060736, + "router_z_loss_mlp": 0.29418945, + "step": 2998, + "time_per_iteration": 2.6359331607818604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014946, + "balance_loss_mlp": 1.0071255, + "epoch": 0.5769526741054252, + "flos": 1559778301440.0, + "grad_norm": 0.01828635150904939, + "language_loss": 0.79293132, + "learning_rate": 0.0004003550421302934, + "loss": 0.8030808, + "num_input_tokens_seen": 250296864, + "router_z_loss_mlp": 0.078125, + "step": 2999, + "time_per_iteration": 4.881432056427002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104047, + "balance_loss_mlp": 1.07417345, + "epoch": 0.5771450557906888, + "flos": 468185495040.0, + "grad_norm": 0.0709915390631299, + "language_loss": 0.76451176, + "learning_rate": 0.00040004976854266145, + "loss": 0.77555221, + "num_input_tokens_seen": 250362528, + "router_z_loss_mlp": 0.2980957, + "step": 3000, + "time_per_iteration": 2.5374131202697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101005, + "balance_loss_mlp": 1.07017779, + "epoch": 0.5773374374759523, + "flos": 574556903424.0, + "grad_norm": 0.051209677129469174, + "language_loss": 0.81337965, + "learning_rate": 0.0003997445337591505, + "loss": 0.8243897, + "num_input_tokens_seen": 250432768, + "router_z_loss_mlp": 0.30810547, + "step": 3001, + "time_per_iteration": 2.647610902786255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102438, + "balance_loss_mlp": 1.07351804, + "epoch": 0.5775298191612158, + "flos": 528473590272.0, + "grad_norm": 0.0611265357111255, + "language_loss": 0.74261576, + "learning_rate": 0.0003994393378982635, + "loss": 0.75364017, + "num_input_tokens_seen": 250501504, + "router_z_loss_mlp": 0.28979492, + "step": 3002, + "time_per_iteration": 2.602245330810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013935, + "balance_loss_mlp": 1.00611448, + "epoch": 0.5777222008464794, + "flos": 1303919700480.0, + "grad_norm": 0.01032263408282017, + "language_loss": 0.79538, + "learning_rate": 0.00039913418107848786, + "loss": 0.80551934, + "num_input_tokens_seen": 250733632, + "router_z_loss_mlp": 0.078125, + "step": 3003, + "time_per_iteration": 4.818480968475342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104089, + "balance_loss_mlp": 1.07304692, + "epoch": 0.577914582531743, + "flos": 603633461760.0, + "grad_norm": 0.0604287320481862, + "language_loss": 0.88041145, + "learning_rate": 0.0003988290634182961, + "loss": 0.89145231, + "num_input_tokens_seen": 250809152, + "router_z_loss_mlp": 0.31005859, + "step": 3004, + "time_per_iteration": 2.7484169006347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102413, + "balance_loss_mlp": 1.07284904, + "epoch": 0.5781069642170066, + "flos": 486795681792.0, + "grad_norm": 0.06655998832299866, + "language_loss": 0.80592918, + "learning_rate": 0.0003985239850361453, + "loss": 0.81695324, + "num_input_tokens_seen": 250879152, + "router_z_loss_mlp": 0.29541016, + "step": 3005, + "time_per_iteration": 2.6148018836975098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102049, + "balance_loss_mlp": 1.07281876, + "epoch": 0.5782993459022701, + "flos": 506295318528.0, + "grad_norm": 0.0659443256400084, + "language_loss": 0.84734911, + "learning_rate": 0.0003982189460504777, + "loss": 0.85836959, + "num_input_tokens_seen": 250949904, + "router_z_loss_mlp": 0.29199219, + "step": 3006, + "time_per_iteration": 2.7011501789093018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105808, + "balance_loss_mlp": 1.07540917, + "epoch": 0.5784917275875336, + "flos": 602155938816.0, + "grad_norm": 0.06531961229333205, + "language_loss": 0.7939682, + "learning_rate": 0.00039791394657971935, + "loss": 0.80502629, + "num_input_tokens_seen": 251020976, + "router_z_loss_mlp": 0.30371094, + "step": 3007, + "time_per_iteration": 2.7082760334014893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102193, + "balance_loss_mlp": 1.07234263, + "epoch": 0.5786841092727972, + "flos": 521540425728.0, + "grad_norm": 0.06476760562978502, + "language_loss": 0.8421638, + "learning_rate": 0.00039760898674228205, + "loss": 0.85318571, + "num_input_tokens_seen": 251093280, + "router_z_loss_mlp": 0.29858398, + "step": 3008, + "time_per_iteration": 2.650878429412842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105056, + "balance_loss_mlp": 1.07475293, + "epoch": 0.5788764909580608, + "flos": 767404357632.0, + "grad_norm": 0.05525540739637584, + "language_loss": 0.80765337, + "learning_rate": 0.0003973040666565613, + "loss": 0.81870395, + "num_input_tokens_seen": 251181376, + "router_z_loss_mlp": 0.30273438, + "step": 3009, + "time_per_iteration": 3.1226985454559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100227, + "balance_loss_mlp": 1.07030547, + "epoch": 0.5790688726433244, + "flos": 599094434304.0, + "grad_norm": 0.06024611276807751, + "language_loss": 0.82195163, + "learning_rate": 0.000396999186440938, + "loss": 0.83295393, + "num_input_tokens_seen": 251256176, + "router_z_loss_mlp": 0.29882812, + "step": 3010, + "time_per_iteration": 2.844270944595337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096543, + "balance_loss_mlp": 1.06533396, + "epoch": 0.5792612543285879, + "flos": 523064936448.0, + "grad_norm": 0.06262665363935188, + "language_loss": 0.85208702, + "learning_rate": 0.000396694346213777, + "loss": 0.86305249, + "num_input_tokens_seen": 251325344, + "router_z_loss_mlp": 0.31176758, + "step": 3011, + "time_per_iteration": 2.613032817840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109368, + "balance_loss_mlp": 1.06492627, + "epoch": 0.5794536360138515, + "flos": 876557915136.0, + "grad_norm": 0.05937601459412264, + "language_loss": 0.83947617, + "learning_rate": 0.0003963895460934276, + "loss": 0.85041296, + "num_input_tokens_seen": 251406656, + "router_z_loss_mlp": 0.28735352, + "step": 3012, + "time_per_iteration": 3.124514102935791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091421, + "balance_loss_mlp": 1.05992579, + "epoch": 0.5796460176991151, + "flos": 401436311040.0, + "grad_norm": 0.07020347624432877, + "language_loss": 0.8493948, + "learning_rate": 0.00039608478619822376, + "loss": 0.86030906, + "num_input_tokens_seen": 251467760, + "router_z_loss_mlp": 0.31494141, + "step": 3013, + "time_per_iteration": 2.411346912384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084005, + "balance_loss_mlp": 1.05544281, + "epoch": 0.5798383993843786, + "flos": 618517721088.0, + "grad_norm": 0.05715104374994747, + "language_loss": 0.826662, + "learning_rate": 0.00039578006664648394, + "loss": 0.83750206, + "num_input_tokens_seen": 251542272, + "router_z_loss_mlp": 0.28564453, + "step": 3014, + "time_per_iteration": 2.7363553047180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082695, + "balance_loss_mlp": 1.05310702, + "epoch": 0.5800307810696421, + "flos": 844331019264.0, + "grad_norm": 0.06684904609650524, + "language_loss": 0.81588256, + "learning_rate": 0.0003954753875565105, + "loss": 0.82670951, + "num_input_tokens_seen": 251625584, + "router_z_loss_mlp": 0.2956543, + "step": 3015, + "time_per_iteration": 3.089769124984741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107747, + "balance_loss_mlp": 1.04890752, + "epoch": 0.5802231627549057, + "flos": 569276729856.0, + "grad_norm": 0.06478579772376787, + "language_loss": 0.82758343, + "learning_rate": 0.00039517074904659057, + "loss": 0.83835804, + "num_input_tokens_seen": 251696704, + "router_z_loss_mlp": 0.28564453, + "step": 3016, + "time_per_iteration": 2.7099101543426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084798, + "balance_loss_mlp": 1.05454302, + "epoch": 0.5804155444401693, + "flos": 660459930624.0, + "grad_norm": 0.05410468367994604, + "language_loss": 0.84939837, + "learning_rate": 0.00039486615123499535, + "loss": 0.8602463, + "num_input_tokens_seen": 251774784, + "router_z_loss_mlp": 0.30224609, + "step": 3017, + "time_per_iteration": 2.8504526615142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085654, + "balance_loss_mlp": 1.05532694, + "epoch": 0.5806079261254329, + "flos": 513992024064.0, + "grad_norm": 0.05526317953318916, + "language_loss": 0.85137427, + "learning_rate": 0.00039456159423997996, + "loss": 0.86223084, + "num_input_tokens_seen": 251844768, + "router_z_loss_mlp": 0.30297852, + "step": 3018, + "time_per_iteration": 2.633484363555908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082353, + "balance_loss_mlp": 1.0523833, + "epoch": 0.5808003078106965, + "flos": 528646487040.0, + "grad_norm": 0.07104600615119407, + "language_loss": 0.8999185, + "learning_rate": 0.00039425707817978406, + "loss": 0.91074204, + "num_input_tokens_seen": 251912736, + "router_z_loss_mlp": 0.29956055, + "step": 3019, + "time_per_iteration": 2.6299033164978027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082814, + "balance_loss_mlp": 1.05241609, + "epoch": 0.58099268949596, + "flos": 477028611072.0, + "grad_norm": 0.05724387536038855, + "language_loss": 0.83951199, + "learning_rate": 0.00039395260317263124, + "loss": 0.85034013, + "num_input_tokens_seen": 251979328, + "router_z_loss_mlp": 0.30395508, + "step": 3020, + "time_per_iteration": 2.5456759929656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080319, + "balance_loss_mlp": 1.04996824, + "epoch": 0.5811850711812235, + "flos": 517609294848.0, + "grad_norm": 0.07612516842687451, + "language_loss": 0.85048491, + "learning_rate": 0.0003936481693367291, + "loss": 0.86128807, + "num_input_tokens_seen": 252050928, + "router_z_loss_mlp": 0.3034668, + "step": 3021, + "time_per_iteration": 2.7192864418029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094567, + "balance_loss_mlp": 1.06259549, + "epoch": 0.5813774528664871, + "flos": 616422389760.0, + "grad_norm": 0.08707963459833061, + "language_loss": 0.882092, + "learning_rate": 0.0003933437767902697, + "loss": 0.89303768, + "num_input_tokens_seen": 252126496, + "router_z_loss_mlp": 0.31958008, + "step": 3022, + "time_per_iteration": 2.7938294410705566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088206, + "balance_loss_mlp": 1.05792677, + "epoch": 0.5815698345517507, + "flos": 567475435008.0, + "grad_norm": 0.07541432505918821, + "language_loss": 0.7834546, + "learning_rate": 0.00039303942565142825, + "loss": 0.79433668, + "num_input_tokens_seen": 252203008, + "router_z_loss_mlp": 0.30249023, + "step": 3023, + "time_per_iteration": 2.7417471408843994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091198, + "balance_loss_mlp": 1.06089532, + "epoch": 0.5817622162370142, + "flos": 563168775168.0, + "grad_norm": 0.05482425315239383, + "language_loss": 0.76731157, + "learning_rate": 0.0003927351160383644, + "loss": 0.77822357, + "num_input_tokens_seen": 252283440, + "router_z_loss_mlp": 0.30249023, + "step": 3024, + "time_per_iteration": 2.804474353790283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091546, + "balance_loss_mlp": 1.06193483, + "epoch": 0.5819545979222778, + "flos": 459216470016.0, + "grad_norm": 0.05202928961884776, + "language_loss": 0.77983212, + "learning_rate": 0.000392430848069222, + "loss": 0.79074758, + "num_input_tokens_seen": 252351760, + "router_z_loss_mlp": 0.29589844, + "step": 3025, + "time_per_iteration": 2.530200958251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097141, + "balance_loss_mlp": 1.06814933, + "epoch": 0.5821469796075414, + "flos": 541475062272.0, + "grad_norm": 0.058580785743037773, + "language_loss": 0.82503867, + "learning_rate": 0.00039212662186212795, + "loss": 0.8360101, + "num_input_tokens_seen": 252418480, + "router_z_loss_mlp": 0.28979492, + "step": 3026, + "time_per_iteration": 2.592423677444458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094902, + "balance_loss_mlp": 1.06676841, + "epoch": 0.582339361292805, + "flos": 552262634496.0, + "grad_norm": 0.04855017878997747, + "language_loss": 0.7719928, + "learning_rate": 0.0003918224375351934, + "loss": 0.78294182, + "num_input_tokens_seen": 252493712, + "router_z_loss_mlp": 0.28149414, + "step": 3027, + "time_per_iteration": 2.7347710132598877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101546, + "balance_loss_mlp": 1.0716958, + "epoch": 0.5825317429780685, + "flos": 496399767552.0, + "grad_norm": 0.05175541468331668, + "language_loss": 0.7881335, + "learning_rate": 0.0003915182952065135, + "loss": 0.79914892, + "num_input_tokens_seen": 252566096, + "router_z_loss_mlp": 0.29858398, + "step": 3028, + "time_per_iteration": 2.698678493499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105793, + "balance_loss_mlp": 1.07684946, + "epoch": 0.582724124663332, + "flos": 564162112512.0, + "grad_norm": 0.051679899573834884, + "language_loss": 0.87814313, + "learning_rate": 0.0003912141949941664, + "loss": 0.88920105, + "num_input_tokens_seen": 252639424, + "router_z_loss_mlp": 0.2890625, + "step": 3029, + "time_per_iteration": 2.703824520111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107968, + "balance_loss_mlp": 1.07675922, + "epoch": 0.5829165063485956, + "flos": 492132754944.0, + "grad_norm": 0.07311487113166662, + "language_loss": 0.82985795, + "learning_rate": 0.0003909101370162143, + "loss": 0.84093761, + "num_input_tokens_seen": 252706672, + "router_z_loss_mlp": 0.31201172, + "step": 3030, + "time_per_iteration": 2.601590633392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101355, + "balance_loss_mlp": 1.00611103, + "epoch": 0.5831088880338592, + "flos": 1528880997888.0, + "grad_norm": 0.01566462127280147, + "language_loss": 0.72433889, + "learning_rate": 0.00039060612139070326, + "loss": 0.73447442, + "num_input_tokens_seen": 252932464, + "router_z_loss_mlp": 0.07421875, + "step": 3031, + "time_per_iteration": 4.907916307449341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103812, + "balance_loss_mlp": 1.07403314, + "epoch": 0.5833012697191228, + "flos": 618011140608.0, + "grad_norm": 0.05748921462157389, + "language_loss": 0.8307178, + "learning_rate": 0.0003903021482356622, + "loss": 0.84175599, + "num_input_tokens_seen": 253011920, + "router_z_loss_mlp": 0.29760742, + "step": 3032, + "time_per_iteration": 2.8251240253448486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104484, + "balance_loss_mlp": 1.07525432, + "epoch": 0.5834936514043862, + "flos": 767920849920.0, + "grad_norm": 0.054780146703337314, + "language_loss": 0.82722723, + "learning_rate": 0.00038999821766910465, + "loss": 0.83827209, + "num_input_tokens_seen": 253091552, + "router_z_loss_mlp": 0.29248047, + "step": 3033, + "time_per_iteration": 2.9882729053497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108478, + "balance_loss_mlp": 1.07996285, + "epoch": 0.5836860330896498, + "flos": 458371436544.0, + "grad_norm": 0.08031037628307693, + "language_loss": 0.86154497, + "learning_rate": 0.00038969432980902606, + "loss": 0.87262976, + "num_input_tokens_seen": 253158608, + "router_z_loss_mlp": 0.28540039, + "step": 3034, + "time_per_iteration": 2.597313642501831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018692, + "balance_loss_mlp": 1.01149189, + "epoch": 0.5838784147749134, + "flos": 1361225585664.0, + "grad_norm": 0.013503469394203483, + "language_loss": 0.79784501, + "learning_rate": 0.0003893904847734068, + "loss": 0.80803192, + "num_input_tokens_seen": 253381184, + "router_z_loss_mlp": 0.07177734, + "step": 3035, + "time_per_iteration": 4.801652669906616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113026, + "balance_loss_mlp": 1.08374798, + "epoch": 0.584070796460177, + "flos": 567211133952.0, + "grad_norm": 0.0646542819028206, + "language_loss": 0.82506442, + "learning_rate": 0.00038908668268020953, + "loss": 0.83619463, + "num_input_tokens_seen": 253452880, + "router_z_loss_mlp": 0.29223633, + "step": 3036, + "time_per_iteration": 2.6857457160949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112518, + "balance_loss_mlp": 1.08381224, + "epoch": 0.5842631781454406, + "flos": 611483240448.0, + "grad_norm": 0.21422512196310703, + "language_loss": 0.85166728, + "learning_rate": 0.00038878292364738097, + "loss": 0.86279243, + "num_input_tokens_seen": 253530000, + "router_z_loss_mlp": 0.28662109, + "step": 3037, + "time_per_iteration": 2.776686191558838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106641, + "balance_loss_mlp": 1.07726789, + "epoch": 0.5844555598307041, + "flos": 463384737792.0, + "grad_norm": 0.0719771880124652, + "language_loss": 0.87355781, + "learning_rate": 0.0003884792077928508, + "loss": 0.88462424, + "num_input_tokens_seen": 253593504, + "router_z_loss_mlp": 0.29345703, + "step": 3038, + "time_per_iteration": 2.5682616233825684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102228, + "balance_loss_mlp": 1.07304573, + "epoch": 0.5846479415159677, + "flos": 410215186944.0, + "grad_norm": 0.06153670645771429, + "language_loss": 0.7661767, + "learning_rate": 0.0003881755352345322, + "loss": 0.77719897, + "num_input_tokens_seen": 253657904, + "router_z_loss_mlp": 0.29174805, + "step": 3039, + "time_per_iteration": 2.5531814098358154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104927, + "balance_loss_mlp": 1.07560194, + "epoch": 0.5848403232012312, + "flos": 491297633280.0, + "grad_norm": 0.05739173880603102, + "language_loss": 0.86896229, + "learning_rate": 0.0003878719060903207, + "loss": 0.88001162, + "num_input_tokens_seen": 253725280, + "router_z_loss_mlp": 0.29296875, + "step": 3040, + "time_per_iteration": 2.593386650085449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098868, + "balance_loss_mlp": 1.06908977, + "epoch": 0.5850327048864948, + "flos": 584417949696.0, + "grad_norm": 0.068924296543817, + "language_loss": 0.84256113, + "learning_rate": 0.0003875683204780961, + "loss": 0.85354984, + "num_input_tokens_seen": 253795040, + "router_z_loss_mlp": 0.29785156, + "step": 3041, + "time_per_iteration": 2.6921916007995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100828, + "balance_loss_mlp": 1.07145464, + "epoch": 0.5852250865717584, + "flos": 651545233920.0, + "grad_norm": 0.07404975426077917, + "language_loss": 0.85055083, + "learning_rate": 0.00038726477851572043, + "loss": 0.86155903, + "num_input_tokens_seen": 253866384, + "router_z_loss_mlp": 0.29394531, + "step": 3042, + "time_per_iteration": 2.76772403717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090937, + "balance_loss_mlp": 1.06249356, + "epoch": 0.5854174682570219, + "flos": 534588885504.0, + "grad_norm": 0.06423863125550561, + "language_loss": 0.80573255, + "learning_rate": 0.0003869612803210395, + "loss": 0.81664193, + "num_input_tokens_seen": 253935712, + "router_z_loss_mlp": 0.28442383, + "step": 3043, + "time_per_iteration": 2.6271820068359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092493, + "balance_loss_mlp": 1.06314421, + "epoch": 0.5856098499422855, + "flos": 509752175616.0, + "grad_norm": 0.07232729129784332, + "language_loss": 0.83455092, + "learning_rate": 0.0003866578260118817, + "loss": 0.84547591, + "num_input_tokens_seen": 254003152, + "router_z_loss_mlp": 0.29345703, + "step": 3044, + "time_per_iteration": 2.583698272705078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084561, + "balance_loss_mlp": 1.05616593, + "epoch": 0.5858022316275491, + "flos": 593893555200.0, + "grad_norm": 0.059856611418728146, + "language_loss": 0.83175647, + "learning_rate": 0.0003863544157060581, + "loss": 0.84260201, + "num_input_tokens_seen": 254072816, + "router_z_loss_mlp": 0.28369141, + "step": 3045, + "time_per_iteration": 2.656282663345337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090685, + "balance_loss_mlp": 1.06166923, + "epoch": 0.5859946133128127, + "flos": 559126416384.0, + "grad_norm": 0.05199684229497746, + "language_loss": 0.82254589, + "learning_rate": 0.0003860510495213634, + "loss": 0.8334527, + "num_input_tokens_seen": 254152800, + "router_z_loss_mlp": 0.28979492, + "step": 3046, + "time_per_iteration": 2.7998342514038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090034, + "balance_loss_mlp": 1.05946922, + "epoch": 0.5861869949980761, + "flos": 553695740928.0, + "grad_norm": 0.08208062967584176, + "language_loss": 0.78349328, + "learning_rate": 0.0003857477275755746, + "loss": 0.7943936, + "num_input_tokens_seen": 254224384, + "router_z_loss_mlp": 0.30517578, + "step": 3047, + "time_per_iteration": 2.6120448112487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088733, + "balance_loss_mlp": 1.05940795, + "epoch": 0.5863793766833397, + "flos": 718667375616.0, + "grad_norm": 0.0525859268526321, + "language_loss": 0.83988523, + "learning_rate": 0.00038544444998645167, + "loss": 0.8507725, + "num_input_tokens_seen": 254310960, + "router_z_loss_mlp": 0.29296875, + "step": 3048, + "time_per_iteration": 2.9847609996795654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085173, + "balance_loss_mlp": 1.0563724, + "epoch": 0.5865717583686033, + "flos": 472289522688.0, + "grad_norm": 0.06739730522499447, + "language_loss": 0.82059789, + "learning_rate": 0.00038514121687173767, + "loss": 0.83144969, + "num_input_tokens_seen": 254378336, + "router_z_loss_mlp": 0.28808594, + "step": 3049, + "time_per_iteration": 2.619170904159546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081984, + "balance_loss_mlp": 1.0529443, + "epoch": 0.5867641400538669, + "flos": 813482901504.0, + "grad_norm": 0.07072588382777995, + "language_loss": 0.82076973, + "learning_rate": 0.00038483802834915807, + "loss": 0.83158958, + "num_input_tokens_seen": 254454352, + "router_z_loss_mlp": 0.29003906, + "step": 3050, + "time_per_iteration": 2.9947521686553955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076603, + "balance_loss_mlp": 1.04742062, + "epoch": 0.5869565217391305, + "flos": 486531380736.0, + "grad_norm": 0.0556694240307722, + "language_loss": 0.7980268, + "learning_rate": 0.00038453488453642074, + "loss": 0.80879277, + "num_input_tokens_seen": 254526352, + "router_z_loss_mlp": 0.29174805, + "step": 3051, + "time_per_iteration": 2.659647226333618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081993, + "balance_loss_mlp": 1.05133235, + "epoch": 0.587148903424394, + "flos": 569385386496.0, + "grad_norm": 0.055022006168623364, + "language_loss": 0.8682425, + "learning_rate": 0.00038423178555121697, + "loss": 0.87906241, + "num_input_tokens_seen": 254598720, + "router_z_loss_mlp": 0.30664062, + "step": 3052, + "time_per_iteration": 2.682971954345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078016, + "balance_loss_mlp": 1.0489769, + "epoch": 0.5873412851096576, + "flos": 747296824320.0, + "grad_norm": 0.05776598371070369, + "language_loss": 0.85701603, + "learning_rate": 0.00038392873151121994, + "loss": 0.86779618, + "num_input_tokens_seen": 254683664, + "router_z_loss_mlp": 0.29052734, + "step": 3053, + "time_per_iteration": 3.060055732727051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077537, + "balance_loss_mlp": 1.04883146, + "epoch": 0.5875336667949211, + "flos": 528142477824.0, + "grad_norm": 0.06401371867882108, + "language_loss": 0.83262593, + "learning_rate": 0.0003836257225340859, + "loss": 0.84340131, + "num_input_tokens_seen": 254754688, + "router_z_loss_mlp": 0.28686523, + "step": 3054, + "time_per_iteration": 2.680649995803833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079048, + "balance_loss_mlp": 1.04853082, + "epoch": 0.5877260484801847, + "flos": 824166586368.0, + "grad_norm": 0.058869654242756926, + "language_loss": 0.82344568, + "learning_rate": 0.00038332275873745336, + "loss": 0.83423615, + "num_input_tokens_seen": 254838976, + "router_z_loss_mlp": 0.3046875, + "step": 3055, + "time_per_iteration": 3.036266565322876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108387, + "balance_loss_mlp": 1.05485463, + "epoch": 0.5879184301654482, + "flos": 591598162944.0, + "grad_norm": 0.05256953045681507, + "language_loss": 0.83349717, + "learning_rate": 0.0003830198402389431, + "loss": 0.84433585, + "num_input_tokens_seen": 254912912, + "router_z_loss_mlp": 0.2902832, + "step": 3056, + "time_per_iteration": 2.68835711479187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069074, + "balance_loss_mlp": 1.06163549, + "epoch": 0.5881108118507118, + "flos": 1545805513728.0, + "grad_norm": 0.04626706953255302, + "language_loss": 0.77348936, + "learning_rate": 0.0003827169671561585, + "loss": 0.78418016, + "num_input_tokens_seen": 255151488, + "router_z_loss_mlp": 0.07421875, + "step": 3057, + "time_per_iteration": 4.978636026382446 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082198, + "balance_loss_mlp": 1.05349255, + "epoch": 0.5883031935359754, + "flos": 489597654528.0, + "grad_norm": 0.07448060145489646, + "language_loss": 0.83136308, + "learning_rate": 0.0003824141396066855, + "loss": 0.84218502, + "num_input_tokens_seen": 255218896, + "router_z_loss_mlp": 0.28710938, + "step": 3058, + "time_per_iteration": 2.5531108379364014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088619, + "balance_loss_mlp": 1.05910254, + "epoch": 0.588495575221239, + "flos": 582836539392.0, + "grad_norm": 0.059082946906010764, + "language_loss": 0.82999164, + "learning_rate": 0.000382111357708092, + "loss": 0.84087777, + "num_input_tokens_seen": 255287408, + "router_z_loss_mlp": 0.29541016, + "step": 3059, + "time_per_iteration": 2.699920654296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088385, + "balance_loss_mlp": 1.05917883, + "epoch": 0.5886879569065026, + "flos": 661048003584.0, + "grad_norm": 0.071653907002528, + "language_loss": 0.84021831, + "learning_rate": 0.00038180862157792864, + "loss": 0.85110211, + "num_input_tokens_seen": 255358432, + "router_z_loss_mlp": 0.29174805, + "step": 3060, + "time_per_iteration": 2.8073549270629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084579, + "balance_loss_mlp": 1.05642152, + "epoch": 0.588880338591766, + "flos": 562657425408.0, + "grad_norm": 0.05679216094879844, + "language_loss": 0.82328987, + "learning_rate": 0.0003815059313337279, + "loss": 0.83413565, + "num_input_tokens_seen": 255425744, + "router_z_loss_mlp": 0.28198242, + "step": 3061, + "time_per_iteration": 2.6649534702301025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086963, + "balance_loss_mlp": 1.05906773, + "epoch": 0.5890727202770296, + "flos": 554730923520.0, + "grad_norm": 0.07322136366051005, + "language_loss": 0.78155029, + "learning_rate": 0.00038120328709300436, + "loss": 0.79241997, + "num_input_tokens_seen": 255505808, + "router_z_loss_mlp": 0.27905273, + "step": 3062, + "time_per_iteration": 2.9070422649383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091769, + "balance_loss_mlp": 1.06191885, + "epoch": 0.5892651019622932, + "flos": 655520781312.0, + "grad_norm": 0.07246450050077374, + "language_loss": 0.83913672, + "learning_rate": 0.0003809006889732549, + "loss": 0.85005438, + "num_input_tokens_seen": 255580160, + "router_z_loss_mlp": 0.29833984, + "step": 3063, + "time_per_iteration": 2.803724527359009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092624, + "balance_loss_mlp": 1.06420445, + "epoch": 0.5894574836475568, + "flos": 453202490880.0, + "grad_norm": 0.05969034427320992, + "language_loss": 0.88370293, + "learning_rate": 0.0003805981370919589, + "loss": 0.89462918, + "num_input_tokens_seen": 255644016, + "router_z_loss_mlp": 0.28442383, + "step": 3064, + "time_per_iteration": 2.495248556137085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086784, + "balance_loss_mlp": 1.05877018, + "epoch": 0.5896498653328203, + "flos": 519032489472.0, + "grad_norm": 0.05081424319280643, + "language_loss": 0.83982229, + "learning_rate": 0.0003802956315665771, + "loss": 0.85069013, + "num_input_tokens_seen": 255718192, + "router_z_loss_mlp": 0.28027344, + "step": 3065, + "time_per_iteration": 2.6511592864990234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091365, + "balance_loss_mlp": 1.06182539, + "epoch": 0.5898422470180839, + "flos": 549050628096.0, + "grad_norm": 0.06728201091458674, + "language_loss": 0.81791949, + "learning_rate": 0.0003799931725145529, + "loss": 0.8288331, + "num_input_tokens_seen": 255787696, + "router_z_loss_mlp": 0.29516602, + "step": 3066, + "time_per_iteration": 2.6066951751708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095665, + "balance_loss_mlp": 1.06729341, + "epoch": 0.5900346287033474, + "flos": 524312663040.0, + "grad_norm": 0.05193283223246739, + "language_loss": 0.86020327, + "learning_rate": 0.00037969076005331083, + "loss": 0.87115991, + "num_input_tokens_seen": 255862992, + "router_z_loss_mlp": 0.28369141, + "step": 3067, + "time_per_iteration": 2.763853073120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096344, + "balance_loss_mlp": 1.06713736, + "epoch": 0.590227010388611, + "flos": 567156805632.0, + "grad_norm": 0.05663918686290471, + "language_loss": 0.88129491, + "learning_rate": 0.00037938839430025817, + "loss": 0.89225829, + "num_input_tokens_seen": 255931872, + "router_z_loss_mlp": 0.29248047, + "step": 3068, + "time_per_iteration": 2.6258280277252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089417, + "balance_loss_mlp": 1.06092644, + "epoch": 0.5904193920738746, + "flos": 583333208064.0, + "grad_norm": 0.05275324094783275, + "language_loss": 0.85889924, + "learning_rate": 0.0003790860753727835, + "loss": 0.86979342, + "num_input_tokens_seen": 256004656, + "router_z_loss_mlp": 0.28491211, + "step": 3069, + "time_per_iteration": 2.7926387786865234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086892, + "balance_loss_mlp": 1.05799568, + "epoch": 0.5906117737591381, + "flos": 529701493248.0, + "grad_norm": 0.0573953914976859, + "language_loss": 0.8280952, + "learning_rate": 0.00037878380338825766, + "loss": 0.83896416, + "num_input_tokens_seen": 256076944, + "router_z_loss_mlp": 0.28881836, + "step": 3070, + "time_per_iteration": 2.6791534423828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089394, + "balance_loss_mlp": 1.06209493, + "epoch": 0.5908041554444017, + "flos": 684229151232.0, + "grad_norm": 0.054269754776710775, + "language_loss": 0.81082213, + "learning_rate": 0.00037848157846403287, + "loss": 0.82171613, + "num_input_tokens_seen": 256154768, + "router_z_loss_mlp": 0.2734375, + "step": 3071, + "time_per_iteration": 2.897139549255371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095364, + "balance_loss_mlp": 1.06792235, + "epoch": 0.5909965371296653, + "flos": 550001746944.0, + "grad_norm": 0.0725138562855444, + "language_loss": 0.83259237, + "learning_rate": 0.0003781794007174435, + "loss": 0.84354603, + "num_input_tokens_seen": 256230896, + "router_z_loss_mlp": 0.2746582, + "step": 3072, + "time_per_iteration": 2.724810838699341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0103656, + "balance_loss_mlp": 1.02988398, + "epoch": 0.5911889188149289, + "flos": 1492361750016.0, + "grad_norm": 0.01939748854391394, + "language_loss": 0.74074531, + "learning_rate": 0.0003778772702658051, + "loss": 0.75111091, + "num_input_tokens_seen": 256462336, + "router_z_loss_mlp": 0.06689453, + "step": 3073, + "time_per_iteration": 4.9330198764801025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090512, + "balance_loss_mlp": 1.06285512, + "epoch": 0.5913813005001923, + "flos": 487880423424.0, + "grad_norm": 0.048822002095482486, + "language_loss": 0.81208611, + "learning_rate": 0.0003775751872264152, + "loss": 0.82299125, + "num_input_tokens_seen": 256539376, + "router_z_loss_mlp": 0.27661133, + "step": 3074, + "time_per_iteration": 2.7631497383117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084721, + "balance_loss_mlp": 1.05599189, + "epoch": 0.5915736821854559, + "flos": 573331198464.0, + "grad_norm": 0.06348444489710649, + "language_loss": 0.86787391, + "learning_rate": 0.0003772731517165527, + "loss": 0.87872112, + "num_input_tokens_seen": 256617728, + "router_z_loss_mlp": 0.28710938, + "step": 3075, + "time_per_iteration": 2.7517099380493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089134, + "balance_loss_mlp": 1.06069052, + "epoch": 0.5917660638707195, + "flos": 789518389248.0, + "grad_norm": 0.059695821747375526, + "language_loss": 0.83545357, + "learning_rate": 0.0003769711638534784, + "loss": 0.84634489, + "num_input_tokens_seen": 256696032, + "router_z_loss_mlp": 0.28466797, + "step": 3076, + "time_per_iteration": 2.9352333545684814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090964, + "balance_loss_mlp": 1.06209183, + "epoch": 0.5919584455559831, + "flos": 528740462592.0, + "grad_norm": 0.08879190082108672, + "language_loss": 0.79118001, + "learning_rate": 0.00037666922375443446, + "loss": 0.80208963, + "num_input_tokens_seen": 256767360, + "router_z_loss_mlp": 0.28857422, + "step": 3077, + "time_per_iteration": 2.5947184562683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093967, + "balance_loss_mlp": 1.06578577, + "epoch": 0.5921508272412467, + "flos": 560606510592.0, + "grad_norm": 0.06374349472109522, + "language_loss": 0.81828058, + "learning_rate": 0.00037636733153664396, + "loss": 0.82922018, + "num_input_tokens_seen": 256844848, + "router_z_loss_mlp": 0.28149414, + "step": 3078, + "time_per_iteration": 2.8191051483154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109303, + "balance_loss_mlp": 1.0645864, + "epoch": 0.5923432089265102, + "flos": 563272662528.0, + "grad_norm": 0.06406278721713668, + "language_loss": 0.80298102, + "learning_rate": 0.0003760654873173124, + "loss": 0.81391132, + "num_input_tokens_seen": 256916688, + "router_z_loss_mlp": 0.28466797, + "step": 3079, + "time_per_iteration": 2.656822919845581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089541, + "balance_loss_mlp": 1.06081128, + "epoch": 0.5925355906117737, + "flos": 495740113920.0, + "grad_norm": 0.04854482848269962, + "language_loss": 0.82530022, + "learning_rate": 0.00037576369121362566, + "loss": 0.83619559, + "num_input_tokens_seen": 256985520, + "router_z_loss_mlp": 0.28759766, + "step": 3080, + "time_per_iteration": 2.589050531387329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097703, + "balance_loss_mlp": 1.06840181, + "epoch": 0.5927279722970373, + "flos": 566249730048.0, + "grad_norm": 0.05673956944694001, + "language_loss": 0.82090509, + "learning_rate": 0.0003754619433427516, + "loss": 0.83188212, + "num_input_tokens_seen": 257067552, + "router_z_loss_mlp": 0.29272461, + "step": 3081, + "time_per_iteration": 2.8826987743377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086639, + "balance_loss_mlp": 1.05845797, + "epoch": 0.5929203539823009, + "flos": 666970578432.0, + "grad_norm": 0.06493823771045844, + "language_loss": 0.78039849, + "learning_rate": 0.0003751602438218392, + "loss": 0.79126489, + "num_input_tokens_seen": 257138896, + "router_z_loss_mlp": 0.28222656, + "step": 3082, + "time_per_iteration": 2.815852642059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087731, + "balance_loss_mlp": 1.05952644, + "epoch": 0.5931127356675644, + "flos": 555744084480.0, + "grad_norm": 0.08102695368832301, + "language_loss": 0.83818078, + "learning_rate": 0.0003748585927680186, + "loss": 0.84905803, + "num_input_tokens_seen": 257210592, + "router_z_loss_mlp": 0.28198242, + "step": 3083, + "time_per_iteration": 2.6566061973571777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084121, + "balance_loss_mlp": 1.05651248, + "epoch": 0.593305117352828, + "flos": 535194210816.0, + "grad_norm": 0.0619003043193751, + "language_loss": 0.8314001, + "learning_rate": 0.00037455699029840086, + "loss": 0.84224129, + "num_input_tokens_seen": 257276208, + "router_z_loss_mlp": 0.27612305, + "step": 3084, + "time_per_iteration": 2.609382152557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081588, + "balance_loss_mlp": 1.05436099, + "epoch": 0.5934974990380916, + "flos": 593957795328.0, + "grad_norm": 0.05433571826648474, + "language_loss": 0.84684891, + "learning_rate": 0.0003742554365300787, + "loss": 0.85766476, + "num_input_tokens_seen": 257351920, + "router_z_loss_mlp": 0.27270508, + "step": 3085, + "time_per_iteration": 2.725409746170044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086595, + "balance_loss_mlp": 1.05927253, + "epoch": 0.5936898807233552, + "flos": 712673220096.0, + "grad_norm": 0.05832989485618193, + "language_loss": 0.79031849, + "learning_rate": 0.0003739539315801255, + "loss": 0.80118442, + "num_input_tokens_seen": 257430016, + "router_z_loss_mlp": 0.27331543, + "step": 3086, + "time_per_iteration": 2.9751360416412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092425, + "balance_loss_mlp": 1.06493533, + "epoch": 0.5938822624086187, + "flos": 391896465408.0, + "grad_norm": 0.05988774460659005, + "language_loss": 0.9182803, + "learning_rate": 0.000373652475565596, + "loss": 0.92920458, + "num_input_tokens_seen": 257492224, + "router_z_loss_mlp": 0.27490234, + "step": 3087, + "time_per_iteration": 2.535181999206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090266, + "balance_loss_mlp": 1.06144142, + "epoch": 0.5940746440938822, + "flos": 480285033984.0, + "grad_norm": 0.07303028521355714, + "language_loss": 0.81608456, + "learning_rate": 0.00037335106860352587, + "loss": 0.82698727, + "num_input_tokens_seen": 257567824, + "router_z_loss_mlp": 0.28808594, + "step": 3088, + "time_per_iteration": 2.6671407222747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094636, + "balance_loss_mlp": 1.06545377, + "epoch": 0.5942670257791458, + "flos": 483336626688.0, + "grad_norm": 0.0577260245362681, + "language_loss": 0.83174306, + "learning_rate": 0.00037304971081093146, + "loss": 0.84268945, + "num_input_tokens_seen": 257635488, + "router_z_loss_mlp": 0.29199219, + "step": 3089, + "time_per_iteration": 2.5568172931671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091399, + "balance_loss_mlp": 1.06479192, + "epoch": 0.5944594074644094, + "flos": 547936151040.0, + "grad_norm": 0.05440667028717182, + "language_loss": 0.80792761, + "learning_rate": 0.00037274840230481024, + "loss": 0.81884158, + "num_input_tokens_seen": 257709552, + "router_z_loss_mlp": 0.26635742, + "step": 3090, + "time_per_iteration": 2.7040512561798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089877, + "balance_loss_mlp": 1.06152868, + "epoch": 0.594651789149673, + "flos": 449179955712.0, + "grad_norm": 0.07197994401815008, + "language_loss": 0.79483205, + "learning_rate": 0.00037244714320214077, + "loss": 0.80573082, + "num_input_tokens_seen": 257775520, + "router_z_loss_mlp": 0.28369141, + "step": 3091, + "time_per_iteration": 2.527803659439087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091312, + "balance_loss_mlp": 1.06317902, + "epoch": 0.5948441708349365, + "flos": 596267868672.0, + "grad_norm": 0.06270949928795992, + "language_loss": 0.83166003, + "learning_rate": 0.000372145933619882, + "loss": 0.84257317, + "num_input_tokens_seen": 257858560, + "router_z_loss_mlp": 0.28137207, + "step": 3092, + "time_per_iteration": 2.869267225265503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092404, + "balance_loss_mlp": 1.06455636, + "epoch": 0.5950365525202, + "flos": 548516883456.0, + "grad_norm": 0.059066436199884755, + "language_loss": 0.82841283, + "learning_rate": 0.000371844773674974, + "loss": 0.83933693, + "num_input_tokens_seen": 257928048, + "router_z_loss_mlp": 0.27856445, + "step": 3093, + "time_per_iteration": 2.6301257610321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097042, + "balance_loss_mlp": 1.06793106, + "epoch": 0.5952289342054636, + "flos": 654700340736.0, + "grad_norm": 0.06442112613973276, + "language_loss": 0.82118666, + "learning_rate": 0.0003715436634843375, + "loss": 0.83215708, + "num_input_tokens_seen": 258003088, + "router_z_loss_mlp": 0.29101562, + "step": 3094, + "time_per_iteration": 2.8569583892822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091347, + "balance_loss_mlp": 1.06466842, + "epoch": 0.5954213158907272, + "flos": 603364018176.0, + "grad_norm": 0.04641072419683149, + "language_loss": 0.80758119, + "learning_rate": 0.00037124260316487355, + "loss": 0.81849468, + "num_input_tokens_seen": 258084880, + "router_z_loss_mlp": 0.26708984, + "step": 3095, + "time_per_iteration": 2.8417470455169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095419, + "balance_loss_mlp": 1.06838274, + "epoch": 0.5956136975759908, + "flos": 486331319808.0, + "grad_norm": 0.05475651988922655, + "language_loss": 0.89790189, + "learning_rate": 0.0003709415928334643, + "loss": 0.90885603, + "num_input_tokens_seen": 258152032, + "router_z_loss_mlp": 0.27075195, + "step": 3096, + "time_per_iteration": 2.5519328117370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092318, + "balance_loss_mlp": 1.06382728, + "epoch": 0.5958060792612543, + "flos": 658777204224.0, + "grad_norm": 0.09831894239475095, + "language_loss": 0.80721879, + "learning_rate": 0.00037064063260697233, + "loss": 0.818142, + "num_input_tokens_seen": 258228896, + "router_z_loss_mlp": 0.28491211, + "step": 3097, + "time_per_iteration": 2.8612656593322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099184, + "balance_loss_mlp": 1.07157493, + "epoch": 0.5959984609465179, + "flos": 723559537152.0, + "grad_norm": 0.058836420710008684, + "language_loss": 0.78798771, + "learning_rate": 0.0003703397226022407, + "loss": 0.79897952, + "num_input_tokens_seen": 258311152, + "router_z_loss_mlp": 0.27612305, + "step": 3098, + "time_per_iteration": 3.069542169570923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039258, + "balance_loss_mlp": 1.03243947, + "epoch": 0.5961908426317815, + "flos": 1519849557504.0, + "grad_norm": 0.024027627375554906, + "language_loss": 0.75499874, + "learning_rate": 0.00037003886293609335, + "loss": 0.76539135, + "num_input_tokens_seen": 258540656, + "router_z_loss_mlp": 0.06835938, + "step": 3099, + "time_per_iteration": 4.940065860748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109756, + "balance_loss_mlp": 1.06966519, + "epoch": 0.596383224317045, + "flos": 532614693888.0, + "grad_norm": 0.059128365336986094, + "language_loss": 0.83247489, + "learning_rate": 0.0003697380537253339, + "loss": 0.84345049, + "num_input_tokens_seen": 258608960, + "router_z_loss_mlp": 0.27929688, + "step": 3100, + "time_per_iteration": 2.638352632522583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098347, + "balance_loss_mlp": 1.06973624, + "epoch": 0.5965756060023086, + "flos": 591210150912.0, + "grad_norm": 0.05513129923941457, + "language_loss": 0.82084006, + "learning_rate": 0.0003694372950867471, + "loss": 0.83182353, + "num_input_tokens_seen": 258684304, + "router_z_loss_mlp": 0.28637695, + "step": 3101, + "time_per_iteration": 2.7355875968933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101199, + "balance_loss_mlp": 1.07282722, + "epoch": 0.5967679876875721, + "flos": 862054327296.0, + "grad_norm": 0.05863829677079808, + "language_loss": 0.77766848, + "learning_rate": 0.0003691365871370976, + "loss": 0.78868043, + "num_input_tokens_seen": 258769472, + "router_z_loss_mlp": 0.28393555, + "step": 3102, + "time_per_iteration": 3.0227084159851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110316, + "balance_loss_mlp": 1.07533622, + "epoch": 0.5969603693728357, + "flos": 553834132992.0, + "grad_norm": 0.06404713166930852, + "language_loss": 0.85323572, + "learning_rate": 0.00036883592999313093, + "loss": 0.86426735, + "num_input_tokens_seen": 258841696, + "router_z_loss_mlp": 0.27832031, + "step": 3103, + "time_per_iteration": 2.659637689590454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097572, + "balance_loss_mlp": 1.0700587, + "epoch": 0.5971527510580993, + "flos": 718662606336.0, + "grad_norm": 0.05340010645713243, + "language_loss": 0.79008019, + "learning_rate": 0.0003685353237715722, + "loss": 0.80105591, + "num_input_tokens_seen": 258915616, + "router_z_loss_mlp": 0.27563477, + "step": 3104, + "time_per_iteration": 2.9019625186920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109062, + "balance_loss_mlp": 1.06272471, + "epoch": 0.5973451327433629, + "flos": 647631355392.0, + "grad_norm": 0.053396202956180965, + "language_loss": 0.81746447, + "learning_rate": 0.0003682347685891274, + "loss": 0.82837057, + "num_input_tokens_seen": 258994080, + "router_z_loss_mlp": 0.27893066, + "step": 3105, + "time_per_iteration": 2.8479247093200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093319, + "balance_loss_mlp": 1.06535256, + "epoch": 0.5975375144286263, + "flos": 721716397056.0, + "grad_norm": 0.061940030050424234, + "language_loss": 0.80626607, + "learning_rate": 0.0003679342645624822, + "loss": 0.81719923, + "num_input_tokens_seen": 259075968, + "router_z_loss_mlp": 0.2800293, + "step": 3106, + "time_per_iteration": 2.988600015640259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088181, + "balance_loss_mlp": 1.06088209, + "epoch": 0.5977298961138899, + "flos": 750961082880.0, + "grad_norm": 0.06552701347411696, + "language_loss": 0.82154477, + "learning_rate": 0.0003676338118083025, + "loss": 0.83242655, + "num_input_tokens_seen": 259162512, + "router_z_loss_mlp": 0.2734375, + "step": 3107, + "time_per_iteration": 3.0211057662963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091455, + "balance_loss_mlp": 1.06372714, + "epoch": 0.5979222777991535, + "flos": 530961702912.0, + "grad_norm": 0.05808577111452716, + "language_loss": 0.79585344, + "learning_rate": 0.0003673334104432347, + "loss": 0.806768, + "num_input_tokens_seen": 259228752, + "router_z_loss_mlp": 0.27758789, + "step": 3108, + "time_per_iteration": 2.6277918815612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109043, + "balance_loss_mlp": 1.06255877, + "epoch": 0.5981146594844171, + "flos": 621749551104.0, + "grad_norm": 0.05782699460566696, + "language_loss": 0.83817154, + "learning_rate": 0.0003670330605839048, + "loss": 0.84907585, + "num_input_tokens_seen": 259303440, + "router_z_loss_mlp": 0.27856445, + "step": 3109, + "time_per_iteration": 2.786181926727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094155, + "balance_loss_mlp": 1.06685627, + "epoch": 0.5983070411696807, + "flos": 603589045248.0, + "grad_norm": 0.06234839208499282, + "language_loss": 0.76878405, + "learning_rate": 0.0003667327623469191, + "loss": 0.77972555, + "num_input_tokens_seen": 259378752, + "router_z_loss_mlp": 0.27319336, + "step": 3110, + "time_per_iteration": 2.731876850128174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089583, + "balance_loss_mlp": 1.0621767, + "epoch": 0.5984994228549442, + "flos": 633483472896.0, + "grad_norm": 0.06451414709321307, + "language_loss": 0.78028917, + "learning_rate": 0.00036643251584886333, + "loss": 0.79118496, + "num_input_tokens_seen": 259454336, + "router_z_loss_mlp": 0.27429199, + "step": 3111, + "time_per_iteration": 2.796886682510376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088313, + "balance_loss_mlp": 1.06072783, + "epoch": 0.5986918045402078, + "flos": 525278836224.0, + "grad_norm": 0.06854484980093518, + "language_loss": 0.82222939, + "learning_rate": 0.00036613232120630393, + "loss": 0.83311254, + "num_input_tokens_seen": 259518960, + "router_z_loss_mlp": 0.27587891, + "step": 3112, + "time_per_iteration": 2.6065847873687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084107, + "balance_loss_mlp": 1.05594933, + "epoch": 0.5988841862254713, + "flos": 483180982272.0, + "grad_norm": 0.06819300023171558, + "language_loss": 0.80318254, + "learning_rate": 0.00036583217853578643, + "loss": 0.81402361, + "num_input_tokens_seen": 259584352, + "router_z_loss_mlp": 0.28173828, + "step": 3113, + "time_per_iteration": 2.5723838806152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088563, + "balance_loss_mlp": 1.06200337, + "epoch": 0.5990765679107349, + "flos": 1140149924352.0, + "grad_norm": 0.05495468357602656, + "language_loss": 0.77783948, + "learning_rate": 0.000365532087953837, + "loss": 0.78872508, + "num_input_tokens_seen": 259693152, + "router_z_loss_mlp": 0.26586914, + "step": 3114, + "time_per_iteration": 3.622190475463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081565, + "balance_loss_mlp": 1.05359864, + "epoch": 0.5992689495959984, + "flos": 516986717184.0, + "grad_norm": 0.07841874273871757, + "language_loss": 0.89431345, + "learning_rate": 0.00036523204957696065, + "loss": 0.90512908, + "num_input_tokens_seen": 259762048, + "router_z_loss_mlp": 0.27978516, + "step": 3115, + "time_per_iteration": 2.6414806842803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084485, + "balance_loss_mlp": 1.05627978, + "epoch": 0.599461331281262, + "flos": 744618562560.0, + "grad_norm": 0.0586823821525485, + "language_loss": 0.80958188, + "learning_rate": 0.00036493206352164324, + "loss": 0.8204267, + "num_input_tokens_seen": 259843184, + "router_z_loss_mlp": 0.28222656, + "step": 3116, + "time_per_iteration": 2.896613121032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080524, + "balance_loss_mlp": 1.05184269, + "epoch": 0.5996537129665256, + "flos": 592359132672.0, + "grad_norm": 0.05558165654665051, + "language_loss": 0.85426074, + "learning_rate": 0.000364632129904349, + "loss": 0.86506593, + "num_input_tokens_seen": 259912720, + "router_z_loss_mlp": 0.28662109, + "step": 3117, + "time_per_iteration": 2.7053070068359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079719, + "balance_loss_mlp": 1.05215788, + "epoch": 0.5998460946517892, + "flos": 559010419200.0, + "grad_norm": 0.05806752486487043, + "language_loss": 0.78326154, + "learning_rate": 0.00036433224884152283, + "loss": 0.79405868, + "num_input_tokens_seen": 259985472, + "router_z_loss_mlp": 0.27587891, + "step": 3118, + "time_per_iteration": 2.6854429244995117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083264, + "balance_loss_mlp": 1.0547967, + "epoch": 0.6000384763370528, + "flos": 484567100928.0, + "grad_norm": 0.06710995797512392, + "language_loss": 0.78089821, + "learning_rate": 0.00036403242044958875, + "loss": 0.79173082, + "num_input_tokens_seen": 260050336, + "router_z_loss_mlp": 0.28466797, + "step": 3119, + "time_per_iteration": 2.53751540184021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077727, + "balance_loss_mlp": 1.04949808, + "epoch": 0.6002308580223162, + "flos": 596767108608.0, + "grad_norm": 0.059219046094812676, + "language_loss": 0.91922826, + "learning_rate": 0.0003637326448449507, + "loss": 0.93000555, + "num_input_tokens_seen": 260120304, + "router_z_loss_mlp": 0.28222656, + "step": 3120, + "time_per_iteration": 2.7070553302764893 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 260120304, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7078069337325568.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/training_args.bin b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3b28f0633932ff84d8e0fde7beb2f9c59f0d04be --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b54b92ce31f27a60f5f91da41c22febbdc5fe6a9ac82c4d361c2b9dbc9096639 +size 7992 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/zero_to_fp32.py b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-3120/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/added_tokens.json b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/config.json b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/config.json new file mode 100644 index 0000000000000000000000000000000000000000..4477091c8e5e4d06ea14a8a918edb0ae2310c298 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_sigmoidgating", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/generation_config.json b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/global_step4160/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/global_step4160/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..efbcc20600b738105998bec61f910e4e4f950e67 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/global_step4160/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:feba4a94ca616fab47ee1b626afa991fb4ce7349aa7fabf090bd94f1be8e8248 +size 396582032 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/global_step4160/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/global_step4160/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a09d6380ac10ff0c1f9556a5383d01f8815bb796 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/global_step4160/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2392c987c0259470f44cfd973ff0e046063d8e4484fbdd5e646b0098180e4cb7 +size 396582032 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/global_step4160/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/global_step4160/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f60355257a2db2c47d83f96ded3c6ab3e7adb5a9 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/global_step4160/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1817ec67851740e8ac8760a388aabc5b0df08711e9361a6a084524f4ac4b997 +size 396582032 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/global_step4160/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/global_step4160/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f4eed5632d94790cb8a37e1ddf3f0baeb1fdf19 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/global_step4160/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:776a7610a96e4bcf68b062a90bd634ac4d0700f492675d079d2480d69823219f +size 396582032 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/global_step4160/zero_pp_rank_0_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/global_step4160/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8029b55d478c29f5cb24a5c66794099bcf00fb15 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/global_step4160/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4a854938dd002c13c7183da8dd010c42a96af98ea9b3274b180cbf922c6a10a +size 2117321544 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/global_step4160/zero_pp_rank_1_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/global_step4160/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e919de44b066c5af40f72433d5ade4edc675660 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/global_step4160/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e63c5eca7657b701aa8d5b09873db3f37b4429ad69332201e5725e4fb94bb175 +size 2117321544 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/global_step4160/zero_pp_rank_2_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/global_step4160/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..55d05330acb769aa13a816ad50dc0ba3a0c0224b --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/global_step4160/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e53ed5263f7da0274461d494493356e7bf680d759d4af590fbec3f86543631c +size 2117321544 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/global_step4160/zero_pp_rank_3_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/global_step4160/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..312b0091410e0b1c6b65634c1969a4459af0adc2 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/global_step4160/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:107f47f1e8cb6d44193343b0f13e0f2e9b8602c0566bcc2a4c3b93a6abc7aeea +size 2117321544 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/latest b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/latest new file mode 100644 index 0000000000000000000000000000000000000000..ae01dfd535e9ee314b565695c1d61230ecf4c494 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/latest @@ -0,0 +1 @@ +global_step4160 \ No newline at end of file diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/model-00001-of-00002.safetensors b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/model-00002-of-00002.safetensors b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a8d2638c0d310db56a9c22dac14103f4c5331b33 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c31cb21857e1804263e44d35a0cb109a8bdf05257d9a698e34aaf3a4f6983ab6 +size 3759025152 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/model.safetensors.index.json b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..2b3448fcaafe26e098595b9e2e5bd9e68d63ee24 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/model.safetensors.index.json @@ -0,0 +1,672 @@ +{ + "metadata": { + "total_size": 8731424736 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/rng_state_0.pth b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef4849062bcdc8ffd2246c07673ba196a8d61a6d --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fae2114fffe9b1eea30e28bbdb4ce59046b0079ea5b8dc4682079f609d49d787 +size 14960 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/rng_state_1.pth b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fcb2b640bc236c26aa841680d34a91240247970 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ff5f3a53530ac868291e2667c8f824bfa1f4fa1ce880df8223a7165ef38e11 +size 14960 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/rng_state_2.pth b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..00c3f989de00e6d58ca7345ae6f65fee0afcbdcd --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f80a7779b0034e70106ba6cb0e3e686052334c20ce54453ee3977cc0219d15 +size 14960 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/rng_state_3.pth b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f289913854ee3fa52a86e282421da07d85b8a4c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece3bc0d0e16c43ef245cc787cbd0d63d08d460f489c4cd52adf6501b9281a18 +size 14960 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/special_tokens_map.json b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/tokenizer.model b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/tokenizer_config.json b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/trainer_state.json b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8db1026e4c4e7b24ce5203892b5604630c801d20 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/trainer_state.json @@ -0,0 +1,62433 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8003078106964217, + "eval_steps": 500, + "global_step": 4160, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03935784, + "balance_loss_mlp": 2.84935808, + "epoch": 0.00019238168526356292, + "flos": 470575609344.0, + "grad_norm": 13.498251331228948, + "language_loss": 2.81572914, + "learning_rate": 0.0, + "loss": 1.90346789, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 10.859375, + "step": 1, + "time_per_iteration": 24.30480647087097 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0351246, + "balance_loss_mlp": 2.65644169, + "epoch": 0.00038476337052712584, + "flos": 504556065792.0, + "grad_norm": 27.482987886380492, + "language_loss": 8.76816368, + "learning_rate": 0.00013726078121135892, + "loss": 8.80328846, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 8.578125, + "step": 2, + "time_per_iteration": 2.6929261684417725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03513305, + "balance_loss_mlp": 2.65728736, + "epoch": 0.0005771450557906887, + "flos": 599161245696.0, + "grad_norm": 28.576563245741852, + "language_loss": 9.00053596, + "learning_rate": 0.00021755319103969496, + "loss": 9.03566933, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 8.578125, + "step": 3, + "time_per_iteration": 2.7945075035095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03667009, + "balance_loss_mlp": 2.78657675, + "epoch": 0.0007695267410542517, + "flos": 580405326336.0, + "grad_norm": 15.694146018083416, + "language_loss": 2.74122858, + "learning_rate": 0.00027452156242271784, + "loss": 2.77789879, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 8.828125, + "step": 4, + "time_per_iteration": 2.7773804664611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03933422, + "balance_loss_mlp": 3.01102829, + "epoch": 0.0009619084263178145, + "flos": 486116204544.0, + "grad_norm": 3.505338851882968, + "language_loss": 1.83478093, + "learning_rate": 0.0003187096642208417, + "loss": 1.87411511, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 9.2109375, + "step": 5, + "time_per_iteration": 2.651094675064087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04005588, + "balance_loss_mlp": 3.05420256, + "epoch": 0.0011542901115813775, + "flos": 560028349440.0, + "grad_norm": 3.050600048840319, + "language_loss": 1.61776543, + "learning_rate": 0.0003548139722510539, + "loss": 1.65782118, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 9.4921875, + "step": 6, + "time_per_iteration": 2.697614908218384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03708502, + "balance_loss_mlp": 2.7708497, + "epoch": 0.0013466717968449403, + "flos": 533966307840.0, + "grad_norm": 0.7974788691124679, + "language_loss": 1.32417345, + "learning_rate": 0.00038533972973918044, + "loss": 1.36125851, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 9.3515625, + "step": 7, + "time_per_iteration": 2.6407949924468994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0332405, + "balance_loss_mlp": 2.38868618, + "epoch": 0.0015390534821085034, + "flos": 492295739904.0, + "grad_norm": 0.7144720842381633, + "language_loss": 1.25956392, + "learning_rate": 0.0004117823436340768, + "loss": 1.29280448, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 9.3359375, + "step": 8, + "time_per_iteration": 2.6287930011749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02785454, + "balance_loss_mlp": 1.8508532, + "epoch": 0.0017314351673720662, + "flos": 564676033536.0, + "grad_norm": 0.3140255221758466, + "language_loss": 1.29993415, + "learning_rate": 0.00043510638207938993, + "loss": 1.32778871, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 9.3203125, + "step": 9, + "time_per_iteration": 2.8048858642578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0244685, + "balance_loss_mlp": 1.50004196, + "epoch": 0.001923816852635629, + "flos": 593406798336.0, + "grad_norm": 0.19799802642524775, + "language_loss": 1.19032216, + "learning_rate": 0.00045597044543220066, + "loss": 1.2147907, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 9.4453125, + "step": 10, + "time_per_iteration": 2.7669434547424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02310187, + "balance_loss_mlp": 1.35117221, + "epoch": 0.002116198537899192, + "flos": 609625046016.0, + "grad_norm": 0.14485632700798082, + "language_loss": 1.18421102, + "learning_rate": 0.00047484428652143135, + "loss": 1.20731282, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 9.5703125, + "step": 11, + "time_per_iteration": 2.9067423343658447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02309394, + "balance_loss_mlp": 1.33740926, + "epoch": 0.002308580223162755, + "flos": 545129409024.0, + "grad_norm": 0.1366980934684776, + "language_loss": 1.24379897, + "learning_rate": 0.0004920747534624128, + "loss": 1.26689291, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 9.703125, + "step": 12, + "time_per_iteration": 2.612813949584961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.022984, + "balance_loss_mlp": 1.32565212, + "epoch": 0.002500961908426318, + "flos": 644750461440.0, + "grad_norm": 0.11957957623458634, + "language_loss": 1.26615512, + "learning_rate": 0.0005079252465375872, + "loss": 1.28913903, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 9.7109375, + "step": 13, + "time_per_iteration": 2.879688262939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02311662, + "balance_loss_mlp": 1.34730673, + "epoch": 0.0026933435936898806, + "flos": 487853259264.0, + "grad_norm": 0.10749127497061137, + "language_loss": 1.14448667, + "learning_rate": 0.0005226005109505393, + "loss": 1.16760325, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 9.625, + "step": 14, + "time_per_iteration": 2.568699836730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02285502, + "balance_loss_mlp": 1.3615818, + "epoch": 0.0028857252789534437, + "flos": 434599644672.0, + "grad_norm": 0.11405493545380829, + "language_loss": 1.20514369, + "learning_rate": 0.0005362628552605367, + "loss": 1.22799873, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 9.21875, + "step": 15, + "time_per_iteration": 2.6814210414886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02243131, + "balance_loss_mlp": 1.36117291, + "epoch": 0.0030781069642170067, + "flos": 596739944448.0, + "grad_norm": 0.10465613456634369, + "language_loss": 1.24307358, + "learning_rate": 0.0005490431248454357, + "loss": 1.26550484, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 8.84375, + "step": 16, + "time_per_iteration": 2.681443929672241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02323403, + "balance_loss_mlp": 1.52994621, + "epoch": 0.0032704886494805694, + "flos": 1538188102656.0, + "grad_norm": 0.2929644268686402, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.78028512, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 7.90625, + "step": 17, + "time_per_iteration": 6.376815319061279 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02154669, + "balance_loss_mlp": 1.37418151, + "epoch": 0.0034628703347441324, + "flos": 473969677824.0, + "grad_norm": 0.15081794947454089, + "language_loss": 1.11159086, + "learning_rate": 0.0005723671632907488, + "loss": 1.13313746, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 7.80078125, + "step": 18, + "time_per_iteration": 2.721731424331665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02067628, + "balance_loss_mlp": 1.35466075, + "epoch": 0.0036552520200076955, + "flos": 448537554432.0, + "grad_norm": 0.11430094844987627, + "language_loss": 1.15730095, + "learning_rate": 0.0005830738490244919, + "loss": 1.1779772, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 7.12890625, + "step": 19, + "time_per_iteration": 2.691012382507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01966178, + "balance_loss_mlp": 1.31958628, + "epoch": 0.003847633705271258, + "flos": 636174217728.0, + "grad_norm": 0.10166759343386816, + "language_loss": 1.17760253, + "learning_rate": 0.0005932312266435596, + "loss": 1.19726431, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 6.46484375, + "step": 20, + "time_per_iteration": 2.779218912124634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01836812, + "balance_loss_mlp": 1.26727819, + "epoch": 0.004040015390534821, + "flos": 589495491072.0, + "grad_norm": 0.12846528828878043, + "language_loss": 1.12106359, + "learning_rate": 0.0006028929207788754, + "loss": 1.13943172, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 5.70703125, + "step": 21, + "time_per_iteration": 2.716970443725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01720951, + "balance_loss_mlp": 1.21970022, + "epoch": 0.004232397075798384, + "flos": 756574940160.0, + "grad_norm": 0.09445288880840001, + "language_loss": 1.16516471, + "learning_rate": 0.0006121050677327902, + "loss": 1.18237424, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 5.0078125, + "step": 22, + "time_per_iteration": 2.92696475982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01630624, + "balance_loss_mlp": 1.19193399, + "epoch": 0.004424778761061947, + "flos": 526692119040.0, + "grad_norm": 0.11621712848760359, + "language_loss": 1.06380248, + "learning_rate": 0.0006209076479463684, + "loss": 1.08010876, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 4.3984375, + "step": 23, + "time_per_iteration": 2.666133165359497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01572853, + "balance_loss_mlp": 1.18394423, + "epoch": 0.00461716044632551, + "flos": 548168518656.0, + "grad_norm": 0.10970997088624258, + "language_loss": 1.16519284, + "learning_rate": 0.0006293355346737718, + "loss": 1.18092132, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 3.88476562, + "step": 24, + "time_per_iteration": 2.727487802505493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0152954, + "balance_loss_mlp": 1.18755198, + "epoch": 0.004809542131589073, + "flos": 567584091648.0, + "grad_norm": 0.09735665571869598, + "language_loss": 1.12784922, + "learning_rate": 0.0006374193284416834, + "loss": 1.14314473, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 3.42382812, + "step": 25, + "time_per_iteration": 2.7919249534606934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0148827, + "balance_loss_mlp": 1.19282198, + "epoch": 0.005001923816852636, + "flos": 470629191168.0, + "grad_norm": 0.09233879954989622, + "language_loss": 1.11062908, + "learning_rate": 0.0006451860277489461, + "loss": 1.12551177, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 2.953125, + "step": 26, + "time_per_iteration": 2.581066131591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462989, + "balance_loss_mlp": 1.20988345, + "epoch": 0.005194305502116198, + "flos": 415502701056.0, + "grad_norm": 0.12330238493557526, + "language_loss": 1.19441557, + "learning_rate": 0.0006526595731190848, + "loss": 1.20904553, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 2.52929688, + "step": 27, + "time_per_iteration": 2.49725604057312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01423898, + "balance_loss_mlp": 1.20874906, + "epoch": 0.005386687187379761, + "flos": 628771548672.0, + "grad_norm": 0.09841719698503415, + "language_loss": 1.12322927, + "learning_rate": 0.0006598612921618983, + "loss": 1.13746822, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 2.15625, + "step": 28, + "time_per_iteration": 2.822068929672241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01399446, + "balance_loss_mlp": 1.21443295, + "epoch": 0.005579068872643324, + "flos": 886483201536.0, + "grad_norm": 0.2589331093265968, + "language_loss": 1.06232262, + "learning_rate": 0.0006668102665011454, + "loss": 1.07631707, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 1.84765625, + "step": 29, + "time_per_iteration": 3.2402820587158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01444994, + "balance_loss_mlp": 1.28353739, + "epoch": 0.005771450557906887, + "flos": 547560622080.0, + "grad_norm": 0.1317361033328709, + "language_loss": 1.14859319, + "learning_rate": 0.0006735236364718957, + "loss": 1.16304302, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 1.61425781, + "step": 30, + "time_per_iteration": 2.6861231327056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01333301, + "balance_loss_mlp": 1.20445967, + "epoch": 0.00596383224317045, + "flos": 532026620928.0, + "grad_norm": 0.07039345614882069, + "language_loss": 1.13512135, + "learning_rate": 0.0006800168558381346, + "loss": 1.14845431, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 1.28808594, + "step": 31, + "time_per_iteration": 2.6444640159606934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01254242, + "balance_loss_mlp": 1.153772, + "epoch": 0.0061562139284340135, + "flos": 589082886144.0, + "grad_norm": 0.07602265872136475, + "language_loss": 1.1720531, + "learning_rate": 0.0006863039060567947, + "loss": 1.18459558, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 1.00439453, + "step": 32, + "time_per_iteration": 2.7225399017333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117915, + "balance_loss_mlp": 1.10071015, + "epoch": 0.006348595613697576, + "flos": 618231025152.0, + "grad_norm": 0.062098451262649575, + "language_loss": 1.09530759, + "learning_rate": 0.0006923974775611263, + "loss": 1.10709918, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 0.78417969, + "step": 33, + "time_per_iteration": 2.795565366744995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155392, + "balance_loss_mlp": 1.09416604, + "epoch": 0.006540977298961139, + "flos": 777910376448.0, + "grad_norm": 0.0750568617782567, + "language_loss": 1.06307364, + "learning_rate": 0.0006983091239737814, + "loss": 1.0746274, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 0.61132812, + "step": 34, + "time_per_iteration": 3.0703423023223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138238, + "balance_loss_mlp": 1.0903163, + "epoch": 0.006733358984224702, + "flos": 667143475200.0, + "grad_norm": 0.057198892540160154, + "language_loss": 1.05094206, + "learning_rate": 0.0007040493939600222, + "loss": 1.06232452, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 0.47949219, + "step": 35, + "time_per_iteration": 2.8476996421813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136821, + "balance_loss_mlp": 1.09926963, + "epoch": 0.006925740669488265, + "flos": 564372085248.0, + "grad_norm": 0.07105443011946577, + "language_loss": 1.05056715, + "learning_rate": 0.0007096279445021078, + "loss": 1.06193542, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 0.37548828, + "step": 36, + "time_per_iteration": 2.8306472301483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115252, + "balance_loss_mlp": 1.12274194, + "epoch": 0.007118122354751828, + "flos": 549887947776.0, + "grad_norm": 0.09366404592926651, + "language_loss": 1.11846077, + "learning_rate": 0.0007150536386503726, + "loss": 1.12998605, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 0.29736328, + "step": 37, + "time_per_iteration": 2.875190258026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150569, + "balance_loss_mlp": 1.12677491, + "epoch": 0.007310504040015391, + "flos": 702490973184.0, + "grad_norm": 0.0928332145488954, + "language_loss": 1.04548562, + "learning_rate": 0.0007203346302358509, + "loss": 1.05699134, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 0.23791504, + "step": 38, + "time_per_iteration": 3.0075292587280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128748, + "balance_loss_mlp": 1.10757613, + "epoch": 0.007502885725278953, + "flos": 599316890112.0, + "grad_norm": 0.056043607360260886, + "language_loss": 1.09224963, + "learning_rate": 0.000725478437577282, + "loss": 1.10353708, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 0.21179199, + "step": 39, + "time_per_iteration": 2.78564715385437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111769, + "balance_loss_mlp": 1.09953475, + "epoch": 0.007695267410542516, + "flos": 560285309952.0, + "grad_norm": 0.2122838817863008, + "language_loss": 1.04638147, + "learning_rate": 0.0007304920078549186, + "loss": 1.0575583, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 0.18151855, + "step": 40, + "time_per_iteration": 2.745100975036621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133734, + "balance_loss_mlp": 1.11621058, + "epoch": 0.007887649095806078, + "flos": 508170765312.0, + "grad_norm": 0.14528393981530327, + "language_loss": 1.06509054, + "learning_rate": 0.0007353817735343603, + "loss": 1.07642794, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 0.17529297, + "step": 41, + "time_per_iteration": 2.7425575256347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119894, + "balance_loss_mlp": 1.10357416, + "epoch": 0.008080030781069641, + "flos": 503893840896.0, + "grad_norm": 0.06769616325508275, + "language_loss": 1.0188365, + "learning_rate": 0.0007401537019902344, + "loss": 1.03003538, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 0.16308594, + "step": 42, + "time_per_iteration": 2.6797902584075928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118919, + "balance_loss_mlp": 1.10271883, + "epoch": 0.008272412466333205, + "flos": 518031811584.0, + "grad_norm": 0.14916902722339276, + "language_loss": 1.05194306, + "learning_rate": 0.0007448133392900729, + "loss": 1.06313229, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 0.1619873, + "step": 43, + "time_per_iteration": 2.779276132583618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116153, + "balance_loss_mlp": 1.09945166, + "epoch": 0.008464794151596768, + "flos": 607974626304.0, + "grad_norm": 0.052417895665492535, + "language_loss": 1.00651026, + "learning_rate": 0.0007493658489441491, + "loss": 1.0176717, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 0.16711426, + "step": 44, + "time_per_iteration": 2.965435028076172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108906, + "balance_loss_mlp": 1.09195447, + "epoch": 0.00865717583686033, + "flos": 537929372160.0, + "grad_norm": 0.04248825884697869, + "language_loss": 1.04600978, + "learning_rate": 0.0007538160463002316, + "loss": 1.05709875, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 0.16967773, + "step": 45, + "time_per_iteration": 2.7024173736572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105316, + "balance_loss_mlp": 1.08735132, + "epoch": 0.008849557522123894, + "flos": 508007780352.0, + "grad_norm": 0.08538228051147774, + "language_loss": 1.08093452, + "learning_rate": 0.0007581684291577274, + "loss": 1.09198785, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 0.17980957, + "step": 46, + "time_per_iteration": 2.6020169258117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105368, + "balance_loss_mlp": 1.08635402, + "epoch": 0.009041939207387457, + "flos": 625339657728.0, + "grad_norm": 0.04723509056908367, + "language_loss": 1.10695386, + "learning_rate": 0.0007624272050891776, + "loss": 1.11800754, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 0.19006348, + "step": 47, + "time_per_iteration": 2.8620407581329346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103189, + "balance_loss_mlp": 1.08244705, + "epoch": 0.00923432089265102, + "flos": 549421014528.0, + "grad_norm": 0.07235265954126073, + "language_loss": 1.00601125, + "learning_rate": 0.0007665963158851307, + "loss": 1.01704311, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 0.20751953, + "step": 48, + "time_per_iteration": 2.8312995433807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114938, + "balance_loss_mlp": 1.09308696, + "epoch": 0.009426702577914583, + "flos": 562496638464.0, + "grad_norm": 0.10505304652404167, + "language_loss": 1.09914839, + "learning_rate": 0.0007706794594783609, + "loss": 1.1102978, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 0.21850586, + "step": 49, + "time_per_iteration": 2.779561758041382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110011, + "balance_loss_mlp": 1.0874207, + "epoch": 0.009619084263178146, + "flos": 616773325824.0, + "grad_norm": 0.04709564792407722, + "language_loss": 1.08694363, + "learning_rate": 0.0007746801096530423, + "loss": 1.09804368, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 0.22583008, + "step": 50, + "time_per_iteration": 2.785332441329956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111628, + "balance_loss_mlp": 1.09285581, + "epoch": 0.009811465948441709, + "flos": 541437986304.0, + "grad_norm": 0.09574874491356838, + "language_loss": 1.13402438, + "learning_rate": 0.0007786015338021173, + "loss": 1.14518726, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 0.23425293, + "step": 51, + "time_per_iteration": 2.676326274871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118129, + "balance_loss_mlp": 1.09500206, + "epoch": 0.010003847633705272, + "flos": 535881028608.0, + "grad_norm": 0.12325193255180054, + "language_loss": 1.06019998, + "learning_rate": 0.0007824468089603051, + "loss": 1.07138121, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 0.23144531, + "step": 52, + "time_per_iteration": 2.688828945159912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113509, + "balance_loss_mlp": 1.11038983, + "epoch": 0.010196229318968833, + "flos": 909254315520.0, + "grad_norm": 0.07208467676878935, + "language_loss": 1.05329835, + "learning_rate": 0.0007862188363098669, + "loss": 1.06464922, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 0.24707031, + "step": 53, + "time_per_iteration": 3.3342933654785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126914, + "balance_loss_mlp": 1.10158229, + "epoch": 0.010388611004232396, + "flos": 585868308480.0, + "grad_norm": 0.09794855088059086, + "language_loss": 1.06043434, + "learning_rate": 0.0007899203543304438, + "loss": 1.07170355, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 0.25354004, + "step": 54, + "time_per_iteration": 2.933236837387085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145083, + "balance_loss_mlp": 1.12053776, + "epoch": 0.01058099268949596, + "flos": 502480558080.0, + "grad_norm": 0.1404118977896248, + "language_loss": 1.20000231, + "learning_rate": 0.0007935539507422731, + "loss": 1.2114532, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 0.24536133, + "step": 55, + "time_per_iteration": 2.8257975578308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153246, + "balance_loss_mlp": 1.12969017, + "epoch": 0.010773374374759523, + "flos": 544447360512.0, + "grad_norm": 0.05382700946372506, + "language_loss": 1.10560298, + "learning_rate": 0.0007971220733732573, + "loss": 1.11713552, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 0.2355957, + "step": 56, + "time_per_iteration": 2.749382495880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154989, + "balance_loss_mlp": 1.13151693, + "epoch": 0.010965756060023086, + "flos": 526155803136.0, + "grad_norm": 0.17392462927294325, + "language_loss": 1.05995011, + "learning_rate": 0.0008006270400641869, + "loss": 1.07150006, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 0.23474121, + "step": 57, + "time_per_iteration": 2.743929147720337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125904, + "balance_loss_mlp": 1.10234821, + "epoch": 0.011158137745286649, + "flos": 576941128704.0, + "grad_norm": 0.10169017538987117, + "language_loss": 1.06833839, + "learning_rate": 0.0008040710477125043, + "loss": 1.07959747, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 0.23547363, + "step": 58, + "time_per_iteration": 2.7300469875335693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111797, + "balance_loss_mlp": 1.08861065, + "epoch": 0.011350519430550212, + "flos": 529281547776.0, + "grad_norm": 0.059941584643697095, + "language_loss": 1.07409072, + "learning_rate": 0.0008074561805429771, + "loss": 1.08520865, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 0.23181152, + "step": 59, + "time_per_iteration": 2.6550745964050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123772, + "balance_loss_mlp": 1.09970331, + "epoch": 0.011542901115813775, + "flos": 555879905280.0, + "grad_norm": 0.06438674129900752, + "language_loss": 1.04891515, + "learning_rate": 0.0008107844176832545, + "loss": 1.06015277, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 0.24072266, + "step": 60, + "time_per_iteration": 2.7009053230285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139619, + "balance_loss_mlp": 1.11569333, + "epoch": 0.011735282801077338, + "flos": 572095954944.0, + "grad_norm": 0.09833112160800331, + "language_loss": 1.0671711, + "learning_rate": 0.0008140576401132568, + "loss": 1.07856739, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 0.23913574, + "step": 61, + "time_per_iteration": 2.678501844406128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114169, + "balance_loss_mlp": 1.11887348, + "epoch": 0.0119276644863409, + "flos": 615589839360.0, + "grad_norm": 0.11014501355567002, + "language_loss": 1.07748628, + "learning_rate": 0.0008172776370494935, + "loss": 1.08890319, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 0.22814941, + "step": 62, + "time_per_iteration": 2.7718141078948975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116479, + "balance_loss_mlp": 1.09356666, + "epoch": 0.012120046171604464, + "flos": 501084527616.0, + "grad_norm": 0.06441650429015075, + "language_loss": 1.15269816, + "learning_rate": 0.0008204461118185703, + "loss": 1.16386294, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 0.22912598, + "step": 63, + "time_per_iteration": 2.5839178562164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117138, + "balance_loss_mlp": 1.09543014, + "epoch": 0.012312427856868027, + "flos": 473347100160.0, + "grad_norm": 0.06608006175674933, + "language_loss": 1.04523873, + "learning_rate": 0.0008235646872681536, + "loss": 1.05641007, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 0.21728516, + "step": 64, + "time_per_iteration": 2.5611703395843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127906, + "balance_loss_mlp": 1.10659182, + "epoch": 0.012504809542131588, + "flos": 538369141248.0, + "grad_norm": 0.07834673611922068, + "language_loss": 1.04319417, + "learning_rate": 0.0008266349107584288, + "loss": 1.05447328, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 0.21313477, + "step": 65, + "time_per_iteration": 2.727666139602661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141841, + "balance_loss_mlp": 1.1207881, + "epoch": 0.012697191227395151, + "flos": 608730826752.0, + "grad_norm": 0.06003338375813584, + "language_loss": 1.07126927, + "learning_rate": 0.0008296582587724851, + "loss": 1.08268762, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 0.21057129, + "step": 66, + "time_per_iteration": 2.716701030731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127113, + "balance_loss_mlp": 1.10609627, + "epoch": 0.012889572912658714, + "flos": 768079065600.0, + "grad_norm": 0.04807876202194694, + "language_loss": 1.04662776, + "learning_rate": 0.0008326361411800136, + "loss": 1.05789876, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 0.21008301, + "step": 67, + "time_per_iteration": 2.9571592807769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114337, + "balance_loss_mlp": 1.09446514, + "epoch": 0.013081954597922277, + "flos": 533887013376.0, + "grad_norm": 0.05551510449528945, + "language_loss": 1.05008268, + "learning_rate": 0.0008355699051851403, + "loss": 1.06122601, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 0.1986084, + "step": 68, + "time_per_iteration": 2.725504159927368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143621, + "balance_loss_mlp": 1.1242373, + "epoch": 0.01327433628318584, + "flos": 573096632832.0, + "grad_norm": 0.0697970629442659, + "language_loss": 1.12296045, + "learning_rate": 0.0008384608389860635, + "loss": 1.13439655, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 0.19372559, + "step": 69, + "time_per_iteration": 2.685215711593628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141098, + "balance_loss_mlp": 1.122311, + "epoch": 0.013466717968449404, + "flos": 497274536448.0, + "grad_norm": 0.08511613263061502, + "language_loss": 1.02745342, + "learning_rate": 0.000841310175171381, + "loss": 1.03886437, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 0.18774414, + "step": 70, + "time_per_iteration": 2.649937868118286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142561, + "balance_loss_mlp": 1.12464356, + "epoch": 0.013659099653712967, + "flos": 565511155200.0, + "grad_norm": 0.055787325190813475, + "language_loss": 1.0065217, + "learning_rate": 0.000844119093875517, + "loss": 1.0179472, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 0.17944336, + "step": 71, + "time_per_iteration": 2.753220319747925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152267, + "balance_loss_mlp": 1.13508892, + "epoch": 0.01385148133897653, + "flos": 573820526592.0, + "grad_norm": 0.08668312915327946, + "language_loss": 1.05463254, + "learning_rate": 0.0008468887257134666, + "loss": 1.0661552, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 0.17199707, + "step": 72, + "time_per_iteration": 2.7056305408477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117134, + "balance_loss_mlp": 1.15478206, + "epoch": 0.014043863024240093, + "flos": 576822560256.0, + "grad_norm": 0.07356095482564125, + "language_loss": 1.08388793, + "learning_rate": 0.0008496201545131264, + "loss": 1.09560132, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 0.16564941, + "step": 73, + "time_per_iteration": 2.7202537059783936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152437, + "balance_loss_mlp": 1.13545001, + "epoch": 0.014236244709503656, + "flos": 938681809920.0, + "grad_norm": 0.06787935984484554, + "language_loss": 1.06090975, + "learning_rate": 0.0008523144198617317, + "loss": 1.07243395, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 0.16992188, + "step": 74, + "time_per_iteration": 3.2090003490448 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139286, + "balance_loss_mlp": 1.1223346, + "epoch": 0.014428626394767219, + "flos": 528483502080.0, + "grad_norm": 0.04825332815792917, + "language_loss": 1.053195, + "learning_rate": 0.0008549725194813783, + "loss": 1.06458783, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 0.16967773, + "step": 75, + "time_per_iteration": 2.654343605041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112208, + "balance_loss_mlp": 1.10599899, + "epoch": 0.014621008080030782, + "flos": 803752533504.0, + "grad_norm": 0.03887402020767282, + "language_loss": 1.04797029, + "learning_rate": 0.0008575954114472099, + "loss": 1.05919111, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 0.1607666, + "step": 76, + "time_per_iteration": 3.119884967803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134479, + "balance_loss_mlp": 1.1187191, + "epoch": 0.014813389765294343, + "flos": 696941356032.0, + "grad_norm": 0.056937643991546806, + "language_loss": 1.02038705, + "learning_rate": 0.0008601840162606118, + "loss": 1.03173184, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 0.1574707, + "step": 77, + "time_per_iteration": 3.025688886642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146725, + "balance_loss_mlp": 1.13034582, + "epoch": 0.015005771450557906, + "flos": 596994333696.0, + "grad_norm": 0.04989291514363055, + "language_loss": 1.08127129, + "learning_rate": 0.000862739218788641, + "loss": 1.09273863, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 0.16381836, + "step": 78, + "time_per_iteration": 2.7922520637512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149681, + "balance_loss_mlp": 1.13339734, + "epoch": 0.01519815313582147, + "flos": 549416245248.0, + "grad_norm": 0.06709094188277621, + "language_loss": 1.06189477, + "learning_rate": 0.0008652618700799138, + "loss": 1.07339156, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 0.1628418, + "step": 79, + "time_per_iteration": 2.6902618408203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153255, + "balance_loss_mlp": 1.1367681, + "epoch": 0.015390534821085032, + "flos": 430532692992.0, + "grad_norm": 0.062162504049989416, + "language_loss": 1.05161238, + "learning_rate": 0.0008677527890662774, + "loss": 1.06314492, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 0.16491699, + "step": 80, + "time_per_iteration": 2.475771188735962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147193, + "balance_loss_mlp": 1.13076603, + "epoch": 0.015582916506348595, + "flos": 524119942656.0, + "grad_norm": 0.04934081686369646, + "language_loss": 1.06529951, + "learning_rate": 0.0008702127641587799, + "loss": 1.0767715, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 0.16430664, + "step": 81, + "time_per_iteration": 2.634038209915161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142434, + "balance_loss_mlp": 1.12558985, + "epoch": 0.015775298191612157, + "flos": 575443782144.0, + "grad_norm": 0.08879987127008451, + "language_loss": 1.0221808, + "learning_rate": 0.0008726425547457192, + "loss": 1.0336051, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 0.16845703, + "step": 82, + "time_per_iteration": 2.74308705329895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147568, + "balance_loss_mlp": 1.13108134, + "epoch": 0.01596767987687572, + "flos": 610319577600.0, + "grad_norm": 0.06313420095488197, + "language_loss": 1.01906681, + "learning_rate": 0.0008750428925998964, + "loss": 1.03054249, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 0.16491699, + "step": 83, + "time_per_iteration": 2.777132511138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146759, + "balance_loss_mlp": 1.13009322, + "epoch": 0.016160061562139283, + "flos": 567136982016.0, + "grad_norm": 0.11663644047392754, + "language_loss": 1.07169831, + "learning_rate": 0.0008774144832015932, + "loss": 1.08316588, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 0.16674805, + "step": 84, + "time_per_iteration": 2.733287811279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01524523, + "balance_loss_mlp": 1.51412809, + "epoch": 0.016352443247402846, + "flos": 1411343543808.0, + "grad_norm": 0.22860236459315994, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.76298833, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 0.10400391, + "step": 85, + "time_per_iteration": 4.57580041885376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166169, + "balance_loss_mlp": 1.1501826, + "epoch": 0.01654482493266641, + "flos": 730497844224.0, + "grad_norm": 0.05249425037579876, + "language_loss": 1.01959693, + "learning_rate": 0.0008820741205014318, + "loss": 1.03125858, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 0.15979004, + "step": 86, + "time_per_iteration": 2.8773865699768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223619, + "balance_loss_mlp": 1.20703709, + "epoch": 0.016737206617929972, + "flos": 536293633536.0, + "grad_norm": 0.10761462625124436, + "language_loss": 1.03955913, + "learning_rate": 0.0008843634575408404, + "loss": 1.05179524, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 0.16577148, + "step": 87, + "time_per_iteration": 2.6694159507751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228231, + "balance_loss_mlp": 1.21267366, + "epoch": 0.016929588303193535, + "flos": 536990363136.0, + "grad_norm": 0.10737104518045529, + "language_loss": 1.05078888, + "learning_rate": 0.0008866266301555082, + "loss": 1.06307125, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 0.15551758, + "step": 88, + "time_per_iteration": 2.7686069011688232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212138, + "balance_loss_mlp": 1.19609249, + "epoch": 0.017121969988457098, + "flos": 526756359168.0, + "grad_norm": 0.1616084590878673, + "language_loss": 1.0609467, + "learning_rate": 0.0008888642296509615, + "loss": 1.07306814, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 0.16040039, + "step": 89, + "time_per_iteration": 2.625988721847534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199649, + "balance_loss_mlp": 1.18316197, + "epoch": 0.01731435167372066, + "flos": 625596618240.0, + "grad_norm": 0.07585409016808545, + "language_loss": 1.1065979, + "learning_rate": 0.0008910768275115906, + "loss": 1.11859453, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 0.16491699, + "step": 90, + "time_per_iteration": 2.793017864227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173524, + "balance_loss_mlp": 1.15697813, + "epoch": 0.017506733358984224, + "flos": 496402338816.0, + "grad_norm": 0.07277460951060387, + "language_loss": 1.06493175, + "learning_rate": 0.0008932649762767675, + "loss": 1.07666695, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 0.16552734, + "step": 91, + "time_per_iteration": 2.5919723510742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169355, + "balance_loss_mlp": 1.15323818, + "epoch": 0.017699115044247787, + "flos": 745933100544.0, + "grad_norm": 0.10172519854243242, + "language_loss": 1.09112859, + "learning_rate": 0.0008954292103690864, + "loss": 1.10282218, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 0.16113281, + "step": 92, + "time_per_iteration": 2.9366836547851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174542, + "balance_loss_mlp": 1.15828145, + "epoch": 0.01789149672951135, + "flos": 515509194240.0, + "grad_norm": 0.07803491111319032, + "language_loss": 1.10981905, + "learning_rate": 0.0008975700468778296, + "loss": 1.12156439, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 0.16259766, + "step": 93, + "time_per_iteration": 2.592458963394165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156862, + "balance_loss_mlp": 1.14067388, + "epoch": 0.018083878414774913, + "flos": 586125268992.0, + "grad_norm": 0.09102852745954727, + "language_loss": 1.04703569, + "learning_rate": 0.0008996879863005366, + "loss": 1.05860424, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 0.16186523, + "step": 94, + "time_per_iteration": 2.71566104888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148536, + "balance_loss_mlp": 1.13235974, + "epoch": 0.018276260100038477, + "flos": 497356028928.0, + "grad_norm": 0.03859462796979438, + "language_loss": 1.04768109, + "learning_rate": 0.0009017835132453337, + "loss": 1.05916631, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 0.16174316, + "step": 95, + "time_per_iteration": 2.664511203765869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137887, + "balance_loss_mlp": 1.121889, + "epoch": 0.01846864178530204, + "flos": 640058360832.0, + "grad_norm": 0.060963703759419355, + "language_loss": 1.04675508, + "learning_rate": 0.0009038570970964896, + "loss": 1.05813384, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 0.15991211, + "step": 96, + "time_per_iteration": 2.7669789791107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112506, + "balance_loss_mlp": 1.10899043, + "epoch": 0.018661023470565603, + "flos": 511662127104.0, + "grad_norm": 0.0943042692373462, + "language_loss": 1.02071011, + "learning_rate": 0.0009059091926454854, + "loss": 1.03196073, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 0.16064453, + "step": 97, + "time_per_iteration": 2.6028668880462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126665, + "balance_loss_mlp": 1.11052442, + "epoch": 0.018853405155829166, + "flos": 931106244096.0, + "grad_norm": 0.06745462513624549, + "language_loss": 1.0144124, + "learning_rate": 0.0009079402406897198, + "loss": 1.02567911, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 0.16137695, + "step": 98, + "time_per_iteration": 3.2679431438446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127975, + "balance_loss_mlp": 1.11166739, + "epoch": 0.01904578684109273, + "flos": 576484107264.0, + "grad_norm": 0.10523687850003575, + "language_loss": 1.03251696, + "learning_rate": 0.0009099506686008212, + "loss": 1.04379678, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 0.16308594, + "step": 99, + "time_per_iteration": 2.8251914978027344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116643, + "balance_loss_mlp": 1.10100293, + "epoch": 0.019238168526356292, + "flos": 558442169856.0, + "grad_norm": 0.08495157768411668, + "language_loss": 1.0609076, + "learning_rate": 0.0009119408908644013, + "loss": 1.07207406, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 0.15625, + "step": 100, + "time_per_iteration": 2.6573309898376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113727, + "balance_loss_mlp": 1.12211871, + "epoch": 0.019430550211619855, + "flos": 723851375616.0, + "grad_norm": 0.09022378013673595, + "language_loss": 1.11755276, + "learning_rate": 0.0009139113095929519, + "loss": 1.12892556, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 0.15124512, + "step": 101, + "time_per_iteration": 2.844698429107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159661, + "balance_loss_mlp": 1.14373517, + "epoch": 0.019622931896883418, + "flos": 499478524416.0, + "grad_norm": 0.0892612752622512, + "language_loss": 1.05698013, + "learning_rate": 0.0009158623150134762, + "loss": 1.06857681, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 0.15917969, + "step": 102, + "time_per_iteration": 2.589857339859009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137313, + "balance_loss_mlp": 1.12158906, + "epoch": 0.01981531358214698, + "flos": 509188695552.0, + "grad_norm": 0.06508497546963277, + "language_loss": 1.05496848, + "learning_rate": 0.000917794285931332, + "loss": 1.06634164, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 0.15710449, + "step": 103, + "time_per_iteration": 2.6433918476104736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117696, + "balance_loss_mlp": 1.1019367, + "epoch": 0.020007695267410544, + "flos": 521347705344.0, + "grad_norm": 0.07675487095909958, + "language_loss": 0.97610366, + "learning_rate": 0.0009197075901716639, + "loss": 0.98728061, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 0.1574707, + "step": 104, + "time_per_iteration": 2.709157943725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137693, + "balance_loss_mlp": 1.12159956, + "epoch": 0.020200076952674107, + "flos": 533298940416.0, + "grad_norm": 0.05257934075389246, + "language_loss": 1.0758431, + "learning_rate": 0.0009216025849997171, + "loss": 1.08722019, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 0.16088867, + "step": 105, + "time_per_iteration": 2.7638583183288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111903, + "balance_loss_mlp": 1.09596467, + "epoch": 0.020392458637937667, + "flos": 684760324608.0, + "grad_norm": 0.07457888312135433, + "language_loss": 1.02261579, + "learning_rate": 0.0009234796175212258, + "loss": 1.03373492, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 0.15930176, + "step": 106, + "time_per_iteration": 2.9391980171203613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117989, + "balance_loss_mlp": 1.10228872, + "epoch": 0.02058484032320123, + "flos": 702115444224.0, + "grad_norm": 0.06024423434996524, + "language_loss": 1.05948544, + "learning_rate": 0.000925339025064007, + "loss": 1.07066536, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 0.15686035, + "step": 107, + "time_per_iteration": 2.975294828414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118819, + "balance_loss_mlp": 1.10334611, + "epoch": 0.020777222008464793, + "flos": 639082275840.0, + "grad_norm": 0.07105297051955457, + "language_loss": 0.99294066, + "learning_rate": 0.0009271811355418027, + "loss": 1.00412893, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 0.15454102, + "step": 108, + "time_per_iteration": 2.8750014305114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125269, + "balance_loss_mlp": 1.10940242, + "epoch": 0.020969603693728356, + "flos": 682091974656.0, + "grad_norm": 0.09212378946406244, + "language_loss": 1.05636311, + "learning_rate": 0.0009290062678013548, + "loss": 1.06761575, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 0.15856934, + "step": 109, + "time_per_iteration": 2.8552017211914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119898, + "balance_loss_mlp": 1.10393572, + "epoch": 0.02116198537899192, + "flos": 533395487232.0, + "grad_norm": 0.059465971869905314, + "language_loss": 1.04477715, + "learning_rate": 0.0009308147319536321, + "loss": 1.05597615, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 0.1595459, + "step": 110, + "time_per_iteration": 2.6493232250213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129754, + "balance_loss_mlp": 1.11385095, + "epoch": 0.021354367064255482, + "flos": 717479119872.0, + "grad_norm": 0.08324280754141193, + "language_loss": 1.10257316, + "learning_rate": 0.0009326068296900676, + "loss": 1.11387074, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 0.15893555, + "step": 111, + "time_per_iteration": 2.8384125232696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112769, + "balance_loss_mlp": 1.11171615, + "epoch": 0.021546748749519045, + "flos": 519556322304.0, + "grad_norm": 0.06941460102767082, + "language_loss": 1.01355243, + "learning_rate": 0.0009343828545846161, + "loss": 1.02482939, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 0.15966797, + "step": 112, + "time_per_iteration": 2.7743477821350098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114893, + "balance_loss_mlp": 1.13326573, + "epoch": 0.021739130434782608, + "flos": 505161391104.0, + "grad_norm": 0.047977415311889204, + "language_loss": 1.05199587, + "learning_rate": 0.0009361430923823841, + "loss": 1.06348515, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 0.15649414, + "step": 113, + "time_per_iteration": 2.6022982597351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118143, + "balance_loss_mlp": 1.10308659, + "epoch": 0.02193151212004617, + "flos": 463486053888.0, + "grad_norm": 0.080001842017843, + "language_loss": 1.09258401, + "learning_rate": 0.0009378878212755459, + "loss": 1.10376549, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 0.15039062, + "step": 114, + "time_per_iteration": 2.491594076156616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115419, + "balance_loss_mlp": 1.09967113, + "epoch": 0.022123893805309734, + "flos": 552272546304.0, + "grad_norm": 0.05036418666557463, + "language_loss": 0.9906168, + "learning_rate": 0.0009396173121672103, + "loss": 1.00177097, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 0.15734863, + "step": 115, + "time_per_iteration": 2.668848991394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112479, + "balance_loss_mlp": 1.10945916, + "epoch": 0.022316275490573297, + "flos": 636211293696.0, + "grad_norm": 0.05918191636932359, + "language_loss": 1.04414749, + "learning_rate": 0.0009413318289238633, + "loss": 1.05539548, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 0.15307617, + "step": 116, + "time_per_iteration": 2.7496132850646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106823, + "balance_loss_mlp": 1.09139705, + "epoch": 0.02250865717583686, + "flos": 798890107392.0, + "grad_norm": 0.1124204963758038, + "language_loss": 0.96924931, + "learning_rate": 0.0009430316286169771, + "loss": 0.98031747, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 0.15405273, + "step": 117, + "time_per_iteration": 3.026118278503418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135302, + "balance_loss_mlp": 1.11998308, + "epoch": 0.022701038861100423, + "flos": 456093296640.0, + "grad_norm": 0.03693994945601898, + "language_loss": 1.02417183, + "learning_rate": 0.0009447169617543361, + "loss": 1.03552485, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 0.15307617, + "step": 118, + "time_per_iteration": 2.575666666030884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156925, + "balance_loss_mlp": 1.14185703, + "epoch": 0.022893420546363986, + "flos": 583086159360.0, + "grad_norm": 0.10959367855453626, + "language_loss": 1.09001684, + "learning_rate": 0.0009463880725016029, + "loss": 1.1015861, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 0.15039062, + "step": 119, + "time_per_iteration": 2.6811347007751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115422, + "balance_loss_mlp": 1.10052109, + "epoch": 0.02308580223162755, + "flos": 561303240192.0, + "grad_norm": 0.05068852434870314, + "language_loss": 1.03909945, + "learning_rate": 0.0009480451988946134, + "loss": 1.05025363, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 0.14880371, + "step": 120, + "time_per_iteration": 2.801814079284668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106982, + "balance_loss_mlp": 1.09179425, + "epoch": 0.023278183916891113, + "flos": 771300983808.0, + "grad_norm": 0.05688398470992871, + "language_loss": 1.05377555, + "learning_rate": 0.0009496885730428627, + "loss": 1.06484532, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 0.1517334, + "step": 121, + "time_per_iteration": 3.04720139503479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121469, + "balance_loss_mlp": 1.10574555, + "epoch": 0.023470565602154676, + "flos": 553374540288.0, + "grad_norm": 0.08369646841136469, + "language_loss": 1.03908122, + "learning_rate": 0.0009513184213246156, + "loss": 1.05029583, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 0.15710449, + "step": 122, + "time_per_iteration": 2.61790132522583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129626, + "balance_loss_mlp": 1.11406958, + "epoch": 0.02366294728741824, + "flos": 560028349440.0, + "grad_norm": 0.05522871343558165, + "language_loss": 1.07008672, + "learning_rate": 0.0009529349645740552, + "loss": 1.08138299, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 0.15539551, + "step": 123, + "time_per_iteration": 2.69759464263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129797, + "balance_loss_mlp": 1.11481285, + "epoch": 0.0238553289726818, + "flos": 468553683456.0, + "grad_norm": 0.053769267634074955, + "language_loss": 1.05687594, + "learning_rate": 0.0009545384182608524, + "loss": 1.06817389, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 0.1496582, + "step": 124, + "time_per_iteration": 2.550584316253662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126, + "balance_loss_mlp": 1.11114669, + "epoch": 0.024047710657945365, + "flos": 560030920704.0, + "grad_norm": 0.08700167249890467, + "language_loss": 1.02945745, + "learning_rate": 0.0009561289926625252, + "loss": 1.04071736, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 0.14831543, + "step": 125, + "time_per_iteration": 2.6619794368743896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123522, + "balance_loss_mlp": 1.10831082, + "epoch": 0.024240092343208928, + "flos": 504775950336.0, + "grad_norm": 0.07114777459455598, + "language_loss": 1.07932711, + "learning_rate": 0.0009577068930299292, + "loss": 1.09056234, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 0.15209961, + "step": 126, + "time_per_iteration": 2.553642749786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125226, + "balance_loss_mlp": 1.11038458, + "epoch": 0.02443247402847249, + "flos": 435763307520.0, + "grad_norm": 0.08279894264625885, + "language_loss": 1.03556633, + "learning_rate": 0.0009592723197462087, + "loss": 1.04681861, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 0.14819336, + "step": 127, + "time_per_iteration": 2.7255966663360596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124545, + "balance_loss_mlp": 1.10936916, + "epoch": 0.024624855713736054, + "flos": 683769558528.0, + "grad_norm": 0.07600858050716931, + "language_loss": 0.99905002, + "learning_rate": 0.0009608254684795125, + "loss": 1.01029539, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 0.15148926, + "step": 128, + "time_per_iteration": 2.9839587211608887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113247, + "balance_loss_mlp": 1.11718702, + "epoch": 0.024817237398999614, + "flos": 524999480832.0, + "grad_norm": 0.08573045125619827, + "language_loss": 1.02976727, + "learning_rate": 0.0009623665303297678, + "loss": 1.04109192, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 0.15258789, + "step": 129, + "time_per_iteration": 2.7344865798950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119398, + "balance_loss_mlp": 1.10497391, + "epoch": 0.025009619084263177, + "flos": 655656602112.0, + "grad_norm": 0.07510500588649292, + "language_loss": 1.07057762, + "learning_rate": 0.0009638956919697878, + "loss": 1.08177161, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 0.14416504, + "step": 130, + "time_per_iteration": 2.864952802658081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104135, + "balance_loss_mlp": 1.08930528, + "epoch": 0.02520200076952674, + "flos": 454423053312.0, + "grad_norm": 0.0567118244953117, + "language_loss": 0.99135083, + "learning_rate": 0.0009654131357809714, + "loss": 1.00239229, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 0.14819336, + "step": 131, + "time_per_iteration": 2.6095099449157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123713, + "balance_loss_mlp": 1.1081202, + "epoch": 0.025394382454790303, + "flos": 839794563072.0, + "grad_norm": 0.05892082702998288, + "language_loss": 1.08188879, + "learning_rate": 0.0009669190399838441, + "loss": 1.09312594, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 0.15576172, + "step": 132, + "time_per_iteration": 3.096733331680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100803, + "balance_loss_mlp": 1.08531809, + "epoch": 0.025586764140053866, + "flos": 581025332736.0, + "grad_norm": 0.09564892115109941, + "language_loss": 1.01233923, + "learning_rate": 0.0009684135787636724, + "loss": 1.02334726, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 0.15478516, + "step": 133, + "time_per_iteration": 2.8120856285095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111325, + "balance_loss_mlp": 1.09529161, + "epoch": 0.02577914582531743, + "flos": 790249623552.0, + "grad_norm": 0.04870542745948935, + "language_loss": 1.05797207, + "learning_rate": 0.0009698969223913726, + "loss": 1.06908536, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 0.16027832, + "step": 134, + "time_per_iteration": 3.0269176959991455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123378, + "balance_loss_mlp": 1.10735679, + "epoch": 0.025971527510580992, + "flos": 594958473216.0, + "grad_norm": 0.04083122637660085, + "language_loss": 1.08225274, + "learning_rate": 0.0009713692373399265, + "loss": 1.09348655, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 0.16015625, + "step": 135, + "time_per_iteration": 2.690932273864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01798361, + "balance_loss_mlp": 1.75773478, + "epoch": 0.026163909195844555, + "flos": 1577629716480.0, + "grad_norm": 0.2058674005568875, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.8125459, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 0.40625, + "step": 136, + "time_per_iteration": 5.460411548614502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01507549, + "balance_loss_mlp": 1.47512448, + "epoch": 0.026356290881108118, + "flos": 1502074865664.0, + "grad_norm": 0.12866590611947104, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.79318589, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 0.32421875, + "step": 137, + "time_per_iteration": 4.989046335220337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146765, + "balance_loss_mlp": 1.13081443, + "epoch": 0.02654867256637168, + "flos": 597140066304.0, + "grad_norm": 0.04917093034878699, + "language_loss": 1.00934815, + "learning_rate": 0.0009757216201974225, + "loss": 1.02081585, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 0.1595459, + "step": 138, + "time_per_iteration": 2.9566736221313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162305, + "balance_loss_mlp": 1.1448524, + "epoch": 0.026741054251635244, + "flos": 545035433472.0, + "grad_norm": 0.06281235859244827, + "language_loss": 1.0596863, + "learning_rate": 0.0009771514130396581, + "loss": 1.07130933, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 0.17468262, + "step": 139, + "time_per_iteration": 2.683931350708008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150087, + "balance_loss_mlp": 1.1330874, + "epoch": 0.026933435936898807, + "flos": 506841546240.0, + "grad_norm": 0.09254080332591261, + "language_loss": 1.06202602, + "learning_rate": 0.00097857095638274, + "loss": 1.07352686, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 0.17016602, + "step": 140, + "time_per_iteration": 2.558708906173706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149417, + "balance_loss_mlp": 1.13241768, + "epoch": 0.02712581762216237, + "flos": 740860328448.0, + "grad_norm": 0.03864103733020509, + "language_loss": 0.97399604, + "learning_rate": 0.0009799803961288726, + "loss": 0.9854902, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 0.17016602, + "step": 141, + "time_per_iteration": 2.992034673690796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112347, + "balance_loss_mlp": 1.10685217, + "epoch": 0.027318199307425933, + "flos": 848373378048.0, + "grad_norm": 0.06378420241673269, + "language_loss": 1.03629804, + "learning_rate": 0.000981379875086876, + "loss": 1.0475328, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 0.16625977, + "step": 142, + "time_per_iteration": 3.063534736633301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121821, + "balance_loss_mlp": 1.10560894, + "epoch": 0.027510580992689496, + "flos": 575557581312.0, + "grad_norm": 0.046520134554953796, + "language_loss": 0.98784387, + "learning_rate": 0.0009827695330590185, + "loss": 0.99906206, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 0.1619873, + "step": 143, + "time_per_iteration": 2.6495330333709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124398, + "balance_loss_mlp": 1.1078757, + "epoch": 0.02770296267795306, + "flos": 772420230144.0, + "grad_norm": 0.05485832849515215, + "language_loss": 0.98036379, + "learning_rate": 0.0009841495069248256, + "loss": 0.99160779, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 0.1652832, + "step": 144, + "time_per_iteration": 2.9577834606170654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145392, + "balance_loss_mlp": 1.12901306, + "epoch": 0.027895344363216622, + "flos": 569387957760.0, + "grad_norm": 0.09798795242100523, + "language_loss": 0.97478735, + "learning_rate": 0.0009855199307219871, + "loss": 0.98624128, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 0.16381836, + "step": 145, + "time_per_iteration": 2.6759142875671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148365, + "balance_loss_mlp": 1.13168764, + "epoch": 0.028087726048480186, + "flos": 547360561152.0, + "grad_norm": 0.1254453322996171, + "language_loss": 0.99733889, + "learning_rate": 0.0009868809357244854, + "loss": 1.00882256, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 0.16687012, + "step": 146, + "time_per_iteration": 2.66375994682312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113683, + "balance_loss_mlp": 1.11978364, + "epoch": 0.02828010773374375, + "flos": 524789508096.0, + "grad_norm": 0.08248071954181796, + "language_loss": 1.03600287, + "learning_rate": 0.0009882326505180556, + "loss": 1.04737115, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 0.1706543, + "step": 147, + "time_per_iteration": 2.719353437423706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151222, + "balance_loss_mlp": 1.13280392, + "epoch": 0.02847248941900731, + "flos": 772440053760.0, + "grad_norm": 0.12761243433758393, + "language_loss": 1.02101135, + "learning_rate": 0.0009895752010730906, + "loss": 1.03252351, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 0.1842041, + "step": 148, + "time_per_iteration": 2.9704201221466064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141454, + "balance_loss_mlp": 1.12377512, + "epoch": 0.028664871104270875, + "flos": 534413417472.0, + "grad_norm": 0.07962775403881484, + "language_loss": 1.0825479, + "learning_rate": 0.0009909087108150867, + "loss": 1.09396255, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 0.17687988, + "step": 149, + "time_per_iteration": 2.7516071796417236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151554, + "balance_loss_mlp": 1.13330352, + "epoch": 0.028857252789534438, + "flos": 367766396928.0, + "grad_norm": 0.10196194967952074, + "language_loss": 1.09083438, + "learning_rate": 0.0009922333006927371, + "loss": 1.10235, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 0.18249512, + "step": 150, + "time_per_iteration": 2.4685099124908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170515, + "balance_loss_mlp": 1.15218103, + "epoch": 0.029049634474798, + "flos": 515482030080.0, + "grad_norm": 0.13259475383105176, + "language_loss": 1.020684, + "learning_rate": 0.0009935490892437632, + "loss": 1.03238916, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 0.18322754, + "step": 151, + "time_per_iteration": 2.5665087699890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166904, + "balance_loss_mlp": 1.14880824, + "epoch": 0.029242016160061564, + "flos": 588141305856.0, + "grad_norm": 0.10481585745820837, + "language_loss": 1.00390673, + "learning_rate": 0.0009948561926585687, + "loss": 1.01557577, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 0.18103027, + "step": 152, + "time_per_iteration": 2.7641003131866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139325, + "balance_loss_mlp": 1.122576, + "epoch": 0.029434397845325123, + "flos": 552079825920.0, + "grad_norm": 0.09697971136145118, + "language_loss": 1.05073512, + "learning_rate": 0.0009961547248418122, + "loss": 1.06212831, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 0.16760254, + "step": 153, + "time_per_iteration": 2.631476402282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123418, + "balance_loss_mlp": 1.10662186, + "epoch": 0.029626779530588686, + "flos": 603497640960.0, + "grad_norm": 0.05437877185758658, + "language_loss": 1.01441622, + "learning_rate": 0.0009974447974719707, + "loss": 1.0256505, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 0.16809082, + "step": 154, + "time_per_iteration": 2.709644317626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129323, + "balance_loss_mlp": 1.11151338, + "epoch": 0.02981916121585225, + "flos": 621089897472.0, + "grad_norm": 0.09703401576709127, + "language_loss": 1.03478801, + "learning_rate": 0.0009987265200589763, + "loss": 1.0460813, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 0.17810059, + "step": 155, + "time_per_iteration": 2.77809739112854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140894, + "balance_loss_mlp": 1.12376344, + "epoch": 0.030011542901115813, + "flos": 661633505280.0, + "grad_norm": 0.08300490544518559, + "language_loss": 1.02959824, + "learning_rate": 0.001, + "loss": 1.04100728, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 0.17150879, + "step": 156, + "time_per_iteration": 2.845790386199951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144802, + "balance_loss_mlp": 1.12720668, + "epoch": 0.030203924586379376, + "flos": 651569826816.0, + "grad_norm": 0.07590676388764007, + "language_loss": 1.00599122, + "learning_rate": 0.0009999999029413921, + "loss": 1.01743913, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 0.17614746, + "step": 157, + "time_per_iteration": 2.833735227584839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142594, + "balance_loss_mlp": 1.12554669, + "epoch": 0.03039630627164294, + "flos": 531354484224.0, + "grad_norm": 0.06607639809804342, + "language_loss": 1.01453137, + "learning_rate": 0.0009999996117656068, + "loss": 1.02595735, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 0.1706543, + "step": 158, + "time_per_iteration": 2.803636074066162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011301, + "balance_loss_mlp": 1.11345792, + "epoch": 0.030588687956906502, + "flos": 586189509120.0, + "grad_norm": 0.08769352458743468, + "language_loss": 0.94982773, + "learning_rate": 0.0009999991264727564, + "loss": 0.96112871, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 0.16638184, + "step": 159, + "time_per_iteration": 2.7776851654052734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135859, + "balance_loss_mlp": 1.11870432, + "epoch": 0.030781069642170065, + "flos": 513278042112.0, + "grad_norm": 0.05788098803643346, + "language_loss": 1.06247735, + "learning_rate": 0.0009999984470630296, + "loss": 1.07383585, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 0.17163086, + "step": 160, + "time_per_iteration": 2.6311371326446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125321, + "balance_loss_mlp": 1.10836911, + "epoch": 0.030973451327433628, + "flos": 718123719168.0, + "grad_norm": 0.05159431076001957, + "language_loss": 0.94850963, + "learning_rate": 0.0009999975735366902, + "loss": 0.95976287, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 0.16955566, + "step": 161, + "time_per_iteration": 3.0904829502105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148114, + "balance_loss_mlp": 1.13099504, + "epoch": 0.03116583301269719, + "flos": 1109771311104.0, + "grad_norm": 0.0692270455282635, + "language_loss": 0.96706492, + "learning_rate": 0.0009999965058940775, + "loss": 0.97854608, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 0.17138672, + "step": 162, + "time_per_iteration": 3.490063428878784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150632, + "balance_loss_mlp": 1.13323975, + "epoch": 0.031358214697960754, + "flos": 450907098624.0, + "grad_norm": 0.08572766411177644, + "language_loss": 1.03267431, + "learning_rate": 0.0009999952441356057, + "loss": 1.04418063, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 0.17382812, + "step": 163, + "time_per_iteration": 2.497690439224243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130614, + "balance_loss_mlp": 1.11405563, + "epoch": 0.031550596383224314, + "flos": 1255176870912.0, + "grad_norm": 0.05784293330097489, + "language_loss": 1.03805065, + "learning_rate": 0.000999993788261765, + "loss": 1.0493567, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 0.16564941, + "step": 164, + "time_per_iteration": 3.6041390895843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132475, + "balance_loss_mlp": 1.1152972, + "epoch": 0.03174297806848788, + "flos": 668136812544.0, + "grad_norm": 0.05766532368121917, + "language_loss": 1.05311596, + "learning_rate": 0.00099999213827312, + "loss": 1.06444073, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 0.171875, + "step": 165, + "time_per_iteration": 2.806014060974121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142431, + "balance_loss_mlp": 1.12589669, + "epoch": 0.03193535975375144, + "flos": 551299032576.0, + "grad_norm": 0.05992608893057494, + "language_loss": 1.00112009, + "learning_rate": 0.000999990294170312, + "loss": 1.01254439, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 0.16540527, + "step": 166, + "time_per_iteration": 2.6405951976776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113035, + "balance_loss_mlp": 1.11351717, + "epoch": 0.032127741439015006, + "flos": 543649314816.0, + "grad_norm": 0.05363857392651908, + "language_loss": 1.03767109, + "learning_rate": 0.0009999882559540566, + "loss": 1.04897451, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 0.16845703, + "step": 167, + "time_per_iteration": 2.69801664352417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127606, + "balance_loss_mlp": 1.11079764, + "epoch": 0.032320123124278566, + "flos": 548385831936.0, + "grad_norm": 0.03971308084427602, + "language_loss": 1.00767386, + "learning_rate": 0.000999986023625145, + "loss": 1.01894999, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 0.16821289, + "step": 168, + "time_per_iteration": 2.710706949234009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04227602, + "balance_loss_mlp": 3.93005633, + "epoch": 0.03251250480954213, + "flos": 1305886551552.0, + "grad_norm": 0.49669676383753814, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.8315202, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 2.96875, + "step": 169, + "time_per_iteration": 4.921034574508667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178384, + "balance_loss_mlp": 1.15987098, + "epoch": 0.03270488649480569, + "flos": 561132914688.0, + "grad_norm": 0.11256254520903143, + "language_loss": 1.01289928, + "learning_rate": 0.0009999809766328958, + "loss": 1.02468312, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 0.18518066, + "step": 170, + "time_per_iteration": 2.6784250736236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236994, + "balance_loss_mlp": 1.21676469, + "epoch": 0.03289726818006926, + "flos": 482363112960.0, + "grad_norm": 0.13219145589868983, + "language_loss": 1.0357101, + "learning_rate": 0.0009999781619715177, + "loss": 1.04807997, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 0.20227051, + "step": 171, + "time_per_iteration": 2.5412755012512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234758, + "balance_loss_mlp": 1.21518433, + "epoch": 0.03308964986533282, + "flos": 674647460352.0, + "grad_norm": 0.05193788120122226, + "language_loss": 1.03408492, + "learning_rate": 0.000999975153201402, + "loss": 1.0464325, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 0.19567871, + "step": 172, + "time_per_iteration": 2.864586353302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236351, + "balance_loss_mlp": 1.21688426, + "epoch": 0.033282031550596385, + "flos": 609217583616.0, + "grad_norm": 0.0814546252210238, + "language_loss": 1.01345742, + "learning_rate": 0.0009999719503237174, + "loss": 1.02582097, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 0.19470215, + "step": 173, + "time_per_iteration": 2.765923261642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266228, + "balance_loss_mlp": 1.24583161, + "epoch": 0.033474413235859944, + "flos": 468039762432.0, + "grad_norm": 0.11494520888694326, + "language_loss": 1.10141742, + "learning_rate": 0.0009999685533397073, + "loss": 1.11407971, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 0.20410156, + "step": 174, + "time_per_iteration": 2.5439114570617676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264706, + "balance_loss_mlp": 1.24525094, + "epoch": 0.03366679492112351, + "flos": 579634444800.0, + "grad_norm": 0.12313705571337571, + "language_loss": 1.01947784, + "learning_rate": 0.00099996496225069, + "loss": 1.03212488, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 0.19445801, + "step": 175, + "time_per_iteration": 2.6815552711486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01257561, + "balance_loss_mlp": 1.23677111, + "epoch": 0.03385917660638707, + "flos": 637678904832.0, + "grad_norm": 0.07888015485072913, + "language_loss": 1.04929149, + "learning_rate": 0.0009999611770580604, + "loss": 1.06186724, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 0.20788574, + "step": 176, + "time_per_iteration": 2.841484785079956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258013, + "balance_loss_mlp": 1.23668683, + "epoch": 0.03405155829165064, + "flos": 441816933888.0, + "grad_norm": 0.1202186920466195, + "language_loss": 1.03394961, + "learning_rate": 0.0009999571977632876, + "loss": 1.04652977, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 0.21350098, + "step": 177, + "time_per_iteration": 2.567788600921631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01271496, + "balance_loss_mlp": 1.25026441, + "epoch": 0.034243939976914196, + "flos": 466332443136.0, + "grad_norm": 0.09201820914192435, + "language_loss": 1.05765235, + "learning_rate": 0.0009999530243679166, + "loss": 1.07036722, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 0.21240234, + "step": 178, + "time_per_iteration": 2.5753743648529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258548, + "balance_loss_mlp": 1.23935485, + "epoch": 0.03443632166217776, + "flos": 779276671488.0, + "grad_norm": 0.06529189645852858, + "language_loss": 1.00495052, + "learning_rate": 0.0009999486568735675, + "loss": 1.01753592, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 0.19177246, + "step": 179, + "time_per_iteration": 3.0607473850250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251876, + "balance_loss_mlp": 1.23275518, + "epoch": 0.03462870334744132, + "flos": 1263777707520.0, + "grad_norm": 0.07628849485304477, + "language_loss": 1.00889277, + "learning_rate": 0.0009999440952819362, + "loss": 1.02141166, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 0.19116211, + "step": 180, + "time_per_iteration": 3.6515376567840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01248658, + "balance_loss_mlp": 1.22853494, + "epoch": 0.03482108503270489, + "flos": 607179151872.0, + "grad_norm": 0.05983966318213213, + "language_loss": 1.0115366, + "learning_rate": 0.0009999393395947935, + "loss": 1.02402306, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 0.2010498, + "step": 181, + "time_per_iteration": 2.799633502960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01253433, + "balance_loss_mlp": 1.23378766, + "epoch": 0.03501346671796845, + "flos": 538270396416.0, + "grad_norm": 0.0770350968764605, + "language_loss": 1.04747987, + "learning_rate": 0.0009999343898139858, + "loss": 1.06001413, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 0.19641113, + "step": 182, + "time_per_iteration": 2.627434253692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258891, + "balance_loss_mlp": 1.23675334, + "epoch": 0.035205848403232015, + "flos": 518484063744.0, + "grad_norm": 0.06485795323962908, + "language_loss": 1.03381288, + "learning_rate": 0.0009999292459414348, + "loss": 1.04640174, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 0.22131348, + "step": 183, + "time_per_iteration": 2.5552356243133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227697, + "balance_loss_mlp": 1.20765769, + "epoch": 0.035398230088495575, + "flos": 472373586432.0, + "grad_norm": 0.06837915158031915, + "language_loss": 1.07873201, + "learning_rate": 0.0009999239079791374, + "loss": 1.0910089, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 0.20031738, + "step": 184, + "time_per_iteration": 2.5553643703460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225953, + "balance_loss_mlp": 1.20453107, + "epoch": 0.03559061177375914, + "flos": 512074732032.0, + "grad_norm": 0.05538225102727573, + "language_loss": 1.00595856, + "learning_rate": 0.0009999183759291659, + "loss": 1.01821804, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 0.21435547, + "step": 185, + "time_per_iteration": 2.6955769062042236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199054, + "balance_loss_mlp": 1.17938447, + "epoch": 0.0357829934590227, + "flos": 477386887680.0, + "grad_norm": 0.052094207769016576, + "language_loss": 1.02581143, + "learning_rate": 0.0009999126497936682, + "loss": 1.03780198, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 0.1965332, + "step": 186, + "time_per_iteration": 2.5304598808288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198293, + "balance_loss_mlp": 1.1770494, + "epoch": 0.03597537514428627, + "flos": 644656485888.0, + "grad_norm": 0.057723222775786294, + "language_loss": 1.05774581, + "learning_rate": 0.0009999067295748676, + "loss": 1.06972873, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 0.21252441, + "step": 187, + "time_per_iteration": 2.797293186187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225876, + "balance_loss_mlp": 1.20496714, + "epoch": 0.03616775682954983, + "flos": 581186119680.0, + "grad_norm": 0.0756096280824464, + "language_loss": 1.03738201, + "learning_rate": 0.000999900615275062, + "loss": 1.04964077, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 0.20922852, + "step": 188, + "time_per_iteration": 2.677471399307251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211466, + "balance_loss_mlp": 1.18979406, + "epoch": 0.03636013851481339, + "flos": 382420859904.0, + "grad_norm": 0.0898221855427691, + "language_loss": 1.09605587, + "learning_rate": 0.0009998943068966256, + "loss": 1.10817051, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 0.21679688, + "step": 189, + "time_per_iteration": 2.4233202934265137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217638, + "balance_loss_mlp": 1.19651425, + "epoch": 0.03655252020007695, + "flos": 583224551424.0, + "grad_norm": 0.10338446511893212, + "language_loss": 1.03747463, + "learning_rate": 0.0009998878044420072, + "loss": 1.04965115, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 0.21130371, + "step": 190, + "time_per_iteration": 2.6978025436401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177731, + "balance_loss_mlp": 1.15573716, + "epoch": 0.03674490188534051, + "flos": 471619957248.0, + "grad_norm": 0.06881722524262912, + "language_loss": 0.99768066, + "learning_rate": 0.0009998811079137318, + "loss": 1.00945807, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 0.22009277, + "step": 191, + "time_per_iteration": 2.5934321880340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114348, + "balance_loss_mlp": 1.12218916, + "epoch": 0.03693728357060408, + "flos": 528372274176.0, + "grad_norm": 0.0852793637050772, + "language_loss": 1.0086391, + "learning_rate": 0.0009998742173143987, + "loss": 1.02007401, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 0.2130127, + "step": 192, + "time_per_iteration": 2.6706249713897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139307, + "balance_loss_mlp": 1.1180048, + "epoch": 0.03712966525586764, + "flos": 798993994752.0, + "grad_norm": 0.07456835679934387, + "language_loss": 1.01398337, + "learning_rate": 0.0009998671326466833, + "loss": 1.02537644, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 0.21313477, + "step": 193, + "time_per_iteration": 2.992595672607422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126772, + "balance_loss_mlp": 1.10519516, + "epoch": 0.037322046941131205, + "flos": 829973164032.0, + "grad_norm": 0.08171257283174432, + "language_loss": 1.02813613, + "learning_rate": 0.0009998598539133362, + "loss": 1.03940392, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 0.21594238, + "step": 194, + "time_per_iteration": 3.0081543922424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113199, + "balance_loss_mlp": 1.11179638, + "epoch": 0.037514428626394765, + "flos": 437685742080.0, + "grad_norm": 0.05573112518601677, + "language_loss": 1.02892375, + "learning_rate": 0.0009998523811171828, + "loss": 1.04024363, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 0.2019043, + "step": 195, + "time_per_iteration": 2.507708787918091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149122, + "balance_loss_mlp": 1.12843966, + "epoch": 0.03770681031165833, + "flos": 511625051136.0, + "grad_norm": 0.0935188115694547, + "language_loss": 1.0387187, + "learning_rate": 0.0009998447142611248, + "loss": 1.05020976, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 0.20690918, + "step": 196, + "time_per_iteration": 2.6388566493988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160139, + "balance_loss_mlp": 1.13986123, + "epoch": 0.03789919199692189, + "flos": 807449098752.0, + "grad_norm": 0.047444937864230444, + "language_loss": 0.96302813, + "learning_rate": 0.0009998368533481387, + "loss": 0.97462952, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 0.20275879, + "step": 197, + "time_per_iteration": 3.033572196960449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132809, + "balance_loss_mlp": 1.11254394, + "epoch": 0.03809157368218546, + "flos": 690576814080.0, + "grad_norm": 0.08710369828361038, + "language_loss": 0.9995833, + "learning_rate": 0.0009998287983812762, + "loss": 1.01091146, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 0.20263672, + "step": 198, + "time_per_iteration": 2.8421950340270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155397, + "balance_loss_mlp": 1.13373709, + "epoch": 0.03828395536744902, + "flos": 517940407296.0, + "grad_norm": 0.10277508525357126, + "language_loss": 1.05776644, + "learning_rate": 0.0009998205493636646, + "loss": 1.06932044, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 0.2166748, + "step": 199, + "time_per_iteration": 2.6924569606781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141939, + "balance_loss_mlp": 1.12035084, + "epoch": 0.038476337052712584, + "flos": 581662964736.0, + "grad_norm": 0.09429923895154278, + "language_loss": 0.98451054, + "learning_rate": 0.0009998121062985063, + "loss": 0.99592984, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 0.21594238, + "step": 200, + "time_per_iteration": 2.6926732063293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171328, + "balance_loss_mlp": 1.15014482, + "epoch": 0.03866871873797614, + "flos": 577086861312.0, + "grad_norm": 0.08332681767957313, + "language_loss": 1.00419915, + "learning_rate": 0.0009998034691890794, + "loss": 1.01591253, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 0.21203613, + "step": 201, + "time_per_iteration": 2.7643332481384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165409, + "balance_loss_mlp": 1.14516699, + "epoch": 0.03886110042323971, + "flos": 540731344896.0, + "grad_norm": 0.11326578301102472, + "language_loss": 1.05536067, + "learning_rate": 0.0009997946380387369, + "loss": 1.06701469, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 0.20251465, + "step": 202, + "time_per_iteration": 2.630284070968628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157571, + "balance_loss_mlp": 1.13723421, + "epoch": 0.03905348210850327, + "flos": 718002952704.0, + "grad_norm": 0.09790094078320352, + "language_loss": 1.07388449, + "learning_rate": 0.0009997856128509076, + "loss": 1.08546019, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 0.20336914, + "step": 203, + "time_per_iteration": 2.8435540199279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144349, + "balance_loss_mlp": 1.12458408, + "epoch": 0.039245863793766836, + "flos": 427493583360.0, + "grad_norm": 0.1356659453961297, + "language_loss": 1.02559984, + "learning_rate": 0.0009997763936290952, + "loss": 1.03704333, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 0.19750977, + "step": 204, + "time_per_iteration": 2.503309965133667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138207, + "balance_loss_mlp": 1.11642766, + "epoch": 0.039438245479030395, + "flos": 663096347136.0, + "grad_norm": 0.053010676996176516, + "language_loss": 1.07603145, + "learning_rate": 0.0009997669803768789, + "loss": 1.08741355, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 0.21789551, + "step": 205, + "time_per_iteration": 2.7773749828338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011146, + "balance_loss_mlp": 1.09366679, + "epoch": 0.03963062716429396, + "flos": 635349007872.0, + "grad_norm": 0.07785432610828748, + "language_loss": 1.0289582, + "learning_rate": 0.0009997573730979134, + "loss": 1.04010415, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 0.20947266, + "step": 206, + "time_per_iteration": 2.7241222858428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04720912, + "balance_loss_mlp": 3.71993518, + "epoch": 0.03982300884955752, + "flos": 1418565975552.0, + "grad_norm": 0.31672297251450016, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.83914113, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 10.0, + "step": 207, + "time_per_iteration": 4.65311074256897 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160001, + "balance_loss_mlp": 1.13651657, + "epoch": 0.04001539053482109, + "flos": 689118741504.0, + "grad_norm": 0.09244016287770654, + "language_loss": 1.01599813, + "learning_rate": 0.0009997375764747294, + "loss": 1.02759814, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 0.23449707, + "step": 208, + "time_per_iteration": 2.999249219894409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144547, + "balance_loss_mlp": 1.12159967, + "epoch": 0.04020777222008465, + "flos": 533639964672.0, + "grad_norm": 0.10768555369795524, + "language_loss": 0.98886019, + "learning_rate": 0.0009997273871381967, + "loss": 1.00030565, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 0.22949219, + "step": 209, + "time_per_iteration": 2.740895986557007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154635, + "balance_loss_mlp": 1.13075733, + "epoch": 0.040400153905348214, + "flos": 567927687168.0, + "grad_norm": 0.0670178022721504, + "language_loss": 1.03911638, + "learning_rate": 0.0009997170037902862, + "loss": 1.05066276, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 0.23876953, + "step": 210, + "time_per_iteration": 2.7199809551239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161677, + "balance_loss_mlp": 1.13826418, + "epoch": 0.040592535590611774, + "flos": 713439332352.0, + "grad_norm": 0.062356382061819024, + "language_loss": 1.06535935, + "learning_rate": 0.0009997064264350292, + "loss": 1.07697606, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 0.23413086, + "step": 211, + "time_per_iteration": 2.85477614402771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164794, + "balance_loss_mlp": 1.14111865, + "epoch": 0.04078491727587533, + "flos": 578100022272.0, + "grad_norm": 0.11782714892356931, + "language_loss": 1.00570273, + "learning_rate": 0.0009996956550765317, + "loss": 1.01735067, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 0.23657227, + "step": 212, + "time_per_iteration": 2.683258295059204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178964, + "balance_loss_mlp": 1.15452623, + "epoch": 0.0409772989611389, + "flos": 552299710464.0, + "grad_norm": 0.07352585681220185, + "language_loss": 0.95357072, + "learning_rate": 0.0009996846897189762, + "loss": 0.9653604, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 0.24438477, + "step": 213, + "time_per_iteration": 2.64486026763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171157, + "balance_loss_mlp": 1.14665973, + "epoch": 0.04116968064640246, + "flos": 555630285312.0, + "grad_norm": 0.06101080420793073, + "language_loss": 1.01569629, + "learning_rate": 0.0009996735303666193, + "loss": 1.02740788, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 0.24499512, + "step": 214, + "time_per_iteration": 2.719754934310913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189275, + "balance_loss_mlp": 1.16434813, + "epoch": 0.041362062331666026, + "flos": 578492803584.0, + "grad_norm": 0.09805160088916984, + "language_loss": 1.03784573, + "learning_rate": 0.0009996621770237937, + "loss": 1.04973853, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 0.24938965, + "step": 215, + "time_per_iteration": 2.7283520698547363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202725, + "balance_loss_mlp": 1.17728579, + "epoch": 0.041554444016929586, + "flos": 611443593216.0, + "grad_norm": 0.05858333324383458, + "language_loss": 0.99328029, + "learning_rate": 0.0009996506296949073, + "loss": 1.00530756, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 0.25463867, + "step": 216, + "time_per_iteration": 2.8774044513702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175124, + "balance_loss_mlp": 1.14957714, + "epoch": 0.04174682570219315, + "flos": 528115313664.0, + "grad_norm": 0.09898600739692984, + "language_loss": 0.99386859, + "learning_rate": 0.0009996388883844428, + "loss": 1.00561976, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 0.25561523, + "step": 217, + "time_per_iteration": 2.5985324382781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155134, + "balance_loss_mlp": 1.13007665, + "epoch": 0.04193920738745671, + "flos": 511506482688.0, + "grad_norm": 0.06208913439552352, + "language_loss": 1.03500867, + "learning_rate": 0.0009996269530969588, + "loss": 1.04656017, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 0.25048828, + "step": 218, + "time_per_iteration": 2.591993808746338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152332, + "balance_loss_mlp": 1.12778735, + "epoch": 0.04213158907272028, + "flos": 571490629632.0, + "grad_norm": 0.08789931910276294, + "language_loss": 1.02762055, + "learning_rate": 0.0009996148238370888, + "loss": 1.0391438, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 0.24536133, + "step": 219, + "time_per_iteration": 2.7247660160064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146753, + "balance_loss_mlp": 1.12125421, + "epoch": 0.04232397075798384, + "flos": 964222589952.0, + "grad_norm": 0.059765696203788965, + "language_loss": 0.98427057, + "learning_rate": 0.0009996025006095421, + "loss": 0.99573809, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 0.25524902, + "step": 220, + "time_per_iteration": 3.314250946044922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04012538, + "balance_loss_mlp": 3.61886096, + "epoch": 0.042516352443247404, + "flos": 1469595778560.0, + "grad_norm": 0.18322335632445477, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.81795681, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 3.921875, + "step": 221, + "time_per_iteration": 5.397853851318359 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142792, + "balance_loss_mlp": 1.11779404, + "epoch": 0.042708734128510964, + "flos": 654712823808.0, + "grad_norm": 0.10045289138425088, + "language_loss": 0.98726314, + "learning_rate": 0.0009995772722706307, + "loss": 0.99869102, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 0.25, + "step": 222, + "time_per_iteration": 2.8346786499023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168149, + "balance_loss_mlp": 1.14130318, + "epoch": 0.04290111581377453, + "flos": 431827407360.0, + "grad_norm": 0.07395583213906755, + "language_loss": 1.12709904, + "learning_rate": 0.0009995643671690604, + "loss": 1.13878047, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 0.26879883, + "step": 223, + "time_per_iteration": 2.4760169982910156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157966, + "balance_loss_mlp": 1.1317513, + "epoch": 0.04309349749903809, + "flos": 644676309504.0, + "grad_norm": 0.08239055528326475, + "language_loss": 1.00208497, + "learning_rate": 0.0009995512681194023, + "loss": 1.01366448, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 0.26257324, + "step": 224, + "time_per_iteration": 2.833751916885376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151672, + "balance_loss_mlp": 1.12492132, + "epoch": 0.04328587918430166, + "flos": 831267505152.0, + "grad_norm": 0.058356102807926864, + "language_loss": 0.97854793, + "learning_rate": 0.0009995379751267417, + "loss": 0.99006462, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 0.2677002, + "step": 225, + "time_per_iteration": 3.295761823654175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182559, + "balance_loss_mlp": 1.1551652, + "epoch": 0.043478260869565216, + "flos": 525066292224.0, + "grad_norm": 0.09032086206875983, + "language_loss": 0.99067688, + "learning_rate": 0.0009995244881962398, + "loss": 1.00250244, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 0.27416992, + "step": 226, + "time_per_iteration": 2.6147754192352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162924, + "balance_loss_mlp": 1.1352675, + "epoch": 0.04367064255482878, + "flos": 439484465664.0, + "grad_norm": 0.05273235380658081, + "language_loss": 1.00220668, + "learning_rate": 0.0009995108073331323, + "loss": 1.01383591, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 0.27661133, + "step": 227, + "time_per_iteration": 2.575477361679077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165107, + "balance_loss_mlp": 1.13835633, + "epoch": 0.04386302424009234, + "flos": 507380060160.0, + "grad_norm": 0.07222661628022838, + "language_loss": 1.03328192, + "learning_rate": 0.0009994969325427309, + "loss": 1.04493296, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 0.26733398, + "step": 228, + "time_per_iteration": 2.7351901531219482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159475, + "balance_loss_mlp": 1.13215184, + "epoch": 0.04405540592535591, + "flos": 540694268928.0, + "grad_norm": 0.05690950477809338, + "language_loss": 0.99788582, + "learning_rate": 0.0009994828638304218, + "loss": 1.0094806, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 0.2734375, + "step": 229, + "time_per_iteration": 2.6617660522460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160216, + "balance_loss_mlp": 1.13327467, + "epoch": 0.04424778761061947, + "flos": 446370642432.0, + "grad_norm": 0.0671245201901001, + "language_loss": 1.05080867, + "learning_rate": 0.0009994686012016675, + "loss": 1.06241083, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 0.26953125, + "step": 230, + "time_per_iteration": 2.5507686138153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200075, + "balance_loss_mlp": 1.17368245, + "epoch": 0.044440169295883035, + "flos": 700702161408.0, + "grad_norm": 0.08083200993131012, + "language_loss": 1.04836714, + "learning_rate": 0.000999454144662005, + "loss": 1.06036782, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 0.26416016, + "step": 231, + "time_per_iteration": 2.872386932373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177085, + "balance_loss_mlp": 1.15090632, + "epoch": 0.044632550981146595, + "flos": 588329256960.0, + "grad_norm": 0.06521500069668446, + "language_loss": 0.98697901, + "learning_rate": 0.0009994394942170468, + "loss": 0.99874985, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 0.26208496, + "step": 232, + "time_per_iteration": 2.6734542846679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160129, + "balance_loss_mlp": 1.13452244, + "epoch": 0.04482493266641016, + "flos": 554797734912.0, + "grad_norm": 0.06848368332912834, + "language_loss": 0.96340638, + "learning_rate": 0.0009994246498724808, + "loss": 0.97500765, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 0.25598145, + "step": 233, + "time_per_iteration": 2.735145330429077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169344, + "balance_loss_mlp": 1.14341569, + "epoch": 0.04501731435167372, + "flos": 722813621760.0, + "grad_norm": 0.09664881582101635, + "language_loss": 0.99309772, + "learning_rate": 0.00099940961163407, + "loss": 1.00479114, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 0.25964355, + "step": 234, + "time_per_iteration": 2.8988683223724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142226, + "balance_loss_mlp": 1.11722803, + "epoch": 0.04520969603693728, + "flos": 511790607360.0, + "grad_norm": 0.06003753756121682, + "language_loss": 1.01686716, + "learning_rate": 0.0009993943795076528, + "loss": 1.02828944, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 0.25012207, + "step": 235, + "time_per_iteration": 2.6333067417144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132836, + "balance_loss_mlp": 1.10618043, + "epoch": 0.04540207772220085, + "flos": 365058399744.0, + "grad_norm": 0.08170413586498586, + "language_loss": 1.0374043, + "learning_rate": 0.0009993789534991427, + "loss": 1.04873264, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 0.26708984, + "step": 236, + "time_per_iteration": 2.4350106716156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112254, + "balance_loss_mlp": 1.0960753, + "epoch": 0.045594459407464406, + "flos": 522669583872.0, + "grad_norm": 0.0440176634981383, + "language_loss": 0.99063611, + "learning_rate": 0.0009993633336145287, + "loss": 1.00186157, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 0.26513672, + "step": 237, + "time_per_iteration": 2.6414294242858887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134799, + "balance_loss_mlp": 1.10904956, + "epoch": 0.04578684109272797, + "flos": 671776104960.0, + "grad_norm": 0.04213473561248219, + "language_loss": 1.02718055, + "learning_rate": 0.0009993475198598752, + "loss": 1.03852856, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 0.25756836, + "step": 238, + "time_per_iteration": 2.9781904220581055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152995, + "balance_loss_mlp": 1.12614954, + "epoch": 0.04597922277799153, + "flos": 541633277952.0, + "grad_norm": 0.08613106589232603, + "language_loss": 1.00055635, + "learning_rate": 0.0009993315122413212, + "loss": 1.01208627, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 0.26879883, + "step": 239, + "time_per_iteration": 2.6395275592803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162384, + "balance_loss_mlp": 1.13594294, + "epoch": 0.0461716044632551, + "flos": 458984102400.0, + "grad_norm": 0.06839694959482054, + "language_loss": 0.99973977, + "learning_rate": 0.0009993153107650818, + "loss": 1.01136363, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 0.2644043, + "step": 240, + "time_per_iteration": 2.563133716583252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160824, + "balance_loss_mlp": 1.13391829, + "epoch": 0.04636398614851866, + "flos": 455240922624.0, + "grad_norm": 0.06471449859153773, + "language_loss": 0.98970807, + "learning_rate": 0.0009992989154374468, + "loss": 1.00131631, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 0.26928711, + "step": 241, + "time_per_iteration": 2.5339503288269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145463, + "balance_loss_mlp": 1.11914206, + "epoch": 0.046556367833782225, + "flos": 556826254848.0, + "grad_norm": 0.06957696695924716, + "language_loss": 1.05868769, + "learning_rate": 0.0009992823262647817, + "loss": 1.07014227, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 0.26342773, + "step": 242, + "time_per_iteration": 2.6841883659362793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111302, + "balance_loss_mlp": 1.08692503, + "epoch": 0.046748749519045785, + "flos": 592917470208.0, + "grad_norm": 0.0649477492764712, + "language_loss": 0.99848783, + "learning_rate": 0.0009992655432535264, + "loss": 1.00961804, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 0.2611084, + "step": 243, + "time_per_iteration": 2.7613234519958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107198, + "balance_loss_mlp": 1.08162785, + "epoch": 0.04694113120430935, + "flos": 569864802816.0, + "grad_norm": 0.05612685480258275, + "language_loss": 1.00329947, + "learning_rate": 0.0009992485664101973, + "loss": 1.01437151, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 0.25598145, + "step": 244, + "time_per_iteration": 2.717280387878418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122363, + "balance_loss_mlp": 1.09556472, + "epoch": 0.04713351288957291, + "flos": 863768987136.0, + "grad_norm": 0.10316769075352135, + "language_loss": 1.02662849, + "learning_rate": 0.000999231395741385, + "loss": 1.03785205, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 0.26831055, + "step": 245, + "time_per_iteration": 3.095249891281128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144012, + "balance_loss_mlp": 1.11837006, + "epoch": 0.04732589457483648, + "flos": 537215390208.0, + "grad_norm": 0.09647975042234339, + "language_loss": 1.01015186, + "learning_rate": 0.0009992140312537557, + "loss": 1.02159202, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 0.25671387, + "step": 246, + "time_per_iteration": 2.633258819580078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123665, + "balance_loss_mlp": 1.09845233, + "epoch": 0.04751827626010004, + "flos": 761906870784.0, + "grad_norm": 0.09798218580430706, + "language_loss": 0.95550418, + "learning_rate": 0.000999196472954051, + "loss": 0.96674085, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 0.25231934, + "step": 247, + "time_per_iteration": 3.024939775466919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02466762, + "balance_loss_mlp": 2.43700695, + "epoch": 0.0477106579453636, + "flos": 1579791859200.0, + "grad_norm": 0.2831653982047738, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.81891614, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 0.296875, + "step": 248, + "time_per_iteration": 5.486468076705933 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162703, + "balance_loss_mlp": 1.13626289, + "epoch": 0.04790303963062716, + "flos": 457766111232.0, + "grad_norm": 0.12969478117477343, + "language_loss": 1.03178453, + "learning_rate": 0.0009991607749457578, + "loss": 1.04341149, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 0.26464844, + "step": 249, + "time_per_iteration": 2.5253713130950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119774, + "balance_loss_mlp": 1.16941571, + "epoch": 0.04809542131589073, + "flos": 782419668480.0, + "grad_norm": 0.09425507858465235, + "language_loss": 1.01008546, + "learning_rate": 0.0009991426352510286, + "loss": 1.0220629, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 0.28295898, + "step": 250, + "time_per_iteration": 3.0042202472686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204128, + "balance_loss_mlp": 1.174016, + "epoch": 0.04828780300115429, + "flos": 559260039168.0, + "grad_norm": 0.07677732337183582, + "language_loss": 1.0282234, + "learning_rate": 0.0009991243017719422, + "loss": 1.04026473, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 0.30126953, + "step": 251, + "time_per_iteration": 2.709934711456299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206766, + "balance_loss_mlp": 1.17522311, + "epoch": 0.048480184686417856, + "flos": 501929561088.0, + "grad_norm": 0.1103729500964747, + "language_loss": 0.97436613, + "learning_rate": 0.0009991057745156165, + "loss": 0.9864338, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 0.31518555, + "step": 252, + "time_per_iteration": 2.5961716175079346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03348202, + "balance_loss_mlp": 3.30471396, + "epoch": 0.048672566371681415, + "flos": 1536360016896.0, + "grad_norm": 0.3811060337507454, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.85259187, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 0.43554688, + "step": 253, + "time_per_iteration": 5.0377867221832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195312, + "balance_loss_mlp": 1.1623621, + "epoch": 0.04886494805694498, + "flos": 537922031616.0, + "grad_norm": 0.07473951959737497, + "language_loss": 1.05491519, + "learning_rate": 0.0009990681387000943, + "loss": 1.06686831, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 0.3293457, + "step": 254, + "time_per_iteration": 2.7937283515930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121698, + "balance_loss_mlp": 1.18345821, + "epoch": 0.04905732974220854, + "flos": 680169540096.0, + "grad_norm": 0.06898181212790383, + "language_loss": 1.01063621, + "learning_rate": 0.0009990490301555093, + "loss": 1.02280605, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 0.33544922, + "step": 255, + "time_per_iteration": 2.9615726470947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.05252755, + "balance_loss_mlp": 5.12458086, + "epoch": 0.04924971142747211, + "flos": 1421179997184.0, + "grad_norm": 0.5609302024280507, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.84467912, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 1.28125, + "step": 256, + "time_per_iteration": 4.8413920402526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03162439, + "balance_loss_mlp": 3.09758925, + "epoch": 0.04944209311273567, + "flos": 1558006742016.0, + "grad_norm": 0.1723793408951341, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.8240518, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 0.6484375, + "step": 257, + "time_per_iteration": 4.985513687133789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03630928, + "balance_loss_mlp": 3.55844903, + "epoch": 0.04963447479799923, + "flos": 1570820262912.0, + "grad_norm": 0.4079591987734508, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.73606813, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 0.7265625, + "step": 258, + "time_per_iteration": 4.858096361160278 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01403117, + "balance_loss_mlp": 1.35569584, + "epoch": 0.049826856483262794, + "flos": 625349569536.0, + "grad_norm": 0.11330256318865821, + "language_loss": 0.95339322, + "learning_rate": 0.0009989706585723202, + "loss": 0.96742439, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 0.47436523, + "step": 259, + "time_per_iteration": 2.794419765472412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01437412, + "balance_loss_mlp": 1.38651013, + "epoch": 0.05001923816852635, + "flos": 504160713216.0, + "grad_norm": 0.10381773722922016, + "language_loss": 1.0219605, + "learning_rate": 0.0009989505813633442, + "loss": 1.03633475, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 0.50927734, + "step": 260, + "time_per_iteration": 2.6660099029541016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145174, + "balance_loss_mlp": 1.39776254, + "epoch": 0.05021161985378992, + "flos": 587345831424.0, + "grad_norm": 0.12909552841436595, + "language_loss": 1.02080631, + "learning_rate": 0.000998930310444573, + "loss": 1.03532374, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 0.5402832, + "step": 261, + "time_per_iteration": 2.7547266483306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01429363, + "balance_loss_mlp": 1.37698281, + "epoch": 0.05040400153905348, + "flos": 633303235584.0, + "grad_norm": 0.08616818959721087, + "language_loss": 0.99936116, + "learning_rate": 0.0009989098458238765, + "loss": 1.01365471, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 0.52441406, + "step": 262, + "time_per_iteration": 2.804656982421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01431577, + "balance_loss_mlp": 1.38310647, + "epoch": 0.050596383224317046, + "flos": 553636270080.0, + "grad_norm": 0.10103635045761167, + "language_loss": 0.99213421, + "learning_rate": 0.0009988891875091998, + "loss": 1.00644994, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 0.48486328, + "step": 263, + "time_per_iteration": 2.780696392059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01359367, + "balance_loss_mlp": 1.31771505, + "epoch": 0.050788764909580605, + "flos": 549663293952.0, + "grad_norm": 0.09437475228894394, + "language_loss": 0.93793595, + "learning_rate": 0.0009988683355085636, + "loss": 0.95152962, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 0.41625977, + "step": 264, + "time_per_iteration": 2.758275032043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01314446, + "balance_loss_mlp": 1.27684712, + "epoch": 0.05098114659484417, + "flos": 605118325248.0, + "grad_norm": 0.09784246378207673, + "language_loss": 1.02612829, + "learning_rate": 0.000998847289830063, + "loss": 1.03927279, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 0.37524414, + "step": 265, + "time_per_iteration": 2.8752288818359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01289086, + "balance_loss_mlp": 1.25468266, + "epoch": 0.05117352828010773, + "flos": 438548027904.0, + "grad_norm": 0.06973466471853282, + "language_loss": 0.95293748, + "learning_rate": 0.0009988260504818682, + "loss": 0.9658283, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 0.34423828, + "step": 266, + "time_per_iteration": 2.5960564613342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01290407, + "balance_loss_mlp": 1.2563374, + "epoch": 0.0513659099653713, + "flos": 505032910848.0, + "grad_norm": 0.0971565340820806, + "language_loss": 1.02148294, + "learning_rate": 0.000998804617472226, + "loss": 1.03438699, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 0.34082031, + "step": 267, + "time_per_iteration": 2.658709764480591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275377, + "balance_loss_mlp": 1.24085402, + "epoch": 0.05155829165063486, + "flos": 695488799232.0, + "grad_norm": 0.10761719469623075, + "language_loss": 0.96939588, + "learning_rate": 0.0009987829908094568, + "loss": 0.98214972, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 0.34545898, + "step": 268, + "time_per_iteration": 2.8270740509033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01271333, + "balance_loss_mlp": 1.23785877, + "epoch": 0.051750673335898424, + "flos": 1348260111360.0, + "grad_norm": 0.1226169977774822, + "language_loss": 1.04002702, + "learning_rate": 0.0009987611705019569, + "loss": 1.05274034, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 0.33496094, + "step": 269, + "time_per_iteration": 4.483954429626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01277218, + "balance_loss_mlp": 1.24267149, + "epoch": 0.051943055021161984, + "flos": 489607566336.0, + "grad_norm": 0.07374197309260985, + "language_loss": 1.02401245, + "learning_rate": 0.0009987391565581978, + "loss": 1.03678453, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 0.34594727, + "step": 270, + "time_per_iteration": 2.627356767654419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01304636, + "balance_loss_mlp": 1.26977956, + "epoch": 0.05213543670642555, + "flos": 545779150848.0, + "grad_norm": 0.06923057034816653, + "language_loss": 0.94496262, + "learning_rate": 0.000998716948986726, + "loss": 0.95800889, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 0.34887695, + "step": 271, + "time_per_iteration": 2.804185628890991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01322736, + "balance_loss_mlp": 1.28718746, + "epoch": 0.05232781839168911, + "flos": 603561881088.0, + "grad_norm": 0.1173780328671846, + "language_loss": 0.97372609, + "learning_rate": 0.0009986945477961633, + "loss": 0.9869535, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 0.35571289, + "step": 272, + "time_per_iteration": 2.739595890045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01297409, + "balance_loss_mlp": 1.2620039, + "epoch": 0.052520200076952676, + "flos": 538504962048.0, + "grad_norm": 0.07261359465506025, + "language_loss": 1.02136993, + "learning_rate": 0.0009986719529952066, + "loss": 1.03434396, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 0.35424805, + "step": 273, + "time_per_iteration": 2.8717877864837646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239184, + "balance_loss_mlp": 1.20389819, + "epoch": 0.052712581762216236, + "flos": 463384737792.0, + "grad_norm": 0.13624684616705834, + "language_loss": 1.01736569, + "learning_rate": 0.000998649164592628, + "loss": 1.0297575, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 0.35327148, + "step": 274, + "time_per_iteration": 2.590993642807007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206885, + "balance_loss_mlp": 1.16945291, + "epoch": 0.0529049634474798, + "flos": 548020214784.0, + "grad_norm": 0.061304815826305474, + "language_loss": 0.99439085, + "learning_rate": 0.0009986261825972748, + "loss": 1.00645971, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 0.37426758, + "step": 275, + "time_per_iteration": 2.702202081680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183431, + "balance_loss_mlp": 1.14466429, + "epoch": 0.05309734513274336, + "flos": 618021052416.0, + "grad_norm": 0.10486338408500256, + "language_loss": 1.01433325, + "learning_rate": 0.000998603007018069, + "loss": 1.02616751, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 0.38745117, + "step": 276, + "time_per_iteration": 2.876267671585083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190326, + "balance_loss_mlp": 1.15055728, + "epoch": 0.05328972681800693, + "flos": 605498996736.0, + "grad_norm": 0.08719890934761923, + "language_loss": 0.99445826, + "learning_rate": 0.0009985796378640089, + "loss": 1.00636148, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 0.39746094, + "step": 277, + "time_per_iteration": 2.74886155128479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165278, + "balance_loss_mlp": 1.12720275, + "epoch": 0.05348210850327049, + "flos": 604503088128.0, + "grad_norm": 0.06292174667602014, + "language_loss": 0.99806106, + "learning_rate": 0.0009985560751441665, + "loss": 1.00971389, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 0.38061523, + "step": 278, + "time_per_iteration": 2.8894753456115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175743, + "balance_loss_mlp": 1.13790607, + "epoch": 0.053674490188534055, + "flos": 630782816256.0, + "grad_norm": 0.06329003141341145, + "language_loss": 1.01538157, + "learning_rate": 0.00099853231886769, + "loss": 1.02713895, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 0.37792969, + "step": 279, + "time_per_iteration": 2.783085823059082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183406, + "balance_loss_mlp": 1.14633179, + "epoch": 0.053866871873797614, + "flos": 479185611264.0, + "grad_norm": 0.06545769746199957, + "language_loss": 1.01316965, + "learning_rate": 0.0009985083690438024, + "loss": 1.02500367, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 0.37084961, + "step": 280, + "time_per_iteration": 2.707329511642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147788, + "balance_loss_mlp": 1.11245418, + "epoch": 0.054059253559061174, + "flos": 788035723776.0, + "grad_norm": 0.05305898567294309, + "language_loss": 0.9175781, + "learning_rate": 0.0009984842256818016, + "loss": 0.92905599, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 0.35400391, + "step": 281, + "time_per_iteration": 3.1014201641082764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165943, + "balance_loss_mlp": 1.13106215, + "epoch": 0.05425163524432474, + "flos": 628361515008.0, + "grad_norm": 0.05782684737590577, + "language_loss": 1.02446878, + "learning_rate": 0.0009984598887910613, + "loss": 1.03612816, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 0.34912109, + "step": 282, + "time_per_iteration": 2.75343656539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180579, + "balance_loss_mlp": 1.14555514, + "epoch": 0.0544440169295883, + "flos": 615760164864.0, + "grad_norm": 0.0631633618899466, + "language_loss": 0.98333299, + "learning_rate": 0.0009984353583810297, + "loss": 0.99513876, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 0.3503418, + "step": 283, + "time_per_iteration": 2.8092565536499023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186173, + "balance_loss_mlp": 1.15350997, + "epoch": 0.05463639861485187, + "flos": 647762406912.0, + "grad_norm": 0.0821933313576245, + "language_loss": 1.00416183, + "learning_rate": 0.0009984106344612302, + "loss": 1.01602352, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 0.32666016, + "step": 284, + "time_per_iteration": 2.7632908821105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163515, + "balance_loss_mlp": 1.1310904, + "epoch": 0.054828780300115426, + "flos": 797192699904.0, + "grad_norm": 0.06349155766627652, + "language_loss": 0.95740765, + "learning_rate": 0.0009983857170412615, + "loss": 0.96904278, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 0.32421875, + "step": 285, + "time_per_iteration": 2.9946134090423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130334, + "balance_loss_mlp": 1.09912539, + "epoch": 0.05502116198537899, + "flos": 549690458112.0, + "grad_norm": 0.0487694941790178, + "language_loss": 0.95326382, + "learning_rate": 0.000998360606130798, + "loss": 0.96456718, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 0.31176758, + "step": 286, + "time_per_iteration": 2.8205370903015137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.09512836, + "balance_loss_mlp": 7.26674223, + "epoch": 0.05521354367064255, + "flos": 1407753437184.0, + "grad_norm": 0.42812971022266805, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.78585953, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 22.5, + "step": 287, + "time_per_iteration": 4.986966848373413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173658, + "balance_loss_mlp": 1.14278328, + "epoch": 0.05540592535590612, + "flos": 645420026880.0, + "grad_norm": 0.08917023960137904, + "language_loss": 1.01027536, + "learning_rate": 0.0009983098038774552, + "loss": 1.02201188, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 0.30834961, + "step": 288, + "time_per_iteration": 2.8100168704986572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.06110836, + "balance_loss_mlp": 5.25634384, + "epoch": 0.05559830704116968, + "flos": 1511095647744.0, + "grad_norm": 0.4031517895181362, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.84281063, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 8.5625, + "step": 289, + "time_per_iteration": 4.790200233459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126699, + "balance_loss_mlp": 1.23435044, + "epoch": 0.055790688726433245, + "flos": 508328980992.0, + "grad_norm": 0.18275347501036113, + "language_loss": 0.9955281, + "learning_rate": 0.0009982582277800948, + "loss": 1.00819802, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 0.32641602, + "step": 290, + "time_per_iteration": 2.5976333618164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281728, + "balance_loss_mlp": 1.24694288, + "epoch": 0.055983070411696804, + "flos": 657870501888.0, + "grad_norm": 0.14603269886404707, + "language_loss": 1.06751418, + "learning_rate": 0.0009982321495648908, + "loss": 1.08033144, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 0.34838867, + "step": 291, + "time_per_iteration": 2.8513312339782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250537, + "balance_loss_mlp": 1.21348643, + "epoch": 0.05617545209696037, + "flos": 587335919616.0, + "grad_norm": 0.09283742859778188, + "language_loss": 0.97403693, + "learning_rate": 0.0009982058779188115, + "loss": 0.98654234, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 0.37011719, + "step": 292, + "time_per_iteration": 2.728203773498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230786, + "balance_loss_mlp": 1.19170928, + "epoch": 0.05636783378222393, + "flos": 611621632512.0, + "grad_norm": 0.08826519450204054, + "language_loss": 1.05705655, + "learning_rate": 0.0009981794128520567, + "loss": 1.06936455, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 0.39038086, + "step": 293, + "time_per_iteration": 2.79616379737854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01253904, + "balance_loss_mlp": 1.21258569, + "epoch": 0.0565602154674875, + "flos": 668161405440.0, + "grad_norm": 0.08065602932127632, + "language_loss": 1.01724029, + "learning_rate": 0.000998152754374901, + "loss": 1.02977943, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 0.41333008, + "step": 294, + "time_per_iteration": 2.9352946281433105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232141, + "balance_loss_mlp": 1.19132411, + "epoch": 0.05675259715275106, + "flos": 617242830336.0, + "grad_norm": 0.07309017642696977, + "language_loss": 0.9826439, + "learning_rate": 0.0009981259024976943, + "loss": 0.99496531, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 0.40820312, + "step": 295, + "time_per_iteration": 2.7376105785369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244019, + "balance_loss_mlp": 1.20112753, + "epoch": 0.05694497883801462, + "flos": 751769040384.0, + "grad_norm": 0.07769478500482971, + "language_loss": 0.96765345, + "learning_rate": 0.0009980988572308612, + "loss": 0.9800936, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 0.42871094, + "step": 296, + "time_per_iteration": 3.001779556274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226011, + "balance_loss_mlp": 1.18197489, + "epoch": 0.05713736052327818, + "flos": 712010995200.0, + "grad_norm": 0.0588150430335769, + "language_loss": 0.99343681, + "learning_rate": 0.0009980716185849015, + "loss": 1.00569689, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 0.44067383, + "step": 297, + "time_per_iteration": 2.9817121028900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223805, + "balance_loss_mlp": 1.18153381, + "epoch": 0.05732974220854175, + "flos": 468976200192.0, + "grad_norm": 0.06400414638033543, + "language_loss": 0.95616293, + "learning_rate": 0.0009980441865703904, + "loss": 0.96840101, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 0.4230957, + "step": 298, + "time_per_iteration": 2.615875244140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122669, + "balance_loss_mlp": 1.18513405, + "epoch": 0.05752212389380531, + "flos": 601422133248.0, + "grad_norm": 0.09089975305964836, + "language_loss": 1.03662193, + "learning_rate": 0.000998016561197978, + "loss": 1.04888892, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 0.41577148, + "step": 299, + "time_per_iteration": 2.765833854675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219698, + "balance_loss_mlp": 1.17835617, + "epoch": 0.057714505579068875, + "flos": 678664852992.0, + "grad_norm": 0.05662219614280908, + "language_loss": 0.94978034, + "learning_rate": 0.0009979887424783895, + "loss": 0.96197736, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 0.41357422, + "step": 300, + "time_per_iteration": 2.8931760787963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122099, + "balance_loss_mlp": 1.17850339, + "epoch": 0.057906887264332435, + "flos": 595884999168.0, + "grad_norm": 0.05388706690809858, + "language_loss": 0.94851983, + "learning_rate": 0.0009979607304224248, + "loss": 0.96072972, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 0.42504883, + "step": 301, + "time_per_iteration": 2.719282388687134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213648, + "balance_loss_mlp": 1.16951644, + "epoch": 0.058099268949596, + "flos": 552116901888.0, + "grad_norm": 0.0564182452216587, + "language_loss": 1.02312028, + "learning_rate": 0.000997932525040959, + "loss": 1.03525686, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 0.44140625, + "step": 302, + "time_per_iteration": 2.7084572315216064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185834, + "balance_loss_mlp": 1.14165473, + "epoch": 0.05829165063485956, + "flos": 508170765312.0, + "grad_norm": 0.07525794393376325, + "language_loss": 1.04335976, + "learning_rate": 0.000997904126344943, + "loss": 1.05521822, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 0.44165039, + "step": 303, + "time_per_iteration": 2.6271631717681885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121438, + "balance_loss_mlp": 1.17055893, + "epoch": 0.05848403232012313, + "flos": 615231562752.0, + "grad_norm": 0.0664075129682053, + "language_loss": 1.00263453, + "learning_rate": 0.0009978755343454018, + "loss": 1.01477838, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 0.43823242, + "step": 304, + "time_per_iteration": 2.791146993637085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182664, + "balance_loss_mlp": 1.13869941, + "epoch": 0.05867641400538669, + "flos": 500083849728.0, + "grad_norm": 0.07350056034493838, + "language_loss": 1.01461756, + "learning_rate": 0.0009978467490534355, + "loss": 1.0264442, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 0.43969727, + "step": 305, + "time_per_iteration": 2.614455461502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186922, + "balance_loss_mlp": 1.14424467, + "epoch": 0.05886879569065025, + "flos": 531290244096.0, + "grad_norm": 0.056638515612222363, + "language_loss": 0.97774673, + "learning_rate": 0.00099781777048022, + "loss": 0.98961592, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 0.42700195, + "step": 306, + "time_per_iteration": 2.717700481414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011718, + "balance_loss_mlp": 1.12855101, + "epoch": 0.05906117737591381, + "flos": 489056569344.0, + "grad_norm": 0.056560878082468485, + "language_loss": 0.99827361, + "learning_rate": 0.0009977885986370057, + "loss": 1.00999165, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 0.43310547, + "step": 307, + "time_per_iteration": 2.557203531265259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164879, + "balance_loss_mlp": 1.12263095, + "epoch": 0.05925355906117737, + "flos": 591511527936.0, + "grad_norm": 0.05991229640473007, + "language_loss": 0.9525907, + "learning_rate": 0.000997759233535118, + "loss": 0.9642396, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 0.42285156, + "step": 308, + "time_per_iteration": 2.8033511638641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174986, + "balance_loss_mlp": 1.1345737, + "epoch": 0.05944594074644094, + "flos": 563655532032.0, + "grad_norm": 0.06710738832596337, + "language_loss": 1.01122141, + "learning_rate": 0.0009977296751859576, + "loss": 1.02297115, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 0.40405273, + "step": 309, + "time_per_iteration": 2.8259334564208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164837, + "balance_loss_mlp": 1.12487829, + "epoch": 0.0596383224317045, + "flos": 538747241472.0, + "grad_norm": 0.05223481097130428, + "language_loss": 1.03482628, + "learning_rate": 0.0009976999236009998, + "loss": 1.0464747, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 0.39941406, + "step": 310, + "time_per_iteration": 2.769092321395874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164403, + "balance_loss_mlp": 1.1263994, + "epoch": 0.059830704116968066, + "flos": 560957446656.0, + "grad_norm": 0.05685909644716586, + "language_loss": 1.04877043, + "learning_rate": 0.0009976699787917955, + "loss": 1.06041443, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 0.37963867, + "step": 311, + "time_per_iteration": 2.6526851654052734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08775091, + "balance_loss_mlp": 7.79852915, + "epoch": 0.060023085802231625, + "flos": 1570615059456.0, + "grad_norm": 0.2725707199289832, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.82218087, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 9.75, + "step": 312, + "time_per_iteration": 5.006884813308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161181, + "balance_loss_mlp": 1.12172294, + "epoch": 0.06021546748749519, + "flos": 482657149440.0, + "grad_norm": 0.06726838636277511, + "language_loss": 0.96427834, + "learning_rate": 0.0009976095095472243, + "loss": 0.97589004, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 0.39428711, + "step": 313, + "time_per_iteration": 2.5785915851593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166252, + "balance_loss_mlp": 1.12738967, + "epoch": 0.06040784917275875, + "flos": 620195304960.0, + "grad_norm": 0.0761643630364548, + "language_loss": 0.97957367, + "learning_rate": 0.0009975789851353334, + "loss": 0.99123621, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 0.38818359, + "step": 314, + "time_per_iteration": 2.814901828765869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173499, + "balance_loss_mlp": 1.13191843, + "epoch": 0.06060023085802232, + "flos": 483553939968.0, + "grad_norm": 0.07475166161853689, + "language_loss": 1.00319684, + "learning_rate": 0.0009975482675461487, + "loss": 1.0149318, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 0.41601562, + "step": 315, + "time_per_iteration": 2.65468692779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159286, + "balance_loss_mlp": 1.11591756, + "epoch": 0.06079261254328588, + "flos": 581892761088.0, + "grad_norm": 0.08252555003670439, + "language_loss": 0.98425788, + "learning_rate": 0.0009975173567915952, + "loss": 0.99585068, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 0.43383789, + "step": 316, + "time_per_iteration": 2.6916940212249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173408, + "balance_loss_mlp": 1.12767935, + "epoch": 0.060984994228549444, + "flos": 687794664960.0, + "grad_norm": 0.0640207679679256, + "language_loss": 0.91960573, + "learning_rate": 0.000997486252883674, + "loss": 0.93133986, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 0.45727539, + "step": 317, + "time_per_iteration": 2.8535635471343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188261, + "balance_loss_mlp": 1.13979006, + "epoch": 0.061177375913813004, + "flos": 1314775577088.0, + "grad_norm": 0.0671416603225842, + "language_loss": 0.97457695, + "learning_rate": 0.0009974549558344602, + "loss": 0.98645949, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 0.484375, + "step": 318, + "time_per_iteration": 3.6911113262176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189393, + "balance_loss_mlp": 1.14037383, + "epoch": 0.06136975759907657, + "flos": 574337018880.0, + "grad_norm": 0.09268216800999254, + "language_loss": 1.06808639, + "learning_rate": 0.000997423465656105, + "loss": 1.07998025, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 0.49023438, + "step": 319, + "time_per_iteration": 2.727130651473999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147465, + "balance_loss_mlp": 1.096205, + "epoch": 0.06156213928434013, + "flos": 527537152512.0, + "grad_norm": 0.06029287427116143, + "language_loss": 1.04509127, + "learning_rate": 0.0009973917823608335, + "loss": 1.05656588, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 0.51318359, + "step": 320, + "time_per_iteration": 2.654794454574585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148103, + "balance_loss_mlp": 1.09605646, + "epoch": 0.061754520969603696, + "flos": 495507746304.0, + "grad_norm": 0.03213952729051003, + "language_loss": 0.98612553, + "learning_rate": 0.0009973599059609462, + "loss": 0.99760658, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 0.52075195, + "step": 321, + "time_per_iteration": 2.7024786472320557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142475, + "balance_loss_mlp": 1.09133446, + "epoch": 0.061946902654867256, + "flos": 440079879168.0, + "grad_norm": 0.04984356389382333, + "language_loss": 0.97161096, + "learning_rate": 0.000997327836468819, + "loss": 0.9830358, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 0.51147461, + "step": 322, + "time_per_iteration": 2.6242218017578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142913, + "balance_loss_mlp": 1.0917964, + "epoch": 0.06213928434013082, + "flos": 598800397824.0, + "grad_norm": 0.06671524152363617, + "language_loss": 0.99795449, + "learning_rate": 0.000997295573896902, + "loss": 1.00938356, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 0.51171875, + "step": 323, + "time_per_iteration": 2.834237813949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03299168, + "balance_loss_mlp": 3.12445545, + "epoch": 0.06233166602539438, + "flos": 1450135789056.0, + "grad_norm": 0.43556355854402456, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.84495211, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 1.75, + "step": 324, + "time_per_iteration": 4.770992040634155 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02151431, + "balance_loss_mlp": 1.9545927, + "epoch": 0.06252404771065795, + "flos": 1463327036928.0, + "grad_norm": 0.14082611715048204, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.80723369, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 1.9609375, + "step": 325, + "time_per_iteration": 4.8816118240356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196226, + "balance_loss_mlp": 1.14768362, + "epoch": 0.06271642939592151, + "flos": 464294011392.0, + "grad_norm": 0.08367806581965369, + "language_loss": 0.93651855, + "learning_rate": 0.000997197627828043, + "loss": 0.94848073, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 0.4855957, + "step": 326, + "time_per_iteration": 2.5508148670196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215208, + "balance_loss_mlp": 1.16862106, + "epoch": 0.06290881108118507, + "flos": 532374985728.0, + "grad_norm": 0.06635735350324974, + "language_loss": 0.89348811, + "learning_rate": 0.0009971645930629716, + "loss": 0.90564024, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 0.46533203, + "step": 327, + "time_per_iteration": 2.711386203765869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125047, + "balance_loss_mlp": 1.20192814, + "epoch": 0.06310119276644863, + "flos": 673562718720.0, + "grad_norm": 0.08863859510008423, + "language_loss": 1.03147936, + "learning_rate": 0.0009971313652814872, + "loss": 1.04398406, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 0.48486328, + "step": 328, + "time_per_iteration": 2.8484854698181152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225004, + "balance_loss_mlp": 1.17553234, + "epoch": 0.0632935744517122, + "flos": 770732734464.0, + "grad_norm": 0.08503417282278386, + "language_loss": 1.0059731, + "learning_rate": 0.0009970979444964903, + "loss": 1.01822317, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 0.49487305, + "step": 329, + "time_per_iteration": 2.957482099533081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197604, + "balance_loss_mlp": 1.14846587, + "epoch": 0.06348595613697576, + "flos": 561913708032.0, + "grad_norm": 0.06790724972181753, + "language_loss": 1.01849604, + "learning_rate": 0.0009970643307209556, + "loss": 1.03047216, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 0.49121094, + "step": 330, + "time_per_iteration": 2.8220374584198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170349, + "balance_loss_mlp": 1.1215446, + "epoch": 0.06367833782223932, + "flos": 676189223424.0, + "grad_norm": 0.06721894230078661, + "language_loss": 0.98097444, + "learning_rate": 0.0009970305239679334, + "loss": 0.99267793, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 0.48803711, + "step": 331, + "time_per_iteration": 2.8813369274139404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176679, + "balance_loss_mlp": 1.12754059, + "epoch": 0.06387071950750288, + "flos": 495297773568.0, + "grad_norm": 0.056286161373139375, + "language_loss": 1.03013992, + "learning_rate": 0.0009969965242505483, + "loss": 1.04190671, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 0.4909668, + "step": 332, + "time_per_iteration": 2.6662604808807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168774, + "balance_loss_mlp": 1.11932611, + "epoch": 0.06406310119276645, + "flos": 533447244288.0, + "grad_norm": 0.06031850484613652, + "language_loss": 0.99096131, + "learning_rate": 0.0009969623315820007, + "loss": 1.00264907, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 0.49487305, + "step": 333, + "time_per_iteration": 2.6671581268310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153261, + "balance_loss_mlp": 1.10619712, + "epoch": 0.06425548287803001, + "flos": 456184700928.0, + "grad_norm": 0.06229524640691676, + "language_loss": 0.99215055, + "learning_rate": 0.000996927945975565, + "loss": 1.00368309, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 0.47070312, + "step": 334, + "time_per_iteration": 2.568838357925415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115937, + "balance_loss_mlp": 1.1125921, + "epoch": 0.06444786456329357, + "flos": 560077908480.0, + "grad_norm": 0.05620099657237302, + "language_loss": 0.95852566, + "learning_rate": 0.0009968933674445906, + "loss": 0.97011936, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 0.46728516, + "step": 335, + "time_per_iteration": 2.6725666522979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160514, + "balance_loss_mlp": 1.1122818, + "epoch": 0.06464024624855713, + "flos": 666085897728.0, + "grad_norm": 0.05589062806096766, + "language_loss": 0.97974062, + "learning_rate": 0.0009968585960025028, + "loss": 0.99134576, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 0.48217773, + "step": 336, + "time_per_iteration": 2.945194959640503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0396516, + "balance_loss_mlp": 3.85834861, + "epoch": 0.0648326279338207, + "flos": 1521371870208.0, + "grad_norm": 0.42886267506062575, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.81618351, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 1.0703125, + "step": 337, + "time_per_iteration": 4.802944183349609 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215082, + "balance_loss_mlp": 1.16968668, + "epoch": 0.06502500961908426, + "flos": 1143339909120.0, + "grad_norm": 0.09324534870618859, + "language_loss": 0.96021777, + "learning_rate": 0.0009967884744390583, + "loss": 0.9723686, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 0.45361328, + "step": 338, + "time_per_iteration": 3.5247950553894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251582, + "balance_loss_mlp": 1.2060678, + "epoch": 0.06521739130434782, + "flos": 582609314304.0, + "grad_norm": 0.09123718626917265, + "language_loss": 0.97373873, + "learning_rate": 0.0009967531243449256, + "loss": 0.98625457, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 0.45507812, + "step": 339, + "time_per_iteration": 2.681973695755005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211309, + "balance_loss_mlp": 1.163077, + "epoch": 0.06540977298961138, + "flos": 497650065408.0, + "grad_norm": 0.06030156589334856, + "language_loss": 1.04525125, + "learning_rate": 0.000996717581394126, + "loss": 1.05736434, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 0.48242188, + "step": 340, + "time_per_iteration": 2.6031126976013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205107, + "balance_loss_mlp": 1.15630233, + "epoch": 0.06560215467487496, + "flos": 542871092736.0, + "grad_norm": 0.06934362388274598, + "language_loss": 1.05133414, + "learning_rate": 0.000996681845600459, + "loss": 1.06338525, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 0.48803711, + "step": 341, + "time_per_iteration": 2.6689491271972656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190009, + "balance_loss_mlp": 1.1402986, + "epoch": 0.06579453636013852, + "flos": 413454357504.0, + "grad_norm": 0.07929020766121274, + "language_loss": 0.97276402, + "learning_rate": 0.0009966459169777982, + "loss": 0.98466408, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 0.49731445, + "step": 342, + "time_per_iteration": 2.5235347747802734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183772, + "balance_loss_mlp": 1.13444376, + "epoch": 0.06598691804540208, + "flos": 560618993664.0, + "grad_norm": 0.06503113555429127, + "language_loss": 1.05431008, + "learning_rate": 0.0009966097955400924, + "loss": 1.0661478, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 0.4934082, + "step": 343, + "time_per_iteration": 2.6987814903259277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195626, + "balance_loss_mlp": 1.14772749, + "epoch": 0.06617929973066564, + "flos": 572090812416.0, + "grad_norm": 0.05810753199069879, + "language_loss": 0.99792945, + "learning_rate": 0.0009965734813013652, + "loss": 1.00988579, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 0.47924805, + "step": 344, + "time_per_iteration": 2.8092823028564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211149, + "balance_loss_mlp": 1.16191518, + "epoch": 0.06637168141592921, + "flos": 490479763968.0, + "grad_norm": 0.08606224500635251, + "language_loss": 1.02011895, + "learning_rate": 0.0009965369742757151, + "loss": 1.03223062, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 0.49243164, + "step": 345, + "time_per_iteration": 2.5981764793395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193116, + "balance_loss_mlp": 1.14435959, + "epoch": 0.06656406310119277, + "flos": 1079194834944.0, + "grad_norm": 0.0619511290056959, + "language_loss": 0.98293203, + "learning_rate": 0.0009965002744773152, + "loss": 0.99486327, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 0.48730469, + "step": 346, + "time_per_iteration": 3.4968950748443604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178364, + "balance_loss_mlp": 1.13115668, + "epoch": 0.06675644478645633, + "flos": 513680735232.0, + "grad_norm": 0.04856723246232052, + "language_loss": 0.95658922, + "learning_rate": 0.0009964633819204139, + "loss": 0.96837282, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 0.47167969, + "step": 347, + "time_per_iteration": 2.6705336570739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04576048, + "balance_loss_mlp": 4.3029151, + "epoch": 0.06694882647171989, + "flos": 1447192479744.0, + "grad_norm": 0.32603271390487504, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.86377156, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 2.734375, + "step": 348, + "time_per_iteration": 4.961863994598389 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03789769, + "balance_loss_mlp": 3.60590124, + "epoch": 0.06714120815698346, + "flos": 1552061772288.0, + "grad_norm": 0.16497869204612428, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.78943658, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 1.8359375, + "step": 349, + "time_per_iteration": 4.876751184463501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181375, + "balance_loss_mlp": 1.13578987, + "epoch": 0.06733358984224702, + "flos": 880073869824.0, + "grad_norm": 0.07770510755269132, + "language_loss": 0.96067584, + "learning_rate": 0.000996351547842304, + "loss": 0.9724896, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 0.45581055, + "step": 350, + "time_per_iteration": 3.166680097579956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217287, + "balance_loss_mlp": 1.16969919, + "epoch": 0.06752597152751058, + "flos": 518906580480.0, + "grad_norm": 0.06167835917893234, + "language_loss": 0.94333142, + "learning_rate": 0.0009963138843953744, + "loss": 0.9555043, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 0.47558594, + "step": 351, + "time_per_iteration": 2.5784904956817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122803, + "balance_loss_mlp": 1.18005991, + "epoch": 0.06771835321277414, + "flos": 539668624896.0, + "grad_norm": 0.06188972934791396, + "language_loss": 0.98543227, + "learning_rate": 0.000996276028262306, + "loss": 0.99771261, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 0.47924805, + "step": 352, + "time_per_iteration": 2.7985076904296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216963, + "balance_loss_mlp": 1.16760993, + "epoch": 0.0679107348980377, + "flos": 460666828800.0, + "grad_norm": 0.0659402302829914, + "language_loss": 1.04801619, + "learning_rate": 0.0009962379794577964, + "loss": 1.06018579, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 0.49365234, + "step": 353, + "time_per_iteration": 2.608032703399658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123128, + "balance_loss_mlp": 1.18266606, + "epoch": 0.06810311658330127, + "flos": 635922026496.0, + "grad_norm": 0.051231802586423875, + "language_loss": 0.94352609, + "learning_rate": 0.000996199737996617, + "loss": 0.95583886, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 0.48657227, + "step": 354, + "time_per_iteration": 2.903005599975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227436, + "balance_loss_mlp": 1.17770219, + "epoch": 0.06829549826856483, + "flos": 464679452160.0, + "grad_norm": 0.05676190931504088, + "language_loss": 1.03759205, + "learning_rate": 0.0009961613038936149, + "loss": 1.04986644, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 0.49755859, + "step": 355, + "time_per_iteration": 2.617859125137329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216553, + "balance_loss_mlp": 1.16572189, + "epoch": 0.06848787995382839, + "flos": 634647135744.0, + "grad_norm": 0.04878484453506707, + "language_loss": 0.95482612, + "learning_rate": 0.000996122677163711, + "loss": 0.96699166, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 0.50878906, + "step": 356, + "time_per_iteration": 2.8171308040618896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230039, + "balance_loss_mlp": 1.18037653, + "epoch": 0.06868026163909195, + "flos": 806374268928.0, + "grad_norm": 0.06504242786199886, + "language_loss": 1.01527905, + "learning_rate": 0.000996083857821902, + "loss": 1.02757955, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 0.49682617, + "step": 357, + "time_per_iteration": 3.0562636852264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221322, + "balance_loss_mlp": 1.17237508, + "epoch": 0.06887264332435553, + "flos": 439227505152.0, + "grad_norm": 0.043415107047687695, + "language_loss": 0.99947309, + "learning_rate": 0.0009960448458832588, + "loss": 1.01168633, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 0.48925781, + "step": 358, + "time_per_iteration": 2.6778266429901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224961, + "balance_loss_mlp": 1.17675292, + "epoch": 0.06906502500961909, + "flos": 484767161856.0, + "grad_norm": 0.061398357107108094, + "language_loss": 0.99686754, + "learning_rate": 0.000996005641362927, + "loss": 1.00911713, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 0.48193359, + "step": 359, + "time_per_iteration": 2.5839953422546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218039, + "balance_loss_mlp": 1.16792321, + "epoch": 0.06925740669488265, + "flos": 733611105792.0, + "grad_norm": 0.045504813624839685, + "language_loss": 1.02907789, + "learning_rate": 0.0009959662442761274, + "loss": 1.04125834, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 0.5012207, + "step": 360, + "time_per_iteration": 2.9012227058410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225991, + "balance_loss_mlp": 1.17504108, + "epoch": 0.0694497883801462, + "flos": 552415707648.0, + "grad_norm": 0.05242893208235044, + "language_loss": 0.96392268, + "learning_rate": 0.000995926654638155, + "loss": 0.97618258, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 0.50976562, + "step": 361, + "time_per_iteration": 2.7972850799560547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120421, + "balance_loss_mlp": 1.15323579, + "epoch": 0.06964217006540978, + "flos": 678015111168.0, + "grad_norm": 0.0452718414118582, + "language_loss": 0.98678619, + "learning_rate": 0.00099588687246438, + "loss": 0.99882829, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 0.51025391, + "step": 362, + "time_per_iteration": 2.845742702484131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011953, + "balance_loss_mlp": 1.14241886, + "epoch": 0.06983455175067334, + "flos": 524241082368.0, + "grad_norm": 0.06654716127982052, + "language_loss": 1.06146324, + "learning_rate": 0.0009958468977702471, + "loss": 1.07341623, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 0.52978516, + "step": 363, + "time_per_iteration": 2.5876591205596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.05386722, + "balance_loss_mlp": 5.09527922, + "epoch": 0.0700269334359369, + "flos": 1576787254272.0, + "grad_norm": 0.35536528906135745, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.85121429, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 2.921875, + "step": 364, + "time_per_iteration": 4.7958595752716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183221, + "balance_loss_mlp": 1.12800324, + "epoch": 0.07021931512120046, + "flos": 1013248839168.0, + "grad_norm": 0.06493728064972926, + "language_loss": 0.94085538, + "learning_rate": 0.0009957663708830612, + "loss": 0.95268762, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 0.55273438, + "step": 365, + "time_per_iteration": 3.238919258117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188034, + "balance_loss_mlp": 1.13048029, + "epoch": 0.07041169680646403, + "flos": 822983099904.0, + "grad_norm": 0.06418297657416602, + "language_loss": 0.98210049, + "learning_rate": 0.0009957258187212714, + "loss": 0.99398077, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 0.57470703, + "step": 366, + "time_per_iteration": 3.0337131023406982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0292345, + "balance_loss_mlp": 2.78612089, + "epoch": 0.07060407849172759, + "flos": 1414392938496.0, + "grad_norm": 0.09868001986151984, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.82118309, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 1.375, + "step": 367, + "time_per_iteration": 4.825684070587158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118916, + "balance_loss_mlp": 1.12988925, + "epoch": 0.07079646017699115, + "flos": 512909853696.0, + "grad_norm": 0.06345017711900697, + "language_loss": 0.94456601, + "learning_rate": 0.0009956441370400167, + "loss": 0.95645761, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 0.59179688, + "step": 368, + "time_per_iteration": 2.6685595512390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203671, + "balance_loss_mlp": 1.14411354, + "epoch": 0.07098884186225471, + "flos": 540501548544.0, + "grad_norm": 0.07550644934377632, + "language_loss": 1.00098681, + "learning_rate": 0.0009956030075522636, + "loss": 1.0130235, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 0.59472656, + "step": 369, + "time_per_iteration": 2.7824065685272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185298, + "balance_loss_mlp": 1.12555027, + "epoch": 0.07118122354751828, + "flos": 548682439680.0, + "grad_norm": 0.0634963537383221, + "language_loss": 1.00245738, + "learning_rate": 0.0009955616856543587, + "loss": 1.01431036, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 0.59667969, + "step": 370, + "time_per_iteration": 2.6869115829467773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117739, + "balance_loss_mlp": 1.11649847, + "epoch": 0.07137360523278184, + "flos": 620904517632.0, + "grad_norm": 0.04749901473855408, + "language_loss": 0.92605507, + "learning_rate": 0.0009955201713623448, + "loss": 0.93782902, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 0.60791016, + "step": 371, + "time_per_iteration": 2.7894065380096436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03553003, + "balance_loss_mlp": 3.34700894, + "epoch": 0.0715659869180454, + "flos": 1502672477184.0, + "grad_norm": 0.1539254818196356, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.80225718, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 2.0625, + "step": 372, + "time_per_iteration": 5.025646924972534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188763, + "balance_loss_mlp": 1.12739396, + "epoch": 0.07175836860330896, + "flos": 495493065216.0, + "grad_norm": 0.05697389015463885, + "language_loss": 1.05361807, + "learning_rate": 0.0009954365656605333, + "loss": 1.06550562, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 0.61328125, + "step": 373, + "time_per_iteration": 2.5767741203308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203971, + "balance_loss_mlp": 1.13878703, + "epoch": 0.07195075028857253, + "flos": 785725650432.0, + "grad_norm": 0.0561234241567743, + "language_loss": 0.98981488, + "learning_rate": 0.0009953944742831947, + "loss": 1.00185454, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 0.65185547, + "step": 374, + "time_per_iteration": 3.0126912593841553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209318, + "balance_loss_mlp": 1.14351439, + "epoch": 0.0721431319738361, + "flos": 593107619328.0, + "grad_norm": 0.05197007853134015, + "language_loss": 1.02623391, + "learning_rate": 0.0009953521905766642, + "loss": 1.0383271, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 0.65820312, + "step": 375, + "time_per_iteration": 2.9678027629852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207965, + "balance_loss_mlp": 1.14464104, + "epoch": 0.07233551365909965, + "flos": 548250011136.0, + "grad_norm": 0.05250799377029981, + "language_loss": 1.01212132, + "learning_rate": 0.0009953097145573577, + "loss": 1.02420104, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 0.6328125, + "step": 376, + "time_per_iteration": 2.7048561573028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121329, + "balance_loss_mlp": 1.1502521, + "epoch": 0.07252789534436321, + "flos": 957568780800.0, + "grad_norm": 0.050651846587156886, + "language_loss": 0.98499894, + "learning_rate": 0.000995267046241766, + "loss": 0.99713182, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 0.62988281, + "step": 377, + "time_per_iteration": 3.287705421447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225924, + "balance_loss_mlp": 1.16341114, + "epoch": 0.07272027702962677, + "flos": 507649503744.0, + "grad_norm": 0.05776369312695448, + "language_loss": 0.98701203, + "learning_rate": 0.0009952241856464547, + "loss": 0.99927127, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 0.62451172, + "step": 378, + "time_per_iteration": 2.5897629261016846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220563, + "balance_loss_mlp": 1.16010034, + "epoch": 0.07291265871489035, + "flos": 612412337664.0, + "grad_norm": 0.05450855675542614, + "language_loss": 1.05642247, + "learning_rate": 0.0009951811327880632, + "loss": 1.06862807, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 0.60351562, + "step": 379, + "time_per_iteration": 2.7320594787597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220943, + "balance_loss_mlp": 1.15924072, + "epoch": 0.0731050404001539, + "flos": 495750025728.0, + "grad_norm": 0.04947645913164449, + "language_loss": 0.99005401, + "learning_rate": 0.0009951378876833063, + "loss": 1.00226343, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 0.61669922, + "step": 380, + "time_per_iteration": 2.595810651779175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196634, + "balance_loss_mlp": 1.13798296, + "epoch": 0.07329742208541747, + "flos": 639966956544.0, + "grad_norm": 0.058807068798268386, + "language_loss": 1.05567527, + "learning_rate": 0.0009950944503489736, + "loss": 1.06764162, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 0.5859375, + "step": 381, + "time_per_iteration": 2.733560562133789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197608, + "balance_loss_mlp": 1.13914812, + "epoch": 0.07348980377068103, + "flos": 816346543104.0, + "grad_norm": 0.06747680453051412, + "language_loss": 0.99337935, + "learning_rate": 0.0009950508208019285, + "loss": 1.00535548, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 0.58398438, + "step": 382, + "time_per_iteration": 2.9895970821380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176507, + "balance_loss_mlp": 1.12062192, + "epoch": 0.0736821854559446, + "flos": 508640269824.0, + "grad_norm": 0.05827239016363537, + "language_loss": 1.03707182, + "learning_rate": 0.0009950069990591096, + "loss": 1.04883695, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 0.55908203, + "step": 383, + "time_per_iteration": 2.6856980323791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.05393736, + "balance_loss_mlp": 5.19079447, + "epoch": 0.07387456714120816, + "flos": 1554648629760.0, + "grad_norm": 0.38241300139143997, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.81795102, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 2.03125, + "step": 384, + "time_per_iteration": 4.860661268234253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128436, + "balance_loss_mlp": 1.07369518, + "epoch": 0.07406694882647172, + "flos": 525503490048.0, + "grad_norm": 0.06005395599718801, + "language_loss": 0.96679938, + "learning_rate": 0.0009949187790542777, + "loss": 0.97808379, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 0.54760742, + "step": 385, + "time_per_iteration": 2.7245922088623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146737, + "balance_loss_mlp": 1.09042215, + "epoch": 0.07425933051173528, + "flos": 497738898432.0, + "grad_norm": 0.06780842756482337, + "language_loss": 0.9270733, + "learning_rate": 0.0009948743808265148, + "loss": 0.93854064, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 0.56298828, + "step": 386, + "time_per_iteration": 2.6745331287384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187036, + "balance_loss_mlp": 1.13334417, + "epoch": 0.07445171219699885, + "flos": 505003175424.0, + "grad_norm": 0.04295711334598506, + "language_loss": 1.02854586, + "learning_rate": 0.0009948297904714782, + "loss": 1.04041624, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 0.53759766, + "step": 387, + "time_per_iteration": 2.681718111038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202671, + "balance_loss_mlp": 1.15167296, + "epoch": 0.07464409388226241, + "flos": 553977294336.0, + "grad_norm": 0.05564614333293379, + "language_loss": 0.94366896, + "learning_rate": 0.0009947850080064796, + "loss": 0.95569569, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 0.51049805, + "step": 388, + "time_per_iteration": 2.788663148880005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216483, + "balance_loss_mlp": 1.16817975, + "epoch": 0.07483647556752597, + "flos": 776862710784.0, + "grad_norm": 0.07112384111458, + "language_loss": 0.99713415, + "learning_rate": 0.0009947400334489047, + "loss": 1.00929892, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 0.48291016, + "step": 389, + "time_per_iteration": 2.9905049800872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227498, + "balance_loss_mlp": 1.17926562, + "epoch": 0.07502885725278953, + "flos": 612540817920.0, + "grad_norm": 0.06900212518032732, + "language_loss": 0.91264081, + "learning_rate": 0.0009946948668162145, + "loss": 0.92491579, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 0.48168945, + "step": 390, + "time_per_iteration": 2.767531394958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012247, + "balance_loss_mlp": 1.17277205, + "epoch": 0.0752212389380531, + "flos": 688629786624.0, + "grad_norm": 0.052104168644034804, + "language_loss": 0.95126128, + "learning_rate": 0.0009946495081259441, + "loss": 0.96350825, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 0.52001953, + "step": 391, + "time_per_iteration": 2.816908597946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192311, + "balance_loss_mlp": 1.14057434, + "epoch": 0.07541362062331666, + "flos": 765699609600.0, + "grad_norm": 0.051504782312047234, + "language_loss": 0.99421549, + "learning_rate": 0.0009946039573957035, + "loss": 1.00613856, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 0.51782227, + "step": 392, + "time_per_iteration": 2.9265222549438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116666, + "balance_loss_mlp": 1.11478019, + "epoch": 0.07560600230858022, + "flos": 588749202432.0, + "grad_norm": 0.055053573084277836, + "language_loss": 0.95799196, + "learning_rate": 0.000994558214643177, + "loss": 0.96965855, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 0.51928711, + "step": 393, + "time_per_iteration": 2.766477584838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165121, + "balance_loss_mlp": 1.11352682, + "epoch": 0.07579838399384378, + "flos": 749834496000.0, + "grad_norm": 0.05925711706254076, + "language_loss": 0.97585773, + "learning_rate": 0.000994512279886123, + "loss": 0.98750889, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 0.51660156, + "step": 394, + "time_per_iteration": 3.0709142684936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143606, + "balance_loss_mlp": 1.09191656, + "epoch": 0.07599076567910736, + "flos": 523457717760.0, + "grad_norm": 0.04191079383555719, + "language_loss": 0.97239089, + "learning_rate": 0.0009944661531423758, + "loss": 0.98382699, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 0.51757812, + "step": 395, + "time_per_iteration": 2.7044599056243896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134219, + "balance_loss_mlp": 1.08338809, + "epoch": 0.07618314736437092, + "flos": 551086488576.0, + "grad_norm": 0.05545815376917658, + "language_loss": 0.96390671, + "learning_rate": 0.000994419834429843, + "loss": 0.97524893, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 0.50854492, + "step": 396, + "time_per_iteration": 2.6767609119415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135922, + "balance_loss_mlp": 1.08525789, + "epoch": 0.07637552904963447, + "flos": 698206708224.0, + "grad_norm": 0.05307630449121137, + "language_loss": 1.01208472, + "learning_rate": 0.0009943733237665069, + "loss": 1.02344394, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 0.50683594, + "step": 397, + "time_per_iteration": 2.819148302078247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124426, + "balance_loss_mlp": 1.07502615, + "epoch": 0.07656791073489803, + "flos": 579379682304.0, + "grad_norm": 0.049844903289807924, + "language_loss": 0.99488425, + "learning_rate": 0.0009943266211704248, + "loss": 1.00612843, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 0.49389648, + "step": 398, + "time_per_iteration": 2.9555482864379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125466, + "balance_loss_mlp": 1.07675719, + "epoch": 0.0767602924201616, + "flos": 417145780224.0, + "grad_norm": 0.05620775813161816, + "language_loss": 1.01430082, + "learning_rate": 0.000994279726659728, + "loss": 1.02555549, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 0.48706055, + "step": 399, + "time_per_iteration": 2.5138003826141357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127137, + "balance_loss_mlp": 1.07761765, + "epoch": 0.07695267410542517, + "flos": 482914109952.0, + "grad_norm": 0.05674792404596756, + "language_loss": 0.99883693, + "learning_rate": 0.0009942326402526231, + "loss": 1.01010823, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 0.49511719, + "step": 400, + "time_per_iteration": 2.5245604515075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127721, + "balance_loss_mlp": 1.07793891, + "epoch": 0.07714505579068873, + "flos": 530998778880.0, + "grad_norm": 0.036646942736225624, + "language_loss": 0.9767518, + "learning_rate": 0.0009941853619673902, + "loss": 0.98802906, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 0.49804688, + "step": 401, + "time_per_iteration": 2.644771099090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123434, + "balance_loss_mlp": 1.07451057, + "epoch": 0.07733743747595229, + "flos": 804995490816.0, + "grad_norm": 0.057554732491620374, + "language_loss": 1.01884329, + "learning_rate": 0.0009941378918223844, + "loss": 1.0300777, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 0.48876953, + "step": 402, + "time_per_iteration": 3.051617383956909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112416, + "balance_loss_mlp": 1.07618988, + "epoch": 0.07752981916121585, + "flos": 622476016128.0, + "grad_norm": 0.04510164642433069, + "language_loss": 0.94372368, + "learning_rate": 0.0009940902298360354, + "loss": 0.95496523, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 0.47924805, + "step": 403, + "time_per_iteration": 2.7302582263946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118279, + "balance_loss_mlp": 1.0687592, + "epoch": 0.07772220084647942, + "flos": 728276603904.0, + "grad_norm": 0.062376946911402976, + "language_loss": 1.04687834, + "learning_rate": 0.0009940423760268473, + "loss": 1.05806112, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 0.49536133, + "step": 404, + "time_per_iteration": 2.856938600540161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118682, + "balance_loss_mlp": 1.07009196, + "epoch": 0.07791458253174298, + "flos": 555412972032.0, + "grad_norm": 0.046838991637930295, + "language_loss": 0.97888398, + "learning_rate": 0.0009939943304133982, + "loss": 0.99007082, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 0.48608398, + "step": 405, + "time_per_iteration": 2.6161091327667236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115161, + "balance_loss_mlp": 1.06881261, + "epoch": 0.07810696421700654, + "flos": 553181819904.0, + "grad_norm": 0.04496148345425058, + "language_loss": 1.04081011, + "learning_rate": 0.0009939460930143416, + "loss": 1.0519619, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 0.46337891, + "step": 406, + "time_per_iteration": 2.6310677528381348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119218, + "balance_loss_mlp": 1.07332289, + "epoch": 0.0782993459022701, + "flos": 650633389056.0, + "grad_norm": 0.037201804651944344, + "language_loss": 0.98071587, + "learning_rate": 0.0009938976638484043, + "loss": 0.99190807, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 0.45874023, + "step": 407, + "time_per_iteration": 2.8977036476135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112887, + "balance_loss_mlp": 1.06844616, + "epoch": 0.07849172758753367, + "flos": 496172542464.0, + "grad_norm": 0.04629061554837057, + "language_loss": 0.97991359, + "learning_rate": 0.0009938490429343887, + "loss": 0.99104249, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 0.44458008, + "step": 408, + "time_per_iteration": 2.562168836593628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111855, + "balance_loss_mlp": 1.07315516, + "epoch": 0.07868410927279723, + "flos": 577971542016.0, + "grad_norm": 0.04004461216150975, + "language_loss": 0.97974342, + "learning_rate": 0.0009938002302911709, + "loss": 0.99092889, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 0.4543457, + "step": 409, + "time_per_iteration": 2.738518238067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123547, + "balance_loss_mlp": 1.07915401, + "epoch": 0.07887649095806079, + "flos": 522970960896.0, + "grad_norm": 0.07048914756312923, + "language_loss": 1.00401747, + "learning_rate": 0.0009937512259377015, + "loss": 1.01525307, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 0.44384766, + "step": 410, + "time_per_iteration": 2.670149564743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110678, + "balance_loss_mlp": 1.0668565, + "epoch": 0.07906887264332435, + "flos": 557253540864.0, + "grad_norm": 0.049646402233970426, + "language_loss": 0.99659574, + "learning_rate": 0.000993702029893006, + "loss": 1.00770259, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 0.4387207, + "step": 411, + "time_per_iteration": 2.7853777408599854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118473, + "balance_loss_mlp": 1.07200527, + "epoch": 0.07926125432858792, + "flos": 821984993280.0, + "grad_norm": 0.04880092350488667, + "language_loss": 0.98862529, + "learning_rate": 0.0009936526421761838, + "loss": 0.99981004, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 0.46435547, + "step": 412, + "time_per_iteration": 3.030674457550049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114951, + "balance_loss_mlp": 1.07043815, + "epoch": 0.07945363601385148, + "flos": 562336224768.0, + "grad_norm": 0.04383720282943398, + "language_loss": 1.01490402, + "learning_rate": 0.000993603062806409, + "loss": 1.02605367, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 0.4453125, + "step": 413, + "time_per_iteration": 2.7101500034332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109682, + "balance_loss_mlp": 1.0637151, + "epoch": 0.07964601769911504, + "flos": 517868826624.0, + "grad_norm": 0.046157231925668944, + "language_loss": 1.04664707, + "learning_rate": 0.0009935532918029298, + "loss": 1.05774391, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 0.45947266, + "step": 414, + "time_per_iteration": 2.593390941619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118947, + "balance_loss_mlp": 1.07278943, + "epoch": 0.0798383993843786, + "flos": 539224086528.0, + "grad_norm": 0.058468816323775735, + "language_loss": 0.97956645, + "learning_rate": 0.0009935033291850694, + "loss": 0.99075592, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 0.46166992, + "step": 415, + "time_per_iteration": 2.6693851947784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111466, + "balance_loss_mlp": 1.0654031, + "epoch": 0.08003078106964218, + "flos": 485145262080.0, + "grad_norm": 0.061030352209764355, + "language_loss": 1.00225627, + "learning_rate": 0.0009934531749722247, + "loss": 1.01337099, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 0.46044922, + "step": 416, + "time_per_iteration": 2.578746795654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119366, + "balance_loss_mlp": 1.07337523, + "epoch": 0.08022316275490574, + "flos": 518254267392.0, + "grad_norm": 0.05071064772829009, + "language_loss": 0.98778659, + "learning_rate": 0.0009934028291838672, + "loss": 0.99898028, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 0.45996094, + "step": 417, + "time_per_iteration": 2.7096333503723145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106202, + "balance_loss_mlp": 1.06166553, + "epoch": 0.0804155444401693, + "flos": 494012971008.0, + "grad_norm": 0.045680808340910005, + "language_loss": 0.94326293, + "learning_rate": 0.0009933522918395433, + "loss": 0.95432496, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 0.44555664, + "step": 418, + "time_per_iteration": 2.644414186477661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04959176, + "balance_loss_mlp": 4.71808767, + "epoch": 0.08060792612543285, + "flos": 1581422455296.0, + "grad_norm": 0.3214703434406663, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.83210278, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 2.40625, + "step": 419, + "time_per_iteration": 4.868964195251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115626, + "balance_loss_mlp": 1.07108891, + "epoch": 0.08080030781069643, + "flos": 525343076352.0, + "grad_norm": 0.08060687528614664, + "language_loss": 1.13036489, + "learning_rate": 0.000993250642561551, + "loss": 1.14152122, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 0.4453125, + "step": 420, + "time_per_iteration": 2.632162094116211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121548, + "balance_loss_mlp": 1.07538986, + "epoch": 0.08099268949595999, + "flos": 546718159872.0, + "grad_norm": 0.08633853635548816, + "language_loss": 0.9784801, + "learning_rate": 0.0009931995306673466, + "loss": 0.98969555, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 0.46118164, + "step": 421, + "time_per_iteration": 2.7046737670898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134412, + "balance_loss_mlp": 1.08815861, + "epoch": 0.08118507118122355, + "flos": 510367412736.0, + "grad_norm": 0.038770411105538145, + "language_loss": 1.03907061, + "learning_rate": 0.000993148227296103, + "loss": 1.05041468, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 0.4621582, + "step": 422, + "time_per_iteration": 2.669496536254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133253, + "balance_loss_mlp": 1.08707166, + "epoch": 0.08137745286648711, + "flos": 720671302656.0, + "grad_norm": 0.053095831055692516, + "language_loss": 0.9112367, + "learning_rate": 0.000993096732467738, + "loss": 0.92256927, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 0.46166992, + "step": 423, + "time_per_iteration": 2.961660861968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150855, + "balance_loss_mlp": 1.10498345, + "epoch": 0.08156983455175067, + "flos": 679613773824.0, + "grad_norm": 0.08137036582560589, + "language_loss": 0.99760056, + "learning_rate": 0.0009930450462022435, + "loss": 1.00910902, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 0.45874023, + "step": 424, + "time_per_iteration": 2.7952311038970947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03600409, + "balance_loss_mlp": 3.48901963, + "epoch": 0.08176221623701424, + "flos": 1453377157632.0, + "grad_norm": 0.18349806711668631, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.82790214, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 1.1171875, + "step": 425, + "time_per_iteration": 4.8854875564575195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159177, + "balance_loss_mlp": 1.11344862, + "epoch": 0.0819545979222778, + "flos": 1556602292736.0, + "grad_norm": 0.06491953183218531, + "language_loss": 0.9776966, + "learning_rate": 0.0009929410994402065, + "loss": 0.98928833, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 0.45703125, + "step": 426, + "time_per_iteration": 4.275091886520386 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169515, + "balance_loss_mlp": 1.12223697, + "epoch": 0.08214697960754136, + "flos": 512724473856.0, + "grad_norm": 0.07437504582125473, + "language_loss": 1.02033544, + "learning_rate": 0.0009928888389840196, + "loss": 1.03203058, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 0.47241211, + "step": 427, + "time_per_iteration": 2.7036454677581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145234, + "balance_loss_mlp": 1.09941018, + "epoch": 0.08233936129280492, + "flos": 595124029440.0, + "grad_norm": 0.05964472172349544, + "language_loss": 1.03706717, + "learning_rate": 0.0009928363871714147, + "loss": 1.04851961, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 0.45849609, + "step": 428, + "time_per_iteration": 2.6669116020202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115004, + "balance_loss_mlp": 1.10254741, + "epoch": 0.08253174297806849, + "flos": 572039055360.0, + "grad_norm": 0.07530468467255677, + "language_loss": 0.97491598, + "learning_rate": 0.0009927837440227556, + "loss": 0.98641634, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 0.47485352, + "step": 429, + "time_per_iteration": 2.8463807106018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120703, + "balance_loss_mlp": 1.07588065, + "epoch": 0.08272412466333205, + "flos": 623380147200.0, + "grad_norm": 0.04140843961960757, + "language_loss": 0.92054397, + "learning_rate": 0.0009927309095584798, + "loss": 0.93175101, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 0.44824219, + "step": 430, + "time_per_iteration": 2.9767606258392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116415, + "balance_loss_mlp": 1.07278419, + "epoch": 0.08291650634859561, + "flos": 513994595328.0, + "grad_norm": 0.04726827868993605, + "language_loss": 1.04780793, + "learning_rate": 0.0009926778837991, + "loss": 1.05897212, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 0.43652344, + "step": 431, + "time_per_iteration": 2.5883395671844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112082, + "balance_loss_mlp": 1.06749809, + "epoch": 0.08310888803385917, + "flos": 667365931008.0, + "grad_norm": 0.049074519776006666, + "language_loss": 1.0243988, + "learning_rate": 0.000992624666765202, + "loss": 1.0355196, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 0.44604492, + "step": 432, + "time_per_iteration": 2.7943906784057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115829, + "balance_loss_mlp": 1.07200766, + "epoch": 0.08330126971912274, + "flos": 583293560832.0, + "grad_norm": 0.04417562175093811, + "language_loss": 1.00109053, + "learning_rate": 0.000992571258477447, + "loss": 1.01224887, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 0.43823242, + "step": 433, + "time_per_iteration": 2.836127758026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116804, + "balance_loss_mlp": 1.07260084, + "epoch": 0.0834936514043863, + "flos": 561350227968.0, + "grad_norm": 0.04319706549365549, + "language_loss": 0.93695247, + "learning_rate": 0.0009925176589565695, + "loss": 0.94812053, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 0.44213867, + "step": 434, + "time_per_iteration": 2.8157734870910645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131219, + "balance_loss_mlp": 1.08756483, + "epoch": 0.08368603308964986, + "flos": 494519551488.0, + "grad_norm": 0.04172416189060796, + "language_loss": 1.04242814, + "learning_rate": 0.0009924638682233791, + "loss": 1.05374026, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 0.43652344, + "step": 435, + "time_per_iteration": 2.5577316284179688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02503783, + "balance_loss_mlp": 2.3527205, + "epoch": 0.08387841477491342, + "flos": 1389017714688.0, + "grad_norm": 0.06968128915635463, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.82068378, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 1.5078125, + "step": 436, + "time_per_iteration": 4.594938516616821 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129118, + "balance_loss_mlp": 1.08348453, + "epoch": 0.084070796460177, + "flos": 798984082944.0, + "grad_norm": 0.0610737753852808, + "language_loss": 0.94037408, + "learning_rate": 0.0009923557132036668, + "loss": 0.95166528, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 0.45629883, + "step": 437, + "time_per_iteration": 3.0716845989227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118795, + "balance_loss_mlp": 1.07430601, + "epoch": 0.08426317814544056, + "flos": 558963431424.0, + "grad_norm": 0.04662895628051273, + "language_loss": 0.97730738, + "learning_rate": 0.0009923013489591345, + "loss": 0.98849535, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 0.4453125, + "step": 438, + "time_per_iteration": 2.726792812347412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110576, + "balance_loss_mlp": 1.06685066, + "epoch": 0.08445555983070412, + "flos": 810421396992.0, + "grad_norm": 0.04626496214247174, + "language_loss": 0.96079296, + "learning_rate": 0.0009922467935862681, + "loss": 0.97189873, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 0.4375, + "step": 439, + "time_per_iteration": 3.0908052921295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119416, + "balance_loss_mlp": 1.07273376, + "epoch": 0.08464794151596768, + "flos": 510184604160.0, + "grad_norm": 0.048922855388473234, + "language_loss": 0.99432743, + "learning_rate": 0.0009921920471062478, + "loss": 1.00552154, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 0.46655273, + "step": 440, + "time_per_iteration": 2.622451066970825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117818, + "balance_loss_mlp": 1.07342434, + "epoch": 0.08484032320123125, + "flos": 556413649920.0, + "grad_norm": 0.07502031783190574, + "language_loss": 0.9797709, + "learning_rate": 0.0009921371095403281, + "loss": 0.99094903, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 0.44433594, + "step": 441, + "time_per_iteration": 2.705152750015259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011116, + "balance_loss_mlp": 1.06863689, + "epoch": 0.08503270488649481, + "flos": 527354343936.0, + "grad_norm": 0.04941418140969711, + "language_loss": 1.00754833, + "learning_rate": 0.0009920819809098379, + "loss": 1.01866436, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 0.42993164, + "step": 442, + "time_per_iteration": 2.5887317657470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119689, + "balance_loss_mlp": 1.07715499, + "epoch": 0.08522508657175837, + "flos": 614267960832.0, + "grad_norm": 0.06964486535702215, + "language_loss": 0.96275294, + "learning_rate": 0.0009920266612361798, + "loss": 0.97394979, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 0.42578125, + "step": 443, + "time_per_iteration": 2.745222330093384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107696, + "balance_loss_mlp": 1.06587708, + "epoch": 0.08541746825702193, + "flos": 619792611840.0, + "grad_norm": 0.05163049083883061, + "language_loss": 0.96866751, + "learning_rate": 0.0009919711505408308, + "loss": 0.97974443, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 0.41821289, + "step": 444, + "time_per_iteration": 2.780095100402832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106314, + "balance_loss_mlp": 1.0654248, + "epoch": 0.08560984994228549, + "flos": 482914109952.0, + "grad_norm": 0.054748359311131624, + "language_loss": 0.94535226, + "learning_rate": 0.000991915448845342, + "loss": 0.95641541, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 0.40893555, + "step": 445, + "time_per_iteration": 2.5229337215423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110283, + "balance_loss_mlp": 1.06279922, + "epoch": 0.08580223162754906, + "flos": 517152273408.0, + "grad_norm": 0.0575820537988498, + "language_loss": 1.03181779, + "learning_rate": 0.000991859556171339, + "loss": 1.04284596, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 0.40039062, + "step": 446, + "time_per_iteration": 2.5957653522491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105649, + "balance_loss_mlp": 1.06497526, + "epoch": 0.08599461331281262, + "flos": 531475623936.0, + "grad_norm": 0.04289742759235468, + "language_loss": 1.05262291, + "learning_rate": 0.000991803472540521, + "loss": 1.06367946, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 0.40673828, + "step": 447, + "time_per_iteration": 2.6220486164093018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105752, + "balance_loss_mlp": 1.06550729, + "epoch": 0.08618699499807618, + "flos": 790299182592.0, + "grad_norm": 0.04330621576945977, + "language_loss": 1.00096428, + "learning_rate": 0.0009917471979746615, + "loss": 1.01202178, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 0.40234375, + "step": 448, + "time_per_iteration": 2.9767467975616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114799, + "balance_loss_mlp": 1.07379115, + "epoch": 0.08637937668333974, + "flos": 565988000256.0, + "grad_norm": 0.03609686036920932, + "language_loss": 0.98485255, + "learning_rate": 0.0009916907324956086, + "loss": 0.99600053, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 0.41015625, + "step": 449, + "time_per_iteration": 2.701143980026245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117003, + "balance_loss_mlp": 1.07480288, + "epoch": 0.08657175836860331, + "flos": 445167332352.0, + "grad_norm": 0.04834207301210501, + "language_loss": 0.95441091, + "learning_rate": 0.0009916340761252837, + "loss": 0.965581, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 0.42211914, + "step": 450, + "time_per_iteration": 2.6036393642425537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129901, + "balance_loss_mlp": 1.08910751, + "epoch": 0.08676414005386687, + "flos": 844148210688.0, + "grad_norm": 0.07269963588094165, + "language_loss": 0.9243114, + "learning_rate": 0.0009915772288856832, + "loss": 0.93561041, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 0.40820312, + "step": 451, + "time_per_iteration": 3.05719256401062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125865, + "balance_loss_mlp": 1.08359361, + "epoch": 0.08695652173913043, + "flos": 603292437504.0, + "grad_norm": 0.05954656443346509, + "language_loss": 0.93746579, + "learning_rate": 0.000991520190798877, + "loss": 0.94872439, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 0.42285156, + "step": 452, + "time_per_iteration": 2.804128885269165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120027, + "balance_loss_mlp": 1.07723105, + "epoch": 0.08714890342439399, + "flos": 730737552384.0, + "grad_norm": 0.05604676795867647, + "language_loss": 1.04000187, + "learning_rate": 0.0009914629618870089, + "loss": 1.05120206, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 0.42797852, + "step": 453, + "time_per_iteration": 2.8959083557128906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02032313, + "balance_loss_mlp": 1.86675501, + "epoch": 0.08734128510965757, + "flos": 1482303214080.0, + "grad_norm": 0.06678910630402063, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.80708182, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 1.65625, + "step": 454, + "time_per_iteration": 4.753306865692139 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01974747, + "balance_loss_mlp": 1.80537415, + "epoch": 0.08753366679492113, + "flos": 1523022289920.0, + "grad_norm": 0.06350102966569023, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.83402705, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 1.6953125, + "step": 455, + "time_per_iteration": 4.909627914428711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100081, + "balance_loss_mlp": 1.05778539, + "epoch": 0.08772604848018468, + "flos": 721252035072.0, + "grad_norm": 0.07384563339861851, + "language_loss": 0.95938599, + "learning_rate": 0.0009912901304235883, + "loss": 0.97038674, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 0.42333984, + "step": 456, + "time_per_iteration": 3.0303096771240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093114, + "balance_loss_mlp": 1.05112898, + "epoch": 0.08791843016544824, + "flos": 708233310720.0, + "grad_norm": 0.061767025741825826, + "language_loss": 0.93898749, + "learning_rate": 0.000991232138434397, + "loss": 0.94991863, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 0.41992188, + "step": 457, + "time_per_iteration": 2.834221601486206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089137, + "balance_loss_mlp": 1.04824805, + "epoch": 0.08811081185071182, + "flos": 473043151872.0, + "grad_norm": 0.05183647995223567, + "language_loss": 1.00765896, + "learning_rate": 0.000991173955731976, + "loss": 1.0185504, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 0.40869141, + "step": 458, + "time_per_iteration": 2.628783702850342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098492, + "balance_loss_mlp": 1.05569601, + "epoch": 0.08830319353597538, + "flos": 684980209152.0, + "grad_norm": 0.052575936673692925, + "language_loss": 1.04489028, + "learning_rate": 0.0009911155823389137, + "loss": 1.0558753, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 0.42797852, + "step": 459, + "time_per_iteration": 2.964416742324829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106983, + "balance_loss_mlp": 1.06523609, + "epoch": 0.08849557522123894, + "flos": 573509237760.0, + "grad_norm": 0.05270293395412616, + "language_loss": 1.00385904, + "learning_rate": 0.000991057018277873, + "loss": 1.01492882, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 0.41748047, + "step": 460, + "time_per_iteration": 2.6944808959960938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104651, + "balance_loss_mlp": 1.06245136, + "epoch": 0.0886879569065025, + "flos": 564567376896.0, + "grad_norm": 0.04953210926048159, + "language_loss": 1.01399374, + "learning_rate": 0.0009909982635715898, + "loss": 1.02504039, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 0.42236328, + "step": 461, + "time_per_iteration": 2.6137924194335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096015, + "balance_loss_mlp": 1.05374336, + "epoch": 0.08888033859176607, + "flos": 563877987840.0, + "grad_norm": 0.050729417377465176, + "language_loss": 1.00123549, + "learning_rate": 0.0009909393182428751, + "loss": 1.01219559, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 0.42285156, + "step": 462, + "time_per_iteration": 2.6657960414886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109539, + "balance_loss_mlp": 1.06891286, + "epoch": 0.08907272027702963, + "flos": 465761622528.0, + "grad_norm": 0.043715633324142876, + "language_loss": 0.94138575, + "learning_rate": 0.000990880182314614, + "loss": 0.95248115, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 0.40625, + "step": 463, + "time_per_iteration": 2.733408212661743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101315, + "balance_loss_mlp": 1.06121325, + "epoch": 0.08926510196229319, + "flos": 681528494592.0, + "grad_norm": 0.051961844945365605, + "language_loss": 0.94176865, + "learning_rate": 0.0009908208558097643, + "loss": 0.9527818, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 0.40087891, + "step": 464, + "time_per_iteration": 2.9006474018096924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105621, + "balance_loss_mlp": 1.06508923, + "epoch": 0.08945748364755675, + "flos": 596692956672.0, + "grad_norm": 0.04470923680131565, + "language_loss": 0.9716863, + "learning_rate": 0.000990761338751359, + "loss": 0.98274255, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 0.40527344, + "step": 465, + "time_per_iteration": 2.775830030441284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01410893, + "balance_loss_mlp": 1.25296497, + "epoch": 0.08964986533282032, + "flos": 1585931747328.0, + "grad_norm": 0.0425617539044403, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.75070524, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 1.578125, + "step": 466, + "time_per_iteration": 5.023500919342041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100083, + "balance_loss_mlp": 1.05869305, + "epoch": 0.08984224701808388, + "flos": 533523967488.0, + "grad_norm": 0.04007163966797277, + "language_loss": 0.9983623, + "learning_rate": 0.0009906417330663815, + "loss": 1.00936306, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 0.41381836, + "step": 467, + "time_per_iteration": 2.6194305419921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099405, + "balance_loss_mlp": 1.05889773, + "epoch": 0.09003462870334744, + "flos": 478931222016.0, + "grad_norm": 0.03985353179312445, + "language_loss": 0.96447593, + "learning_rate": 0.0009905816444862442, + "loss": 0.97546995, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 0.4050293, + "step": 468, + "time_per_iteration": 2.623267889022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107145, + "balance_loss_mlp": 1.06568456, + "epoch": 0.090227010388611, + "flos": 653625510912.0, + "grad_norm": 0.038840192804800056, + "language_loss": 0.93513083, + "learning_rate": 0.0009905213654454216, + "loss": 0.94620228, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 0.41455078, + "step": 469, + "time_per_iteration": 2.9024641513824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105337, + "balance_loss_mlp": 1.06466317, + "epoch": 0.09041939207387456, + "flos": 618186608640.0, + "grad_norm": 0.04985478927164425, + "language_loss": 1.01848495, + "learning_rate": 0.0009904608959673158, + "loss": 1.02953827, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 0.40649414, + "step": 470, + "time_per_iteration": 2.7711682319641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097659, + "balance_loss_mlp": 1.0588448, + "epoch": 0.09061177375913813, + "flos": 454368724992.0, + "grad_norm": 0.04989175862356038, + "language_loss": 1.02851224, + "learning_rate": 0.000990400236075403, + "loss": 1.03948903, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 0.38793945, + "step": 471, + "time_per_iteration": 2.536189317703247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109095, + "balance_loss_mlp": 1.05113411, + "epoch": 0.0908041554444017, + "flos": 544247299584.0, + "grad_norm": 0.03738902964718639, + "language_loss": 0.98994756, + "learning_rate": 0.0009903393857932338, + "loss": 1.000857, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 0.39794922, + "step": 472, + "time_per_iteration": 2.6588857173919678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097802, + "balance_loss_mlp": 1.05908275, + "epoch": 0.09099653712966525, + "flos": 564335009280.0, + "grad_norm": 0.045733529486957185, + "language_loss": 0.97091877, + "learning_rate": 0.0009902783451444317, + "loss": 0.98189688, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 0.38720703, + "step": 473, + "time_per_iteration": 2.6981122493743896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091239, + "balance_loss_mlp": 1.05406976, + "epoch": 0.09118891881492881, + "flos": 474540498432.0, + "grad_norm": 0.04942472768420212, + "language_loss": 1.00819659, + "learning_rate": 0.0009902171141526956, + "loss": 1.01910901, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 0.37158203, + "step": 474, + "time_per_iteration": 2.527256727218628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099497, + "balance_loss_mlp": 1.06225586, + "epoch": 0.09138130050019239, + "flos": 545860643328.0, + "grad_norm": 0.04275448033987936, + "language_loss": 0.88210893, + "learning_rate": 0.000990155692841797, + "loss": 0.8931039, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 0.37231445, + "step": 475, + "time_per_iteration": 2.989063262939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097088, + "balance_loss_mlp": 1.06084871, + "epoch": 0.09157368218545595, + "flos": 732711744000.0, + "grad_norm": 0.04412440376655801, + "language_loss": 1.00229144, + "learning_rate": 0.0009900940812355818, + "loss": 1.01326227, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 0.36254883, + "step": 476, + "time_per_iteration": 2.8778445720672607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105736, + "balance_loss_mlp": 1.07011676, + "epoch": 0.0917660638707195, + "flos": 610981802496.0, + "grad_norm": 0.06417087981964828, + "language_loss": 0.97168529, + "learning_rate": 0.00099003227935797, + "loss": 0.98274267, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 0.35620117, + "step": 477, + "time_per_iteration": 2.708608627319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101416, + "balance_loss_mlp": 1.06369829, + "epoch": 0.09195844555598306, + "flos": 655851893760.0, + "grad_norm": 0.06707216335576115, + "language_loss": 1.01291215, + "learning_rate": 0.000989970287232955, + "loss": 1.02392626, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 0.37695312, + "step": 478, + "time_per_iteration": 2.783325672149658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090795, + "balance_loss_mlp": 1.05431736, + "epoch": 0.09215082724124664, + "flos": 476578930176.0, + "grad_norm": 0.05564878549890474, + "language_loss": 0.9726451, + "learning_rate": 0.0009899081048846043, + "loss": 0.98355305, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 0.36474609, + "step": 479, + "time_per_iteration": 2.6017916202545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097049, + "balance_loss_mlp": 1.05964088, + "epoch": 0.0923432089265102, + "flos": 524305322496.0, + "grad_norm": 0.06044394784495309, + "language_loss": 1.03484094, + "learning_rate": 0.0009898457323370593, + "loss": 1.04581141, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 0.37402344, + "step": 480, + "time_per_iteration": 2.575676918029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091173, + "balance_loss_mlp": 1.0533123, + "epoch": 0.09253559061177376, + "flos": 545569178112.0, + "grad_norm": 0.05778783373137127, + "language_loss": 0.99753714, + "learning_rate": 0.000989783169614535, + "loss": 1.00844884, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 0.37817383, + "step": 481, + "time_per_iteration": 2.646942615509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283887, + "balance_loss_mlp": 1.15876544, + "epoch": 0.09272797229703732, + "flos": 1538042370048.0, + "grad_norm": 0.01956789957612316, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.80036646, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 1.25, + "step": 482, + "time_per_iteration": 4.860741376876831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098828, + "balance_loss_mlp": 1.06158745, + "epoch": 0.09292035398230089, + "flos": 689813273088.0, + "grad_norm": 0.06801501049369231, + "language_loss": 0.97102278, + "learning_rate": 0.000989657473741779, + "loss": 0.98201108, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 0.37231445, + "step": 483, + "time_per_iteration": 2.819138526916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095911, + "balance_loss_mlp": 1.05979109, + "epoch": 0.09311273566756445, + "flos": 509749604352.0, + "grad_norm": 0.038333848574242754, + "language_loss": 0.98462784, + "learning_rate": 0.0009895943406403465, + "loss": 0.99558693, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 0.36132812, + "step": 484, + "time_per_iteration": 2.7088170051574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103881, + "balance_loss_mlp": 1.06854701, + "epoch": 0.09330511735282801, + "flos": 659404924416.0, + "grad_norm": 0.05828015098596693, + "language_loss": 0.92231822, + "learning_rate": 0.0009895310174615338, + "loss": 0.933357, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 0.35351562, + "step": 485, + "time_per_iteration": 2.760511636734009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265799, + "balance_loss_mlp": 1.14983261, + "epoch": 0.09349749903809157, + "flos": 1452845984256.0, + "grad_norm": 0.018538812380254305, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.76984316, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 1.15625, + "step": 486, + "time_per_iteration": 4.656491994857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105382, + "balance_loss_mlp": 1.0699296, + "epoch": 0.09368988072335514, + "flos": 520870860288.0, + "grad_norm": 0.04721263549483299, + "language_loss": 0.95839012, + "learning_rate": 0.0009894038009701782, + "loss": 0.96944392, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 0.35498047, + "step": 487, + "time_per_iteration": 2.6169629096984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103683, + "balance_loss_mlp": 1.06868315, + "epoch": 0.0938822624086187, + "flos": 497751381504.0, + "grad_norm": 0.05102581257360949, + "language_loss": 0.98848963, + "learning_rate": 0.0009893399077070253, + "loss": 0.99952644, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 0.3503418, + "step": 488, + "time_per_iteration": 2.5845744609832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107076, + "balance_loss_mlp": 1.07193291, + "epoch": 0.09407464409388226, + "flos": 533202766848.0, + "grad_norm": 0.05918319403016569, + "language_loss": 0.92944884, + "learning_rate": 0.0009892758244652718, + "loss": 0.94051951, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 0.35180664, + "step": 489, + "time_per_iteration": 2.660200357437134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091731, + "balance_loss_mlp": 1.05801892, + "epoch": 0.09426702577914582, + "flos": 586006700544.0, + "grad_norm": 0.041386989889926534, + "language_loss": 1.00010514, + "learning_rate": 0.0009892115512697968, + "loss": 1.01102245, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 0.33740234, + "step": 490, + "time_per_iteration": 2.6571907997131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108998, + "balance_loss_mlp": 1.05631554, + "epoch": 0.0944594074644094, + "flos": 503357524992.0, + "grad_norm": 0.04182034264497562, + "language_loss": 1.00108159, + "learning_rate": 0.0009891470881455537, + "loss": 1.01198137, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 0.33666992, + "step": 491, + "time_per_iteration": 2.746169328689575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108736, + "balance_loss_mlp": 1.05319476, + "epoch": 0.09465178914967295, + "flos": 571021125120.0, + "grad_norm": 0.0458284589248403, + "language_loss": 0.98654628, + "learning_rate": 0.0009890824351175692, + "loss": 0.99741989, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 0.34204102, + "step": 492, + "time_per_iteration": 2.665170431137085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090612, + "balance_loss_mlp": 1.05654192, + "epoch": 0.09484417083493651, + "flos": 549361916928.0, + "grad_norm": 0.041327442652051224, + "language_loss": 1.0219661, + "learning_rate": 0.0009890175922109435, + "loss": 1.0328722, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 0.34082031, + "step": 493, + "time_per_iteration": 2.6482973098754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010971, + "balance_loss_mlp": 1.06086028, + "epoch": 0.09503655252020007, + "flos": 823894944768.0, + "grad_norm": 0.06926989533772566, + "language_loss": 1.01090789, + "learning_rate": 0.0009889525594508513, + "loss": 1.02187896, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 0.36254883, + "step": 494, + "time_per_iteration": 3.0095505714416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092013, + "balance_loss_mlp": 1.05596447, + "epoch": 0.09522893420546363, + "flos": 404621153280.0, + "grad_norm": 0.04986765426945594, + "language_loss": 0.94310975, + "learning_rate": 0.0009888873368625404, + "loss": 0.95402986, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 0.3605957, + "step": 495, + "time_per_iteration": 2.5451042652130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089381, + "balance_loss_mlp": 1.05426204, + "epoch": 0.0954213158907272, + "flos": 691016583168.0, + "grad_norm": 0.05650320770937666, + "language_loss": 0.98877072, + "learning_rate": 0.0009888219244713326, + "loss": 0.99966443, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 0.3515625, + "step": 496, + "time_per_iteration": 2.8157310485839844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086342, + "balance_loss_mlp": 1.05100799, + "epoch": 0.09561369757599077, + "flos": 519005325312.0, + "grad_norm": 0.05039739829653265, + "language_loss": 0.99588835, + "learning_rate": 0.0009887563223026229, + "loss": 1.00675178, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 0.35375977, + "step": 497, + "time_per_iteration": 2.6563401222229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244906, + "balance_loss_mlp": 1.14648652, + "epoch": 0.09580607926125433, + "flos": 1385614812672.0, + "grad_norm": 0.01649790273231252, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.80313075, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 0.98046875, + "step": 498, + "time_per_iteration": 4.8689799308776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098776, + "balance_loss_mlp": 1.0630604, + "epoch": 0.09599846094651789, + "flos": 717436901376.0, + "grad_norm": 0.06260101269903841, + "language_loss": 0.97272921, + "learning_rate": 0.0009886245487346482, + "loss": 0.98371696, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 0.35742188, + "step": 499, + "time_per_iteration": 3.0292818546295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117577, + "balance_loss_mlp": 1.08159947, + "epoch": 0.09619084263178146, + "flos": 386038130688.0, + "grad_norm": 0.055723050712230264, + "language_loss": 1.00704551, + "learning_rate": 0.0009885583773865422, + "loss": 1.01822114, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 0.35986328, + "step": 500, + "time_per_iteration": 2.395846366882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117756, + "balance_loss_mlp": 1.08137345, + "epoch": 0.09638322431704502, + "flos": 534129292800.0, + "grad_norm": 0.06268683986847115, + "language_loss": 0.9714855, + "learning_rate": 0.0009884920163632524, + "loss": 0.98266304, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 0.36352539, + "step": 501, + "time_per_iteration": 2.666341781616211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111747, + "balance_loss_mlp": 1.07638931, + "epoch": 0.09657560600230858, + "flos": 500671922688.0, + "grad_norm": 0.04553274405873497, + "language_loss": 1.01245189, + "learning_rate": 0.000988425465690543, + "loss": 1.02356935, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 0.35375977, + "step": 502, + "time_per_iteration": 2.55082106590271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103127, + "balance_loss_mlp": 1.06867552, + "epoch": 0.09676798768757214, + "flos": 529261724160.0, + "grad_norm": 0.04373339165225573, + "language_loss": 0.99427342, + "learning_rate": 0.0009883587253942505, + "loss": 1.00530469, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 0.34472656, + "step": 503, + "time_per_iteration": 2.7674455642700195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108284, + "balance_loss_mlp": 1.07378531, + "epoch": 0.09696036937283571, + "flos": 463614534144.0, + "grad_norm": 0.051161986083573203, + "language_loss": 1.04393589, + "learning_rate": 0.0009882917955002862, + "loss": 1.05501866, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 0.3449707, + "step": 504, + "time_per_iteration": 2.549203872680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105617, + "balance_loss_mlp": 1.07116556, + "epoch": 0.09715275105809927, + "flos": 534974326272.0, + "grad_norm": 0.04840022534917253, + "language_loss": 0.95342839, + "learning_rate": 0.0009882246760346343, + "loss": 0.96448457, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 0.3449707, + "step": 505, + "time_per_iteration": 2.653627872467041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115925, + "balance_loss_mlp": 1.08128262, + "epoch": 0.09734513274336283, + "flos": 454946886144.0, + "grad_norm": 0.08271599518488834, + "language_loss": 1.02799106, + "learning_rate": 0.0009881573670233533, + "loss": 1.03915036, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 0.34692383, + "step": 506, + "time_per_iteration": 2.5279319286346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104761, + "balance_loss_mlp": 1.07061946, + "epoch": 0.09753751442862639, + "flos": 508805826048.0, + "grad_norm": 0.05291653517072512, + "language_loss": 0.96169406, + "learning_rate": 0.0009880898684925747, + "loss": 0.97274166, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 0.34179688, + "step": 507, + "time_per_iteration": 2.648574113845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095827, + "balance_loss_mlp": 1.06039834, + "epoch": 0.09772989611388996, + "flos": 484273064448.0, + "grad_norm": 0.053809005456099755, + "language_loss": 0.94680405, + "learning_rate": 0.0009880221804685037, + "loss": 0.95776224, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 0.35424805, + "step": 508, + "time_per_iteration": 2.529299736022949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245061, + "balance_loss_mlp": 1.15503371, + "epoch": 0.09792227779915352, + "flos": 1566106140672.0, + "grad_norm": 0.024665830319341657, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.80589479, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 0.8984375, + "step": 509, + "time_per_iteration": 4.705655574798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094606, + "balance_loss_mlp": 1.05932045, + "epoch": 0.09811465948441708, + "flos": 587805424128.0, + "grad_norm": 0.06644626598388864, + "language_loss": 1.02131915, + "learning_rate": 0.0009878862360456733, + "loss": 1.03226519, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 0.35327148, + "step": 510, + "time_per_iteration": 2.682035446166992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097961, + "balance_loss_mlp": 1.06336641, + "epoch": 0.09830704116968064, + "flos": 613000410624.0, + "grad_norm": 0.06543943311749917, + "language_loss": 0.9266718, + "learning_rate": 0.0009878179796996922, + "loss": 0.9376514, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 0.34619141, + "step": 511, + "time_per_iteration": 2.6972057819366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105256, + "balance_loss_mlp": 1.07030368, + "epoch": 0.09849942285494422, + "flos": 538808910336.0, + "grad_norm": 0.054213046356477584, + "language_loss": 0.96428764, + "learning_rate": 0.0009877495339659754, + "loss": 0.97534013, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 0.34985352, + "step": 512, + "time_per_iteration": 2.746337413787842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105714, + "balance_loss_mlp": 1.07190621, + "epoch": 0.09869180454020778, + "flos": 620474660352.0, + "grad_norm": 0.0573170093193853, + "language_loss": 0.91841626, + "learning_rate": 0.000987680898871096, + "loss": 0.9294734, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 0.33837891, + "step": 513, + "time_per_iteration": 2.7060482501983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110186, + "balance_loss_mlp": 1.07675993, + "epoch": 0.09888418622547133, + "flos": 811711342080.0, + "grad_norm": 0.0786420176645203, + "language_loss": 0.95400196, + "learning_rate": 0.0009876120744417, + "loss": 0.96510386, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 0.33447266, + "step": 514, + "time_per_iteration": 2.9473536014556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105767, + "balance_loss_mlp": 1.07071972, + "epoch": 0.0990765679107349, + "flos": 535809447936.0, + "grad_norm": 0.04861145683213968, + "language_loss": 1.01586378, + "learning_rate": 0.0009875430607045078, + "loss": 1.02692139, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 0.35058594, + "step": 515, + "time_per_iteration": 2.6745734214782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095325, + "balance_loss_mlp": 1.06044412, + "epoch": 0.09926894959599845, + "flos": 587879576064.0, + "grad_norm": 0.061184004848699555, + "language_loss": 0.96467805, + "learning_rate": 0.000987473857686313, + "loss": 0.97563124, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 0.34887695, + "step": 516, + "time_per_iteration": 2.70771861076355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103454, + "balance_loss_mlp": 1.06909752, + "epoch": 0.09946133128126203, + "flos": 641234506752.0, + "grad_norm": 0.06268031252544905, + "language_loss": 1.01795554, + "learning_rate": 0.0009874044654139824, + "loss": 1.02899015, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 0.34399414, + "step": 517, + "time_per_iteration": 2.7501027584075928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104488, + "balance_loss_mlp": 1.07020378, + "epoch": 0.09965371296652559, + "flos": 465781446144.0, + "grad_norm": 0.05802057466070587, + "language_loss": 1.01047516, + "learning_rate": 0.0009873348839144563, + "loss": 1.02152014, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 0.34301758, + "step": 518, + "time_per_iteration": 2.5247762203216553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125408, + "balance_loss_mlp": 1.09100425, + "epoch": 0.09984609465178915, + "flos": 483603499008.0, + "grad_norm": 0.057276560313135924, + "language_loss": 1.0153054, + "learning_rate": 0.000987265113214749, + "loss": 1.02655947, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 0.34448242, + "step": 519, + "time_per_iteration": 2.569776773452759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151248, + "balance_loss_mlp": 1.11705852, + "epoch": 0.1000384763370527, + "flos": 569029681152.0, + "grad_norm": 0.06886779278024428, + "language_loss": 1.05486548, + "learning_rate": 0.0009871951533419476, + "loss": 1.066378, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 0.34204102, + "step": 520, + "time_per_iteration": 2.646489381790161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155904, + "balance_loss_mlp": 1.12085652, + "epoch": 0.10023085802231628, + "flos": 545796403200.0, + "grad_norm": 0.06947260655531057, + "language_loss": 0.93715644, + "learning_rate": 0.0009871250043232132, + "loss": 0.94871557, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 0.35058594, + "step": 521, + "time_per_iteration": 2.729825258255005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145676, + "balance_loss_mlp": 1.11196363, + "epoch": 0.10042323970757984, + "flos": 503454071808.0, + "grad_norm": 0.05700460680955029, + "language_loss": 0.94319808, + "learning_rate": 0.0009870546661857797, + "loss": 0.95465487, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 0.33740234, + "step": 522, + "time_per_iteration": 2.589205026626587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113939, + "balance_loss_mlp": 1.10572577, + "epoch": 0.1006156213928434, + "flos": 770411533824.0, + "grad_norm": 0.0627280587118585, + "language_loss": 1.04607201, + "learning_rate": 0.0009869841389569553, + "loss": 1.05746591, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 0.33666992, + "step": 523, + "time_per_iteration": 3.007927656173706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112546, + "balance_loss_mlp": 1.07816648, + "epoch": 0.10080800307810696, + "flos": 490030083072.0, + "grad_norm": 0.07025860249961899, + "language_loss": 0.94709289, + "learning_rate": 0.0009869134226641206, + "loss": 0.95821834, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 0.34399414, + "step": 524, + "time_per_iteration": 2.5647661685943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096367, + "balance_loss_mlp": 1.06134343, + "epoch": 0.10100038476337053, + "flos": 454724430336.0, + "grad_norm": 0.0754869647085307, + "language_loss": 0.96719551, + "learning_rate": 0.0009868425173347303, + "loss": 0.97815919, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 0.3503418, + "step": 525, + "time_per_iteration": 2.675762414932251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081637, + "balance_loss_mlp": 1.04816294, + "epoch": 0.10119276644863409, + "flos": 556438242816.0, + "grad_norm": 0.04461045481777941, + "language_loss": 1.01427031, + "learning_rate": 0.0009867714229963125, + "loss": 1.02508664, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 0.3347168, + "step": 526, + "time_per_iteration": 2.7551424503326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101413, + "balance_loss_mlp": 1.06672287, + "epoch": 0.10138514813389765, + "flos": 516235659264.0, + "grad_norm": 0.06519670287778681, + "language_loss": 0.99495387, + "learning_rate": 0.000986700139676468, + "loss": 1.00596797, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 0.34716797, + "step": 527, + "time_per_iteration": 2.5689845085144043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111806, + "balance_loss_mlp": 1.08317983, + "epoch": 0.10157752981916121, + "flos": 500570606592.0, + "grad_norm": 0.055001529425537175, + "language_loss": 0.97175169, + "learning_rate": 0.0009866286674028717, + "loss": 0.98293233, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 0.34936523, + "step": 528, + "time_per_iteration": 2.6308236122131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118149, + "balance_loss_mlp": 1.08307743, + "epoch": 0.10176991150442478, + "flos": 656773277184.0, + "grad_norm": 0.06791274268555884, + "language_loss": 0.93964088, + "learning_rate": 0.0009865570062032717, + "loss": 0.95082229, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 0.35083008, + "step": 529, + "time_per_iteration": 2.931939125061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117806, + "balance_loss_mlp": 1.08104193, + "epoch": 0.10196229318968834, + "flos": 573259617792.0, + "grad_norm": 0.05469252484924326, + "language_loss": 0.97321147, + "learning_rate": 0.0009864851561054893, + "loss": 0.98438954, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 0.36743164, + "step": 530, + "time_per_iteration": 2.75875186920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091703, + "balance_loss_mlp": 1.0567745, + "epoch": 0.1021546748749519, + "flos": 518207279616.0, + "grad_norm": 0.053032092698093954, + "language_loss": 0.97237867, + "learning_rate": 0.0009864131171374191, + "loss": 0.9832958, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 0.34936523, + "step": 531, + "time_per_iteration": 2.671963930130005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109157, + "balance_loss_mlp": 1.05704737, + "epoch": 0.10234705656021546, + "flos": 609766009344.0, + "grad_norm": 0.037042660663456926, + "language_loss": 0.97530323, + "learning_rate": 0.0009863408893270292, + "loss": 0.98621887, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 0.34521484, + "step": 532, + "time_per_iteration": 2.8692965507507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080567, + "balance_loss_mlp": 1.0459249, + "epoch": 0.10253943824547904, + "flos": 601760586240.0, + "grad_norm": 0.045189468397627275, + "language_loss": 0.93818736, + "learning_rate": 0.0009862684727023605, + "loss": 0.94899297, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 0.34692383, + "step": 533, + "time_per_iteration": 2.768873691558838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084569, + "balance_loss_mlp": 1.04978406, + "epoch": 0.1027318199307426, + "flos": 662948043264.0, + "grad_norm": 0.041807858593286534, + "language_loss": 0.94846106, + "learning_rate": 0.0009861958672915283, + "loss": 0.95930672, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 0.34814453, + "step": 534, + "time_per_iteration": 2.7894833087921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088029, + "balance_loss_mlp": 1.05348206, + "epoch": 0.10292420161600616, + "flos": 683275461120.0, + "grad_norm": 0.04113334704287127, + "language_loss": 0.93477535, + "learning_rate": 0.0009861230731227201, + "loss": 0.94565558, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 0.34570312, + "step": 535, + "time_per_iteration": 2.8369100093841553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101084, + "balance_loss_mlp": 1.06589389, + "epoch": 0.10311658330126972, + "flos": 490287043584.0, + "grad_norm": 0.06472741174466715, + "language_loss": 0.9716177, + "learning_rate": 0.0009860500902241973, + "loss": 0.98262858, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 0.35205078, + "step": 536, + "time_per_iteration": 2.6308608055114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100072, + "balance_loss_mlp": 1.06559658, + "epoch": 0.10330896498653329, + "flos": 431743343616.0, + "grad_norm": 0.06015330648509861, + "language_loss": 1.02488375, + "learning_rate": 0.0009859769186242942, + "loss": 1.0358845, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 0.34521484, + "step": 537, + "time_per_iteration": 2.4846572875976562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094076, + "balance_loss_mlp": 1.06188989, + "epoch": 0.10350134667179685, + "flos": 549591713280.0, + "grad_norm": 0.04182272700248836, + "language_loss": 0.96166039, + "learning_rate": 0.0009859035583514187, + "loss": 0.97260106, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 0.32177734, + "step": 538, + "time_per_iteration": 2.665483236312866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107546, + "balance_loss_mlp": 1.07497787, + "epoch": 0.10369372835706041, + "flos": 640626610176.0, + "grad_norm": 0.03728554890083732, + "language_loss": 0.9932602, + "learning_rate": 0.0009858300094340517, + "loss": 1.00433564, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 0.32568359, + "step": 539, + "time_per_iteration": 2.772207021713257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110889, + "balance_loss_mlp": 1.07908368, + "epoch": 0.10388611004232397, + "flos": 521752969728.0, + "grad_norm": 0.05284254114338104, + "language_loss": 0.91679931, + "learning_rate": 0.0009857562719007473, + "loss": 0.92790818, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 0.31787109, + "step": 540, + "time_per_iteration": 2.633002519607544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110116, + "balance_loss_mlp": 1.06964111, + "epoch": 0.10407849172758753, + "flos": 702436644864.0, + "grad_norm": 0.07454941449424961, + "language_loss": 0.93962657, + "learning_rate": 0.0009856823457801331, + "loss": 0.95063812, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 0.31494141, + "step": 541, + "time_per_iteration": 2.888354539871216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098965, + "balance_loss_mlp": 1.06682634, + "epoch": 0.1042708734128511, + "flos": 502910415360.0, + "grad_norm": 0.06016078646373104, + "language_loss": 1.01014686, + "learning_rate": 0.00098560823110091, + "loss": 1.02113652, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 0.32128906, + "step": 542, + "time_per_iteration": 2.612365484237671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088807, + "balance_loss_mlp": 1.05664408, + "epoch": 0.10446325509811466, + "flos": 485592371712.0, + "grad_norm": 0.07331709746631812, + "language_loss": 0.99634022, + "learning_rate": 0.000985533927891851, + "loss": 1.00722837, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 0.3215332, + "step": 543, + "time_per_iteration": 2.6642584800720215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087349, + "balance_loss_mlp": 1.05406535, + "epoch": 0.10465563678337822, + "flos": 568634328576.0, + "grad_norm": 0.07406485241554656, + "language_loss": 0.99318308, + "learning_rate": 0.0009854594361818044, + "loss": 1.00405657, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 0.33300781, + "step": 544, + "time_per_iteration": 2.650541067123413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087044, + "balance_loss_mlp": 1.05357027, + "epoch": 0.10484801846864178, + "flos": 626093286912.0, + "grad_norm": 0.05515562757052397, + "language_loss": 0.98072803, + "learning_rate": 0.0009853847559996897, + "loss": 0.99159849, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 0.3347168, + "step": 545, + "time_per_iteration": 2.7268693447113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098973, + "balance_loss_mlp": 1.0640682, + "epoch": 0.10504040015390535, + "flos": 743412681216.0, + "grad_norm": 0.05014767442192859, + "language_loss": 0.9781934, + "learning_rate": 0.0009853098873745, + "loss": 0.98918307, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 0.34936523, + "step": 546, + "time_per_iteration": 3.001844644546509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094885, + "balance_loss_mlp": 1.06010008, + "epoch": 0.10523278183916891, + "flos": 586673694720.0, + "grad_norm": 0.06665960072991474, + "language_loss": 0.96499509, + "learning_rate": 0.0009852348303353027, + "loss": 0.97594392, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 0.34814453, + "step": 547, + "time_per_iteration": 2.7768120765686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109085, + "balance_loss_mlp": 1.05692363, + "epoch": 0.10542516352443247, + "flos": 869644574208.0, + "grad_norm": 0.04477171592325676, + "language_loss": 0.89746928, + "learning_rate": 0.000985159584911237, + "loss": 0.90837783, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 0.33959961, + "step": 548, + "time_per_iteration": 3.1397063732147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109064, + "balance_loss_mlp": 1.0567131, + "epoch": 0.10561754520969603, + "flos": 505428263424.0, + "grad_norm": 0.057455808878804256, + "language_loss": 0.97617745, + "learning_rate": 0.0009850841511315162, + "loss": 0.98708391, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 0.33959961, + "step": 549, + "time_per_iteration": 2.6143858432769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090343, + "balance_loss_mlp": 1.05660701, + "epoch": 0.1058099268949596, + "flos": 559981361664.0, + "grad_norm": 0.04134640300819554, + "language_loss": 0.97230792, + "learning_rate": 0.0009850085290254256, + "loss": 0.98321134, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 0.33740234, + "step": 550, + "time_per_iteration": 2.784057855606079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108804, + "balance_loss_mlp": 1.05478084, + "epoch": 0.10600230858022316, + "flos": 562049528832.0, + "grad_norm": 0.041486348142279396, + "language_loss": 0.9340632, + "learning_rate": 0.0009849327186223246, + "loss": 0.94494367, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 0.33276367, + "step": 551, + "time_per_iteration": 2.822755813598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086641, + "balance_loss_mlp": 1.0536921, + "epoch": 0.10619469026548672, + "flos": 494326831104.0, + "grad_norm": 0.044652358506572586, + "language_loss": 1.00453854, + "learning_rate": 0.000984856719951646, + "loss": 1.01540482, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 0.32958984, + "step": 552, + "time_per_iteration": 2.561384439468384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088507, + "balance_loss_mlp": 1.05577254, + "epoch": 0.10638707195075028, + "flos": 676166828544.0, + "grad_norm": 0.05595352831954139, + "language_loss": 0.98322356, + "learning_rate": 0.0009847805330428943, + "loss": 0.99410868, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 0.32739258, + "step": 553, + "time_per_iteration": 2.8988356590270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082477, + "balance_loss_mlp": 1.04940784, + "epoch": 0.10657945363601386, + "flos": 488055891456.0, + "grad_norm": 0.05618387686115577, + "language_loss": 1.02895415, + "learning_rate": 0.0009847041579256481, + "loss": 1.03977895, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 0.33081055, + "step": 554, + "time_per_iteration": 2.567885398864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088152, + "balance_loss_mlp": 1.05548859, + "epoch": 0.10677183532127742, + "flos": 482958526464.0, + "grad_norm": 0.04459262579832553, + "language_loss": 0.99802542, + "learning_rate": 0.0009846275946295592, + "loss": 1.00890684, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 0.32641602, + "step": 555, + "time_per_iteration": 2.6283833980560303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108533, + "balance_loss_mlp": 1.05347764, + "epoch": 0.10696421700654098, + "flos": 656249444352.0, + "grad_norm": 0.04108965909817336, + "language_loss": 0.92502242, + "learning_rate": 0.0009845508431843518, + "loss": 0.93587577, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 0.31835938, + "step": 556, + "time_per_iteration": 3.0189473628997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087957, + "balance_loss_mlp": 1.05612838, + "epoch": 0.10715659869180454, + "flos": 567744878592.0, + "grad_norm": 0.05029379164990677, + "language_loss": 0.95060432, + "learning_rate": 0.0009844739036198233, + "loss": 0.96148396, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 0.31811523, + "step": 557, + "time_per_iteration": 2.6461007595062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096142, + "balance_loss_mlp": 1.06340766, + "epoch": 0.10734898037706811, + "flos": 540694268928.0, + "grad_norm": 0.047100661757994676, + "language_loss": 1.0152961, + "learning_rate": 0.0009843967759658448, + "loss": 1.02625763, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 0.32739258, + "step": 558, + "time_per_iteration": 2.6677682399749756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264894, + "balance_loss_mlp": 1.19775486, + "epoch": 0.10754136206233167, + "flos": 1476640171008.0, + "grad_norm": 0.03689581784010691, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.74032652, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 0.671875, + "step": 559, + "time_per_iteration": 4.873044013977051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105652, + "balance_loss_mlp": 1.07234466, + "epoch": 0.10773374374759523, + "flos": 512405844480.0, + "grad_norm": 0.06480790167761245, + "language_loss": 1.01098323, + "learning_rate": 0.000984241956509384, + "loss": 1.02203977, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 0.33325195, + "step": 560, + "time_per_iteration": 2.655430555343628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095265, + "balance_loss_mlp": 1.0617907, + "epoch": 0.10792612543285879, + "flos": 496503654912.0, + "grad_norm": 0.05361377514900226, + "language_loss": 1.00074768, + "learning_rate": 0.0009841642647670078, + "loss": 1.01170027, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 0.33496094, + "step": 561, + "time_per_iteration": 2.5627329349517822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089424, + "balance_loss_mlp": 1.05633116, + "epoch": 0.10811850711812235, + "flos": 735471498240.0, + "grad_norm": 0.04993888185520414, + "language_loss": 0.93071151, + "learning_rate": 0.0009840863850553944, + "loss": 0.94160575, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 0.33105469, + "step": 562, + "time_per_iteration": 3.0020592212677 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108807, + "balance_loss_mlp": 1.05686092, + "epoch": 0.10831088880338592, + "flos": 611540140032.0, + "grad_norm": 0.046287089248472475, + "language_loss": 0.97956204, + "learning_rate": 0.0009840083174047782, + "loss": 0.99044275, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 0.31176758, + "step": 563, + "time_per_iteration": 2.7123258113861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093988, + "balance_loss_mlp": 1.06275535, + "epoch": 0.10850327048864948, + "flos": 556317103104.0, + "grad_norm": 0.036863902598139514, + "language_loss": 0.91394317, + "learning_rate": 0.0009839300618454685, + "loss": 0.92488301, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 0.31176758, + "step": 564, + "time_per_iteration": 2.855482578277588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086433, + "balance_loss_mlp": 1.05386496, + "epoch": 0.10869565217391304, + "flos": 603208373760.0, + "grad_norm": 0.0447892393855046, + "language_loss": 0.97269231, + "learning_rate": 0.0009838516184078466, + "loss": 0.98355657, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 0.32568359, + "step": 565, + "time_per_iteration": 2.8027093410491943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090742, + "balance_loss_mlp": 1.05881739, + "epoch": 0.1088880338591766, + "flos": 526178198016.0, + "grad_norm": 0.039430635834492286, + "language_loss": 0.95326865, + "learning_rate": 0.0009837729871223669, + "loss": 0.964176, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 0.3190918, + "step": 566, + "time_per_iteration": 2.621044158935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097443, + "balance_loss_mlp": 1.06473231, + "epoch": 0.10908041554444017, + "flos": 620272028160.0, + "grad_norm": 0.03524126234366562, + "language_loss": 0.96988255, + "learning_rate": 0.0009836941680195568, + "loss": 0.98085701, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 0.32714844, + "step": 567, + "time_per_iteration": 2.8241846561431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095182, + "balance_loss_mlp": 1.06359148, + "epoch": 0.10927279722970373, + "flos": 898125719040.0, + "grad_norm": 0.05940738915226433, + "language_loss": 0.94011569, + "learning_rate": 0.0009836151611300166, + "loss": 0.95106757, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 0.31567383, + "step": 568, + "time_per_iteration": 3.2259325981140137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093362, + "balance_loss_mlp": 1.06327355, + "epoch": 0.10946517891496729, + "flos": 528666310656.0, + "grad_norm": 0.04952949609465528, + "language_loss": 1.01886261, + "learning_rate": 0.0009835359664844194, + "loss": 1.02979624, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 0.30029297, + "step": 569, + "time_per_iteration": 2.61936616897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235986, + "balance_loss_mlp": 1.17113578, + "epoch": 0.10965756060023085, + "flos": 1560751815168.0, + "grad_norm": 0.02580255803672051, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.82272792, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 0.6484375, + "step": 570, + "time_per_iteration": 4.946800470352173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102328, + "balance_loss_mlp": 1.06947398, + "epoch": 0.10984994228549443, + "flos": 513075409920.0, + "grad_norm": 0.04088785760268294, + "language_loss": 0.98121774, + "learning_rate": 0.0009833770140481118, + "loss": 0.99224108, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 0.32861328, + "step": 571, + "time_per_iteration": 2.6676580905914307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103589, + "balance_loss_mlp": 1.07113993, + "epoch": 0.11004232397075799, + "flos": 954705139200.0, + "grad_norm": 0.04146527084622454, + "language_loss": 0.88084227, + "learning_rate": 0.000983297256319112, + "loss": 0.89187813, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 0.32446289, + "step": 572, + "time_per_iteration": 3.1977450847625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098726, + "balance_loss_mlp": 1.06503749, + "epoch": 0.11023470565602154, + "flos": 488181800448.0, + "grad_norm": 0.11112801331440751, + "language_loss": 0.93675387, + "learning_rate": 0.000983217310957477, + "loss": 0.94774115, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 0.33691406, + "step": 573, + "time_per_iteration": 2.771477222442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118998, + "balance_loss_mlp": 1.08530974, + "epoch": 0.1104270873412851, + "flos": 655814817792.0, + "grad_norm": 0.046936313049011164, + "language_loss": 0.98079342, + "learning_rate": 0.000983137177994244, + "loss": 0.99198341, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 0.3371582, + "step": 574, + "time_per_iteration": 2.842641830444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127826, + "balance_loss_mlp": 1.0945909, + "epoch": 0.11061946902654868, + "flos": 723426287616.0, + "grad_norm": 0.047970587572460185, + "language_loss": 0.91368234, + "learning_rate": 0.0009830568574605235, + "loss": 0.92496061, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 0.33227539, + "step": 575, + "time_per_iteration": 2.9841148853302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136053, + "balance_loss_mlp": 1.10260296, + "epoch": 0.11081185071181224, + "flos": 835463310336.0, + "grad_norm": 0.06212944390612344, + "language_loss": 0.95608473, + "learning_rate": 0.0009829763493874992, + "loss": 0.96744525, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 0.3347168, + "step": 576, + "time_per_iteration": 3.094599485397339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122949, + "balance_loss_mlp": 1.08918953, + "epoch": 0.1110042323970758, + "flos": 609076620288.0, + "grad_norm": 0.040009357062280086, + "language_loss": 1.0022918, + "learning_rate": 0.0009828956538064264, + "loss": 1.01352131, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 0.33764648, + "step": 577, + "time_per_iteration": 2.7913765907287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128001, + "balance_loss_mlp": 1.09428823, + "epoch": 0.11119661408233936, + "flos": 595922075136.0, + "grad_norm": 0.07834189266391174, + "language_loss": 0.97103804, + "learning_rate": 0.0009828147707486344, + "loss": 0.98231804, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 0.33740234, + "step": 578, + "time_per_iteration": 2.6967506408691406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099492, + "balance_loss_mlp": 1.0659467, + "epoch": 0.11138899576760293, + "flos": 555835488768.0, + "grad_norm": 0.066476002167881, + "language_loss": 0.94244707, + "learning_rate": 0.0009827337002455245, + "loss": 0.95344198, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 0.33544922, + "step": 579, + "time_per_iteration": 2.6212143898010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010909, + "balance_loss_mlp": 1.05940461, + "epoch": 0.11158137745286649, + "flos": 689746461696.0, + "grad_norm": 0.0598380025645264, + "language_loss": 0.93403691, + "learning_rate": 0.0009826524423285712, + "loss": 0.94494587, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 0.31469727, + "step": 580, + "time_per_iteration": 2.916363000869751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086732, + "balance_loss_mlp": 1.05466461, + "epoch": 0.11177375913813005, + "flos": 763011436032.0, + "grad_norm": 0.051352596452175936, + "language_loss": 0.95457065, + "learning_rate": 0.0009825709970293218, + "loss": 0.96543789, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 0.32055664, + "step": 581, + "time_per_iteration": 2.975459575653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094975, + "balance_loss_mlp": 1.06414759, + "epoch": 0.11196614082339361, + "flos": 806574329856.0, + "grad_norm": 0.06330579048660655, + "language_loss": 1.01360774, + "learning_rate": 0.0009824893643793956, + "loss": 1.02455735, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 0.30810547, + "step": 582, + "time_per_iteration": 3.0850436687469482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109989, + "balance_loss_mlp": 1.06772757, + "epoch": 0.11215852250865718, + "flos": 558624978432.0, + "grad_norm": 0.05517621871728721, + "language_loss": 0.96568394, + "learning_rate": 0.0009824075444104857, + "loss": 0.9766829, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 0.3215332, + "step": 583, + "time_per_iteration": 2.7017738819122314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104353, + "balance_loss_mlp": 1.07214284, + "epoch": 0.11235090419392074, + "flos": 513572078592.0, + "grad_norm": 0.05273776870459213, + "language_loss": 1.00669086, + "learning_rate": 0.000982325537154357, + "loss": 1.01773441, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 0.32202148, + "step": 584, + "time_per_iteration": 2.566066265106201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109419, + "balance_loss_mlp": 1.07768583, + "epoch": 0.1125432858791843, + "flos": 491453277696.0, + "grad_norm": 0.05755454669423396, + "language_loss": 1.01869726, + "learning_rate": 0.0009822433426428484, + "loss": 1.02979159, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 0.31713867, + "step": 585, + "time_per_iteration": 2.611968994140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122157, + "balance_loss_mlp": 1.08987498, + "epoch": 0.11273566756444786, + "flos": 510725689344.0, + "grad_norm": 0.06034275506000564, + "language_loss": 0.93750811, + "learning_rate": 0.0009821609609078697, + "loss": 0.94872963, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 0.32275391, + "step": 586, + "time_per_iteration": 2.584847927093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104971, + "balance_loss_mlp": 1.0726887, + "epoch": 0.11292804924971142, + "flos": 622446280704.0, + "grad_norm": 0.06416707827025614, + "language_loss": 0.95279968, + "learning_rate": 0.0009820783919814045, + "loss": 0.96384937, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 0.32275391, + "step": 587, + "time_per_iteration": 2.7885184288024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096142, + "balance_loss_mlp": 1.06359744, + "epoch": 0.113120430934975, + "flos": 478056453120.0, + "grad_norm": 0.049104346633589514, + "language_loss": 0.92135406, + "learning_rate": 0.0009819956358955095, + "loss": 0.93231547, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 0.32543945, + "step": 588, + "time_per_iteration": 2.560117483139038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086794, + "balance_loss_mlp": 1.05427432, + "epoch": 0.11331281262023855, + "flos": 467039084544.0, + "grad_norm": 0.05114307144868452, + "language_loss": 0.93675017, + "learning_rate": 0.0009819126926823127, + "loss": 0.94761813, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 0.32519531, + "step": 589, + "time_per_iteration": 2.517035722732544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093229, + "balance_loss_mlp": 1.05966008, + "epoch": 0.11350519430550211, + "flos": 650453151744.0, + "grad_norm": 0.04613241529975588, + "language_loss": 0.94437975, + "learning_rate": 0.000981829562374016, + "loss": 0.95531201, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 0.33569336, + "step": 590, + "time_per_iteration": 2.8174262046813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093153, + "balance_loss_mlp": 1.05913091, + "epoch": 0.11369757599076567, + "flos": 557809680384.0, + "grad_norm": 0.05348492004263644, + "language_loss": 1.04949331, + "learning_rate": 0.0009817462450028933, + "loss": 1.0604248, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 0.34057617, + "step": 591, + "time_per_iteration": 2.6302859783172607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101242, + "balance_loss_mlp": 1.0668143, + "epoch": 0.11388995767602925, + "flos": 571080222720.0, + "grad_norm": 0.2030818500746725, + "language_loss": 0.92329478, + "learning_rate": 0.0009816627406012916, + "loss": 0.93430716, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 0.34472656, + "step": 592, + "time_per_iteration": 2.8384313583374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135985, + "balance_loss_mlp": 1.09943521, + "epoch": 0.1140823393612928, + "flos": 740403307008.0, + "grad_norm": 0.0774704650100976, + "language_loss": 0.91851664, + "learning_rate": 0.0009815790492016295, + "loss": 0.92987645, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 0.36523438, + "step": 593, + "time_per_iteration": 2.9409682750701904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136834, + "balance_loss_mlp": 1.10192943, + "epoch": 0.11427472104655637, + "flos": 699004753920.0, + "grad_norm": 0.09332707993556091, + "language_loss": 0.94690275, + "learning_rate": 0.0009814951708363993, + "loss": 0.95827115, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 0.34912109, + "step": 594, + "time_per_iteration": 2.8599631786346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221657, + "balance_loss_mlp": 1.16023993, + "epoch": 0.11446710273181993, + "flos": 1477178684928.0, + "grad_norm": 0.030934197408724044, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79212642, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 0.61328125, + "step": 595, + "time_per_iteration": 4.801583766937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137315, + "balance_loss_mlp": 1.10138512, + "epoch": 0.1146594844170835, + "flos": 494895080448.0, + "grad_norm": 0.0746127254366864, + "language_loss": 0.94972038, + "learning_rate": 0.0009813268533395648, + "loss": 0.96109354, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 0.359375, + "step": 596, + "time_per_iteration": 2.6236753463745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123077, + "balance_loss_mlp": 1.0882678, + "epoch": 0.11485186610234706, + "flos": 474834534912.0, + "grad_norm": 0.061536990211155544, + "language_loss": 0.95371294, + "learning_rate": 0.0009812424142733073, + "loss": 0.96494377, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 0.34765625, + "step": 597, + "time_per_iteration": 2.5663998126983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108592, + "balance_loss_mlp": 1.07387781, + "epoch": 0.11504424778761062, + "flos": 731209254912.0, + "grad_norm": 0.04795398370622496, + "language_loss": 0.91199464, + "learning_rate": 0.000981157788372175, + "loss": 0.92308056, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 0.34716797, + "step": 598, + "time_per_iteration": 3.004436492919922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110274, + "balance_loss_mlp": 1.06864619, + "epoch": 0.11523662947287418, + "flos": 545823567360.0, + "grad_norm": 0.04762632796488997, + "language_loss": 0.94997883, + "learning_rate": 0.0009810729756690223, + "loss": 0.96100628, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 0.34106445, + "step": 599, + "time_per_iteration": 2.704676628112793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104593, + "balance_loss_mlp": 1.06947374, + "epoch": 0.11542901115813775, + "flos": 775066558464.0, + "grad_norm": 0.06699944809564747, + "language_loss": 0.98224139, + "learning_rate": 0.0009809879761967766, + "loss": 0.99328732, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 0.35107422, + "step": 600, + "time_per_iteration": 2.953348159790039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113963, + "balance_loss_mlp": 1.07922578, + "epoch": 0.11562139284340131, + "flos": 730910449152.0, + "grad_norm": 0.06801646297960097, + "language_loss": 0.96874714, + "learning_rate": 0.0009809027899884378, + "loss": 0.97988677, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 0.34765625, + "step": 601, + "time_per_iteration": 2.896559953689575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104267, + "balance_loss_mlp": 1.07014918, + "epoch": 0.11581377452866487, + "flos": 535878457344.0, + "grad_norm": 0.062436318450634756, + "language_loss": 0.9484992, + "learning_rate": 0.0009808174170770779, + "loss": 0.95954192, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 0.34130859, + "step": 602, + "time_per_iteration": 2.814558982849121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220455, + "balance_loss_mlp": 1.16704941, + "epoch": 0.11600615621392843, + "flos": 1555814863872.0, + "grad_norm": 0.025680107820064087, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.86118698, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 0.53515625, + "step": 603, + "time_per_iteration": 4.897503614425659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118739, + "balance_loss_mlp": 1.08566999, + "epoch": 0.116198537899192, + "flos": 537435274752.0, + "grad_norm": 0.05533944227900463, + "language_loss": 1.0028702, + "learning_rate": 0.0009806461112779462, + "loss": 1.01405764, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 0.33081055, + "step": 604, + "time_per_iteration": 2.6172194480895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115094, + "balance_loss_mlp": 1.08281231, + "epoch": 0.11639091958445556, + "flos": 454203168768.0, + "grad_norm": 0.07231087595972972, + "language_loss": 0.97971618, + "learning_rate": 0.0009805601784566814, + "loss": 0.99086702, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 0.32250977, + "step": 605, + "time_per_iteration": 2.4791650772094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125941, + "balance_loss_mlp": 1.09208584, + "epoch": 0.11658330126971912, + "flos": 555081859584.0, + "grad_norm": 0.06015253149930396, + "language_loss": 1.02430916, + "learning_rate": 0.0009804740590654089, + "loss": 1.03556848, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 0.33862305, + "step": 606, + "time_per_iteration": 2.614476442337036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124787, + "balance_loss_mlp": 1.09229016, + "epoch": 0.11677568295498268, + "flos": 716340049920.0, + "grad_norm": 0.08034134565527169, + "language_loss": 0.97153747, + "learning_rate": 0.0009803877531375635, + "loss": 0.9827854, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 0.32495117, + "step": 607, + "time_per_iteration": 2.851011276245117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128193, + "balance_loss_mlp": 1.09228706, + "epoch": 0.11696806464024626, + "flos": 609758668800.0, + "grad_norm": 0.05400582488055185, + "language_loss": 0.97512484, + "learning_rate": 0.0009803012607066523, + "loss": 0.9864068, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 0.35913086, + "step": 608, + "time_per_iteration": 2.700596570968628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128537, + "balance_loss_mlp": 1.09294093, + "epoch": 0.11716044632550981, + "flos": 520384103424.0, + "grad_norm": 0.15792902837654846, + "language_loss": 0.95375645, + "learning_rate": 0.0009802145818062543, + "loss": 0.96504182, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 0.35620117, + "step": 609, + "time_per_iteration": 2.693417549133301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123637, + "balance_loss_mlp": 1.08742094, + "epoch": 0.11735282801077337, + "flos": 507493859328.0, + "grad_norm": 0.06851059455565046, + "language_loss": 0.99132365, + "learning_rate": 0.0009801277164700212, + "loss": 1.00256002, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 0.36254883, + "step": 610, + "time_per_iteration": 2.5825185775756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131797, + "balance_loss_mlp": 1.09541452, + "epoch": 0.11754520969603693, + "flos": 686638342656.0, + "grad_norm": 0.1113382534985323, + "language_loss": 0.96033651, + "learning_rate": 0.0009800406647316776, + "loss": 0.97165447, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 0.36376953, + "step": 611, + "time_per_iteration": 2.8625166416168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231096, + "balance_loss_mlp": 1.18112373, + "epoch": 0.1177375913813005, + "flos": 1542487421952.0, + "grad_norm": 0.03346184177846584, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.78145558, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 0.49804688, + "step": 612, + "time_per_iteration": 4.748431444168091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137214, + "balance_loss_mlp": 1.09880471, + "epoch": 0.11792997306656407, + "flos": 520522495488.0, + "grad_norm": 0.07612220197102978, + "language_loss": 0.95326376, + "learning_rate": 0.000979866002183916, + "loss": 0.96463591, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 0.38378906, + "step": 613, + "time_per_iteration": 2.6311473846435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155666, + "balance_loss_mlp": 1.11482501, + "epoch": 0.11812235475182763, + "flos": 666281189376.0, + "grad_norm": 0.0832714106614858, + "language_loss": 0.96221644, + "learning_rate": 0.0009797783914423082, + "loss": 0.97377312, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 0.40844727, + "step": 614, + "time_per_iteration": 2.8568782806396484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126933, + "balance_loss_mlp": 1.08721232, + "epoch": 0.11831473643709119, + "flos": 621317122560.0, + "grad_norm": 0.08355321383380138, + "language_loss": 0.91733479, + "learning_rate": 0.0009796905944342094, + "loss": 0.92860413, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 0.3972168, + "step": 615, + "time_per_iteration": 2.8348331451416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113873, + "balance_loss_mlp": 1.07517743, + "epoch": 0.11850711812235475, + "flos": 456688710144.0, + "grad_norm": 0.05175964705030883, + "language_loss": 0.94486296, + "learning_rate": 0.0009796026111937057, + "loss": 0.9560017, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 0.38671875, + "step": 616, + "time_per_iteration": 2.609276056289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111065, + "balance_loss_mlp": 1.07393384, + "epoch": 0.11869949980761832, + "flos": 513863543808.0, + "grad_norm": 0.1779679576065946, + "language_loss": 0.94108498, + "learning_rate": 0.0009795144417549552, + "loss": 0.95219147, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 0.3671875, + "step": 617, + "time_per_iteration": 2.7469558715820312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114581, + "balance_loss_mlp": 1.07760203, + "epoch": 0.11889188149288188, + "flos": 535016171520.0, + "grad_norm": 0.0639893702788804, + "language_loss": 0.95137906, + "learning_rate": 0.0009794260861521883, + "loss": 0.96252483, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 0.36987305, + "step": 618, + "time_per_iteration": 2.779780387878418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125632, + "balance_loss_mlp": 1.08908224, + "epoch": 0.11908426317814544, + "flos": 498603755520.0, + "grad_norm": 0.062080445707157726, + "language_loss": 0.94238096, + "learning_rate": 0.0009793375444197075, + "loss": 0.95363724, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 0.3659668, + "step": 619, + "time_per_iteration": 2.6269500255584717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159762, + "balance_loss_mlp": 1.12132859, + "epoch": 0.119276644863409, + "flos": 659891681280.0, + "grad_norm": 0.05728911446624217, + "language_loss": 0.93181753, + "learning_rate": 0.000979248816591888, + "loss": 0.94341516, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 0.38452148, + "step": 620, + "time_per_iteration": 2.7879464626312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155629, + "balance_loss_mlp": 1.11600351, + "epoch": 0.11946902654867257, + "flos": 758746621440.0, + "grad_norm": 0.05539388103354017, + "language_loss": 0.93241715, + "learning_rate": 0.0009791599027031766, + "loss": 0.94397342, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 0.39624023, + "step": 621, + "time_per_iteration": 3.058497667312622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152711, + "balance_loss_mlp": 1.11439681, + "epoch": 0.11966140823393613, + "flos": 680999892480.0, + "grad_norm": 0.05959109763307043, + "language_loss": 0.93889141, + "learning_rate": 0.0009790708027880932, + "loss": 0.95041847, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 0.38330078, + "step": 622, + "time_per_iteration": 2.857905864715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217773, + "balance_loss_mlp": 1.17447615, + "epoch": 0.11985378991919969, + "flos": 1451071853568.0, + "grad_norm": 0.033264976771994935, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.78645062, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 0.43359375, + "step": 623, + "time_per_iteration": 4.817517518997192 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130228, + "balance_loss_mlp": 1.09372652, + "epoch": 0.12004617160446325, + "flos": 527848441344.0, + "grad_norm": 0.07130736684785184, + "language_loss": 0.99442542, + "learning_rate": 0.0009788920450172487, + "loss": 1.00572777, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 0.36499023, + "step": 624, + "time_per_iteration": 2.6089231967926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134217, + "balance_loss_mlp": 1.0987401, + "epoch": 0.12023855328972682, + "flos": 474219297792.0, + "grad_norm": 0.053387747347518576, + "language_loss": 0.97139525, + "learning_rate": 0.0009788023872308875, + "loss": 0.98273742, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 0.35522461, + "step": 625, + "time_per_iteration": 2.5482659339904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171918, + "balance_loss_mlp": 1.12614214, + "epoch": 0.12043093497499038, + "flos": 1531771430400.0, + "grad_norm": 0.016755812295179123, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.76600921, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 0.45703125, + "step": 626, + "time_per_iteration": 4.767898797988892 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142548, + "balance_loss_mlp": 1.10609388, + "epoch": 0.12062331666025394, + "flos": 539839323648.0, + "grad_norm": 0.053046953839951706, + "language_loss": 0.99526918, + "learning_rate": 0.0009786225140303285, + "loss": 1.00669467, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 0.36425781, + "step": 627, + "time_per_iteration": 2.666975975036621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145866, + "balance_loss_mlp": 1.10974586, + "epoch": 0.1208156983455175, + "flos": 511906604544.0, + "grad_norm": 0.06539343990980159, + "language_loss": 0.97403502, + "learning_rate": 0.0009785322986859634, + "loss": 0.98549366, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 0.36132812, + "step": 628, + "time_per_iteration": 2.6613006591796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116443, + "balance_loss_mlp": 1.12830925, + "epoch": 0.12100808003078108, + "flos": 596473072128.0, + "grad_norm": 0.05337423256033143, + "language_loss": 0.99038112, + "learning_rate": 0.0009784418975588838, + "loss": 1.00202537, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 0.36108398, + "step": 629, + "time_per_iteration": 2.7266693115234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148011, + "balance_loss_mlp": 1.11248696, + "epoch": 0.12120046171604464, + "flos": 522970960896.0, + "grad_norm": 0.06598420413892771, + "language_loss": 0.97636682, + "learning_rate": 0.0009783513106841862, + "loss": 0.98784697, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 0.35522461, + "step": 630, + "time_per_iteration": 2.7734336853027344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122661, + "balance_loss_mlp": 1.17663717, + "epoch": 0.1213928434013082, + "flos": 1554463249920.0, + "grad_norm": 0.0364602282496576, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.77959311, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 0.5, + "step": 631, + "time_per_iteration": 4.955650091171265 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118689, + "balance_loss_mlp": 1.08283055, + "epoch": 0.12158522508657175, + "flos": 495391749120.0, + "grad_norm": 0.061523486228641615, + "language_loss": 0.94419873, + "learning_rate": 0.0009781695798326854, + "loss": 0.95538557, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 0.35888672, + "step": 632, + "time_per_iteration": 2.6072514057159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111815, + "balance_loss_mlp": 1.08319819, + "epoch": 0.12177760677183531, + "flos": 475585592832.0, + "grad_norm": 0.05761126083629287, + "language_loss": 0.93996418, + "learning_rate": 0.0009780784359264365, + "loss": 0.95114571, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 0.34985352, + "step": 633, + "time_per_iteration": 2.6186299324035645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201074, + "balance_loss_mlp": 1.15548825, + "epoch": 0.12196998845709889, + "flos": 1468458906624.0, + "grad_norm": 0.024414945484573326, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.75389773, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 0.45507812, + "step": 634, + "time_per_iteration": 4.757866144180298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090802, + "balance_loss_mlp": 1.05732846, + "epoch": 0.12216237014236245, + "flos": 586572378624.0, + "grad_norm": 0.05071444395915749, + "language_loss": 0.91919303, + "learning_rate": 0.000977895591329867, + "loss": 0.93010104, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 0.3347168, + "step": 635, + "time_per_iteration": 2.7802233695983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094425, + "balance_loss_mlp": 1.06006885, + "epoch": 0.12235475182762601, + "flos": 597997582848.0, + "grad_norm": 0.05652682698430024, + "language_loss": 0.93613631, + "learning_rate": 0.000977803890710533, + "loss": 0.94708061, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 0.34399414, + "step": 636, + "time_per_iteration": 2.719989538192749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109182, + "balance_loss_mlp": 1.0546267, + "epoch": 0.12254713351288957, + "flos": 497741469696.0, + "grad_norm": 0.05019916823038997, + "language_loss": 0.97873759, + "learning_rate": 0.0009777120045912774, + "loss": 0.98965579, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 0.37231445, + "step": 637, + "time_per_iteration": 2.5960683822631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099212, + "balance_loss_mlp": 1.06139851, + "epoch": 0.12273951519815314, + "flos": 605847361536.0, + "grad_norm": 0.05186361253186237, + "language_loss": 0.97095829, + "learning_rate": 0.0009776199330077736, + "loss": 0.9819504, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 0.37841797, + "step": 638, + "time_per_iteration": 2.7152581214904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088192, + "balance_loss_mlp": 1.05121303, + "epoch": 0.1229318968834167, + "flos": 597859190784.0, + "grad_norm": 0.05467339203371928, + "language_loss": 0.99154645, + "learning_rate": 0.0009775276759957667, + "loss": 1.00242841, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 0.37011719, + "step": 639, + "time_per_iteration": 2.6985981464385986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090176, + "balance_loss_mlp": 1.05465198, + "epoch": 0.12312427856868026, + "flos": 678383299584.0, + "grad_norm": 0.06600893718108056, + "language_loss": 0.97933781, + "learning_rate": 0.0009774352335910745, + "loss": 0.99023956, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 0.35546875, + "step": 640, + "time_per_iteration": 2.813744306564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086602, + "balance_loss_mlp": 1.05298471, + "epoch": 0.12331666025394382, + "flos": 608933458944.0, + "grad_norm": 0.05927901471916764, + "language_loss": 0.99468219, + "learning_rate": 0.000977342605829586, + "loss": 1.00554824, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 0.33642578, + "step": 641, + "time_per_iteration": 2.73280668258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110577, + "balance_loss_mlp": 1.07240582, + "epoch": 0.12350904193920739, + "flos": 762504855552.0, + "grad_norm": 0.07046674646118828, + "language_loss": 0.92099506, + "learning_rate": 0.0009772497927472623, + "loss": 0.93210077, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 0.38183594, + "step": 642, + "time_per_iteration": 3.1258397102355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134817, + "balance_loss_mlp": 1.09514427, + "epoch": 0.12370142362447095, + "flos": 540968481792.0, + "grad_norm": 0.07438352262018386, + "language_loss": 0.93366879, + "learning_rate": 0.0009771567943801368, + "loss": 0.94501698, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 0.3972168, + "step": 643, + "time_per_iteration": 2.6720776557922363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149366, + "balance_loss_mlp": 1.10912085, + "epoch": 0.12389380530973451, + "flos": 548128871424.0, + "grad_norm": 0.055730629552303436, + "language_loss": 0.96261084, + "learning_rate": 0.0009770636107643152, + "loss": 0.97410446, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 0.40234375, + "step": 644, + "time_per_iteration": 2.7093722820281982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144915, + "balance_loss_mlp": 1.10734022, + "epoch": 0.12408618699499807, + "flos": 540308828160.0, + "grad_norm": 0.05250459899213186, + "language_loss": 0.92937833, + "learning_rate": 0.0009769702419359738, + "loss": 0.94082749, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 0.37597656, + "step": 645, + "time_per_iteration": 2.661512613296509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173428, + "balance_loss_mlp": 1.13146591, + "epoch": 0.12427856868026164, + "flos": 745792137216.0, + "grad_norm": 0.052890865129340166, + "language_loss": 0.94770992, + "learning_rate": 0.000976876687931362, + "loss": 0.95944417, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 0.41943359, + "step": 646, + "time_per_iteration": 2.972522258758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164317, + "balance_loss_mlp": 1.12555003, + "epoch": 0.1244709503655252, + "flos": 533716687872.0, + "grad_norm": 0.07033761546633982, + "language_loss": 0.91270661, + "learning_rate": 0.0009767829487868005, + "loss": 0.92434984, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 0.38769531, + "step": 647, + "time_per_iteration": 2.6150805950164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164183, + "balance_loss_mlp": 1.12281775, + "epoch": 0.12466333205078876, + "flos": 508099184640.0, + "grad_norm": 0.07269814667774141, + "language_loss": 0.95938772, + "learning_rate": 0.000976689024538682, + "loss": 0.97102952, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 0.41381836, + "step": 648, + "time_per_iteration": 2.6567764282226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154104, + "balance_loss_mlp": 1.11497951, + "epoch": 0.12485571373605232, + "flos": 681345686016.0, + "grad_norm": 0.06659282576896536, + "language_loss": 0.94783676, + "learning_rate": 0.0009765949152234716, + "loss": 0.95937783, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 0.39135742, + "step": 649, + "time_per_iteration": 2.9032628536224365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118823, + "balance_loss_mlp": 1.15084565, + "epoch": 0.1250480954213159, + "flos": 1330159781376.0, + "grad_norm": 0.027365485913225348, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.79874313, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 0.37304688, + "step": 650, + "time_per_iteration": 4.6781816482543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145487, + "balance_loss_mlp": 1.10395491, + "epoch": 0.12524047710657946, + "flos": 938550758400.0, + "grad_norm": 0.07758701561639549, + "language_loss": 0.88880539, + "learning_rate": 0.0009764061415379919, + "loss": 0.90026021, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 0.41552734, + "step": 651, + "time_per_iteration": 3.2588987350463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137454, + "balance_loss_mlp": 1.09766221, + "epoch": 0.12543285879184302, + "flos": 513893279232.0, + "grad_norm": 0.08409279007421946, + "language_loss": 0.94380724, + "learning_rate": 0.0009763114772410109, + "loss": 0.95518184, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 0.39794922, + "step": 652, + "time_per_iteration": 2.5698702335357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112103, + "balance_loss_mlp": 1.08359814, + "epoch": 0.12562524047710658, + "flos": 718328922624.0, + "grad_norm": 0.056536251661147445, + "language_loss": 0.92061114, + "learning_rate": 0.0009762166280235146, + "loss": 0.93182147, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 0.37451172, + "step": 653, + "time_per_iteration": 2.938668966293335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117513, + "balance_loss_mlp": 1.08191729, + "epoch": 0.12581762216237014, + "flos": 563712431616.0, + "grad_norm": 0.0771848817407848, + "language_loss": 0.94092464, + "learning_rate": 0.0009761215939223267, + "loss": 0.95209974, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 0.35644531, + "step": 654, + "time_per_iteration": 2.7028610706329346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102912, + "balance_loss_mlp": 1.06834149, + "epoch": 0.1260100038476337, + "flos": 481893608448.0, + "grad_norm": 0.07424845664771389, + "language_loss": 0.9475044, + "learning_rate": 0.0009760263749743428, + "loss": 0.95853353, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 0.34570312, + "step": 655, + "time_per_iteration": 2.5710902214050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101838, + "balance_loss_mlp": 1.06771994, + "epoch": 0.12620238553289725, + "flos": 575555010048.0, + "grad_norm": 0.053259035011575195, + "language_loss": 0.94285154, + "learning_rate": 0.0009759309712165299, + "loss": 0.95386994, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 0.34130859, + "step": 656, + "time_per_iteration": 2.70626163482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101868, + "balance_loss_mlp": 1.06858444, + "epoch": 0.12639476721816084, + "flos": 531164335104.0, + "grad_norm": 0.0693418830287988, + "language_loss": 0.9812479, + "learning_rate": 0.0009758353826859272, + "loss": 0.99226654, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 0.33300781, + "step": 657, + "time_per_iteration": 2.566787004470825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110312, + "balance_loss_mlp": 1.0663563, + "epoch": 0.1265871489034244, + "flos": 689968917504.0, + "grad_norm": 0.06782991509763603, + "language_loss": 0.96008623, + "learning_rate": 0.0009757396094196456, + "loss": 0.97111744, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 0.36791992, + "step": 658, + "time_per_iteration": 2.8277065753936768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115911, + "balance_loss_mlp": 1.07926583, + "epoch": 0.12677953058868796, + "flos": 537138667008.0, + "grad_norm": 0.053606842709613675, + "language_loss": 0.89398581, + "learning_rate": 0.0009756436514548673, + "loss": 0.90514493, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 0.36645508, + "step": 659, + "time_per_iteration": 2.796175718307495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120986, + "balance_loss_mlp": 1.0811224, + "epoch": 0.12697191227395152, + "flos": 519022577664.0, + "grad_norm": 0.060525818769901533, + "language_loss": 0.92384607, + "learning_rate": 0.0009755475088288466, + "loss": 0.93505597, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 0.39916992, + "step": 660, + "time_per_iteration": 2.678682804107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133341, + "balance_loss_mlp": 1.09271395, + "epoch": 0.12716429395921508, + "flos": 566605808640.0, + "grad_norm": 0.08191197530717065, + "language_loss": 0.958794, + "learning_rate": 0.0009754511815789095, + "loss": 0.97012746, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 0.40600586, + "step": 661, + "time_per_iteration": 2.7371177673339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130898, + "balance_loss_mlp": 1.09093928, + "epoch": 0.12735667564447864, + "flos": 514103251968.0, + "grad_norm": 0.08687138171908054, + "language_loss": 0.92166948, + "learning_rate": 0.0009753546697424533, + "loss": 0.93297845, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 0.39941406, + "step": 662, + "time_per_iteration": 2.704432249069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125889, + "balance_loss_mlp": 1.08700323, + "epoch": 0.1275490573297422, + "flos": 541282341888.0, + "grad_norm": 0.06194581367760624, + "language_loss": 0.95628935, + "learning_rate": 0.0009752579733569475, + "loss": 0.96754825, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 0.38891602, + "step": 663, + "time_per_iteration": 2.682892084121704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165087, + "balance_loss_mlp": 1.1326623, + "epoch": 0.12774143901500576, + "flos": 1558700900352.0, + "grad_norm": 0.0245621431528993, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.76046479, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 0.32421875, + "step": 664, + "time_per_iteration": 4.981603622436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146598, + "balance_loss_mlp": 1.1060189, + "epoch": 0.12793382070026935, + "flos": 613744128000.0, + "grad_norm": 0.07818489478946229, + "language_loss": 0.96962506, + "learning_rate": 0.0009750640270890217, + "loss": 0.98109102, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 0.40576172, + "step": 665, + "time_per_iteration": 2.7139556407928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139516, + "balance_loss_mlp": 1.10115409, + "epoch": 0.1281262023855329, + "flos": 707731499520.0, + "grad_norm": 0.10418725554084544, + "language_loss": 1.02824736, + "learning_rate": 0.0009749667772818983, + "loss": 1.03964257, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 0.38354492, + "step": 666, + "time_per_iteration": 3.000227689743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148195, + "balance_loss_mlp": 1.11481678, + "epoch": 0.12831858407079647, + "flos": 1425034404864.0, + "grad_norm": 0.027847994605201966, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.78084135, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 0.33398438, + "step": 667, + "time_per_iteration": 4.858838319778442 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161858, + "balance_loss_mlp": 1.1255703, + "epoch": 0.12851096575606002, + "flos": 449098463232.0, + "grad_norm": 0.0747922247275706, + "language_loss": 1.00932169, + "learning_rate": 0.0009747717245101093, + "loss": 1.0209403, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 0.36303711, + "step": 668, + "time_per_iteration": 2.4917514324188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172854, + "balance_loss_mlp": 1.13518405, + "epoch": 0.12870334744132358, + "flos": 479939240448.0, + "grad_norm": 0.0795363237311063, + "language_loss": 0.91087645, + "learning_rate": 0.00097467392162117, + "loss": 0.92260504, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 0.37719727, + "step": 669, + "time_per_iteration": 2.601151466369629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196603, + "balance_loss_mlp": 1.15540457, + "epoch": 0.12889572912658714, + "flos": 638936543232.0, + "grad_norm": 0.0744221392925499, + "language_loss": 0.95630497, + "learning_rate": 0.0009745759344474708, + "loss": 0.96827102, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 0.41162109, + "step": 670, + "time_per_iteration": 2.878068447113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200159, + "balance_loss_mlp": 1.16012812, + "epoch": 0.1290881108118507, + "flos": 509944896000.0, + "grad_norm": 0.07162427386273244, + "language_loss": 0.95158428, + "learning_rate": 0.0009744777630270536, + "loss": 0.96358585, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 0.40063477, + "step": 671, + "time_per_iteration": 2.5778517723083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220294, + "balance_loss_mlp": 1.17752171, + "epoch": 0.12928049249711426, + "flos": 671054782464.0, + "grad_norm": 0.07459259564874297, + "language_loss": 0.99775112, + "learning_rate": 0.000974379407398032, + "loss": 1.00995398, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 0.42797852, + "step": 672, + "time_per_iteration": 2.862168073654175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191716, + "balance_loss_mlp": 1.15175724, + "epoch": 0.12947287418237785, + "flos": 793525870080.0, + "grad_norm": 0.05795101219152752, + "language_loss": 0.86696863, + "learning_rate": 0.0009742808675985913, + "loss": 0.87888587, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 0.3996582, + "step": 673, + "time_per_iteration": 3.0987160205841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011832, + "balance_loss_mlp": 1.14142871, + "epoch": 0.1296652558676414, + "flos": 485466462720.0, + "grad_norm": 0.06292984682523013, + "language_loss": 0.96893597, + "learning_rate": 0.0009741821436669876, + "loss": 0.98076797, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 0.41772461, + "step": 674, + "time_per_iteration": 2.565317153930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160814, + "balance_loss_mlp": 1.12123656, + "epoch": 0.12985763755290497, + "flos": 453459451392.0, + "grad_norm": 0.07127578315040689, + "language_loss": 0.99621803, + "learning_rate": 0.0009740832356415492, + "loss": 1.00782621, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 0.39550781, + "step": 675, + "time_per_iteration": 2.4777724742889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144275, + "balance_loss_mlp": 1.10538852, + "epoch": 0.13005001923816853, + "flos": 825061178880.0, + "grad_norm": 0.07563598794059366, + "language_loss": 0.94837546, + "learning_rate": 0.0009739841435606756, + "loss": 0.95981824, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 0.38867188, + "step": 676, + "time_per_iteration": 2.9838767051696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131112, + "balance_loss_mlp": 1.09186864, + "epoch": 0.1302424009234321, + "flos": 531381648384.0, + "grad_norm": 0.06693149578557214, + "language_loss": 0.94293654, + "learning_rate": 0.0009738848674628377, + "loss": 0.95424765, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 0.39233398, + "step": 677, + "time_per_iteration": 2.7052054405212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130656, + "balance_loss_mlp": 1.0923903, + "epoch": 0.13043478260869565, + "flos": 525884161536.0, + "grad_norm": 0.05501746541124835, + "language_loss": 0.94784498, + "learning_rate": 0.000973785407386578, + "loss": 0.95915151, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 0.38232422, + "step": 678, + "time_per_iteration": 2.7535152435302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137257, + "balance_loss_mlp": 1.09727383, + "epoch": 0.1306271642939592, + "flos": 626172208128.0, + "grad_norm": 0.05430769504454563, + "language_loss": 0.91185606, + "learning_rate": 0.0009736857633705103, + "loss": 0.92322862, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 0.3996582, + "step": 679, + "time_per_iteration": 2.8686013221740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135266, + "balance_loss_mlp": 1.09575987, + "epoch": 0.13081954597922277, + "flos": 550718300160.0, + "grad_norm": 0.06387426976514826, + "language_loss": 0.97335434, + "learning_rate": 0.0009735859354533196, + "loss": 0.984707, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 0.39501953, + "step": 680, + "time_per_iteration": 2.6952273845672607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136459, + "balance_loss_mlp": 1.09626174, + "epoch": 0.13101192766448633, + "flos": 536911441920.0, + "grad_norm": 0.07637025474680663, + "language_loss": 0.97434723, + "learning_rate": 0.0009734859236737628, + "loss": 0.98571181, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 0.40185547, + "step": 681, + "time_per_iteration": 2.607431173324585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136827, + "balance_loss_mlp": 1.09720194, + "epoch": 0.13120430934974991, + "flos": 503508400128.0, + "grad_norm": 0.06515090437153119, + "language_loss": 0.9831785, + "learning_rate": 0.0009733857280706678, + "loss": 0.99454683, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 0.39599609, + "step": 682, + "time_per_iteration": 2.5730957984924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140739, + "balance_loss_mlp": 1.1007328, + "epoch": 0.13139669103501347, + "flos": 614295124992.0, + "grad_norm": 0.08408851923922504, + "language_loss": 0.89817083, + "learning_rate": 0.000973285348682934, + "loss": 0.90957826, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 0.39990234, + "step": 683, + "time_per_iteration": 2.7041609287261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120077, + "balance_loss_mlp": 1.08460057, + "epoch": 0.13158907272027703, + "flos": 1484971564032.0, + "grad_norm": 0.021197399820989362, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.7901845, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 0.35546875, + "step": 684, + "time_per_iteration": 4.7803051471710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145399, + "balance_loss_mlp": 1.10579789, + "epoch": 0.1317814544055406, + "flos": 985461852672.0, + "grad_norm": 0.06796914093678033, + "language_loss": 0.90116858, + "learning_rate": 0.0009730840387095046, + "loss": 0.91262257, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 0.39575195, + "step": 685, + "time_per_iteration": 3.289513111114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154301, + "balance_loss_mlp": 1.11412716, + "epoch": 0.13197383609080415, + "flos": 611456076288.0, + "grad_norm": 0.0690044047280534, + "language_loss": 0.95956922, + "learning_rate": 0.0009729831082019642, + "loss": 0.97111225, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 0.40185547, + "step": 686, + "time_per_iteration": 2.8214356899261475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131343, + "balance_loss_mlp": 1.09383941, + "epoch": 0.1321662177760677, + "flos": 494403181056.0, + "grad_norm": 0.08080780289155233, + "language_loss": 0.93596351, + "learning_rate": 0.0009728819940660958, + "loss": 0.94727689, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 0.375, + "step": 687, + "time_per_iteration": 2.749385118484497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011246, + "balance_loss_mlp": 1.08542764, + "epoch": 0.13235859946133127, + "flos": 495841430016.0, + "grad_norm": 0.08853955851107219, + "language_loss": 0.91695315, + "learning_rate": 0.0009727806963411557, + "loss": 0.92819917, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 0.39135742, + "step": 688, + "time_per_iteration": 2.592099666595459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128329, + "balance_loss_mlp": 1.08777368, + "epoch": 0.13255098114659483, + "flos": 511686720000.0, + "grad_norm": 0.06370494383790047, + "language_loss": 0.92130053, + "learning_rate": 0.000972679215066471, + "loss": 0.93258381, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 0.40551758, + "step": 689, + "time_per_iteration": 2.7344043254852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114145, + "balance_loss_mlp": 1.10246885, + "epoch": 0.13274336283185842, + "flos": 547370472960.0, + "grad_norm": 0.08478699193898473, + "language_loss": 1.04583168, + "learning_rate": 0.0009725775502814401, + "loss": 1.05724621, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 0.3894043, + "step": 690, + "time_per_iteration": 2.5881311893463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155383, + "balance_loss_mlp": 1.1147325, + "epoch": 0.13293574451712198, + "flos": 640772342784.0, + "grad_norm": 0.07994389842197654, + "language_loss": 0.90077579, + "learning_rate": 0.0009724757020255327, + "loss": 0.91232961, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 0.40649414, + "step": 691, + "time_per_iteration": 2.8452539443969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164889, + "balance_loss_mlp": 1.12566948, + "epoch": 0.13312812620238554, + "flos": 491480441856.0, + "grad_norm": 0.09039906445052394, + "language_loss": 0.91914684, + "learning_rate": 0.0009723736703382902, + "loss": 0.93079573, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 0.39208984, + "step": 692, + "time_per_iteration": 2.5472824573516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198661, + "balance_loss_mlp": 1.15557849, + "epoch": 0.1333205078876491, + "flos": 508944218112.0, + "grad_norm": 0.07689546631051256, + "language_loss": 0.86461794, + "learning_rate": 0.0009722714552593244, + "loss": 0.87660456, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 0.4309082, + "step": 693, + "time_per_iteration": 2.6273465156555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199876, + "balance_loss_mlp": 1.15560198, + "epoch": 0.13351288957291266, + "flos": 418697455104.0, + "grad_norm": 0.08142665414192346, + "language_loss": 1.00438499, + "learning_rate": 0.000972169056828319, + "loss": 1.01638389, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 0.44262695, + "step": 694, + "time_per_iteration": 2.477491617202759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221806, + "balance_loss_mlp": 1.17741275, + "epoch": 0.13370527125817622, + "flos": 615901128192.0, + "grad_norm": 0.07001491486919184, + "language_loss": 0.90590984, + "learning_rate": 0.0009720664750850283, + "loss": 0.91812789, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 0.4440918, + "step": 695, + "time_per_iteration": 2.7817704677581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209285, + "balance_loss_mlp": 1.16870594, + "epoch": 0.13389765294343978, + "flos": 626038958592.0, + "grad_norm": 0.07077521288835904, + "language_loss": 0.97240067, + "learning_rate": 0.0009719637100692784, + "loss": 0.98449349, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 0.40625, + "step": 696, + "time_per_iteration": 2.7099833488464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214589, + "balance_loss_mlp": 1.17069626, + "epoch": 0.13409003462870334, + "flos": 609691857408.0, + "grad_norm": 0.06395797985697109, + "language_loss": 0.87399805, + "learning_rate": 0.0009718607618209661, + "loss": 0.88614392, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 0.43896484, + "step": 697, + "time_per_iteration": 2.8280160427093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226846, + "balance_loss_mlp": 1.18445516, + "epoch": 0.13428241631396692, + "flos": 683816546304.0, + "grad_norm": 0.08853583224950028, + "language_loss": 0.91527486, + "learning_rate": 0.0009717576303800595, + "loss": 0.92754334, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 0.42382812, + "step": 698, + "time_per_iteration": 3.0102553367614746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206724, + "balance_loss_mlp": 1.16385674, + "epoch": 0.13447479799923048, + "flos": 508815737856.0, + "grad_norm": 0.07140979809376953, + "language_loss": 0.90443981, + "learning_rate": 0.0009716543157865975, + "loss": 0.91650712, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 0.4284668, + "step": 699, + "time_per_iteration": 2.713191509246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192988, + "balance_loss_mlp": 1.15047789, + "epoch": 0.13466717968449404, + "flos": 897510481920.0, + "grad_norm": 0.0971528894423257, + "language_loss": 0.87731719, + "learning_rate": 0.0009715508180806907, + "loss": 0.88924706, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 0.42504883, + "step": 700, + "time_per_iteration": 3.183608055114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164502, + "balance_loss_mlp": 1.12189686, + "epoch": 0.1348595613697576, + "flos": 989938838016.0, + "grad_norm": 0.07253928509691168, + "language_loss": 0.94940412, + "learning_rate": 0.0009714471373025202, + "loss": 0.96104908, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 0.42578125, + "step": 701, + "time_per_iteration": 3.4071736335754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154491, + "balance_loss_mlp": 1.10978746, + "epoch": 0.13505194305502116, + "flos": 487826095104.0, + "grad_norm": 0.07349692890686976, + "language_loss": 0.93387866, + "learning_rate": 0.0009713432734923386, + "loss": 0.94542348, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 0.44702148, + "step": 702, + "time_per_iteration": 2.61545467376709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149917, + "balance_loss_mlp": 1.10523736, + "epoch": 0.13524432474028472, + "flos": 613385851392.0, + "grad_norm": 0.07475145021416552, + "language_loss": 0.90919894, + "learning_rate": 0.0009712392266904696, + "loss": 0.92069811, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 0.44702148, + "step": 703, + "time_per_iteration": 2.739295482635498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156115, + "balance_loss_mlp": 1.11219811, + "epoch": 0.13543670642554828, + "flos": 904794582528.0, + "grad_norm": 0.09690331363255131, + "language_loss": 0.90325272, + "learning_rate": 0.0009711349969373076, + "loss": 0.91481388, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 0.43945312, + "step": 704, + "time_per_iteration": 3.1653053760528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175158, + "balance_loss_mlp": 1.12780786, + "epoch": 0.13562908811081184, + "flos": 550616984064.0, + "grad_norm": 0.09111648779989767, + "language_loss": 0.84997714, + "learning_rate": 0.0009710305842733178, + "loss": 0.86172873, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 0.47314453, + "step": 705, + "time_per_iteration": 2.7402727603912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117117, + "balance_loss_mlp": 1.12737262, + "epoch": 0.1358214697960754, + "flos": 508044856320.0, + "grad_norm": 0.10189351673448747, + "language_loss": 0.9379847, + "learning_rate": 0.0009709259887390373, + "loss": 0.94969636, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 0.43774414, + "step": 706, + "time_per_iteration": 2.5640039443969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147043, + "balance_loss_mlp": 1.10467625, + "epoch": 0.136013851481339, + "flos": 528896107008.0, + "grad_norm": 0.07946562356881365, + "language_loss": 0.95178437, + "learning_rate": 0.0009708212103750737, + "loss": 0.96325481, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 0.42382812, + "step": 707, + "time_per_iteration": 2.6138036251068115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153312, + "balance_loss_mlp": 1.1095618, + "epoch": 0.13620623316660255, + "flos": 659081152512.0, + "grad_norm": 0.07708082078191984, + "language_loss": 0.91549516, + "learning_rate": 0.0009707162492221051, + "loss": 0.9270283, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 0.43725586, + "step": 708, + "time_per_iteration": 2.879612684249878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143626, + "balance_loss_mlp": 1.10121179, + "epoch": 0.1363986148518661, + "flos": 671882563584.0, + "grad_norm": 0.08764140181907645, + "language_loss": 0.92509496, + "learning_rate": 0.0009706111053208815, + "loss": 0.93653119, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 0.42431641, + "step": 709, + "time_per_iteration": 2.804469347000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156089, + "balance_loss_mlp": 1.10947847, + "epoch": 0.13659099653712967, + "flos": 473062975488.0, + "grad_norm": 0.07097269092186763, + "language_loss": 0.89579999, + "learning_rate": 0.0009705057787122232, + "loss": 0.90736091, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 0.46630859, + "step": 710, + "time_per_iteration": 2.568406105041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174212, + "balance_loss_mlp": 1.12874603, + "epoch": 0.13678337822239323, + "flos": 452715734016.0, + "grad_norm": 0.06463299548184855, + "language_loss": 0.94250202, + "learning_rate": 0.0009704002694370216, + "loss": 0.9542442, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 0.45410156, + "step": 711, + "time_per_iteration": 2.525240659713745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116324, + "balance_loss_mlp": 1.11820245, + "epoch": 0.13697575990765679, + "flos": 519623133696.0, + "grad_norm": 0.06677275778781674, + "language_loss": 0.90675253, + "learning_rate": 0.0009702945775362388, + "loss": 0.91838491, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 0.45043945, + "step": 712, + "time_per_iteration": 2.572566270828247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171995, + "balance_loss_mlp": 1.12478852, + "epoch": 0.13716814159292035, + "flos": 480388921344.0, + "grad_norm": 0.06549167744569931, + "language_loss": 0.91151595, + "learning_rate": 0.0009701887030509086, + "loss": 0.92323589, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 0.47167969, + "step": 713, + "time_per_iteration": 2.645202875137329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156874, + "balance_loss_mlp": 1.11450684, + "epoch": 0.1373605232781839, + "flos": 545650670592.0, + "grad_norm": 0.07696267649297317, + "language_loss": 0.95333648, + "learning_rate": 0.0009700826460221346, + "loss": 0.96490526, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 0.42382812, + "step": 714, + "time_per_iteration": 2.649831771850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187773, + "balance_loss_mlp": 1.13980293, + "epoch": 0.1375529049634475, + "flos": 708791648256.0, + "grad_norm": 0.08597126409557068, + "language_loss": 0.96336859, + "learning_rate": 0.0009699764064910921, + "loss": 0.97524625, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 0.47998047, + "step": 715, + "time_per_iteration": 2.8645238876342773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178216, + "balance_loss_mlp": 1.1317718, + "epoch": 0.13774528664871105, + "flos": 486696936960.0, + "grad_norm": 0.08366808602410432, + "language_loss": 0.90892398, + "learning_rate": 0.0009698699844990268, + "loss": 0.92070615, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 0.46435547, + "step": 716, + "time_per_iteration": 2.635460376739502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171183, + "balance_loss_mlp": 1.12731409, + "epoch": 0.1379376683339746, + "flos": 680199275520.0, + "grad_norm": 0.051528021496160425, + "language_loss": 0.91132116, + "learning_rate": 0.0009697633800872555, + "loss": 0.923033, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 0.4387207, + "step": 717, + "time_per_iteration": 2.887854814529419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189757, + "balance_loss_mlp": 1.1432178, + "epoch": 0.13813005001923817, + "flos": 610946924544.0, + "grad_norm": 0.07388540586481528, + "language_loss": 0.94422555, + "learning_rate": 0.0009696565932971655, + "loss": 0.95612311, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 0.46557617, + "step": 718, + "time_per_iteration": 2.8565313816070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171127, + "balance_loss_mlp": 1.12580407, + "epoch": 0.13832243170450173, + "flos": 588729378816.0, + "grad_norm": 0.06166568969162735, + "language_loss": 0.92794299, + "learning_rate": 0.0009695496241702153, + "loss": 0.93965423, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 0.45361328, + "step": 719, + "time_per_iteration": 2.827193021774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178611, + "balance_loss_mlp": 1.13152349, + "epoch": 0.1385148133897653, + "flos": 700002860544.0, + "grad_norm": 0.07046673128739296, + "language_loss": 0.8903814, + "learning_rate": 0.0009694424727479339, + "loss": 0.9021675, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 0.47094727, + "step": 720, + "time_per_iteration": 2.958855628967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011684, + "balance_loss_mlp": 1.12150323, + "epoch": 0.13870719507502885, + "flos": 598254543360.0, + "grad_norm": 0.07332050167219753, + "language_loss": 0.91946507, + "learning_rate": 0.0009693351390719213, + "loss": 0.93114913, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 0.46899414, + "step": 721, + "time_per_iteration": 2.6910197734832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012083, + "balance_loss_mlp": 1.15742183, + "epoch": 0.1388995767602924, + "flos": 586572378624.0, + "grad_norm": 0.06188248769550966, + "language_loss": 0.93531096, + "learning_rate": 0.000969227623183848, + "loss": 0.94739395, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 0.50830078, + "step": 722, + "time_per_iteration": 2.791097640991211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119483, + "balance_loss_mlp": 1.14776587, + "epoch": 0.139091958445556, + "flos": 651120145920.0, + "grad_norm": 0.06666345220966835, + "language_loss": 0.93550557, + "learning_rate": 0.0009691199251254554, + "loss": 0.94745386, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 0.47045898, + "step": 723, + "time_per_iteration": 2.8282151222229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173107, + "balance_loss_mlp": 1.13059711, + "epoch": 0.13928434013081956, + "flos": 575737818624.0, + "grad_norm": 0.07191970231420823, + "language_loss": 0.88703346, + "learning_rate": 0.0009690120449385555, + "loss": 0.89876461, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 0.42504883, + "step": 724, + "time_per_iteration": 2.775456190109253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158197, + "balance_loss_mlp": 1.11332655, + "epoch": 0.13947672181608312, + "flos": 563225674752.0, + "grad_norm": 0.06680700276551169, + "language_loss": 0.95181078, + "learning_rate": 0.0009689039826650312, + "loss": 0.96339279, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 0.44824219, + "step": 725, + "time_per_iteration": 2.7623417377471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164951, + "balance_loss_mlp": 1.12756717, + "epoch": 0.13966910350134668, + "flos": 1521546964992.0, + "grad_norm": 0.03995326528410751, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.77688015, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 0.37304688, + "step": 726, + "time_per_iteration": 4.914167642593384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146003, + "balance_loss_mlp": 1.09567261, + "epoch": 0.13986148518661023, + "flos": 499854053376.0, + "grad_norm": 0.07822541163530779, + "language_loss": 0.90488958, + "learning_rate": 0.0009686873120259941, + "loss": 0.91634959, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 0.50341797, + "step": 727, + "time_per_iteration": 2.563333749771118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132914, + "balance_loss_mlp": 1.09092879, + "epoch": 0.1400538668718738, + "flos": 598674488832.0, + "grad_norm": 0.0725242002086287, + "language_loss": 0.89161742, + "learning_rate": 0.0009685787037446004, + "loss": 0.90294659, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 0.41992188, + "step": 728, + "time_per_iteration": 2.7803192138671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137226, + "balance_loss_mlp": 1.09192598, + "epoch": 0.14024624855713735, + "flos": 594039287808.0, + "grad_norm": 0.10183800223701604, + "language_loss": 0.9064362, + "learning_rate": 0.0009684699135448201, + "loss": 0.91780847, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 0.453125, + "step": 729, + "time_per_iteration": 2.750023603439331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142476, + "balance_loss_mlp": 1.0995841, + "epoch": 0.1404386302424009, + "flos": 506584585728.0, + "grad_norm": 0.06503689668024501, + "language_loss": 0.94054115, + "learning_rate": 0.0009683609414688895, + "loss": 0.95196593, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 0.42895508, + "step": 730, + "time_per_iteration": 2.708470344543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116576, + "balance_loss_mlp": 1.11652613, + "epoch": 0.14063101192766447, + "flos": 573407921664.0, + "grad_norm": 0.07277464462784268, + "language_loss": 0.89072424, + "learning_rate": 0.0009682517875591154, + "loss": 0.9023819, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 0.49243164, + "step": 731, + "time_per_iteration": 2.734145402908325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173563, + "balance_loss_mlp": 1.12640429, + "epoch": 0.14082339361292806, + "flos": 564619133952.0, + "grad_norm": 0.08810260071203486, + "language_loss": 0.88790858, + "learning_rate": 0.0009681424518578749, + "loss": 0.8996442, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 0.47192383, + "step": 732, + "time_per_iteration": 2.7139203548431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166119, + "balance_loss_mlp": 1.11900759, + "epoch": 0.14101577529819162, + "flos": 463584798720.0, + "grad_norm": 0.07053265121681873, + "language_loss": 0.9010576, + "learning_rate": 0.000968032934407616, + "loss": 0.91271877, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 0.47143555, + "step": 733, + "time_per_iteration": 2.625128746032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161975, + "balance_loss_mlp": 1.11514974, + "epoch": 0.14120815698345518, + "flos": 596085060096.0, + "grad_norm": 0.08143861058365946, + "language_loss": 0.84579933, + "learning_rate": 0.0009679232352508571, + "loss": 0.85741913, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 0.46850586, + "step": 734, + "time_per_iteration": 2.7461798191070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145124, + "balance_loss_mlp": 1.10046864, + "epoch": 0.14140053866871874, + "flos": 535137311232.0, + "grad_norm": 0.0788084271092868, + "language_loss": 0.83272535, + "learning_rate": 0.0009678133544301871, + "loss": 0.84417665, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 0.44677734, + "step": 735, + "time_per_iteration": 2.68129301071167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130971, + "balance_loss_mlp": 1.08731616, + "epoch": 0.1415929203539823, + "flos": 520265534976.0, + "grad_norm": 0.05044431767963513, + "language_loss": 0.93706036, + "learning_rate": 0.0009677032919882658, + "loss": 0.94837004, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 0.43652344, + "step": 736, + "time_per_iteration": 2.663874387741089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141167, + "balance_loss_mlp": 1.0970124, + "epoch": 0.14178530203924586, + "flos": 482335948800.0, + "grad_norm": 0.07155994363363784, + "language_loss": 0.94151366, + "learning_rate": 0.000967593047967823, + "loss": 0.95292532, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 0.44116211, + "step": 737, + "time_per_iteration": 2.512871265411377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150426, + "balance_loss_mlp": 1.10376751, + "epoch": 0.14197768372450942, + "flos": 676638904320.0, + "grad_norm": 0.07145762863961741, + "language_loss": 0.89657855, + "learning_rate": 0.0009674826224116593, + "loss": 0.90808284, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 0.46655273, + "step": 738, + "time_per_iteration": 2.797337293624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145365, + "balance_loss_mlp": 1.09865868, + "epoch": 0.14217006540977298, + "flos": 446039529984.0, + "grad_norm": 0.07589062836694223, + "language_loss": 0.89765012, + "learning_rate": 0.0009673720153626455, + "loss": 0.90910375, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 0.46728516, + "step": 739, + "time_per_iteration": 2.5743062496185303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113843, + "balance_loss_mlp": 1.09274864, + "epoch": 0.14236244709503657, + "flos": 496503654912.0, + "grad_norm": 0.07239717331604524, + "language_loss": 0.89863205, + "learning_rate": 0.0009672612268637235, + "loss": 0.9100163, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 0.45678711, + "step": 740, + "time_per_iteration": 2.6074059009552 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125723, + "balance_loss_mlp": 1.08125818, + "epoch": 0.14255482878030012, + "flos": 648313403904.0, + "grad_norm": 0.08552249660547784, + "language_loss": 0.8725301, + "learning_rate": 0.0009671502569579048, + "loss": 0.88378727, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 0.44458008, + "step": 741, + "time_per_iteration": 2.729733467102051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116563, + "balance_loss_mlp": 1.07338512, + "epoch": 0.14274721046556368, + "flos": 536165153280.0, + "grad_norm": 0.05753110737252733, + "language_loss": 0.92330521, + "learning_rate": 0.0009670391056882719, + "loss": 0.93447083, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 0.43188477, + "step": 742, + "time_per_iteration": 2.69399356842041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115871, + "balance_loss_mlp": 1.07367063, + "epoch": 0.14293959215082724, + "flos": 957057431040.0, + "grad_norm": 0.06711892894426404, + "language_loss": 0.91615599, + "learning_rate": 0.0009669277730979776, + "loss": 0.92731464, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 0.421875, + "step": 743, + "time_per_iteration": 3.1732802391052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123454, + "balance_loss_mlp": 1.079561, + "epoch": 0.1431319738360908, + "flos": 693089519616.0, + "grad_norm": 0.07488288596065623, + "language_loss": 0.88249421, + "learning_rate": 0.0009668162592302449, + "loss": 0.89372879, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 0.43896484, + "step": 744, + "time_per_iteration": 2.88962459564209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114179, + "balance_loss_mlp": 1.09551311, + "epoch": 0.14332435552135436, + "flos": 565439574528.0, + "grad_norm": 0.08170086657731683, + "language_loss": 0.8873378, + "learning_rate": 0.0009667045641283676, + "loss": 0.89875567, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 0.46289062, + "step": 745, + "time_per_iteration": 2.6374380588531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136601, + "balance_loss_mlp": 1.09158731, + "epoch": 0.14351673720661792, + "flos": 738374787072.0, + "grad_norm": 0.07376324969806651, + "language_loss": 0.9752661, + "learning_rate": 0.0009665926878357092, + "loss": 0.98663211, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 0.44995117, + "step": 746, + "time_per_iteration": 2.908377170562744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138589, + "balance_loss_mlp": 1.09283662, + "epoch": 0.14370911889188148, + "flos": 549230865408.0, + "grad_norm": 0.055840413500964095, + "language_loss": 0.93229979, + "learning_rate": 0.0009664806303957043, + "loss": 0.94368571, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 0.45751953, + "step": 747, + "time_per_iteration": 2.6940197944641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116452, + "balance_loss_mlp": 1.11397541, + "epoch": 0.14390150057714507, + "flos": 590295734784.0, + "grad_norm": 0.07422855656653271, + "language_loss": 0.89923358, + "learning_rate": 0.0009663683918518571, + "loss": 0.91087878, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 0.50463867, + "step": 748, + "time_per_iteration": 2.8905599117279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162632, + "balance_loss_mlp": 1.10977423, + "epoch": 0.14409388226240863, + "flos": 591047165952.0, + "grad_norm": 0.06951396400432043, + "language_loss": 0.88074797, + "learning_rate": 0.0009662559722477428, + "loss": 0.89237428, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 0.52880859, + "step": 749, + "time_per_iteration": 2.6684882640838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111749, + "balance_loss_mlp": 1.09059644, + "epoch": 0.1442862639476722, + "flos": 1511263401984.0, + "grad_norm": 0.031134761916572575, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77280462, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 0.26953125, + "step": 750, + "time_per_iteration": 4.978729009628296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141518, + "balance_loss_mlp": 1.09359622, + "epoch": 0.14447864563293575, + "flos": 496765384704.0, + "grad_norm": 0.06451546089111031, + "language_loss": 0.9124738, + "learning_rate": 0.0009660305900333632, + "loss": 0.92388898, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 0.47973633, + "step": 751, + "time_per_iteration": 2.6556403636932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145582, + "balance_loss_mlp": 1.09849465, + "epoch": 0.1446710273181993, + "flos": 589678299648.0, + "grad_norm": 0.08083819383046088, + "language_loss": 0.8480792, + "learning_rate": 0.0009659176275105992, + "loss": 0.85953498, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 0.47070312, + "step": 752, + "time_per_iteration": 2.6868016719818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154886, + "balance_loss_mlp": 1.10667825, + "epoch": 0.14486340900346287, + "flos": 585818749440.0, + "grad_norm": 0.0601727082776222, + "language_loss": 0.87400204, + "learning_rate": 0.0009658044841025701, + "loss": 0.88555086, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 0.48217773, + "step": 753, + "time_per_iteration": 2.7701456546783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189813, + "balance_loss_mlp": 1.136765, + "epoch": 0.14505579068872643, + "flos": 504672062976.0, + "grad_norm": 0.0800468655776831, + "language_loss": 0.83957088, + "learning_rate": 0.0009656911598532021, + "loss": 0.85146904, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 0.53051758, + "step": 754, + "time_per_iteration": 2.630211353302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192149, + "balance_loss_mlp": 1.13943434, + "epoch": 0.14524817237399, + "flos": 486815505408.0, + "grad_norm": 0.0631545589319864, + "language_loss": 0.9278729, + "learning_rate": 0.0009655776548064917, + "loss": 0.93979442, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 0.52758789, + "step": 755, + "time_per_iteration": 2.6447510719299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176758, + "balance_loss_mlp": 1.12506902, + "epoch": 0.14544055405925355, + "flos": 728175287808.0, + "grad_norm": 0.06497808848967317, + "language_loss": 0.90460694, + "learning_rate": 0.0009654639690065054, + "loss": 0.91637456, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 0.51708984, + "step": 756, + "time_per_iteration": 2.910578727722168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116602, + "balance_loss_mlp": 1.11785972, + "epoch": 0.14563293574451713, + "flos": 593643935232.0, + "grad_norm": 0.0580393303136577, + "language_loss": 0.90340179, + "learning_rate": 0.00096535010249738, + "loss": 0.91506201, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 0.48120117, + "step": 757, + "time_per_iteration": 2.7232277393341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149847, + "balance_loss_mlp": 1.10092402, + "epoch": 0.1458253174297807, + "flos": 560478030336.0, + "grad_norm": 0.07370663524734816, + "language_loss": 0.8531146, + "learning_rate": 0.0009652360553233224, + "loss": 0.86461306, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 0.48901367, + "step": 758, + "time_per_iteration": 2.7501397132873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064081, + "balance_loss_mlp": 1.03528047, + "epoch": 0.14601769911504425, + "flos": 1557855866880.0, + "grad_norm": 0.02263224740377231, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.74837828, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 0.28710938, + "step": 759, + "time_per_iteration": 4.953639268875122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150341, + "balance_loss_mlp": 1.1019187, + "epoch": 0.1462100808003078, + "flos": 866301516288.0, + "grad_norm": 0.05750780582661247, + "language_loss": 0.83513778, + "learning_rate": 0.0009650074191575883, + "loss": 0.84664118, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 0.48388672, + "step": 760, + "time_per_iteration": 3.202252149581909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152626, + "balance_loss_mlp": 1.10179496, + "epoch": 0.14640246248557137, + "flos": 522943796736.0, + "grad_norm": 0.05303129095981597, + "language_loss": 0.88240772, + "learning_rate": 0.0009648928302546766, + "loss": 0.89393395, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 0.50878906, + "step": 761, + "time_per_iteration": 2.65380859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147286, + "balance_loss_mlp": 1.09960222, + "epoch": 0.14659484417083493, + "flos": 1030544487936.0, + "grad_norm": 0.06114398209353547, + "language_loss": 0.87573165, + "learning_rate": 0.0009647780608643613, + "loss": 0.88720453, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 0.47705078, + "step": 762, + "time_per_iteration": 3.3394339084625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153999, + "balance_loss_mlp": 1.10831833, + "epoch": 0.1467872258560985, + "flos": 500671922688.0, + "grad_norm": 0.09093438426480749, + "language_loss": 0.90765309, + "learning_rate": 0.0009646631110312001, + "loss": 0.91919315, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 0.45678711, + "step": 763, + "time_per_iteration": 2.622671604156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157806, + "balance_loss_mlp": 1.11200595, + "epoch": 0.14697960754136205, + "flos": 547797758976.0, + "grad_norm": 0.047784585244551814, + "language_loss": 0.90468627, + "learning_rate": 0.0009645479807998203, + "loss": 0.91626436, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 0.45751953, + "step": 764, + "time_per_iteration": 2.7322580814361572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156125, + "balance_loss_mlp": 1.11487842, + "epoch": 0.14717198922662564, + "flos": 517849003008.0, + "grad_norm": 0.06523928090243644, + "language_loss": 0.94106412, + "learning_rate": 0.0009644326702149196, + "loss": 0.95262539, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 0.41235352, + "step": 765, + "time_per_iteration": 2.7013158798217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174535, + "balance_loss_mlp": 1.12761474, + "epoch": 0.1473643709118892, + "flos": 732024552960.0, + "grad_norm": 0.08055574364553787, + "language_loss": 0.86730242, + "learning_rate": 0.0009643171793212653, + "loss": 0.87904775, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 0.46923828, + "step": 766, + "time_per_iteration": 3.083709478378296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162662, + "balance_loss_mlp": 1.11473966, + "epoch": 0.14755675259715276, + "flos": 620538900480.0, + "grad_norm": 0.07722330054572468, + "language_loss": 0.92188174, + "learning_rate": 0.0009642015081636952, + "loss": 0.93350834, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 0.47949219, + "step": 767, + "time_per_iteration": 2.6836585998535156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161372, + "balance_loss_mlp": 1.1132586, + "epoch": 0.14774913428241632, + "flos": 452219065344.0, + "grad_norm": 0.07123168873353844, + "language_loss": 0.90995437, + "learning_rate": 0.0009640856567871166, + "loss": 0.9215681, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 0.48168945, + "step": 768, + "time_per_iteration": 2.543670177459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156907, + "balance_loss_mlp": 1.10626745, + "epoch": 0.14794151596767988, + "flos": 837234869760.0, + "grad_norm": 0.07039727350928661, + "language_loss": 0.9123286, + "learning_rate": 0.0009639696252365072, + "loss": 0.92389768, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 0.50634766, + "step": 769, + "time_per_iteration": 3.027188539505005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146634, + "balance_loss_mlp": 1.10326576, + "epoch": 0.14813389765294344, + "flos": 686092114944.0, + "grad_norm": 0.06094559984807647, + "language_loss": 0.83659029, + "learning_rate": 0.0009638534135569144, + "loss": 0.84805667, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 0.43359375, + "step": 770, + "time_per_iteration": 2.9126267433166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140788, + "balance_loss_mlp": 1.09489226, + "epoch": 0.148326279338207, + "flos": 509887996416.0, + "grad_norm": 0.06702358278695762, + "language_loss": 0.92293191, + "learning_rate": 0.0009637370217934554, + "loss": 0.93433982, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 0.45922852, + "step": 771, + "time_per_iteration": 2.6426541805267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129633, + "balance_loss_mlp": 1.08600211, + "epoch": 0.14851866102347056, + "flos": 588161129472.0, + "grad_norm": 0.04968709901212579, + "language_loss": 0.84857935, + "learning_rate": 0.0009636204499913175, + "loss": 0.85987568, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 0.43603516, + "step": 772, + "time_per_iteration": 2.830029010772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122236, + "balance_loss_mlp": 1.08478057, + "epoch": 0.14871104270873411, + "flos": 691026494976.0, + "grad_norm": 0.06444605868824185, + "language_loss": 0.90028566, + "learning_rate": 0.0009635036981957581, + "loss": 0.91150796, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 0.37451172, + "step": 773, + "time_per_iteration": 2.850893259048462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128047, + "balance_loss_mlp": 1.08546507, + "epoch": 0.1489034243939977, + "flos": 655098264576.0, + "grad_norm": 0.07558916443605426, + "language_loss": 0.92137265, + "learning_rate": 0.0009633867664521043, + "loss": 0.93265319, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 0.42553711, + "step": 774, + "time_per_iteration": 2.8405416011810303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154281, + "balance_loss_mlp": 1.10614467, + "epoch": 0.14909580607926126, + "flos": 475835212800.0, + "grad_norm": 0.07793461844194936, + "language_loss": 0.8938297, + "learning_rate": 0.0009632696548057527, + "loss": 0.9053725, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 0.48168945, + "step": 775, + "time_per_iteration": 2.5543088912963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158921, + "balance_loss_mlp": 1.11419404, + "epoch": 0.14928818776452482, + "flos": 611087887872.0, + "grad_norm": 0.07948352168051111, + "language_loss": 0.86982578, + "learning_rate": 0.0009631523633021704, + "loss": 0.88141501, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 0.44702148, + "step": 776, + "time_per_iteration": 2.8373982906341553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151944, + "balance_loss_mlp": 1.10726452, + "epoch": 0.14948056944978838, + "flos": 561772744704.0, + "grad_norm": 0.07613081492567164, + "language_loss": 0.90593684, + "learning_rate": 0.0009630348919868936, + "loss": 0.91745627, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 0.4465332, + "step": 777, + "time_per_iteration": 2.688340187072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164011, + "balance_loss_mlp": 1.1162796, + "epoch": 0.14967295113505194, + "flos": 449199779328.0, + "grad_norm": 0.07284380806791231, + "language_loss": 0.83743048, + "learning_rate": 0.0009629172409055293, + "loss": 0.84907055, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 0.47753906, + "step": 778, + "time_per_iteration": 2.496121406555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173184, + "balance_loss_mlp": 1.13260555, + "epoch": 0.1498653328203155, + "flos": 571285426176.0, + "grad_norm": 0.0582041699055768, + "language_loss": 0.89173234, + "learning_rate": 0.0009627994101037531, + "loss": 0.9034642, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 0.40576172, + "step": 779, + "time_per_iteration": 2.7287445068359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116917, + "balance_loss_mlp": 1.12670779, + "epoch": 0.15005771450557906, + "flos": 631215244800.0, + "grad_norm": 0.06429714570378213, + "language_loss": 0.91374522, + "learning_rate": 0.0009626813996273114, + "loss": 0.92543697, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 0.42431641, + "step": 780, + "time_per_iteration": 2.8357532024383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174237, + "balance_loss_mlp": 1.13258517, + "epoch": 0.15025009619084262, + "flos": 577939235328.0, + "grad_norm": 0.07735356487079731, + "language_loss": 0.90820873, + "learning_rate": 0.0009625632095220198, + "loss": 0.91995108, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 0.41625977, + "step": 781, + "time_per_iteration": 2.8360986709594727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165333, + "balance_loss_mlp": 1.12408686, + "epoch": 0.1504424778761062, + "flos": 483887623680.0, + "grad_norm": 0.07591811383481707, + "language_loss": 0.88784671, + "learning_rate": 0.0009624448398337637, + "loss": 0.89950007, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 0.41259766, + "step": 782, + "time_per_iteration": 2.550873041152954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138762, + "balance_loss_mlp": 1.09920812, + "epoch": 0.15063485956136977, + "flos": 762512196096.0, + "grad_norm": 0.06500535683801296, + "language_loss": 0.90907973, + "learning_rate": 0.0009623262906084984, + "loss": 0.92046738, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 0.39550781, + "step": 783, + "time_per_iteration": 3.002237319946289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127947, + "balance_loss_mlp": 1.08622408, + "epoch": 0.15082724124663333, + "flos": 497630241792.0, + "grad_norm": 0.06722303964642193, + "language_loss": 0.92323947, + "learning_rate": 0.0009622075618922486, + "loss": 0.93451893, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 0.41699219, + "step": 784, + "time_per_iteration": 2.669541120529175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117088, + "balance_loss_mlp": 1.07636571, + "epoch": 0.15101962293189689, + "flos": 509725011456.0, + "grad_norm": 0.06286377137641418, + "language_loss": 0.88948303, + "learning_rate": 0.0009620886537311091, + "loss": 0.90065384, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 0.40722656, + "step": 785, + "time_per_iteration": 2.6505391597747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132411, + "balance_loss_mlp": 1.08563375, + "epoch": 0.15121200461716044, + "flos": 457756199424.0, + "grad_norm": 0.06858268632652799, + "language_loss": 0.87318397, + "learning_rate": 0.000961969566171244, + "loss": 0.88450807, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 0.46777344, + "step": 786, + "time_per_iteration": 2.5492002964019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143812, + "balance_loss_mlp": 1.10037243, + "epoch": 0.151404386302424, + "flos": 537986271744.0, + "grad_norm": 0.06762455123923776, + "language_loss": 0.9226557, + "learning_rate": 0.0009618502992588873, + "loss": 0.93409383, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 0.43481445, + "step": 787, + "time_per_iteration": 2.6596381664276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153043, + "balance_loss_mlp": 1.10714722, + "epoch": 0.15159676798768756, + "flos": 688209467904.0, + "grad_norm": 0.07210135364095939, + "language_loss": 0.90213263, + "learning_rate": 0.0009617308530403424, + "loss": 0.91366303, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 0.45922852, + "step": 788, + "time_per_iteration": 2.9965012073516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133874, + "balance_loss_mlp": 1.09358144, + "epoch": 0.15178914967295112, + "flos": 545319558144.0, + "grad_norm": 0.0646084728999688, + "language_loss": 0.89177096, + "learning_rate": 0.0009616112275619825, + "loss": 0.90310967, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 0.40283203, + "step": 789, + "time_per_iteration": 2.702927350997925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128515, + "balance_loss_mlp": 1.08760214, + "epoch": 0.1519815313582147, + "flos": 511770783744.0, + "grad_norm": 0.04914514873585108, + "language_loss": 0.85434246, + "learning_rate": 0.0009614914228702503, + "loss": 0.86562753, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 0.40917969, + "step": 790, + "time_per_iteration": 2.734309196472168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120439, + "balance_loss_mlp": 1.08031344, + "epoch": 0.15217391304347827, + "flos": 684088187904.0, + "grad_norm": 0.0510031662309952, + "language_loss": 0.90581405, + "learning_rate": 0.0009613714390116581, + "loss": 0.91701841, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 0.40112305, + "step": 791, + "time_per_iteration": 2.9846036434173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119178, + "balance_loss_mlp": 1.07890868, + "epoch": 0.15236629472874183, + "flos": 644186981376.0, + "grad_norm": 0.06466161117660295, + "language_loss": 0.87842512, + "learning_rate": 0.0009612512760327879, + "loss": 0.88961697, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 0.40283203, + "step": 792, + "time_per_iteration": 2.879507303237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112132, + "balance_loss_mlp": 1.0749234, + "epoch": 0.1525586764140054, + "flos": 412876196352.0, + "grad_norm": 0.06761791569724282, + "language_loss": 0.86834276, + "learning_rate": 0.0009611309339802909, + "loss": 0.87955594, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 0.46435547, + "step": 793, + "time_per_iteration": 2.4628419876098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125819, + "balance_loss_mlp": 1.08180666, + "epoch": 0.15275105809926895, + "flos": 802801414656.0, + "grad_norm": 0.06955338926819006, + "language_loss": 0.85776877, + "learning_rate": 0.0009610104129008881, + "loss": 0.86902696, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 0.43994141, + "step": 794, + "time_per_iteration": 3.1157610416412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112048, + "balance_loss_mlp": 1.07751703, + "epoch": 0.1529434397845325, + "flos": 612422249472.0, + "grad_norm": 0.0812849574801687, + "language_loss": 0.89832217, + "learning_rate": 0.0009608897128413701, + "loss": 0.90952694, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 0.4296875, + "step": 795, + "time_per_iteration": 2.7580387592315674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121281, + "balance_loss_mlp": 1.08070254, + "epoch": 0.15313582146979607, + "flos": 615246243840.0, + "grad_norm": 0.07320179377966478, + "language_loss": 0.87414771, + "learning_rate": 0.0009607688338485965, + "loss": 0.88536048, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 0.40576172, + "step": 796, + "time_per_iteration": 2.8428006172180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112803, + "balance_loss_mlp": 1.08358848, + "epoch": 0.15332820315505963, + "flos": 793602593280.0, + "grad_norm": 0.08676784428227541, + "language_loss": 0.92063487, + "learning_rate": 0.0009606477759694969, + "loss": 0.93191516, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 0.4440918, + "step": 797, + "time_per_iteration": 3.0136139392852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129989, + "balance_loss_mlp": 1.08547592, + "epoch": 0.1535205848403232, + "flos": 550206950400.0, + "grad_norm": 0.07379760567815713, + "language_loss": 0.89430279, + "learning_rate": 0.0009605265392510703, + "loss": 0.90560269, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 0.44555664, + "step": 798, + "time_per_iteration": 2.604297161102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114561, + "balance_loss_mlp": 1.10169339, + "epoch": 0.15371296652558677, + "flos": 535947840000.0, + "grad_norm": 0.06797963908333281, + "language_loss": 0.93481082, + "learning_rate": 0.0009604051237403846, + "loss": 0.94626689, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 0.43896484, + "step": 799, + "time_per_iteration": 2.613255262374878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167168, + "balance_loss_mlp": 1.1217972, + "epoch": 0.15390534821085033, + "flos": 395219699712.0, + "grad_norm": 0.06891264186704958, + "language_loss": 0.88271165, + "learning_rate": 0.0009602835294845776, + "loss": 0.89438331, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 0.45361328, + "step": 800, + "time_per_iteration": 2.4739739894866943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011684, + "balance_loss_mlp": 1.12188447, + "epoch": 0.1540977298961139, + "flos": 535846523904.0, + "grad_norm": 0.06820302888180714, + "language_loss": 0.91848779, + "learning_rate": 0.0009601617565308565, + "loss": 0.93017173, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 0.46557617, + "step": 801, + "time_per_iteration": 2.599102020263672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196072, + "balance_loss_mlp": 1.14941311, + "epoch": 0.15429011158137745, + "flos": 723727664640.0, + "grad_norm": 0.08155438121007776, + "language_loss": 0.88506758, + "learning_rate": 0.0009600398049264977, + "loss": 0.89702827, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 0.46679688, + "step": 802, + "time_per_iteration": 2.9645981788635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193217, + "balance_loss_mlp": 1.14574742, + "epoch": 0.154482493266641, + "flos": 620516505600.0, + "grad_norm": 0.10468166660144326, + "language_loss": 0.93512642, + "learning_rate": 0.0009599176747188469, + "loss": 0.94705856, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 0.47485352, + "step": 803, + "time_per_iteration": 2.7997000217437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160191, + "balance_loss_mlp": 1.11856318, + "epoch": 0.15467487495190457, + "flos": 525624629760.0, + "grad_norm": 0.07174757520021151, + "language_loss": 0.84728193, + "learning_rate": 0.0009597953659553196, + "loss": 0.85888386, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 0.41625977, + "step": 804, + "time_per_iteration": 2.700530529022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133685, + "balance_loss_mlp": 1.09408379, + "epoch": 0.15486725663716813, + "flos": 527729872896.0, + "grad_norm": 0.4143347029392257, + "language_loss": 0.9033978, + "learning_rate": 0.0009596728786833997, + "loss": 0.91473466, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 0.39575195, + "step": 805, + "time_per_iteration": 2.6122889518737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150784, + "balance_loss_mlp": 1.10772574, + "epoch": 0.1550596383224317, + "flos": 1048549349376.0, + "grad_norm": 0.061887733402931855, + "language_loss": 0.91321814, + "learning_rate": 0.0009595502129506415, + "loss": 0.92472601, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 0.43066406, + "step": 806, + "time_per_iteration": 3.336061716079712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180508, + "balance_loss_mlp": 1.13694847, + "epoch": 0.15525202000769528, + "flos": 613716963840.0, + "grad_norm": 0.06807019640067784, + "language_loss": 0.84292483, + "learning_rate": 0.0009594273688046678, + "loss": 0.85472989, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 0.43579102, + "step": 807, + "time_per_iteration": 2.709182024002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210101, + "balance_loss_mlp": 1.15960383, + "epoch": 0.15544440169295884, + "flos": 533064374784.0, + "grad_norm": 0.0856522073787927, + "language_loss": 0.8780278, + "learning_rate": 0.000959304346293171, + "loss": 0.89012885, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 0.50512695, + "step": 808, + "time_per_iteration": 2.6307153701782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236008, + "balance_loss_mlp": 1.18305564, + "epoch": 0.1556367833782224, + "flos": 644723297280.0, + "grad_norm": 0.09531038088821206, + "language_loss": 0.90107393, + "learning_rate": 0.0009591811454639125, + "loss": 0.91343403, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 0.52954102, + "step": 809, + "time_per_iteration": 2.742725372314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197955, + "balance_loss_mlp": 1.15184498, + "epoch": 0.15582916506348596, + "flos": 543822211584.0, + "grad_norm": 0.06212883071305714, + "language_loss": 0.902493, + "learning_rate": 0.0009590577663647234, + "loss": 0.91447246, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 0.4609375, + "step": 810, + "time_per_iteration": 2.711411237716675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187108, + "balance_loss_mlp": 1.13837492, + "epoch": 0.15602154674874952, + "flos": 580034566656.0, + "grad_norm": 0.06321996034865444, + "language_loss": 0.88015836, + "learning_rate": 0.0009589342090435036, + "loss": 0.8920294, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 0.48779297, + "step": 811, + "time_per_iteration": 2.763784170150757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173697, + "balance_loss_mlp": 1.12610841, + "epoch": 0.15621392843401308, + "flos": 535248539136.0, + "grad_norm": 0.07315119709604147, + "language_loss": 0.89953744, + "learning_rate": 0.0009588104735482223, + "loss": 0.91127443, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 0.47631836, + "step": 812, + "time_per_iteration": 2.645106077194214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169234, + "balance_loss_mlp": 1.12019134, + "epoch": 0.15640631011927664, + "flos": 550903680000.0, + "grad_norm": 0.06895714089970095, + "language_loss": 0.86002952, + "learning_rate": 0.0009586865599269177, + "loss": 0.87172186, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 0.49047852, + "step": 813, + "time_per_iteration": 2.6313953399658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144128, + "balance_loss_mlp": 1.09851837, + "epoch": 0.1565986918045402, + "flos": 637478843904.0, + "grad_norm": 0.06467027207336487, + "language_loss": 0.90443802, + "learning_rate": 0.0009585624682276977, + "loss": 0.91587937, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 0.45605469, + "step": 814, + "time_per_iteration": 2.7377047538757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144046, + "balance_loss_mlp": 1.09705353, + "epoch": 0.15679107348980378, + "flos": 490810876416.0, + "grad_norm": 0.06824176290368998, + "language_loss": 0.89156437, + "learning_rate": 0.0009584381984987386, + "loss": 0.90300483, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 0.47021484, + "step": 815, + "time_per_iteration": 2.5524120330810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134862, + "balance_loss_mlp": 1.09225655, + "epoch": 0.15698345517506734, + "flos": 529951113216.0, + "grad_norm": 0.061358262400161866, + "language_loss": 0.92449033, + "learning_rate": 0.0009583137507882864, + "loss": 0.93583906, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 0.42626953, + "step": 816, + "time_per_iteration": 2.699207305908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134768, + "balance_loss_mlp": 1.08698916, + "epoch": 0.1571758368603309, + "flos": 546038682624.0, + "grad_norm": 0.06309616730716378, + "language_loss": 0.82620019, + "learning_rate": 0.000958189125144656, + "loss": 0.8375479, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 0.47851562, + "step": 817, + "time_per_iteration": 2.6626293659210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142354, + "balance_loss_mlp": 1.09493256, + "epoch": 0.15736821854559446, + "flos": 565649547264.0, + "grad_norm": 0.08013787804574789, + "language_loss": 0.90297949, + "learning_rate": 0.0009580643216162313, + "loss": 0.91440302, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 0.47436523, + "step": 818, + "time_per_iteration": 2.6708288192749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143086, + "balance_loss_mlp": 1.09368527, + "epoch": 0.15756060023085802, + "flos": 500956047360.0, + "grad_norm": 0.06582812199168771, + "language_loss": 0.82167578, + "learning_rate": 0.0009579393402514652, + "loss": 0.83310658, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 0.49389648, + "step": 819, + "time_per_iteration": 2.577592611312866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142904, + "balance_loss_mlp": 1.09898734, + "epoch": 0.15775298191612158, + "flos": 519264857088.0, + "grad_norm": 0.07647809261390527, + "language_loss": 0.92362559, + "learning_rate": 0.0009578141810988801, + "loss": 0.93505466, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 0.43920898, + "step": 820, + "time_per_iteration": 2.5464515686035156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152369, + "balance_loss_mlp": 1.10678363, + "epoch": 0.15794536360138514, + "flos": 466129810944.0, + "grad_norm": 0.07136182637629812, + "language_loss": 0.92042351, + "learning_rate": 0.0009576888442070668, + "loss": 0.93194717, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 0.45556641, + "step": 821, + "time_per_iteration": 2.5755786895751953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114609, + "balance_loss_mlp": 1.10288835, + "epoch": 0.1581377452866487, + "flos": 517162185216.0, + "grad_norm": 0.08295395391365894, + "language_loss": 0.94583452, + "learning_rate": 0.0009575633296246854, + "loss": 0.95729542, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 0.43212891, + "step": 822, + "time_per_iteration": 2.5701425075531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162298, + "balance_loss_mlp": 1.11821485, + "epoch": 0.15833012697191226, + "flos": 549784433664.0, + "grad_norm": 0.06548151577025092, + "language_loss": 0.85385978, + "learning_rate": 0.0009574376374004652, + "loss": 0.86548281, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 0.44116211, + "step": 823, + "time_per_iteration": 2.622905731201172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169738, + "balance_loss_mlp": 1.12019491, + "epoch": 0.15852250865717585, + "flos": 487457906688.0, + "grad_norm": 0.1009087476503521, + "language_loss": 0.82624936, + "learning_rate": 0.000957311767583204, + "loss": 0.83794677, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 0.49536133, + "step": 824, + "time_per_iteration": 2.5683999061584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196161, + "balance_loss_mlp": 1.1752758, + "epoch": 0.1587148903424394, + "flos": 1309770694656.0, + "grad_norm": 0.05150472419389455, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.83267754, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 0.20898438, + "step": 825, + "time_per_iteration": 4.722898960113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176776, + "balance_loss_mlp": 1.12170124, + "epoch": 0.15890727202770297, + "flos": 466873528320.0, + "grad_norm": 0.10062471557735768, + "language_loss": 0.94017303, + "learning_rate": 0.0009570594953650961, + "loss": 0.95194077, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 0.55029297, + "step": 826, + "time_per_iteration": 2.5394840240478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173437, + "balance_loss_mlp": 1.12091362, + "epoch": 0.15909965371296653, + "flos": 777107188224.0, + "grad_norm": 0.0719939675894647, + "language_loss": 0.8219676, + "learning_rate": 0.00095693309306219, + "loss": 0.83370197, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 0.52612305, + "step": 827, + "time_per_iteration": 3.0926811695098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178421, + "balance_loss_mlp": 1.12434745, + "epoch": 0.1592920353982301, + "flos": 1078273451520.0, + "grad_norm": 0.06038838021195225, + "language_loss": 0.90083122, + "learning_rate": 0.0009568065133621244, + "loss": 0.91261542, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 0.54077148, + "step": 828, + "time_per_iteration": 3.315122604370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164888, + "balance_loss_mlp": 1.12013662, + "epoch": 0.15948441708349365, + "flos": 725622935040.0, + "grad_norm": 0.07025990147709567, + "language_loss": 0.87178355, + "learning_rate": 0.0009566797563140422, + "loss": 0.88343245, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 0.44775391, + "step": 829, + "time_per_iteration": 2.8680243492126465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116659, + "balance_loss_mlp": 1.11912107, + "epoch": 0.1596767987687572, + "flos": 578771785728.0, + "grad_norm": 0.061296828426512996, + "language_loss": 0.89984798, + "learning_rate": 0.0009565528219671547, + "loss": 0.91151381, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 0.47460938, + "step": 830, + "time_per_iteration": 2.9325318336486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160899, + "balance_loss_mlp": 1.1076839, + "epoch": 0.15986918045402077, + "flos": 528987511296.0, + "grad_norm": 0.07652275644998038, + "language_loss": 0.86699682, + "learning_rate": 0.0009564257103707418, + "loss": 0.87860584, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 0.53198242, + "step": 831, + "time_per_iteration": 2.598191976547241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184474, + "balance_loss_mlp": 1.12973261, + "epoch": 0.16006156213928435, + "flos": 574584067584.0, + "grad_norm": 0.08337472663089728, + "language_loss": 0.92543364, + "learning_rate": 0.0009562984215741533, + "loss": 0.93727839, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 0.54736328, + "step": 832, + "time_per_iteration": 2.676666736602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160796, + "balance_loss_mlp": 1.11177731, + "epoch": 0.1602539438245479, + "flos": 515541127680.0, + "grad_norm": 0.05762908483075192, + "language_loss": 0.8408711, + "learning_rate": 0.0009561709556268065, + "loss": 0.85247904, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 0.49047852, + "step": 833, + "time_per_iteration": 2.7075538635253906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162528, + "balance_loss_mlp": 1.11141133, + "epoch": 0.16044632550981147, + "flos": 621015745536.0, + "grad_norm": 0.06044842900072245, + "language_loss": 0.96042889, + "learning_rate": 0.0009560433125781884, + "loss": 0.97205412, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 0.51171875, + "step": 834, + "time_per_iteration": 2.7619521617889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144978, + "balance_loss_mlp": 1.09130979, + "epoch": 0.16063870719507503, + "flos": 561078586368.0, + "grad_norm": 0.06441579465763399, + "language_loss": 0.94159138, + "learning_rate": 0.0009559154924778544, + "loss": 0.95304114, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 0.53686523, + "step": 835, + "time_per_iteration": 2.7467222213745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112951, + "balance_loss_mlp": 1.08218372, + "epoch": 0.1608310888803386, + "flos": 805133882880.0, + "grad_norm": 0.07312538570388089, + "language_loss": 0.86469144, + "learning_rate": 0.0009557874953754284, + "loss": 0.87598646, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 0.47314453, + "step": 836, + "time_per_iteration": 3.0907793045043945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126281, + "balance_loss_mlp": 1.07618928, + "epoch": 0.16102347056560215, + "flos": 600587011584.0, + "grad_norm": 0.08101808751207061, + "language_loss": 0.85894346, + "learning_rate": 0.0009556593213206038, + "loss": 0.87020624, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 0.5012207, + "step": 837, + "time_per_iteration": 2.7060487270355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122479, + "balance_loss_mlp": 1.07765627, + "epoch": 0.1612158522508657, + "flos": 553510361088.0, + "grad_norm": 0.060960398488271, + "language_loss": 0.89031309, + "learning_rate": 0.0009555309703631414, + "loss": 0.9015379, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 0.44848633, + "step": 838, + "time_per_iteration": 2.6838622093200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131245, + "balance_loss_mlp": 1.07853079, + "epoch": 0.16140823393612927, + "flos": 555963969024.0, + "grad_norm": 0.0637381399971671, + "language_loss": 0.88547724, + "learning_rate": 0.0009554024425528722, + "loss": 0.89678967, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 0.52685547, + "step": 839, + "time_per_iteration": 2.7301504611968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124486, + "balance_loss_mlp": 1.07978272, + "epoch": 0.16160061562139286, + "flos": 543871770624.0, + "grad_norm": 0.0692663948027758, + "language_loss": 0.90811443, + "learning_rate": 0.0009552737379396948, + "loss": 0.91935933, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 0.44726562, + "step": 840, + "time_per_iteration": 2.6181893348693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129368, + "balance_loss_mlp": 1.08208978, + "epoch": 0.16179299730665642, + "flos": 603873169920.0, + "grad_norm": 0.06449676765287365, + "language_loss": 0.89640445, + "learning_rate": 0.0009551448565735767, + "loss": 0.90769809, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 0.47265625, + "step": 841, + "time_per_iteration": 2.731602907180786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135502, + "balance_loss_mlp": 1.08555281, + "epoch": 0.16198537899191998, + "flos": 787166097408.0, + "grad_norm": 0.07291825437583387, + "language_loss": 0.86443651, + "learning_rate": 0.0009550157985045543, + "loss": 0.87579155, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 0.49926758, + "step": 842, + "time_per_iteration": 3.0523600578308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113388, + "balance_loss_mlp": 1.08724499, + "epoch": 0.16217776067718354, + "flos": 519805942272.0, + "grad_norm": 0.06222432903322319, + "language_loss": 0.90556312, + "learning_rate": 0.0009548865637827321, + "loss": 0.91690183, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 0.46630859, + "step": 843, + "time_per_iteration": 2.6370396614074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113271, + "balance_loss_mlp": 1.08757734, + "epoch": 0.1623701423624471, + "flos": 505262707200.0, + "grad_norm": 0.07459586377830821, + "language_loss": 0.91347718, + "learning_rate": 0.0009547571524582838, + "loss": 0.92480427, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 0.45141602, + "step": 844, + "time_per_iteration": 2.5717859268188477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142931, + "balance_loss_mlp": 1.09460354, + "epoch": 0.16256252404771065, + "flos": 497183132160.0, + "grad_norm": 0.08463351541898638, + "language_loss": 0.94371468, + "learning_rate": 0.0009546275645814512, + "loss": 0.95514405, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 0.48339844, + "step": 845, + "time_per_iteration": 2.632861375808716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117157, + "balance_loss_mlp": 1.12107265, + "epoch": 0.16275490573297421, + "flos": 502344737280.0, + "grad_norm": 0.08033911629378378, + "language_loss": 0.92129737, + "learning_rate": 0.0009544978002025446, + "loss": 0.93301302, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 0.50561523, + "step": 846, + "time_per_iteration": 2.7044737339019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193795, + "balance_loss_mlp": 1.14096177, + "epoch": 0.16294728741823777, + "flos": 507231756288.0, + "grad_norm": 0.052695226385161484, + "language_loss": 0.88037688, + "learning_rate": 0.0009543678593719434, + "loss": 0.89231491, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 0.52880859, + "step": 847, + "time_per_iteration": 2.798231601715088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208721, + "balance_loss_mlp": 1.15734136, + "epoch": 0.16313966910350133, + "flos": 509685364224.0, + "grad_norm": 0.056853368929671785, + "language_loss": 0.88963962, + "learning_rate": 0.0009542377421400945, + "loss": 0.90172684, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 0.51391602, + "step": 848, + "time_per_iteration": 2.7955727577209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122402, + "balance_loss_mlp": 1.16584587, + "epoch": 0.16333205078876492, + "flos": 543980427264.0, + "grad_norm": 0.06352967983147602, + "language_loss": 0.85259467, + "learning_rate": 0.0009541074485575145, + "loss": 0.86483485, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 0.58154297, + "step": 849, + "time_per_iteration": 2.703871488571167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225002, + "balance_loss_mlp": 1.17088127, + "epoch": 0.16352443247402848, + "flos": 507723655680.0, + "grad_norm": 0.07774946886845908, + "language_loss": 0.93468195, + "learning_rate": 0.0009539769786747874, + "loss": 0.94693196, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 0.54125977, + "step": 850, + "time_per_iteration": 2.6687557697296143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012154, + "balance_loss_mlp": 1.16130245, + "epoch": 0.16371681415929204, + "flos": 542124804096.0, + "grad_norm": 0.057605035940766894, + "language_loss": 0.82393861, + "learning_rate": 0.0009538463325425665, + "loss": 0.83609259, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 0.54101562, + "step": 851, + "time_per_iteration": 2.751335382461548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199288, + "balance_loss_mlp": 1.1491015, + "epoch": 0.1639091958445556, + "flos": 520752291840.0, + "grad_norm": 0.06621147850271279, + "language_loss": 0.87526274, + "learning_rate": 0.0009537155102115728, + "loss": 0.88725561, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 0.50170898, + "step": 852, + "time_per_iteration": 2.568573474884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168884, + "balance_loss_mlp": 1.12236834, + "epoch": 0.16410157752981916, + "flos": 547414889472.0, + "grad_norm": 0.07419725806034035, + "language_loss": 0.85374665, + "learning_rate": 0.0009535845117325961, + "loss": 0.86543554, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 0.46533203, + "step": 853, + "time_per_iteration": 2.628973960876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137862, + "balance_loss_mlp": 1.09511375, + "epoch": 0.16429395921508272, + "flos": 582853791744.0, + "grad_norm": 0.05551255594321189, + "language_loss": 0.94495642, + "learning_rate": 0.0009534533371564946, + "loss": 0.95633507, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 0.42724609, + "step": 854, + "time_per_iteration": 2.780510902404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133546, + "balance_loss_mlp": 1.09003448, + "epoch": 0.16448634090034628, + "flos": 530934538752.0, + "grad_norm": 0.08632067881035285, + "language_loss": 0.90547508, + "learning_rate": 0.0009533219865341949, + "loss": 0.91681051, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 0.43530273, + "step": 855, + "time_per_iteration": 2.583874464035034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116377, + "balance_loss_mlp": 1.07188785, + "epoch": 0.16467872258560984, + "flos": 491890475520.0, + "grad_norm": 0.06082853882497287, + "language_loss": 0.88071746, + "learning_rate": 0.0009531904599166916, + "loss": 0.89188123, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 0.44482422, + "step": 856, + "time_per_iteration": 2.626354217529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107396, + "balance_loss_mlp": 1.06231081, + "epoch": 0.16487110427087343, + "flos": 506263385088.0, + "grad_norm": 0.0709999882269981, + "language_loss": 0.86807954, + "learning_rate": 0.0009530587573550478, + "loss": 0.87915355, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 0.45068359, + "step": 857, + "time_per_iteration": 2.5761454105377197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142125, + "balance_loss_mlp": 1.11237001, + "epoch": 0.16506348595613698, + "flos": 1432824712704.0, + "grad_norm": 0.04095057850479287, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75461513, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 0.296875, + "step": 858, + "time_per_iteration": 5.055138349533081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101038, + "balance_loss_mlp": 1.06165087, + "epoch": 0.16525586764140054, + "flos": 477129927168.0, + "grad_norm": 0.08838989258306214, + "language_loss": 0.91845137, + "learning_rate": 0.0009527948246039337, + "loss": 0.92946172, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 0.39379883, + "step": 859, + "time_per_iteration": 2.582608461380005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111338, + "balance_loss_mlp": 1.0715934, + "epoch": 0.1654482493266641, + "flos": 881096942592.0, + "grad_norm": 0.06489567580347368, + "language_loss": 0.89263308, + "learning_rate": 0.000952662594516931, + "loss": 0.90374649, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 0.39746094, + "step": 860, + "time_per_iteration": 3.067707061767578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110521, + "balance_loss_mlp": 1.07018054, + "epoch": 0.16564063101192766, + "flos": 626841773568.0, + "grad_norm": 0.055059247831062384, + "language_loss": 0.88479781, + "learning_rate": 0.0009525301886907234, + "loss": 0.89590299, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 0.40307617, + "step": 861, + "time_per_iteration": 2.8873865604400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112184, + "balance_loss_mlp": 1.07758975, + "epoch": 0.16583301269719122, + "flos": 561518355456.0, + "grad_norm": 0.06995538812096423, + "language_loss": 0.89499515, + "learning_rate": 0.0009523976071767155, + "loss": 0.90621358, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 0.44262695, + "step": 862, + "time_per_iteration": 2.6588613986968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124987, + "balance_loss_mlp": 1.08183372, + "epoch": 0.16602539438245478, + "flos": 567803976192.0, + "grad_norm": 0.06313062043432274, + "language_loss": 0.89038265, + "learning_rate": 0.00095226485002638, + "loss": 0.90163255, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 0.43115234, + "step": 863, + "time_per_iteration": 2.797896146774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113265, + "balance_loss_mlp": 1.07232881, + "epoch": 0.16621777606771834, + "flos": 574875532800.0, + "grad_norm": 0.054774526957085325, + "language_loss": 0.90381318, + "learning_rate": 0.0009521319172912576, + "loss": 0.91494584, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 0.40917969, + "step": 864, + "time_per_iteration": 2.7238612174987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126771, + "balance_loss_mlp": 1.08132839, + "epoch": 0.16641015775298193, + "flos": 514552932864.0, + "grad_norm": 0.05854649520245602, + "language_loss": 0.96491337, + "learning_rate": 0.0009519988090229579, + "loss": 0.97618109, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 0.4543457, + "step": 865, + "time_per_iteration": 2.683509111404419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123132, + "balance_loss_mlp": 1.07907248, + "epoch": 0.1666025394382455, + "flos": 621685310976.0, + "grad_norm": 0.05699467986566688, + "language_loss": 0.89545953, + "learning_rate": 0.0009518655252731576, + "loss": 0.90669084, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 0.44067383, + "step": 866, + "time_per_iteration": 2.729865550994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131126, + "balance_loss_mlp": 1.08456326, + "epoch": 0.16679492112350905, + "flos": 548808348672.0, + "grad_norm": 0.06482393342324422, + "language_loss": 0.9171015, + "learning_rate": 0.0009517320660936022, + "loss": 0.9284128, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 0.46557617, + "step": 867, + "time_per_iteration": 2.732815742492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133142, + "balance_loss_mlp": 1.08843839, + "epoch": 0.1669873028087726, + "flos": 665675864064.0, + "grad_norm": 0.06614373571764609, + "language_loss": 0.84472704, + "learning_rate": 0.0009515984315361051, + "loss": 0.85605848, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 0.44702148, + "step": 868, + "time_per_iteration": 2.796868085861206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121806, + "balance_loss_mlp": 1.07657838, + "epoch": 0.16717968449403617, + "flos": 538564432896.0, + "grad_norm": 0.08270078218547869, + "language_loss": 0.88773656, + "learning_rate": 0.000951464621652548, + "loss": 0.89895463, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 0.45239258, + "step": 869, + "time_per_iteration": 2.666438341140747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141117, + "balance_loss_mlp": 1.09751046, + "epoch": 0.16737206617929973, + "flos": 530121438720.0, + "grad_norm": 0.06072661062765564, + "language_loss": 0.80103016, + "learning_rate": 0.0009513306364948804, + "loss": 0.81244129, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 0.43579102, + "step": 870, + "time_per_iteration": 2.799009084701538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148868, + "balance_loss_mlp": 1.10373545, + "epoch": 0.1675644478645633, + "flos": 480774362112.0, + "grad_norm": 0.09261319168225486, + "language_loss": 0.90277344, + "learning_rate": 0.0009511964761151197, + "loss": 0.91426206, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 0.45117188, + "step": 871, + "time_per_iteration": 2.5934712886810303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158743, + "balance_loss_mlp": 1.1145407, + "epoch": 0.16775682954982685, + "flos": 494556627456.0, + "grad_norm": 0.06739805293344515, + "language_loss": 0.91524243, + "learning_rate": 0.0009510621405653521, + "loss": 0.92682987, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 0.44213867, + "step": 872, + "time_per_iteration": 2.5557620525360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156856, + "balance_loss_mlp": 1.11627746, + "epoch": 0.1679492112350904, + "flos": 752035912704.0, + "grad_norm": 0.06267535529199315, + "language_loss": 0.85553813, + "learning_rate": 0.0009509276298977309, + "loss": 0.86710668, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 0.40576172, + "step": 873, + "time_per_iteration": 2.9965007305145264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187981, + "balance_loss_mlp": 1.13760364, + "epoch": 0.168141592920354, + "flos": 1135875571200.0, + "grad_norm": 0.07409010972210926, + "language_loss": 0.82916558, + "learning_rate": 0.0009507929441644778, + "loss": 0.84104538, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 0.50415039, + "step": 874, + "time_per_iteration": 3.5573699474334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118449, + "balance_loss_mlp": 1.14097893, + "epoch": 0.16833397460561755, + "flos": 632401302528.0, + "grad_norm": 0.07388150752212762, + "language_loss": 0.8737148, + "learning_rate": 0.0009506580834178826, + "loss": 0.88555974, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 0.43530273, + "step": 875, + "time_per_iteration": 2.7659120559692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215839, + "balance_loss_mlp": 1.16841793, + "epoch": 0.1685263562908811, + "flos": 541445326848.0, + "grad_norm": 0.06935842584614806, + "language_loss": 0.92793226, + "learning_rate": 0.0009505230477103028, + "loss": 0.94009066, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 0.47436523, + "step": 876, + "time_per_iteration": 2.7306137084960938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226641, + "balance_loss_mlp": 1.18224776, + "epoch": 0.16871873797614467, + "flos": 619325678592.0, + "grad_norm": 0.10053146783154573, + "language_loss": 0.82997662, + "learning_rate": 0.0009503878370941641, + "loss": 0.84224302, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 0.44433594, + "step": 877, + "time_per_iteration": 2.7356183528900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211684, + "balance_loss_mlp": 1.16793382, + "epoch": 0.16891111966140823, + "flos": 606344030208.0, + "grad_norm": 0.10508781605450683, + "language_loss": 0.9020679, + "learning_rate": 0.0009502524516219595, + "loss": 0.91418481, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 0.43798828, + "step": 878, + "time_per_iteration": 2.7525370121002197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185028, + "balance_loss_mlp": 1.14232683, + "epoch": 0.1691035013466718, + "flos": 552326874624.0, + "grad_norm": 0.07887273759437702, + "language_loss": 0.91364408, + "learning_rate": 0.0009501168913462506, + "loss": 0.92549431, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 0.42724609, + "step": 879, + "time_per_iteration": 2.7009639739990234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115086, + "balance_loss_mlp": 1.11919844, + "epoch": 0.16929588303193535, + "flos": 1476294377472.0, + "grad_norm": 0.04902821320434346, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.80272782, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 0.31640625, + "step": 880, + "time_per_iteration": 4.812703609466553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116281, + "balance_loss_mlp": 1.11748707, + "epoch": 0.1694882647171989, + "flos": 926248587264.0, + "grad_norm": 0.06555145426806878, + "language_loss": 0.86756283, + "learning_rate": 0.0009498452465949042, + "loss": 0.87919092, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 0.453125, + "step": 881, + "time_per_iteration": 3.230407476425171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159747, + "balance_loss_mlp": 1.1133033, + "epoch": 0.1696806464024625, + "flos": 546093010944.0, + "grad_norm": 0.0753185527775994, + "language_loss": 0.92756218, + "learning_rate": 0.0009497091622247285, + "loss": 0.93915963, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 0.46459961, + "step": 882, + "time_per_iteration": 2.7412030696868896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141259, + "balance_loss_mlp": 1.09734213, + "epoch": 0.16987302808772606, + "flos": 529234560000.0, + "grad_norm": 0.07197762243887564, + "language_loss": 0.94941783, + "learning_rate": 0.0009495729032619723, + "loss": 0.96083045, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 0.43945312, + "step": 883, + "time_per_iteration": 2.6705245971679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141279, + "balance_loss_mlp": 1.09724283, + "epoch": 0.17006540977298962, + "flos": 755178909696.0, + "grad_norm": 0.07033792867334165, + "language_loss": 0.85310471, + "learning_rate": 0.0009494364697595354, + "loss": 0.86451751, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 0.44018555, + "step": 884, + "time_per_iteration": 2.9024457931518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115677, + "balance_loss_mlp": 1.10977769, + "epoch": 0.17025779145825318, + "flos": 558800446464.0, + "grad_norm": 0.0673266035955572, + "language_loss": 0.90739167, + "learning_rate": 0.0009492998617703867, + "loss": 0.91895938, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 0.47045898, + "step": 885, + "time_per_iteration": 2.6497459411621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151127, + "balance_loss_mlp": 1.10813999, + "epoch": 0.17045017314351674, + "flos": 512213124096.0, + "grad_norm": 0.0863252086663651, + "language_loss": 0.89101255, + "learning_rate": 0.0009491630793475619, + "loss": 0.90252388, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 0.42993164, + "step": 886, + "time_per_iteration": 2.6258063316345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159756, + "balance_loss_mlp": 1.11231089, + "epoch": 0.1706425548287803, + "flos": 508941646848.0, + "grad_norm": 0.0686214928272948, + "language_loss": 0.85993534, + "learning_rate": 0.0009490261225441643, + "loss": 0.87153292, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 0.47412109, + "step": 887, + "time_per_iteration": 2.9036519527435303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168173, + "balance_loss_mlp": 1.12370825, + "epoch": 0.17083493651404386, + "flos": 717355408896.0, + "grad_norm": 0.07914830411429463, + "language_loss": 0.91452426, + "learning_rate": 0.0009488889914133656, + "loss": 0.92620599, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 0.44458008, + "step": 888, + "time_per_iteration": 3.0038132667541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155268, + "balance_loss_mlp": 1.10706019, + "epoch": 0.17102731819930742, + "flos": 559121647104.0, + "grad_norm": 0.07300075385020723, + "language_loss": 0.90558064, + "learning_rate": 0.0009487516860084047, + "loss": 0.91713333, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 0.48193359, + "step": 889, + "time_per_iteration": 2.7158679962158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147984, + "balance_loss_mlp": 1.0996089, + "epoch": 0.17121969988457098, + "flos": 494786423808.0, + "grad_norm": 0.09172908653222724, + "language_loss": 0.90068781, + "learning_rate": 0.0009486142063825884, + "loss": 0.91216767, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 0.48364258, + "step": 890, + "time_per_iteration": 2.5330443382263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084927, + "balance_loss_mlp": 1.06175303, + "epoch": 0.17141208156983456, + "flos": 1548889413120.0, + "grad_norm": 0.031797672969882694, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.73511147, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 0.23144531, + "step": 891, + "time_per_iteration": 4.953175783157349 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167611, + "balance_loss_mlp": 1.11835372, + "epoch": 0.17160446325509812, + "flos": 619565386752.0, + "grad_norm": 0.06989736404119995, + "language_loss": 0.91231126, + "learning_rate": 0.0009483387246819542, + "loss": 0.92398739, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 0.49243164, + "step": 892, + "time_per_iteration": 2.7500009536743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010653, + "balance_loss_mlp": 1.0426023, + "epoch": 0.17179684494036168, + "flos": 1381758206976.0, + "grad_norm": 0.022698270048783192, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.83350885, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 0.2265625, + "step": 893, + "time_per_iteration": 4.662828683853149 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166562, + "balance_loss_mlp": 1.12312233, + "epoch": 0.17198922662562524, + "flos": 492636764160.0, + "grad_norm": 0.06047387129149895, + "language_loss": 0.90360647, + "learning_rate": 0.0009480625467392688, + "loss": 0.91527206, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 0.43481445, + "step": 894, + "time_per_iteration": 2.615447521209717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046079, + "balance_loss_mlp": 1.02433491, + "epoch": 0.1721816083108888, + "flos": 1458318878208.0, + "grad_norm": 0.017910617622931155, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79040754, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 0.21777344, + "step": 895, + "time_per_iteration": 4.802469968795776 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196327, + "balance_loss_mlp": 1.15264833, + "epoch": 0.17237398999615236, + "flos": 528122654208.0, + "grad_norm": 0.0591778940977726, + "language_loss": 0.88960874, + "learning_rate": 0.0009477856729834196, + "loss": 0.90157199, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 0.43652344, + "step": 896, + "time_per_iteration": 2.743036985397339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214543, + "balance_loss_mlp": 1.17217648, + "epoch": 0.17256637168141592, + "flos": 603920157696.0, + "grad_norm": 0.09709817551063968, + "language_loss": 0.91585428, + "learning_rate": 0.0009476469753098809, + "loss": 0.92799973, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 0.42358398, + "step": 897, + "time_per_iteration": 2.688457489013672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206917, + "balance_loss_mlp": 1.16080689, + "epoch": 0.17275875336667948, + "flos": 509687935488.0, + "grad_norm": 0.08785360527314089, + "language_loss": 0.87616539, + "learning_rate": 0.0009475081038443738, + "loss": 0.88823456, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 0.46118164, + "step": 898, + "time_per_iteration": 2.5958664417266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178905, + "balance_loss_mlp": 1.13436794, + "epoch": 0.17295113505194307, + "flos": 665260687872.0, + "grad_norm": 0.08099470404026293, + "language_loss": 0.87109447, + "learning_rate": 0.0009473690586408124, + "loss": 0.88288355, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 0.44482422, + "step": 899, + "time_per_iteration": 2.885279417037964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176953, + "balance_loss_mlp": 1.13184392, + "epoch": 0.17314351673720663, + "flos": 555385807872.0, + "grad_norm": 0.060075693842180825, + "language_loss": 0.87349975, + "learning_rate": 0.0009472298397531792, + "loss": 0.88526928, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 0.45141602, + "step": 900, + "time_per_iteration": 2.6987335681915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117213, + "balance_loss_mlp": 1.12244344, + "epoch": 0.17333589842247019, + "flos": 503609716224.0, + "grad_norm": 0.06597136758704356, + "language_loss": 0.87749296, + "learning_rate": 0.0009470904472355235, + "loss": 0.88921428, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 0.49707031, + "step": 901, + "time_per_iteration": 2.6920526027679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133522, + "balance_loss_mlp": 1.08898544, + "epoch": 0.17352828010773375, + "flos": 556208446464.0, + "grad_norm": 0.06929151708835651, + "language_loss": 0.8084361, + "learning_rate": 0.0009469508811419626, + "loss": 0.81977129, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 0.44555664, + "step": 902, + "time_per_iteration": 2.7087764739990234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037611, + "balance_loss_mlp": 1.01825094, + "epoch": 0.1737206617929973, + "flos": 1554525292032.0, + "grad_norm": 0.018918236495105482, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.7265144, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 0.19335938, + "step": 903, + "time_per_iteration": 4.831868648529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130376, + "balance_loss_mlp": 1.08429003, + "epoch": 0.17391304347826086, + "flos": 516662945280.0, + "grad_norm": 0.06904883588321564, + "language_loss": 0.84871197, + "learning_rate": 0.0009466712284439292, + "loss": 0.86001575, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 0.46118164, + "step": 904, + "time_per_iteration": 2.727154493331909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135856, + "balance_loss_mlp": 1.08867335, + "epoch": 0.17410542516352442, + "flos": 541049974272.0, + "grad_norm": 0.0797697294198037, + "language_loss": 0.90077758, + "learning_rate": 0.0009465311419480276, + "loss": 0.9121362, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 0.47216797, + "step": 905, + "time_per_iteration": 2.659696340560913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130539, + "balance_loss_mlp": 1.0859549, + "epoch": 0.17429780684878798, + "flos": 623849651712.0, + "grad_norm": 0.0780460064240459, + "language_loss": 0.89685637, + "learning_rate": 0.0009463908820933622, + "loss": 0.90816176, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 0.44604492, + "step": 906, + "time_per_iteration": 2.845508337020874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153159, + "balance_loss_mlp": 1.10657179, + "epoch": 0.17449018853405157, + "flos": 575663666688.0, + "grad_norm": 0.06621529993663824, + "language_loss": 0.83420271, + "learning_rate": 0.0009462504489343868, + "loss": 0.84573436, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 0.46582031, + "step": 907, + "time_per_iteration": 2.7415342330932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152101, + "balance_loss_mlp": 1.10246193, + "epoch": 0.17468257021931513, + "flos": 533753763840.0, + "grad_norm": 0.0823987818854668, + "language_loss": 0.9018122, + "learning_rate": 0.0009461098425256222, + "loss": 0.91333324, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 0.49633789, + "step": 908, + "time_per_iteration": 2.5904529094696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160396, + "balance_loss_mlp": 1.11457169, + "epoch": 0.1748749519045787, + "flos": 540758509056.0, + "grad_norm": 0.0762262609163865, + "language_loss": 0.87090451, + "learning_rate": 0.0009459690629216567, + "loss": 0.88250846, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 0.45874023, + "step": 909, + "time_per_iteration": 2.61710524559021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155202, + "balance_loss_mlp": 1.10921121, + "epoch": 0.17506733358984225, + "flos": 498623579136.0, + "grad_norm": 0.06657664395828655, + "language_loss": 0.88943893, + "learning_rate": 0.0009458281101771457, + "loss": 0.90099096, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 0.46020508, + "step": 910, + "time_per_iteration": 2.6421282291412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176316, + "balance_loss_mlp": 1.12810779, + "epoch": 0.1752597152751058, + "flos": 622923125760.0, + "grad_norm": 0.08799417436837091, + "language_loss": 0.8354404, + "learning_rate": 0.0009456869843468122, + "loss": 0.84720349, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 0.48217773, + "step": 911, + "time_per_iteration": 2.8633837699890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178078, + "balance_loss_mlp": 1.12688971, + "epoch": 0.17545209696036937, + "flos": 520972176384.0, + "grad_norm": 0.08410877580390771, + "language_loss": 0.79552639, + "learning_rate": 0.0009455456854854459, + "loss": 0.80730712, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 0.51220703, + "step": 912, + "time_per_iteration": 2.661038875579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180916, + "balance_loss_mlp": 1.13564038, + "epoch": 0.17564447864563293, + "flos": 461988707328.0, + "grad_norm": 0.17307911593328887, + "language_loss": 0.85480136, + "learning_rate": 0.0009454042136479039, + "loss": 0.86661053, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 0.45263672, + "step": 913, + "time_per_iteration": 2.561790943145752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198041, + "balance_loss_mlp": 1.15183568, + "epoch": 0.1758368603308965, + "flos": 480655793664.0, + "grad_norm": 0.06959724621682493, + "language_loss": 0.8438077, + "learning_rate": 0.0009452625688891103, + "loss": 0.85578811, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 0.4621582, + "step": 914, + "time_per_iteration": 2.5396227836608887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092507, + "balance_loss_mlp": 1.07600832, + "epoch": 0.17602924201616005, + "flos": 1478942903808.0, + "grad_norm": 0.034614734916794516, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79827243, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 0.16503906, + "step": 915, + "time_per_iteration": 4.550157308578491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264875, + "balance_loss_mlp": 1.21347213, + "epoch": 0.17622162370142364, + "flos": 602301671424.0, + "grad_norm": 0.08235911171958209, + "language_loss": 0.94223297, + "learning_rate": 0.0009449787608278015, + "loss": 0.95488179, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 0.51489258, + "step": 916, + "time_per_iteration": 2.8292665481567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243722, + "balance_loss_mlp": 1.19525158, + "epoch": 0.1764140053866872, + "flos": 442699043328.0, + "grad_norm": 0.08361954447634375, + "language_loss": 0.9338274, + "learning_rate": 0.0009448365976354704, + "loss": 0.94626462, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 0.48461914, + "step": 917, + "time_per_iteration": 2.543883800506592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216482, + "balance_loss_mlp": 1.16622329, + "epoch": 0.17660638707195075, + "flos": 500607682560.0, + "grad_norm": 0.08482517786251102, + "language_loss": 0.91736883, + "learning_rate": 0.0009446942617422558, + "loss": 0.9295336, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 0.50317383, + "step": 918, + "time_per_iteration": 2.6130669116973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118235, + "balance_loss_mlp": 1.13740778, + "epoch": 0.17679876875721431, + "flos": 538892974080.0, + "grad_norm": 0.07957198864097685, + "language_loss": 0.8648746, + "learning_rate": 0.0009445517532034176, + "loss": 0.87669808, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 0.44970703, + "step": 919, + "time_per_iteration": 2.7341010570526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116033, + "balance_loss_mlp": 1.11002386, + "epoch": 0.17699115044247787, + "flos": 497724217344.0, + "grad_norm": 0.08371374964142012, + "language_loss": 0.9020586, + "learning_rate": 0.0009444090720742824, + "loss": 0.9136619, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 0.50341797, + "step": 920, + "time_per_iteration": 2.628169298171997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158892, + "balance_loss_mlp": 1.1083951, + "epoch": 0.17718353212774143, + "flos": 662738070528.0, + "grad_norm": 0.07483188289837522, + "language_loss": 0.89025688, + "learning_rate": 0.0009442662184102439, + "loss": 0.90184581, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 0.50512695, + "step": 921, + "time_per_iteration": 2.7538435459136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154828, + "balance_loss_mlp": 1.11210358, + "epoch": 0.177375913813005, + "flos": 582641247744.0, + "grad_norm": 0.05276545299780942, + "language_loss": 0.88537991, + "learning_rate": 0.000944123192266763, + "loss": 0.89692819, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 0.42724609, + "step": 922, + "time_per_iteration": 2.788759469985962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190709, + "balance_loss_mlp": 1.13887644, + "epoch": 0.17756829549826855, + "flos": 552564011520.0, + "grad_norm": 0.07681776188261369, + "language_loss": 0.84657156, + "learning_rate": 0.0009439799936993671, + "loss": 0.85847867, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 0.51904297, + "step": 923, + "time_per_iteration": 2.7123734951019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196866, + "balance_loss_mlp": 1.14787149, + "epoch": 0.17776067718353214, + "flos": 556322245632.0, + "grad_norm": 0.09732559260361714, + "language_loss": 0.89131558, + "learning_rate": 0.0009438366227636511, + "loss": 0.90328419, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 0.49047852, + "step": 924, + "time_per_iteration": 2.6907341480255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171193, + "balance_loss_mlp": 1.12396216, + "epoch": 0.1779530588687957, + "flos": 658458574848.0, + "grad_norm": 0.07379366042998667, + "language_loss": 0.86971134, + "learning_rate": 0.0009436930795152763, + "loss": 0.88142323, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 0.47241211, + "step": 925, + "time_per_iteration": 2.865673065185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168215, + "balance_loss_mlp": 1.12174773, + "epoch": 0.17814544055405926, + "flos": 644483589120.0, + "grad_norm": 0.07469970420174622, + "language_loss": 0.8767308, + "learning_rate": 0.0009435493640099713, + "loss": 0.88841295, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 0.46411133, + "step": 926, + "time_per_iteration": 2.779188394546509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154616, + "balance_loss_mlp": 1.10388088, + "epoch": 0.17833782223932282, + "flos": 460913877504.0, + "grad_norm": 0.06972760602295516, + "language_loss": 0.85458124, + "learning_rate": 0.0009434054763035314, + "loss": 0.86612737, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 0.50756836, + "step": 927, + "time_per_iteration": 2.5972957611083984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147995, + "balance_loss_mlp": 1.09983397, + "epoch": 0.17853020392458638, + "flos": 759539897856.0, + "grad_norm": 0.05666425765353489, + "language_loss": 0.86302543, + "learning_rate": 0.0009432614164518185, + "loss": 0.8745054, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 0.48168945, + "step": 928, + "time_per_iteration": 3.0064406394958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150828, + "balance_loss_mlp": 1.09780383, + "epoch": 0.17872258560984994, + "flos": 782666717184.0, + "grad_norm": 0.07484249942420804, + "language_loss": 0.85464913, + "learning_rate": 0.000943117184510762, + "loss": 0.86615741, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 0.53027344, + "step": 929, + "time_per_iteration": 2.9855945110321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124448, + "balance_loss_mlp": 1.10556555, + "epoch": 0.1789149672951135, + "flos": 1459880464896.0, + "grad_norm": 0.03465095249088487, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.79914415, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 0.18847656, + "step": 930, + "time_per_iteration": 5.016055583953857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148447, + "balance_loss_mlp": 1.09997642, + "epoch": 0.17910734898037706, + "flos": 503864105472.0, + "grad_norm": 0.07304481613225793, + "language_loss": 0.89790976, + "learning_rate": 0.0009428282045846674, + "loss": 0.90939426, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 0.48461914, + "step": 931, + "time_per_iteration": 2.787473678588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134009, + "balance_loss_mlp": 1.08797026, + "epoch": 0.17929973066564064, + "flos": 746249158656.0, + "grad_norm": 0.05043968313129053, + "language_loss": 0.90432143, + "learning_rate": 0.0009426834567118214, + "loss": 0.91566151, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 0.46044922, + "step": 932, + "time_per_iteration": 3.1106340885162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149699, + "balance_loss_mlp": 1.10091829, + "epoch": 0.1794921123509042, + "flos": 713214305280.0, + "grad_norm": 0.0884624873286247, + "language_loss": 0.81563932, + "learning_rate": 0.0009425385369740155, + "loss": 0.82713628, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 0.48779297, + "step": 933, + "time_per_iteration": 3.056328296661377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164469, + "balance_loss_mlp": 1.1138767, + "epoch": 0.17968449403616776, + "flos": 633142448640.0, + "grad_norm": 0.0672899912264689, + "language_loss": 0.88411558, + "learning_rate": 0.0009423934454275125, + "loss": 0.8957603, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 0.50561523, + "step": 934, + "time_per_iteration": 2.827507495880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162381, + "balance_loss_mlp": 1.11333871, + "epoch": 0.17987687572143132, + "flos": 536323368960.0, + "grad_norm": 0.07880287247644589, + "language_loss": 0.92845738, + "learning_rate": 0.0009422481821286418, + "loss": 0.94008112, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 0.49072266, + "step": 935, + "time_per_iteration": 2.7188265323638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164456, + "balance_loss_mlp": 1.11918044, + "epoch": 0.18006925740669488, + "flos": 538077676032.0, + "grad_norm": 0.07978340192275198, + "language_loss": 0.88968349, + "learning_rate": 0.0009421027471337998, + "loss": 0.90132797, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 0.45239258, + "step": 936, + "time_per_iteration": 2.6140947341918945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176728, + "balance_loss_mlp": 1.1271131, + "epoch": 0.18026163909195844, + "flos": 539510782464.0, + "grad_norm": 0.07049523693926517, + "language_loss": 0.83782339, + "learning_rate": 0.0009419571404994493, + "loss": 0.84959066, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 0.49584961, + "step": 937, + "time_per_iteration": 2.641847610473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162302, + "balance_loss_mlp": 1.11354589, + "epoch": 0.180454020777222, + "flos": 500642187264.0, + "grad_norm": 0.06745021535989586, + "language_loss": 0.91665328, + "learning_rate": 0.00094181136228212, + "loss": 0.92827624, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 0.48803711, + "step": 938, + "time_per_iteration": 2.622314453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146811, + "balance_loss_mlp": 1.10334706, + "epoch": 0.18064640246248556, + "flos": 498952120320.0, + "grad_norm": 0.06209482952821168, + "language_loss": 0.87085009, + "learning_rate": 0.0009416654125384077, + "loss": 0.88231826, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 0.43432617, + "step": 939, + "time_per_iteration": 2.735565423965454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167753, + "balance_loss_mlp": 1.15230346, + "epoch": 0.18083878414774912, + "flos": 1519313988096.0, + "grad_norm": 0.039552666267989665, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.80940127, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 0.15429688, + "step": 940, + "time_per_iteration": 4.9464662075042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147706, + "balance_loss_mlp": 1.10293126, + "epoch": 0.1810311658330127, + "flos": 727337594880.0, + "grad_norm": 0.06405620484007693, + "language_loss": 0.85002685, + "learning_rate": 0.000941372998698552, + "loss": 0.86150396, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 0.44750977, + "step": 941, + "time_per_iteration": 2.9421255588531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152914, + "balance_loss_mlp": 1.10344219, + "epoch": 0.18122354751827627, + "flos": 564923082240.0, + "grad_norm": 0.07883971857950696, + "language_loss": 0.82437575, + "learning_rate": 0.0009412265347159336, + "loss": 0.8359049, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 0.49487305, + "step": 942, + "time_per_iteration": 2.727071762084961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135445, + "balance_loss_mlp": 1.09083664, + "epoch": 0.18141592920353983, + "flos": 519282109440.0, + "grad_norm": 0.10057326993772005, + "language_loss": 0.85614288, + "learning_rate": 0.0009410798994339829, + "loss": 0.86749732, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 0.44604492, + "step": 943, + "time_per_iteration": 2.6305696964263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134993, + "balance_loss_mlp": 1.09248304, + "epoch": 0.1816083108888034, + "flos": 512470084608.0, + "grad_norm": 0.05478952043416941, + "language_loss": 0.88907182, + "learning_rate": 0.000940933092909628, + "loss": 0.90042174, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 0.42529297, + "step": 944, + "time_per_iteration": 2.631101369857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149384, + "balance_loss_mlp": 1.10530019, + "epoch": 0.18180069257406695, + "flos": 492389715456.0, + "grad_norm": 0.06051663433249254, + "language_loss": 0.84961444, + "learning_rate": 0.0009407861151998649, + "loss": 0.8611083, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 0.44067383, + "step": 945, + "time_per_iteration": 2.5717978477478027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116837, + "balance_loss_mlp": 1.12040067, + "epoch": 0.1819930742593305, + "flos": 570158839296.0, + "grad_norm": 0.06666982795430461, + "language_loss": 0.87044382, + "learning_rate": 0.0009406389663617552, + "loss": 0.88212758, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 0.47998047, + "step": 946, + "time_per_iteration": 2.6768407821655273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170796, + "balance_loss_mlp": 1.12757087, + "epoch": 0.18218545594459407, + "flos": 605975841792.0, + "grad_norm": 0.0759743739596538, + "language_loss": 0.87192827, + "learning_rate": 0.000940491646452427, + "loss": 0.88363624, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 0.43212891, + "step": 947, + "time_per_iteration": 2.7174758911132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174804, + "balance_loss_mlp": 1.1271199, + "epoch": 0.18237783762985763, + "flos": 548682439680.0, + "grad_norm": 0.06285362616764655, + "language_loss": 0.91503757, + "learning_rate": 0.000940344155529075, + "loss": 0.92678559, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 0.47680664, + "step": 948, + "time_per_iteration": 2.6130924224853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175811, + "balance_loss_mlp": 1.12643504, + "epoch": 0.1825702193151212, + "flos": 450741542400.0, + "grad_norm": 0.07182633578445446, + "language_loss": 0.88395435, + "learning_rate": 0.0009401964936489605, + "loss": 0.89571244, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 0.4934082, + "step": 949, + "time_per_iteration": 2.518735885620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154077, + "balance_loss_mlp": 1.11173368, + "epoch": 0.18276260100038477, + "flos": 589245871104.0, + "grad_norm": 0.08616214546245322, + "language_loss": 0.86381257, + "learning_rate": 0.0009400486608694108, + "loss": 0.87535334, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 0.42358398, + "step": 950, + "time_per_iteration": 2.7356269359588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147825, + "balance_loss_mlp": 1.10071373, + "epoch": 0.18295498268564833, + "flos": 787331653632.0, + "grad_norm": 0.05684050086710682, + "language_loss": 0.88146299, + "learning_rate": 0.0009399006572478195, + "loss": 0.89294124, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 0.47119141, + "step": 951, + "time_per_iteration": 3.0829784870147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113898, + "balance_loss_mlp": 1.09449124, + "epoch": 0.1831473643709119, + "flos": 578147010048.0, + "grad_norm": 0.06809630737889293, + "language_loss": 0.91594249, + "learning_rate": 0.0009397524828416468, + "loss": 0.92733228, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 0.44482422, + "step": 952, + "time_per_iteration": 2.710500478744507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141316, + "balance_loss_mlp": 1.09339356, + "epoch": 0.18333974605617545, + "flos": 566889933312.0, + "grad_norm": 0.06814185159234107, + "language_loss": 0.97457635, + "learning_rate": 0.0009396041377084192, + "loss": 0.98598951, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 0.47949219, + "step": 953, + "time_per_iteration": 2.6530585289001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011339, + "balance_loss_mlp": 1.08716977, + "epoch": 0.183532127741439, + "flos": 526993496064.0, + "grad_norm": 0.06688505748067412, + "language_loss": 0.88496006, + "learning_rate": 0.0009394556219057295, + "loss": 0.896299, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 0.46704102, + "step": 954, + "time_per_iteration": 2.662543773651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135904, + "balance_loss_mlp": 1.08948374, + "epoch": 0.18372450942670257, + "flos": 594535956480.0, + "grad_norm": 0.08148035498798997, + "language_loss": 0.84775722, + "learning_rate": 0.0009393069354912362, + "loss": 0.85911626, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 0.46386719, + "step": 955, + "time_per_iteration": 2.7262632846832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139168, + "balance_loss_mlp": 1.0954181, + "epoch": 0.18391689111196613, + "flos": 645032014848.0, + "grad_norm": 0.07343823471440349, + "language_loss": 0.83466816, + "learning_rate": 0.0009391580785226649, + "loss": 0.8460598, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 0.43774414, + "step": 956, + "time_per_iteration": 2.8661141395568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066727, + "balance_loss_mlp": 1.04708123, + "epoch": 0.18410927279722972, + "flos": 1457073349632.0, + "grad_norm": 0.029557521366383285, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.80407178, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 0.19628906, + "step": 957, + "time_per_iteration": 4.751030921936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134727, + "balance_loss_mlp": 1.08978534, + "epoch": 0.18430165448249328, + "flos": 658750040064.0, + "grad_norm": 0.06490118531587029, + "language_loss": 0.87677503, + "learning_rate": 0.0009388598531545196, + "loss": 0.88812232, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 0.44946289, + "step": 958, + "time_per_iteration": 2.8378970623016357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143042, + "balance_loss_mlp": 1.09702718, + "epoch": 0.18449403616775684, + "flos": 517933066752.0, + "grad_norm": 0.07391212127287443, + "language_loss": 0.86896807, + "learning_rate": 0.000938710484870727, + "loss": 0.88039851, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 0.46044922, + "step": 959, + "time_per_iteration": 4.31168794631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128823, + "balance_loss_mlp": 1.08416748, + "epoch": 0.1846864178530204, + "flos": 552749391360.0, + "grad_norm": 0.0638837232249089, + "language_loss": 0.86957002, + "learning_rate": 0.0009385609462644189, + "loss": 0.88085824, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 0.44702148, + "step": 960, + "time_per_iteration": 2.6793572902679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118991, + "balance_loss_mlp": 1.07233214, + "epoch": 0.18487879953828396, + "flos": 466166886912.0, + "grad_norm": 0.07248975394705585, + "language_loss": 0.86711299, + "learning_rate": 0.0009384112373936514, + "loss": 0.87830293, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 0.46679688, + "step": 961, + "time_per_iteration": 2.6220860481262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119858, + "balance_loss_mlp": 1.07334304, + "epoch": 0.18507118122354752, + "flos": 648496212480.0, + "grad_norm": 0.06813544125014795, + "language_loss": 0.92053163, + "learning_rate": 0.0009382613583165467, + "loss": 0.93173021, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 0.46533203, + "step": 962, + "time_per_iteration": 2.8032093048095703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108588, + "balance_loss_mlp": 1.06142831, + "epoch": 0.18526356290881107, + "flos": 626772764160.0, + "grad_norm": 0.07296294799157402, + "language_loss": 0.9064188, + "learning_rate": 0.0009381113090912928, + "loss": 0.91750467, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 0.47167969, + "step": 963, + "time_per_iteration": 2.7358789443969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109929, + "balance_loss_mlp": 1.06741881, + "epoch": 0.18545594459407463, + "flos": 432726769152.0, + "grad_norm": 0.07962159601741099, + "language_loss": 0.90353996, + "learning_rate": 0.000937961089776144, + "loss": 0.91463923, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 0.42480469, + "step": 964, + "time_per_iteration": 2.5761237144470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128672, + "balance_loss_mlp": 1.07924736, + "epoch": 0.1856483262793382, + "flos": 749061043200.0, + "grad_norm": 0.09082243760489998, + "language_loss": 0.83673573, + "learning_rate": 0.0009378107004294208, + "loss": 0.84802246, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 0.49438477, + "step": 965, + "time_per_iteration": 2.9681291580200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132442, + "balance_loss_mlp": 1.08542585, + "epoch": 0.18584070796460178, + "flos": 530326642176.0, + "grad_norm": 0.08405098410424734, + "language_loss": 0.92054594, + "learning_rate": 0.0009376601411095096, + "loss": 0.93187034, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 0.4699707, + "step": 966, + "time_per_iteration": 2.696122407913208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138698, + "balance_loss_mlp": 1.09773731, + "epoch": 0.18603308964986534, + "flos": 483106830336.0, + "grad_norm": 0.07104128547690361, + "language_loss": 0.87554526, + "learning_rate": 0.0009375094118748622, + "loss": 0.88693225, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 0.40991211, + "step": 967, + "time_per_iteration": 2.6025850772857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179223, + "balance_loss_mlp": 1.13373268, + "epoch": 0.1862254713351289, + "flos": 801316551168.0, + "grad_norm": 0.0728928893981835, + "language_loss": 0.91626799, + "learning_rate": 0.0009373585127839976, + "loss": 0.92806023, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 0.45507812, + "step": 968, + "time_per_iteration": 2.9854021072387695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212732, + "balance_loss_mlp": 1.16905367, + "epoch": 0.18641785302039246, + "flos": 478323325440.0, + "grad_norm": 0.08777237711590531, + "language_loss": 0.91368866, + "learning_rate": 0.0009372074438954994, + "loss": 0.92581606, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 0.43652344, + "step": 969, + "time_per_iteration": 2.5014536380767822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211792, + "balance_loss_mlp": 1.16539574, + "epoch": 0.18661023470565602, + "flos": 388911684096.0, + "grad_norm": 0.0704882552763471, + "language_loss": 0.92436379, + "learning_rate": 0.0009370562052680181, + "loss": 0.93648171, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 0.46411133, + "step": 970, + "time_per_iteration": 2.453458070755005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120766, + "balance_loss_mlp": 1.16183591, + "epoch": 0.18680261639091958, + "flos": 564676033536.0, + "grad_norm": 0.07372597108689087, + "language_loss": 0.89988613, + "learning_rate": 0.0009369047969602695, + "loss": 0.91196281, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 0.45825195, + "step": 971, + "time_per_iteration": 2.703948497772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192702, + "balance_loss_mlp": 1.14396954, + "epoch": 0.18699499807618314, + "flos": 479259763200.0, + "grad_norm": 0.08557962606734577, + "language_loss": 0.8750906, + "learning_rate": 0.0009367532190310357, + "loss": 0.88701761, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 0.48657227, + "step": 972, + "time_per_iteration": 4.1564977169036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148896, + "balance_loss_mlp": 1.1052649, + "epoch": 0.1871873797614467, + "flos": 553283136000.0, + "grad_norm": 0.06811184838385763, + "language_loss": 0.89467651, + "learning_rate": 0.0009366014715391644, + "loss": 0.90616548, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 0.43603516, + "step": 973, + "time_per_iteration": 2.695730209350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134701, + "balance_loss_mlp": 1.09307301, + "epoch": 0.18737976144671029, + "flos": 552811060224.0, + "grad_norm": 0.054567817192194557, + "language_loss": 0.84347546, + "learning_rate": 0.0009364495545435693, + "loss": 0.85482252, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 0.41625977, + "step": 974, + "time_per_iteration": 2.828831672668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146218, + "balance_loss_mlp": 1.09970224, + "epoch": 0.18757214313197385, + "flos": 502250761728.0, + "grad_norm": 0.08256927623824414, + "language_loss": 0.89333141, + "learning_rate": 0.0009362974681032297, + "loss": 0.90479362, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 0.46484375, + "step": 975, + "time_per_iteration": 2.5982418060302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143654, + "balance_loss_mlp": 1.09909391, + "epoch": 0.1877645248172374, + "flos": 675010506240.0, + "grad_norm": 0.07754570301250979, + "language_loss": 0.89447427, + "learning_rate": 0.0009361452122771907, + "loss": 0.90591079, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 0.44555664, + "step": 976, + "time_per_iteration": 2.881242275238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133689, + "balance_loss_mlp": 1.08834195, + "epoch": 0.18795690650250096, + "flos": 404989341696.0, + "grad_norm": 0.0965092241218366, + "language_loss": 0.84541976, + "learning_rate": 0.0009359927871245635, + "loss": 0.85675669, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 0.45361328, + "step": 977, + "time_per_iteration": 2.4720265865325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113596, + "balance_loss_mlp": 1.09039843, + "epoch": 0.18814928818776452, + "flos": 637891448832.0, + "grad_norm": 0.09227923665031239, + "language_loss": 0.87538362, + "learning_rate": 0.0009358401927045246, + "loss": 0.88674331, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 0.45581055, + "step": 978, + "time_per_iteration": 2.8225297927856445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140746, + "balance_loss_mlp": 1.0945406, + "epoch": 0.18834166987302808, + "flos": 1138282191360.0, + "grad_norm": 0.05953389716062443, + "language_loss": 0.88990903, + "learning_rate": 0.0009356874290763166, + "loss": 0.90131652, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 0.46264648, + "step": 979, + "time_per_iteration": 3.4754927158355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140535, + "balance_loss_mlp": 1.09494936, + "epoch": 0.18853405155829164, + "flos": 504793202688.0, + "grad_norm": 0.06969100284100371, + "language_loss": 0.89955008, + "learning_rate": 0.0009355344962992474, + "loss": 0.91095543, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 0.45581055, + "step": 980, + "time_per_iteration": 2.6008429527282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138568, + "balance_loss_mlp": 1.09291101, + "epoch": 0.1887264332435552, + "flos": 608177258496.0, + "grad_norm": 0.07021551702573088, + "language_loss": 0.88888156, + "learning_rate": 0.0009353813944326908, + "loss": 0.90026724, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 0.45654297, + "step": 981, + "time_per_iteration": 2.9102253913879395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141833, + "balance_loss_mlp": 1.09352899, + "epoch": 0.1889188149288188, + "flos": 552529506816.0, + "grad_norm": 0.0640154196439605, + "language_loss": 0.83560127, + "learning_rate": 0.0009352281235360863, + "loss": 0.84701967, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 0.4831543, + "step": 982, + "time_per_iteration": 2.690695285797119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149234, + "balance_loss_mlp": 1.10627127, + "epoch": 0.18911119661408235, + "flos": 418559063040.0, + "grad_norm": 0.06254433649037737, + "language_loss": 0.85791624, + "learning_rate": 0.0009350746836689389, + "loss": 0.86940861, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 0.4296875, + "step": 983, + "time_per_iteration": 2.524491548538208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104727, + "balance_loss_mlp": 1.02905524, + "epoch": 0.1893035782993459, + "flos": 1481974299648.0, + "grad_norm": 0.024687708549402564, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82486492, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.18261719, + "step": 984, + "time_per_iteration": 5.200335741043091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156154, + "balance_loss_mlp": 1.1069684, + "epoch": 0.18949595998460947, + "flos": 508467373056.0, + "grad_norm": 0.08202626484000469, + "language_loss": 0.84151661, + "learning_rate": 0.0009347672972613634, + "loss": 0.85307819, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.49145508, + "step": 985, + "time_per_iteration": 2.6939473152160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011517, + "balance_loss_mlp": 1.10756862, + "epoch": 0.18968834166987303, + "flos": 531087611904.0, + "grad_norm": 0.061889675774481866, + "language_loss": 0.8651796, + "learning_rate": 0.0009346133508402735, + "loss": 0.87669659, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.44140625, + "step": 986, + "time_per_iteration": 2.695004463195801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146205, + "balance_loss_mlp": 1.1000948, + "epoch": 0.1898807233551366, + "flos": 499762649088.0, + "grad_norm": 0.07730871241699967, + "language_loss": 0.84821075, + "learning_rate": 0.0009344592356873166, + "loss": 0.85967278, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.46118164, + "step": 987, + "time_per_iteration": 2.635143518447876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143677, + "balance_loss_mlp": 1.0975666, + "epoch": 0.19007310504040015, + "flos": 602220178944.0, + "grad_norm": 0.058246004489727894, + "language_loss": 0.79289091, + "learning_rate": 0.0009343049518623255, + "loss": 0.80432773, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.46142578, + "step": 988, + "time_per_iteration": 2.7257165908813477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126709, + "balance_loss_mlp": 1.08503366, + "epoch": 0.1902654867256637, + "flos": 601651929600.0, + "grad_norm": 0.06464318177286693, + "language_loss": 0.83752143, + "learning_rate": 0.0009341504994251985, + "loss": 0.84878862, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.41674805, + "step": 989, + "time_per_iteration": 2.8336057662963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052089, + "balance_loss_mlp": 1.03692603, + "epoch": 0.19045786841092727, + "flos": 1575784005120.0, + "grad_norm": 0.01962059038868396, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74572587, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.15136719, + "step": 990, + "time_per_iteration": 4.980287551879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118455, + "balance_loss_mlp": 1.07682681, + "epoch": 0.19065025009619085, + "flos": 681634579968.0, + "grad_norm": 0.06360467015426281, + "language_loss": 0.82411575, + "learning_rate": 0.0009338410889544574, + "loss": 0.83530033, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 0.41601562, + "step": 991, + "time_per_iteration": 3.0192768573760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123102, + "balance_loss_mlp": 1.0790422, + "epoch": 0.1908426317814544, + "flos": 602264595456.0, + "grad_norm": 0.06107834506241764, + "language_loss": 0.88440853, + "learning_rate": 0.000933686131040967, + "loss": 0.89563954, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 0.44067383, + "step": 992, + "time_per_iteration": 2.795952796936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118187, + "balance_loss_mlp": 1.07479525, + "epoch": 0.19103501346671797, + "flos": 586308077568.0, + "grad_norm": 0.08075044213119366, + "language_loss": 0.91145802, + "learning_rate": 0.0009335310047555883, + "loss": 0.92263985, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 0.43383789, + "step": 993, + "time_per_iteration": 2.6966800689697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144036, + "balance_loss_mlp": 1.10052443, + "epoch": 0.19122739515198153, + "flos": 545761898496.0, + "grad_norm": 0.06789475617385991, + "language_loss": 0.89048505, + "learning_rate": 0.0009333757101585467, + "loss": 0.90192544, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 0.43554688, + "step": 994, + "time_per_iteration": 2.659120559692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159671, + "balance_loss_mlp": 1.11687493, + "epoch": 0.1914197768372451, + "flos": 521446450176.0, + "grad_norm": 0.05475551086737561, + "language_loss": 0.94071913, + "learning_rate": 0.0009332202473101329, + "loss": 0.95231587, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 0.42822266, + "step": 995, + "time_per_iteration": 2.672307014465332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153956, + "balance_loss_mlp": 1.11011088, + "epoch": 0.19161215852250865, + "flos": 611246103552.0, + "grad_norm": 0.060816834986447306, + "language_loss": 0.8370983, + "learning_rate": 0.0009330646162707028, + "loss": 0.84863788, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 0.4387207, + "step": 996, + "time_per_iteration": 2.7483248710632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155376, + "balance_loss_mlp": 1.11274719, + "epoch": 0.1918045402077722, + "flos": 846660916224.0, + "grad_norm": 0.05013127115514869, + "language_loss": 0.85195571, + "learning_rate": 0.0009329088171006779, + "loss": 0.86350954, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.42626953, + "step": 997, + "time_per_iteration": 3.1445202827453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163134, + "balance_loss_mlp": 1.1197654, + "epoch": 0.19199692189303577, + "flos": 465937090560.0, + "grad_norm": 0.07353815647154911, + "language_loss": 0.86074895, + "learning_rate": 0.0009327528498605446, + "loss": 0.87238026, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 0.43383789, + "step": 998, + "time_per_iteration": 2.536146402359009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159094, + "balance_loss_mlp": 1.11844337, + "epoch": 0.19218930357829936, + "flos": 531576940032.0, + "grad_norm": 0.06861677349241169, + "language_loss": 0.9080506, + "learning_rate": 0.0009325967146108548, + "loss": 0.91964149, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 0.40649414, + "step": 999, + "time_per_iteration": 2.634549617767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151834, + "balance_loss_mlp": 1.11049271, + "epoch": 0.19238168526356292, + "flos": 601624765440.0, + "grad_norm": 0.0672850368289366, + "language_loss": 0.88138115, + "learning_rate": 0.0009324404114122258, + "loss": 0.89289951, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 0.41357422, + "step": 1000, + "time_per_iteration": 2.677651882171631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164653, + "balance_loss_mlp": 1.12221444, + "epoch": 0.19257406694882648, + "flos": 571982155776.0, + "grad_norm": 0.06402741154285656, + "language_loss": 0.8710497, + "learning_rate": 0.0009322839403253397, + "loss": 0.88269627, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 0.42431641, + "step": 1001, + "time_per_iteration": 2.7528679370880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169088, + "balance_loss_mlp": 1.12440836, + "epoch": 0.19276644863409004, + "flos": 801813219840.0, + "grad_norm": 0.07104878229054386, + "language_loss": 0.84949791, + "learning_rate": 0.0009321273014109439, + "loss": 0.86118877, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.44702148, + "step": 1002, + "time_per_iteration": 2.9990484714508057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114889, + "balance_loss_mlp": 1.10523582, + "epoch": 0.1929588303193536, + "flos": 563314507776.0, + "grad_norm": 0.0673469195429183, + "language_loss": 0.85240018, + "learning_rate": 0.0009319704947298513, + "loss": 0.8638891, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.43676758, + "step": 1003, + "time_per_iteration": 2.8755459785461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141118, + "balance_loss_mlp": 1.10127831, + "epoch": 0.19315121200461716, + "flos": 626837004288.0, + "grad_norm": 0.0925310675323854, + "language_loss": 0.89122581, + "learning_rate": 0.0009318135203429393, + "loss": 0.902637, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.3984375, + "step": 1004, + "time_per_iteration": 2.771192789077759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127979, + "balance_loss_mlp": 1.0866611, + "epoch": 0.19334359368988072, + "flos": 517451079168.0, + "grad_norm": 0.05779097302789, + "language_loss": 0.88602638, + "learning_rate": 0.0009316563783111511, + "loss": 0.8973062, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.41308594, + "step": 1005, + "time_per_iteration": 2.7739861011505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113092, + "balance_loss_mlp": 1.08638334, + "epoch": 0.19353597537514428, + "flos": 694080285696.0, + "grad_norm": 0.06006842888316194, + "language_loss": 0.83199531, + "learning_rate": 0.0009314990686954943, + "loss": 0.84330451, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.44506836, + "step": 1006, + "time_per_iteration": 2.935081720352173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140843, + "balance_loss_mlp": 1.09561515, + "epoch": 0.19372835706040784, + "flos": 1210170585600.0, + "grad_norm": 0.0666735983489841, + "language_loss": 0.81657201, + "learning_rate": 0.000931341591557042, + "loss": 0.82798046, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.45263672, + "step": 1007, + "time_per_iteration": 3.7212610244750977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155404, + "balance_loss_mlp": 1.1041683, + "epoch": 0.19392073874567142, + "flos": 520631152128.0, + "grad_norm": 0.08115294197805281, + "language_loss": 0.87899536, + "learning_rate": 0.0009311839469569325, + "loss": 0.89054936, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.51171875, + "step": 1008, + "time_per_iteration": 2.6384472846984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150065, + "balance_loss_mlp": 1.10030699, + "epoch": 0.19411312043093498, + "flos": 588816013824.0, + "grad_norm": 0.07776470075981182, + "language_loss": 0.88065994, + "learning_rate": 0.0009310261349563687, + "loss": 0.89216053, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.49804688, + "step": 1009, + "time_per_iteration": 2.703058958053589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157544, + "balance_loss_mlp": 1.11160064, + "epoch": 0.19430550211619854, + "flos": 579382253568.0, + "grad_norm": 0.05519618089274153, + "language_loss": 0.86250293, + "learning_rate": 0.0009308681556166186, + "loss": 0.87407839, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.45922852, + "step": 1010, + "time_per_iteration": 2.8404791355133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177928, + "balance_loss_mlp": 1.12480855, + "epoch": 0.1944978838014621, + "flos": 621126973440.0, + "grad_norm": 0.10323239067467582, + "language_loss": 0.8870275, + "learning_rate": 0.0009307100089990152, + "loss": 0.89880681, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.53100586, + "step": 1011, + "time_per_iteration": 2.7103512287139893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185201, + "balance_loss_mlp": 1.13530004, + "epoch": 0.19469026548672566, + "flos": 598714136064.0, + "grad_norm": 0.08766026563197518, + "language_loss": 0.84582877, + "learning_rate": 0.0009305516951649568, + "loss": 0.8576808, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.49902344, + "step": 1012, + "time_per_iteration": 2.6905276775360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175674, + "balance_loss_mlp": 1.12818122, + "epoch": 0.19488264717198922, + "flos": 552161318400.0, + "grad_norm": 0.07259628373080033, + "language_loss": 0.87723738, + "learning_rate": 0.0009303932141759057, + "loss": 0.8889941, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.47485352, + "step": 1013, + "time_per_iteration": 2.7738490104675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161359, + "balance_loss_mlp": 1.11200666, + "epoch": 0.19507502885725278, + "flos": 666135456768.0, + "grad_norm": 0.07589756885314788, + "language_loss": 0.84698361, + "learning_rate": 0.0009302345660933902, + "loss": 0.85859716, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.49291992, + "step": 1014, + "time_per_iteration": 2.7809414863586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152692, + "balance_loss_mlp": 1.10579538, + "epoch": 0.19526741054251634, + "flos": 671081946624.0, + "grad_norm": 0.06636914889533592, + "language_loss": 0.85938931, + "learning_rate": 0.0009300757509790026, + "loss": 0.87091625, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.46875, + "step": 1015, + "time_per_iteration": 2.886200189590454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151123, + "balance_loss_mlp": 1.10324848, + "epoch": 0.19545979222777993, + "flos": 447215675904.0, + "grad_norm": 0.08384883211824797, + "language_loss": 0.91210115, + "learning_rate": 0.0009299167688944005, + "loss": 0.92361236, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.47827148, + "step": 1016, + "time_per_iteration": 2.5308799743652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135215, + "balance_loss_mlp": 1.09036839, + "epoch": 0.1956521739130435, + "flos": 569084009472.0, + "grad_norm": 0.07612639660839114, + "language_loss": 0.86733758, + "learning_rate": 0.0009297576199013063, + "loss": 0.87868977, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.44873047, + "step": 1017, + "time_per_iteration": 2.699352264404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156475, + "balance_loss_mlp": 1.14159799, + "epoch": 0.19584455559830705, + "flos": 1455749273088.0, + "grad_norm": 0.04987694814110311, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.74158609, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.1484375, + "step": 1018, + "time_per_iteration": 4.927512168884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099249, + "balance_loss_mlp": 1.08494341, + "epoch": 0.1960369372835706, + "flos": 1591150252032.0, + "grad_norm": 0.032347612483235935, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.80525547, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.14257812, + "step": 1019, + "time_per_iteration": 5.494646787643433 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128896, + "balance_loss_mlp": 1.08855522, + "epoch": 0.19622931896883417, + "flos": 616017125376.0, + "grad_norm": 0.06601293097738069, + "language_loss": 0.87223667, + "learning_rate": 0.0009292791720892659, + "loss": 0.88352561, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.40332031, + "step": 1020, + "time_per_iteration": 2.8718464374542236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133682, + "balance_loss_mlp": 1.08823943, + "epoch": 0.19642170065409773, + "flos": 466201391616.0, + "grad_norm": 0.07136038826441608, + "language_loss": 0.89387941, + "learning_rate": 0.0009291193560807218, + "loss": 0.90521628, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.45483398, + "step": 1021, + "time_per_iteration": 2.588604211807251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132851, + "balance_loss_mlp": 1.09141409, + "epoch": 0.19661408233936128, + "flos": 515289309696.0, + "grad_norm": 0.06738480994857221, + "language_loss": 0.87651652, + "learning_rate": 0.0009289593734732688, + "loss": 0.88784504, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.41430664, + "step": 1022, + "time_per_iteration": 2.5915818214416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129633, + "balance_loss_mlp": 1.09036541, + "epoch": 0.19680646402462484, + "flos": 392640182784.0, + "grad_norm": 0.06942729809827348, + "language_loss": 0.94984972, + "learning_rate": 0.0009287992243290175, + "loss": 0.96114612, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.39282227, + "step": 1023, + "time_per_iteration": 2.4477546215057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142342, + "balance_loss_mlp": 1.09880638, + "epoch": 0.19699884570988843, + "flos": 626421828096.0, + "grad_norm": 0.1017247644504036, + "language_loss": 0.91891634, + "learning_rate": 0.0009286389087101435, + "loss": 0.93033981, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.43554688, + "step": 1024, + "time_per_iteration": 2.765334129333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142412, + "balance_loss_mlp": 1.09942544, + "epoch": 0.197191227395152, + "flos": 557982577152.0, + "grad_norm": 0.07195718640229302, + "language_loss": 0.8893857, + "learning_rate": 0.0009284784266788864, + "loss": 0.90080982, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.42993164, + "step": 1025, + "time_per_iteration": 2.7323853969573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141995, + "balance_loss_mlp": 1.10327554, + "epoch": 0.19738360908041555, + "flos": 664993815552.0, + "grad_norm": 0.069193395974369, + "language_loss": 0.93259764, + "learning_rate": 0.0009283177782975512, + "loss": 0.94401753, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.38696289, + "step": 1026, + "time_per_iteration": 2.9729068279266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114551, + "balance_loss_mlp": 1.10142589, + "epoch": 0.1975759907656791, + "flos": 522496687104.0, + "grad_norm": 0.08755988500201482, + "language_loss": 0.88955659, + "learning_rate": 0.000928156963628507, + "loss": 0.90101171, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.44067383, + "step": 1027, + "time_per_iteration": 2.594200849533081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138947, + "balance_loss_mlp": 1.09855926, + "epoch": 0.19776837245094267, + "flos": 462482804736.0, + "grad_norm": 0.07316483198701504, + "language_loss": 0.89277303, + "learning_rate": 0.0009279959827341877, + "loss": 0.90416259, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.40405273, + "step": 1028, + "time_per_iteration": 2.7378368377685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140451, + "balance_loss_mlp": 1.09727335, + "epoch": 0.19796075413620623, + "flos": 503058719232.0, + "grad_norm": 0.059550544329949856, + "language_loss": 0.88526183, + "learning_rate": 0.0009278348356770915, + "loss": 0.89666629, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.43188477, + "step": 1029, + "time_per_iteration": 2.5737922191619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133825, + "balance_loss_mlp": 1.0914098, + "epoch": 0.1981531358214698, + "flos": 507538275840.0, + "grad_norm": 0.06393748023743129, + "language_loss": 0.8587814, + "learning_rate": 0.0009276735225197814, + "loss": 0.87011963, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.42431641, + "step": 1030, + "time_per_iteration": 2.648477077484131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146668, + "balance_loss_mlp": 1.10170269, + "epoch": 0.19834551750673335, + "flos": 531547204608.0, + "grad_norm": 0.06069855374703422, + "language_loss": 0.86812896, + "learning_rate": 0.0009275120433248847, + "loss": 0.87959564, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.44946289, + "step": 1031, + "time_per_iteration": 2.6862802505493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148453, + "balance_loss_mlp": 1.10327268, + "epoch": 0.1985378991919969, + "flos": 775511096832.0, + "grad_norm": 0.06482797348212818, + "language_loss": 0.87033594, + "learning_rate": 0.0009273503981550931, + "loss": 0.8818205, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.45166016, + "step": 1032, + "time_per_iteration": 3.0549416542053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157268, + "balance_loss_mlp": 1.11235023, + "epoch": 0.1987302808772605, + "flos": 434288355840.0, + "grad_norm": 0.07571303407420105, + "language_loss": 0.87661642, + "learning_rate": 0.0009271885870731626, + "loss": 0.88818914, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.44946289, + "step": 1033, + "time_per_iteration": 2.4938008785247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172373, + "balance_loss_mlp": 1.12495148, + "epoch": 0.19892266256252406, + "flos": 553604336640.0, + "grad_norm": 0.07801561202279184, + "language_loss": 0.89466584, + "learning_rate": 0.0009270266101419143, + "loss": 0.90638959, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.47460938, + "step": 1034, + "time_per_iteration": 2.61181378364563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169875, + "balance_loss_mlp": 1.12681675, + "epoch": 0.19911504424778761, + "flos": 549865926144.0, + "grad_norm": 0.07487269237991181, + "language_loss": 0.85762119, + "learning_rate": 0.0009268644674242328, + "loss": 0.86931992, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.43066406, + "step": 1035, + "time_per_iteration": 2.6761085987091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163027, + "balance_loss_mlp": 1.1147716, + "epoch": 0.19930742593305117, + "flos": 518281431552.0, + "grad_norm": 0.06997084642295975, + "language_loss": 0.81697071, + "learning_rate": 0.0009267021589830678, + "loss": 0.828601, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.4831543, + "step": 1036, + "time_per_iteration": 2.6166343688964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162737, + "balance_loss_mlp": 1.14547551, + "epoch": 0.19949980761831473, + "flos": 1509338769408.0, + "grad_norm": 0.04224955266067769, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.78789818, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 0.17285156, + "step": 1037, + "time_per_iteration": 4.932336330413818 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124804, + "balance_loss_mlp": 1.08224678, + "epoch": 0.1996921893035783, + "flos": 698129985024.0, + "grad_norm": 0.07370646472771722, + "language_loss": 0.9354341, + "learning_rate": 0.000926377045182406, + "loss": 0.94668216, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.42553711, + "step": 1038, + "time_per_iteration": 2.89486026763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122228, + "balance_loss_mlp": 1.07704759, + "epoch": 0.19988457098884185, + "flos": 727023734784.0, + "grad_norm": 0.06351485696264159, + "language_loss": 0.88915765, + "learning_rate": 0.0009262142399491296, + "loss": 0.9003799, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.4519043, + "step": 1039, + "time_per_iteration": 3.0843544006347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132553, + "balance_loss_mlp": 1.08784938, + "epoch": 0.2000769526741054, + "flos": 560544841728.0, + "grad_norm": 0.06429886269356283, + "language_loss": 0.89007306, + "learning_rate": 0.0009260512692448105, + "loss": 0.9013986, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.44677734, + "step": 1040, + "time_per_iteration": 2.7221181392669678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143871, + "balance_loss_mlp": 1.10071695, + "epoch": 0.200269334359369, + "flos": 572039055360.0, + "grad_norm": 0.0714265416650486, + "language_loss": 0.85044324, + "learning_rate": 0.000925888133132719, + "loss": 0.86188197, + "num_input_tokens_seen": 86289824, + "router_z_loss_mlp": 0.43164062, + "step": 1041, + "time_per_iteration": 2.7112865447998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113685, + "balance_loss_mlp": 1.09566069, + "epoch": 0.20046171604463256, + "flos": 1486118347776.0, + "grad_norm": 0.0301437897992815, + "language_loss": 0.79610431, + "learning_rate": 0.0009257248316761906, + "loss": 0.8072412, + "num_input_tokens_seen": 86516384, + "router_z_loss_mlp": 0.18066406, + "step": 1042, + "time_per_iteration": 4.913869380950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179762, + "balance_loss_mlp": 1.13338971, + "epoch": 0.20065409772989612, + "flos": 496528247808.0, + "grad_norm": 0.11345429965909062, + "language_loss": 0.82242954, + "learning_rate": 0.0009255613649386244, + "loss": 0.83422714, + "num_input_tokens_seen": 86587296, + "router_z_loss_mlp": 0.46337891, + "step": 1043, + "time_per_iteration": 2.6586339473724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153581, + "balance_loss_mlp": 1.11133325, + "epoch": 0.20084647941515968, + "flos": 579367572480.0, + "grad_norm": 0.07362734504976313, + "language_loss": 0.79954398, + "learning_rate": 0.0009253977329834838, + "loss": 0.81107974, + "num_input_tokens_seen": 86662656, + "router_z_loss_mlp": 0.42236328, + "step": 1044, + "time_per_iteration": 2.7028462886810303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143902, + "balance_loss_mlp": 1.0951457, + "epoch": 0.20103886110042324, + "flos": 642076968960.0, + "grad_norm": 0.07842723007783056, + "language_loss": 0.8753317, + "learning_rate": 0.0009252339358742965, + "loss": 0.88677073, + "num_input_tokens_seen": 86734704, + "router_z_loss_mlp": 0.48779297, + "step": 1045, + "time_per_iteration": 2.8069612979888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139165, + "balance_loss_mlp": 1.0902648, + "epoch": 0.2012312427856868, + "flos": 441970007040.0, + "grad_norm": 0.07197327624603128, + "language_loss": 0.84128577, + "learning_rate": 0.000925069973674654, + "loss": 0.85267735, + "num_input_tokens_seen": 86806512, + "router_z_loss_mlp": 0.48925781, + "step": 1046, + "time_per_iteration": 2.603602409362793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136955, + "balance_loss_mlp": 1.09303868, + "epoch": 0.20142362447095036, + "flos": 554402382336.0, + "grad_norm": 0.06199919012721526, + "language_loss": 0.89849102, + "learning_rate": 0.000924905846448212, + "loss": 0.90986055, + "num_input_tokens_seen": 86883440, + "router_z_loss_mlp": 0.43896484, + "step": 1047, + "time_per_iteration": 2.733009099960327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166193, + "balance_loss_mlp": 1.11726964, + "epoch": 0.20161600615621392, + "flos": 670301153280.0, + "grad_norm": 0.08010189097684783, + "language_loss": 0.86224002, + "learning_rate": 0.0009247415542586906, + "loss": 0.87390196, + "num_input_tokens_seen": 86960208, + "router_z_loss_mlp": 0.48950195, + "step": 1048, + "time_per_iteration": 2.8471555709838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186895, + "balance_loss_mlp": 1.13675559, + "epoch": 0.2018083878414775, + "flos": 573091490304.0, + "grad_norm": 0.050762349186412876, + "language_loss": 0.83535373, + "learning_rate": 0.0009245770971698735, + "loss": 0.84722269, + "num_input_tokens_seen": 87044144, + "router_z_loss_mlp": 0.50170898, + "step": 1049, + "time_per_iteration": 2.889474630355835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183513, + "balance_loss_mlp": 1.13671136, + "epoch": 0.20200076952674106, + "flos": 425857844736.0, + "grad_norm": 0.07506320746734087, + "language_loss": 0.8918047, + "learning_rate": 0.0009244124752456087, + "loss": 0.90363979, + "num_input_tokens_seen": 87109136, + "router_z_loss_mlp": 0.46826172, + "step": 1050, + "time_per_iteration": 2.5762786865234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205722, + "balance_loss_mlp": 1.15453339, + "epoch": 0.20219315121200462, + "flos": 536597581824.0, + "grad_norm": 0.08917577036116058, + "language_loss": 0.86475039, + "learning_rate": 0.0009242476885498081, + "loss": 0.87680757, + "num_input_tokens_seen": 87184320, + "router_z_loss_mlp": 0.51220703, + "step": 1051, + "time_per_iteration": 2.720395565032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193502, + "balance_loss_mlp": 1.14009643, + "epoch": 0.20238553289726818, + "flos": 477873644544.0, + "grad_norm": 0.08090891256677915, + "language_loss": 0.81871718, + "learning_rate": 0.0009240827371464474, + "loss": 0.83065224, + "num_input_tokens_seen": 87248224, + "router_z_loss_mlp": 0.53442383, + "step": 1052, + "time_per_iteration": 2.535388231277466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162702, + "balance_loss_mlp": 1.11833251, + "epoch": 0.20257791458253174, + "flos": 1152057116160.0, + "grad_norm": 0.08177732735855556, + "language_loss": 0.84886205, + "learning_rate": 0.0009239176210995666, + "loss": 0.86048913, + "num_input_tokens_seen": 87333088, + "router_z_loss_mlp": 0.4440918, + "step": 1053, + "time_per_iteration": 3.4955379962921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148392, + "balance_loss_mlp": 1.0973227, + "epoch": 0.2027702962677953, + "flos": 666913678848.0, + "grad_norm": 0.9822109545682867, + "language_loss": 0.94933617, + "learning_rate": 0.0009237523404732695, + "loss": 0.96082008, + "num_input_tokens_seen": 87413840, + "router_z_loss_mlp": 0.51074219, + "step": 1054, + "time_per_iteration": 2.90132737159729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137464, + "balance_loss_mlp": 1.09118664, + "epoch": 0.20296267795305886, + "flos": 641298746880.0, + "grad_norm": 0.09331279688006895, + "language_loss": 0.85504258, + "learning_rate": 0.0009235868953317235, + "loss": 0.86641729, + "num_input_tokens_seen": 87487168, + "router_z_loss_mlp": 0.46264648, + "step": 1055, + "time_per_iteration": 2.813202381134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212355, + "balance_loss_mlp": 1.16388512, + "epoch": 0.20315505963832242, + "flos": 930575070720.0, + "grad_norm": 0.08645469446577787, + "language_loss": 0.86679947, + "learning_rate": 0.0009234212857391602, + "loss": 0.87892294, + "num_input_tokens_seen": 87573184, + "router_z_loss_mlp": 0.48486328, + "step": 1056, + "time_per_iteration": 3.184723377227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01289494, + "balance_loss_mlp": 1.23723245, + "epoch": 0.20334744132358598, + "flos": 562111197696.0, + "grad_norm": 0.11402704661401492, + "language_loss": 0.90548229, + "learning_rate": 0.000923255511759875, + "loss": 0.91837716, + "num_input_tokens_seen": 87651968, + "router_z_loss_mlp": 0.52319336, + "step": 1057, + "time_per_iteration": 2.8404476642608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01374128, + "balance_loss_mlp": 1.3215096, + "epoch": 0.20353982300884957, + "flos": 644206804992.0, + "grad_norm": 0.12448379126392096, + "language_loss": 0.86306804, + "learning_rate": 0.000923089573458227, + "loss": 0.87680936, + "num_input_tokens_seen": 87727792, + "router_z_loss_mlp": 0.52661133, + "step": 1058, + "time_per_iteration": 2.921942949295044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01411943, + "balance_loss_mlp": 1.35701096, + "epoch": 0.20373220469411313, + "flos": 651421522944.0, + "grad_norm": 0.12614323996078466, + "language_loss": 0.84856015, + "learning_rate": 0.0009229234708986392, + "loss": 0.8626796, + "num_input_tokens_seen": 87806048, + "router_z_loss_mlp": 0.54931641, + "step": 1059, + "time_per_iteration": 2.922795057296753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01629047, + "balance_loss_mlp": 1.60253465, + "epoch": 0.2039245863793767, + "flos": 1437628787712.0, + "grad_norm": 0.12493252943786969, + "language_loss": 0.81666899, + "learning_rate": 0.0009227572041455982, + "loss": 0.83295941, + "num_input_tokens_seen": 88018160, + "router_z_loss_mlp": 0.265625, + "step": 1060, + "time_per_iteration": 4.733684062957764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01333622, + "balance_loss_mlp": 1.27976346, + "epoch": 0.20411696806464025, + "flos": 596967169536.0, + "grad_norm": 0.0936460184690869, + "language_loss": 0.86563337, + "learning_rate": 0.0009225907732636548, + "loss": 0.87896961, + "num_input_tokens_seen": 88090864, + "router_z_loss_mlp": 0.53881836, + "step": 1061, + "time_per_iteration": 2.761353015899658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01296883, + "balance_loss_mlp": 1.24183202, + "epoch": 0.2043093497499038, + "flos": 573803274240.0, + "grad_norm": 0.09002543594031559, + "language_loss": 0.87698424, + "learning_rate": 0.0009224241783174227, + "loss": 0.88995302, + "num_input_tokens_seen": 88161360, + "router_z_loss_mlp": 0.55078125, + "step": 1062, + "time_per_iteration": 2.7161052227020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252808, + "balance_loss_mlp": 1.19947362, + "epoch": 0.20450173143516737, + "flos": 630352958976.0, + "grad_norm": 0.08928798499879465, + "language_loss": 0.87254798, + "learning_rate": 0.0009222574193715802, + "loss": 0.88507611, + "num_input_tokens_seen": 88234960, + "router_z_loss_mlp": 0.53369141, + "step": 1063, + "time_per_iteration": 2.779623031616211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122391, + "balance_loss_mlp": 1.16757131, + "epoch": 0.20469411312043093, + "flos": 574003335168.0, + "grad_norm": 0.06606001070927259, + "language_loss": 0.87212694, + "learning_rate": 0.000922090496490869, + "loss": 0.88436604, + "num_input_tokens_seen": 88308176, + "router_z_loss_mlp": 0.56323242, + "step": 1064, + "time_per_iteration": 2.7111196517944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217897, + "balance_loss_mlp": 1.16120076, + "epoch": 0.20488649480569449, + "flos": 637053755904.0, + "grad_norm": 0.3109146854617931, + "language_loss": 0.90918952, + "learning_rate": 0.0009219234097400937, + "loss": 0.92136848, + "num_input_tokens_seen": 88386768, + "router_z_loss_mlp": 0.56665039, + "step": 1065, + "time_per_iteration": 2.804588556289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245438, + "balance_loss_mlp": 1.18359244, + "epoch": 0.20507887649095807, + "flos": 975793526784.0, + "grad_norm": 0.06908392980849179, + "language_loss": 0.84456235, + "learning_rate": 0.0009217561591841237, + "loss": 0.85701674, + "num_input_tokens_seen": 88476576, + "router_z_loss_mlp": 0.61816406, + "step": 1066, + "time_per_iteration": 3.303875207901001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01287048, + "balance_loss_mlp": 1.21867001, + "epoch": 0.20527125817622163, + "flos": 486183015936.0, + "grad_norm": 0.1162597514909173, + "language_loss": 0.82140827, + "learning_rate": 0.0009215887448878913, + "loss": 0.83427876, + "num_input_tokens_seen": 88541968, + "router_z_loss_mlp": 0.68408203, + "step": 1067, + "time_per_iteration": 2.568690776824951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01293452, + "balance_loss_mlp": 1.22288036, + "epoch": 0.2054636398614852, + "flos": 527178875904.0, + "grad_norm": 0.08586469474305494, + "language_loss": 0.85986763, + "learning_rate": 0.0009214211669163922, + "loss": 0.87280214, + "num_input_tokens_seen": 88615296, + "router_z_loss_mlp": 0.70654297, + "step": 1068, + "time_per_iteration": 2.700090169906616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01279646, + "balance_loss_mlp": 1.21408105, + "epoch": 0.20565602154674875, + "flos": 558182638080.0, + "grad_norm": 0.06609725061841937, + "language_loss": 0.94520444, + "learning_rate": 0.0009212534253346862, + "loss": 0.95800096, + "num_input_tokens_seen": 88691584, + "router_z_loss_mlp": 0.65478516, + "step": 1069, + "time_per_iteration": 2.696699857711792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01285979, + "balance_loss_mlp": 1.21912634, + "epoch": 0.2058484032320123, + "flos": 504224953344.0, + "grad_norm": 0.07442061186670905, + "language_loss": 0.85475862, + "learning_rate": 0.0009210855202078964, + "loss": 0.86761844, + "num_input_tokens_seen": 88756592, + "router_z_loss_mlp": 0.66845703, + "step": 1070, + "time_per_iteration": 2.5769481658935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01284239, + "balance_loss_mlp": 1.21771979, + "epoch": 0.20604078491727587, + "flos": 433169109504.0, + "grad_norm": 0.07631989099853977, + "language_loss": 0.88063252, + "learning_rate": 0.0009209174516012091, + "loss": 0.89347488, + "num_input_tokens_seen": 88820928, + "router_z_loss_mlp": 0.66601562, + "step": 1071, + "time_per_iteration": 2.6154239177703857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261362, + "balance_loss_mlp": 1.19317448, + "epoch": 0.20623316660253943, + "flos": 608711003136.0, + "grad_norm": 0.05883273983798781, + "language_loss": 0.90461957, + "learning_rate": 0.0009207492195798747, + "loss": 0.91723317, + "num_input_tokens_seen": 88895440, + "router_z_loss_mlp": 0.68164062, + "step": 1072, + "time_per_iteration": 2.764965534210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261739, + "balance_loss_mlp": 1.18997467, + "epoch": 0.206425548287803, + "flos": 480425997312.0, + "grad_norm": 0.07316980575900926, + "language_loss": 0.86156094, + "learning_rate": 0.0009205808242092061, + "loss": 0.87417829, + "num_input_tokens_seen": 88964400, + "router_z_loss_mlp": 0.71728516, + "step": 1073, + "time_per_iteration": 2.6222856044769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258718, + "balance_loss_mlp": 1.18952858, + "epoch": 0.20661792997306658, + "flos": 949429734912.0, + "grad_norm": 0.06600331144021966, + "language_loss": 0.83598334, + "learning_rate": 0.0009204122655545808, + "loss": 0.84857053, + "num_input_tokens_seen": 89049600, + "router_z_loss_mlp": 0.69189453, + "step": 1074, + "time_per_iteration": 3.313964605331421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252953, + "balance_loss_mlp": 1.18571925, + "epoch": 0.20681031165833014, + "flos": 603487729152.0, + "grad_norm": 0.06834339296378739, + "language_loss": 0.82186073, + "learning_rate": 0.0009202435436814388, + "loss": 0.83439028, + "num_input_tokens_seen": 89119024, + "router_z_loss_mlp": 0.67236328, + "step": 1075, + "time_per_iteration": 2.68725848197937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260409, + "balance_loss_mlp": 1.1926024, + "epoch": 0.2070026933435937, + "flos": 708984368640.0, + "grad_norm": 0.07476886245144747, + "language_loss": 0.91110998, + "learning_rate": 0.0009200746586552836, + "loss": 0.92371404, + "num_input_tokens_seen": 89197344, + "router_z_loss_mlp": 0.67773438, + "step": 1076, + "time_per_iteration": 2.889910936355591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238308, + "balance_loss_mlp": 1.17145491, + "epoch": 0.20719507502885726, + "flos": 829814948352.0, + "grad_norm": 0.06855298516082668, + "language_loss": 0.84957182, + "learning_rate": 0.0009199056105416825, + "loss": 0.86195493, + "num_input_tokens_seen": 89280464, + "router_z_loss_mlp": 0.66894531, + "step": 1077, + "time_per_iteration": 3.0826096534729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242457, + "balance_loss_mlp": 1.17312455, + "epoch": 0.20738745671412082, + "flos": 638294141952.0, + "grad_norm": 0.0732932371665923, + "language_loss": 0.87494361, + "learning_rate": 0.0009197363994062654, + "loss": 0.8873682, + "num_input_tokens_seen": 89353344, + "router_z_loss_mlp": 0.69287109, + "step": 1078, + "time_per_iteration": 2.814481735229492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121373, + "balance_loss_mlp": 1.15455508, + "epoch": 0.20757983839938438, + "flos": 685602786816.0, + "grad_norm": 0.060498447021287705, + "language_loss": 0.85097158, + "learning_rate": 0.0009195670253147262, + "loss": 0.86310887, + "num_input_tokens_seen": 89439328, + "router_z_loss_mlp": 0.59179688, + "step": 1079, + "time_per_iteration": 2.989818572998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216427, + "balance_loss_mlp": 1.15286458, + "epoch": 0.20777222008464794, + "flos": 519282109440.0, + "grad_norm": 0.0563328194871683, + "language_loss": 0.83052152, + "learning_rate": 0.0009193974883328216, + "loss": 0.84268576, + "num_input_tokens_seen": 89510160, + "router_z_loss_mlp": 0.63574219, + "step": 1080, + "time_per_iteration": 2.611929416656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209864, + "balance_loss_mlp": 1.14553857, + "epoch": 0.2079646017699115, + "flos": 511402595328.0, + "grad_norm": 0.06150097183917509, + "language_loss": 0.87932825, + "learning_rate": 0.0009192277885263718, + "loss": 0.89142686, + "num_input_tokens_seen": 89582960, + "router_z_loss_mlp": 0.64306641, + "step": 1081, + "time_per_iteration": 2.65731143951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198356, + "balance_loss_mlp": 1.13264751, + "epoch": 0.20815698345517505, + "flos": 931820226048.0, + "grad_norm": 0.05302154537588453, + "language_loss": 0.86579674, + "learning_rate": 0.0009190579259612602, + "loss": 0.87778032, + "num_input_tokens_seen": 89675488, + "router_z_loss_mlp": 0.65722656, + "step": 1082, + "time_per_iteration": 3.2999303340911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207666, + "balance_loss_mlp": 1.14300656, + "epoch": 0.20834936514043864, + "flos": 632401302528.0, + "grad_norm": 0.07988409962843289, + "language_loss": 0.87673134, + "learning_rate": 0.000918887900703433, + "loss": 0.88880801, + "num_input_tokens_seen": 89747872, + "router_z_loss_mlp": 0.64648438, + "step": 1083, + "time_per_iteration": 2.7956981658935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204411, + "balance_loss_mlp": 1.14361465, + "epoch": 0.2085417468257022, + "flos": 394384578048.0, + "grad_norm": 0.07357181622228276, + "language_loss": 0.91242653, + "learning_rate": 0.0009187177128188999, + "loss": 0.92447066, + "num_input_tokens_seen": 89810176, + "router_z_loss_mlp": 0.60693359, + "step": 1084, + "time_per_iteration": 2.4656450748443604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194774, + "balance_loss_mlp": 1.16902518, + "epoch": 0.20873412851096576, + "flos": 1402147293696.0, + "grad_norm": 0.038082499218869, + "language_loss": 0.77156538, + "learning_rate": 0.0009185473623737339, + "loss": 0.78351313, + "num_input_tokens_seen": 90038432, + "router_z_loss_mlp": 0.2578125, + "step": 1085, + "time_per_iteration": 4.855400323867798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181967, + "balance_loss_mlp": 1.12419796, + "epoch": 0.20892651019622932, + "flos": 447830913024.0, + "grad_norm": 0.07376491342946172, + "language_loss": 0.86747313, + "learning_rate": 0.000918376849434071, + "loss": 0.87929279, + "num_input_tokens_seen": 90101568, + "router_z_loss_mlp": 0.57739258, + "step": 1086, + "time_per_iteration": 2.5493998527526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192292, + "balance_loss_mlp": 1.1305418, + "epoch": 0.20911889188149288, + "flos": 493106268672.0, + "grad_norm": 0.07728027722551846, + "language_loss": 0.9155581, + "learning_rate": 0.0009182061740661098, + "loss": 0.92748106, + "num_input_tokens_seen": 90169344, + "router_z_loss_mlp": 0.61767578, + "step": 1087, + "time_per_iteration": 2.5755503177642822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192286, + "balance_loss_mlp": 1.13144195, + "epoch": 0.20931127356675644, + "flos": 841291909632.0, + "grad_norm": 0.057753656338862314, + "language_loss": 0.85712528, + "learning_rate": 0.0009180353363361127, + "loss": 0.86904812, + "num_input_tokens_seen": 90252416, + "router_z_loss_mlp": 0.60888672, + "step": 1088, + "time_per_iteration": 3.1143646240234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180296, + "balance_loss_mlp": 1.11868906, + "epoch": 0.20950365525202, + "flos": 757140618240.0, + "grad_norm": 0.07221088423930573, + "language_loss": 0.83469599, + "learning_rate": 0.0009178643363104044, + "loss": 0.84649897, + "num_input_tokens_seen": 90337952, + "router_z_loss_mlp": 0.61621094, + "step": 1089, + "time_per_iteration": 3.092656135559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199477, + "balance_loss_mlp": 1.138394, + "epoch": 0.20969603693728356, + "flos": 472539142656.0, + "grad_norm": 0.08745424257973078, + "language_loss": 0.92463166, + "learning_rate": 0.0009176931740553735, + "loss": 0.93662637, + "num_input_tokens_seen": 90401488, + "router_z_loss_mlp": 0.61083984, + "step": 1090, + "time_per_iteration": 2.53558349609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207875, + "balance_loss_mlp": 1.14850855, + "epoch": 0.20988841862254715, + "flos": 976930025472.0, + "grad_norm": 0.07295134358518522, + "language_loss": 0.83623219, + "learning_rate": 0.0009175218496374708, + "loss": 0.84831095, + "num_input_tokens_seen": 90486144, + "router_z_loss_mlp": 0.59277344, + "step": 1091, + "time_per_iteration": 3.3514459133148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226261, + "balance_loss_mlp": 1.16503549, + "epoch": 0.2100808003078107, + "flos": 1093120634880.0, + "grad_norm": 0.0645587086921242, + "language_loss": 0.86590576, + "learning_rate": 0.0009173503631232103, + "loss": 0.87816834, + "num_input_tokens_seen": 90571504, + "router_z_loss_mlp": 0.61181641, + "step": 1092, + "time_per_iteration": 3.3893167972564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122226, + "balance_loss_mlp": 1.16194034, + "epoch": 0.21027318199307427, + "flos": 1012964714496.0, + "grad_norm": 0.12026645314545058, + "language_loss": 0.8245008, + "learning_rate": 0.0009171787145791691, + "loss": 0.83672333, + "num_input_tokens_seen": 90646016, + "router_z_loss_mlp": 0.60351562, + "step": 1093, + "time_per_iteration": 3.251084327697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251584, + "balance_loss_mlp": 1.18854666, + "epoch": 0.21046556367833782, + "flos": 521394693120.0, + "grad_norm": 0.08481501206118727, + "language_loss": 0.8143028, + "learning_rate": 0.000917006904071987, + "loss": 0.82681859, + "num_input_tokens_seen": 90713440, + "router_z_loss_mlp": 0.63037109, + "step": 1094, + "time_per_iteration": 2.613060712814331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272512, + "balance_loss_mlp": 1.20551634, + "epoch": 0.21065794536360138, + "flos": 603717525504.0, + "grad_norm": 0.08143629367900677, + "language_loss": 0.87639427, + "learning_rate": 0.0009168349316683669, + "loss": 0.88911939, + "num_input_tokens_seen": 90788208, + "router_z_loss_mlp": 0.66992188, + "step": 1095, + "time_per_iteration": 2.705172538757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269472, + "balance_loss_mlp": 1.20462179, + "epoch": 0.21085032704886494, + "flos": 603346765824.0, + "grad_norm": 0.05512017255927588, + "language_loss": 0.83512938, + "learning_rate": 0.0009166627974350741, + "loss": 0.8478241, + "num_input_tokens_seen": 90873776, + "router_z_loss_mlp": 0.64746094, + "step": 1096, + "time_per_iteration": 2.8979411125183105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01259233, + "balance_loss_mlp": 1.19390619, + "epoch": 0.2110427087341285, + "flos": 637671564288.0, + "grad_norm": 0.06519728045913388, + "language_loss": 0.90715098, + "learning_rate": 0.0009164905014389373, + "loss": 0.91974336, + "num_input_tokens_seen": 90945872, + "router_z_loss_mlp": 0.65283203, + "step": 1097, + "time_per_iteration": 2.7965359687805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291926, + "balance_loss_mlp": 1.22445381, + "epoch": 0.21123509041939206, + "flos": 522919203840.0, + "grad_norm": 0.07891140172991894, + "language_loss": 0.87571776, + "learning_rate": 0.0009163180437468476, + "loss": 0.88863701, + "num_input_tokens_seen": 91016224, + "router_z_loss_mlp": 0.67480469, + "step": 1098, + "time_per_iteration": 2.678684949874878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012877, + "balance_loss_mlp": 1.22065675, + "epoch": 0.21142747210465565, + "flos": 451188652032.0, + "grad_norm": 0.06282838131309415, + "language_loss": 0.86816525, + "learning_rate": 0.000916145424425759, + "loss": 0.88104224, + "num_input_tokens_seen": 91086752, + "router_z_loss_mlp": 0.67041016, + "step": 1099, + "time_per_iteration": 2.6685678958892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01305165, + "balance_loss_mlp": 1.23554707, + "epoch": 0.2116198537899192, + "flos": 876175045632.0, + "grad_norm": 0.08616648204830919, + "language_loss": 0.916682, + "learning_rate": 0.0009159726435426885, + "loss": 0.92973363, + "num_input_tokens_seen": 91162960, + "router_z_loss_mlp": 0.69628906, + "step": 1100, + "time_per_iteration": 3.0852713584899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01282199, + "balance_loss_mlp": 1.21677744, + "epoch": 0.21181223547518277, + "flos": 523662921216.0, + "grad_norm": 0.07323647205544051, + "language_loss": 0.91053265, + "learning_rate": 0.0009157997011647154, + "loss": 0.92335469, + "num_input_tokens_seen": 91229840, + "router_z_loss_mlp": 0.65380859, + "step": 1101, + "time_per_iteration": 2.6137943267822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01270647, + "balance_loss_mlp": 1.20784807, + "epoch": 0.21200461716044633, + "flos": 572296015872.0, + "grad_norm": 0.05451247925490285, + "language_loss": 0.87014931, + "learning_rate": 0.0009156265973589817, + "loss": 0.88285577, + "num_input_tokens_seen": 91307936, + "router_z_loss_mlp": 0.62792969, + "step": 1102, + "time_per_iteration": 2.7920916080474854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01255362, + "balance_loss_mlp": 1.1928488, + "epoch": 0.2121969988457099, + "flos": 545129409024.0, + "grad_norm": 0.06310879580708054, + "language_loss": 0.90527534, + "learning_rate": 0.0009154533321926926, + "loss": 0.91782892, + "num_input_tokens_seen": 91372848, + "router_z_loss_mlp": 0.62548828, + "step": 1103, + "time_per_iteration": 2.646440029144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234037, + "balance_loss_mlp": 1.17214394, + "epoch": 0.21238938053097345, + "flos": 843861514752.0, + "grad_norm": 0.07831819024350671, + "language_loss": 0.88472342, + "learning_rate": 0.0009152799057331156, + "loss": 0.89706385, + "num_input_tokens_seen": 91452768, + "router_z_loss_mlp": 0.61865234, + "step": 1104, + "time_per_iteration": 3.122450590133667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214804, + "balance_loss_mlp": 1.15462673, + "epoch": 0.212581762216237, + "flos": 446214998016.0, + "grad_norm": 0.06719929320387279, + "language_loss": 0.91964042, + "learning_rate": 0.0009151063180475805, + "loss": 0.9317885, + "num_input_tokens_seen": 91519888, + "router_z_loss_mlp": 0.6015625, + "step": 1105, + "time_per_iteration": 2.5321173667907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181276, + "balance_loss_mlp": 1.12772751, + "epoch": 0.21277414390150057, + "flos": 514380036096.0, + "grad_norm": 0.07726558156265032, + "language_loss": 0.8518455, + "learning_rate": 0.0009149325692034803, + "loss": 0.86365819, + "num_input_tokens_seen": 91585744, + "router_z_loss_mlp": 0.53613281, + "step": 1106, + "time_per_iteration": 2.6019790172576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129115, + "balance_loss_mlp": 1.10660839, + "epoch": 0.21296652558676413, + "flos": 1485532846080.0, + "grad_norm": 0.0458739418309424, + "language_loss": 0.79203427, + "learning_rate": 0.0009147586592682702, + "loss": 0.80332541, + "num_input_tokens_seen": 91805840, + "router_z_loss_mlp": 0.22460938, + "step": 1107, + "time_per_iteration": 4.859830856323242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180766, + "balance_loss_mlp": 1.12478542, + "epoch": 0.21315890727202771, + "flos": 846040909824.0, + "grad_norm": 0.08338906086238376, + "language_loss": 0.88186961, + "learning_rate": 0.0009145845883094678, + "loss": 0.89367729, + "num_input_tokens_seen": 91885936, + "router_z_loss_mlp": 0.56005859, + "step": 1108, + "time_per_iteration": 3.04249906539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153455, + "balance_loss_mlp": 1.10114598, + "epoch": 0.21335128895729127, + "flos": 629379445248.0, + "grad_norm": 0.07708602471843919, + "language_loss": 0.85793281, + "learning_rate": 0.000914410356394654, + "loss": 0.86946738, + "num_input_tokens_seen": 91959888, + "router_z_loss_mlp": 0.5234375, + "step": 1109, + "time_per_iteration": 4.412867307662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163449, + "balance_loss_mlp": 1.10751617, + "epoch": 0.21354367064255483, + "flos": 710975812608.0, + "grad_norm": 0.08187458054057056, + "language_loss": 0.85334879, + "learning_rate": 0.0009142359635914709, + "loss": 0.86498332, + "num_input_tokens_seen": 92043728, + "router_z_loss_mlp": 0.55957031, + "step": 1110, + "time_per_iteration": 3.023928642272949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148781, + "balance_loss_mlp": 1.09570932, + "epoch": 0.2137360523278184, + "flos": 456201953280.0, + "grad_norm": 0.0669404625356857, + "language_loss": 0.85089076, + "learning_rate": 0.0009140614099676245, + "loss": 0.86237848, + "num_input_tokens_seen": 92114096, + "router_z_loss_mlp": 0.53076172, + "step": 1111, + "time_per_iteration": 2.625797748565674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148537, + "balance_loss_mlp": 1.09632301, + "epoch": 0.21392843401308195, + "flos": 666051393024.0, + "grad_norm": 0.06784083874149466, + "language_loss": 0.83744586, + "learning_rate": 0.0009138866955908821, + "loss": 0.84893119, + "num_input_tokens_seen": 92193552, + "router_z_loss_mlp": 0.52246094, + "step": 1112, + "time_per_iteration": 2.9033186435699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152374, + "balance_loss_mlp": 1.10042286, + "epoch": 0.2141208156983455, + "flos": 748996803072.0, + "grad_norm": 0.0756009236441896, + "language_loss": 0.81778276, + "learning_rate": 0.0009137118205290738, + "loss": 0.82930648, + "num_input_tokens_seen": 92279248, + "router_z_loss_mlp": 0.51977539, + "step": 1113, + "time_per_iteration": 3.00955867767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163421, + "balance_loss_mlp": 1.10677314, + "epoch": 0.21431319738360907, + "flos": 419119971840.0, + "grad_norm": 0.07649003777848401, + "language_loss": 0.90946341, + "learning_rate": 0.0009135367848500924, + "loss": 0.92109764, + "num_input_tokens_seen": 92344064, + "router_z_loss_mlp": 0.56591797, + "step": 1114, + "time_per_iteration": 2.50858211517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167845, + "balance_loss_mlp": 1.11472559, + "epoch": 0.21450557906887263, + "flos": 609126179328.0, + "grad_norm": 0.0823134598214501, + "language_loss": 0.87556803, + "learning_rate": 0.0009133615886218927, + "loss": 0.88724649, + "num_input_tokens_seen": 92410544, + "router_z_loss_mlp": 0.53125, + "step": 1115, + "time_per_iteration": 2.717454195022583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178358, + "balance_loss_mlp": 1.11651218, + "epoch": 0.21469796075413622, + "flos": 561913708032.0, + "grad_norm": 0.06887665628973552, + "language_loss": 0.89567351, + "learning_rate": 0.0009131862319124917, + "loss": 0.90745711, + "num_input_tokens_seen": 92480272, + "router_z_loss_mlp": 0.61816406, + "step": 1116, + "time_per_iteration": 2.623767852783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176568, + "balance_loss_mlp": 1.1235671, + "epoch": 0.21489034243939978, + "flos": 594637272576.0, + "grad_norm": 0.08365937432877864, + "language_loss": 0.85244483, + "learning_rate": 0.0009130107147899691, + "loss": 0.86421049, + "num_input_tokens_seen": 92555584, + "router_z_loss_mlp": 0.53051758, + "step": 1117, + "time_per_iteration": 2.795011281967163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178642, + "balance_loss_mlp": 1.12561774, + "epoch": 0.21508272412466334, + "flos": 441898426368.0, + "grad_norm": 0.06665693704910039, + "language_loss": 0.8600654, + "learning_rate": 0.0009128350373224665, + "loss": 0.8718518, + "num_input_tokens_seen": 92623136, + "router_z_loss_mlp": 0.53076172, + "step": 1118, + "time_per_iteration": 2.5644795894622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011837, + "balance_loss_mlp": 1.15928602, + "epoch": 0.2152751058099269, + "flos": 1496162202624.0, + "grad_norm": 0.058896568697900505, + "language_loss": 0.81456429, + "learning_rate": 0.0009126591995781883, + "loss": 0.82640129, + "num_input_tokens_seen": 92842608, + "router_z_loss_mlp": 0.24414062, + "step": 1119, + "time_per_iteration": 4.683669090270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204932, + "balance_loss_mlp": 1.15031052, + "epoch": 0.21546748749519046, + "flos": 494005630464.0, + "grad_norm": 0.07135490421069918, + "language_loss": 0.85804355, + "learning_rate": 0.0009124832016254005, + "loss": 0.87009287, + "num_input_tokens_seen": 92912960, + "router_z_loss_mlp": 0.54663086, + "step": 1120, + "time_per_iteration": 2.6158647537231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206508, + "balance_loss_mlp": 1.14571166, + "epoch": 0.21565986918045402, + "flos": 634531138560.0, + "grad_norm": 0.055578106746994274, + "language_loss": 0.89113355, + "learning_rate": 0.0009123070435324316, + "loss": 0.9031986, + "num_input_tokens_seen": 92982272, + "router_z_loss_mlp": 0.60791016, + "step": 1121, + "time_per_iteration": 2.755823850631714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102102, + "balance_loss_mlp": 1.07988179, + "epoch": 0.21585225086571758, + "flos": 1583359570944.0, + "grad_norm": 0.03051163671975961, + "language_loss": 0.77875781, + "learning_rate": 0.0009121307253676722, + "loss": 0.78977883, + "num_input_tokens_seen": 93218752, + "router_z_loss_mlp": 0.22265625, + "step": 1122, + "time_per_iteration": 4.996071100234985 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211089, + "balance_loss_mlp": 1.15358257, + "epoch": 0.21604463255098114, + "flos": 684103242240.0, + "grad_norm": 0.06035521524280068, + "language_loss": 0.87145722, + "learning_rate": 0.0009119542471995752, + "loss": 0.88356811, + "num_input_tokens_seen": 93293968, + "router_z_loss_mlp": 0.57446289, + "step": 1123, + "time_per_iteration": 2.8323612213134766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204972, + "balance_loss_mlp": 1.14675009, + "epoch": 0.2162370142362447, + "flos": 780989133312.0, + "grad_norm": 0.060035653180353525, + "language_loss": 0.8248235, + "learning_rate": 0.0009117776090966554, + "loss": 0.83687323, + "num_input_tokens_seen": 93367088, + "router_z_loss_mlp": 0.58251953, + "step": 1124, + "time_per_iteration": 2.954216480255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216387, + "balance_loss_mlp": 1.1558764, + "epoch": 0.21642939592150828, + "flos": 1002147406848.0, + "grad_norm": 0.07791040933307145, + "language_loss": 0.876288, + "learning_rate": 0.0009116008111274899, + "loss": 0.88845193, + "num_input_tokens_seen": 93452944, + "router_z_loss_mlp": 0.60498047, + "step": 1125, + "time_per_iteration": 3.2826616764068604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102381, + "balance_loss_mlp": 1.08216333, + "epoch": 0.21662177760677184, + "flos": 1482644238336.0, + "grad_norm": 0.030294405796961115, + "language_loss": 0.79106927, + "learning_rate": 0.0009114238533607176, + "loss": 0.80209303, + "num_input_tokens_seen": 93677328, + "router_z_loss_mlp": 0.20214844, + "step": 1126, + "time_per_iteration": 4.8284173011779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202163, + "balance_loss_mlp": 1.1455152, + "epoch": 0.2168141592920354, + "flos": 887395046400.0, + "grad_norm": 0.10762952047928877, + "language_loss": 0.8553561, + "learning_rate": 0.0009112467358650396, + "loss": 0.86737764, + "num_input_tokens_seen": 93756848, + "router_z_loss_mlp": 0.56640625, + "step": 1127, + "time_per_iteration": 3.1621291637420654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192643, + "balance_loss_mlp": 1.13561273, + "epoch": 0.21700654097729896, + "flos": 545961959424.0, + "grad_norm": 0.06435190440672867, + "language_loss": 0.87181705, + "learning_rate": 0.0009110694587092192, + "loss": 0.88374346, + "num_input_tokens_seen": 93834704, + "router_z_loss_mlp": 0.56982422, + "step": 1128, + "time_per_iteration": 2.7597765922546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194699, + "balance_loss_mlp": 1.13452196, + "epoch": 0.21719892266256252, + "flos": 509522379264.0, + "grad_norm": 0.06894978951163175, + "language_loss": 0.8223331, + "learning_rate": 0.0009108920219620815, + "loss": 0.83428001, + "num_input_tokens_seen": 93904448, + "router_z_loss_mlp": 0.6015625, + "step": 1129, + "time_per_iteration": 2.6658482551574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198898, + "balance_loss_mlp": 1.14072335, + "epoch": 0.21739130434782608, + "flos": 543412177920.0, + "grad_norm": 0.06550313542995663, + "language_loss": 0.90210444, + "learning_rate": 0.0009107144256925133, + "loss": 0.91409343, + "num_input_tokens_seen": 93979312, + "router_z_loss_mlp": 0.58154297, + "step": 1130, + "time_per_iteration": 2.7298777103424072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211101, + "balance_loss_mlp": 1.15464389, + "epoch": 0.21758368603308964, + "flos": 616847477760.0, + "grad_norm": 0.08430456831611369, + "language_loss": 0.82975614, + "learning_rate": 0.0009105366699694638, + "loss": 0.84186715, + "num_input_tokens_seen": 94052032, + "router_z_loss_mlp": 0.56445312, + "step": 1131, + "time_per_iteration": 2.7422807216644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121305, + "balance_loss_mlp": 1.15263498, + "epoch": 0.2177760677183532, + "flos": 635116640256.0, + "grad_norm": 0.05499133039406014, + "language_loss": 0.82219702, + "learning_rate": 0.0009103587548619439, + "loss": 0.83432752, + "num_input_tokens_seen": 94124944, + "router_z_loss_mlp": 0.60400391, + "step": 1132, + "time_per_iteration": 2.8834011554718018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202147, + "balance_loss_mlp": 1.14468873, + "epoch": 0.2179684494036168, + "flos": 532463818752.0, + "grad_norm": 0.12855794167944481, + "language_loss": 0.87174821, + "learning_rate": 0.0009101806804390261, + "loss": 0.88376963, + "num_input_tokens_seen": 94200384, + "router_z_loss_mlp": 0.57421875, + "step": 1133, + "time_per_iteration": 2.8493435382843018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186046, + "balance_loss_mlp": 1.13082814, + "epoch": 0.21816083108888035, + "flos": 475219975680.0, + "grad_norm": 0.07046865468216726, + "language_loss": 0.91345453, + "learning_rate": 0.0009100024467698453, + "loss": 0.92531502, + "num_input_tokens_seen": 94266992, + "router_z_loss_mlp": 0.55175781, + "step": 1134, + "time_per_iteration": 2.6036603450775146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184337, + "balance_loss_mlp": 1.12613893, + "epoch": 0.2183532127741439, + "flos": 577467532800.0, + "grad_norm": 0.07929007457036284, + "language_loss": 0.8353889, + "learning_rate": 0.0009098240539235981, + "loss": 0.84723222, + "num_input_tokens_seen": 94334304, + "router_z_loss_mlp": 0.58227539, + "step": 1135, + "time_per_iteration": 2.6736483573913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176396, + "balance_loss_mlp": 1.12122619, + "epoch": 0.21854559445940747, + "flos": 594120780288.0, + "grad_norm": 0.06661367385494366, + "language_loss": 0.88575935, + "learning_rate": 0.0009096455019695423, + "loss": 0.89752328, + "num_input_tokens_seen": 94413296, + "router_z_loss_mlp": 0.55224609, + "step": 1136, + "time_per_iteration": 2.8438823223114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172318, + "balance_loss_mlp": 1.1156702, + "epoch": 0.21873797614467103, + "flos": 408680764416.0, + "grad_norm": 0.07075177433605506, + "language_loss": 0.90707165, + "learning_rate": 0.000909466790976998, + "loss": 0.91879487, + "num_input_tokens_seen": 94475840, + "router_z_loss_mlp": 0.56616211, + "step": 1137, + "time_per_iteration": 2.4795870780944824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185042, + "balance_loss_mlp": 1.12801182, + "epoch": 0.21893035782993459, + "flos": 894189818880.0, + "grad_norm": 0.07051320604800417, + "language_loss": 0.83409071, + "learning_rate": 0.0009092879210153473, + "loss": 0.84594113, + "num_input_tokens_seen": 94555184, + "router_z_loss_mlp": 0.57080078, + "step": 1138, + "time_per_iteration": 3.1328911781311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186779, + "balance_loss_mlp": 1.13284826, + "epoch": 0.21912273951519814, + "flos": 467627157504.0, + "grad_norm": 0.06458215213012623, + "language_loss": 0.89566886, + "learning_rate": 0.0009091088921540333, + "loss": 0.90753663, + "num_input_tokens_seen": 94622656, + "router_z_loss_mlp": 0.54003906, + "step": 1139, + "time_per_iteration": 2.5608675479888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046887, + "balance_loss_mlp": 1.03115106, + "epoch": 0.2193151212004617, + "flos": 1532043445248.0, + "grad_norm": 0.027642480599540168, + "language_loss": 0.75508678, + "learning_rate": 0.0009089297044625615, + "loss": 0.76555562, + "num_input_tokens_seen": 94856496, + "router_z_loss_mlp": 0.15722656, + "step": 1140, + "time_per_iteration": 4.908522605895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117392, + "balance_loss_mlp": 1.11908412, + "epoch": 0.2195075028857253, + "flos": 591175646208.0, + "grad_norm": 0.0906322081519832, + "language_loss": 0.84775734, + "learning_rate": 0.0009087503580104985, + "loss": 0.85949653, + "num_input_tokens_seen": 94926880, + "router_z_loss_mlp": 0.54882812, + "step": 1141, + "time_per_iteration": 2.696129083633423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181851, + "balance_loss_mlp": 1.12558413, + "epoch": 0.21969988457098885, + "flos": 636329862144.0, + "grad_norm": 0.16226849767110665, + "language_loss": 0.80068243, + "learning_rate": 0.0009085708528674728, + "loss": 0.81250095, + "num_input_tokens_seen": 95000528, + "router_z_loss_mlp": 0.56347656, + "step": 1142, + "time_per_iteration": 2.7995505332946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157793, + "balance_loss_mlp": 1.09985733, + "epoch": 0.2198922662562524, + "flos": 912350324736.0, + "grad_norm": 0.08217329602320493, + "language_loss": 0.874843, + "learning_rate": 0.0009083911891031745, + "loss": 0.88642091, + "num_input_tokens_seen": 95081040, + "router_z_loss_mlp": 0.57958984, + "step": 1143, + "time_per_iteration": 3.1351919174194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115076, + "balance_loss_mlp": 1.09578109, + "epoch": 0.22008464794151597, + "flos": 822980528640.0, + "grad_norm": 0.06169995263224583, + "language_loss": 0.92273706, + "learning_rate": 0.0009082113667873553, + "loss": 0.93424463, + "num_input_tokens_seen": 95167328, + "router_z_loss_mlp": 0.55029297, + "step": 1144, + "time_per_iteration": 3.1171934604644775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153616, + "balance_loss_mlp": 1.10087752, + "epoch": 0.22027702962677953, + "flos": 459656239104.0, + "grad_norm": 0.07183124767141379, + "language_loss": 0.91221762, + "learning_rate": 0.0009080313859898283, + "loss": 0.9237538, + "num_input_tokens_seen": 95230304, + "router_z_loss_mlp": 0.52758789, + "step": 1145, + "time_per_iteration": 2.506591796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153774, + "balance_loss_mlp": 1.09986758, + "epoch": 0.2204694113120431, + "flos": 531255739392.0, + "grad_norm": 0.07077080612529597, + "language_loss": 0.92340779, + "learning_rate": 0.0009078512467804684, + "loss": 0.93494552, + "num_input_tokens_seen": 95299520, + "router_z_loss_mlp": 0.53881836, + "step": 1146, + "time_per_iteration": 2.591327667236328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172392, + "balance_loss_mlp": 1.11800838, + "epoch": 0.22066179299730665, + "flos": 522642419712.0, + "grad_norm": 0.07651793216141736, + "language_loss": 0.91144007, + "learning_rate": 0.0009076709492292119, + "loss": 0.92316401, + "num_input_tokens_seen": 95368912, + "router_z_loss_mlp": 0.54418945, + "step": 1147, + "time_per_iteration": 2.609628438949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169189, + "balance_loss_mlp": 1.11723804, + "epoch": 0.2208541746825702, + "flos": 546451287552.0, + "grad_norm": 0.07920780045429675, + "language_loss": 0.89603102, + "learning_rate": 0.0009074904934060562, + "loss": 0.90772295, + "num_input_tokens_seen": 95440800, + "router_z_loss_mlp": 0.51928711, + "step": 1148, + "time_per_iteration": 2.6755712032318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173959, + "balance_loss_mlp": 1.11697721, + "epoch": 0.22104655636783377, + "flos": 708734748672.0, + "grad_norm": 0.08245317941840166, + "language_loss": 0.8559376, + "learning_rate": 0.0009073098793810607, + "loss": 0.86767721, + "num_input_tokens_seen": 95519904, + "router_z_loss_mlp": 0.57006836, + "step": 1149, + "time_per_iteration": 2.9874348640441895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177675, + "balance_loss_mlp": 1.12293434, + "epoch": 0.22123893805309736, + "flos": 584867630592.0, + "grad_norm": 0.08525751827962168, + "language_loss": 0.88982397, + "learning_rate": 0.000907129107224346, + "loss": 0.90160072, + "num_input_tokens_seen": 95591568, + "router_z_loss_mlp": 0.54785156, + "step": 1150, + "time_per_iteration": 2.739461660385132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180589, + "balance_loss_mlp": 1.12658715, + "epoch": 0.22143131973836092, + "flos": 492251323392.0, + "grad_norm": 0.05205595876874212, + "language_loss": 0.88991034, + "learning_rate": 0.0009069481770060939, + "loss": 0.90171623, + "num_input_tokens_seen": 95664480, + "router_z_loss_mlp": 0.54077148, + "step": 1151, + "time_per_iteration": 2.7024669647216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187248, + "balance_loss_mlp": 1.13212562, + "epoch": 0.22162370142362448, + "flos": 1079674251264.0, + "grad_norm": 0.06739531662392768, + "language_loss": 0.84448045, + "learning_rate": 0.000906767088796548, + "loss": 0.85635293, + "num_input_tokens_seen": 95754400, + "router_z_loss_mlp": 0.55126953, + "step": 1152, + "time_per_iteration": 3.4467508792877197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117836, + "balance_loss_mlp": 1.12571764, + "epoch": 0.22181608310888803, + "flos": 492508283904.0, + "grad_norm": 0.05411857974090042, + "language_loss": 0.8779093, + "learning_rate": 0.0009065858426660127, + "loss": 0.8896929, + "num_input_tokens_seen": 95826944, + "router_z_loss_mlp": 0.52661133, + "step": 1153, + "time_per_iteration": 2.6216752529144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182109, + "balance_loss_mlp": 1.12736845, + "epoch": 0.2220084647941516, + "flos": 724014360576.0, + "grad_norm": 0.07769931213358174, + "language_loss": 0.84979808, + "learning_rate": 0.0009064044386848543, + "loss": 0.86161917, + "num_input_tokens_seen": 95902688, + "router_z_loss_mlp": 0.54833984, + "step": 1154, + "time_per_iteration": 2.91601824760437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172512, + "balance_loss_mlp": 1.11381316, + "epoch": 0.22220084647941515, + "flos": 489239377920.0, + "grad_norm": 0.0711084155390928, + "language_loss": 0.89741302, + "learning_rate": 0.0009062228769234997, + "loss": 0.90913814, + "num_input_tokens_seen": 95969952, + "router_z_loss_mlp": 0.58691406, + "step": 1155, + "time_per_iteration": 2.5972864627838135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116208, + "balance_loss_mlp": 1.10690951, + "epoch": 0.2223932281646787, + "flos": 536278952448.0, + "grad_norm": 0.09100503083112628, + "language_loss": 0.81526613, + "learning_rate": 0.0009060411574524376, + "loss": 0.82688695, + "num_input_tokens_seen": 96037344, + "router_z_loss_mlp": 0.55224609, + "step": 1156, + "time_per_iteration": 2.6763274669647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182591, + "balance_loss_mlp": 1.12684917, + "epoch": 0.22258560984994227, + "flos": 931420104192.0, + "grad_norm": 0.06563385289017937, + "language_loss": 0.88585329, + "learning_rate": 0.0009058592803422178, + "loss": 0.89767921, + "num_input_tokens_seen": 96115616, + "router_z_loss_mlp": 0.55810547, + "step": 1157, + "time_per_iteration": 3.1414153575897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026819, + "balance_loss_mlp": 1.00955701, + "epoch": 0.22277799153520586, + "flos": 1199675930112.0, + "grad_norm": 0.012760142008093896, + "language_loss": 0.78710288, + "learning_rate": 0.0009056772456634512, + "loss": 0.79737109, + "num_input_tokens_seen": 96333600, + "router_z_loss_mlp": 0.17285156, + "step": 1158, + "time_per_iteration": 4.802858352661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171905, + "balance_loss_mlp": 1.12126482, + "epoch": 0.22297037322046942, + "flos": 501304412160.0, + "grad_norm": 0.060083734909452326, + "language_loss": 0.90886426, + "learning_rate": 0.00090549505348681, + "loss": 0.92058331, + "num_input_tokens_seen": 96402544, + "router_z_loss_mlp": 0.50683594, + "step": 1159, + "time_per_iteration": 2.5810928344726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168069, + "balance_loss_mlp": 1.11137354, + "epoch": 0.22316275490573298, + "flos": 752752465920.0, + "grad_norm": 0.07069918091424116, + "language_loss": 0.85149121, + "learning_rate": 0.0009053127038830275, + "loss": 0.86317194, + "num_input_tokens_seen": 96487600, + "router_z_loss_mlp": 0.56689453, + "step": 1160, + "time_per_iteration": 3.009434223175049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162107, + "balance_loss_mlp": 1.1050297, + "epoch": 0.22335513659099654, + "flos": 514802552832.0, + "grad_norm": 0.07200535138488619, + "language_loss": 0.87409687, + "learning_rate": 0.000905130196922898, + "loss": 0.88571799, + "num_input_tokens_seen": 96554912, + "router_z_loss_mlp": 0.57080078, + "step": 1161, + "time_per_iteration": 2.5972068309783936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157457, + "balance_loss_mlp": 1.10223973, + "epoch": 0.2235475182762601, + "flos": 484530024960.0, + "grad_norm": 0.053497533436564174, + "language_loss": 0.8808614, + "learning_rate": 0.0009049475326772769, + "loss": 0.89243597, + "num_input_tokens_seen": 96624192, + "router_z_loss_mlp": 0.55224609, + "step": 1162, + "time_per_iteration": 2.580254316329956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167432, + "balance_loss_mlp": 1.11092722, + "epoch": 0.22373989996152366, + "flos": 469971735552.0, + "grad_norm": 0.105825736895628, + "language_loss": 0.83639884, + "learning_rate": 0.0009047647112170811, + "loss": 0.84807312, + "num_input_tokens_seen": 96701040, + "router_z_loss_mlp": 0.56469727, + "step": 1163, + "time_per_iteration": 2.7509572505950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170262, + "balance_loss_mlp": 1.11041939, + "epoch": 0.22393228164678722, + "flos": 1271012249088.0, + "grad_norm": 0.11729347611284674, + "language_loss": 0.8833853, + "learning_rate": 0.0009045817326132876, + "loss": 0.89508796, + "num_input_tokens_seen": 96791200, + "router_z_loss_mlp": 0.59814453, + "step": 1164, + "time_per_iteration": 3.6648380756378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170775, + "balance_loss_mlp": 1.11226714, + "epoch": 0.22412466333205078, + "flos": 596334680064.0, + "grad_norm": 0.05704665841604838, + "language_loss": 0.83974147, + "learning_rate": 0.0009043985969369357, + "loss": 0.85144925, + "num_input_tokens_seen": 96869360, + "router_z_loss_mlp": 0.58544922, + "step": 1165, + "time_per_iteration": 2.868560314178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176977, + "balance_loss_mlp": 1.11665666, + "epoch": 0.22431704501731436, + "flos": 608434219008.0, + "grad_norm": 0.059940537627208516, + "language_loss": 0.84960037, + "learning_rate": 0.0009042153042591245, + "loss": 0.86137015, + "num_input_tokens_seen": 96945840, + "router_z_loss_mlp": 0.60302734, + "step": 1166, + "time_per_iteration": 2.8023743629455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116839, + "balance_loss_mlp": 1.11271954, + "epoch": 0.22450942670257792, + "flos": 906583394304.0, + "grad_norm": 0.054742371261080745, + "language_loss": 0.85761929, + "learning_rate": 0.0009040318546510146, + "loss": 0.86930317, + "num_input_tokens_seen": 97029296, + "router_z_loss_mlp": 0.55639648, + "step": 1167, + "time_per_iteration": 3.141993999481201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117745, + "balance_loss_mlp": 1.1215651, + "epoch": 0.22470180838784148, + "flos": 565301182464.0, + "grad_norm": 0.07712318573741421, + "language_loss": 0.8582288, + "learning_rate": 0.0009038482481838275, + "loss": 0.87000328, + "num_input_tokens_seen": 97097776, + "router_z_loss_mlp": 0.55957031, + "step": 1168, + "time_per_iteration": 2.675204038619995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116517, + "balance_loss_mlp": 1.1128844, + "epoch": 0.22489419007310504, + "flos": 834469972992.0, + "grad_norm": 0.05640688657343365, + "language_loss": 0.88303328, + "learning_rate": 0.0009036644849288455, + "loss": 0.89468497, + "num_input_tokens_seen": 97181424, + "router_z_loss_mlp": 0.52319336, + "step": 1169, + "time_per_iteration": 3.0777511596679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148681, + "balance_loss_mlp": 1.09441662, + "epoch": 0.2250865717583686, + "flos": 581057639424.0, + "grad_norm": 0.07174166621143864, + "language_loss": 0.86291218, + "learning_rate": 0.0009034805649574118, + "loss": 0.87439895, + "num_input_tokens_seen": 97252128, + "router_z_loss_mlp": 0.54394531, + "step": 1170, + "time_per_iteration": 2.7120091915130615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157496, + "balance_loss_mlp": 1.10513926, + "epoch": 0.22527895344363216, + "flos": 600406401024.0, + "grad_norm": 0.05497638968028837, + "language_loss": 0.85883957, + "learning_rate": 0.0009032964883409308, + "loss": 0.87041461, + "num_input_tokens_seen": 97326640, + "router_z_loss_mlp": 0.52441406, + "step": 1171, + "time_per_iteration": 2.8770556449890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104728, + "balance_loss_mlp": 1.03001809, + "epoch": 0.22547133512889572, + "flos": 1440751587840.0, + "grad_norm": 0.027786176955518046, + "language_loss": 0.73050535, + "learning_rate": 0.000903112255150867, + "loss": 0.74097812, + "num_input_tokens_seen": 97553952, + "router_z_loss_mlp": 0.17285156, + "step": 1172, + "time_per_iteration": 4.997943639755249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150837, + "balance_loss_mlp": 1.0977174, + "epoch": 0.22566371681415928, + "flos": 490618156032.0, + "grad_norm": 0.06380875138992877, + "language_loss": 0.87640917, + "learning_rate": 0.0009029278654587462, + "loss": 0.88791752, + "num_input_tokens_seen": 97623584, + "router_z_loss_mlp": 0.53173828, + "step": 1173, + "time_per_iteration": 2.6070940494537354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148484, + "balance_loss_mlp": 1.09546018, + "epoch": 0.22585609849942284, + "flos": 604616887296.0, + "grad_norm": 0.057211485944593306, + "language_loss": 0.83189976, + "learning_rate": 0.0009027433193361548, + "loss": 0.84338462, + "num_input_tokens_seen": 97695952, + "router_z_loss_mlp": 0.53027344, + "step": 1174, + "time_per_iteration": 2.7072205543518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114932, + "balance_loss_mlp": 1.09708285, + "epoch": 0.22604848018468643, + "flos": 635568892416.0, + "grad_norm": 0.06182212989299174, + "language_loss": 0.86948568, + "learning_rate": 0.00090255861685474, + "loss": 0.88097882, + "num_input_tokens_seen": 97764544, + "router_z_loss_mlp": 0.52246094, + "step": 1175, + "time_per_iteration": 2.7387607097625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146248, + "balance_loss_mlp": 1.09284246, + "epoch": 0.22624086186995, + "flos": 479875000320.0, + "grad_norm": 0.06871471519475823, + "language_loss": 0.92170686, + "learning_rate": 0.0009023737580862095, + "loss": 0.93316931, + "num_input_tokens_seen": 97830976, + "router_z_loss_mlp": 0.53442383, + "step": 1176, + "time_per_iteration": 2.6016342639923096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160546, + "balance_loss_mlp": 1.11035883, + "epoch": 0.22643324355521355, + "flos": 495814265856.0, + "grad_norm": 0.0563237464245993, + "language_loss": 0.83948356, + "learning_rate": 0.0009021887431023321, + "loss": 0.851089, + "num_input_tokens_seen": 97898800, + "router_z_loss_mlp": 0.50219727, + "step": 1177, + "time_per_iteration": 2.5911412239074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161678, + "balance_loss_mlp": 1.11063254, + "epoch": 0.2266256252404771, + "flos": 561552860160.0, + "grad_norm": 0.06510699727290163, + "language_loss": 0.88054293, + "learning_rate": 0.0009020035719749369, + "loss": 0.8921597, + "num_input_tokens_seen": 97974112, + "router_z_loss_mlp": 0.51098633, + "step": 1178, + "time_per_iteration": 2.747715473175049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182179, + "balance_loss_mlp": 1.1255312, + "epoch": 0.22681800692574067, + "flos": 579688399872.0, + "grad_norm": 0.0760827261000747, + "language_loss": 0.78592283, + "learning_rate": 0.0009018182447759136, + "loss": 0.79774463, + "num_input_tokens_seen": 98056640, + "router_z_loss_mlp": 0.56616211, + "step": 1179, + "time_per_iteration": 2.9912376403808594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177946, + "balance_loss_mlp": 1.12287188, + "epoch": 0.22701038861100423, + "flos": 740166170112.0, + "grad_norm": 0.05857060866656224, + "language_loss": 0.80403864, + "learning_rate": 0.0009016327615772126, + "loss": 0.81581813, + "num_input_tokens_seen": 98135952, + "router_z_loss_mlp": 0.55126953, + "step": 1180, + "time_per_iteration": 2.951934337615967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178867, + "balance_loss_mlp": 1.1241498, + "epoch": 0.2272027702962678, + "flos": 577257560064.0, + "grad_norm": 0.07803208794693026, + "language_loss": 0.88654709, + "learning_rate": 0.0009014471224508451, + "loss": 0.8983357, + "num_input_tokens_seen": 98204288, + "router_z_loss_mlp": 0.54711914, + "step": 1181, + "time_per_iteration": 2.6834704875946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175396, + "balance_loss_mlp": 1.12280107, + "epoch": 0.22739515198153135, + "flos": 544267123200.0, + "grad_norm": 0.07891792311297686, + "language_loss": 0.84171915, + "learning_rate": 0.0009012613274688823, + "loss": 0.85347319, + "num_input_tokens_seen": 98269856, + "router_z_loss_mlp": 0.52636719, + "step": 1182, + "time_per_iteration": 2.6773135662078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193932, + "balance_loss_mlp": 1.13711679, + "epoch": 0.22758753366679493, + "flos": 440163942912.0, + "grad_norm": 0.06685387295915801, + "language_loss": 0.88334668, + "learning_rate": 0.0009010753767034565, + "loss": 0.89528602, + "num_input_tokens_seen": 98335632, + "router_z_loss_mlp": 0.56811523, + "step": 1183, + "time_per_iteration": 2.53671932220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192681, + "balance_loss_mlp": 1.13732028, + "epoch": 0.2277799153520585, + "flos": 729447607296.0, + "grad_norm": 0.05676884979808662, + "language_loss": 0.79381895, + "learning_rate": 0.0009008892702267599, + "loss": 0.80574578, + "num_input_tokens_seen": 98420592, + "router_z_loss_mlp": 0.55297852, + "step": 1184, + "time_per_iteration": 2.9609317779541016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218637, + "balance_loss_mlp": 1.16055822, + "epoch": 0.22797229703732205, + "flos": 526894751232.0, + "grad_norm": 0.11080255811352213, + "language_loss": 0.897048, + "learning_rate": 0.0009007030081110457, + "loss": 0.9092344, + "num_input_tokens_seen": 98488096, + "router_z_loss_mlp": 0.58105469, + "step": 1185, + "time_per_iteration": 2.6087658405303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212656, + "balance_loss_mlp": 1.15872598, + "epoch": 0.2281646787225856, + "flos": 535431347712.0, + "grad_norm": 0.06215110995007368, + "language_loss": 0.8510564, + "learning_rate": 0.000900516590428627, + "loss": 0.8631829, + "num_input_tokens_seen": 98561664, + "router_z_loss_mlp": 0.53955078, + "step": 1186, + "time_per_iteration": 2.66469407081604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206955, + "balance_loss_mlp": 1.15416956, + "epoch": 0.22835706040784917, + "flos": 541381086720.0, + "grad_norm": 0.07510292852734143, + "language_loss": 0.90231287, + "learning_rate": 0.0009003300172518778, + "loss": 0.91438246, + "num_input_tokens_seen": 98634336, + "router_z_loss_mlp": 0.52807617, + "step": 1187, + "time_per_iteration": 2.6872987747192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189379, + "balance_loss_mlp": 1.13559163, + "epoch": 0.22854944209311273, + "flos": 790637635584.0, + "grad_norm": 0.06187047573177096, + "language_loss": 0.84854043, + "learning_rate": 0.0009001432886532321, + "loss": 0.86043417, + "num_input_tokens_seen": 98709600, + "router_z_loss_mlp": 0.53808594, + "step": 1188, + "time_per_iteration": 2.961327314376831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185796, + "balance_loss_mlp": 1.13248527, + "epoch": 0.2287418237783763, + "flos": 469280148480.0, + "grad_norm": 0.0670290505569486, + "language_loss": 0.87277937, + "learning_rate": 0.0008999564047051843, + "loss": 0.88463724, + "num_input_tokens_seen": 98775024, + "router_z_loss_mlp": 0.53320312, + "step": 1189, + "time_per_iteration": 2.5120058059692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119823, + "balance_loss_mlp": 1.14773321, + "epoch": 0.22893420546363985, + "flos": 468029850624.0, + "grad_norm": 0.07775817493182749, + "language_loss": 0.85562766, + "learning_rate": 0.0008997693654802894, + "loss": 0.86760998, + "num_input_tokens_seen": 98845248, + "router_z_loss_mlp": 0.50537109, + "step": 1190, + "time_per_iteration": 2.6584115028381348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203195, + "balance_loss_mlp": 1.15343666, + "epoch": 0.22912658714890344, + "flos": 626258843136.0, + "grad_norm": 0.08092173087187808, + "language_loss": 0.87245274, + "learning_rate": 0.0008995821710511625, + "loss": 0.88448465, + "num_input_tokens_seen": 98913584, + "router_z_loss_mlp": 0.49780273, + "step": 1191, + "time_per_iteration": 2.75514817237854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189711, + "balance_loss_mlp": 1.14376771, + "epoch": 0.229318968834167, + "flos": 503031555072.0, + "grad_norm": 0.058050392882622655, + "language_loss": 0.85975361, + "learning_rate": 0.0008993948214904786, + "loss": 0.8716507, + "num_input_tokens_seen": 98978608, + "router_z_loss_mlp": 0.45922852, + "step": 1192, + "time_per_iteration": 2.5808064937591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132885, + "balance_loss_mlp": 1.11629128, + "epoch": 0.22951135051943056, + "flos": 1374827613696.0, + "grad_norm": 0.04438752541684951, + "language_loss": 0.78422213, + "learning_rate": 0.0008992073168709733, + "loss": 0.795551, + "num_input_tokens_seen": 99207424, + "router_z_loss_mlp": 0.16601562, + "step": 1193, + "time_per_iteration": 4.915351629257202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170271, + "balance_loss_mlp": 1.11338401, + "epoch": 0.22970373220469412, + "flos": 644345197056.0, + "grad_norm": 0.06516354982073377, + "language_loss": 0.79226351, + "learning_rate": 0.0008990196572654427, + "loss": 0.80396616, + "num_input_tokens_seen": 99290592, + "router_z_loss_mlp": 0.56933594, + "step": 1194, + "time_per_iteration": 2.914353609085083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159508, + "balance_loss_mlp": 1.10982203, + "epoch": 0.22989611388995768, + "flos": 500209758720.0, + "grad_norm": 0.053033431306196574, + "language_loss": 0.88186455, + "learning_rate": 0.0008988318427467426, + "loss": 0.89345956, + "num_input_tokens_seen": 99366096, + "router_z_loss_mlp": 0.49707031, + "step": 1195, + "time_per_iteration": 2.763303756713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146949, + "balance_loss_mlp": 1.09754825, + "epoch": 0.23008849557522124, + "flos": 1096522790400.0, + "grad_norm": 0.06471781599702997, + "language_loss": 0.87142104, + "learning_rate": 0.0008986438733877887, + "loss": 0.88289052, + "num_input_tokens_seen": 99456768, + "router_z_loss_mlp": 0.49414062, + "step": 1196, + "time_per_iteration": 3.453037738800049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138036, + "balance_loss_mlp": 1.08901691, + "epoch": 0.2302808772604848, + "flos": 683648418816.0, + "grad_norm": 0.05831436273017673, + "language_loss": 0.84795159, + "learning_rate": 0.0008984557492615576, + "loss": 0.85933197, + "num_input_tokens_seen": 99539616, + "router_z_loss_mlp": 0.49023438, + "step": 1197, + "time_per_iteration": 2.9209883213043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147831, + "balance_loss_mlp": 1.09816873, + "epoch": 0.23047325894574835, + "flos": 528923271168.0, + "grad_norm": 0.06183090029168821, + "language_loss": 0.90001792, + "learning_rate": 0.0008982674704410854, + "loss": 0.91149628, + "num_input_tokens_seen": 99612064, + "router_z_loss_mlp": 0.49658203, + "step": 1198, + "time_per_iteration": 2.723980665206909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115357, + "balance_loss_mlp": 1.10364521, + "epoch": 0.23066564063101191, + "flos": 682766309376.0, + "grad_norm": 0.06439147944581719, + "language_loss": 0.78128076, + "learning_rate": 0.0008980790369994682, + "loss": 0.7928164, + "num_input_tokens_seen": 99691040, + "router_z_loss_mlp": 0.49926758, + "step": 1199, + "time_per_iteration": 2.968733787536621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148741, + "balance_loss_mlp": 1.09817219, + "epoch": 0.2308580223162755, + "flos": 558523662336.0, + "grad_norm": 0.060755539801175186, + "language_loss": 0.8790828, + "learning_rate": 0.000897890449009863, + "loss": 0.89057022, + "num_input_tokens_seen": 99762016, + "router_z_loss_mlp": 0.50561523, + "step": 1200, + "time_per_iteration": 2.7373695373535156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159063, + "balance_loss_mlp": 1.11052144, + "epoch": 0.23105040400153906, + "flos": 555669932544.0, + "grad_norm": 0.09508340337221405, + "language_loss": 0.9041636, + "learning_rate": 0.0008977017065454853, + "loss": 0.91575426, + "num_input_tokens_seen": 99835552, + "router_z_loss_mlp": 0.4855957, + "step": 1201, + "time_per_iteration": 2.6561479568481445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172289, + "balance_loss_mlp": 1.12393796, + "epoch": 0.23124278568680262, + "flos": 704788936704.0, + "grad_norm": 0.06896397472633412, + "language_loss": 0.8110497, + "learning_rate": 0.0008975128096796121, + "loss": 0.82277262, + "num_input_tokens_seen": 99910784, + "router_z_loss_mlp": 0.48413086, + "step": 1202, + "time_per_iteration": 2.850882053375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166428, + "balance_loss_mlp": 1.11583591, + "epoch": 0.23143516737206618, + "flos": 612768043008.0, + "grad_norm": 0.07234791297382964, + "language_loss": 0.86751068, + "learning_rate": 0.0008973237584855794, + "loss": 0.87917495, + "num_input_tokens_seen": 99991120, + "router_z_loss_mlp": 0.50610352, + "step": 1203, + "time_per_iteration": 2.898651599884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201199, + "balance_loss_mlp": 1.14912796, + "epoch": 0.23162754905732974, + "flos": 389242796544.0, + "grad_norm": 0.0647782155366788, + "language_loss": 0.82535917, + "learning_rate": 0.0008971345530367832, + "loss": 0.83737111, + "num_input_tokens_seen": 100053888, + "router_z_loss_mlp": 0.52172852, + "step": 1204, + "time_per_iteration": 2.479710102081299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188056, + "balance_loss_mlp": 1.13743997, + "epoch": 0.2318199307425933, + "flos": 667778535936.0, + "grad_norm": 0.07976239468268423, + "language_loss": 0.86050093, + "learning_rate": 0.0008969451934066799, + "loss": 0.87238145, + "num_input_tokens_seen": 100124176, + "router_z_loss_mlp": 0.50561523, + "step": 1205, + "time_per_iteration": 2.7891948223114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190172, + "balance_loss_mlp": 1.13834012, + "epoch": 0.23201231242785686, + "flos": 666399757824.0, + "grad_norm": 0.08603625620414594, + "language_loss": 0.8068459, + "learning_rate": 0.0008967556796687854, + "loss": 0.81874764, + "num_input_tokens_seen": 100205296, + "router_z_loss_mlp": 0.51879883, + "step": 1206, + "time_per_iteration": 2.879742383956909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182453, + "balance_loss_mlp": 1.1313839, + "epoch": 0.23220469411312042, + "flos": 748816565760.0, + "grad_norm": 0.06613018456643845, + "language_loss": 0.8416872, + "learning_rate": 0.0008965660118966752, + "loss": 0.85351169, + "num_input_tokens_seen": 100279440, + "router_z_loss_mlp": 0.51098633, + "step": 1207, + "time_per_iteration": 2.8900513648986816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163905, + "balance_loss_mlp": 1.11610246, + "epoch": 0.232397075798384, + "flos": 667061982720.0, + "grad_norm": 0.06058209183838784, + "language_loss": 0.90754479, + "learning_rate": 0.0008963761901639851, + "loss": 0.91918385, + "num_input_tokens_seen": 100354512, + "router_z_loss_mlp": 0.47802734, + "step": 1208, + "time_per_iteration": 2.805534601211548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176094, + "balance_loss_mlp": 1.12457156, + "epoch": 0.23258945748364757, + "flos": 610218261504.0, + "grad_norm": 0.06993420403149982, + "language_loss": 0.83909518, + "learning_rate": 0.0008961862145444103, + "loss": 0.85085618, + "num_input_tokens_seen": 100426848, + "router_z_loss_mlp": 0.51538086, + "step": 1209, + "time_per_iteration": 2.6882550716400146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197419, + "balance_loss_mlp": 1.14587319, + "epoch": 0.23278183916891113, + "flos": 489651982848.0, + "grad_norm": 0.08646594069324176, + "language_loss": 0.85994279, + "learning_rate": 0.0008959960851117059, + "loss": 0.87191701, + "num_input_tokens_seen": 100496176, + "router_z_loss_mlp": 0.51611328, + "step": 1210, + "time_per_iteration": 2.6176648139953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118703, + "balance_loss_mlp": 1.13340998, + "epoch": 0.23297422085417469, + "flos": 511585403904.0, + "grad_norm": 0.06670419812311852, + "language_loss": 0.84013158, + "learning_rate": 0.0008958058019396868, + "loss": 0.85200191, + "num_input_tokens_seen": 100575072, + "router_z_loss_mlp": 0.53637695, + "step": 1211, + "time_per_iteration": 2.7867624759674072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177443, + "balance_loss_mlp": 1.12754154, + "epoch": 0.23316660253943824, + "flos": 546421552128.0, + "grad_norm": 0.08722593193124767, + "language_loss": 0.87226063, + "learning_rate": 0.0008956153651022274, + "loss": 0.88403505, + "num_input_tokens_seen": 100648304, + "router_z_loss_mlp": 0.49926758, + "step": 1212, + "time_per_iteration": 2.671705961227417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169147, + "balance_loss_mlp": 1.11726665, + "epoch": 0.2333589842247018, + "flos": 510256184832.0, + "grad_norm": 0.06082314874639417, + "language_loss": 0.84296238, + "learning_rate": 0.0008954247746732618, + "loss": 0.85465384, + "num_input_tokens_seen": 100717616, + "router_z_loss_mlp": 0.51904297, + "step": 1213, + "time_per_iteration": 2.58005952835083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163838, + "balance_loss_mlp": 1.1156534, + "epoch": 0.23355136590996536, + "flos": 663148104192.0, + "grad_norm": 0.06006865966510304, + "language_loss": 0.91204965, + "learning_rate": 0.0008952340307267837, + "loss": 0.92368799, + "num_input_tokens_seen": 100797056, + "router_z_loss_mlp": 0.48144531, + "step": 1214, + "time_per_iteration": 2.842824697494507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149903, + "balance_loss_mlp": 1.09983516, + "epoch": 0.23374374759522892, + "flos": 508457461248.0, + "grad_norm": 0.07140080071894721, + "language_loss": 0.84202802, + "learning_rate": 0.0008950431333368468, + "loss": 0.85352707, + "num_input_tokens_seen": 100863632, + "router_z_loss_mlp": 0.50097656, + "step": 1215, + "time_per_iteration": 2.5616672039031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155221, + "balance_loss_mlp": 1.10656011, + "epoch": 0.2339361292804925, + "flos": 1294455499776.0, + "grad_norm": 0.083723319453273, + "language_loss": 0.85366404, + "learning_rate": 0.0008948520825775634, + "loss": 0.86521626, + "num_input_tokens_seen": 100950272, + "router_z_loss_mlp": 0.48657227, + "step": 1216, + "time_per_iteration": 3.652561664581299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114764, + "balance_loss_mlp": 1.09895492, + "epoch": 0.23412851096575607, + "flos": 705928006656.0, + "grad_norm": 0.05781662545039131, + "language_loss": 0.84181142, + "learning_rate": 0.0008946608785231067, + "loss": 0.85328782, + "num_input_tokens_seen": 101031008, + "router_z_loss_mlp": 0.48706055, + "step": 1217, + "time_per_iteration": 2.861449956893921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131497, + "balance_loss_mlp": 1.08352745, + "epoch": 0.23432089265101963, + "flos": 438263903232.0, + "grad_norm": 0.06428977242182035, + "language_loss": 0.85432529, + "learning_rate": 0.0008944695212477084, + "loss": 0.86564028, + "num_input_tokens_seen": 101094688, + "router_z_loss_mlp": 0.47973633, + "step": 1218, + "time_per_iteration": 2.540524959564209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148618, + "balance_loss_mlp": 1.09907508, + "epoch": 0.2345132743362832, + "flos": 480939918336.0, + "grad_norm": 0.060914019840806265, + "language_loss": 0.86493349, + "learning_rate": 0.0008942780108256599, + "loss": 0.87641972, + "num_input_tokens_seen": 101163744, + "router_z_loss_mlp": 0.49560547, + "step": 1219, + "time_per_iteration": 2.613769769668579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142119, + "balance_loss_mlp": 1.09100199, + "epoch": 0.23470565602154675, + "flos": 411453001728.0, + "grad_norm": 0.05108155821019921, + "language_loss": 0.87340164, + "learning_rate": 0.0008940863473313121, + "loss": 0.88482285, + "num_input_tokens_seen": 101226480, + "router_z_loss_mlp": 0.51123047, + "step": 1220, + "time_per_iteration": 2.4549899101257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145999, + "balance_loss_mlp": 1.09702742, + "epoch": 0.2348980377068103, + "flos": 545450609664.0, + "grad_norm": 0.07702998226564757, + "language_loss": 0.8851074, + "learning_rate": 0.0008938945308390756, + "loss": 0.8965674, + "num_input_tokens_seen": 101291824, + "router_z_loss_mlp": 0.48974609, + "step": 1221, + "time_per_iteration": 2.6133854389190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149321, + "balance_loss_mlp": 1.10211444, + "epoch": 0.23509041939207387, + "flos": 575740389888.0, + "grad_norm": 0.057479910137590906, + "language_loss": 0.88199294, + "learning_rate": 0.00089370256142342, + "loss": 0.89348614, + "num_input_tokens_seen": 101367216, + "router_z_loss_mlp": 0.47192383, + "step": 1222, + "time_per_iteration": 2.713489532470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141929, + "balance_loss_mlp": 1.09286284, + "epoch": 0.23528280107733743, + "flos": 588843177984.0, + "grad_norm": 0.05442066188859713, + "language_loss": 0.85879123, + "learning_rate": 0.0008935104391588746, + "loss": 0.87021047, + "num_input_tokens_seen": 101438992, + "router_z_loss_mlp": 0.49121094, + "step": 1223, + "time_per_iteration": 2.7304563522338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145924, + "balance_loss_mlp": 1.09447336, + "epoch": 0.235475182762601, + "flos": 823328893440.0, + "grad_norm": 0.05049406517739995, + "language_loss": 0.8341555, + "learning_rate": 0.0008933181641200276, + "loss": 0.84561473, + "num_input_tokens_seen": 101534464, + "router_z_loss_mlp": 0.51513672, + "step": 1224, + "time_per_iteration": 3.122603416442871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139619, + "balance_loss_mlp": 1.09279394, + "epoch": 0.23566756444786457, + "flos": 680164770816.0, + "grad_norm": 0.0678885239417847, + "language_loss": 0.8627063, + "learning_rate": 0.0008931257363815271, + "loss": 0.87410253, + "num_input_tokens_seen": 101616496, + "router_z_loss_mlp": 0.46826172, + "step": 1225, + "time_per_iteration": 2.86014986038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142208, + "balance_loss_mlp": 1.09490585, + "epoch": 0.23585994613312813, + "flos": 701811495936.0, + "grad_norm": 0.0639396043769501, + "language_loss": 0.90318632, + "learning_rate": 0.0008929331560180798, + "loss": 0.91460842, + "num_input_tokens_seen": 101694496, + "router_z_loss_mlp": 0.47338867, + "step": 1226, + "time_per_iteration": 2.9069020748138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158077, + "balance_loss_mlp": 1.10924876, + "epoch": 0.2360523278183917, + "flos": 524176842240.0, + "grad_norm": 0.05735405278544162, + "language_loss": 0.9124881, + "learning_rate": 0.0008927404231044525, + "loss": 0.92406881, + "num_input_tokens_seen": 101766160, + "router_z_loss_mlp": 0.48828125, + "step": 1227, + "time_per_iteration": 2.745591163635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154284, + "balance_loss_mlp": 1.10571766, + "epoch": 0.23624470950365525, + "flos": 524310091776.0, + "grad_norm": 0.062458312515348655, + "language_loss": 0.8233285, + "learning_rate": 0.0008925475377154703, + "loss": 0.83487129, + "num_input_tokens_seen": 101844160, + "router_z_loss_mlp": 0.48583984, + "step": 1228, + "time_per_iteration": 2.7165796756744385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147881, + "balance_loss_mlp": 1.09664452, + "epoch": 0.2364370911889188, + "flos": 596811525120.0, + "grad_norm": 0.06307879716822463, + "language_loss": 0.82915187, + "learning_rate": 0.0008923544999260183, + "loss": 0.84063065, + "num_input_tokens_seen": 101917968, + "router_z_loss_mlp": 0.51293945, + "step": 1229, + "time_per_iteration": 2.787444829940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156102, + "balance_loss_mlp": 1.10567617, + "epoch": 0.23662947287418237, + "flos": 756849153024.0, + "grad_norm": 0.06236445133400911, + "language_loss": 0.92471206, + "learning_rate": 0.00089216130981104, + "loss": 0.9362731, + "num_input_tokens_seen": 101996880, + "router_z_loss_mlp": 0.50439453, + "step": 1230, + "time_per_iteration": 3.0671463012695312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148349, + "balance_loss_mlp": 1.09816241, + "epoch": 0.23682185455944593, + "flos": 546167162880.0, + "grad_norm": 0.06420697058211047, + "language_loss": 0.82893002, + "learning_rate": 0.000891967967445539, + "loss": 0.84041357, + "num_input_tokens_seen": 102067936, + "router_z_loss_mlp": 0.50195312, + "step": 1231, + "time_per_iteration": 2.692819356918335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147263, + "balance_loss_mlp": 1.09733796, + "epoch": 0.2370142362447095, + "flos": 662285818368.0, + "grad_norm": 0.044472050821074895, + "language_loss": 0.89257467, + "learning_rate": 0.0008917744729045772, + "loss": 0.90404725, + "num_input_tokens_seen": 102147552, + "router_z_loss_mlp": 0.49975586, + "step": 1232, + "time_per_iteration": 2.911123037338257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151473, + "balance_loss_mlp": 1.10190618, + "epoch": 0.23720661792997308, + "flos": 683670813696.0, + "grad_norm": 0.055115174481180494, + "language_loss": 0.84317499, + "learning_rate": 0.0008915808262632757, + "loss": 0.85468972, + "num_input_tokens_seen": 102224480, + "router_z_loss_mlp": 0.49633789, + "step": 1233, + "time_per_iteration": 2.8429055213928223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164245, + "balance_loss_mlp": 1.1117928, + "epoch": 0.23739899961523664, + "flos": 558909103104.0, + "grad_norm": 0.07089823280283834, + "language_loss": 0.93916011, + "learning_rate": 0.0008913870275968148, + "loss": 0.95080256, + "num_input_tokens_seen": 102297392, + "router_z_loss_mlp": 0.52392578, + "step": 1234, + "time_per_iteration": 2.7355082035064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152305, + "balance_loss_mlp": 1.10321498, + "epoch": 0.2375913813005002, + "flos": 889546904064.0, + "grad_norm": 0.06512180670183462, + "language_loss": 0.87916219, + "learning_rate": 0.0008911930769804342, + "loss": 0.8906852, + "num_input_tokens_seen": 102386032, + "router_z_loss_mlp": 0.49145508, + "step": 1235, + "time_per_iteration": 3.320653200149536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115702, + "balance_loss_mlp": 1.10549772, + "epoch": 0.23778376298576376, + "flos": 641120707584.0, + "grad_norm": 0.04926889071384256, + "language_loss": 0.91928077, + "learning_rate": 0.0008909989744894318, + "loss": 0.93085092, + "num_input_tokens_seen": 102463504, + "router_z_loss_mlp": 0.51513672, + "step": 1236, + "time_per_iteration": 2.860095500946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114863, + "balance_loss_mlp": 1.09808517, + "epoch": 0.23797614467102732, + "flos": 616820313600.0, + "grad_norm": 0.06373579401102465, + "language_loss": 0.81724823, + "learning_rate": 0.0008908047201991649, + "loss": 0.82873452, + "num_input_tokens_seen": 102529632, + "router_z_loss_mlp": 0.50512695, + "step": 1237, + "time_per_iteration": 2.7173092365264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146815, + "balance_loss_mlp": 1.10065758, + "epoch": 0.23816852635629088, + "flos": 624245004288.0, + "grad_norm": 0.06973577397583665, + "language_loss": 0.86895192, + "learning_rate": 0.0008906103141850502, + "loss": 0.88042009, + "num_input_tokens_seen": 102610192, + "router_z_loss_mlp": 0.46142578, + "step": 1238, + "time_per_iteration": 2.9070518016815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149112, + "balance_loss_mlp": 1.10068893, + "epoch": 0.23836090804155444, + "flos": 521431769088.0, + "grad_norm": 0.07438040904238923, + "language_loss": 0.88608682, + "learning_rate": 0.0008904157565225621, + "loss": 0.897578, + "num_input_tokens_seen": 102681216, + "router_z_loss_mlp": 0.48461914, + "step": 1239, + "time_per_iteration": 2.598175287246704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114606, + "balance_loss_mlp": 1.09758997, + "epoch": 0.238553289726818, + "flos": 1153991660544.0, + "grad_norm": 0.07265689268382322, + "language_loss": 0.82424903, + "learning_rate": 0.000890221047287235, + "loss": 0.83570957, + "num_input_tokens_seen": 102777184, + "router_z_loss_mlp": 0.48486328, + "step": 1240, + "time_per_iteration": 3.5255463123321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149116, + "balance_loss_mlp": 1.10207629, + "epoch": 0.23874567141208156, + "flos": 499861393920.0, + "grad_norm": 0.07692592831537566, + "language_loss": 0.91524613, + "learning_rate": 0.0008900261865546615, + "loss": 0.92673725, + "num_input_tokens_seen": 102845744, + "router_z_loss_mlp": 0.47021484, + "step": 1241, + "time_per_iteration": 2.626298189163208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150585, + "balance_loss_mlp": 1.10101807, + "epoch": 0.23893805309734514, + "flos": 556934911488.0, + "grad_norm": 0.06193436068824588, + "language_loss": 0.85487348, + "learning_rate": 0.0008898311744004936, + "loss": 0.86637932, + "num_input_tokens_seen": 102918064, + "router_z_loss_mlp": 0.49584961, + "step": 1242, + "time_per_iteration": 2.6845884323120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143159, + "balance_loss_mlp": 1.09638107, + "epoch": 0.2391304347826087, + "flos": 549270512640.0, + "grad_norm": 0.06489370510499948, + "language_loss": 0.87195957, + "learning_rate": 0.0008896360109004414, + "loss": 0.88339114, + "num_input_tokens_seen": 102983920, + "router_z_loss_mlp": 0.46801758, + "step": 1243, + "time_per_iteration": 2.6279244422912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149339, + "balance_loss_mlp": 1.10239482, + "epoch": 0.23932281646787226, + "flos": 516050279424.0, + "grad_norm": 0.05690023470638135, + "language_loss": 0.84913921, + "learning_rate": 0.0008894406961302742, + "loss": 0.8606326, + "num_input_tokens_seen": 103053328, + "router_z_loss_mlp": 0.46948242, + "step": 1244, + "time_per_iteration": 2.5823607444763184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161498, + "balance_loss_mlp": 1.11591244, + "epoch": 0.23951519815313582, + "flos": 743682124800.0, + "grad_norm": 0.06599652790645752, + "language_loss": 0.84225279, + "learning_rate": 0.0008892452301658201, + "loss": 0.85386777, + "num_input_tokens_seen": 103128208, + "router_z_loss_mlp": 0.45581055, + "step": 1245, + "time_per_iteration": 3.0007240772247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153792, + "balance_loss_mlp": 1.1045351, + "epoch": 0.23970757983839938, + "flos": 554118257664.0, + "grad_norm": 0.05569216777143309, + "language_loss": 0.83851659, + "learning_rate": 0.0008890496130829653, + "loss": 0.8500545, + "num_input_tokens_seen": 103197392, + "router_z_loss_mlp": 0.49316406, + "step": 1246, + "time_per_iteration": 2.656524658203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149976, + "balance_loss_mlp": 1.10424757, + "epoch": 0.23989996152366294, + "flos": 480655793664.0, + "grad_norm": 0.0643203237989141, + "language_loss": 0.85808307, + "learning_rate": 0.0008888538449576555, + "loss": 0.86958289, + "num_input_tokens_seen": 103265328, + "router_z_loss_mlp": 0.45751953, + "step": 1247, + "time_per_iteration": 2.5420141220092773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148571, + "balance_loss_mlp": 1.09993315, + "epoch": 0.2400923432089265, + "flos": 485310818304.0, + "grad_norm": 0.07931889136759729, + "language_loss": 0.83083689, + "learning_rate": 0.0008886579258658944, + "loss": 0.84232259, + "num_input_tokens_seen": 103331632, + "router_z_loss_mlp": 0.48632812, + "step": 1248, + "time_per_iteration": 2.574025869369507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136833, + "balance_loss_mlp": 1.08786154, + "epoch": 0.24028472489419006, + "flos": 623555615232.0, + "grad_norm": 0.057547694087262784, + "language_loss": 0.85210383, + "learning_rate": 0.0008884618558837446, + "loss": 0.8634721, + "num_input_tokens_seen": 103405408, + "router_z_loss_mlp": 0.48974609, + "step": 1249, + "time_per_iteration": 2.808790922164917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146334, + "balance_loss_mlp": 1.09407234, + "epoch": 0.24047710657945365, + "flos": 601602370560.0, + "grad_norm": 0.05843363394571656, + "language_loss": 0.87170362, + "learning_rate": 0.0008882656350873273, + "loss": 0.88316691, + "num_input_tokens_seen": 103487216, + "router_z_loss_mlp": 0.52319336, + "step": 1250, + "time_per_iteration": 2.839341163635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139888, + "balance_loss_mlp": 1.08998704, + "epoch": 0.2406694882647172, + "flos": 841558781952.0, + "grad_norm": 0.06920486589868534, + "language_loss": 0.87495792, + "learning_rate": 0.0008880692635528219, + "loss": 0.88635677, + "num_input_tokens_seen": 103568640, + "router_z_loss_mlp": 0.49975586, + "step": 1251, + "time_per_iteration": 3.0422415733337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141134, + "balance_loss_mlp": 1.09404635, + "epoch": 0.24086186994998077, + "flos": 527057736192.0, + "grad_norm": 0.09445201185980338, + "language_loss": 0.89987123, + "learning_rate": 0.0008878727413564669, + "loss": 0.91128266, + "num_input_tokens_seen": 103640784, + "router_z_loss_mlp": 0.47094727, + "step": 1252, + "time_per_iteration": 2.7974343299865723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110917, + "balance_loss_mlp": 1.09066832, + "epoch": 0.24105425163524433, + "flos": 1338261378048.0, + "grad_norm": 0.0270998190046769, + "language_loss": 0.80135596, + "learning_rate": 0.0008876760685745588, + "loss": 0.81244767, + "num_input_tokens_seen": 103865824, + "router_z_loss_mlp": 0.18457031, + "step": 1253, + "time_per_iteration": 4.892668724060059 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150056, + "balance_loss_mlp": 1.09707963, + "epoch": 0.24124663332050789, + "flos": 614102404608.0, + "grad_norm": 0.06472275672686992, + "language_loss": 0.79044139, + "learning_rate": 0.0008874792452834528, + "loss": 0.80194199, + "num_input_tokens_seen": 103939872, + "router_z_loss_mlp": 0.53076172, + "step": 1254, + "time_per_iteration": 2.759533643722534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144784, + "balance_loss_mlp": 1.09397733, + "epoch": 0.24143901500577145, + "flos": 575540328960.0, + "grad_norm": 0.08671647217417044, + "language_loss": 0.87847424, + "learning_rate": 0.0008872822715595626, + "loss": 0.88992208, + "num_input_tokens_seen": 104011120, + "router_z_loss_mlp": 0.50878906, + "step": 1255, + "time_per_iteration": 2.6758921146392822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136115, + "balance_loss_mlp": 1.08731091, + "epoch": 0.241631396691035, + "flos": 495181776384.0, + "grad_norm": 0.07818195128513271, + "language_loss": 0.87750483, + "learning_rate": 0.0008870851474793598, + "loss": 0.88886595, + "num_input_tokens_seen": 104077040, + "router_z_loss_mlp": 0.48803711, + "step": 1256, + "time_per_iteration": 2.5903451442718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140246, + "balance_loss_mlp": 1.09196591, + "epoch": 0.24182377837629856, + "flos": 636191470080.0, + "grad_norm": 0.06462138017812241, + "language_loss": 0.90108514, + "learning_rate": 0.0008868878731193752, + "loss": 0.91248751, + "num_input_tokens_seen": 104150880, + "router_z_loss_mlp": 0.48291016, + "step": 1257, + "time_per_iteration": 2.9156484603881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131411, + "balance_loss_mlp": 1.08611095, + "epoch": 0.24201616006156215, + "flos": 515219927040.0, + "grad_norm": 0.06839520252820154, + "language_loss": 0.89823216, + "learning_rate": 0.0008866904485561973, + "loss": 0.90954626, + "num_input_tokens_seen": 104223696, + "router_z_loss_mlp": 0.45361328, + "step": 1258, + "time_per_iteration": 2.709073066711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128152, + "balance_loss_mlp": 1.07698727, + "epoch": 0.2422085417468257, + "flos": 615144927744.0, + "grad_norm": 0.061516465429869265, + "language_loss": 0.83619797, + "learning_rate": 0.000886492873866473, + "loss": 0.84747952, + "num_input_tokens_seen": 104301728, + "router_z_loss_mlp": 0.51245117, + "step": 1259, + "time_per_iteration": 2.8063783645629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122174, + "balance_loss_mlp": 1.07315516, + "epoch": 0.24240092343208927, + "flos": 585794156544.0, + "grad_norm": 0.07532562043269028, + "language_loss": 0.85057306, + "learning_rate": 0.000886295149126908, + "loss": 0.86179483, + "num_input_tokens_seen": 104374480, + "router_z_loss_mlp": 0.49023438, + "step": 1260, + "time_per_iteration": 2.7702596187591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120309, + "balance_loss_mlp": 1.07291138, + "epoch": 0.24259330511735283, + "flos": 762257806848.0, + "grad_norm": 0.06506459806255929, + "language_loss": 0.86249155, + "learning_rate": 0.0008860972744142655, + "loss": 0.87369466, + "num_input_tokens_seen": 104452384, + "router_z_loss_mlp": 0.47363281, + "step": 1261, + "time_per_iteration": 2.9010353088378906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111356, + "balance_loss_mlp": 1.06575668, + "epoch": 0.2427856868026164, + "flos": 626878849536.0, + "grad_norm": 0.05333874014607912, + "language_loss": 0.82215619, + "learning_rate": 0.0008858992498053671, + "loss": 0.83329183, + "num_input_tokens_seen": 104532576, + "router_z_loss_mlp": 0.47729492, + "step": 1262, + "time_per_iteration": 2.8307647705078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105759, + "balance_loss_mlp": 1.08506405, + "epoch": 0.24297806848787995, + "flos": 1511653985280.0, + "grad_norm": 0.04388178085496151, + "language_loss": 0.7658875, + "learning_rate": 0.0008857010753770934, + "loss": 0.77694511, + "num_input_tokens_seen": 104765216, + "router_z_loss_mlp": 0.20703125, + "step": 1263, + "time_per_iteration": 4.839150428771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113047, + "balance_loss_mlp": 1.06517243, + "epoch": 0.2431704501731435, + "flos": 541949336064.0, + "grad_norm": 0.07576677138650743, + "language_loss": 0.83877796, + "learning_rate": 0.0008855027512063817, + "loss": 0.84990847, + "num_input_tokens_seen": 104836912, + "router_z_loss_mlp": 0.47924805, + "step": 1264, + "time_per_iteration": 2.6955387592315674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116702, + "balance_loss_mlp": 1.06847, + "epoch": 0.24336283185840707, + "flos": 523845729792.0, + "grad_norm": 0.08737911579836782, + "language_loss": 0.86160326, + "learning_rate": 0.0008853042773702292, + "loss": 0.87277025, + "num_input_tokens_seen": 104909280, + "router_z_loss_mlp": 0.48217773, + "step": 1265, + "time_per_iteration": 2.718477725982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123795, + "balance_loss_mlp": 1.07191551, + "epoch": 0.24355521354367063, + "flos": 537111502848.0, + "grad_norm": 0.05410456343654981, + "language_loss": 0.87916005, + "learning_rate": 0.0008851056539456896, + "loss": 0.89039803, + "num_input_tokens_seen": 104982560, + "router_z_loss_mlp": 0.51855469, + "step": 1266, + "time_per_iteration": 2.668398380279541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127352, + "balance_loss_mlp": 1.07792759, + "epoch": 0.24374759522893422, + "flos": 930461271552.0, + "grad_norm": 0.06341671281787149, + "language_loss": 0.82546353, + "learning_rate": 0.0008849068810098755, + "loss": 0.8367371, + "num_input_tokens_seen": 105075056, + "router_z_loss_mlp": 0.49414062, + "step": 1267, + "time_per_iteration": 3.348644971847534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132731, + "balance_loss_mlp": 1.08523834, + "epoch": 0.24393997691419778, + "flos": 427787619840.0, + "grad_norm": 0.08675992555990221, + "language_loss": 0.8333391, + "learning_rate": 0.0008847079586399575, + "loss": 0.84466636, + "num_input_tokens_seen": 105137536, + "router_z_loss_mlp": 0.47509766, + "step": 1268, + "time_per_iteration": 2.549433946609497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126198, + "balance_loss_mlp": 1.07994461, + "epoch": 0.24413235859946134, + "flos": 578853651456.0, + "grad_norm": 0.07249150513377325, + "language_loss": 0.8672694, + "learning_rate": 0.0008845088869131641, + "loss": 0.87853134, + "num_input_tokens_seen": 105204848, + "router_z_loss_mlp": 0.46289062, + "step": 1269, + "time_per_iteration": 2.6586451530456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149975, + "balance_loss_mlp": 1.10145724, + "epoch": 0.2443247402847249, + "flos": 529859708928.0, + "grad_norm": 0.06266770628228314, + "language_loss": 0.89411461, + "learning_rate": 0.0008843096659067818, + "loss": 0.90561438, + "num_input_tokens_seen": 105273456, + "router_z_loss_mlp": 0.48510742, + "step": 1270, + "time_per_iteration": 2.626946210861206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146652, + "balance_loss_mlp": 1.10228229, + "epoch": 0.24451712196998845, + "flos": 696321349632.0, + "grad_norm": 0.056965438466979365, + "language_loss": 0.86992264, + "learning_rate": 0.000884110295698155, + "loss": 0.88138914, + "num_input_tokens_seen": 105355488, + "router_z_loss_mlp": 0.44335938, + "step": 1271, + "time_per_iteration": 2.970078706741333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160922, + "balance_loss_mlp": 1.11080623, + "epoch": 0.24470950365525201, + "flos": 529832544768.0, + "grad_norm": 0.06894839907125858, + "language_loss": 0.86557794, + "learning_rate": 0.0008839107763646861, + "loss": 0.87718713, + "num_input_tokens_seen": 105421568, + "router_z_loss_mlp": 0.5012207, + "step": 1272, + "time_per_iteration": 2.592349052429199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183532, + "balance_loss_mlp": 1.13437057, + "epoch": 0.24490188534051557, + "flos": 491342049792.0, + "grad_norm": 0.06647703149266906, + "language_loss": 0.90856385, + "learning_rate": 0.0008837111079838353, + "loss": 0.92039919, + "num_input_tokens_seen": 105493072, + "router_z_loss_mlp": 0.49194336, + "step": 1273, + "time_per_iteration": 2.7098910808563232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118943, + "balance_loss_mlp": 1.14289117, + "epoch": 0.24509426702577913, + "flos": 474155057664.0, + "grad_norm": 0.05923779703064254, + "language_loss": 0.90316379, + "learning_rate": 0.000883511290633121, + "loss": 0.91505814, + "num_input_tokens_seen": 105559840, + "router_z_loss_mlp": 0.46533203, + "step": 1274, + "time_per_iteration": 2.5714197158813477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186005, + "balance_loss_mlp": 1.13739181, + "epoch": 0.24528664871104272, + "flos": 550592391168.0, + "grad_norm": 0.060927364177961095, + "language_loss": 0.92697686, + "learning_rate": 0.000883311324390119, + "loss": 0.93883693, + "num_input_tokens_seen": 105634448, + "router_z_loss_mlp": 0.48608398, + "step": 1275, + "time_per_iteration": 2.740896224975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189584, + "balance_loss_mlp": 1.13474798, + "epoch": 0.24547903039630628, + "flos": 825903641088.0, + "grad_norm": 0.07775603238406727, + "language_loss": 0.82056022, + "learning_rate": 0.0008831112093324629, + "loss": 0.83245611, + "num_input_tokens_seen": 105711936, + "router_z_loss_mlp": 0.5480957, + "step": 1276, + "time_per_iteration": 3.0821468830108643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190059, + "balance_loss_mlp": 1.13927567, + "epoch": 0.24567141208156984, + "flos": 591598162944.0, + "grad_norm": 0.05600773018776359, + "language_loss": 0.89543378, + "learning_rate": 0.0008829109455378444, + "loss": 0.90733445, + "num_input_tokens_seen": 105780240, + "router_z_loss_mlp": 0.50830078, + "step": 1277, + "time_per_iteration": 2.7299413681030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192458, + "balance_loss_mlp": 1.14241397, + "epoch": 0.2458637937668334, + "flos": 547874482176.0, + "grad_norm": 0.05156937738675093, + "language_loss": 0.87083036, + "learning_rate": 0.000882710533084013, + "loss": 0.88275498, + "num_input_tokens_seen": 105849840, + "router_z_loss_mlp": 0.5, + "step": 1278, + "time_per_iteration": 2.6295228004455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185847, + "balance_loss_mlp": 1.13568354, + "epoch": 0.24605617545209696, + "flos": 515894635008.0, + "grad_norm": 0.05927927368096647, + "language_loss": 0.90088928, + "learning_rate": 0.0008825099720487755, + "loss": 0.91274774, + "num_input_tokens_seen": 105921488, + "router_z_loss_mlp": 0.50195312, + "step": 1279, + "time_per_iteration": 2.630868434906006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149494, + "balance_loss_mlp": 1.13461673, + "epoch": 0.24624855713736052, + "flos": 1511772553728.0, + "grad_norm": 0.04555367127523109, + "language_loss": 0.7526114, + "learning_rate": 0.0008823092625099967, + "loss": 0.76410633, + "num_input_tokens_seen": 106146816, + "router_z_loss_mlp": 0.1484375, + "step": 1280, + "time_per_iteration": 4.843670129776001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118256, + "balance_loss_mlp": 1.10366488, + "epoch": 0.24644093882262408, + "flos": 1527608305152.0, + "grad_norm": 0.038204832859796624, + "language_loss": 0.77944112, + "learning_rate": 0.0008821084045455987, + "loss": 0.79062366, + "num_input_tokens_seen": 106361568, + "router_z_loss_mlp": 0.14550781, + "step": 1281, + "time_per_iteration": 4.784554481506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115452, + "balance_loss_mlp": 1.10547721, + "epoch": 0.24663332050788764, + "flos": 659118228480.0, + "grad_norm": 0.05852441511604794, + "language_loss": 0.89541078, + "learning_rate": 0.0008819073982335619, + "loss": 0.90695602, + "num_input_tokens_seen": 106435296, + "router_z_loss_mlp": 0.49047852, + "step": 1282, + "time_per_iteration": 2.8370161056518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141297, + "balance_loss_mlp": 1.09726083, + "epoch": 0.24682570219315123, + "flos": 541769098752.0, + "grad_norm": 0.07515840278086762, + "language_loss": 0.84908974, + "learning_rate": 0.0008817062436519235, + "loss": 0.86050272, + "num_input_tokens_seen": 106507184, + "router_z_loss_mlp": 0.44042969, + "step": 1283, + "time_per_iteration": 2.6532042026519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114078, + "balance_loss_mlp": 1.09164214, + "epoch": 0.24701808387841478, + "flos": 440695116288.0, + "grad_norm": 0.051214690731677004, + "language_loss": 0.9022612, + "learning_rate": 0.0008815049408787788, + "loss": 0.91366905, + "num_input_tokens_seen": 106571472, + "router_z_loss_mlp": 0.49072266, + "step": 1284, + "time_per_iteration": 2.577040195465088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145806, + "balance_loss_mlp": 1.09857535, + "epoch": 0.24721046556367834, + "flos": 468066926592.0, + "grad_norm": 0.06399849872592922, + "language_loss": 0.86388409, + "learning_rate": 0.0008813034899922805, + "loss": 0.87534213, + "num_input_tokens_seen": 106638368, + "router_z_loss_mlp": 0.47216797, + "step": 1285, + "time_per_iteration": 2.586411476135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153157, + "balance_loss_mlp": 1.10366094, + "epoch": 0.2474028472489419, + "flos": 504427585536.0, + "grad_norm": 0.05962621730359375, + "language_loss": 0.90523338, + "learning_rate": 0.0008811018910706387, + "loss": 0.91676497, + "num_input_tokens_seen": 106705312, + "router_z_loss_mlp": 0.49536133, + "step": 1286, + "time_per_iteration": 2.558340311050415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150564, + "balance_loss_mlp": 1.0996381, + "epoch": 0.24759522893420546, + "flos": 479956492800.0, + "grad_norm": 0.08171747444285254, + "language_loss": 0.82914776, + "learning_rate": 0.0008809001441921211, + "loss": 0.84065336, + "num_input_tokens_seen": 106778624, + "router_z_loss_mlp": 0.50976562, + "step": 1287, + "time_per_iteration": 2.7096829414367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134679, + "balance_loss_mlp": 1.08651865, + "epoch": 0.24778761061946902, + "flos": 533706776064.0, + "grad_norm": 0.061876473909820096, + "language_loss": 0.86037469, + "learning_rate": 0.0008806982494350528, + "loss": 0.87172151, + "num_input_tokens_seen": 106847744, + "router_z_loss_mlp": 0.48144531, + "step": 1288, + "time_per_iteration": 2.6826744079589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138715, + "balance_loss_mlp": 1.0885514, + "epoch": 0.24797999230473258, + "flos": 559798553088.0, + "grad_norm": 0.05818805427718153, + "language_loss": 0.90965348, + "learning_rate": 0.0008804962068778161, + "loss": 0.92104065, + "num_input_tokens_seen": 106927584, + "router_z_loss_mlp": 0.50195312, + "step": 1289, + "time_per_iteration": 2.9314775466918945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137271, + "balance_loss_mlp": 1.08872867, + "epoch": 0.24817237398999614, + "flos": 624225180672.0, + "grad_norm": 0.06661216201088474, + "language_loss": 0.81390089, + "learning_rate": 0.0008802940165988511, + "loss": 0.82527363, + "num_input_tokens_seen": 107006656, + "router_z_loss_mlp": 0.48510742, + "step": 1290, + "time_per_iteration": 2.8629136085510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113117, + "balance_loss_mlp": 1.08389127, + "epoch": 0.2483647556752597, + "flos": 612281286144.0, + "grad_norm": 0.06960392685137955, + "language_loss": 0.89268786, + "learning_rate": 0.000880091678676655, + "loss": 0.90399957, + "num_input_tokens_seen": 107084352, + "router_z_loss_mlp": 0.47265625, + "step": 1291, + "time_per_iteration": 2.8345038890838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136195, + "balance_loss_mlp": 1.08882165, + "epoch": 0.2485571373605233, + "flos": 583553092608.0, + "grad_norm": 0.058047960295431696, + "language_loss": 0.89150697, + "learning_rate": 0.0008798891931897821, + "loss": 0.90286887, + "num_input_tokens_seen": 107158368, + "router_z_loss_mlp": 0.47338867, + "step": 1292, + "time_per_iteration": 2.7299227714538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128008, + "balance_loss_mlp": 1.07949018, + "epoch": 0.24874951904578685, + "flos": 494749347840.0, + "grad_norm": 0.09954343743221296, + "language_loss": 0.84998739, + "learning_rate": 0.0008796865602168447, + "loss": 0.86126745, + "num_input_tokens_seen": 107224256, + "router_z_loss_mlp": 0.48535156, + "step": 1293, + "time_per_iteration": 2.5342278480529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127533, + "balance_loss_mlp": 1.08220935, + "epoch": 0.2489419007310504, + "flos": 456174789120.0, + "grad_norm": 0.05777797953149353, + "language_loss": 0.89527249, + "learning_rate": 0.0008794837798365115, + "loss": 0.90654784, + "num_input_tokens_seen": 107292720, + "router_z_loss_mlp": 0.45361328, + "step": 1294, + "time_per_iteration": 2.6889185905456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136807, + "balance_loss_mlp": 1.08886147, + "epoch": 0.24913428241631397, + "flos": 485471232000.0, + "grad_norm": 0.07754051928079464, + "language_loss": 0.89232659, + "learning_rate": 0.0008792808521275089, + "loss": 0.90369469, + "num_input_tokens_seen": 107368576, + "router_z_loss_mlp": 0.47924805, + "step": 1295, + "time_per_iteration": 2.7635927200317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136837, + "balance_loss_mlp": 1.09027398, + "epoch": 0.24932666410157753, + "flos": 518906580480.0, + "grad_norm": 0.09989296116771008, + "language_loss": 0.87984705, + "learning_rate": 0.0008790777771686206, + "loss": 0.89121538, + "num_input_tokens_seen": 107433856, + "router_z_loss_mlp": 0.46557617, + "step": 1296, + "time_per_iteration": 2.579235076904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124595, + "balance_loss_mlp": 1.07853234, + "epoch": 0.2495190457868411, + "flos": 472603382784.0, + "grad_norm": 0.08251132162328097, + "language_loss": 0.85680348, + "learning_rate": 0.0008788745550386872, + "loss": 0.86804938, + "num_input_tokens_seen": 107500944, + "router_z_loss_mlp": 0.46044922, + "step": 1297, + "time_per_iteration": 2.598031759262085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128617, + "balance_loss_mlp": 1.08152938, + "epoch": 0.24971142747210465, + "flos": 745886112768.0, + "grad_norm": 0.06717402893383145, + "language_loss": 0.80945367, + "learning_rate": 0.0008786711858166063, + "loss": 0.82073987, + "num_input_tokens_seen": 107580000, + "router_z_loss_mlp": 0.47070312, + "step": 1298, + "time_per_iteration": 2.9720141887664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133144, + "balance_loss_mlp": 1.08696246, + "epoch": 0.2499038091573682, + "flos": 749557711872.0, + "grad_norm": 0.058753985131359356, + "language_loss": 0.84356344, + "learning_rate": 0.0008784676695813332, + "loss": 0.85489488, + "num_input_tokens_seen": 107660384, + "router_z_loss_mlp": 0.46166992, + "step": 1299, + "time_per_iteration": 3.003113031387329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154452, + "balance_loss_mlp": 1.10700631, + "epoch": 0.2500961908426318, + "flos": 745060902912.0, + "grad_norm": 0.07081449776085671, + "language_loss": 0.85444576, + "learning_rate": 0.0008782640064118796, + "loss": 0.86599028, + "num_input_tokens_seen": 107736320, + "router_z_loss_mlp": 0.47436523, + "step": 1300, + "time_per_iteration": 2.8769848346710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166343, + "balance_loss_mlp": 1.14946294, + "epoch": 0.2502885725278953, + "flos": 1417424334336.0, + "grad_norm": 0.041859158942630086, + "language_loss": 0.7618475, + "learning_rate": 0.0008780601963873149, + "loss": 0.77351093, + "num_input_tokens_seen": 107972608, + "router_z_loss_mlp": 0.16894531, + "step": 1301, + "time_per_iteration": 4.951652526855469 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191692, + "balance_loss_mlp": 1.14701271, + "epoch": 0.2504809542131589, + "flos": 515215157760.0, + "grad_norm": 0.07273634964220443, + "language_loss": 0.8750245, + "learning_rate": 0.0008778562395867648, + "loss": 0.88694143, + "num_input_tokens_seen": 108043312, + "router_z_loss_mlp": 0.44677734, + "step": 1302, + "time_per_iteration": 2.604402542114258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181408, + "balance_loss_mlp": 1.13629961, + "epoch": 0.25067333589842244, + "flos": 525819921408.0, + "grad_norm": 0.07562070017846675, + "language_loss": 0.84288502, + "learning_rate": 0.0008776521360894127, + "loss": 0.85469913, + "num_input_tokens_seen": 108114144, + "router_z_loss_mlp": 0.45092773, + "step": 1303, + "time_per_iteration": 2.5878565311431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103671, + "balance_loss_mlp": 1.08784008, + "epoch": 0.25086571758368603, + "flos": 1473897295872.0, + "grad_norm": 0.0317480068151838, + "language_loss": 0.78962064, + "learning_rate": 0.0008774478859744984, + "loss": 0.80065739, + "num_input_tokens_seen": 108338720, + "router_z_loss_mlp": 0.15820312, + "step": 1304, + "time_per_iteration": 4.7717835903167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116688, + "balance_loss_mlp": 1.12220049, + "epoch": 0.2510580992689496, + "flos": 528382185984.0, + "grad_norm": 0.05690422496958516, + "language_loss": 0.90951985, + "learning_rate": 0.0008772434893213186, + "loss": 0.92118865, + "num_input_tokens_seen": 108405456, + "router_z_loss_mlp": 0.44702148, + "step": 1305, + "time_per_iteration": 2.604490280151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160948, + "balance_loss_mlp": 1.11405063, + "epoch": 0.25125048095421315, + "flos": 517446309888.0, + "grad_norm": 0.058263181320018995, + "language_loss": 0.85050523, + "learning_rate": 0.0008770389462092276, + "loss": 0.86211473, + "num_input_tokens_seen": 108474368, + "router_z_loss_mlp": 0.46875, + "step": 1306, + "time_per_iteration": 2.6470468044281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011567, + "balance_loss_mlp": 1.1099937, + "epoch": 0.25144286263947674, + "flos": 620462177280.0, + "grad_norm": 0.058464254330546805, + "language_loss": 0.87023067, + "learning_rate": 0.0008768342567176357, + "loss": 0.88179767, + "num_input_tokens_seen": 108548864, + "router_z_loss_mlp": 0.46704102, + "step": 1307, + "time_per_iteration": 2.8168630599975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155047, + "balance_loss_mlp": 1.10753012, + "epoch": 0.25163524432474027, + "flos": 503799865344.0, + "grad_norm": 0.05479935706331158, + "language_loss": 0.90999937, + "learning_rate": 0.0008766294209260107, + "loss": 0.9215498, + "num_input_tokens_seen": 108623072, + "router_z_loss_mlp": 0.4753418, + "step": 1308, + "time_per_iteration": 2.721531629562378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144469, + "balance_loss_mlp": 1.09704781, + "epoch": 0.25182762601000386, + "flos": 509072698368.0, + "grad_norm": 0.06755027454964987, + "language_loss": 0.91936618, + "learning_rate": 0.0008764244389138767, + "loss": 0.93081093, + "num_input_tokens_seen": 108690128, + "router_z_loss_mlp": 0.47436523, + "step": 1309, + "time_per_iteration": 2.574913263320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146846, + "balance_loss_mlp": 1.10061693, + "epoch": 0.2520200076952674, + "flos": 633896077824.0, + "grad_norm": 0.09614568206927013, + "language_loss": 0.82912982, + "learning_rate": 0.000876219310760815, + "loss": 0.84059829, + "num_input_tokens_seen": 108770272, + "router_z_loss_mlp": 0.46240234, + "step": 1310, + "time_per_iteration": 2.8861234188079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140262, + "balance_loss_mlp": 1.09419942, + "epoch": 0.252212389380531, + "flos": 494638119936.0, + "grad_norm": 0.07943381545238665, + "language_loss": 0.82026285, + "learning_rate": 0.0008760140365464631, + "loss": 0.83166546, + "num_input_tokens_seen": 108840592, + "router_z_loss_mlp": 0.46020508, + "step": 1311, + "time_per_iteration": 2.615981340408325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157686, + "balance_loss_mlp": 1.11212397, + "epoch": 0.2524047710657945, + "flos": 490544004096.0, + "grad_norm": 0.0923524312347507, + "language_loss": 0.8768574, + "learning_rate": 0.0008758086163505156, + "loss": 0.88843429, + "num_input_tokens_seen": 108910064, + "router_z_loss_mlp": 0.45532227, + "step": 1312, + "time_per_iteration": 2.6723434925079346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144164, + "balance_loss_mlp": 1.09872115, + "epoch": 0.2525971527510581, + "flos": 647431294464.0, + "grad_norm": 0.06443576206069311, + "language_loss": 0.90026277, + "learning_rate": 0.0008756030502527239, + "loss": 0.91170442, + "num_input_tokens_seen": 108986336, + "router_z_loss_mlp": 0.45458984, + "step": 1313, + "time_per_iteration": 2.841367721557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114771, + "balance_loss_mlp": 1.10291111, + "epoch": 0.2527895344363217, + "flos": 569266818048.0, + "grad_norm": 0.057466156915965357, + "language_loss": 0.90976274, + "learning_rate": 0.0008753973383328954, + "loss": 0.92123979, + "num_input_tokens_seen": 109059712, + "router_z_loss_mlp": 0.44824219, + "step": 1314, + "time_per_iteration": 2.7198092937469482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135642, + "balance_loss_mlp": 1.08912706, + "epoch": 0.2529819161215852, + "flos": 514048923648.0, + "grad_norm": 0.0651730634150067, + "language_loss": 0.84640622, + "learning_rate": 0.0008751914806708952, + "loss": 0.85776269, + "num_input_tokens_seen": 109127504, + "router_z_loss_mlp": 0.46508789, + "step": 1315, + "time_per_iteration": 2.619739532470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138249, + "balance_loss_mlp": 1.0955956, + "epoch": 0.2531742978068488, + "flos": 531253168128.0, + "grad_norm": 0.06535523514746128, + "language_loss": 0.82706141, + "learning_rate": 0.0008749854773466439, + "loss": 0.83844388, + "num_input_tokens_seen": 109198080, + "router_z_loss_mlp": 0.42700195, + "step": 1316, + "time_per_iteration": 2.6750850677490234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126734, + "balance_loss_mlp": 1.08594072, + "epoch": 0.25336667949211233, + "flos": 596638628352.0, + "grad_norm": 0.07438570972797282, + "language_loss": 0.85103095, + "learning_rate": 0.0008747793284401192, + "loss": 0.86229837, + "num_input_tokens_seen": 109268368, + "router_z_loss_mlp": 0.40771484, + "step": 1317, + "time_per_iteration": 2.667684316635132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127851, + "balance_loss_mlp": 1.08231306, + "epoch": 0.2535590611773759, + "flos": 602061963264.0, + "grad_norm": 0.06662830476911753, + "language_loss": 0.8637262, + "learning_rate": 0.0008745730340313551, + "loss": 0.87500465, + "num_input_tokens_seen": 109344112, + "router_z_loss_mlp": 0.45532227, + "step": 1318, + "time_per_iteration": 2.783167839050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126523, + "balance_loss_mlp": 1.08298802, + "epoch": 0.25375144286263945, + "flos": 495327508992.0, + "grad_norm": 0.06014849970215255, + "language_loss": 0.84828806, + "learning_rate": 0.0008743665942004422, + "loss": 0.85955328, + "num_input_tokens_seen": 109414112, + "router_z_loss_mlp": 0.43554688, + "step": 1319, + "time_per_iteration": 2.6454880237579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128022, + "balance_loss_mlp": 1.08334279, + "epoch": 0.25394382454790304, + "flos": 512470084608.0, + "grad_norm": 0.10116204644494126, + "language_loss": 0.93301231, + "learning_rate": 0.0008741600090275277, + "loss": 0.94429255, + "num_input_tokens_seen": 109484336, + "router_z_loss_mlp": 0.44702148, + "step": 1320, + "time_per_iteration": 2.565373182296753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112488, + "balance_loss_mlp": 1.07884121, + "epoch": 0.25413620623316663, + "flos": 959038589952.0, + "grad_norm": 0.06655436432492466, + "language_loss": 0.84446663, + "learning_rate": 0.0008739532785928151, + "loss": 0.85571539, + "num_input_tokens_seen": 109590128, + "router_z_loss_mlp": 0.45996094, + "step": 1321, + "time_per_iteration": 3.479727268218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080328, + "balance_loss_mlp": 1.06325758, + "epoch": 0.25432858791843016, + "flos": 1577283922944.0, + "grad_norm": 0.0281051137535917, + "language_loss": 0.74893582, + "learning_rate": 0.0008737464029765639, + "loss": 0.7597391, + "num_input_tokens_seen": 109816592, + "router_z_loss_mlp": 0.17089844, + "step": 1322, + "time_per_iteration": 4.7930076122283936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136178, + "balance_loss_mlp": 1.08921003, + "epoch": 0.25452096960369375, + "flos": 583802712576.0, + "grad_norm": 0.06285601142266005, + "language_loss": 0.83366752, + "learning_rate": 0.0008735393822590908, + "loss": 0.84502923, + "num_input_tokens_seen": 109890464, + "router_z_loss_mlp": 0.46923828, + "step": 1323, + "time_per_iteration": 2.672137498855591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145864, + "balance_loss_mlp": 1.10192394, + "epoch": 0.2547133512889573, + "flos": 508603193856.0, + "grad_norm": 0.05471127015298985, + "language_loss": 0.8775813, + "learning_rate": 0.0008733322165207681, + "loss": 0.88903993, + "num_input_tokens_seen": 109963408, + "router_z_loss_mlp": 0.43969727, + "step": 1324, + "time_per_iteration": 2.6422736644744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157775, + "balance_loss_mlp": 1.11292815, + "epoch": 0.25490573297422087, + "flos": 782619729408.0, + "grad_norm": 0.058409122955484685, + "language_loss": 0.83687508, + "learning_rate": 0.0008731249058420247, + "loss": 0.84845281, + "num_input_tokens_seen": 110048800, + "router_z_loss_mlp": 0.44824219, + "step": 1325, + "time_per_iteration": 3.02577805519104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165947, + "balance_loss_mlp": 1.11995602, + "epoch": 0.2550981146594844, + "flos": 509878084608.0, + "grad_norm": 0.0843662219595253, + "language_loss": 0.90814316, + "learning_rate": 0.0008729174503033459, + "loss": 0.91980267, + "num_input_tokens_seen": 110118096, + "router_z_loss_mlp": 0.45947266, + "step": 1326, + "time_per_iteration": 2.700956344604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160817, + "balance_loss_mlp": 1.11418188, + "epoch": 0.255290496344748, + "flos": 676673409024.0, + "grad_norm": 0.07395752020353057, + "language_loss": 0.83274329, + "learning_rate": 0.0008727098499852728, + "loss": 0.84435147, + "num_input_tokens_seen": 110190160, + "router_z_loss_mlp": 0.46630859, + "step": 1327, + "time_per_iteration": 2.8289363384246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138805, + "balance_loss_mlp": 1.0946734, + "epoch": 0.2554828780300115, + "flos": 537815572992.0, + "grad_norm": 0.05433597882612883, + "language_loss": 0.90389377, + "learning_rate": 0.0008725021049684034, + "loss": 0.91528177, + "num_input_tokens_seen": 110268000, + "router_z_loss_mlp": 0.44165039, + "step": 1328, + "time_per_iteration": 2.766871452331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134733, + "balance_loss_mlp": 1.09057808, + "epoch": 0.2556752597152751, + "flos": 824186409984.0, + "grad_norm": 0.04999939134312536, + "language_loss": 0.83732843, + "learning_rate": 0.000872294215333391, + "loss": 0.84867573, + "num_input_tokens_seen": 110354816, + "router_z_loss_mlp": 0.44165039, + "step": 1329, + "time_per_iteration": 3.181687116622925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133543, + "balance_loss_mlp": 1.08941174, + "epoch": 0.2558676414005387, + "flos": 570791328768.0, + "grad_norm": 0.053270875218317436, + "language_loss": 0.83338815, + "learning_rate": 0.0008720861811609457, + "loss": 0.84472358, + "num_input_tokens_seen": 110427968, + "router_z_loss_mlp": 0.44140625, + "step": 1330, + "time_per_iteration": 2.753095865249634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139869, + "balance_loss_mlp": 1.09282851, + "epoch": 0.2560600230858022, + "flos": 486684453888.0, + "grad_norm": 0.0744958299593676, + "language_loss": 0.83801699, + "learning_rate": 0.0008718780025318338, + "loss": 0.84941566, + "num_input_tokens_seen": 110501184, + "router_z_loss_mlp": 0.4699707, + "step": 1331, + "time_per_iteration": 2.74076771736145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141571, + "balance_loss_mlp": 1.09913218, + "epoch": 0.2562524047710658, + "flos": 513122397696.0, + "grad_norm": 0.06658506014654758, + "language_loss": 0.84681445, + "learning_rate": 0.0008716696795268771, + "loss": 0.85823017, + "num_input_tokens_seen": 110573008, + "router_z_loss_mlp": 0.42456055, + "step": 1332, + "time_per_iteration": 2.6771953105926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141914, + "balance_loss_mlp": 1.09718704, + "epoch": 0.25644478645632934, + "flos": 634820032512.0, + "grad_norm": 0.06458865940403113, + "language_loss": 0.86108088, + "learning_rate": 0.0008714612122269538, + "loss": 0.87250006, + "num_input_tokens_seen": 110646704, + "router_z_loss_mlp": 0.44750977, + "step": 1333, + "time_per_iteration": 2.872405767440796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145867, + "balance_loss_mlp": 1.09944701, + "epoch": 0.25663716814159293, + "flos": 436591088640.0, + "grad_norm": 0.06078246423813374, + "language_loss": 0.89285004, + "learning_rate": 0.0008712526007129982, + "loss": 0.90430868, + "num_input_tokens_seen": 110712208, + "router_z_loss_mlp": 0.46411133, + "step": 1334, + "time_per_iteration": 2.575467586517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148778, + "balance_loss_mlp": 1.10517156, + "epoch": 0.25682954982685646, + "flos": 498161415168.0, + "grad_norm": 0.06822349657501799, + "language_loss": 0.91275418, + "learning_rate": 0.0008710438450660003, + "loss": 0.92424202, + "num_input_tokens_seen": 110783936, + "router_z_loss_mlp": 0.43603516, + "step": 1335, + "time_per_iteration": 2.6461987495422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149209, + "balance_loss_mlp": 1.10157323, + "epoch": 0.25702193151212005, + "flos": 457701871104.0, + "grad_norm": 0.08158488021096956, + "language_loss": 0.88278055, + "learning_rate": 0.0008708349453670064, + "loss": 0.89427269, + "num_input_tokens_seen": 110848560, + "router_z_loss_mlp": 0.47583008, + "step": 1336, + "time_per_iteration": 2.5001657009124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128588, + "balance_loss_mlp": 1.08297849, + "epoch": 0.2572143131973836, + "flos": 598281707520.0, + "grad_norm": 0.0603403973753485, + "language_loss": 0.91654134, + "learning_rate": 0.0008706259016971185, + "loss": 0.92782724, + "num_input_tokens_seen": 110922672, + "router_z_loss_mlp": 0.45629883, + "step": 1337, + "time_per_iteration": 2.817657947540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127771, + "balance_loss_mlp": 1.07865644, + "epoch": 0.25740669488264717, + "flos": 698308024320.0, + "grad_norm": 0.08421665296665147, + "language_loss": 0.83723027, + "learning_rate": 0.0008704167141374944, + "loss": 0.848508, + "num_input_tokens_seen": 110995456, + "router_z_loss_mlp": 0.49145508, + "step": 1338, + "time_per_iteration": 2.808487892150879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128146, + "balance_loss_mlp": 1.08003271, + "epoch": 0.25759907656791076, + "flos": 502379241984.0, + "grad_norm": 0.05813050369368248, + "language_loss": 0.88781357, + "learning_rate": 0.0008702073827693482, + "loss": 0.89909494, + "num_input_tokens_seen": 111069568, + "router_z_loss_mlp": 0.48144531, + "step": 1339, + "time_per_iteration": 2.687836170196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131918, + "balance_loss_mlp": 1.08711886, + "epoch": 0.2577914582531743, + "flos": 773880500736.0, + "grad_norm": 0.05714278292432699, + "language_loss": 0.89388514, + "learning_rate": 0.0008699979076739494, + "loss": 0.9052043, + "num_input_tokens_seen": 111142608, + "router_z_loss_mlp": 0.44799805, + "step": 1340, + "time_per_iteration": 2.9907524585723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157888, + "balance_loss_mlp": 1.11089551, + "epoch": 0.2579838399384379, + "flos": 459666150912.0, + "grad_norm": 0.06321899043923618, + "language_loss": 0.8949765, + "learning_rate": 0.0008697882889326234, + "loss": 0.90655541, + "num_input_tokens_seen": 111206336, + "router_z_loss_mlp": 0.4699707, + "step": 1341, + "time_per_iteration": 2.5261731147766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182653, + "balance_loss_mlp": 1.13513625, + "epoch": 0.2581762216237014, + "flos": 569185325568.0, + "grad_norm": 0.06545350512623192, + "language_loss": 0.87013066, + "learning_rate": 0.0008695785266267515, + "loss": 0.88195717, + "num_input_tokens_seen": 111276736, + "router_z_loss_mlp": 0.4753418, + "step": 1342, + "time_per_iteration": 2.719949722290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194656, + "balance_loss_mlp": 1.14585173, + "epoch": 0.258368603308965, + "flos": 604201711104.0, + "grad_norm": 0.07227104516109029, + "language_loss": 0.8379634, + "learning_rate": 0.0008693686208377704, + "loss": 0.84991002, + "num_input_tokens_seen": 111353856, + "router_z_loss_mlp": 0.48828125, + "step": 1343, + "time_per_iteration": 2.789046049118042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011909, + "balance_loss_mlp": 1.14572012, + "epoch": 0.2585609849942285, + "flos": 491460618240.0, + "grad_norm": 0.08291144049116697, + "language_loss": 0.89388204, + "learning_rate": 0.0008691585716471733, + "loss": 0.90579104, + "num_input_tokens_seen": 111424960, + "router_z_loss_mlp": 0.45214844, + "step": 1344, + "time_per_iteration": 2.63281512260437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182868, + "balance_loss_mlp": 1.1348505, + "epoch": 0.2587533666794921, + "flos": 640755090432.0, + "grad_norm": 0.05462335243620436, + "language_loss": 0.86349607, + "learning_rate": 0.0008689483791365079, + "loss": 0.87532479, + "num_input_tokens_seen": 111505248, + "router_z_loss_mlp": 0.48022461, + "step": 1345, + "time_per_iteration": 2.8293464183807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165648, + "balance_loss_mlp": 1.11879873, + "epoch": 0.2589457483647557, + "flos": 576849724416.0, + "grad_norm": 0.060641418043912716, + "language_loss": 0.89744675, + "learning_rate": 0.0008687380433873786, + "loss": 0.90910327, + "num_input_tokens_seen": 111581936, + "router_z_loss_mlp": 0.46875, + "step": 1346, + "time_per_iteration": 2.757361650466919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150314, + "balance_loss_mlp": 1.100389, + "epoch": 0.25913813005001923, + "flos": 535424007168.0, + "grad_norm": 0.0738804898683007, + "language_loss": 0.83070856, + "learning_rate": 0.0008685275644814448, + "loss": 0.84221172, + "num_input_tokens_seen": 111651456, + "router_z_loss_mlp": 0.49926758, + "step": 1347, + "time_per_iteration": 2.716006278991699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147842, + "balance_loss_mlp": 1.10087395, + "epoch": 0.2593305117352828, + "flos": 721039491072.0, + "grad_norm": 0.07544817120788133, + "language_loss": 0.85244781, + "learning_rate": 0.0008683169425004216, + "loss": 0.86392623, + "num_input_tokens_seen": 111731712, + "router_z_loss_mlp": 0.46972656, + "step": 1348, + "time_per_iteration": 2.900754451751709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114914, + "balance_loss_mlp": 1.09842825, + "epoch": 0.25952289342054635, + "flos": 710096274432.0, + "grad_norm": 0.08404854247051008, + "language_loss": 0.83688962, + "learning_rate": 0.0008681061775260799, + "loss": 0.84838104, + "num_input_tokens_seen": 111800752, + "router_z_loss_mlp": 0.50708008, + "step": 1349, + "time_per_iteration": 2.8356235027313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140271, + "balance_loss_mlp": 1.09356534, + "epoch": 0.25971527510580994, + "flos": 455920399872.0, + "grad_norm": 0.08196022848482862, + "language_loss": 0.92983842, + "learning_rate": 0.0008678952696402458, + "loss": 0.94124115, + "num_input_tokens_seen": 111866752, + "router_z_loss_mlp": 0.46704102, + "step": 1350, + "time_per_iteration": 2.5051889419555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132188, + "balance_loss_mlp": 1.0865308, + "epoch": 0.25990765679107347, + "flos": 612528334848.0, + "grad_norm": 0.052642437263987304, + "language_loss": 0.86759204, + "learning_rate": 0.000867684218924801, + "loss": 0.87891388, + "num_input_tokens_seen": 111951328, + "router_z_loss_mlp": 0.45629883, + "step": 1351, + "time_per_iteration": 2.8635144233703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089623, + "balance_loss_mlp": 1.0725522, + "epoch": 0.26010003847633706, + "flos": 1537963075584.0, + "grad_norm": 0.04013302579778462, + "language_loss": 0.78947091, + "learning_rate": 0.0008674730254616827, + "loss": 0.80036712, + "num_input_tokens_seen": 112182272, + "router_z_loss_mlp": 0.17089844, + "step": 1352, + "time_per_iteration": 4.89817476272583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121624, + "balance_loss_mlp": 1.07587171, + "epoch": 0.2602924201616006, + "flos": 716265897984.0, + "grad_norm": 0.055845692832442596, + "language_loss": 0.85694808, + "learning_rate": 0.0008672616893328834, + "loss": 0.8681643, + "num_input_tokens_seen": 112261760, + "router_z_loss_mlp": 0.45751953, + "step": 1353, + "time_per_iteration": 2.9335103034973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123767, + "balance_loss_mlp": 1.07877684, + "epoch": 0.2604848018468642, + "flos": 643529899008.0, + "grad_norm": 0.07010977425409264, + "language_loss": 0.9082427, + "learning_rate": 0.0008670502106204512, + "loss": 0.91948032, + "num_input_tokens_seen": 112339136, + "router_z_loss_mlp": 0.44970703, + "step": 1354, + "time_per_iteration": 2.8469178676605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138616, + "balance_loss_mlp": 1.08840501, + "epoch": 0.26067718353212777, + "flos": 517033704960.0, + "grad_norm": 0.056353527093492256, + "language_loss": 0.82360619, + "learning_rate": 0.0008668385894064892, + "loss": 0.83499235, + "num_input_tokens_seen": 112409872, + "router_z_loss_mlp": 0.50195312, + "step": 1355, + "time_per_iteration": 2.672883987426758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149756, + "balance_loss_mlp": 1.10321617, + "epoch": 0.2608695652173913, + "flos": 822733479936.0, + "grad_norm": 0.05383030346289838, + "language_loss": 0.89593899, + "learning_rate": 0.0008666268257731562, + "loss": 0.90743661, + "num_input_tokens_seen": 112495616, + "router_z_loss_mlp": 0.46557617, + "step": 1356, + "time_per_iteration": 3.1050939559936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169178, + "balance_loss_mlp": 1.12127948, + "epoch": 0.2610619469026549, + "flos": 1007850097152.0, + "grad_norm": 0.05849819020383372, + "language_loss": 0.85968256, + "learning_rate": 0.0008664149198026662, + "loss": 0.87137431, + "num_input_tokens_seen": 112575168, + "router_z_loss_mlp": 0.47900391, + "step": 1357, + "time_per_iteration": 3.226966619491577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156465, + "balance_loss_mlp": 1.10932934, + "epoch": 0.2612543285879184, + "flos": 536782961664.0, + "grad_norm": 0.07293583935871151, + "language_loss": 0.89518476, + "learning_rate": 0.0008662028715772883, + "loss": 0.90674949, + "num_input_tokens_seen": 112648480, + "router_z_loss_mlp": 0.47143555, + "step": 1358, + "time_per_iteration": 2.5949370861053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163078, + "balance_loss_mlp": 1.11718237, + "epoch": 0.261446710273182, + "flos": 519420501504.0, + "grad_norm": 0.05890556701012809, + "language_loss": 0.86217821, + "learning_rate": 0.0008659906811793467, + "loss": 0.87380904, + "num_input_tokens_seen": 112719856, + "router_z_loss_mlp": 0.45898438, + "step": 1359, + "time_per_iteration": 2.651193857192993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151481, + "balance_loss_mlp": 1.10699224, + "epoch": 0.26163909195844554, + "flos": 583259056128.0, + "grad_norm": 0.06298146111957026, + "language_loss": 0.90418088, + "learning_rate": 0.0008657783486912215, + "loss": 0.91569573, + "num_input_tokens_seen": 112795088, + "router_z_loss_mlp": 0.44482422, + "step": 1360, + "time_per_iteration": 2.723550319671631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156338, + "balance_loss_mlp": 1.11022782, + "epoch": 0.2618314736437091, + "flos": 958762179072.0, + "grad_norm": 0.055299708084911615, + "language_loss": 0.90110713, + "learning_rate": 0.0008655658741953472, + "loss": 0.91267049, + "num_input_tokens_seen": 112879888, + "router_z_loss_mlp": 0.4609375, + "step": 1361, + "time_per_iteration": 3.216830015182495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139946, + "balance_loss_mlp": 1.09564757, + "epoch": 0.26202385532897265, + "flos": 574803952128.0, + "grad_norm": 0.04868556149108388, + "language_loss": 0.89168048, + "learning_rate": 0.0008653532577742136, + "loss": 0.90307987, + "num_input_tokens_seen": 112952208, + "router_z_loss_mlp": 0.44311523, + "step": 1362, + "time_per_iteration": 2.718886375427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143457, + "balance_loss_mlp": 1.0986346, + "epoch": 0.26221623701423624, + "flos": 445471280640.0, + "grad_norm": 0.058057923999792295, + "language_loss": 0.87558335, + "learning_rate": 0.0008651404995103659, + "loss": 0.88701797, + "num_input_tokens_seen": 113017472, + "router_z_loss_mlp": 0.44824219, + "step": 1363, + "time_per_iteration": 2.594294309616089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137373, + "balance_loss_mlp": 1.09338474, + "epoch": 0.26240861869949983, + "flos": 535718043648.0, + "grad_norm": 0.06330728330165165, + "language_loss": 0.87334514, + "learning_rate": 0.0008649275994864041, + "loss": 0.88471884, + "num_input_tokens_seen": 113090000, + "router_z_loss_mlp": 0.43994141, + "step": 1364, + "time_per_iteration": 2.707449197769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144999, + "balance_loss_mlp": 1.09879303, + "epoch": 0.26260100038476336, + "flos": 565249052160.0, + "grad_norm": 0.05276541609050752, + "language_loss": 0.84391934, + "learning_rate": 0.0008647145577849834, + "loss": 0.85536933, + "num_input_tokens_seen": 113169424, + "router_z_loss_mlp": 0.46191406, + "step": 1365, + "time_per_iteration": 2.8216350078582764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131283, + "balance_loss_mlp": 1.08560157, + "epoch": 0.26279338207002695, + "flos": 613059508224.0, + "grad_norm": 0.05376997595185902, + "language_loss": 0.83317888, + "learning_rate": 0.0008645013744888139, + "loss": 0.84449172, + "num_input_tokens_seen": 113256752, + "router_z_loss_mlp": 0.45678711, + "step": 1366, + "time_per_iteration": 2.866891622543335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149616, + "balance_loss_mlp": 1.10536587, + "epoch": 0.2629857637552905, + "flos": 522832568832.0, + "grad_norm": 0.06316724717597957, + "language_loss": 0.87992281, + "learning_rate": 0.0008642880496806607, + "loss": 0.89141893, + "num_input_tokens_seen": 113330512, + "router_z_loss_mlp": 0.44287109, + "step": 1367, + "time_per_iteration": 2.7763173580169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142909, + "balance_loss_mlp": 1.09772861, + "epoch": 0.26317814544055407, + "flos": 534549238272.0, + "grad_norm": 0.05877759558608074, + "language_loss": 0.84959197, + "learning_rate": 0.0008640745834433437, + "loss": 0.86102104, + "num_input_tokens_seen": 113409088, + "router_z_loss_mlp": 0.4519043, + "step": 1368, + "time_per_iteration": 2.738328218460083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134336, + "balance_loss_mlp": 1.09018087, + "epoch": 0.2633705271258176, + "flos": 555543650304.0, + "grad_norm": 0.05935956886320276, + "language_loss": 0.87054664, + "learning_rate": 0.000863860975859738, + "loss": 0.88189, + "num_input_tokens_seen": 113486624, + "router_z_loss_mlp": 0.44165039, + "step": 1369, + "time_per_iteration": 2.9206831455230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131372, + "balance_loss_mlp": 1.0855242, + "epoch": 0.2635629088110812, + "flos": 552401026560.0, + "grad_norm": 0.06691392922801855, + "language_loss": 0.88684422, + "learning_rate": 0.0008636472270127733, + "loss": 0.89815795, + "num_input_tokens_seen": 113555776, + "router_z_loss_mlp": 0.45825195, + "step": 1370, + "time_per_iteration": 2.6078739166259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116186, + "balance_loss_mlp": 1.07021928, + "epoch": 0.2637552904963448, + "flos": 455984640000.0, + "grad_norm": 0.06515524250359679, + "language_loss": 0.90367895, + "learning_rate": 0.0008634333369854345, + "loss": 0.91484082, + "num_input_tokens_seen": 113624208, + "router_z_loss_mlp": 0.45947266, + "step": 1371, + "time_per_iteration": 2.6001384258270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110327, + "balance_loss_mlp": 1.0667206, + "epoch": 0.2639476721816083, + "flos": 613128890880.0, + "grad_norm": 0.056061894150206536, + "language_loss": 0.87892628, + "learning_rate": 0.0008632193058607608, + "loss": 0.89002955, + "num_input_tokens_seen": 113698544, + "router_z_loss_mlp": 0.43554688, + "step": 1372, + "time_per_iteration": 2.711435317993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113538, + "balance_loss_mlp": 1.06628299, + "epoch": 0.2641400538668719, + "flos": 571920486912.0, + "grad_norm": 0.060513983317086996, + "language_loss": 0.81023312, + "learning_rate": 0.0008630051337218466, + "loss": 0.82136846, + "num_input_tokens_seen": 113769024, + "router_z_loss_mlp": 0.47314453, + "step": 1373, + "time_per_iteration": 2.656416893005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110026, + "balance_loss_mlp": 1.0668484, + "epoch": 0.2643324355521354, + "flos": 582251037696.0, + "grad_norm": 0.0689512550651149, + "language_loss": 0.82808203, + "learning_rate": 0.0008627908206518409, + "loss": 0.83918226, + "num_input_tokens_seen": 113836320, + "router_z_loss_mlp": 0.43188477, + "step": 1374, + "time_per_iteration": 2.673738956451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039854, + "balance_loss_mlp": 1.02716982, + "epoch": 0.264524817237399, + "flos": 1544678926848.0, + "grad_norm": 0.01820003864645097, + "language_loss": 0.75151253, + "learning_rate": 0.0008625763667339472, + "loss": 0.76191109, + "num_input_tokens_seen": 114065040, + "router_z_loss_mlp": 0.12695312, + "step": 1375, + "time_per_iteration": 5.317140817642212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115308, + "balance_loss_mlp": 1.07272696, + "epoch": 0.26471719892266254, + "flos": 518034382848.0, + "grad_norm": 0.062338636090573274, + "language_loss": 0.91769958, + "learning_rate": 0.0008623617720514241, + "loss": 0.92885268, + "num_input_tokens_seen": 114133488, + "router_z_loss_mlp": 0.42578125, + "step": 1376, + "time_per_iteration": 2.666618585586548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117829, + "balance_loss_mlp": 1.07255304, + "epoch": 0.26490958060792613, + "flos": 517189349376.0, + "grad_norm": 0.08321054400070194, + "language_loss": 0.85169828, + "learning_rate": 0.0008621470366875848, + "loss": 0.86287659, + "num_input_tokens_seen": 114200704, + "router_z_loss_mlp": 0.45288086, + "step": 1377, + "time_per_iteration": 2.5939900875091553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011137, + "balance_loss_mlp": 1.0724293, + "epoch": 0.26510196229318966, + "flos": 596574388224.0, + "grad_norm": 0.0756812485553519, + "language_loss": 0.88528687, + "learning_rate": 0.0008619321607257966, + "loss": 0.89642382, + "num_input_tokens_seen": 114272160, + "router_z_loss_mlp": 0.41259766, + "step": 1378, + "time_per_iteration": 2.675719976425171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112322, + "balance_loss_mlp": 1.08109117, + "epoch": 0.26529434397845325, + "flos": 685800649728.0, + "grad_norm": 0.05967522341676015, + "language_loss": 0.8244732, + "learning_rate": 0.000861717144249482, + "loss": 0.8357054, + "num_input_tokens_seen": 114347904, + "router_z_loss_mlp": 0.42138672, + "step": 1379, + "time_per_iteration": 2.8289949893951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132354, + "balance_loss_mlp": 1.09170318, + "epoch": 0.26548672566371684, + "flos": 424353157632.0, + "grad_norm": 0.06486885922060631, + "language_loss": 0.90334523, + "learning_rate": 0.0008615019873421175, + "loss": 0.91466868, + "num_input_tokens_seen": 114409952, + "router_z_loss_mlp": 0.40649414, + "step": 1380, + "time_per_iteration": 2.4665510654449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141805, + "balance_loss_mlp": 1.09798408, + "epoch": 0.26567910734898037, + "flos": 489864526848.0, + "grad_norm": 0.06471812563896691, + "language_loss": 0.86262017, + "learning_rate": 0.0008612866900872349, + "loss": 0.87403822, + "num_input_tokens_seen": 114474832, + "router_z_loss_mlp": 0.43823242, + "step": 1381, + "time_per_iteration": 2.553489923477173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140972, + "balance_loss_mlp": 1.10017824, + "epoch": 0.26587148903424396, + "flos": 534203444736.0, + "grad_norm": 0.07006288293307902, + "language_loss": 0.88817614, + "learning_rate": 0.0008610712525684197, + "loss": 0.89958596, + "num_input_tokens_seen": 114545152, + "router_z_loss_mlp": 0.40771484, + "step": 1382, + "time_per_iteration": 2.623844861984253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156525, + "balance_loss_mlp": 1.11341906, + "epoch": 0.2660638707195075, + "flos": 1017464094720.0, + "grad_norm": 0.06690376769295572, + "language_loss": 0.85084939, + "learning_rate": 0.0008608556748693121, + "loss": 0.8624146, + "num_input_tokens_seen": 114626512, + "router_z_loss_mlp": 0.43115234, + "step": 1383, + "time_per_iteration": 3.248947858810425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149603, + "balance_loss_mlp": 1.10549557, + "epoch": 0.2662562524047711, + "flos": 523981550592.0, + "grad_norm": 0.05893966497122096, + "language_loss": 0.86648834, + "learning_rate": 0.000860639957073607, + "loss": 0.8779844, + "num_input_tokens_seen": 114701008, + "router_z_loss_mlp": 0.44116211, + "step": 1384, + "time_per_iteration": 2.6954376697540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161137, + "balance_loss_mlp": 1.11838901, + "epoch": 0.2664486340900346, + "flos": 552381202944.0, + "grad_norm": 0.05777577847879513, + "language_loss": 0.88325369, + "learning_rate": 0.0008604240992650534, + "loss": 0.8948651, + "num_input_tokens_seen": 114771984, + "router_z_loss_mlp": 0.42749023, + "step": 1385, + "time_per_iteration": 2.6810553073883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116884, + "balance_loss_mlp": 1.12613928, + "epoch": 0.2666410157752982, + "flos": 470157115392.0, + "grad_norm": 0.1266990207417539, + "language_loss": 0.89650941, + "learning_rate": 0.0008602081015274545, + "loss": 0.90819776, + "num_input_tokens_seen": 114844800, + "router_z_loss_mlp": 0.42724609, + "step": 1386, + "time_per_iteration": 2.7079007625579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169207, + "balance_loss_mlp": 1.12602973, + "epoch": 0.2668333974605617, + "flos": 569919131136.0, + "grad_norm": 0.05666517988787923, + "language_loss": 0.83684492, + "learning_rate": 0.0008599919639446684, + "loss": 0.84853697, + "num_input_tokens_seen": 114918544, + "router_z_loss_mlp": 0.43139648, + "step": 1387, + "time_per_iteration": 2.67275333404541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184027, + "balance_loss_mlp": 1.13755894, + "epoch": 0.2670257791458253, + "flos": 398982703104.0, + "grad_norm": 0.06873806966805297, + "language_loss": 0.80686462, + "learning_rate": 0.000859775686600607, + "loss": 0.81870484, + "num_input_tokens_seen": 114984272, + "router_z_loss_mlp": 0.46459961, + "step": 1388, + "time_per_iteration": 2.568384885787964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192065, + "balance_loss_mlp": 1.14676547, + "epoch": 0.2672181608310889, + "flos": 515847647232.0, + "grad_norm": 0.07413400256287127, + "language_loss": 0.85524642, + "learning_rate": 0.0008595592695792367, + "loss": 0.86716712, + "num_input_tokens_seen": 115054800, + "router_z_loss_mlp": 0.453125, + "step": 1389, + "time_per_iteration": 2.6748523712158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182907, + "balance_loss_mlp": 1.13884759, + "epoch": 0.26741054251635243, + "flos": 507521023488.0, + "grad_norm": 0.06676524761439688, + "language_loss": 0.9117986, + "learning_rate": 0.0008593427129645778, + "loss": 0.92362767, + "num_input_tokens_seen": 115120928, + "router_z_loss_mlp": 0.44042969, + "step": 1390, + "time_per_iteration": 2.5506954193115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186114, + "balance_loss_mlp": 1.14205468, + "epoch": 0.267602924201616, + "flos": 576647092224.0, + "grad_norm": 0.056989477345309104, + "language_loss": 0.85532665, + "learning_rate": 0.0008591260168407052, + "loss": 0.86718786, + "num_input_tokens_seen": 115196688, + "router_z_loss_mlp": 0.44067383, + "step": 1391, + "time_per_iteration": 2.759000778198242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181583, + "balance_loss_mlp": 1.13714194, + "epoch": 0.26779530588687955, + "flos": 523984121856.0, + "grad_norm": 0.12230490659722075, + "language_loss": 0.83154678, + "learning_rate": 0.0008589091812917479, + "loss": 0.84336257, + "num_input_tokens_seen": 115264912, + "router_z_loss_mlp": 0.4440918, + "step": 1392, + "time_per_iteration": 2.6213910579681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183464, + "balance_loss_mlp": 1.14030981, + "epoch": 0.26798768757214314, + "flos": 556771926528.0, + "grad_norm": 0.07403824045185783, + "language_loss": 0.8547672, + "learning_rate": 0.0008586922064018887, + "loss": 0.86660182, + "num_input_tokens_seen": 115334672, + "router_z_loss_mlp": 0.43139648, + "step": 1393, + "time_per_iteration": 2.6706490516662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170846, + "balance_loss_mlp": 1.12375855, + "epoch": 0.2681800692574067, + "flos": 930614717952.0, + "grad_norm": 0.06891205333434622, + "language_loss": 0.89827204, + "learning_rate": 0.0008584750922553651, + "loss": 0.90998048, + "num_input_tokens_seen": 115420032, + "router_z_loss_mlp": 0.47021484, + "step": 1394, + "time_per_iteration": 3.1465976238250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164798, + "balance_loss_mlp": 1.1222403, + "epoch": 0.26837245094267026, + "flos": 701080261632.0, + "grad_norm": 0.06253124916771012, + "language_loss": 0.84102368, + "learning_rate": 0.0008582578389364677, + "loss": 0.85267168, + "num_input_tokens_seen": 115492576, + "router_z_loss_mlp": 0.42529297, + "step": 1395, + "time_per_iteration": 2.853278875350952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170721, + "balance_loss_mlp": 1.12573135, + "epoch": 0.26856483262793385, + "flos": 593191683072.0, + "grad_norm": 0.0656545534576685, + "language_loss": 0.92268932, + "learning_rate": 0.0008580404465295422, + "loss": 0.93439656, + "num_input_tokens_seen": 115568368, + "router_z_loss_mlp": 0.44970703, + "step": 1396, + "time_per_iteration": 2.773932695388794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152323, + "balance_loss_mlp": 1.10826349, + "epoch": 0.2687572143131974, + "flos": 714271882752.0, + "grad_norm": 0.07972324646927738, + "language_loss": 0.88789833, + "learning_rate": 0.0008578229151189876, + "loss": 0.89942157, + "num_input_tokens_seen": 115651536, + "router_z_loss_mlp": 0.44067383, + "step": 1397, + "time_per_iteration": 2.934276819229126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144571, + "balance_loss_mlp": 1.10151267, + "epoch": 0.26894959599846097, + "flos": 467718561792.0, + "grad_norm": 0.10010461149900847, + "language_loss": 0.8178823, + "learning_rate": 0.0008576052447892573, + "loss": 0.82932794, + "num_input_tokens_seen": 115715696, + "router_z_loss_mlp": 0.43115234, + "step": 1398, + "time_per_iteration": 2.5337071418762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131122, + "balance_loss_mlp": 1.08768189, + "epoch": 0.2691419776837245, + "flos": 468701987328.0, + "grad_norm": 0.07718983812215899, + "language_loss": 0.86768365, + "learning_rate": 0.000857387435624858, + "loss": 0.87899494, + "num_input_tokens_seen": 115780928, + "router_z_loss_mlp": 0.43457031, + "step": 1399, + "time_per_iteration": 2.5189273357391357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127749, + "balance_loss_mlp": 1.08404672, + "epoch": 0.2693343593689881, + "flos": 937651396608.0, + "grad_norm": 0.0707561541840249, + "language_loss": 0.88852745, + "learning_rate": 0.0008571694877103513, + "loss": 0.89980495, + "num_input_tokens_seen": 115874432, + "router_z_loss_mlp": 0.43701172, + "step": 1400, + "time_per_iteration": 3.287325859069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126684, + "balance_loss_mlp": 1.08372128, + "epoch": 0.2695267410542516, + "flos": 577600782336.0, + "grad_norm": 0.08476375879770352, + "language_loss": 0.88499445, + "learning_rate": 0.0008569514011303515, + "loss": 0.89626133, + "num_input_tokens_seen": 115956608, + "router_z_loss_mlp": 0.4296875, + "step": 1401, + "time_per_iteration": 2.849506378173828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120044, + "balance_loss_mlp": 1.07770109, + "epoch": 0.2697191227395152, + "flos": 556823683584.0, + "grad_norm": 0.12418270059874827, + "language_loss": 0.88531977, + "learning_rate": 0.0008567331759695277, + "loss": 0.89652026, + "num_input_tokens_seen": 116031728, + "router_z_loss_mlp": 0.42358398, + "step": 1402, + "time_per_iteration": 2.7033023834228516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119932, + "balance_loss_mlp": 1.07584798, + "epoch": 0.26991150442477874, + "flos": 529281547776.0, + "grad_norm": 0.09855769315853927, + "language_loss": 0.86756563, + "learning_rate": 0.0008565148123126023, + "loss": 0.87876499, + "num_input_tokens_seen": 116104288, + "router_z_loss_mlp": 0.44091797, + "step": 1403, + "time_per_iteration": 2.645425319671631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119876, + "balance_loss_mlp": 1.07769978, + "epoch": 0.2701038861100423, + "flos": 532006797312.0, + "grad_norm": 0.15226973878739974, + "language_loss": 0.86578166, + "learning_rate": 0.0008562963102443516, + "loss": 0.87698042, + "num_input_tokens_seen": 116177920, + "router_z_loss_mlp": 0.421875, + "step": 1404, + "time_per_iteration": 2.6965179443359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130222, + "balance_loss_mlp": 1.08668637, + "epoch": 0.2702962677953059, + "flos": 735227020800.0, + "grad_norm": 0.09156828725831004, + "language_loss": 0.85926664, + "learning_rate": 0.0008560776698496056, + "loss": 0.87056887, + "num_input_tokens_seen": 116251680, + "router_z_loss_mlp": 0.43530273, + "step": 1405, + "time_per_iteration": 2.868159532546997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141969, + "balance_loss_mlp": 1.09707534, + "epoch": 0.27048864948056944, + "flos": 574761733632.0, + "grad_norm": 0.07226677638641436, + "language_loss": 0.86433703, + "learning_rate": 0.0008558588912132481, + "loss": 0.87575674, + "num_input_tokens_seen": 116327664, + "router_z_loss_mlp": 0.44873047, + "step": 1406, + "time_per_iteration": 2.8309988975524902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066671, + "balance_loss_mlp": 1.05236614, + "epoch": 0.27068103116583303, + "flos": 1423853489664.0, + "grad_norm": 0.03207539465139433, + "language_loss": 0.76458991, + "learning_rate": 0.0008556399744202163, + "loss": 0.77525663, + "num_input_tokens_seen": 116555152, + "router_z_loss_mlp": 0.14257812, + "step": 1407, + "time_per_iteration": 4.926543235778809 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136459, + "balance_loss_mlp": 1.09220862, + "epoch": 0.27087341285109656, + "flos": 531999456768.0, + "grad_norm": 0.06146298960376288, + "language_loss": 0.83448923, + "learning_rate": 0.0008554209195555016, + "loss": 0.84585381, + "num_input_tokens_seen": 116626016, + "router_z_loss_mlp": 0.44287109, + "step": 1408, + "time_per_iteration": 2.6698648929595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136456, + "balance_loss_mlp": 1.08965421, + "epoch": 0.27106579453636015, + "flos": 581378840064.0, + "grad_norm": 0.1627330563817166, + "language_loss": 0.89102834, + "learning_rate": 0.0008552017267041483, + "loss": 0.90239286, + "num_input_tokens_seen": 116699152, + "router_z_loss_mlp": 0.46801758, + "step": 1409, + "time_per_iteration": 2.6957972049713135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127578, + "balance_loss_mlp": 1.08349395, + "epoch": 0.2712581762216237, + "flos": 506801899008.0, + "grad_norm": 0.06560812899143556, + "language_loss": 0.83656335, + "learning_rate": 0.0008549823959512549, + "loss": 0.84783912, + "num_input_tokens_seen": 116770912, + "router_z_loss_mlp": 0.44091797, + "step": 1410, + "time_per_iteration": 2.7068376541137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011101, + "balance_loss_mlp": 1.06708908, + "epoch": 0.27145055790688727, + "flos": 997442823168.0, + "grad_norm": 0.08175260567644033, + "language_loss": 0.87610555, + "learning_rate": 0.0008547629273819728, + "loss": 0.88720655, + "num_input_tokens_seen": 116863088, + "router_z_loss_mlp": 0.43041992, + "step": 1411, + "time_per_iteration": 3.366260290145874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108005, + "balance_loss_mlp": 1.06542349, + "epoch": 0.2716429395921508, + "flos": 546681083904.0, + "grad_norm": 0.10517352924457117, + "language_loss": 0.84009993, + "learning_rate": 0.0008545433210815074, + "loss": 0.85118002, + "num_input_tokens_seen": 116929504, + "router_z_loss_mlp": 0.42578125, + "step": 1412, + "time_per_iteration": 2.630105972290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112252, + "balance_loss_mlp": 1.07931852, + "epoch": 0.2718353212774144, + "flos": 573225113088.0, + "grad_norm": 0.09841738404648297, + "language_loss": 0.87974489, + "learning_rate": 0.0008543235771351176, + "loss": 0.89097011, + "num_input_tokens_seen": 117004064, + "router_z_loss_mlp": 0.43188477, + "step": 1413, + "time_per_iteration": 2.725048065185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129323, + "balance_loss_mlp": 1.08635998, + "epoch": 0.272027702962678, + "flos": 644305549824.0, + "grad_norm": 0.059677420125308425, + "language_loss": 0.84918916, + "learning_rate": 0.0008541036956281154, + "loss": 0.86048239, + "num_input_tokens_seen": 117081328, + "router_z_loss_mlp": 0.42993164, + "step": 1414, + "time_per_iteration": 2.897216796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133545, + "balance_loss_mlp": 1.08898425, + "epoch": 0.2722200846479415, + "flos": 653726827008.0, + "grad_norm": 0.08487151018546404, + "language_loss": 0.82919049, + "learning_rate": 0.0008538836766458665, + "loss": 0.84052598, + "num_input_tokens_seen": 117156544, + "router_z_loss_mlp": 0.44580078, + "step": 1415, + "time_per_iteration": 2.8930981159210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137425, + "balance_loss_mlp": 1.0942955, + "epoch": 0.2724124663332051, + "flos": 579631873536.0, + "grad_norm": 0.09871518143765563, + "language_loss": 0.85738099, + "learning_rate": 0.0008536635202737897, + "loss": 0.86875528, + "num_input_tokens_seen": 117230208, + "router_z_loss_mlp": 0.43164062, + "step": 1416, + "time_per_iteration": 2.7891178131103516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137299, + "balance_loss_mlp": 1.0931915, + "epoch": 0.2726048480184686, + "flos": 537435274752.0, + "grad_norm": 0.10766210404252562, + "language_loss": 0.82790214, + "learning_rate": 0.0008534432265973573, + "loss": 0.83927512, + "num_input_tokens_seen": 117298080, + "router_z_loss_mlp": 0.44091797, + "step": 1417, + "time_per_iteration": 2.6409006118774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141948, + "balance_loss_mlp": 1.09691095, + "epoch": 0.2727972297037322, + "flos": 995797172736.0, + "grad_norm": 0.07824380469589887, + "language_loss": 0.88708508, + "learning_rate": 0.000853222795702095, + "loss": 0.89850456, + "num_input_tokens_seen": 117396256, + "router_z_loss_mlp": 0.45092773, + "step": 1418, + "time_per_iteration": 3.4312241077423096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115343, + "balance_loss_mlp": 1.10767758, + "epoch": 0.27298961138899575, + "flos": 606205638144.0, + "grad_norm": 0.06262628073505326, + "language_loss": 0.84196067, + "learning_rate": 0.0008530022276735813, + "loss": 0.85349494, + "num_input_tokens_seen": 117467936, + "router_z_loss_mlp": 0.45727539, + "step": 1419, + "time_per_iteration": 2.742341995239258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169959, + "balance_loss_mlp": 1.12742519, + "epoch": 0.27318199307425933, + "flos": 529325964288.0, + "grad_norm": 0.07008703106338479, + "language_loss": 0.86301696, + "learning_rate": 0.0008527815225974489, + "loss": 0.87471658, + "num_input_tokens_seen": 117538256, + "router_z_loss_mlp": 0.42529297, + "step": 1420, + "time_per_iteration": 2.643151044845581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172801, + "balance_loss_mlp": 1.12731028, + "epoch": 0.2733743747595229, + "flos": 409029129216.0, + "grad_norm": 0.10800570533054084, + "language_loss": 0.88767672, + "learning_rate": 0.0008525606805593829, + "loss": 0.8994047, + "num_input_tokens_seen": 117599488, + "router_z_loss_mlp": 0.45483398, + "step": 1421, + "time_per_iteration": 2.4374186992645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115892, + "balance_loss_mlp": 1.11283422, + "epoch": 0.27356675644478645, + "flos": 516225747456.0, + "grad_norm": 0.11472023337789067, + "language_loss": 0.83181965, + "learning_rate": 0.0008523397016451213, + "loss": 0.84340894, + "num_input_tokens_seen": 117664240, + "router_z_loss_mlp": 0.46142578, + "step": 1422, + "time_per_iteration": 2.585376739501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152063, + "balance_loss_mlp": 1.10824132, + "epoch": 0.27375913813005004, + "flos": 1052342088192.0, + "grad_norm": 0.08784028487991961, + "language_loss": 0.87910116, + "learning_rate": 0.0008521185859404564, + "loss": 0.89062172, + "num_input_tokens_seen": 117754768, + "router_z_loss_mlp": 0.43847656, + "step": 1423, + "time_per_iteration": 3.399348020553589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150781, + "balance_loss_mlp": 1.10634017, + "epoch": 0.27395151981531357, + "flos": 624805913088.0, + "grad_norm": 0.06323160386311827, + "language_loss": 0.89755672, + "learning_rate": 0.0008518973335312326, + "loss": 0.90906453, + "num_input_tokens_seen": 117832816, + "router_z_loss_mlp": 0.44433594, + "step": 1424, + "time_per_iteration": 2.771397352218628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141797, + "balance_loss_mlp": 1.09628344, + "epoch": 0.27414390150057716, + "flos": 550372506624.0, + "grad_norm": 0.0741893947597381, + "language_loss": 0.83755773, + "learning_rate": 0.0008516759445033477, + "loss": 0.84897572, + "num_input_tokens_seen": 117899168, + "router_z_loss_mlp": 0.45532227, + "step": 1425, + "time_per_iteration": 2.623136520385742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148744, + "balance_loss_mlp": 1.10227656, + "epoch": 0.2743362831858407, + "flos": 539866487808.0, + "grad_norm": 0.08118081060083703, + "language_loss": 0.85448551, + "learning_rate": 0.0008514544189427526, + "loss": 0.865973, + "num_input_tokens_seen": 117972384, + "router_z_loss_mlp": 0.46484375, + "step": 1426, + "time_per_iteration": 2.695749044418335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156426, + "balance_loss_mlp": 1.11208034, + "epoch": 0.2745286648711043, + "flos": 468590759424.0, + "grad_norm": 0.0837156631450272, + "language_loss": 0.86976963, + "learning_rate": 0.0008512327569354511, + "loss": 0.88133389, + "num_input_tokens_seen": 118039584, + "router_z_loss_mlp": 0.44360352, + "step": 1427, + "time_per_iteration": 2.5354061126708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160051, + "balance_loss_mlp": 1.11353528, + "epoch": 0.2747210465563678, + "flos": 472867683840.0, + "grad_norm": 0.09189170382991782, + "language_loss": 0.84034801, + "learning_rate": 0.0008510109585675001, + "loss": 0.8519485, + "num_input_tokens_seen": 118108352, + "router_z_loss_mlp": 0.46508789, + "step": 1428, + "time_per_iteration": 2.5996179580688477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093492, + "balance_loss_mlp": 1.07680273, + "epoch": 0.2749134282416314, + "flos": 1315085372928.0, + "grad_norm": 0.03549776566589832, + "language_loss": 0.81153345, + "learning_rate": 0.0008507890239250093, + "loss": 0.8224684, + "num_input_tokens_seen": 118331120, + "router_z_loss_mlp": 0.16699219, + "step": 1429, + "time_per_iteration": 4.714696407318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172648, + "balance_loss_mlp": 1.1280638, + "epoch": 0.275105809926895, + "flos": 970861718016.0, + "grad_norm": 0.1239425770540774, + "language_loss": 0.81035018, + "learning_rate": 0.0008505669530941415, + "loss": 0.82207668, + "num_input_tokens_seen": 118415872, + "router_z_loss_mlp": 0.44580078, + "step": 1430, + "time_per_iteration": 3.346867322921753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171144, + "balance_loss_mlp": 1.12613082, + "epoch": 0.2752981916121585, + "flos": 527344432128.0, + "grad_norm": 0.0741807723541833, + "language_loss": 0.84519219, + "learning_rate": 0.000850344746161112, + "loss": 0.85690367, + "num_input_tokens_seen": 118483008, + "router_z_loss_mlp": 0.45019531, + "step": 1431, + "time_per_iteration": 2.6365530490875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178527, + "balance_loss_mlp": 1.13418126, + "epoch": 0.2754905732974221, + "flos": 453709071360.0, + "grad_norm": 0.09683250699138053, + "language_loss": 0.88287663, + "learning_rate": 0.0008501224032121894, + "loss": 0.8946619, + "num_input_tokens_seen": 118545840, + "router_z_loss_mlp": 0.44360352, + "step": 1432, + "time_per_iteration": 2.5015640258789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178788, + "balance_loss_mlp": 1.13406062, + "epoch": 0.27568295498268564, + "flos": 497474597376.0, + "grad_norm": 0.06051880699738469, + "language_loss": 0.82098711, + "learning_rate": 0.0008498999243336946, + "loss": 0.832775, + "num_input_tokens_seen": 118615168, + "router_z_loss_mlp": 0.44726562, + "step": 1433, + "time_per_iteration": 2.643663167953491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198526, + "balance_loss_mlp": 1.15129471, + "epoch": 0.2758753366679492, + "flos": 608194510848.0, + "grad_norm": 0.07173936681504893, + "language_loss": 0.87897062, + "learning_rate": 0.0008496773096120021, + "loss": 0.89095587, + "num_input_tokens_seen": 118690384, + "router_z_loss_mlp": 0.47241211, + "step": 1434, + "time_per_iteration": 2.8680803775787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198281, + "balance_loss_mlp": 1.15164685, + "epoch": 0.27606771835321275, + "flos": 740129094144.0, + "grad_norm": 0.07924459326066897, + "language_loss": 0.84949142, + "learning_rate": 0.0008494545591335381, + "loss": 0.86147422, + "num_input_tokens_seen": 118763024, + "router_z_loss_mlp": 0.46630859, + "step": 1435, + "time_per_iteration": 2.9436187744140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197184, + "balance_loss_mlp": 1.15176487, + "epoch": 0.27626010003847634, + "flos": 554572707840.0, + "grad_norm": 0.05338969573395925, + "language_loss": 0.87283278, + "learning_rate": 0.0008492316729847823, + "loss": 0.88480461, + "num_input_tokens_seen": 118845536, + "router_z_loss_mlp": 0.4543457, + "step": 1436, + "time_per_iteration": 2.817201614379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195413, + "balance_loss_mlp": 1.14739525, + "epoch": 0.2764524817237399, + "flos": 542554661376.0, + "grad_norm": 0.08524745340475512, + "language_loss": 0.80082995, + "learning_rate": 0.0008490086512522664, + "loss": 0.81278408, + "num_input_tokens_seen": 118919008, + "router_z_loss_mlp": 0.47998047, + "step": 1437, + "time_per_iteration": 2.7126290798187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196486, + "balance_loss_mlp": 1.14870656, + "epoch": 0.27664486340900346, + "flos": 406246980096.0, + "grad_norm": 0.06867103991167788, + "language_loss": 0.90572739, + "learning_rate": 0.0008487854940225755, + "loss": 0.9176923, + "num_input_tokens_seen": 118981376, + "router_z_loss_mlp": 0.47729492, + "step": 1438, + "time_per_iteration": 2.431755542755127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207177, + "balance_loss_mlp": 1.15858746, + "epoch": 0.27683724509426705, + "flos": 522138410496.0, + "grad_norm": 0.13716227323677116, + "language_loss": 0.90202403, + "learning_rate": 0.0008485622013823466, + "loss": 0.91409582, + "num_input_tokens_seen": 119050560, + "router_z_loss_mlp": 0.48608398, + "step": 1439, + "time_per_iteration": 2.647594451904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198257, + "balance_loss_mlp": 1.15062046, + "epoch": 0.2770296267795306, + "flos": 535349855232.0, + "grad_norm": 0.09985187013126534, + "language_loss": 0.836923, + "learning_rate": 0.00084833877341827, + "loss": 0.84890562, + "num_input_tokens_seen": 119121104, + "router_z_loss_mlp": 0.47680664, + "step": 1440, + "time_per_iteration": 2.652665138244629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215433, + "balance_loss_mlp": 1.16562724, + "epoch": 0.27722200846479417, + "flos": 487991651328.0, + "grad_norm": 0.09777751450797587, + "language_loss": 0.81022394, + "learning_rate": 0.000848115210217088, + "loss": 0.82237822, + "num_input_tokens_seen": 119187712, + "router_z_loss_mlp": 0.49853516, + "step": 1441, + "time_per_iteration": 2.550879955291748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120133, + "balance_loss_mlp": 1.15166724, + "epoch": 0.2774143901500577, + "flos": 618297836544.0, + "grad_norm": 0.06658099231370791, + "language_loss": 0.82249796, + "learning_rate": 0.0008478915118655952, + "loss": 0.83451128, + "num_input_tokens_seen": 119259264, + "router_z_loss_mlp": 0.49658203, + "step": 1442, + "time_per_iteration": 2.7541940212249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209129, + "balance_loss_mlp": 1.16261363, + "epoch": 0.2776067718353213, + "flos": 513819127296.0, + "grad_norm": 0.05385742523937431, + "language_loss": 0.86750221, + "learning_rate": 0.0008476676784506393, + "loss": 0.87959349, + "num_input_tokens_seen": 119328304, + "router_z_loss_mlp": 0.46557617, + "step": 1443, + "time_per_iteration": 2.6595921516418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120895, + "balance_loss_mlp": 1.16083765, + "epoch": 0.2777991535205848, + "flos": 1004395811328.0, + "grad_norm": 0.07541643273231594, + "language_loss": 0.82715142, + "learning_rate": 0.0008474437100591201, + "loss": 0.83924091, + "num_input_tokens_seen": 119412352, + "router_z_loss_mlp": 0.48120117, + "step": 1444, + "time_per_iteration": 3.285985231399536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209577, + "balance_loss_mlp": 1.16258454, + "epoch": 0.2779915352058484, + "flos": 550278531072.0, + "grad_norm": 0.07952238187909891, + "language_loss": 0.8560605, + "learning_rate": 0.0008472196067779898, + "loss": 0.86815625, + "num_input_tokens_seen": 119484464, + "router_z_loss_mlp": 0.47021484, + "step": 1445, + "time_per_iteration": 2.677077293395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204567, + "balance_loss_mlp": 1.15600109, + "epoch": 0.278183916891112, + "flos": 873798160896.0, + "grad_norm": 0.10163023549653756, + "language_loss": 0.86494523, + "learning_rate": 0.0008469953686942531, + "loss": 0.87699091, + "num_input_tokens_seen": 119557280, + "router_z_loss_mlp": 0.48583984, + "step": 1446, + "time_per_iteration": 3.10603928565979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188864, + "balance_loss_mlp": 1.14158559, + "epoch": 0.2783762985763755, + "flos": 624064766976.0, + "grad_norm": 0.0769454608790312, + "language_loss": 0.83537692, + "learning_rate": 0.0008467709958949668, + "loss": 0.84726554, + "num_input_tokens_seen": 119631232, + "router_z_loss_mlp": 0.47265625, + "step": 1447, + "time_per_iteration": 2.7602903842926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116478, + "balance_loss_mlp": 1.11943233, + "epoch": 0.2785686802616391, + "flos": 581838432768.0, + "grad_norm": 0.08244080074007111, + "language_loss": 0.86534739, + "learning_rate": 0.0008465464884672403, + "loss": 0.87699515, + "num_input_tokens_seen": 119700224, + "router_z_loss_mlp": 0.45410156, + "step": 1448, + "time_per_iteration": 2.702974796295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178355, + "balance_loss_mlp": 1.13424778, + "epoch": 0.27876106194690264, + "flos": 587333348352.0, + "grad_norm": 0.061441667483596626, + "language_loss": 0.85982984, + "learning_rate": 0.0008463218464982348, + "loss": 0.87161338, + "num_input_tokens_seen": 119781376, + "router_z_loss_mlp": 0.44091797, + "step": 1449, + "time_per_iteration": 2.832615852355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185601, + "balance_loss_mlp": 1.14058757, + "epoch": 0.27895344363216623, + "flos": 875982325248.0, + "grad_norm": 0.07503412994840371, + "language_loss": 0.88168389, + "learning_rate": 0.0008460970700751645, + "loss": 0.89353991, + "num_input_tokens_seen": 119856672, + "router_z_loss_mlp": 0.45019531, + "step": 1450, + "time_per_iteration": 3.0487136840820312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185626, + "balance_loss_mlp": 1.13977861, + "epoch": 0.27914582531742976, + "flos": 603910245888.0, + "grad_norm": 0.06352945894963989, + "language_loss": 0.88538259, + "learning_rate": 0.000845872159285295, + "loss": 0.89723885, + "num_input_tokens_seen": 119929008, + "router_z_loss_mlp": 0.45849609, + "step": 1451, + "time_per_iteration": 2.715423822402954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067116, + "balance_loss_mlp": 1.04985404, + "epoch": 0.27933820700269335, + "flos": 1497738097152.0, + "grad_norm": 0.02807340123185793, + "language_loss": 0.77766848, + "learning_rate": 0.0008456471142159447, + "loss": 0.78833961, + "num_input_tokens_seen": 120164032, + "router_z_loss_mlp": 0.17285156, + "step": 1452, + "time_per_iteration": 4.906192302703857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197684, + "balance_loss_mlp": 1.15064442, + "epoch": 0.2795305886879569, + "flos": 1031859025920.0, + "grad_norm": 0.06703382456082828, + "language_loss": 0.86617672, + "learning_rate": 0.0008454219349544836, + "loss": 0.87815356, + "num_input_tokens_seen": 120246784, + "router_z_loss_mlp": 0.47045898, + "step": 1453, + "time_per_iteration": 3.3534200191497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198016, + "balance_loss_mlp": 1.15343201, + "epoch": 0.27972297037322047, + "flos": 607058012160.0, + "grad_norm": 0.08552050648295068, + "language_loss": 0.82341981, + "learning_rate": 0.000845196621588334, + "loss": 0.83540004, + "num_input_tokens_seen": 120318208, + "router_z_loss_mlp": 0.44580078, + "step": 1454, + "time_per_iteration": 2.743699073791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204394, + "balance_loss_mlp": 1.1566391, + "epoch": 0.27991535205848406, + "flos": 630380123136.0, + "grad_norm": 0.05325666962256515, + "language_loss": 0.7637955, + "learning_rate": 0.0008449711742049706, + "loss": 0.77583951, + "num_input_tokens_seen": 120393248, + "router_z_loss_mlp": 0.4777832, + "step": 1455, + "time_per_iteration": 2.782561779022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208188, + "balance_loss_mlp": 1.16222095, + "epoch": 0.2801077337437476, + "flos": 549297676800.0, + "grad_norm": 0.09912152167704158, + "language_loss": 0.84447122, + "learning_rate": 0.0008447455928919196, + "loss": 0.85655314, + "num_input_tokens_seen": 120461040, + "router_z_loss_mlp": 0.45996094, + "step": 1456, + "time_per_iteration": 2.597557306289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242882, + "balance_loss_mlp": 1.19460225, + "epoch": 0.2803001154290112, + "flos": 486761177088.0, + "grad_norm": 0.060789109492995964, + "language_loss": 0.87272859, + "learning_rate": 0.0008445198777367595, + "loss": 0.88515741, + "num_input_tokens_seen": 120530400, + "router_z_loss_mlp": 0.48291016, + "step": 1457, + "time_per_iteration": 2.5689990520477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283391, + "balance_loss_mlp": 1.23394287, + "epoch": 0.2804924971142747, + "flos": 522074170368.0, + "grad_norm": 0.0840599244275116, + "language_loss": 0.80820799, + "learning_rate": 0.0008442940288271208, + "loss": 0.82104188, + "num_input_tokens_seen": 120598304, + "router_z_loss_mlp": 0.49365234, + "step": 1458, + "time_per_iteration": 2.674907922744751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01299064, + "balance_loss_mlp": 1.24899602, + "epoch": 0.2806848787995383, + "flos": 527697566208.0, + "grad_norm": 0.06912303271008884, + "language_loss": 0.87410611, + "learning_rate": 0.0008440680462506856, + "loss": 0.88709676, + "num_input_tokens_seen": 120675712, + "router_z_loss_mlp": 0.50073242, + "step": 1459, + "time_per_iteration": 2.73905873298645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01312423, + "balance_loss_mlp": 1.26221192, + "epoch": 0.2808772604848018, + "flos": 485493626880.0, + "grad_norm": 0.11964292138845481, + "language_loss": 0.86650789, + "learning_rate": 0.0008438419300951883, + "loss": 0.87963212, + "num_input_tokens_seen": 120746544, + "router_z_loss_mlp": 0.50219727, + "step": 1460, + "time_per_iteration": 2.6775193214416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01277494, + "balance_loss_mlp": 1.22690177, + "epoch": 0.2810696421700654, + "flos": 618139620864.0, + "grad_norm": 0.08967430845786024, + "language_loss": 0.86711442, + "learning_rate": 0.0008436156804484148, + "loss": 0.87988937, + "num_input_tokens_seen": 120823520, + "router_z_loss_mlp": 0.50610352, + "step": 1461, + "time_per_iteration": 2.8446624279022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225027, + "balance_loss_mlp": 1.17615128, + "epoch": 0.28126202385532895, + "flos": 454754165760.0, + "grad_norm": 0.06778030965882964, + "language_loss": 0.88354933, + "learning_rate": 0.0008433892973982031, + "loss": 0.89579964, + "num_input_tokens_seen": 120889568, + "router_z_loss_mlp": 0.48901367, + "step": 1462, + "time_per_iteration": 2.5101869106292725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212759, + "balance_loss_mlp": 1.16168988, + "epoch": 0.28145440554059253, + "flos": 530704742400.0, + "grad_norm": 0.07940790981700917, + "language_loss": 0.85705763, + "learning_rate": 0.0008431627810324431, + "loss": 0.86918521, + "num_input_tokens_seen": 120958480, + "router_z_loss_mlp": 0.51098633, + "step": 1463, + "time_per_iteration": 2.6701931953430176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208608, + "balance_loss_mlp": 1.15906441, + "epoch": 0.2816467872258561, + "flos": 452228977152.0, + "grad_norm": 0.1112721524597414, + "language_loss": 0.81312853, + "learning_rate": 0.000842936131439076, + "loss": 0.82521462, + "num_input_tokens_seen": 121028032, + "router_z_loss_mlp": 0.49584961, + "step": 1464, + "time_per_iteration": 2.626397132873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182235, + "balance_loss_mlp": 1.13440847, + "epoch": 0.28183916891111965, + "flos": 472712039424.0, + "grad_norm": 0.10805991000078381, + "language_loss": 0.88305855, + "learning_rate": 0.0008427093487060951, + "loss": 0.89488095, + "num_input_tokens_seen": 121099280, + "router_z_loss_mlp": 0.4777832, + "step": 1465, + "time_per_iteration": 2.6287689208984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152025, + "balance_loss_mlp": 1.10815573, + "epoch": 0.28203155059638324, + "flos": 557053479936.0, + "grad_norm": 0.05392746655550109, + "language_loss": 0.85014635, + "learning_rate": 0.000842482432921545, + "loss": 0.86166662, + "num_input_tokens_seen": 121180240, + "router_z_loss_mlp": 0.4387207, + "step": 1466, + "time_per_iteration": 2.843055009841919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140929, + "balance_loss_mlp": 1.09691691, + "epoch": 0.28222393228164677, + "flos": 416980224000.0, + "grad_norm": 0.12216249404138245, + "language_loss": 0.8786549, + "learning_rate": 0.0008422553841735225, + "loss": 0.89006418, + "num_input_tokens_seen": 121242736, + "router_z_loss_mlp": 0.44018555, + "step": 1467, + "time_per_iteration": 2.4870855808258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130953, + "balance_loss_mlp": 1.08686972, + "epoch": 0.28241631396691036, + "flos": 604910923776.0, + "grad_norm": 0.0834179705505054, + "language_loss": 0.85186172, + "learning_rate": 0.0008420282025501757, + "loss": 0.86317128, + "num_input_tokens_seen": 121319248, + "router_z_loss_mlp": 0.44091797, + "step": 1468, + "time_per_iteration": 2.746919631958008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139526, + "balance_loss_mlp": 1.09730196, + "epoch": 0.2826086956521739, + "flos": 572968152576.0, + "grad_norm": 0.07747841896553878, + "language_loss": 0.85862702, + "learning_rate": 0.0008418008881397043, + "loss": 0.8700223, + "num_input_tokens_seen": 121392064, + "router_z_loss_mlp": 0.42236328, + "step": 1469, + "time_per_iteration": 2.7157111167907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011536, + "balance_loss_mlp": 1.11108959, + "epoch": 0.2828010773374375, + "flos": 842756949504.0, + "grad_norm": 0.09196817065592088, + "language_loss": 0.83090472, + "learning_rate": 0.0008415734410303595, + "loss": 0.84244066, + "num_input_tokens_seen": 121475984, + "router_z_loss_mlp": 0.42529297, + "step": 1470, + "time_per_iteration": 3.2546660900115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159701, + "balance_loss_mlp": 1.1166662, + "epoch": 0.28299345902270107, + "flos": 542675801088.0, + "grad_norm": 0.07745609031802311, + "language_loss": 0.91133046, + "learning_rate": 0.0008413458613104444, + "loss": 0.92292744, + "num_input_tokens_seen": 121551024, + "router_z_loss_mlp": 0.43017578, + "step": 1471, + "time_per_iteration": 2.683119773864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124215, + "balance_loss_mlp": 1.08091772, + "epoch": 0.2831858407079646, + "flos": 571606626816.0, + "grad_norm": 0.06716648824100378, + "language_loss": 0.83225214, + "learning_rate": 0.0008411181490683129, + "loss": 0.84349424, + "num_input_tokens_seen": 121624528, + "router_z_loss_mlp": 0.43334961, + "step": 1472, + "time_per_iteration": 2.7247512340545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112102, + "balance_loss_mlp": 1.06692195, + "epoch": 0.2833782223932282, + "flos": 763826734080.0, + "grad_norm": 0.08730853561294576, + "language_loss": 0.83099282, + "learning_rate": 0.0008408903043923707, + "loss": 0.84211385, + "num_input_tokens_seen": 121706736, + "router_z_loss_mlp": 0.45166016, + "step": 1473, + "time_per_iteration": 2.9982750415802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011136, + "balance_loss_mlp": 1.06675041, + "epoch": 0.2835706040784917, + "flos": 539051189760.0, + "grad_norm": 0.09441991509127853, + "language_loss": 0.81456125, + "learning_rate": 0.0008406623273710754, + "loss": 0.82569724, + "num_input_tokens_seen": 121773008, + "router_z_loss_mlp": 0.46826172, + "step": 1474, + "time_per_iteration": 2.6457254886627197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107143, + "balance_loss_mlp": 1.06482363, + "epoch": 0.2837629857637553, + "flos": 530593514496.0, + "grad_norm": 0.08147557265850319, + "language_loss": 0.83874208, + "learning_rate": 0.0008404342180929351, + "loss": 0.84981352, + "num_input_tokens_seen": 121840016, + "router_z_loss_mlp": 0.42358398, + "step": 1475, + "time_per_iteration": 2.6071481704711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110668, + "balance_loss_mlp": 1.06758618, + "epoch": 0.28395536744901884, + "flos": 540032044032.0, + "grad_norm": 0.0682383784230515, + "language_loss": 0.81900609, + "learning_rate": 0.00084020597664651, + "loss": 0.83011281, + "num_input_tokens_seen": 121915008, + "router_z_loss_mlp": 0.43066406, + "step": 1476, + "time_per_iteration": 2.831547260284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118821, + "balance_loss_mlp": 1.07149458, + "epoch": 0.2841477491342824, + "flos": 573635146752.0, + "grad_norm": 0.08199753583087593, + "language_loss": 0.84526181, + "learning_rate": 0.0008399776031204111, + "loss": 0.85645002, + "num_input_tokens_seen": 121987456, + "router_z_loss_mlp": 0.47290039, + "step": 1477, + "time_per_iteration": 2.7336621284484863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112444, + "balance_loss_mlp": 1.07832992, + "epoch": 0.28434013081954596, + "flos": 572068790784.0, + "grad_norm": 0.07183050675580523, + "language_loss": 0.80975109, + "learning_rate": 0.0008397490976033009, + "loss": 0.82099551, + "num_input_tokens_seen": 122058720, + "router_z_loss_mlp": 0.46118164, + "step": 1478, + "time_per_iteration": 2.668337345123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053875, + "balance_loss_mlp": 1.03766239, + "epoch": 0.28453251250480954, + "flos": 1553376310272.0, + "grad_norm": 0.035679392232843235, + "language_loss": 0.77879643, + "learning_rate": 0.000839520460183893, + "loss": 0.78933525, + "num_input_tokens_seen": 122285792, + "router_z_loss_mlp": 0.16210938, + "step": 1479, + "time_per_iteration": 4.813107252120972 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132957, + "balance_loss_mlp": 1.08925462, + "epoch": 0.28472489419007313, + "flos": 749061043200.0, + "grad_norm": 0.06426749014533666, + "language_loss": 0.85708797, + "learning_rate": 0.0008392916909509525, + "loss": 0.86841756, + "num_input_tokens_seen": 122366608, + "router_z_loss_mlp": 0.43725586, + "step": 1480, + "time_per_iteration": 3.105465888977051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145673, + "balance_loss_mlp": 1.10180378, + "epoch": 0.28491727587533666, + "flos": 490158563328.0, + "grad_norm": 0.12099224111333258, + "language_loss": 0.8583495, + "learning_rate": 0.0008390627899932954, + "loss": 0.86980623, + "num_input_tokens_seen": 122435536, + "router_z_loss_mlp": 0.43847656, + "step": 1481, + "time_per_iteration": 2.5961339473724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146403, + "balance_loss_mlp": 1.1041795, + "epoch": 0.28510965756060025, + "flos": 729007838208.0, + "grad_norm": 0.09850404509995118, + "language_loss": 0.88747412, + "learning_rate": 0.000838833757399789, + "loss": 0.89893812, + "num_input_tokens_seen": 122515584, + "router_z_loss_mlp": 0.42211914, + "step": 1482, + "time_per_iteration": 2.9445223808288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160742, + "balance_loss_mlp": 1.11513209, + "epoch": 0.2853020392458638, + "flos": 551573245440.0, + "grad_norm": 0.09258701289693592, + "language_loss": 0.81233478, + "learning_rate": 0.0008386045932593515, + "loss": 0.82394218, + "num_input_tokens_seen": 122585552, + "router_z_loss_mlp": 0.45605469, + "step": 1483, + "time_per_iteration": 2.696171283721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172022, + "balance_loss_mlp": 1.12853456, + "epoch": 0.28549442093112737, + "flos": 754783557120.0, + "grad_norm": 0.07718327666813503, + "language_loss": 0.8687939, + "learning_rate": 0.0008383752976609525, + "loss": 0.88051414, + "num_input_tokens_seen": 122658928, + "router_z_loss_mlp": 0.43481445, + "step": 1484, + "time_per_iteration": 2.948983907699585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159194, + "balance_loss_mlp": 1.11508679, + "epoch": 0.2856868026163909, + "flos": 538589025792.0, + "grad_norm": 0.06564205880415652, + "language_loss": 0.80617285, + "learning_rate": 0.0008381458706936123, + "loss": 0.81776482, + "num_input_tokens_seen": 122729056, + "router_z_loss_mlp": 0.44116211, + "step": 1485, + "time_per_iteration": 2.689715623855591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117177, + "balance_loss_mlp": 1.12740064, + "epoch": 0.2858791843016545, + "flos": 583772977152.0, + "grad_norm": 0.06570872016312425, + "language_loss": 0.87734085, + "learning_rate": 0.0008379163124464025, + "loss": 0.88905853, + "num_input_tokens_seen": 122802832, + "router_z_loss_mlp": 0.44384766, + "step": 1486, + "time_per_iteration": 2.7226197719573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166912, + "balance_loss_mlp": 1.12526059, + "epoch": 0.286071565986918, + "flos": 644812130304.0, + "grad_norm": 0.0915307653224295, + "language_loss": 0.77564812, + "learning_rate": 0.0008376866230084452, + "loss": 0.78731728, + "num_input_tokens_seen": 122881328, + "router_z_loss_mlp": 0.41650391, + "step": 1487, + "time_per_iteration": 2.82708477973938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154293, + "balance_loss_mlp": 1.10901785, + "epoch": 0.2862639476721816, + "flos": 491361873408.0, + "grad_norm": 0.07232162522245564, + "language_loss": 0.86754864, + "learning_rate": 0.000837456802468914, + "loss": 0.87909162, + "num_input_tokens_seen": 122949680, + "router_z_loss_mlp": 0.45239258, + "step": 1488, + "time_per_iteration": 2.6107335090637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115391, + "balance_loss_mlp": 1.1082294, + "epoch": 0.2864563293574452, + "flos": 521639170560.0, + "grad_norm": 0.06580975478488113, + "language_loss": 0.85965604, + "learning_rate": 0.0008372268509170331, + "loss": 0.8711952, + "num_input_tokens_seen": 123024736, + "router_z_loss_mlp": 0.45678711, + "step": 1489, + "time_per_iteration": 2.682190418243408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147981, + "balance_loss_mlp": 1.10554218, + "epoch": 0.2866487110427087, + "flos": 547118281728.0, + "grad_norm": 0.0640942252200205, + "language_loss": 0.85215169, + "learning_rate": 0.0008369967684420779, + "loss": 0.86363149, + "num_input_tokens_seen": 123097344, + "router_z_loss_mlp": 0.42431641, + "step": 1490, + "time_per_iteration": 2.708315372467041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011558, + "balance_loss_mlp": 1.11154985, + "epoch": 0.2868410927279723, + "flos": 482224720896.0, + "grad_norm": 0.07293711729105107, + "language_loss": 0.84566355, + "learning_rate": 0.0008367665551333736, + "loss": 0.85722154, + "num_input_tokens_seen": 123166240, + "router_z_loss_mlp": 0.44262695, + "step": 1491, + "time_per_iteration": 2.605665445327759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159368, + "balance_loss_mlp": 1.11216116, + "epoch": 0.28703347441323585, + "flos": 724889129472.0, + "grad_norm": 0.0802107480821924, + "language_loss": 0.85808468, + "learning_rate": 0.0008365362110802977, + "loss": 0.86967838, + "num_input_tokens_seen": 123238160, + "router_z_loss_mlp": 0.47241211, + "step": 1492, + "time_per_iteration": 2.879655122756958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155437, + "balance_loss_mlp": 1.109303, + "epoch": 0.28722585609849943, + "flos": 634978248192.0, + "grad_norm": 0.06007050516222503, + "language_loss": 0.82957923, + "learning_rate": 0.0008363057363722773, + "loss": 0.84113365, + "num_input_tokens_seen": 123319504, + "router_z_loss_mlp": 0.46142578, + "step": 1493, + "time_per_iteration": 2.8600335121154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154458, + "balance_loss_mlp": 1.11085081, + "epoch": 0.28741823778376296, + "flos": 510229020672.0, + "grad_norm": 0.060904552171674266, + "language_loss": 0.8464222, + "learning_rate": 0.0008360751310987906, + "loss": 0.85796678, + "num_input_tokens_seen": 123387008, + "router_z_loss_mlp": 0.4362793, + "step": 1494, + "time_per_iteration": 2.602029800415039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151781, + "balance_loss_mlp": 1.11160707, + "epoch": 0.28761061946902655, + "flos": 603752030208.0, + "grad_norm": 0.06255193118064963, + "language_loss": 0.86073208, + "learning_rate": 0.0008358443953493666, + "loss": 0.87224984, + "num_input_tokens_seen": 123471056, + "router_z_loss_mlp": 0.40185547, + "step": 1495, + "time_per_iteration": 2.8682689666748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116061, + "balance_loss_mlp": 1.11702669, + "epoch": 0.28780300115429014, + "flos": 407193329664.0, + "grad_norm": 0.06637793594414569, + "language_loss": 0.89093578, + "learning_rate": 0.0008356135292135851, + "loss": 0.90254188, + "num_input_tokens_seen": 123535024, + "router_z_loss_mlp": 0.43579102, + "step": 1496, + "time_per_iteration": 2.519700288772583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162426, + "balance_loss_mlp": 1.11760294, + "epoch": 0.28799538283955367, + "flos": 374929357824.0, + "grad_norm": 0.07926576541007177, + "language_loss": 0.92873323, + "learning_rate": 0.0008353825327810758, + "loss": 0.94035745, + "num_input_tokens_seen": 123596224, + "router_z_loss_mlp": 0.44873047, + "step": 1497, + "time_per_iteration": 2.4195892810821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140417, + "balance_loss_mlp": 1.09852648, + "epoch": 0.28818776452481726, + "flos": 591919363584.0, + "grad_norm": 0.05522330058639147, + "language_loss": 0.81832987, + "learning_rate": 0.00083515140614152, + "loss": 0.82973409, + "num_input_tokens_seen": 123668640, + "router_z_loss_mlp": 0.41894531, + "step": 1498, + "time_per_iteration": 2.6989245414733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151843, + "balance_loss_mlp": 1.10992932, + "epoch": 0.2883801462100808, + "flos": 535075642368.0, + "grad_norm": 0.08112895482541128, + "language_loss": 0.87581354, + "learning_rate": 0.0008349201493846485, + "loss": 0.88733196, + "num_input_tokens_seen": 123740816, + "router_z_loss_mlp": 0.41894531, + "step": 1499, + "time_per_iteration": 2.647165298461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113639, + "balance_loss_mlp": 1.09364128, + "epoch": 0.2885725278953444, + "flos": 480094884864.0, + "grad_norm": 0.06188269799142739, + "language_loss": 0.89485824, + "learning_rate": 0.0008346887626002432, + "loss": 0.90622216, + "num_input_tokens_seen": 123805968, + "router_z_loss_mlp": 0.42724609, + "step": 1500, + "time_per_iteration": 2.546494960784912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138741, + "balance_loss_mlp": 1.09546816, + "epoch": 0.2887649095806079, + "flos": 464044391424.0, + "grad_norm": 0.07756887509348087, + "language_loss": 0.86612689, + "learning_rate": 0.000834457245878137, + "loss": 0.87751424, + "num_input_tokens_seen": 123876576, + "router_z_loss_mlp": 0.43261719, + "step": 1501, + "time_per_iteration": 2.6271145343780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132854, + "balance_loss_mlp": 1.08993816, + "epoch": 0.2889572912658715, + "flos": 931032092160.0, + "grad_norm": 0.07465598629984396, + "language_loss": 0.8176384, + "learning_rate": 0.000834225599308212, + "loss": 0.82896686, + "num_input_tokens_seen": 123967664, + "router_z_loss_mlp": 0.42895508, + "step": 1502, + "time_per_iteration": 3.2550971508026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150677, + "balance_loss_mlp": 1.10580611, + "epoch": 0.28914967295113503, + "flos": 570129103872.0, + "grad_norm": 0.07581203663628927, + "language_loss": 0.85830456, + "learning_rate": 0.0008339938229804016, + "loss": 0.8698113, + "num_input_tokens_seen": 124039680, + "router_z_loss_mlp": 0.44897461, + "step": 1503, + "time_per_iteration": 2.704310417175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132016, + "balance_loss_mlp": 1.11475468, + "epoch": 0.2893420546363986, + "flos": 1486614643200.0, + "grad_norm": 0.04995777902546146, + "language_loss": 0.75434822, + "learning_rate": 0.0008337619169846895, + "loss": 0.76566839, + "num_input_tokens_seen": 124278848, + "router_z_loss_mlp": 0.17285156, + "step": 1504, + "time_per_iteration": 4.959474563598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157244, + "balance_loss_mlp": 1.10965538, + "epoch": 0.2895344363216622, + "flos": 470186850816.0, + "grad_norm": 0.06157445053236475, + "language_loss": 0.84505653, + "learning_rate": 0.0008335298814111094, + "loss": 0.85662901, + "num_input_tokens_seen": 124346736, + "router_z_loss_mlp": 0.47607422, + "step": 1505, + "time_per_iteration": 2.5612986087799072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178976, + "balance_loss_mlp": 1.13374829, + "epoch": 0.28972681800692573, + "flos": 648194835456.0, + "grad_norm": 0.05887296654917154, + "language_loss": 0.88222575, + "learning_rate": 0.0008332977163497455, + "loss": 0.89401549, + "num_input_tokens_seen": 124420816, + "router_z_loss_mlp": 0.4519043, + "step": 1506, + "time_per_iteration": 2.8017849922180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183741, + "balance_loss_mlp": 1.13696313, + "epoch": 0.2899191996921893, + "flos": 572224435200.0, + "grad_norm": 0.07773532252894584, + "language_loss": 0.83964998, + "learning_rate": 0.0008330654218907325, + "loss": 0.8514874, + "num_input_tokens_seen": 124490480, + "router_z_loss_mlp": 0.46801758, + "step": 1507, + "time_per_iteration": 2.6568052768707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167782, + "balance_loss_mlp": 1.12016964, + "epoch": 0.29011158137745285, + "flos": 661356721152.0, + "grad_norm": 0.05364053536005051, + "language_loss": 0.82260346, + "learning_rate": 0.0008328329981242548, + "loss": 0.83428133, + "num_input_tokens_seen": 124564960, + "router_z_loss_mlp": 0.47631836, + "step": 1508, + "time_per_iteration": 2.8732171058654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161954, + "balance_loss_mlp": 1.11479485, + "epoch": 0.29030396306271644, + "flos": 536226822144.0, + "grad_norm": 0.06776855665971031, + "language_loss": 0.88091129, + "learning_rate": 0.0008326004451405475, + "loss": 0.8925308, + "num_input_tokens_seen": 124637424, + "router_z_loss_mlp": 0.47143555, + "step": 1509, + "time_per_iteration": 2.762476921081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156841, + "balance_loss_mlp": 1.11104107, + "epoch": 0.29049634474798, + "flos": 511956163584.0, + "grad_norm": 0.08089915602738365, + "language_loss": 0.82757521, + "learning_rate": 0.0008323677630298957, + "loss": 0.83914363, + "num_input_tokens_seen": 124704832, + "router_z_loss_mlp": 0.45800781, + "step": 1510, + "time_per_iteration": 2.554558753967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152926, + "balance_loss_mlp": 1.1073643, + "epoch": 0.29068872643324356, + "flos": 613758809088.0, + "grad_norm": 0.07106066660777852, + "language_loss": 0.85773015, + "learning_rate": 0.0008321349518826345, + "loss": 0.86925942, + "num_input_tokens_seen": 124779600, + "router_z_loss_mlp": 0.45556641, + "step": 1511, + "time_per_iteration": 2.8341891765594482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144812, + "balance_loss_mlp": 1.09870172, + "epoch": 0.2908811081185071, + "flos": 546424123392.0, + "grad_norm": 0.06994476337169399, + "language_loss": 0.95554525, + "learning_rate": 0.0008319020117891491, + "loss": 0.96699333, + "num_input_tokens_seen": 124844128, + "router_z_loss_mlp": 0.4609375, + "step": 1512, + "time_per_iteration": 2.6152215003967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147304, + "balance_loss_mlp": 1.09902406, + "epoch": 0.2910734898037707, + "flos": 604792355328.0, + "grad_norm": 0.09218377020634298, + "language_loss": 0.87772787, + "learning_rate": 0.0008316689428398751, + "loss": 0.88920093, + "num_input_tokens_seen": 124915376, + "router_z_loss_mlp": 0.4831543, + "step": 1513, + "time_per_iteration": 2.687288522720337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148068, + "balance_loss_mlp": 1.10407972, + "epoch": 0.29126587148903427, + "flos": 574672900608.0, + "grad_norm": 0.05407373665960582, + "language_loss": 0.89050305, + "learning_rate": 0.0008314357451252979, + "loss": 0.90198368, + "num_input_tokens_seen": 124995504, + "router_z_loss_mlp": 0.44018555, + "step": 1514, + "time_per_iteration": 2.7870078086853027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151939, + "balance_loss_mlp": 1.10644853, + "epoch": 0.2914582531742978, + "flos": 571068112896.0, + "grad_norm": 0.11283198751561448, + "language_loss": 0.88657945, + "learning_rate": 0.0008312024187359527, + "loss": 0.89809883, + "num_input_tokens_seen": 125064192, + "router_z_loss_mlp": 0.45483398, + "step": 1515, + "time_per_iteration": 2.6400256156921387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144613, + "balance_loss_mlp": 1.10060108, + "epoch": 0.2916506348595614, + "flos": 730878142464.0, + "grad_norm": 0.08270455580526427, + "language_loss": 0.87534022, + "learning_rate": 0.000830968963762425, + "loss": 0.8867864, + "num_input_tokens_seen": 125150560, + "router_z_loss_mlp": 0.43994141, + "step": 1516, + "time_per_iteration": 3.0442028045654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151597, + "balance_loss_mlp": 1.10617828, + "epoch": 0.2918430165448249, + "flos": 510468728832.0, + "grad_norm": 0.06364079743342543, + "language_loss": 0.84482789, + "learning_rate": 0.0008307353802953497, + "loss": 0.85634387, + "num_input_tokens_seen": 125219264, + "router_z_loss_mlp": 0.45361328, + "step": 1517, + "time_per_iteration": 2.672921895980835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171551, + "balance_loss_mlp": 1.12281811, + "epoch": 0.2920353982300885, + "flos": 630397375488.0, + "grad_norm": 0.060139597091390135, + "language_loss": 0.86612219, + "learning_rate": 0.0008305016684254125, + "loss": 0.87783766, + "num_input_tokens_seen": 125301904, + "router_z_loss_mlp": 0.48803711, + "step": 1518, + "time_per_iteration": 2.7845590114593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174317, + "balance_loss_mlp": 1.12947094, + "epoch": 0.29222777991535204, + "flos": 501662688768.0, + "grad_norm": 0.09151635615922826, + "language_loss": 0.87469971, + "learning_rate": 0.0008302678282433479, + "loss": 0.88644284, + "num_input_tokens_seen": 125367712, + "router_z_loss_mlp": 0.44848633, + "step": 1519, + "time_per_iteration": 2.562605619430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163342, + "balance_loss_mlp": 1.11999798, + "epoch": 0.2924201616006156, + "flos": 486785769984.0, + "grad_norm": 0.07068722957296131, + "language_loss": 0.85016668, + "learning_rate": 0.0008300338598399411, + "loss": 0.86180007, + "num_input_tokens_seen": 125437648, + "router_z_loss_mlp": 0.43359375, + "step": 1520, + "time_per_iteration": 2.61773943901062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155651, + "balance_loss_mlp": 1.11111403, + "epoch": 0.2926125432858792, + "flos": 476450449920.0, + "grad_norm": 0.07704766336953982, + "language_loss": 0.95187533, + "learning_rate": 0.0008297997633060263, + "loss": 0.96343178, + "num_input_tokens_seen": 125502432, + "router_z_loss_mlp": 0.44506836, + "step": 1521, + "time_per_iteration": 2.5206730365753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127672, + "balance_loss_mlp": 1.08468485, + "epoch": 0.29280492497114274, + "flos": 676675980288.0, + "grad_norm": 0.07256926042070597, + "language_loss": 0.85441822, + "learning_rate": 0.0008295655387324883, + "loss": 0.865695, + "num_input_tokens_seen": 125575424, + "router_z_loss_mlp": 0.42993164, + "step": 1522, + "time_per_iteration": 2.8186635971069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126011, + "balance_loss_mlp": 1.08090246, + "epoch": 0.29299730665640633, + "flos": 458408512512.0, + "grad_norm": 0.07210388942873598, + "language_loss": 0.8532753, + "learning_rate": 0.0008293311862102609, + "loss": 0.86453545, + "num_input_tokens_seen": 125639040, + "router_z_loss_mlp": 0.45092773, + "step": 1523, + "time_per_iteration": 2.4982752799987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118339, + "balance_loss_mlp": 1.07334912, + "epoch": 0.29318968834166986, + "flos": 446573274624.0, + "grad_norm": 0.0579845522804068, + "language_loss": 0.89434093, + "learning_rate": 0.0008290967058303275, + "loss": 0.90552431, + "num_input_tokens_seen": 125701712, + "router_z_loss_mlp": 0.44995117, + "step": 1524, + "time_per_iteration": 2.469200611114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116837, + "balance_loss_mlp": 1.07575774, + "epoch": 0.29338207002693345, + "flos": 450319025664.0, + "grad_norm": 0.07735764089304721, + "language_loss": 0.86793721, + "learning_rate": 0.0008288620976837219, + "loss": 0.87910557, + "num_input_tokens_seen": 125765088, + "router_z_loss_mlp": 0.41088867, + "step": 1525, + "time_per_iteration": 2.4877853393554688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112181, + "balance_loss_mlp": 1.06881261, + "epoch": 0.293574451712197, + "flos": 502277925888.0, + "grad_norm": 0.06064034312392981, + "language_loss": 0.83118868, + "learning_rate": 0.000828627361861527, + "loss": 0.84231043, + "num_input_tokens_seen": 125831328, + "router_z_loss_mlp": 0.43383789, + "step": 1526, + "time_per_iteration": 2.567406415939331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109149, + "balance_loss_mlp": 1.06620967, + "epoch": 0.29376683339746057, + "flos": 696462312960.0, + "grad_norm": 0.0729369607745646, + "language_loss": 0.84539104, + "learning_rate": 0.0008283924984548752, + "loss": 0.85648245, + "num_input_tokens_seen": 125903664, + "router_z_loss_mlp": 0.42919922, + "step": 1527, + "time_per_iteration": 2.8396716117858887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117649, + "balance_loss_mlp": 1.07480514, + "epoch": 0.2939592150827241, + "flos": 478590197760.0, + "grad_norm": 0.05516048868040139, + "language_loss": 0.85423326, + "learning_rate": 0.0008281575075549485, + "loss": 0.86540973, + "num_input_tokens_seen": 125971856, + "router_z_loss_mlp": 0.4284668, + "step": 1528, + "time_per_iteration": 2.596402645111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093475, + "balance_loss_mlp": 1.0787884, + "epoch": 0.2941515967679877, + "flos": 1485260831232.0, + "grad_norm": 0.03776357558455706, + "language_loss": 0.77352691, + "learning_rate": 0.000827922389252979, + "loss": 0.78446174, + "num_input_tokens_seen": 126183968, + "router_z_loss_mlp": 0.14648438, + "step": 1529, + "time_per_iteration": 4.641916513442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118614, + "balance_loss_mlp": 1.07436347, + "epoch": 0.2943439784532513, + "flos": 674158132224.0, + "grad_norm": 0.11599739785132454, + "language_loss": 0.90857148, + "learning_rate": 0.0008276871436402469, + "loss": 0.91975754, + "num_input_tokens_seen": 126254448, + "router_z_loss_mlp": 0.44238281, + "step": 1530, + "time_per_iteration": 2.8211593627929688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113901, + "balance_loss_mlp": 1.07239282, + "epoch": 0.2945363601385148, + "flos": 576301298688.0, + "grad_norm": 0.06834093724659761, + "language_loss": 0.87937176, + "learning_rate": 0.000827451770808083, + "loss": 0.8905108, + "num_input_tokens_seen": 126328208, + "router_z_loss_mlp": 0.41503906, + "step": 1531, + "time_per_iteration": 2.7127888202667236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106991, + "balance_loss_mlp": 1.06357539, + "epoch": 0.2947287418237784, + "flos": 480655793664.0, + "grad_norm": 0.06489723039655686, + "language_loss": 0.8385976, + "learning_rate": 0.0008272162708478674, + "loss": 0.84966749, + "num_input_tokens_seen": 126396464, + "router_z_loss_mlp": 0.43457031, + "step": 1532, + "time_per_iteration": 2.580057144165039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119293, + "balance_loss_mlp": 1.07749844, + "epoch": 0.2949211235090419, + "flos": 558185209344.0, + "grad_norm": 0.06938693493012958, + "language_loss": 0.86437017, + "learning_rate": 0.000826980643851029, + "loss": 0.87556309, + "num_input_tokens_seen": 126468960, + "router_z_loss_mlp": 0.41821289, + "step": 1533, + "time_per_iteration": 2.689450740814209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118363, + "balance_loss_mlp": 1.07518554, + "epoch": 0.2951135051943055, + "flos": 483887623680.0, + "grad_norm": 0.057495804655394826, + "language_loss": 0.85101378, + "learning_rate": 0.0008267448899090464, + "loss": 0.8621974, + "num_input_tokens_seen": 126536496, + "router_z_loss_mlp": 0.43188477, + "step": 1534, + "time_per_iteration": 2.5541234016418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139738, + "balance_loss_mlp": 1.09460509, + "epoch": 0.29530588687956905, + "flos": 550295783424.0, + "grad_norm": 0.0763188518859088, + "language_loss": 0.81071836, + "learning_rate": 0.0008265090091134473, + "loss": 0.82211578, + "num_input_tokens_seen": 126614048, + "router_z_loss_mlp": 0.45117188, + "step": 1535, + "time_per_iteration": 2.851494789123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136514, + "balance_loss_mlp": 1.09309804, + "epoch": 0.29549826856483263, + "flos": 673046226432.0, + "grad_norm": 0.06589165398662913, + "language_loss": 0.80565453, + "learning_rate": 0.0008262730015558088, + "loss": 0.8170197, + "num_input_tokens_seen": 126697248, + "router_z_loss_mlp": 0.43432617, + "step": 1536, + "time_per_iteration": 2.8671340942382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113965, + "balance_loss_mlp": 1.09423184, + "epoch": 0.29569065025009617, + "flos": 764666625024.0, + "grad_norm": 0.08099910548300644, + "language_loss": 0.82513618, + "learning_rate": 0.0008260368673277574, + "loss": 0.83653271, + "num_input_tokens_seen": 126782496, + "router_z_loss_mlp": 0.45410156, + "step": 1537, + "time_per_iteration": 3.114685297012329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134888, + "balance_loss_mlp": 1.08973145, + "epoch": 0.29588303193535975, + "flos": 543683819520.0, + "grad_norm": 0.06868209454347093, + "language_loss": 0.84501362, + "learning_rate": 0.0008258006065209682, + "loss": 0.85636258, + "num_input_tokens_seen": 126857328, + "router_z_loss_mlp": 0.45141602, + "step": 1538, + "time_per_iteration": 2.7343428134918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112017, + "balance_loss_mlp": 1.07341647, + "epoch": 0.29607541362062334, + "flos": 596947345920.0, + "grad_norm": 0.07819005704771397, + "language_loss": 0.80795646, + "learning_rate": 0.0008255642192271657, + "loss": 0.8191582, + "num_input_tokens_seen": 126932608, + "router_z_loss_mlp": 0.4675293, + "step": 1539, + "time_per_iteration": 2.7900264263153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123831, + "balance_loss_mlp": 1.0775305, + "epoch": 0.29626779530588687, + "flos": 609877237248.0, + "grad_norm": 0.06984070899888078, + "language_loss": 0.84251219, + "learning_rate": 0.0008253277055381241, + "loss": 0.85375053, + "num_input_tokens_seen": 127008928, + "router_z_loss_mlp": 0.46313477, + "step": 1540, + "time_per_iteration": 2.7936105728149414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126968, + "balance_loss_mlp": 1.08383858, + "epoch": 0.29646017699115046, + "flos": 867430674432.0, + "grad_norm": 0.09213105437911238, + "language_loss": 0.86479163, + "learning_rate": 0.0008250910655456658, + "loss": 0.87606132, + "num_input_tokens_seen": 127097104, + "router_z_loss_mlp": 0.43115234, + "step": 1541, + "time_per_iteration": 3.119706392288208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141818, + "balance_loss_mlp": 1.09723353, + "epoch": 0.296652558676414, + "flos": 495868594176.0, + "grad_norm": 0.06264221574110865, + "language_loss": 0.84348595, + "learning_rate": 0.0008248542993416625, + "loss": 0.85490412, + "num_input_tokens_seen": 127165264, + "router_z_loss_mlp": 0.44628906, + "step": 1542, + "time_per_iteration": 2.6273162364959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136977, + "balance_loss_mlp": 1.09224987, + "epoch": 0.2968449403616776, + "flos": 571544957952.0, + "grad_norm": 0.062187844768518095, + "language_loss": 0.838552, + "learning_rate": 0.0008246174070180352, + "loss": 0.84992176, + "num_input_tokens_seen": 127238992, + "router_z_loss_mlp": 0.44702148, + "step": 1543, + "time_per_iteration": 2.6559441089630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155532, + "balance_loss_mlp": 1.11099529, + "epoch": 0.2970373220469411, + "flos": 794168271360.0, + "grad_norm": 0.09249403217806111, + "language_loss": 0.84424686, + "learning_rate": 0.0008243803886667537, + "loss": 0.85580218, + "num_input_tokens_seen": 127328160, + "router_z_loss_mlp": 0.44506836, + "step": 1544, + "time_per_iteration": 3.161595582962036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155762, + "balance_loss_mlp": 1.11196482, + "epoch": 0.2972297037322047, + "flos": 661038091776.0, + "grad_norm": 0.11473976054569617, + "language_loss": 0.79569989, + "learning_rate": 0.0008241432443798364, + "loss": 0.80725753, + "num_input_tokens_seen": 127407328, + "router_z_loss_mlp": 0.43774414, + "step": 1545, + "time_per_iteration": 2.8056137561798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154045, + "balance_loss_mlp": 1.11160624, + "epoch": 0.29742208541746823, + "flos": 597125385216.0, + "grad_norm": 0.05050947415994233, + "language_loss": 0.86053026, + "learning_rate": 0.0008239059742493512, + "loss": 0.87207067, + "num_input_tokens_seen": 127477136, + "router_z_loss_mlp": 0.42456055, + "step": 1546, + "time_per_iteration": 2.6890687942504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146751, + "balance_loss_mlp": 1.10383546, + "epoch": 0.2976144671027318, + "flos": 769882558464.0, + "grad_norm": 0.060404475813103174, + "language_loss": 0.87675822, + "learning_rate": 0.0008236685783674142, + "loss": 0.88822567, + "num_input_tokens_seen": 127565680, + "router_z_loss_mlp": 0.42944336, + "step": 1547, + "time_per_iteration": 3.0594639778137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176135, + "balance_loss_mlp": 1.15439153, + "epoch": 0.2978068487879954, + "flos": 1484764162560.0, + "grad_norm": 0.05730794129930028, + "language_loss": 0.76221192, + "learning_rate": 0.0008234310568261911, + "loss": 0.77397329, + "num_input_tokens_seen": 127791584, + "router_z_loss_mlp": 0.21777344, + "step": 1548, + "time_per_iteration": 4.907459020614624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115635, + "balance_loss_mlp": 1.11174202, + "epoch": 0.29799923047325894, + "flos": 475328632320.0, + "grad_norm": 0.08902597202075696, + "language_loss": 0.82813615, + "learning_rate": 0.0008231934097178955, + "loss": 0.83969963, + "num_input_tokens_seen": 127860112, + "router_z_loss_mlp": 0.44604492, + "step": 1549, + "time_per_iteration": 2.622082471847534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147675, + "balance_loss_mlp": 1.1013267, + "epoch": 0.2981916121585225, + "flos": 759804198912.0, + "grad_norm": 0.06733871211748228, + "language_loss": 0.85700476, + "learning_rate": 0.0008229556371347903, + "loss": 0.86848152, + "num_input_tokens_seen": 127938752, + "router_z_loss_mlp": 0.46362305, + "step": 1550, + "time_per_iteration": 3.0081942081451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133769, + "balance_loss_mlp": 1.09018564, + "epoch": 0.29838399384378606, + "flos": 875016152064.0, + "grad_norm": 0.09176779567237862, + "language_loss": 0.79384351, + "learning_rate": 0.0008227177391691874, + "loss": 0.80518115, + "num_input_tokens_seen": 128022192, + "router_z_loss_mlp": 0.43554688, + "step": 1551, + "time_per_iteration": 3.1698648929595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126053, + "balance_loss_mlp": 1.08218408, + "epoch": 0.29857637552904964, + "flos": 579661608960.0, + "grad_norm": 0.07033401560901072, + "language_loss": 0.89799201, + "learning_rate": 0.0008224797159134463, + "loss": 0.90925252, + "num_input_tokens_seen": 128097776, + "router_z_loss_mlp": 0.4387207, + "step": 1552, + "time_per_iteration": 2.714494228363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118143, + "balance_loss_mlp": 1.07816052, + "epoch": 0.2987687572143132, + "flos": 836399748096.0, + "grad_norm": 0.05144631995573129, + "language_loss": 0.83942962, + "learning_rate": 0.0008222415674599765, + "loss": 0.85061103, + "num_input_tokens_seen": 128179888, + "router_z_loss_mlp": 0.39990234, + "step": 1553, + "time_per_iteration": 3.0642828941345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130247, + "balance_loss_mlp": 1.08563888, + "epoch": 0.29896113889957676, + "flos": 567072741888.0, + "grad_norm": 0.07574846124683007, + "language_loss": 0.83871847, + "learning_rate": 0.0008220032939012349, + "loss": 0.85002089, + "num_input_tokens_seen": 128251152, + "router_z_loss_mlp": 0.44628906, + "step": 1554, + "time_per_iteration": 2.714172840118408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129637, + "balance_loss_mlp": 1.08810425, + "epoch": 0.29915352058484035, + "flos": 498662853120.0, + "grad_norm": 0.05026836342639273, + "language_loss": 0.8851645, + "learning_rate": 0.0008217648953297277, + "loss": 0.89646089, + "num_input_tokens_seen": 128327600, + "router_z_loss_mlp": 0.41503906, + "step": 1555, + "time_per_iteration": 2.8413305282592773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139651, + "balance_loss_mlp": 1.09692693, + "epoch": 0.2993459022701039, + "flos": 592112083968.0, + "grad_norm": 0.07726233455877282, + "language_loss": 0.78621179, + "learning_rate": 0.0008215263718380095, + "loss": 0.79760832, + "num_input_tokens_seen": 128398432, + "router_z_loss_mlp": 0.42749023, + "step": 1556, + "time_per_iteration": 2.6995439529418945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153013, + "balance_loss_mlp": 1.10766625, + "epoch": 0.29953828395536747, + "flos": 572380079616.0, + "grad_norm": 0.07367356569931041, + "language_loss": 0.8461448, + "learning_rate": 0.0008212877235186833, + "loss": 0.85767496, + "num_input_tokens_seen": 128469696, + "router_z_loss_mlp": 0.45361328, + "step": 1557, + "time_per_iteration": 2.655294895172119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105489, + "balance_loss_mlp": 1.09290004, + "epoch": 0.299730665640631, + "flos": 1504698425856.0, + "grad_norm": 0.039126881386902713, + "language_loss": 0.77737558, + "learning_rate": 0.0008210489504644005, + "loss": 0.78843045, + "num_input_tokens_seen": 128698560, + "router_z_loss_mlp": 0.12597656, + "step": 1558, + "time_per_iteration": 4.953773021697998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148338, + "balance_loss_mlp": 1.10647154, + "epoch": 0.2999230473258946, + "flos": 513791963136.0, + "grad_norm": 0.07045252665170362, + "language_loss": 0.81300378, + "learning_rate": 0.0008208100527678611, + "loss": 0.82448721, + "num_input_tokens_seen": 128765952, + "router_z_loss_mlp": 0.41870117, + "step": 1559, + "time_per_iteration": 2.5706257820129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142691, + "balance_loss_mlp": 1.10223174, + "epoch": 0.3001154290111581, + "flos": 834472544256.0, + "grad_norm": 0.09371754463761041, + "language_loss": 0.79173958, + "learning_rate": 0.0008205710305218135, + "loss": 0.80316657, + "num_input_tokens_seen": 128840048, + "router_z_loss_mlp": 0.40454102, + "step": 1560, + "time_per_iteration": 3.001490354537964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152428, + "balance_loss_mlp": 1.11292171, + "epoch": 0.3003078106964217, + "flos": 556776695808.0, + "grad_norm": 0.06044421333553386, + "language_loss": 0.90459639, + "learning_rate": 0.0008203318838190541, + "loss": 0.91612065, + "num_input_tokens_seen": 128912496, + "router_z_loss_mlp": 0.39501953, + "step": 1561, + "time_per_iteration": 2.753243923187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166566, + "balance_loss_mlp": 1.1229353, + "epoch": 0.30050019238168524, + "flos": 526151033856.0, + "grad_norm": 0.07449479195038491, + "language_loss": 0.85542631, + "learning_rate": 0.0008200926127524281, + "loss": 0.86709195, + "num_input_tokens_seen": 128980624, + "router_z_loss_mlp": 0.43676758, + "step": 1562, + "time_per_iteration": 2.6388282775878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184921, + "balance_loss_mlp": 1.14045644, + "epoch": 0.3006925740669488, + "flos": 577852973568.0, + "grad_norm": 0.07268784417656445, + "language_loss": 0.83160597, + "learning_rate": 0.0008198532174148289, + "loss": 0.8434552, + "num_input_tokens_seen": 129050576, + "router_z_loss_mlp": 0.44482422, + "step": 1563, + "time_per_iteration": 2.71712589263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076623, + "balance_loss_mlp": 1.06308043, + "epoch": 0.3008849557522124, + "flos": 1490246595072.0, + "grad_norm": 0.03416296623034226, + "language_loss": 0.8068617, + "learning_rate": 0.0008196136978991977, + "loss": 0.81762791, + "num_input_tokens_seen": 129278880, + "router_z_loss_mlp": 0.13574219, + "step": 1564, + "time_per_iteration": 4.830719232559204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194058, + "balance_loss_mlp": 1.15185785, + "epoch": 0.30107733743747594, + "flos": 509816415744.0, + "grad_norm": 0.08914748552149089, + "language_loss": 0.88889605, + "learning_rate": 0.0008193740542985244, + "loss": 0.90083665, + "num_input_tokens_seen": 129346560, + "router_z_loss_mlp": 0.421875, + "step": 1565, + "time_per_iteration": 2.6047041416168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199035, + "balance_loss_mlp": 1.15647733, + "epoch": 0.30126971912273953, + "flos": 587704108032.0, + "grad_norm": 0.07863054385005203, + "language_loss": 0.8685202, + "learning_rate": 0.0008191342867058467, + "loss": 0.88051057, + "num_input_tokens_seen": 129420448, + "router_z_loss_mlp": 0.42578125, + "step": 1566, + "time_per_iteration": 2.715708017349243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196816, + "balance_loss_mlp": 1.15280378, + "epoch": 0.30146210080800306, + "flos": 602101610496.0, + "grad_norm": 0.087093537774187, + "language_loss": 0.83839655, + "learning_rate": 0.0008188943952142509, + "loss": 0.85036469, + "num_input_tokens_seen": 129494032, + "router_z_loss_mlp": 0.43994141, + "step": 1567, + "time_per_iteration": 2.831888198852539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118972, + "balance_loss_mlp": 1.14663815, + "epoch": 0.30165448249326665, + "flos": 917796054528.0, + "grad_norm": 0.09637975850341399, + "language_loss": 0.82476509, + "learning_rate": 0.0008186543799168711, + "loss": 0.83666229, + "num_input_tokens_seen": 129569088, + "router_z_loss_mlp": 0.43041992, + "step": 1568, + "time_per_iteration": 3.121755599975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177064, + "balance_loss_mlp": 1.13324285, + "epoch": 0.3018468641785302, + "flos": 777287798784.0, + "grad_norm": 0.08024736909630528, + "language_loss": 0.88665748, + "learning_rate": 0.0008184142409068892, + "loss": 0.89842814, + "num_input_tokens_seen": 129647968, + "router_z_loss_mlp": 0.43847656, + "step": 1569, + "time_per_iteration": 2.990497350692749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163968, + "balance_loss_mlp": 1.12343669, + "epoch": 0.30203924586379377, + "flos": 522358295040.0, + "grad_norm": 0.05684047424393967, + "language_loss": 0.86850333, + "learning_rate": 0.000818173978277536, + "loss": 0.88014305, + "num_input_tokens_seen": 129718928, + "router_z_loss_mlp": 0.40551758, + "step": 1570, + "time_per_iteration": 2.636310338973999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171599, + "balance_loss_mlp": 1.12956595, + "epoch": 0.3022316275490573, + "flos": 524559711744.0, + "grad_norm": 0.07636807389642969, + "language_loss": 0.84349716, + "learning_rate": 0.000817933592122089, + "loss": 0.85521317, + "num_input_tokens_seen": 129790128, + "router_z_loss_mlp": 0.4206543, + "step": 1571, + "time_per_iteration": 2.699178695678711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163998, + "balance_loss_mlp": 1.11984301, + "epoch": 0.3024240092343209, + "flos": 479912076288.0, + "grad_norm": 0.07546742874281152, + "language_loss": 0.83585215, + "learning_rate": 0.0008176930825338749, + "loss": 0.8474921, + "num_input_tokens_seen": 129857536, + "router_z_loss_mlp": 0.44189453, + "step": 1572, + "time_per_iteration": 2.550837516784668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166441, + "balance_loss_mlp": 1.12385964, + "epoch": 0.3026163909195845, + "flos": 687206592000.0, + "grad_norm": 0.07092433148156627, + "language_loss": 0.89086282, + "learning_rate": 0.0008174524496062679, + "loss": 0.90252721, + "num_input_tokens_seen": 129931440, + "router_z_loss_mlp": 0.42578125, + "step": 1573, + "time_per_iteration": 2.883683919906616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116421, + "balance_loss_mlp": 1.11907697, + "epoch": 0.302808772604848, + "flos": 542940102144.0, + "grad_norm": 0.061103918995996154, + "language_loss": 0.8587321, + "learning_rate": 0.0008172116934326894, + "loss": 0.8703742, + "num_input_tokens_seen": 130005200, + "router_z_loss_mlp": 0.45092773, + "step": 1574, + "time_per_iteration": 2.7379467487335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162954, + "balance_loss_mlp": 1.12132585, + "epoch": 0.3030011542901116, + "flos": 475091495424.0, + "grad_norm": 0.07023429776023385, + "language_loss": 0.87709713, + "learning_rate": 0.0008169708141066097, + "loss": 0.88872665, + "num_input_tokens_seen": 130069136, + "router_z_loss_mlp": 0.41625977, + "step": 1575, + "time_per_iteration": 2.571963310241699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154168, + "balance_loss_mlp": 1.11435199, + "epoch": 0.30319353597537513, + "flos": 481481003520.0, + "grad_norm": 0.11601472076904104, + "language_loss": 0.90864658, + "learning_rate": 0.0008167298117215465, + "loss": 0.92018831, + "num_input_tokens_seen": 130135456, + "router_z_loss_mlp": 0.39819336, + "step": 1576, + "time_per_iteration": 2.562636375427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153517, + "balance_loss_mlp": 1.11141217, + "epoch": 0.3033859176606387, + "flos": 704786365440.0, + "grad_norm": 0.08960201833145559, + "language_loss": 0.88355744, + "learning_rate": 0.0008164886863710649, + "loss": 0.89509267, + "num_input_tokens_seen": 130213712, + "router_z_loss_mlp": 0.42138672, + "step": 1577, + "time_per_iteration": 2.921163320541382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151824, + "balance_loss_mlp": 1.11212754, + "epoch": 0.30357829934590225, + "flos": 764696360448.0, + "grad_norm": 0.07034131144929774, + "language_loss": 0.86199445, + "learning_rate": 0.0008162474381487783, + "loss": 0.87351274, + "num_input_tokens_seen": 130290928, + "router_z_loss_mlp": 0.39697266, + "step": 1578, + "time_per_iteration": 3.029076337814331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152316, + "balance_loss_mlp": 1.11016417, + "epoch": 0.30377068103116583, + "flos": 532355162112.0, + "grad_norm": 0.07584256466560314, + "language_loss": 0.85196549, + "learning_rate": 0.0008160060671483475, + "loss": 0.86348867, + "num_input_tokens_seen": 130362672, + "router_z_loss_mlp": 0.42163086, + "step": 1579, + "time_per_iteration": 2.7073986530303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142614, + "balance_loss_mlp": 1.10289371, + "epoch": 0.3039630627164294, + "flos": 510191944704.0, + "grad_norm": 0.08686038732079729, + "language_loss": 0.83729678, + "learning_rate": 0.0008157645734634809, + "loss": 0.84872293, + "num_input_tokens_seen": 130428848, + "router_z_loss_mlp": 0.3972168, + "step": 1580, + "time_per_iteration": 2.6613049507141113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090857, + "balance_loss_mlp": 1.07302368, + "epoch": 0.30415544440169295, + "flos": 1506000854016.0, + "grad_norm": 0.0332286598930082, + "language_loss": 0.76896489, + "learning_rate": 0.000815522957187935, + "loss": 0.77987349, + "num_input_tokens_seen": 130665440, + "router_z_loss_mlp": 0.17871094, + "step": 1581, + "time_per_iteration": 4.915473699569702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074598, + "balance_loss_mlp": 1.05705047, + "epoch": 0.30434782608695654, + "flos": 1458736625664.0, + "grad_norm": 0.028649014265593315, + "language_loss": 0.73214495, + "learning_rate": 0.0008152812184155132, + "loss": 0.74289095, + "num_input_tokens_seen": 130895248, + "router_z_loss_mlp": 0.17578125, + "step": 1582, + "time_per_iteration": 4.889309883117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129405, + "balance_loss_mlp": 1.08827806, + "epoch": 0.3045402077722201, + "flos": 482555833344.0, + "grad_norm": 0.06812522797045092, + "language_loss": 0.84052569, + "learning_rate": 0.000815039357240067, + "loss": 0.85181975, + "num_input_tokens_seen": 130964544, + "router_z_loss_mlp": 0.41113281, + "step": 1583, + "time_per_iteration": 2.6366286277770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138467, + "balance_loss_mlp": 1.09672034, + "epoch": 0.30473258945748366, + "flos": 543501010944.0, + "grad_norm": 0.06492424308297744, + "language_loss": 0.85869169, + "learning_rate": 0.0008147973737554952, + "loss": 0.87007636, + "num_input_tokens_seen": 131041744, + "router_z_loss_mlp": 0.41748047, + "step": 1584, + "time_per_iteration": 2.7854599952697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136804, + "balance_loss_mlp": 1.095963, + "epoch": 0.3049249711427472, + "flos": 567055489536.0, + "grad_norm": 0.08202571879527615, + "language_loss": 0.86834013, + "learning_rate": 0.000814555268055744, + "loss": 0.87970817, + "num_input_tokens_seen": 131108864, + "router_z_loss_mlp": 0.40844727, + "step": 1585, + "time_per_iteration": 2.6199045181274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132861, + "balance_loss_mlp": 1.09130502, + "epoch": 0.3051173528280108, + "flos": 528233882112.0, + "grad_norm": 0.07393752668393892, + "language_loss": 0.87929702, + "learning_rate": 0.0008143130402348073, + "loss": 0.89062566, + "num_input_tokens_seen": 131181104, + "router_z_loss_mlp": 0.41625977, + "step": 1586, + "time_per_iteration": 2.638741970062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129239, + "balance_loss_mlp": 1.08868384, + "epoch": 0.3053097345132743, + "flos": 586396910592.0, + "grad_norm": 0.06849121050203105, + "language_loss": 0.7939502, + "learning_rate": 0.0008140706903867265, + "loss": 0.80524254, + "num_input_tokens_seen": 131258704, + "router_z_loss_mlp": 0.4050293, + "step": 1587, + "time_per_iteration": 2.810335874557495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134042, + "balance_loss_mlp": 1.0908649, + "epoch": 0.3055021161985379, + "flos": 607087747584.0, + "grad_norm": 0.07851663365650921, + "language_loss": 0.91122121, + "learning_rate": 0.0008138282186055897, + "loss": 0.92256165, + "num_input_tokens_seen": 131325712, + "router_z_loss_mlp": 0.43188477, + "step": 1588, + "time_per_iteration": 2.7237448692321777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137411, + "balance_loss_mlp": 1.09661722, + "epoch": 0.3056944978838015, + "flos": 573867514368.0, + "grad_norm": 0.06832590097240848, + "language_loss": 0.8307212, + "learning_rate": 0.0008135856249855331, + "loss": 0.84209532, + "num_input_tokens_seen": 131397568, + "router_z_loss_mlp": 0.40771484, + "step": 1589, + "time_per_iteration": 2.7399301528930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153972, + "balance_loss_mlp": 1.11241579, + "epoch": 0.305886879569065, + "flos": 633925813248.0, + "grad_norm": 0.09162978556143483, + "language_loss": 0.89933717, + "learning_rate": 0.0008133429096207398, + "loss": 0.91087687, + "num_input_tokens_seen": 131467632, + "router_z_loss_mlp": 0.41577148, + "step": 1590, + "time_per_iteration": 2.8074302673339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01029225, + "balance_loss_mlp": 1.0156827, + "epoch": 0.3060792612543286, + "flos": 1369005981696.0, + "grad_norm": 0.025543227678258826, + "language_loss": 0.75312257, + "learning_rate": 0.0008131000726054403, + "loss": 0.76341486, + "num_input_tokens_seen": 131702224, + "router_z_loss_mlp": 0.13574219, + "step": 1591, + "time_per_iteration": 4.961095094680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153411, + "balance_loss_mlp": 1.11330891, + "epoch": 0.30627164293959214, + "flos": 518555644416.0, + "grad_norm": 0.05628096053427355, + "language_loss": 0.87358719, + "learning_rate": 0.0008128571140339123, + "loss": 0.88512129, + "num_input_tokens_seen": 131774608, + "router_z_loss_mlp": 0.40087891, + "step": 1592, + "time_per_iteration": 2.6484899520874023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137482, + "balance_loss_mlp": 1.09497237, + "epoch": 0.3064640246248557, + "flos": 455589287424.0, + "grad_norm": 0.058132540851188214, + "language_loss": 0.87688839, + "learning_rate": 0.0008126140340004805, + "loss": 0.88826323, + "num_input_tokens_seen": 131841216, + "router_z_loss_mlp": 0.42529297, + "step": 1593, + "time_per_iteration": 2.509239912033081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144438, + "balance_loss_mlp": 1.10316801, + "epoch": 0.30665640631011926, + "flos": 850095378432.0, + "grad_norm": 0.06371804566889869, + "language_loss": 0.82466245, + "learning_rate": 0.0008123708325995172, + "loss": 0.83610678, + "num_input_tokens_seen": 131937584, + "router_z_loss_mlp": 0.4128418, + "step": 1594, + "time_per_iteration": 3.1773130893707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133345, + "balance_loss_mlp": 1.09240818, + "epoch": 0.30684878799538284, + "flos": 758319335424.0, + "grad_norm": 0.06060698504548286, + "language_loss": 0.79972136, + "learning_rate": 0.0008121275099254414, + "loss": 0.81105477, + "num_input_tokens_seen": 132012656, + "router_z_loss_mlp": 0.40942383, + "step": 1595, + "time_per_iteration": 2.9426517486572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142693, + "balance_loss_mlp": 1.10244751, + "epoch": 0.3070411696806464, + "flos": 517574790144.0, + "grad_norm": 0.06149446857353131, + "language_loss": 0.88748306, + "learning_rate": 0.0008118840660727194, + "loss": 0.89890993, + "num_input_tokens_seen": 132083728, + "router_z_loss_mlp": 0.40283203, + "step": 1596, + "time_per_iteration": 2.665166139602661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135904, + "balance_loss_mlp": 1.09553957, + "epoch": 0.30723355136590996, + "flos": 844264207872.0, + "grad_norm": 0.15751252363629464, + "language_loss": 0.88104224, + "learning_rate": 0.0008116405011358644, + "loss": 0.89240128, + "num_input_tokens_seen": 132170896, + "router_z_loss_mlp": 0.40380859, + "step": 1597, + "time_per_iteration": 3.1415486335754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145189, + "balance_loss_mlp": 1.10291696, + "epoch": 0.30742593305117355, + "flos": 466139722752.0, + "grad_norm": 0.06428245482632208, + "language_loss": 0.80117774, + "learning_rate": 0.0008113968152094369, + "loss": 0.81262958, + "num_input_tokens_seen": 132234592, + "router_z_loss_mlp": 0.42285156, + "step": 1598, + "time_per_iteration": 2.50484037399292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140725, + "balance_loss_mlp": 1.09781003, + "epoch": 0.3076183147364371, + "flos": 686591354880.0, + "grad_norm": 0.069373282908973, + "language_loss": 0.82692802, + "learning_rate": 0.0008111530083880438, + "loss": 0.83833528, + "num_input_tokens_seen": 132314720, + "router_z_loss_mlp": 0.42895508, + "step": 1599, + "time_per_iteration": 2.9072136878967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155699, + "balance_loss_mlp": 1.11211586, + "epoch": 0.30781069642170067, + "flos": 614018340864.0, + "grad_norm": 0.09326308305844169, + "language_loss": 0.86715603, + "learning_rate": 0.0008109090807663399, + "loss": 0.87871301, + "num_input_tokens_seen": 132388768, + "router_z_loss_mlp": 0.43554688, + "step": 1600, + "time_per_iteration": 2.8556277751922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154517, + "balance_loss_mlp": 1.1142, + "epoch": 0.3080030781069642, + "flos": 590318129664.0, + "grad_norm": 0.07163974647376076, + "language_loss": 0.89029115, + "learning_rate": 0.0008106650324390257, + "loss": 0.90183634, + "num_input_tokens_seen": 132472544, + "router_z_loss_mlp": 0.40307617, + "step": 1601, + "time_per_iteration": 2.8016483783721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115055, + "balance_loss_mlp": 1.10768259, + "epoch": 0.3081954597922278, + "flos": 562620349440.0, + "grad_norm": 0.06437682840273379, + "language_loss": 0.81480461, + "learning_rate": 0.0008104208635008493, + "loss": 0.82631016, + "num_input_tokens_seen": 132541968, + "router_z_loss_mlp": 0.42871094, + "step": 1602, + "time_per_iteration": 2.6587209701538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150496, + "balance_loss_mlp": 1.10631728, + "epoch": 0.3083878414774913, + "flos": 447830913024.0, + "grad_norm": 0.13502170342564263, + "language_loss": 0.8243258, + "learning_rate": 0.0008101765740466058, + "loss": 0.83583081, + "num_input_tokens_seen": 132606976, + "router_z_loss_mlp": 0.44165039, + "step": 1603, + "time_per_iteration": 2.506427049636841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144916, + "balance_loss_mlp": 1.10135674, + "epoch": 0.3085802231627549, + "flos": 493546037760.0, + "grad_norm": 0.0649160929519563, + "language_loss": 0.84340334, + "learning_rate": 0.0008099321641711364, + "loss": 0.85485256, + "num_input_tokens_seen": 132677984, + "router_z_loss_mlp": 0.43579102, + "step": 1604, + "time_per_iteration": 2.6318166255950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151756, + "balance_loss_mlp": 1.10938883, + "epoch": 0.3087726048480185, + "flos": 487687703040.0, + "grad_norm": 0.0523010874933109, + "language_loss": 0.83940029, + "learning_rate": 0.0008096876339693295, + "loss": 0.85091782, + "num_input_tokens_seen": 132749136, + "router_z_loss_mlp": 0.42407227, + "step": 1605, + "time_per_iteration": 2.620199680328369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150228, + "balance_loss_mlp": 1.1086241, + "epoch": 0.308964986533282, + "flos": 730589248512.0, + "grad_norm": 0.07539888612246932, + "language_loss": 0.8184768, + "learning_rate": 0.0008094429835361206, + "loss": 0.82997912, + "num_input_tokens_seen": 132823824, + "router_z_loss_mlp": 0.41625977, + "step": 1606, + "time_per_iteration": 2.9251575469970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147276, + "balance_loss_mlp": 1.10679281, + "epoch": 0.3091573682185456, + "flos": 605407592448.0, + "grad_norm": 0.07700051037162058, + "language_loss": 0.85932112, + "learning_rate": 0.0008091982129664908, + "loss": 0.87079388, + "num_input_tokens_seen": 132895936, + "router_z_loss_mlp": 0.40478516, + "step": 1607, + "time_per_iteration": 2.7032129764556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169169, + "balance_loss_mlp": 1.12427497, + "epoch": 0.30934974990380915, + "flos": 460325804544.0, + "grad_norm": 0.11394505928871175, + "language_loss": 0.83292013, + "learning_rate": 0.0008089533223554687, + "loss": 0.84461182, + "num_input_tokens_seen": 132968960, + "router_z_loss_mlp": 0.44897461, + "step": 1608, + "time_per_iteration": 2.6975207328796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161949, + "balance_loss_mlp": 1.12115526, + "epoch": 0.30954213158907273, + "flos": 553426297344.0, + "grad_norm": 0.06275490202685644, + "language_loss": 0.85402906, + "learning_rate": 0.0008087083117981294, + "loss": 0.86564851, + "num_input_tokens_seen": 133048448, + "router_z_loss_mlp": 0.40795898, + "step": 1609, + "time_per_iteration": 2.8709142208099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158469, + "balance_loss_mlp": 1.11402774, + "epoch": 0.30973451327433627, + "flos": 553043427840.0, + "grad_norm": 0.06357956742359384, + "language_loss": 0.88521934, + "learning_rate": 0.0008084631813895943, + "loss": 0.89680409, + "num_input_tokens_seen": 133121680, + "router_z_loss_mlp": 0.44433594, + "step": 1610, + "time_per_iteration": 2.7704904079437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148821, + "balance_loss_mlp": 1.1059773, + "epoch": 0.30992689495959985, + "flos": 565696535040.0, + "grad_norm": 0.07818022356789546, + "language_loss": 0.84349322, + "learning_rate": 0.0008082179312250315, + "loss": 0.85498142, + "num_input_tokens_seen": 133190176, + "router_z_loss_mlp": 0.42871094, + "step": 1611, + "time_per_iteration": 2.6352171897888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118188, + "balance_loss_mlp": 1.10588562, + "epoch": 0.3101192766448634, + "flos": 1442406776832.0, + "grad_norm": 0.03204939869531237, + "language_loss": 0.79855847, + "learning_rate": 0.0008079725613996555, + "loss": 0.8097403, + "num_input_tokens_seen": 133420512, + "router_z_loss_mlp": 0.12255859, + "step": 1612, + "time_per_iteration": 4.865812301635742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095093, + "balance_loss_mlp": 1.08288634, + "epoch": 0.31031165833012697, + "flos": 1531892570112.0, + "grad_norm": 0.024031397097536, + "language_loss": 0.76629329, + "learning_rate": 0.0008077270720087273, + "loss": 0.77724421, + "num_input_tokens_seen": 133651984, + "router_z_loss_mlp": 0.12207031, + "step": 1613, + "time_per_iteration": 5.057459831237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163336, + "balance_loss_mlp": 1.12020612, + "epoch": 0.31050404001539056, + "flos": 991952676864.0, + "grad_norm": 0.056757119691581794, + "language_loss": 0.82255232, + "learning_rate": 0.0008074814631475545, + "loss": 0.83418566, + "num_input_tokens_seen": 133741648, + "router_z_loss_mlp": 0.43139648, + "step": 1614, + "time_per_iteration": 3.3026204109191895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164621, + "balance_loss_mlp": 1.12153852, + "epoch": 0.3106964217006541, + "flos": 445992542208.0, + "grad_norm": 0.0685570598787085, + "language_loss": 0.79806983, + "learning_rate": 0.0008072357349114907, + "loss": 0.80971605, + "num_input_tokens_seen": 133813344, + "router_z_loss_mlp": 0.4309082, + "step": 1615, + "time_per_iteration": 2.663853645324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187873, + "balance_loss_mlp": 1.14369345, + "epoch": 0.3108888033859177, + "flos": 510505804800.0, + "grad_norm": 0.06371446427292905, + "language_loss": 0.8904891, + "learning_rate": 0.0008069898873959363, + "loss": 0.90236783, + "num_input_tokens_seen": 133884192, + "router_z_loss_mlp": 0.44189453, + "step": 1616, + "time_per_iteration": 2.675607919692993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199097, + "balance_loss_mlp": 1.15773141, + "epoch": 0.3110811850711812, + "flos": 520732468224.0, + "grad_norm": 0.10138062428343411, + "language_loss": 0.8626408, + "learning_rate": 0.0008067439206963375, + "loss": 0.87463176, + "num_input_tokens_seen": 133954848, + "router_z_loss_mlp": 0.41381836, + "step": 1617, + "time_per_iteration": 2.6264841556549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193178, + "balance_loss_mlp": 1.15119278, + "epoch": 0.3112735667564448, + "flos": 686413315584.0, + "grad_norm": 0.06654120721966555, + "language_loss": 0.8650856, + "learning_rate": 0.0008064978349081873, + "loss": 0.87701744, + "num_input_tokens_seen": 134031824, + "router_z_loss_mlp": 0.41967773, + "step": 1618, + "time_per_iteration": 2.9114232063293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180658, + "balance_loss_mlp": 1.13712287, + "epoch": 0.31146594844170833, + "flos": 533061803520.0, + "grad_norm": 0.06279818174684408, + "language_loss": 0.86905777, + "learning_rate": 0.0008062516301270245, + "loss": 0.88086432, + "num_input_tokens_seen": 134104480, + "router_z_loss_mlp": 0.43530273, + "step": 1619, + "time_per_iteration": 2.697016477584839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174783, + "balance_loss_mlp": 1.13341749, + "epoch": 0.3116583301269719, + "flos": 679517227008.0, + "grad_norm": 0.07259268941115717, + "language_loss": 0.89074606, + "learning_rate": 0.0008060053064484343, + "loss": 0.90249389, + "num_input_tokens_seen": 134185632, + "router_z_loss_mlp": 0.41381836, + "step": 1620, + "time_per_iteration": 2.9220941066741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160759, + "balance_loss_mlp": 1.11996579, + "epoch": 0.31185071181223545, + "flos": 586149861888.0, + "grad_norm": 0.054906942105454146, + "language_loss": 0.85286081, + "learning_rate": 0.0008057588639680482, + "loss": 0.8644684, + "num_input_tokens_seen": 134261600, + "router_z_loss_mlp": 0.40795898, + "step": 1621, + "time_per_iteration": 2.7432475090026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161698, + "balance_loss_mlp": 1.11754274, + "epoch": 0.31204309349749904, + "flos": 725403050496.0, + "grad_norm": 0.08428579582226577, + "language_loss": 0.83045304, + "learning_rate": 0.0008055123027815434, + "loss": 0.84207004, + "num_input_tokens_seen": 134334368, + "router_z_loss_mlp": 0.44165039, + "step": 1622, + "time_per_iteration": 2.888124465942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149406, + "balance_loss_mlp": 1.10947073, + "epoch": 0.3122354751827626, + "flos": 576825131520.0, + "grad_norm": 0.06442378780427988, + "language_loss": 0.85635763, + "learning_rate": 0.0008052656229846436, + "loss": 0.86785173, + "num_input_tokens_seen": 134403824, + "router_z_loss_mlp": 0.39916992, + "step": 1623, + "time_per_iteration": 2.7215354442596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154937, + "balance_loss_mlp": 1.11259365, + "epoch": 0.31242785686802615, + "flos": 575943022080.0, + "grad_norm": 0.1013205930173775, + "language_loss": 0.90875685, + "learning_rate": 0.0008050188246731182, + "loss": 0.92030621, + "num_input_tokens_seen": 134471296, + "router_z_loss_mlp": 0.42333984, + "step": 1624, + "time_per_iteration": 2.6636321544647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146822, + "balance_loss_mlp": 1.10655355, + "epoch": 0.31262023855328974, + "flos": 736830452736.0, + "grad_norm": 0.08961406202901398, + "language_loss": 0.82641953, + "learning_rate": 0.0008047719079427834, + "loss": 0.83788776, + "num_input_tokens_seen": 134551360, + "router_z_loss_mlp": 0.40283203, + "step": 1625, + "time_per_iteration": 2.9943442344665527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067888, + "balance_loss_mlp": 1.05425012, + "epoch": 0.3128126202385533, + "flos": 1559232073728.0, + "grad_norm": 0.02225722433359613, + "language_loss": 0.74351704, + "learning_rate": 0.0008045248728895, + "loss": 0.75419593, + "num_input_tokens_seen": 134761328, + "router_z_loss_mlp": 0.13671875, + "step": 1626, + "time_per_iteration": 4.865052700042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124122, + "balance_loss_mlp": 1.0819937, + "epoch": 0.31300500192381686, + "flos": 514921121280.0, + "grad_norm": 0.05828883069087806, + "language_loss": 0.86570215, + "learning_rate": 0.0008042777196091757, + "loss": 0.87694335, + "num_input_tokens_seen": 134833136, + "router_z_loss_mlp": 0.42138672, + "step": 1627, + "time_per_iteration": 2.668349266052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127492, + "balance_loss_mlp": 1.08481538, + "epoch": 0.3131973836090804, + "flos": 526627878912.0, + "grad_norm": 0.08399253674550058, + "language_loss": 0.82332879, + "learning_rate": 0.0008040304481977643, + "loss": 0.83460367, + "num_input_tokens_seen": 134904352, + "router_z_loss_mlp": 0.42675781, + "step": 1628, + "time_per_iteration": 2.6445093154907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130913, + "balance_loss_mlp": 1.09224153, + "epoch": 0.313389765294344, + "flos": 822820114944.0, + "grad_norm": 0.06122809929096989, + "language_loss": 0.86751842, + "learning_rate": 0.0008037830587512649, + "loss": 0.87882763, + "num_input_tokens_seen": 134984880, + "router_z_loss_mlp": 0.38671875, + "step": 1629, + "time_per_iteration": 3.0830209255218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131503, + "balance_loss_mlp": 1.09068549, + "epoch": 0.31358214697960757, + "flos": 393823669248.0, + "grad_norm": 0.06235185724616104, + "language_loss": 0.7940957, + "learning_rate": 0.0008035355513657224, + "loss": 0.80541074, + "num_input_tokens_seen": 135047456, + "router_z_loss_mlp": 0.40820312, + "step": 1630, + "time_per_iteration": 2.4804115295410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135326, + "balance_loss_mlp": 1.09326935, + "epoch": 0.3137745286648711, + "flos": 571908003840.0, + "grad_norm": 0.06249119555349938, + "language_loss": 0.9321425, + "learning_rate": 0.0008032879261372279, + "loss": 0.94349587, + "num_input_tokens_seen": 135124256, + "router_z_loss_mlp": 0.42089844, + "step": 1631, + "time_per_iteration": 2.7995047569274902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0103074, + "balance_loss_mlp": 1.01777005, + "epoch": 0.3139669103501347, + "flos": 1498415376384.0, + "grad_norm": 0.019617221588718974, + "language_loss": 0.79635841, + "learning_rate": 0.0008030401831619178, + "loss": 0.80666578, + "num_input_tokens_seen": 135353024, + "router_z_loss_mlp": 0.12988281, + "step": 1632, + "time_per_iteration": 5.3968565464019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149415, + "balance_loss_mlp": 1.10959888, + "epoch": 0.3141592920353982, + "flos": 525343076352.0, + "grad_norm": 0.05783646939860944, + "language_loss": 0.87576675, + "learning_rate": 0.0008027923225359748, + "loss": 0.88726091, + "num_input_tokens_seen": 135422464, + "router_z_loss_mlp": 0.39819336, + "step": 1633, + "time_per_iteration": 2.5933566093444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153635, + "balance_loss_mlp": 1.11145878, + "epoch": 0.3143516737206618, + "flos": 593268406272.0, + "grad_norm": 0.05944909670445279, + "language_loss": 0.88579285, + "learning_rate": 0.0008025443443556267, + "loss": 0.89732921, + "num_input_tokens_seen": 135490928, + "router_z_loss_mlp": 0.421875, + "step": 1634, + "time_per_iteration": 2.728522777557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149168, + "balance_loss_mlp": 1.109519, + "epoch": 0.31454405540592534, + "flos": 648362589696.0, + "grad_norm": 0.0772983201911997, + "language_loss": 0.88333809, + "learning_rate": 0.000802296248717147, + "loss": 0.89482975, + "num_input_tokens_seen": 135576288, + "router_z_loss_mlp": 0.39648438, + "step": 1635, + "time_per_iteration": 2.9030401706695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140791, + "balance_loss_mlp": 1.0971607, + "epoch": 0.3147364370911889, + "flos": 642847850496.0, + "grad_norm": 0.06629024784700413, + "language_loss": 0.7930302, + "learning_rate": 0.0008020480357168554, + "loss": 0.80443811, + "num_input_tokens_seen": 135652320, + "router_z_loss_mlp": 0.43603516, + "step": 1636, + "time_per_iteration": 2.839134931564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145583, + "balance_loss_mlp": 1.1038121, + "epoch": 0.31492881877645246, + "flos": 471849753600.0, + "grad_norm": 0.06656267016529639, + "language_loss": 0.88396037, + "learning_rate": 0.0008017997054511165, + "loss": 0.89541626, + "num_input_tokens_seen": 135719632, + "router_z_loss_mlp": 0.41796875, + "step": 1637, + "time_per_iteration": 2.5937085151672363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148115, + "balance_loss_mlp": 1.10424566, + "epoch": 0.31512120046171604, + "flos": 629433773568.0, + "grad_norm": 0.06622170213435077, + "language_loss": 0.85649616, + "learning_rate": 0.0008015512580163407, + "loss": 0.86797726, + "num_input_tokens_seen": 135796544, + "router_z_loss_mlp": 0.43896484, + "step": 1638, + "time_per_iteration": 2.8432726860046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138121, + "balance_loss_mlp": 1.09639752, + "epoch": 0.31531358214697963, + "flos": 703778347008.0, + "grad_norm": 0.06676164925493694, + "language_loss": 0.81149763, + "learning_rate": 0.0008013026935089838, + "loss": 0.82287884, + "num_input_tokens_seen": 135871344, + "router_z_loss_mlp": 0.41699219, + "step": 1639, + "time_per_iteration": 2.8703761100769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142857, + "balance_loss_mlp": 1.1031127, + "epoch": 0.31550596383224316, + "flos": 572545635840.0, + "grad_norm": 0.060786667538297263, + "language_loss": 0.84702241, + "learning_rate": 0.0008010540120255472, + "loss": 0.85845095, + "num_input_tokens_seen": 135944320, + "router_z_loss_mlp": 0.3972168, + "step": 1640, + "time_per_iteration": 2.6741273403167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136768, + "balance_loss_mlp": 1.09511614, + "epoch": 0.31569834551750675, + "flos": 658340006400.0, + "grad_norm": 0.06934658167266547, + "language_loss": 0.86723542, + "learning_rate": 0.0008008052136625774, + "loss": 0.8786031, + "num_input_tokens_seen": 136019456, + "router_z_loss_mlp": 0.41650391, + "step": 1641, + "time_per_iteration": 2.8395094871520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135427, + "balance_loss_mlp": 1.09272623, + "epoch": 0.3158907272027703, + "flos": 566282036736.0, + "grad_norm": 0.07613576058544219, + "language_loss": 0.87025082, + "learning_rate": 0.0008005562985166666, + "loss": 0.88160515, + "num_input_tokens_seen": 136091232, + "router_z_loss_mlp": 0.42675781, + "step": 1642, + "time_per_iteration": 2.708812713623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127626, + "balance_loss_mlp": 1.08621287, + "epoch": 0.31608310888803387, + "flos": 536891618304.0, + "grad_norm": 0.05118616143218352, + "language_loss": 0.85440576, + "learning_rate": 0.0008003072666844524, + "loss": 0.86568201, + "num_input_tokens_seen": 136165088, + "router_z_loss_mlp": 0.41430664, + "step": 1643, + "time_per_iteration": 2.74019193649292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127922, + "balance_loss_mlp": 1.08746231, + "epoch": 0.3162754905732974, + "flos": 486669772800.0, + "grad_norm": 0.07457594622010144, + "language_loss": 0.82632107, + "learning_rate": 0.0008000581182626173, + "loss": 0.83760029, + "num_input_tokens_seen": 136230368, + "router_z_loss_mlp": 0.40478516, + "step": 1644, + "time_per_iteration": 2.6125125885009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011327, + "balance_loss_mlp": 1.09159672, + "epoch": 0.316467872258561, + "flos": 530052429312.0, + "grad_norm": 0.0586598658040055, + "language_loss": 0.86714005, + "learning_rate": 0.0007998088533478894, + "loss": 0.87846708, + "num_input_tokens_seen": 136302512, + "router_z_loss_mlp": 0.41137695, + "step": 1645, + "time_per_iteration": 2.674678087234497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130403, + "balance_loss_mlp": 1.08805966, + "epoch": 0.3166602539438245, + "flos": 443440189440.0, + "grad_norm": 0.10428151324619617, + "language_loss": 0.84319067, + "learning_rate": 0.000799559472037042, + "loss": 0.85449469, + "num_input_tokens_seen": 136368064, + "router_z_loss_mlp": 0.4230957, + "step": 1646, + "time_per_iteration": 2.5389983654022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130022, + "balance_loss_mlp": 1.08939528, + "epoch": 0.3168526356290881, + "flos": 645830060544.0, + "grad_norm": 0.05498023868715711, + "language_loss": 0.8798641, + "learning_rate": 0.0007993099744268932, + "loss": 0.8911643, + "num_input_tokens_seen": 136451520, + "router_z_loss_mlp": 0.40625, + "step": 1647, + "time_per_iteration": 2.919410467147827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127448, + "balance_loss_mlp": 1.0858674, + "epoch": 0.3170450173143517, + "flos": 586162344960.0, + "grad_norm": 0.07648109375468225, + "language_loss": 0.88298547, + "learning_rate": 0.000799060360614307, + "loss": 0.89425999, + "num_input_tokens_seen": 136521184, + "router_z_loss_mlp": 0.41577148, + "step": 1648, + "time_per_iteration": 2.679098606109619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132184, + "balance_loss_mlp": 1.09117627, + "epoch": 0.3172373989996152, + "flos": 827124203520.0, + "grad_norm": 0.17676844539598618, + "language_loss": 0.83707428, + "learning_rate": 0.0007988106306961917, + "loss": 0.84839618, + "num_input_tokens_seen": 136612592, + "router_z_loss_mlp": 0.41015625, + "step": 1649, + "time_per_iteration": 3.1304876804351807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139719, + "balance_loss_mlp": 1.09809113, + "epoch": 0.3174297806848788, + "flos": 527408672256.0, + "grad_norm": 0.06731506602110418, + "language_loss": 0.84557772, + "learning_rate": 0.0007985607847695014, + "loss": 0.85697484, + "num_input_tokens_seen": 136684336, + "router_z_loss_mlp": 0.41625977, + "step": 1650, + "time_per_iteration": 2.6152966022491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151307, + "balance_loss_mlp": 1.11087108, + "epoch": 0.31762216237014235, + "flos": 713179800576.0, + "grad_norm": 0.08658277444707524, + "language_loss": 0.83160597, + "learning_rate": 0.0007983108229312345, + "loss": 0.84311903, + "num_input_tokens_seen": 136766400, + "router_z_loss_mlp": 0.40454102, + "step": 1651, + "time_per_iteration": 2.9157605171203613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180085, + "balance_loss_mlp": 1.13864803, + "epoch": 0.31781454405540593, + "flos": 483813471744.0, + "grad_norm": 0.12326743545136284, + "language_loss": 0.86631948, + "learning_rate": 0.0007980607452784351, + "loss": 0.8781203, + "num_input_tokens_seen": 136834016, + "router_z_loss_mlp": 0.4140625, + "step": 1652, + "time_per_iteration": 2.5533528327941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170234, + "balance_loss_mlp": 1.12798643, + "epoch": 0.31800692574066947, + "flos": 548746679808.0, + "grad_norm": 0.07656805667485655, + "language_loss": 0.90550399, + "learning_rate": 0.0007978105519081919, + "loss": 0.91720629, + "num_input_tokens_seen": 136906288, + "router_z_loss_mlp": 0.42236328, + "step": 1653, + "time_per_iteration": 2.683962821960449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162895, + "balance_loss_mlp": 1.12088561, + "epoch": 0.31819930742593305, + "flos": 516895312896.0, + "grad_norm": 0.06859901935764132, + "language_loss": 0.88378012, + "learning_rate": 0.0007975602429176385, + "loss": 0.89540899, + "num_input_tokens_seen": 136972416, + "router_z_loss_mlp": 0.42041016, + "step": 1654, + "time_per_iteration": 2.563507556915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165514, + "balance_loss_mlp": 1.12421989, + "epoch": 0.31839168911119664, + "flos": 455991980544.0, + "grad_norm": 0.07830522948057009, + "language_loss": 0.81779003, + "learning_rate": 0.0007973098184039536, + "loss": 0.82944512, + "num_input_tokens_seen": 137044576, + "router_z_loss_mlp": 0.4128418, + "step": 1655, + "time_per_iteration": 2.6503560543060303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154556, + "balance_loss_mlp": 1.11433494, + "epoch": 0.3185840707964602, + "flos": 626033816064.0, + "grad_norm": 0.07004293098644994, + "language_loss": 0.87212098, + "learning_rate": 0.0007970592784643602, + "loss": 0.88366652, + "num_input_tokens_seen": 137125120, + "router_z_loss_mlp": 0.40185547, + "step": 1656, + "time_per_iteration": 2.8598649501800537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167315, + "balance_loss_mlp": 1.12366056, + "epoch": 0.31877645248172376, + "flos": 567478006272.0, + "grad_norm": 0.08267452239342069, + "language_loss": 0.8563, + "learning_rate": 0.0007968086231961272, + "loss": 0.86797309, + "num_input_tokens_seen": 137195344, + "router_z_loss_mlp": 0.43676758, + "step": 1657, + "time_per_iteration": 2.637216806411743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158327, + "balance_loss_mlp": 1.11343288, + "epoch": 0.3189688341669873, + "flos": 489580402176.0, + "grad_norm": 0.09173012098392071, + "language_loss": 0.83764172, + "learning_rate": 0.0007965578526965671, + "loss": 0.84922498, + "num_input_tokens_seen": 137261040, + "router_z_loss_mlp": 0.44897461, + "step": 1658, + "time_per_iteration": 2.607729911804199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154777, + "balance_loss_mlp": 1.11307764, + "epoch": 0.3191612158522509, + "flos": 576234487296.0, + "grad_norm": 0.08650327787833377, + "language_loss": 0.86397582, + "learning_rate": 0.0007963069670630377, + "loss": 0.87552357, + "num_input_tokens_seen": 137334400, + "router_z_loss_mlp": 0.41723633, + "step": 1659, + "time_per_iteration": 2.7385904788970947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154696, + "balance_loss_mlp": 1.11175728, + "epoch": 0.3193535975375144, + "flos": 538132004352.0, + "grad_norm": 0.06815630012467462, + "language_loss": 0.88107586, + "learning_rate": 0.0007960559663929416, + "loss": 0.89262283, + "num_input_tokens_seen": 137405344, + "router_z_loss_mlp": 0.4296875, + "step": 1660, + "time_per_iteration": 2.696936845779419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155709, + "balance_loss_mlp": 1.11372399, + "epoch": 0.319545979222778, + "flos": 734288011776.0, + "grad_norm": 0.07443207173064395, + "language_loss": 0.8773188, + "learning_rate": 0.0007958048507837259, + "loss": 0.88887584, + "num_input_tokens_seen": 137486016, + "router_z_loss_mlp": 0.41992188, + "step": 1661, + "time_per_iteration": 3.0276992321014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165796, + "balance_loss_mlp": 1.12168884, + "epoch": 0.31973836090804153, + "flos": 764461794816.0, + "grad_norm": 0.07361086812440759, + "language_loss": 0.87900233, + "learning_rate": 0.0007955536203328822, + "loss": 0.89066029, + "num_input_tokens_seen": 137562304, + "router_z_loss_mlp": 0.44116211, + "step": 1662, + "time_per_iteration": 2.9181947708129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167079, + "balance_loss_mlp": 1.12497449, + "epoch": 0.3199307425933051, + "flos": 560549611008.0, + "grad_norm": 0.0536049497981301, + "language_loss": 0.8375597, + "learning_rate": 0.0007953022751379469, + "loss": 0.84923047, + "num_input_tokens_seen": 137639248, + "router_z_loss_mlp": 0.42089844, + "step": 1663, + "time_per_iteration": 2.8502774238586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160364, + "balance_loss_mlp": 1.11749601, + "epoch": 0.3201231242785687, + "flos": 751349094912.0, + "grad_norm": 0.09076105210561375, + "language_loss": 0.82297581, + "learning_rate": 0.000795050815296501, + "loss": 0.83457941, + "num_input_tokens_seen": 137718256, + "router_z_loss_mlp": 0.42871094, + "step": 1664, + "time_per_iteration": 2.990253210067749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149395, + "balance_loss_mlp": 1.10821986, + "epoch": 0.32031550596383224, + "flos": 496402338816.0, + "grad_norm": 0.05392034602485258, + "language_loss": 0.93401325, + "learning_rate": 0.0007947992409061695, + "loss": 0.94550717, + "num_input_tokens_seen": 137785216, + "router_z_loss_mlp": 0.41162109, + "step": 1665, + "time_per_iteration": 2.5734803676605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146101, + "balance_loss_mlp": 1.10456824, + "epoch": 0.3205078876490958, + "flos": 731609750016.0, + "grad_norm": 0.07147454481835314, + "language_loss": 0.86398005, + "learning_rate": 0.0007945475520646226, + "loss": 0.87544107, + "num_input_tokens_seen": 137863424, + "router_z_loss_mlp": 0.4152832, + "step": 1666, + "time_per_iteration": 2.9147067070007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144126, + "balance_loss_mlp": 1.10156846, + "epoch": 0.32070026933435936, + "flos": 549436068864.0, + "grad_norm": 0.08541845552139904, + "language_loss": 0.85159481, + "learning_rate": 0.0007942957488695743, + "loss": 0.8630361, + "num_input_tokens_seen": 137930384, + "router_z_loss_mlp": 0.42578125, + "step": 1667, + "time_per_iteration": 2.6842408180236816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138005, + "balance_loss_mlp": 1.09725952, + "epoch": 0.32089265101962294, + "flos": 745295468544.0, + "grad_norm": 0.06001483498827303, + "language_loss": 0.81309706, + "learning_rate": 0.0007940438314187833, + "loss": 0.82447714, + "num_input_tokens_seen": 138017200, + "router_z_loss_mlp": 0.4074707, + "step": 1668, + "time_per_iteration": 3.0340676307678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128156, + "balance_loss_mlp": 1.08769631, + "epoch": 0.3210850327048865, + "flos": 494188439040.0, + "grad_norm": 0.06998559069767052, + "language_loss": 0.81191337, + "learning_rate": 0.0007937917998100529, + "loss": 0.82319492, + "num_input_tokens_seen": 138084048, + "router_z_loss_mlp": 0.40454102, + "step": 1669, + "time_per_iteration": 2.635629177093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138644, + "balance_loss_mlp": 1.09313023, + "epoch": 0.32127741439015006, + "flos": 530640502272.0, + "grad_norm": 0.08304565240235381, + "language_loss": 0.79254091, + "learning_rate": 0.0007935396541412302, + "loss": 0.80392736, + "num_input_tokens_seen": 138153280, + "router_z_loss_mlp": 0.45532227, + "step": 1670, + "time_per_iteration": 2.6226065158843994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141669, + "balance_loss_mlp": 1.09896851, + "epoch": 0.3214697960754136, + "flos": 501203096064.0, + "grad_norm": 0.07816166477955887, + "language_loss": 0.85914934, + "learning_rate": 0.0007932873945102068, + "loss": 0.87056601, + "num_input_tokens_seen": 138222320, + "router_z_loss_mlp": 0.42724609, + "step": 1671, + "time_per_iteration": 2.559443473815918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047646, + "balance_loss_mlp": 1.03238678, + "epoch": 0.3216621777606772, + "flos": 1383341815296.0, + "grad_norm": 0.025388272809080015, + "language_loss": 0.75761777, + "learning_rate": 0.0007930350210149188, + "loss": 0.76809424, + "num_input_tokens_seen": 138449488, + "router_z_loss_mlp": 0.15234375, + "step": 1672, + "time_per_iteration": 4.8329596519470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176113, + "balance_loss_mlp": 1.13319826, + "epoch": 0.32185455944594077, + "flos": 571535046144.0, + "grad_norm": 0.10680060394368475, + "language_loss": 0.86589128, + "learning_rate": 0.0007927825337533461, + "loss": 0.87765247, + "num_input_tokens_seen": 138522496, + "router_z_loss_mlp": 0.42895508, + "step": 1673, + "time_per_iteration": 2.670067071914673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117015, + "balance_loss_mlp": 1.12651968, + "epoch": 0.3220469411312043, + "flos": 543908846592.0, + "grad_norm": 0.0659920492524482, + "language_loss": 0.84953517, + "learning_rate": 0.0007925299328235131, + "loss": 0.86123669, + "num_input_tokens_seen": 138590096, + "router_z_loss_mlp": 0.43652344, + "step": 1674, + "time_per_iteration": 2.6559884548187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169615, + "balance_loss_mlp": 1.12543643, + "epoch": 0.3222393228164679, + "flos": 491139417600.0, + "grad_norm": 0.10142438885407562, + "language_loss": 0.85307467, + "learning_rate": 0.000792277218323488, + "loss": 0.86477083, + "num_input_tokens_seen": 138658224, + "router_z_loss_mlp": 0.44189453, + "step": 1675, + "time_per_iteration": 2.5843372344970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158765, + "balance_loss_mlp": 1.11673164, + "epoch": 0.3224317045017314, + "flos": 490388359680.0, + "grad_norm": 0.06840501438298492, + "language_loss": 0.85418063, + "learning_rate": 0.0007920243903513833, + "loss": 0.86576831, + "num_input_tokens_seen": 138722864, + "router_z_loss_mlp": 0.4206543, + "step": 1676, + "time_per_iteration": 2.562697649002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138179, + "balance_loss_mlp": 1.09280825, + "epoch": 0.322624086186995, + "flos": 575777465856.0, + "grad_norm": 0.06731593225582447, + "language_loss": 0.84609574, + "learning_rate": 0.0007917714490053556, + "loss": 0.85747755, + "num_input_tokens_seen": 138791472, + "router_z_loss_mlp": 0.45361328, + "step": 1677, + "time_per_iteration": 2.685854434967041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131879, + "balance_loss_mlp": 1.09029913, + "epoch": 0.32281646787225854, + "flos": 629292810240.0, + "grad_norm": 0.06310440112326268, + "language_loss": 0.86562228, + "learning_rate": 0.0007915183943836055, + "loss": 0.87694108, + "num_input_tokens_seen": 138873424, + "router_z_loss_mlp": 0.41601562, + "step": 1678, + "time_per_iteration": 2.8568227291107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128366, + "balance_loss_mlp": 1.08466363, + "epoch": 0.3230088495575221, + "flos": 781389255168.0, + "grad_norm": 0.07690366782162197, + "language_loss": 0.84428912, + "learning_rate": 0.0007912652265843773, + "loss": 0.85557282, + "num_input_tokens_seen": 138956880, + "router_z_loss_mlp": 0.43725586, + "step": 1679, + "time_per_iteration": 3.079998254776001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110817, + "balance_loss_mlp": 1.06930852, + "epoch": 0.3232012312427857, + "flos": 536110824960.0, + "grad_norm": 0.07712564159484636, + "language_loss": 0.8213551, + "learning_rate": 0.0007910119457059597, + "loss": 0.83246326, + "num_input_tokens_seen": 139031296, + "router_z_loss_mlp": 0.4152832, + "step": 1680, + "time_per_iteration": 2.6812973022460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112077, + "balance_loss_mlp": 1.06975782, + "epoch": 0.32339361292804925, + "flos": 704857946112.0, + "grad_norm": 0.10745693955939492, + "language_loss": 0.81109858, + "learning_rate": 0.0007907585518466849, + "loss": 0.82221937, + "num_input_tokens_seen": 139109776, + "router_z_loss_mlp": 0.42333984, + "step": 1681, + "time_per_iteration": 2.9406683444976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115262, + "balance_loss_mlp": 1.07265627, + "epoch": 0.32358599461331283, + "flos": 452330293248.0, + "grad_norm": 0.07157404686533678, + "language_loss": 0.89948541, + "learning_rate": 0.000790505045104929, + "loss": 0.91063797, + "num_input_tokens_seen": 139174736, + "router_z_loss_mlp": 0.42602539, + "step": 1682, + "time_per_iteration": 2.5241646766662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119953, + "balance_loss_mlp": 1.07606041, + "epoch": 0.32377837629857636, + "flos": 600895729152.0, + "grad_norm": 0.06937214564576595, + "language_loss": 0.87034553, + "learning_rate": 0.0007902514255791125, + "loss": 0.88154507, + "num_input_tokens_seen": 139252064, + "router_z_loss_mlp": 0.43896484, + "step": 1683, + "time_per_iteration": 2.8741068840026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111917, + "balance_loss_mlp": 1.076231, + "epoch": 0.32397075798383995, + "flos": 807523250688.0, + "grad_norm": 0.06778435640114842, + "language_loss": 0.87994444, + "learning_rate": 0.0007899976933676986, + "loss": 0.89113617, + "num_input_tokens_seen": 139333328, + "router_z_loss_mlp": 0.42919922, + "step": 1684, + "time_per_iteration": 2.959290027618408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117467, + "balance_loss_mlp": 1.07469463, + "epoch": 0.3241631396691035, + "flos": 601689005568.0, + "grad_norm": 0.06453517439379398, + "language_loss": 0.87573123, + "learning_rate": 0.0007897438485691955, + "loss": 0.88690597, + "num_input_tokens_seen": 139400976, + "router_z_loss_mlp": 0.42773438, + "step": 1685, + "time_per_iteration": 2.6591978073120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131588, + "balance_loss_mlp": 1.08655035, + "epoch": 0.32435552135436707, + "flos": 474219297792.0, + "grad_norm": 0.13512041919643347, + "language_loss": 0.82386112, + "learning_rate": 0.0007894898912821542, + "loss": 0.835177, + "num_input_tokens_seen": 139465664, + "router_z_loss_mlp": 0.45043945, + "step": 1686, + "time_per_iteration": 2.5375750064849854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134689, + "balance_loss_mlp": 1.09201205, + "epoch": 0.3245479030396306, + "flos": 538102268928.0, + "grad_norm": 0.07414292899066016, + "language_loss": 0.8748548, + "learning_rate": 0.0007892358216051695, + "loss": 0.88620168, + "num_input_tokens_seen": 139541984, + "router_z_loss_mlp": 0.42675781, + "step": 1687, + "time_per_iteration": 2.73968243598938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132707, + "balance_loss_mlp": 1.09098339, + "epoch": 0.3247402847248942, + "flos": 547654597632.0, + "grad_norm": 0.06337992950379638, + "language_loss": 0.92269105, + "learning_rate": 0.0007889816396368803, + "loss": 0.93401814, + "num_input_tokens_seen": 139607408, + "router_z_loss_mlp": 0.41699219, + "step": 1688, + "time_per_iteration": 2.6067299842834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131771, + "balance_loss_mlp": 1.08961868, + "epoch": 0.3249326664101578, + "flos": 378151276032.0, + "grad_norm": 0.07885708031147778, + "language_loss": 0.85782814, + "learning_rate": 0.0007887273454759687, + "loss": 0.86914587, + "num_input_tokens_seen": 139670000, + "router_z_loss_mlp": 0.421875, + "step": 1689, + "time_per_iteration": 2.484260320663452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122278, + "balance_loss_mlp": 1.08031607, + "epoch": 0.3251250480954213, + "flos": 528078237696.0, + "grad_norm": 0.06527022407794938, + "language_loss": 0.82859224, + "learning_rate": 0.0007884729392211603, + "loss": 0.83981502, + "num_input_tokens_seen": 139739872, + "router_z_loss_mlp": 0.41943359, + "step": 1690, + "time_per_iteration": 2.642786741256714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129634, + "balance_loss_mlp": 1.08812594, + "epoch": 0.3253174297806849, + "flos": 449659372032.0, + "grad_norm": 0.09568065131307975, + "language_loss": 0.86132944, + "learning_rate": 0.0007882184209712245, + "loss": 0.87262577, + "num_input_tokens_seen": 139802032, + "router_z_loss_mlp": 0.41503906, + "step": 1691, + "time_per_iteration": 2.5199530124664307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123067, + "balance_loss_mlp": 1.08234525, + "epoch": 0.32550981146594843, + "flos": 704181040128.0, + "grad_norm": 0.06282055281729462, + "language_loss": 0.86132228, + "learning_rate": 0.000787963790824974, + "loss": 0.87255299, + "num_input_tokens_seen": 139885648, + "router_z_loss_mlp": 0.40722656, + "step": 1692, + "time_per_iteration": 2.9768075942993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124522, + "balance_loss_mlp": 1.08427668, + "epoch": 0.325702193151212, + "flos": 392704422912.0, + "grad_norm": 0.07612118071262816, + "language_loss": 0.89543802, + "learning_rate": 0.0007877090488812651, + "loss": 0.90668321, + "num_input_tokens_seen": 139947920, + "router_z_loss_mlp": 0.40258789, + "step": 1693, + "time_per_iteration": 2.4604485034942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124012, + "balance_loss_mlp": 1.08207428, + "epoch": 0.32589457483647555, + "flos": 577494696960.0, + "grad_norm": 0.1035661329718289, + "language_loss": 0.83982152, + "learning_rate": 0.0007874541952389973, + "loss": 0.85106164, + "num_input_tokens_seen": 140020048, + "router_z_loss_mlp": 0.41943359, + "step": 1694, + "time_per_iteration": 2.6709587574005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113814, + "balance_loss_mlp": 1.09753752, + "epoch": 0.32608695652173914, + "flos": 498339454464.0, + "grad_norm": 0.08446561178027004, + "language_loss": 0.86949492, + "learning_rate": 0.0007871992299971136, + "loss": 0.8808763, + "num_input_tokens_seen": 140085600, + "router_z_loss_mlp": 0.40625, + "step": 1695, + "time_per_iteration": 2.5585403442382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150743, + "balance_loss_mlp": 1.11023593, + "epoch": 0.32627933820700267, + "flos": 591145910784.0, + "grad_norm": 0.05830689117178756, + "language_loss": 0.84793502, + "learning_rate": 0.0007869441532546001, + "loss": 0.85944247, + "num_input_tokens_seen": 140155152, + "router_z_loss_mlp": 0.4050293, + "step": 1696, + "time_per_iteration": 2.7510788440704346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148317, + "balance_loss_mlp": 1.1100266, + "epoch": 0.32647171989226625, + "flos": 609086532096.0, + "grad_norm": 0.06976949490853021, + "language_loss": 0.79791546, + "learning_rate": 0.0007866889651104867, + "loss": 0.80939865, + "num_input_tokens_seen": 140228560, + "router_z_loss_mlp": 0.38256836, + "step": 1697, + "time_per_iteration": 2.7944459915161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152152, + "balance_loss_mlp": 1.11114383, + "epoch": 0.32666410157752984, + "flos": 477154520064.0, + "grad_norm": 0.06767982610774756, + "language_loss": 0.83777177, + "learning_rate": 0.000786433665663846, + "loss": 0.84929335, + "num_input_tokens_seen": 140297952, + "router_z_loss_mlp": 0.40991211, + "step": 1698, + "time_per_iteration": 2.6864194869995117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167936, + "balance_loss_mlp": 1.12514019, + "epoch": 0.3268564832627934, + "flos": 718385822208.0, + "grad_norm": 0.0725657973515617, + "language_loss": 0.87005848, + "learning_rate": 0.0007861782550137942, + "loss": 0.88173789, + "num_input_tokens_seen": 140373408, + "router_z_loss_mlp": 0.42797852, + "step": 1699, + "time_per_iteration": 2.896897792816162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160393, + "balance_loss_mlp": 1.11986172, + "epoch": 0.32704886494805696, + "flos": 768795618816.0, + "grad_norm": 0.0774952835645251, + "language_loss": 0.86092401, + "learning_rate": 0.0007859227332594901, + "loss": 0.87252796, + "num_input_tokens_seen": 140451840, + "router_z_loss_mlp": 0.40527344, + "step": 1700, + "time_per_iteration": 2.8986380100250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165908, + "balance_loss_mlp": 1.12449527, + "epoch": 0.3272412466333205, + "flos": 849912569856.0, + "grad_norm": 0.09509515836767467, + "language_loss": 0.85007191, + "learning_rate": 0.0007856671005001365, + "loss": 0.86173105, + "num_input_tokens_seen": 140537696, + "router_z_loss_mlp": 0.41381836, + "step": 1701, + "time_per_iteration": 3.148084878921509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168963, + "balance_loss_mlp": 1.12726378, + "epoch": 0.3274336283185841, + "flos": 831586507776.0, + "grad_norm": 0.07560076292899535, + "language_loss": 0.82363045, + "learning_rate": 0.0007854113568349787, + "loss": 0.83532006, + "num_input_tokens_seen": 140623536, + "router_z_loss_mlp": 0.41699219, + "step": 1702, + "time_per_iteration": 3.1411454677581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191314, + "balance_loss_mlp": 1.14882779, + "epoch": 0.3276260100038476, + "flos": 692027172864.0, + "grad_norm": 0.08142047178498793, + "language_loss": 0.81090933, + "learning_rate": 0.0007851555023633052, + "loss": 0.82282251, + "num_input_tokens_seen": 140700688, + "router_z_loss_mlp": 0.42504883, + "step": 1703, + "time_per_iteration": 2.9109766483306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197058, + "balance_loss_mlp": 1.1559788, + "epoch": 0.3278183916891112, + "flos": 436059915264.0, + "grad_norm": 0.07993965020483434, + "language_loss": 0.82561779, + "learning_rate": 0.0007848995371844474, + "loss": 0.83758843, + "num_input_tokens_seen": 140765808, + "router_z_loss_mlp": 0.41088867, + "step": 1704, + "time_per_iteration": 2.531611680984497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197334, + "balance_loss_mlp": 1.15267849, + "epoch": 0.3280107733743748, + "flos": 461109169152.0, + "grad_norm": 0.11293951672356671, + "language_loss": 0.81012988, + "learning_rate": 0.0007846434613977801, + "loss": 0.82210326, + "num_input_tokens_seen": 140830512, + "router_z_loss_mlp": 0.44677734, + "step": 1705, + "time_per_iteration": 2.5413970947265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175519, + "balance_loss_mlp": 1.1340816, + "epoch": 0.3282031550596383, + "flos": 679319737344.0, + "grad_norm": 0.10106481858624654, + "language_loss": 0.78958142, + "learning_rate": 0.0007843872751027203, + "loss": 0.80133665, + "num_input_tokens_seen": 140902816, + "router_z_loss_mlp": 0.41455078, + "step": 1706, + "time_per_iteration": 2.817387580871582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158115, + "balance_loss_mlp": 1.1166296, + "epoch": 0.3283955367449019, + "flos": 545107014144.0, + "grad_norm": 0.06764312208644677, + "language_loss": 0.87366319, + "learning_rate": 0.0007841309783987287, + "loss": 0.88524431, + "num_input_tokens_seen": 140975488, + "router_z_loss_mlp": 0.41503906, + "step": 1707, + "time_per_iteration": 2.7335729598999023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155907, + "balance_loss_mlp": 1.11117959, + "epoch": 0.32858791843016544, + "flos": 481261118976.0, + "grad_norm": 0.06220723681544313, + "language_loss": 0.89445031, + "learning_rate": 0.0007838745713853084, + "loss": 0.90600932, + "num_input_tokens_seen": 141043248, + "router_z_loss_mlp": 0.44702148, + "step": 1708, + "time_per_iteration": 2.6179606914520264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114633, + "balance_loss_mlp": 1.10207939, + "epoch": 0.328780300115429, + "flos": 566805869568.0, + "grad_norm": 0.09473479000062662, + "language_loss": 0.84092307, + "learning_rate": 0.0007836180541620053, + "loss": 0.85238636, + "num_input_tokens_seen": 141119408, + "router_z_loss_mlp": 0.44213867, + "step": 1709, + "time_per_iteration": 2.703660249710083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160153, + "balance_loss_mlp": 1.11723721, + "epoch": 0.32897268180069256, + "flos": 476027933184.0, + "grad_norm": 0.06816782803484764, + "language_loss": 0.86778289, + "learning_rate": 0.0007833614268284082, + "loss": 0.8793844, + "num_input_tokens_seen": 141184112, + "router_z_loss_mlp": 0.42944336, + "step": 1710, + "time_per_iteration": 2.548859119415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077221, + "balance_loss_mlp": 1.06558585, + "epoch": 0.32916506348595614, + "flos": 1577301548544.0, + "grad_norm": 0.029019472878356288, + "language_loss": 0.74109769, + "learning_rate": 0.0007831046894841489, + "loss": 0.75186992, + "num_input_tokens_seen": 141414960, + "router_z_loss_mlp": 0.11621094, + "step": 1711, + "time_per_iteration": 4.9234619140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117236, + "balance_loss_mlp": 1.12934983, + "epoch": 0.3293574451712197, + "flos": 482886945792.0, + "grad_norm": 0.10714861433418864, + "language_loss": 0.78928375, + "learning_rate": 0.0007828478422289016, + "loss": 0.80100739, + "num_input_tokens_seen": 141485744, + "router_z_loss_mlp": 0.43017578, + "step": 1712, + "time_per_iteration": 2.584307909011841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167703, + "balance_loss_mlp": 1.12228465, + "epoch": 0.32954982685648326, + "flos": 622557508608.0, + "grad_norm": 0.08165577234876795, + "language_loss": 0.89409995, + "learning_rate": 0.0007825908851623833, + "loss": 0.90577698, + "num_input_tokens_seen": 141560592, + "router_z_loss_mlp": 0.45410156, + "step": 1713, + "time_per_iteration": 2.7400283813476562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158648, + "balance_loss_mlp": 1.11475515, + "epoch": 0.32974220854174685, + "flos": 544971193344.0, + "grad_norm": 0.08464988169520862, + "language_loss": 0.85764992, + "learning_rate": 0.0007823338183843533, + "loss": 0.86923635, + "num_input_tokens_seen": 141630400, + "router_z_loss_mlp": 0.43896484, + "step": 1714, + "time_per_iteration": 2.671375036239624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157012, + "balance_loss_mlp": 1.11419201, + "epoch": 0.3299345902270104, + "flos": 982155870720.0, + "grad_norm": 0.0730773907324959, + "language_loss": 0.81870985, + "learning_rate": 0.0007820766419946141, + "loss": 0.83028001, + "num_input_tokens_seen": 141721552, + "router_z_loss_mlp": 0.4284668, + "step": 1715, + "time_per_iteration": 3.3361854553222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027512, + "balance_loss_mlp": 1.01473284, + "epoch": 0.33012697191227397, + "flos": 1403664090624.0, + "grad_norm": 0.017749933707714268, + "language_loss": 0.7967248, + "learning_rate": 0.0007818193560930102, + "loss": 0.80699992, + "num_input_tokens_seen": 141956464, + "router_z_loss_mlp": 0.12792969, + "step": 1716, + "time_per_iteration": 4.933880567550659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193401, + "balance_loss_mlp": 1.14895988, + "epoch": 0.3303193535975375, + "flos": 505151479296.0, + "grad_norm": 0.1003306893312863, + "language_loss": 0.76434684, + "learning_rate": 0.0007815619607794288, + "loss": 0.77628088, + "num_input_tokens_seen": 142029552, + "router_z_loss_mlp": 0.4440918, + "step": 1717, + "time_per_iteration": 2.6259148120880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191125, + "balance_loss_mlp": 1.14823365, + "epoch": 0.3305117352828011, + "flos": 937977739776.0, + "grad_norm": 0.07927399877074098, + "language_loss": 0.83156073, + "learning_rate": 0.0007813044561538001, + "loss": 0.84347194, + "num_input_tokens_seen": 142117344, + "router_z_loss_mlp": 0.42895508, + "step": 1718, + "time_per_iteration": 3.1473774909973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190014, + "balance_loss_mlp": 1.145239, + "epoch": 0.3307041169680646, + "flos": 721499083776.0, + "grad_norm": 0.06905487251407855, + "language_loss": 0.88941157, + "learning_rate": 0.0007810468423160958, + "loss": 0.9013117, + "num_input_tokens_seen": 142190096, + "router_z_loss_mlp": 0.44799805, + "step": 1719, + "time_per_iteration": 2.895155906677246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181044, + "balance_loss_mlp": 1.13943982, + "epoch": 0.3308964986533282, + "flos": 583614761472.0, + "grad_norm": 0.06204943336400955, + "language_loss": 0.82643551, + "learning_rate": 0.0007807891193663306, + "loss": 0.83824587, + "num_input_tokens_seen": 142265584, + "router_z_loss_mlp": 0.41625977, + "step": 1720, + "time_per_iteration": 2.7824859619140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165341, + "balance_loss_mlp": 1.12357068, + "epoch": 0.33108888033859174, + "flos": 473576896512.0, + "grad_norm": 0.07732363095630222, + "language_loss": 0.82492876, + "learning_rate": 0.0007805312874045614, + "loss": 0.83658212, + "num_input_tokens_seen": 142330352, + "router_z_loss_mlp": 0.41796875, + "step": 1721, + "time_per_iteration": 2.5710601806640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170989, + "balance_loss_mlp": 1.12807381, + "epoch": 0.3312812620238553, + "flos": 386129534976.0, + "grad_norm": 0.07358039625922873, + "language_loss": 0.86639178, + "learning_rate": 0.0007802733465308874, + "loss": 0.87810171, + "num_input_tokens_seen": 142392208, + "router_z_loss_mlp": 0.42895508, + "step": 1722, + "time_per_iteration": 2.4402778148651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171295, + "balance_loss_mlp": 1.12632966, + "epoch": 0.3314736437091189, + "flos": 494554056192.0, + "grad_norm": 0.06616160911514579, + "language_loss": 0.8424235, + "learning_rate": 0.0007800152968454501, + "loss": 0.85413647, + "num_input_tokens_seen": 142462112, + "router_z_loss_mlp": 0.44970703, + "step": 1723, + "time_per_iteration": 2.689309597015381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115688, + "balance_loss_mlp": 1.11634886, + "epoch": 0.33166602539438245, + "flos": 653662586880.0, + "grad_norm": 0.06191321033146657, + "language_loss": 0.90671206, + "learning_rate": 0.0007797571384484334, + "loss": 0.91828084, + "num_input_tokens_seen": 142539120, + "router_z_loss_mlp": 0.40527344, + "step": 1724, + "time_per_iteration": 2.8473238945007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147699, + "balance_loss_mlp": 1.10421109, + "epoch": 0.33185840707964603, + "flos": 520806620160.0, + "grad_norm": 0.06062690844208358, + "language_loss": 0.92524576, + "learning_rate": 0.0007794988714400633, + "loss": 0.93672276, + "num_input_tokens_seen": 142611520, + "router_z_loss_mlp": 0.43530273, + "step": 1725, + "time_per_iteration": 2.62685227394104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146389, + "balance_loss_mlp": 1.10118532, + "epoch": 0.33205078876490957, + "flos": 436949365248.0, + "grad_norm": 0.09351886782013036, + "language_loss": 0.85586655, + "learning_rate": 0.0007792404959206079, + "loss": 0.86733043, + "num_input_tokens_seen": 142676064, + "router_z_loss_mlp": 0.45214844, + "step": 1726, + "time_per_iteration": 2.487520694732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150134, + "balance_loss_mlp": 1.10707533, + "epoch": 0.33224317045017315, + "flos": 768738719232.0, + "grad_norm": 0.09481341164405561, + "language_loss": 0.81825417, + "learning_rate": 0.0007789820119903774, + "loss": 0.82975549, + "num_input_tokens_seen": 142750944, + "router_z_loss_mlp": 0.4309082, + "step": 1727, + "time_per_iteration": 2.9732954502105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118165, + "balance_loss_mlp": 1.16734493, + "epoch": 0.3324355521354367, + "flos": 1466381574144.0, + "grad_norm": 0.0769954731958624, + "language_loss": 0.78492665, + "learning_rate": 0.0007787234197497242, + "loss": 0.79674315, + "num_input_tokens_seen": 142974032, + "router_z_loss_mlp": 0.14257812, + "step": 1728, + "time_per_iteration": 4.8314409255981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149054, + "balance_loss_mlp": 1.10599601, + "epoch": 0.3326279338207003, + "flos": 496691232768.0, + "grad_norm": 0.06765949793064117, + "language_loss": 0.84123361, + "learning_rate": 0.0007784647192990428, + "loss": 0.85272419, + "num_input_tokens_seen": 143047280, + "router_z_loss_mlp": 0.43041992, + "step": 1729, + "time_per_iteration": 2.715163230895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147649, + "balance_loss_mlp": 1.10799968, + "epoch": 0.33282031550596386, + "flos": 635890093056.0, + "grad_norm": 0.06156065876328187, + "language_loss": 0.80939102, + "learning_rate": 0.0007782059107387696, + "loss": 0.82086754, + "num_input_tokens_seen": 143124224, + "router_z_loss_mlp": 0.39672852, + "step": 1730, + "time_per_iteration": 2.865858554840088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165768, + "balance_loss_mlp": 1.12247074, + "epoch": 0.3330126971912274, + "flos": 689511896064.0, + "grad_norm": 0.07708666526094303, + "language_loss": 0.88668191, + "learning_rate": 0.0007779469941693826, + "loss": 0.89833963, + "num_input_tokens_seen": 143194048, + "router_z_loss_mlp": 0.43261719, + "step": 1731, + "time_per_iteration": 2.8640921115875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166075, + "balance_loss_mlp": 1.12351775, + "epoch": 0.333205078876491, + "flos": 566457504768.0, + "grad_norm": 0.08600344935746515, + "language_loss": 0.76943499, + "learning_rate": 0.0007776879696914029, + "loss": 0.78109574, + "num_input_tokens_seen": 143272976, + "router_z_loss_mlp": 0.42553711, + "step": 1732, + "time_per_iteration": 2.8162899017333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159987, + "balance_loss_mlp": 1.11745262, + "epoch": 0.3333974605617545, + "flos": 640927987200.0, + "grad_norm": 0.07534435583192022, + "language_loss": 0.89131331, + "learning_rate": 0.000777428837405392, + "loss": 0.90291321, + "num_input_tokens_seen": 143346496, + "router_z_loss_mlp": 0.42553711, + "step": 1733, + "time_per_iteration": 2.869436740875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151668, + "balance_loss_mlp": 1.11042213, + "epoch": 0.3335898422470181, + "flos": 461833062912.0, + "grad_norm": 0.0649827105829465, + "language_loss": 0.87220478, + "learning_rate": 0.0007771695974119544, + "loss": 0.88372147, + "num_input_tokens_seen": 143410448, + "router_z_loss_mlp": 0.41259766, + "step": 1734, + "time_per_iteration": 2.5153088569641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138148, + "balance_loss_mlp": 1.0959959, + "epoch": 0.33378222393228163, + "flos": 852870187008.0, + "grad_norm": 0.07614790264044081, + "language_loss": 0.76295686, + "learning_rate": 0.0007769102498117359, + "loss": 0.77433836, + "num_input_tokens_seen": 143492416, + "router_z_loss_mlp": 0.42163086, + "step": 1735, + "time_per_iteration": 3.1105504035949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136381, + "balance_loss_mlp": 1.09430027, + "epoch": 0.3339746056175452, + "flos": 954665491968.0, + "grad_norm": 0.06230250245944302, + "language_loss": 0.80020654, + "learning_rate": 0.000776650794705424, + "loss": 0.81157035, + "num_input_tokens_seen": 143590096, + "router_z_loss_mlp": 0.42089844, + "step": 1736, + "time_per_iteration": 3.269490957260132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141082, + "balance_loss_mlp": 1.09890568, + "epoch": 0.33416698730280875, + "flos": 544825460736.0, + "grad_norm": 0.053956568858798265, + "language_loss": 0.82610357, + "learning_rate": 0.0007763912321937483, + "loss": 0.8375144, + "num_input_tokens_seen": 143663344, + "router_z_loss_mlp": 0.421875, + "step": 1737, + "time_per_iteration": 2.6871769428253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126175, + "balance_loss_mlp": 1.0870508, + "epoch": 0.33435936898807234, + "flos": 1014096070656.0, + "grad_norm": 0.06336651482337263, + "language_loss": 0.82955027, + "learning_rate": 0.0007761315623774799, + "loss": 0.84081209, + "num_input_tokens_seen": 143753072, + "router_z_loss_mlp": 0.39111328, + "step": 1738, + "time_per_iteration": 3.4055540561676025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113225, + "balance_loss_mlp": 1.09088469, + "epoch": 0.3345517506733359, + "flos": 615221650944.0, + "grad_norm": 0.08278309899958468, + "language_loss": 0.88244802, + "learning_rate": 0.0007758717853574313, + "loss": 0.89377058, + "num_input_tokens_seen": 143827280, + "router_z_loss_mlp": 0.41381836, + "step": 1739, + "time_per_iteration": 2.7666313648223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120554, + "balance_loss_mlp": 1.08114362, + "epoch": 0.33474413235859946, + "flos": 494593703424.0, + "grad_norm": 0.0696820530517557, + "language_loss": 0.90798807, + "learning_rate": 0.0007756119012344571, + "loss": 0.91919363, + "num_input_tokens_seen": 143895072, + "router_z_loss_mlp": 0.39404297, + "step": 1740, + "time_per_iteration": 2.5491223335266113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115915, + "balance_loss_mlp": 1.07428706, + "epoch": 0.33493651404386304, + "flos": 628381338624.0, + "grad_norm": 0.06589349032225494, + "language_loss": 0.85103011, + "learning_rate": 0.0007753519101094535, + "loss": 0.86218929, + "num_input_tokens_seen": 143965728, + "router_z_loss_mlp": 0.41625977, + "step": 1741, + "time_per_iteration": 2.765583038330078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112401, + "balance_loss_mlp": 1.0837177, + "epoch": 0.3351288957291266, + "flos": 513727723008.0, + "grad_norm": 0.0662644502369307, + "language_loss": 0.86452365, + "learning_rate": 0.0007750918120833575, + "loss": 0.87576377, + "num_input_tokens_seen": 144030272, + "router_z_loss_mlp": 0.40283203, + "step": 1742, + "time_per_iteration": 2.6085479259490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140409, + "balance_loss_mlp": 1.10240483, + "epoch": 0.33532127741439016, + "flos": 647302814208.0, + "grad_norm": 0.07280628286033199, + "language_loss": 0.87783647, + "learning_rate": 0.0007748316072571485, + "loss": 0.88924056, + "num_input_tokens_seen": 144104048, + "router_z_loss_mlp": 0.37963867, + "step": 1743, + "time_per_iteration": 2.793119192123413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133272, + "balance_loss_mlp": 1.09259784, + "epoch": 0.3355136590996537, + "flos": 768464506368.0, + "grad_norm": 0.0850070564381928, + "language_loss": 0.79522568, + "learning_rate": 0.0007745712957318467, + "loss": 0.80655837, + "num_input_tokens_seen": 144180432, + "router_z_loss_mlp": 0.40698242, + "step": 1744, + "time_per_iteration": 2.943847417831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137205, + "balance_loss_mlp": 1.09700739, + "epoch": 0.3357060407849173, + "flos": 595536634368.0, + "grad_norm": 0.06831295126385283, + "language_loss": 0.86807823, + "learning_rate": 0.0007743108776085141, + "loss": 0.87945032, + "num_input_tokens_seen": 144258704, + "router_z_loss_mlp": 0.40136719, + "step": 1745, + "time_per_iteration": 2.771634101867676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011368, + "balance_loss_mlp": 1.09743714, + "epoch": 0.3358984224701808, + "flos": 598590425088.0, + "grad_norm": 0.05902486087385494, + "language_loss": 0.83364028, + "learning_rate": 0.0007740503529882543, + "loss": 0.84500825, + "num_input_tokens_seen": 144335104, + "router_z_loss_mlp": 0.39331055, + "step": 1746, + "time_per_iteration": 2.7896366119384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139374, + "balance_loss_mlp": 1.09831822, + "epoch": 0.3360908041554444, + "flos": 578329818624.0, + "grad_norm": 0.061665767711377016, + "language_loss": 0.90955931, + "learning_rate": 0.0007737897219722114, + "loss": 0.92095304, + "num_input_tokens_seen": 144402912, + "router_z_loss_mlp": 0.41088867, + "step": 1747, + "time_per_iteration": 2.7088165283203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129332, + "balance_loss_mlp": 1.08725071, + "epoch": 0.336283185840708, + "flos": 513589330944.0, + "grad_norm": 0.08528813851267185, + "language_loss": 0.81553382, + "learning_rate": 0.0007735289846615716, + "loss": 0.82682711, + "num_input_tokens_seen": 144475328, + "router_z_loss_mlp": 0.42089844, + "step": 1748, + "time_per_iteration": 2.635098934173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129982, + "balance_loss_mlp": 1.09119081, + "epoch": 0.3364755675259715, + "flos": 524974887936.0, + "grad_norm": 0.09169024401551043, + "language_loss": 0.82026851, + "learning_rate": 0.0007732681411575621, + "loss": 0.83156836, + "num_input_tokens_seen": 144548288, + "router_z_loss_mlp": 0.38818359, + "step": 1749, + "time_per_iteration": 2.6693224906921387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134785, + "balance_loss_mlp": 1.09437299, + "epoch": 0.3366679492112351, + "flos": 554869315584.0, + "grad_norm": 0.0698579909367107, + "language_loss": 0.88035583, + "learning_rate": 0.0007730071915614514, + "loss": 0.89170372, + "num_input_tokens_seen": 144619488, + "router_z_loss_mlp": 0.40405273, + "step": 1750, + "time_per_iteration": 2.6900789737701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137991, + "balance_loss_mlp": 1.09800839, + "epoch": 0.33686033089649864, + "flos": 427273698816.0, + "grad_norm": 0.09938227861633823, + "language_loss": 0.89158392, + "learning_rate": 0.0007727461359745489, + "loss": 0.90296388, + "num_input_tokens_seen": 144682560, + "router_z_loss_mlp": 0.3996582, + "step": 1751, + "time_per_iteration": 2.5086123943328857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154901, + "balance_loss_mlp": 1.1132257, + "epoch": 0.3370527125817622, + "flos": 541729451520.0, + "grad_norm": 0.06249007419708336, + "language_loss": 0.86569941, + "learning_rate": 0.0007724849744982056, + "loss": 0.87724847, + "num_input_tokens_seen": 144753328, + "router_z_loss_mlp": 0.41674805, + "step": 1752, + "time_per_iteration": 2.700474739074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169913, + "balance_loss_mlp": 1.12737882, + "epoch": 0.33724509426702576, + "flos": 542114892288.0, + "grad_norm": 0.06015013269361517, + "language_loss": 0.8195309, + "learning_rate": 0.0007722237072338131, + "loss": 0.83123004, + "num_input_tokens_seen": 144827312, + "router_z_loss_mlp": 0.42529297, + "step": 1753, + "time_per_iteration": 2.7111313343048096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119284, + "balance_loss_mlp": 1.14816022, + "epoch": 0.33743747595228935, + "flos": 472796103168.0, + "grad_norm": 0.11537307258838475, + "language_loss": 0.85648489, + "learning_rate": 0.0007719623342828046, + "loss": 0.86841327, + "num_input_tokens_seen": 144893488, + "router_z_loss_mlp": 0.44726562, + "step": 1754, + "time_per_iteration": 2.517010450363159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191581, + "balance_loss_mlp": 1.14685392, + "epoch": 0.33762985763755293, + "flos": 469818662400.0, + "grad_norm": 0.06847069318075473, + "language_loss": 0.84535718, + "learning_rate": 0.000771700855746654, + "loss": 0.85727292, + "num_input_tokens_seen": 144961152, + "router_z_loss_mlp": 0.44750977, + "step": 1755, + "time_per_iteration": 2.5961217880249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164795, + "balance_loss_mlp": 1.1231432, + "epoch": 0.33782223932281646, + "flos": 492251323392.0, + "grad_norm": 0.05626734330263072, + "language_loss": 0.8872534, + "learning_rate": 0.0007714392717268763, + "loss": 0.89890134, + "num_input_tokens_seen": 145030576, + "router_z_loss_mlp": 0.41674805, + "step": 1756, + "time_per_iteration": 2.5784223079681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166558, + "balance_loss_mlp": 1.12185431, + "epoch": 0.33801462100808005, + "flos": 465064892928.0, + "grad_norm": 0.07105398160496887, + "language_loss": 0.8649826, + "learning_rate": 0.0007711775823250273, + "loss": 0.87664813, + "num_input_tokens_seen": 145095648, + "router_z_loss_mlp": 0.44702148, + "step": 1757, + "time_per_iteration": 2.5373613834381104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115594, + "balance_loss_mlp": 1.11207056, + "epoch": 0.3382070026933436, + "flos": 795668189184.0, + "grad_norm": 0.06341765106008965, + "language_loss": 0.83797616, + "learning_rate": 0.0007709157876427039, + "loss": 0.84953558, + "num_input_tokens_seen": 145181248, + "router_z_loss_mlp": 0.43896484, + "step": 1758, + "time_per_iteration": 3.1393754482269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144144, + "balance_loss_mlp": 1.10027504, + "epoch": 0.33839938437860717, + "flos": 508430297088.0, + "grad_norm": 0.0573406658982909, + "language_loss": 0.85933769, + "learning_rate": 0.0007706538877815439, + "loss": 0.8707791, + "num_input_tokens_seen": 145252944, + "router_z_loss_mlp": 0.4387207, + "step": 1759, + "time_per_iteration": 2.6080896854400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152987, + "balance_loss_mlp": 1.11054862, + "epoch": 0.3385917660638707, + "flos": 484243329024.0, + "grad_norm": 0.06135171113161323, + "language_loss": 0.83615482, + "learning_rate": 0.0007703918828432259, + "loss": 0.84768468, + "num_input_tokens_seen": 145323168, + "router_z_loss_mlp": 0.42456055, + "step": 1760, + "time_per_iteration": 2.5886309146881104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148897, + "balance_loss_mlp": 1.10464644, + "epoch": 0.3387841477491343, + "flos": 545339381760.0, + "grad_norm": 0.05937499082636783, + "language_loss": 0.88942921, + "learning_rate": 0.000770129772929469, + "loss": 0.90091813, + "num_input_tokens_seen": 145395776, + "router_z_loss_mlp": 0.44238281, + "step": 1761, + "time_per_iteration": 2.645293951034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140708, + "balance_loss_mlp": 1.09629107, + "epoch": 0.3389765294343978, + "flos": 719801676288.0, + "grad_norm": 0.07244625367361128, + "language_loss": 0.88504505, + "learning_rate": 0.0007698675581420334, + "loss": 0.89645213, + "num_input_tokens_seen": 145470576, + "router_z_loss_mlp": 0.4440918, + "step": 1762, + "time_per_iteration": 2.849560022354126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149138, + "balance_loss_mlp": 1.10469711, + "epoch": 0.3391689111196614, + "flos": 699928708608.0, + "grad_norm": 0.06385607916775927, + "language_loss": 0.79163915, + "learning_rate": 0.0007696052385827199, + "loss": 0.80313051, + "num_input_tokens_seen": 145548896, + "router_z_loss_mlp": 0.44458008, + "step": 1763, + "time_per_iteration": 2.9164280891418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138684, + "balance_loss_mlp": 1.09765172, + "epoch": 0.339361292804925, + "flos": 627093964800.0, + "grad_norm": 0.07477333876977248, + "language_loss": 0.78203613, + "learning_rate": 0.00076934281435337, + "loss": 0.79342294, + "num_input_tokens_seen": 145617136, + "router_z_loss_mlp": 0.41040039, + "step": 1764, + "time_per_iteration": 2.7213284969329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131547, + "balance_loss_mlp": 1.08922768, + "epoch": 0.33955367449018853, + "flos": 609600453120.0, + "grad_norm": 0.0661700543843282, + "language_loss": 0.86476332, + "learning_rate": 0.0007690802855558658, + "loss": 0.87607884, + "num_input_tokens_seen": 145696416, + "router_z_loss_mlp": 0.4230957, + "step": 1765, + "time_per_iteration": 2.8648691177368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144213, + "balance_loss_mlp": 1.12981212, + "epoch": 0.3397460561754521, + "flos": 1453310346240.0, + "grad_norm": 0.0393682164062729, + "language_loss": 0.76374954, + "learning_rate": 0.0007688176522921302, + "loss": 0.77519166, + "num_input_tokens_seen": 145919680, + "router_z_loss_mlp": 0.14355469, + "step": 1766, + "time_per_iteration": 4.883134603500366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138855, + "balance_loss_mlp": 1.09441423, + "epoch": 0.33993843786071565, + "flos": 487312174080.0, + "grad_norm": 0.06478844738748038, + "language_loss": 0.89260793, + "learning_rate": 0.0007685549146641262, + "loss": 0.90399647, + "num_input_tokens_seen": 145984272, + "router_z_loss_mlp": 0.44458008, + "step": 1767, + "time_per_iteration": 2.5584475994110107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138308, + "balance_loss_mlp": 1.09780085, + "epoch": 0.34013081954597923, + "flos": 417338500608.0, + "grad_norm": 0.0552886410345199, + "language_loss": 0.8865279, + "learning_rate": 0.0007682920727738579, + "loss": 0.89791095, + "num_input_tokens_seen": 146047248, + "router_z_loss_mlp": 0.4050293, + "step": 1768, + "time_per_iteration": 2.462104558944702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113405, + "balance_loss_mlp": 1.09170651, + "epoch": 0.34032320123124277, + "flos": 437520185856.0, + "grad_norm": 0.07550967393636049, + "language_loss": 0.84987569, + "learning_rate": 0.000768029126723369, + "loss": 0.86121619, + "num_input_tokens_seen": 146111872, + "router_z_loss_mlp": 0.42333984, + "step": 1769, + "time_per_iteration": 2.5362985134124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134777, + "balance_loss_mlp": 1.09360242, + "epoch": 0.34051558291650635, + "flos": 457590643200.0, + "grad_norm": 0.0745429404709064, + "language_loss": 0.82167029, + "learning_rate": 0.0007677660766147447, + "loss": 0.83301806, + "num_input_tokens_seen": 146172608, + "router_z_loss_mlp": 0.41186523, + "step": 1770, + "time_per_iteration": 2.516824960708618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079488, + "balance_loss_mlp": 1.06356168, + "epoch": 0.3407079646017699, + "flos": 1558849204224.0, + "grad_norm": 0.02503514207226814, + "language_loss": 0.72470945, + "learning_rate": 0.0007675029225501102, + "loss": 0.73550433, + "num_input_tokens_seen": 146413584, + "router_z_loss_mlp": 0.15917969, + "step": 1771, + "time_per_iteration": 4.943475008010864 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137395, + "balance_loss_mlp": 1.09543359, + "epoch": 0.3409003462870335, + "flos": 492555271680.0, + "grad_norm": 0.06960763795190199, + "language_loss": 0.80136019, + "learning_rate": 0.0007672396646316306, + "loss": 0.81273413, + "num_input_tokens_seen": 146476992, + "router_z_loss_mlp": 0.41918945, + "step": 1772, + "time_per_iteration": 2.5425803661346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145424, + "balance_loss_mlp": 1.10341442, + "epoch": 0.34109272797229706, + "flos": 808479512064.0, + "grad_norm": 0.05748114386543088, + "language_loss": 0.80760133, + "learning_rate": 0.000766976302961512, + "loss": 0.81905556, + "num_input_tokens_seen": 146552848, + "router_z_loss_mlp": 0.42041016, + "step": 1773, + "time_per_iteration": 2.982287645339966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155937, + "balance_loss_mlp": 1.11330807, + "epoch": 0.3412851096575606, + "flos": 470142434304.0, + "grad_norm": 0.06912006035569716, + "language_loss": 0.81549138, + "learning_rate": 0.0007667128376420003, + "loss": 0.82705075, + "num_input_tokens_seen": 146617504, + "router_z_loss_mlp": 0.42626953, + "step": 1774, + "time_per_iteration": 2.5396063327789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151156, + "balance_loss_mlp": 1.10926604, + "epoch": 0.3414774913428242, + "flos": 595675026432.0, + "grad_norm": 0.07768471353958366, + "language_loss": 0.84963071, + "learning_rate": 0.0007664492687753817, + "loss": 0.86114228, + "num_input_tokens_seen": 146691568, + "router_z_loss_mlp": 0.41894531, + "step": 1775, + "time_per_iteration": 2.7326042652130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139013, + "balance_loss_mlp": 1.09845805, + "epoch": 0.3416698730280877, + "flos": 527463000576.0, + "grad_norm": 0.10552495092435867, + "language_loss": 0.81927752, + "learning_rate": 0.000766185596463983, + "loss": 0.83066773, + "num_input_tokens_seen": 146764208, + "router_z_loss_mlp": 0.40551758, + "step": 1776, + "time_per_iteration": 2.622465133666992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126657, + "balance_loss_mlp": 1.08455205, + "epoch": 0.3418622547133513, + "flos": 874640623104.0, + "grad_norm": 0.06005887645947995, + "language_loss": 0.77224028, + "learning_rate": 0.0007659218208101706, + "loss": 0.78350687, + "num_input_tokens_seen": 146847744, + "router_z_loss_mlp": 0.42114258, + "step": 1777, + "time_per_iteration": 3.099862575531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124902, + "balance_loss_mlp": 1.0852288, + "epoch": 0.34205463639861483, + "flos": 603744689664.0, + "grad_norm": 0.057585659974550854, + "language_loss": 0.85272229, + "learning_rate": 0.0007656579419163515, + "loss": 0.86397129, + "num_input_tokens_seen": 146918336, + "router_z_loss_mlp": 0.39672852, + "step": 1778, + "time_per_iteration": 2.7696709632873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129572, + "balance_loss_mlp": 1.08794475, + "epoch": 0.3422470180838784, + "flos": 463780090368.0, + "grad_norm": 0.07376046533358642, + "language_loss": 0.77272999, + "learning_rate": 0.0007653939598849724, + "loss": 0.78402567, + "num_input_tokens_seen": 146982496, + "router_z_loss_mlp": 0.41650391, + "step": 1779, + "time_per_iteration": 2.5601556301116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131616, + "balance_loss_mlp": 1.11511779, + "epoch": 0.34243939976914195, + "flos": 1586428416000.0, + "grad_norm": 0.05276839393693404, + "language_loss": 0.82880205, + "learning_rate": 0.0007651298748185204, + "loss": 0.84011823, + "num_input_tokens_seen": 147213600, + "router_z_loss_mlp": 0.16503906, + "step": 1780, + "time_per_iteration": 4.96061897277832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112473, + "balance_loss_mlp": 1.08267307, + "epoch": 0.34263178145440554, + "flos": 873017367552.0, + "grad_norm": 0.07129012841004771, + "language_loss": 0.80831903, + "learning_rate": 0.000764865686819522, + "loss": 0.81956631, + "num_input_tokens_seen": 147287664, + "router_z_loss_mlp": 0.4206543, + "step": 1781, + "time_per_iteration": 3.089735507965088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126433, + "balance_loss_mlp": 1.08492422, + "epoch": 0.3428241631396691, + "flos": 506878622208.0, + "grad_norm": 0.0622927262326037, + "language_loss": 0.86375809, + "learning_rate": 0.0007646013959905449, + "loss": 0.87502241, + "num_input_tokens_seen": 147356800, + "router_z_loss_mlp": 0.41503906, + "step": 1782, + "time_per_iteration": 2.6112704277038574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123127, + "balance_loss_mlp": 1.08130884, + "epoch": 0.34301654482493266, + "flos": 880039365120.0, + "grad_norm": 0.10167310682771787, + "language_loss": 0.81018484, + "learning_rate": 0.0007643370024341949, + "loss": 0.82141614, + "num_input_tokens_seen": 147432496, + "router_z_loss_mlp": 0.41821289, + "step": 1783, + "time_per_iteration": 3.1074132919311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115046, + "balance_loss_mlp": 1.07563567, + "epoch": 0.34320892651019624, + "flos": 431763167232.0, + "grad_norm": 0.057781870331099924, + "language_loss": 0.83518296, + "learning_rate": 0.0007640725062531195, + "loss": 0.84633338, + "num_input_tokens_seen": 147495856, + "router_z_loss_mlp": 0.39404297, + "step": 1784, + "time_per_iteration": 2.491313934326172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112121, + "balance_loss_mlp": 1.07228112, + "epoch": 0.3434013081954598, + "flos": 463641698304.0, + "grad_norm": 0.12476428026998775, + "language_loss": 0.86600161, + "learning_rate": 0.0007638079075500047, + "loss": 0.87712288, + "num_input_tokens_seen": 147559632, + "router_z_loss_mlp": 0.39819336, + "step": 1785, + "time_per_iteration": 2.5236706733703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070785, + "balance_loss_mlp": 1.05457258, + "epoch": 0.34359368988072336, + "flos": 1557332034048.0, + "grad_norm": 0.032988320908807454, + "language_loss": 0.75180668, + "learning_rate": 0.0007635432064275772, + "loss": 0.76251453, + "num_input_tokens_seen": 147794576, + "router_z_loss_mlp": 0.16210938, + "step": 1786, + "time_per_iteration": 4.938300609588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011343, + "balance_loss_mlp": 1.09274352, + "epoch": 0.3437860715659869, + "flos": 495527569920.0, + "grad_norm": 0.06899034270313556, + "language_loss": 0.83409935, + "learning_rate": 0.0007632784029886026, + "loss": 0.84544241, + "num_input_tokens_seen": 147866960, + "router_z_loss_mlp": 0.41552734, + "step": 1787, + "time_per_iteration": 2.6218347549438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140121, + "balance_loss_mlp": 1.09968519, + "epoch": 0.3439784532512505, + "flos": 718274594304.0, + "grad_norm": 0.05777013506444436, + "language_loss": 0.85674673, + "learning_rate": 0.0007630134973358873, + "loss": 0.86814797, + "num_input_tokens_seen": 147947808, + "router_z_loss_mlp": 0.40429688, + "step": 1788, + "time_per_iteration": 2.9675180912017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156957, + "balance_loss_mlp": 1.11780846, + "epoch": 0.34417083493651407, + "flos": 565862091264.0, + "grad_norm": 0.11323624876812292, + "language_loss": 0.86969185, + "learning_rate": 0.0007627484895722763, + "loss": 0.88126147, + "num_input_tokens_seen": 148015936, + "router_z_loss_mlp": 0.39160156, + "step": 1789, + "time_per_iteration": 2.6400198936462402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164783, + "balance_loss_mlp": 1.1222018, + "epoch": 0.3443632166217776, + "flos": 796330414080.0, + "grad_norm": 0.06957715435201431, + "language_loss": 0.80509681, + "learning_rate": 0.0007624833798006552, + "loss": 0.81674469, + "num_input_tokens_seen": 148099776, + "router_z_loss_mlp": 0.42602539, + "step": 1790, + "time_per_iteration": 3.042621374130249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162497, + "balance_loss_mlp": 1.11924767, + "epoch": 0.3445555983070412, + "flos": 569313805824.0, + "grad_norm": 0.09367673394256656, + "language_loss": 0.84194326, + "learning_rate": 0.0007622181681239483, + "loss": 0.85356832, + "num_input_tokens_seen": 148169616, + "router_z_loss_mlp": 0.43261719, + "step": 1791, + "time_per_iteration": 2.642648220062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140416, + "balance_loss_mlp": 1.09907472, + "epoch": 0.3447479799923047, + "flos": 568814565888.0, + "grad_norm": 0.07487034842421487, + "language_loss": 0.84962463, + "learning_rate": 0.0007619528546451202, + "loss": 0.86102873, + "num_input_tokens_seen": 148247824, + "router_z_loss_mlp": 0.41333008, + "step": 1792, + "time_per_iteration": 2.8014347553253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130282, + "balance_loss_mlp": 1.08941662, + "epoch": 0.3449403616775683, + "flos": 967723863552.0, + "grad_norm": 0.05771787988130437, + "language_loss": 0.84187096, + "learning_rate": 0.0007616874394671745, + "loss": 0.85317373, + "num_input_tokens_seen": 148333040, + "router_z_loss_mlp": 0.40869141, + "step": 1793, + "time_per_iteration": 3.336076498031616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137422, + "balance_loss_mlp": 1.09276664, + "epoch": 0.34513274336283184, + "flos": 568607164416.0, + "grad_norm": 0.08239177777048284, + "language_loss": 0.85433841, + "learning_rate": 0.0007614219226931547, + "loss": 0.86571258, + "num_input_tokens_seen": 148401840, + "router_z_loss_mlp": 0.44677734, + "step": 1794, + "time_per_iteration": 2.6596035957336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136792, + "balance_loss_mlp": 1.0951401, + "epoch": 0.3453251250480954, + "flos": 460943612928.0, + "grad_norm": 0.06809904369873732, + "language_loss": 0.85092592, + "learning_rate": 0.0007611563044261435, + "loss": 0.86229378, + "num_input_tokens_seen": 148466576, + "router_z_loss_mlp": 0.41674805, + "step": 1795, + "time_per_iteration": 2.545440435409546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140576, + "balance_loss_mlp": 1.09601521, + "epoch": 0.34551750673335896, + "flos": 415621269504.0, + "grad_norm": 0.08865061616635866, + "language_loss": 0.8722235, + "learning_rate": 0.0007608905847692631, + "loss": 0.88362932, + "num_input_tokens_seen": 148530016, + "router_z_loss_mlp": 0.44555664, + "step": 1796, + "time_per_iteration": 2.471306800842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112492, + "balance_loss_mlp": 1.08486605, + "epoch": 0.34570988841862255, + "flos": 587854609920.0, + "grad_norm": 0.07442154430907115, + "language_loss": 0.86828166, + "learning_rate": 0.0007606247638256749, + "loss": 0.87953079, + "num_input_tokens_seen": 148610064, + "router_z_loss_mlp": 0.40039062, + "step": 1797, + "time_per_iteration": 2.8728272914886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045486, + "balance_loss_mlp": 1.03099036, + "epoch": 0.34590227010388613, + "flos": 1567694518272.0, + "grad_norm": 0.022391201486326673, + "language_loss": 0.78170294, + "learning_rate": 0.0007603588416985798, + "loss": 0.79215777, + "num_input_tokens_seen": 148835872, + "router_z_loss_mlp": 0.14453125, + "step": 1798, + "time_per_iteration": 4.99533486366272 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036587, + "balance_loss_mlp": 1.0224725, + "epoch": 0.34609465178914967, + "flos": 1537743564288.0, + "grad_norm": 0.020693498138200886, + "language_loss": 0.79327202, + "learning_rate": 0.0007600928184912179, + "loss": 0.80363786, + "num_input_tokens_seen": 149066864, + "router_z_loss_mlp": 0.14160156, + "step": 1799, + "time_per_iteration": 4.871920347213745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131321, + "balance_loss_mlp": 1.086761, + "epoch": 0.34628703347441325, + "flos": 609363316224.0, + "grad_norm": 0.06425687332848114, + "language_loss": 0.8622126, + "learning_rate": 0.0007598266943068686, + "loss": 0.8735258, + "num_input_tokens_seen": 149141600, + "router_z_loss_mlp": 0.44555664, + "step": 1800, + "time_per_iteration": 2.7352967262268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128705, + "balance_loss_mlp": 1.0892942, + "epoch": 0.3464794151596768, + "flos": 473319936000.0, + "grad_norm": 0.06122285990583016, + "language_loss": 0.84089196, + "learning_rate": 0.0007595604692488507, + "loss": 0.85217899, + "num_input_tokens_seen": 149205888, + "router_z_loss_mlp": 0.39404297, + "step": 1801, + "time_per_iteration": 2.520047664642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145052, + "balance_loss_mlp": 1.10182643, + "epoch": 0.34667179684494037, + "flos": 605681805312.0, + "grad_norm": 0.08959882775364528, + "language_loss": 0.83156121, + "learning_rate": 0.0007592941434205215, + "loss": 0.84301168, + "num_input_tokens_seen": 149281280, + "router_z_loss_mlp": 0.43237305, + "step": 1802, + "time_per_iteration": 2.774533987045288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102388, + "balance_loss_mlp": 1.01191127, + "epoch": 0.3468641785302039, + "flos": 1564912369152.0, + "grad_norm": 0.0173366039721641, + "language_loss": 0.73571062, + "learning_rate": 0.0007590277169252782, + "loss": 0.74594939, + "num_input_tokens_seen": 149525008, + "router_z_loss_mlp": 0.11962891, + "step": 1803, + "time_per_iteration": 5.441190004348755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130945, + "balance_loss_mlp": 1.08481145, + "epoch": 0.3470565602154675, + "flos": 907265442816.0, + "grad_norm": 0.07392614166366455, + "language_loss": 0.80754089, + "learning_rate": 0.0007587611898665566, + "loss": 0.81885034, + "num_input_tokens_seen": 149600624, + "router_z_loss_mlp": 0.4609375, + "step": 1804, + "time_per_iteration": 3.0738565921783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126694, + "balance_loss_mlp": 1.08320653, + "epoch": 0.347248941900731, + "flos": 638902038528.0, + "grad_norm": 0.052717282161679486, + "language_loss": 0.82365519, + "learning_rate": 0.0007584945623478315, + "loss": 0.83492208, + "num_input_tokens_seen": 149674224, + "router_z_loss_mlp": 0.43530273, + "step": 1805, + "time_per_iteration": 2.810065269470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130527, + "balance_loss_mlp": 1.08773112, + "epoch": 0.3474413235859946, + "flos": 847362788352.0, + "grad_norm": 0.0654216117506123, + "language_loss": 0.81839657, + "learning_rate": 0.000758227834472617, + "loss": 0.8297019, + "num_input_tokens_seen": 149758688, + "router_z_loss_mlp": 0.42822266, + "step": 1806, + "time_per_iteration": 3.0400753021240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129234, + "balance_loss_mlp": 1.08631909, + "epoch": 0.3476337052712582, + "flos": 515654926848.0, + "grad_norm": 0.06780310502945991, + "language_loss": 0.77468187, + "learning_rate": 0.0007579610063444664, + "loss": 0.78597426, + "num_input_tokens_seen": 149831648, + "router_z_loss_mlp": 0.42895508, + "step": 1807, + "time_per_iteration": 2.720200538635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123608, + "balance_loss_mlp": 1.0805254, + "epoch": 0.34782608695652173, + "flos": 913551063552.0, + "grad_norm": 0.056464817781099026, + "language_loss": 0.87875664, + "learning_rate": 0.0007576940780669712, + "loss": 0.88999271, + "num_input_tokens_seen": 149919440, + "router_z_loss_mlp": 0.4309082, + "step": 1808, + "time_per_iteration": 3.1972455978393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119319, + "balance_loss_mlp": 1.07723832, + "epoch": 0.3480184686417853, + "flos": 773714944512.0, + "grad_norm": 0.06350201854913072, + "language_loss": 0.84194762, + "learning_rate": 0.0007574270497437624, + "loss": 0.85314083, + "num_input_tokens_seen": 150001632, + "router_z_loss_mlp": 0.42089844, + "step": 1809, + "time_per_iteration": 2.956308364868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112252, + "balance_loss_mlp": 1.08036816, + "epoch": 0.34821085032704885, + "flos": 576839812608.0, + "grad_norm": 0.05949268624371524, + "language_loss": 0.88030243, + "learning_rate": 0.000757159921478509, + "loss": 0.89152765, + "num_input_tokens_seen": 150077552, + "router_z_loss_mlp": 0.42138672, + "step": 1810, + "time_per_iteration": 2.7515318393707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058755, + "balance_loss_mlp": 1.04769194, + "epoch": 0.34840323201231244, + "flos": 1524947295744.0, + "grad_norm": 0.027450813841054106, + "language_loss": 0.74450636, + "learning_rate": 0.0007568926933749201, + "loss": 0.75509393, + "num_input_tokens_seen": 150295328, + "router_z_loss_mlp": 0.11083984, + "step": 1811, + "time_per_iteration": 4.719837427139282 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136641, + "balance_loss_mlp": 1.09272385, + "epoch": 0.34859561369757597, + "flos": 509164102656.0, + "grad_norm": 0.06099509375847796, + "language_loss": 0.87676752, + "learning_rate": 0.0007566253655367423, + "loss": 0.88813394, + "num_input_tokens_seen": 150360496, + "router_z_loss_mlp": 0.43896484, + "step": 1812, + "time_per_iteration": 2.6117310523986816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145498, + "balance_loss_mlp": 1.10196316, + "epoch": 0.34878799538283956, + "flos": 548662616064.0, + "grad_norm": 0.26075237363376164, + "language_loss": 0.90086293, + "learning_rate": 0.000756357938067762, + "loss": 0.91231787, + "num_input_tokens_seen": 150432064, + "router_z_loss_mlp": 0.43554688, + "step": 1813, + "time_per_iteration": 2.6537845134735107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137235, + "balance_loss_mlp": 1.09305573, + "epoch": 0.34898037706810314, + "flos": 983638536192.0, + "grad_norm": 0.07803772738029488, + "language_loss": 0.8299284, + "learning_rate": 0.0007560904110718033, + "loss": 0.84130079, + "num_input_tokens_seen": 150512176, + "router_z_loss_mlp": 0.44165039, + "step": 1814, + "time_per_iteration": 3.2229981422424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131299, + "balance_loss_mlp": 1.08549881, + "epoch": 0.3491727587533667, + "flos": 681605217792.0, + "grad_norm": 0.06602375994559181, + "language_loss": 0.83648008, + "learning_rate": 0.0007558227846527297, + "loss": 0.8477931, + "num_input_tokens_seen": 150586416, + "router_z_loss_mlp": 0.45751953, + "step": 1815, + "time_per_iteration": 2.8217966556549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137186, + "balance_loss_mlp": 1.09300709, + "epoch": 0.34936514043863026, + "flos": 394026301440.0, + "grad_norm": 0.06552880481969095, + "language_loss": 0.83563447, + "learning_rate": 0.0007555550589144429, + "loss": 0.84700632, + "num_input_tokens_seen": 150648944, + "router_z_loss_mlp": 0.44189453, + "step": 1816, + "time_per_iteration": 2.4231276512145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148289, + "balance_loss_mlp": 1.1026082, + "epoch": 0.3495575221238938, + "flos": 461363558400.0, + "grad_norm": 0.05960251663438414, + "language_loss": 0.84705317, + "learning_rate": 0.000755287233960883, + "loss": 0.85853606, + "num_input_tokens_seen": 150717200, + "router_z_loss_mlp": 0.45678711, + "step": 1817, + "time_per_iteration": 2.5598244667053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148754, + "balance_loss_mlp": 1.10297787, + "epoch": 0.3497499038091574, + "flos": 724172576256.0, + "grad_norm": 0.06564730471203778, + "language_loss": 0.78051704, + "learning_rate": 0.0007550193098960292, + "loss": 0.79200459, + "num_input_tokens_seen": 150790368, + "router_z_loss_mlp": 0.45751953, + "step": 1818, + "time_per_iteration": 2.8570642471313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115418, + "balance_loss_mlp": 1.11033523, + "epoch": 0.3499422854944209, + "flos": 827729528832.0, + "grad_norm": 0.05538445579726575, + "language_loss": 0.8654325, + "learning_rate": 0.0007547512868238988, + "loss": 0.87697428, + "num_input_tokens_seen": 150879872, + "router_z_loss_mlp": 0.43847656, + "step": 1819, + "time_per_iteration": 3.1437833309173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170578, + "balance_loss_mlp": 1.12499213, + "epoch": 0.3501346671796845, + "flos": 493479226368.0, + "grad_norm": 0.0822966351911203, + "language_loss": 0.83893883, + "learning_rate": 0.0007544831648485473, + "loss": 0.85064459, + "num_input_tokens_seen": 150953712, + "router_z_loss_mlp": 0.45605469, + "step": 1820, + "time_per_iteration": 2.660233736038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162235, + "balance_loss_mlp": 1.11684048, + "epoch": 0.35032704886494803, + "flos": 578752335360.0, + "grad_norm": 0.06443547558053964, + "language_loss": 0.81439716, + "learning_rate": 0.0007542149440740694, + "loss": 0.82601953, + "num_input_tokens_seen": 151026192, + "router_z_loss_mlp": 0.45385742, + "step": 1821, + "time_per_iteration": 2.6618528366088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154684, + "balance_loss_mlp": 1.10938418, + "epoch": 0.3505194305502116, + "flos": 584672338944.0, + "grad_norm": 0.06960442221541481, + "language_loss": 0.86201102, + "learning_rate": 0.000753946624604597, + "loss": 0.87355781, + "num_input_tokens_seen": 151100720, + "router_z_loss_mlp": 0.45288086, + "step": 1822, + "time_per_iteration": 2.7180583477020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138697, + "balance_loss_mlp": 1.09466076, + "epoch": 0.3507118122354752, + "flos": 526958991360.0, + "grad_norm": 0.11840223630221765, + "language_loss": 0.88456279, + "learning_rate": 0.0007536782065443015, + "loss": 0.89594972, + "num_input_tokens_seen": 151166032, + "router_z_loss_mlp": 0.44042969, + "step": 1823, + "time_per_iteration": 2.6035680770874023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147734, + "balance_loss_mlp": 1.1024822, + "epoch": 0.35090419392073874, + "flos": 511523735040.0, + "grad_norm": 0.08971754998357863, + "language_loss": 0.75357497, + "learning_rate": 0.0007534096899973919, + "loss": 0.76505232, + "num_input_tokens_seen": 151232208, + "router_z_loss_mlp": 0.45263672, + "step": 1824, + "time_per_iteration": 2.592313528060913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136405, + "balance_loss_mlp": 1.095397, + "epoch": 0.3510965756060023, + "flos": 564021149184.0, + "grad_norm": 0.056380284358423516, + "language_loss": 0.8296026, + "learning_rate": 0.0007531410750681154, + "loss": 0.84096658, + "num_input_tokens_seen": 151308128, + "router_z_loss_mlp": 0.41015625, + "step": 1825, + "time_per_iteration": 2.7599031925201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149352, + "balance_loss_mlp": 1.10710466, + "epoch": 0.35128895729126586, + "flos": 1020535137792.0, + "grad_norm": 0.06329210930184016, + "language_loss": 0.8686763, + "learning_rate": 0.0007528723618607575, + "loss": 0.88016987, + "num_input_tokens_seen": 151402560, + "router_z_loss_mlp": 0.42236328, + "step": 1826, + "time_per_iteration": 3.423145055770874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156709, + "balance_loss_mlp": 1.11808527, + "epoch": 0.35148133897652944, + "flos": 588262445568.0, + "grad_norm": 0.05752886424443174, + "language_loss": 0.8293525, + "learning_rate": 0.0007526035504796422, + "loss": 0.84091961, + "num_input_tokens_seen": 151478816, + "router_z_loss_mlp": 0.38598633, + "step": 1827, + "time_per_iteration": 2.774202346801758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164193, + "balance_loss_mlp": 1.12080038, + "epoch": 0.351673720661793, + "flos": 495300344832.0, + "grad_norm": 0.08334994788856638, + "language_loss": 0.87348354, + "learning_rate": 0.0007523346410291312, + "loss": 0.8851254, + "num_input_tokens_seen": 151554528, + "router_z_loss_mlp": 0.43408203, + "step": 1828, + "time_per_iteration": 2.7933921813964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172191, + "balance_loss_mlp": 1.13127816, + "epoch": 0.35186610234705656, + "flos": 762670411776.0, + "grad_norm": 0.05847449829546615, + "language_loss": 0.85163879, + "learning_rate": 0.0007520656336136245, + "loss": 0.86336064, + "num_input_tokens_seen": 151629440, + "router_z_loss_mlp": 0.40942383, + "step": 1829, + "time_per_iteration": 2.9654810428619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167386, + "balance_loss_mlp": 1.12675905, + "epoch": 0.3520584840323201, + "flos": 626135132160.0, + "grad_norm": 0.06508844853371867, + "language_loss": 0.88540596, + "learning_rate": 0.0007517965283375599, + "loss": 0.89707983, + "num_input_tokens_seen": 151708544, + "router_z_loss_mlp": 0.40625, + "step": 1830, + "time_per_iteration": 2.833653211593628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161789, + "balance_loss_mlp": 1.12078059, + "epoch": 0.3522508657175837, + "flos": 537388286976.0, + "grad_norm": 0.05306701185260888, + "language_loss": 0.89636958, + "learning_rate": 0.0007515273253054132, + "loss": 0.90798748, + "num_input_tokens_seen": 151779152, + "router_z_loss_mlp": 0.41015625, + "step": 1831, + "time_per_iteration": 2.648688554763794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162371, + "balance_loss_mlp": 1.11788237, + "epoch": 0.35244324740284727, + "flos": 567384030720.0, + "grad_norm": 0.060637132075448665, + "language_loss": 0.8317945, + "learning_rate": 0.0007512580246216988, + "loss": 0.84341824, + "num_input_tokens_seen": 151853216, + "router_z_loss_mlp": 0.44482422, + "step": 1832, + "time_per_iteration": 2.695558786392212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152178, + "balance_loss_mlp": 1.11288619, + "epoch": 0.3526356290881108, + "flos": 513058157568.0, + "grad_norm": 0.06652239867864222, + "language_loss": 0.8520152, + "learning_rate": 0.000750988626390968, + "loss": 0.86353695, + "num_input_tokens_seen": 151920416, + "router_z_loss_mlp": 0.39306641, + "step": 1833, + "time_per_iteration": 2.5903215408325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114923, + "balance_loss_mlp": 1.10810232, + "epoch": 0.3528280107733744, + "flos": 595791023616.0, + "grad_norm": 0.05520517467567221, + "language_loss": 0.85274744, + "learning_rate": 0.0007507191307178108, + "loss": 0.86423969, + "num_input_tokens_seen": 151990848, + "router_z_loss_mlp": 0.41137695, + "step": 1834, + "time_per_iteration": 2.7567453384399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132557, + "balance_loss_mlp": 1.0890696, + "epoch": 0.3530203924586379, + "flos": 551234792448.0, + "grad_norm": 0.06897138795442613, + "language_loss": 0.75032014, + "learning_rate": 0.0007504495377068543, + "loss": 0.76164567, + "num_input_tokens_seen": 152064864, + "router_z_loss_mlp": 0.43481445, + "step": 1835, + "time_per_iteration": 2.7309370040893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134622, + "balance_loss_mlp": 1.08972788, + "epoch": 0.3532127741439015, + "flos": 652990450176.0, + "grad_norm": 0.09099083327189633, + "language_loss": 0.81936944, + "learning_rate": 0.0007501798474627642, + "loss": 0.8307156, + "num_input_tokens_seen": 152150096, + "router_z_loss_mlp": 0.44873047, + "step": 1836, + "time_per_iteration": 2.9126806259155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113171, + "balance_loss_mlp": 1.08853245, + "epoch": 0.35340515582916504, + "flos": 722791226880.0, + "grad_norm": 0.058808043239055564, + "language_loss": 0.8375026, + "learning_rate": 0.0007499100600902433, + "loss": 0.84881973, + "num_input_tokens_seen": 152232528, + "router_z_loss_mlp": 0.43164062, + "step": 1837, + "time_per_iteration": 2.9810633659362793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124171, + "balance_loss_mlp": 1.08118403, + "epoch": 0.35359753751442863, + "flos": 594894233088.0, + "grad_norm": 0.08552727697149294, + "language_loss": 0.8450433, + "learning_rate": 0.0007496401756940324, + "loss": 0.85628498, + "num_input_tokens_seen": 152299584, + "router_z_loss_mlp": 0.43017578, + "step": 1838, + "time_per_iteration": 2.670412540435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130914, + "balance_loss_mlp": 1.08897638, + "epoch": 0.3537899191996922, + "flos": 632668174848.0, + "grad_norm": 0.06964876492363449, + "language_loss": 0.82608843, + "learning_rate": 0.0007493701943789098, + "loss": 0.83739758, + "num_input_tokens_seen": 152370368, + "router_z_loss_mlp": 0.41967773, + "step": 1839, + "time_per_iteration": 2.772620677947998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113855, + "balance_loss_mlp": 1.09537208, + "epoch": 0.35398230088495575, + "flos": 506364701184.0, + "grad_norm": 0.07045943234490067, + "language_loss": 0.83116889, + "learning_rate": 0.000749100116249692, + "loss": 0.84255433, + "num_input_tokens_seen": 152436928, + "router_z_loss_mlp": 0.43188477, + "step": 1840, + "time_per_iteration": 2.6031582355499268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144616, + "balance_loss_mlp": 1.10110414, + "epoch": 0.35417468257021933, + "flos": 508034944512.0, + "grad_norm": 0.08424265710124153, + "language_loss": 0.86582088, + "learning_rate": 0.0007488299414112321, + "loss": 0.87726706, + "num_input_tokens_seen": 152505952, + "router_z_loss_mlp": 0.43505859, + "step": 1841, + "time_per_iteration": 2.5864784717559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151277, + "balance_loss_mlp": 1.10726476, + "epoch": 0.35436706425548287, + "flos": 656437395456.0, + "grad_norm": 0.058600000923872894, + "language_loss": 0.77847576, + "learning_rate": 0.0007485596699684215, + "loss": 0.78998852, + "num_input_tokens_seen": 152577408, + "router_z_loss_mlp": 0.43994141, + "step": 1842, + "time_per_iteration": 2.8149642944335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156484, + "balance_loss_mlp": 1.11266279, + "epoch": 0.35455944594074645, + "flos": 652634744832.0, + "grad_norm": 0.055073821734726955, + "language_loss": 0.85694617, + "learning_rate": 0.000748289302026189, + "loss": 0.86851102, + "num_input_tokens_seen": 152654480, + "router_z_loss_mlp": 0.43823242, + "step": 1843, + "time_per_iteration": 2.8475751876831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158372, + "balance_loss_mlp": 1.11688685, + "epoch": 0.35475182762601, + "flos": 848593262592.0, + "grad_norm": 0.057565803102883874, + "language_loss": 0.85718876, + "learning_rate": 0.0007480188376895004, + "loss": 0.86877251, + "num_input_tokens_seen": 152732304, + "router_z_loss_mlp": 0.41479492, + "step": 1844, + "time_per_iteration": 3.0344529151916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140478, + "balance_loss_mlp": 1.12693632, + "epoch": 0.3549442093112736, + "flos": 1521468043776.0, + "grad_norm": 0.05127204690943662, + "language_loss": 0.7381134, + "learning_rate": 0.0007477482770633596, + "loss": 0.74951822, + "num_input_tokens_seen": 152965952, + "router_z_loss_mlp": 0.13574219, + "step": 1845, + "time_per_iteration": 4.8589537143707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176931, + "balance_loss_mlp": 1.13518405, + "epoch": 0.3551365909965371, + "flos": 651411611136.0, + "grad_norm": 0.08988090291235612, + "language_loss": 0.78641856, + "learning_rate": 0.0007474776202528074, + "loss": 0.79818785, + "num_input_tokens_seen": 153053088, + "router_z_loss_mlp": 0.41772461, + "step": 1846, + "time_per_iteration": 2.9269866943359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184559, + "balance_loss_mlp": 1.14243031, + "epoch": 0.3553289726818007, + "flos": 897458724864.0, + "grad_norm": 0.08000045078310114, + "language_loss": 0.81513619, + "learning_rate": 0.000747206867362922, + "loss": 0.82698178, + "num_input_tokens_seen": 153129216, + "router_z_loss_mlp": 0.42114258, + "step": 1847, + "time_per_iteration": 3.067870616912842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169442, + "balance_loss_mlp": 1.12573957, + "epoch": 0.3555213543670643, + "flos": 688491394560.0, + "grad_norm": 0.0760432300690223, + "language_loss": 0.84328806, + "learning_rate": 0.0007469360184988194, + "loss": 0.85498255, + "num_input_tokens_seen": 153199360, + "router_z_loss_mlp": 0.43701172, + "step": 1848, + "time_per_iteration": 2.8130369186401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159569, + "balance_loss_mlp": 1.11837053, + "epoch": 0.3557137360523278, + "flos": 538564432896.0, + "grad_norm": 0.08168000095068725, + "language_loss": 0.86707914, + "learning_rate": 0.0007466650737656518, + "loss": 0.87867486, + "num_input_tokens_seen": 153269168, + "router_z_loss_mlp": 0.41162109, + "step": 1849, + "time_per_iteration": 2.592503309249878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115621, + "balance_loss_mlp": 1.11324644, + "epoch": 0.3559061177375914, + "flos": 402261520896.0, + "grad_norm": 0.06757272046168854, + "language_loss": 0.89898217, + "learning_rate": 0.0007463940332686098, + "loss": 0.91054422, + "num_input_tokens_seen": 153333120, + "router_z_loss_mlp": 0.42944336, + "step": 1850, + "time_per_iteration": 2.4776744842529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148398, + "balance_loss_mlp": 1.10607898, + "epoch": 0.35609849942285493, + "flos": 696568398336.0, + "grad_norm": 0.05922624538442341, + "language_loss": 0.84461212, + "learning_rate": 0.0007461228971129205, + "loss": 0.85609609, + "num_input_tokens_seen": 153407600, + "router_z_loss_mlp": 0.42358398, + "step": 1851, + "time_per_iteration": 2.9012656211853027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154694, + "balance_loss_mlp": 1.11387658, + "epoch": 0.3562908811081185, + "flos": 568928365056.0, + "grad_norm": 0.058626739978073765, + "language_loss": 0.85743707, + "learning_rate": 0.0007458516654038483, + "loss": 0.86898398, + "num_input_tokens_seen": 153477408, + "router_z_loss_mlp": 0.40820312, + "step": 1852, + "time_per_iteration": 2.666947603225708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165665, + "balance_loss_mlp": 1.12160563, + "epoch": 0.35648326279338205, + "flos": 682386011136.0, + "grad_norm": 0.06798765543406252, + "language_loss": 0.86475062, + "learning_rate": 0.0007455803382466946, + "loss": 0.87640727, + "num_input_tokens_seen": 153551888, + "router_z_loss_mlp": 0.44042969, + "step": 1853, + "time_per_iteration": 2.804776191711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162987, + "balance_loss_mlp": 1.11985719, + "epoch": 0.35667564447864564, + "flos": 629139737088.0, + "grad_norm": 0.07311152518110202, + "language_loss": 0.87308323, + "learning_rate": 0.0007453089157467979, + "loss": 0.88471317, + "num_input_tokens_seen": 153626912, + "router_z_loss_mlp": 0.43139648, + "step": 1854, + "time_per_iteration": 2.8038864135742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159292, + "balance_loss_mlp": 1.1161381, + "epoch": 0.35686802616390917, + "flos": 814048579584.0, + "grad_norm": 0.06621845487790666, + "language_loss": 0.82129812, + "learning_rate": 0.0007450373980095341, + "loss": 0.83289105, + "num_input_tokens_seen": 153711312, + "router_z_loss_mlp": 0.43164062, + "step": 1855, + "time_per_iteration": 3.0980496406555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154286, + "balance_loss_mlp": 1.11268187, + "epoch": 0.35706040784917276, + "flos": 526178198016.0, + "grad_norm": 0.05908088829108725, + "language_loss": 0.87076378, + "learning_rate": 0.0007447657851403155, + "loss": 0.88230669, + "num_input_tokens_seen": 153780208, + "router_z_loss_mlp": 0.41601562, + "step": 1856, + "time_per_iteration": 2.6393351554870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148054, + "balance_loss_mlp": 1.10609269, + "epoch": 0.35725278953443634, + "flos": 511970844672.0, + "grad_norm": 0.07116077808597938, + "language_loss": 0.79415643, + "learning_rate": 0.0007444940772445915, + "loss": 0.805637, + "num_input_tokens_seen": 153853152, + "router_z_loss_mlp": 0.41943359, + "step": 1857, + "time_per_iteration": 2.7049038410186768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148544, + "balance_loss_mlp": 1.10770321, + "epoch": 0.3574451712196999, + "flos": 487428171264.0, + "grad_norm": 0.06303496934817837, + "language_loss": 0.80443203, + "learning_rate": 0.0007442222744278484, + "loss": 0.81591749, + "num_input_tokens_seen": 153924160, + "router_z_loss_mlp": 0.40844727, + "step": 1858, + "time_per_iteration": 2.6416029930114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140624, + "balance_loss_mlp": 1.10056937, + "epoch": 0.35763755290496346, + "flos": 550671312384.0, + "grad_norm": 0.06290523981550739, + "language_loss": 0.84690839, + "learning_rate": 0.0007439503767956099, + "loss": 0.85831463, + "num_input_tokens_seen": 153998688, + "router_z_loss_mlp": 0.40063477, + "step": 1859, + "time_per_iteration": 2.697295665740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095769, + "balance_loss_mlp": 1.08213139, + "epoch": 0.357829934590227, + "flos": 1504083561984.0, + "grad_norm": 0.02707100394521806, + "language_loss": 0.79671603, + "learning_rate": 0.0007436783844534352, + "loss": 0.80767375, + "num_input_tokens_seen": 154230960, + "router_z_loss_mlp": 0.13671875, + "step": 1860, + "time_per_iteration": 4.896381139755249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157881, + "balance_loss_mlp": 1.11744571, + "epoch": 0.3580223162754906, + "flos": 568695997440.0, + "grad_norm": 0.054355964588402354, + "language_loss": 0.86204398, + "learning_rate": 0.000743406297506922, + "loss": 0.87362283, + "num_input_tokens_seen": 154309104, + "router_z_loss_mlp": 0.40478516, + "step": 1861, + "time_per_iteration": 2.7121450901031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154988, + "balance_loss_mlp": 1.11362243, + "epoch": 0.3582146979607541, + "flos": 626473585152.0, + "grad_norm": 0.056412092641732435, + "language_loss": 0.8442747, + "learning_rate": 0.0007431341160617031, + "loss": 0.85582459, + "num_input_tokens_seen": 154387424, + "router_z_loss_mlp": 0.41381836, + "step": 1862, + "time_per_iteration": 2.902806520462036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172251, + "balance_loss_mlp": 1.13052833, + "epoch": 0.3584070796460177, + "flos": 507271403520.0, + "grad_norm": 0.06986467819319542, + "language_loss": 0.88734752, + "learning_rate": 0.0007428618402234491, + "loss": 0.89907002, + "num_input_tokens_seen": 154459952, + "router_z_loss_mlp": 0.41723633, + "step": 1863, + "time_per_iteration": 2.644352436065674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159657, + "balance_loss_mlp": 1.11831546, + "epoch": 0.3585994613312813, + "flos": 606479851008.0, + "grad_norm": 0.06293448628505635, + "language_loss": 0.8061077, + "learning_rate": 0.0007425894700978668, + "loss": 0.81770432, + "num_input_tokens_seen": 154535456, + "router_z_loss_mlp": 0.41357422, + "step": 1864, + "time_per_iteration": 2.782757043838501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148168, + "balance_loss_mlp": 1.10699308, + "epoch": 0.3587918430165448, + "flos": 1412886799872.0, + "grad_norm": 0.056888458094662434, + "language_loss": 0.79858804, + "learning_rate": 0.0007423170057906996, + "loss": 0.81006974, + "num_input_tokens_seen": 154627568, + "router_z_loss_mlp": 0.41162109, + "step": 1865, + "time_per_iteration": 3.848773956298828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133926, + "balance_loss_mlp": 1.09391952, + "epoch": 0.3589842247018084, + "flos": 478553121792.0, + "grad_norm": 0.06447904861600703, + "language_loss": 0.86500657, + "learning_rate": 0.0007420444474077275, + "loss": 0.87634581, + "num_input_tokens_seen": 154694640, + "router_z_loss_mlp": 0.40014648, + "step": 1866, + "time_per_iteration": 2.542572498321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126566, + "balance_loss_mlp": 1.0855341, + "epoch": 0.35917660638707194, + "flos": 504711710208.0, + "grad_norm": 0.07300351460408123, + "language_loss": 0.8986578, + "learning_rate": 0.0007417717950547671, + "loss": 0.90992349, + "num_input_tokens_seen": 154762048, + "router_z_loss_mlp": 0.41040039, + "step": 1867, + "time_per_iteration": 2.5633254051208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073925, + "balance_loss_mlp": 1.06143153, + "epoch": 0.3593689880723355, + "flos": 1492129382400.0, + "grad_norm": 0.026482390846264015, + "language_loss": 0.75996608, + "learning_rate": 0.0007414990488376713, + "loss": 0.77070534, + "num_input_tokens_seen": 154989952, + "router_z_loss_mlp": 0.125, + "step": 1868, + "time_per_iteration": 4.904905557632446 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111694, + "balance_loss_mlp": 1.07345176, + "epoch": 0.35956136975759906, + "flos": 528629234688.0, + "grad_norm": 0.053992922509511466, + "language_loss": 0.850173, + "learning_rate": 0.0007412262088623299, + "loss": 0.86128998, + "num_input_tokens_seen": 155066992, + "router_z_loss_mlp": 0.38232422, + "step": 1869, + "time_per_iteration": 2.7310874462127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110773, + "balance_loss_mlp": 1.07200575, + "epoch": 0.35975375144286265, + "flos": 534917426688.0, + "grad_norm": 0.08370102618564679, + "language_loss": 0.79675972, + "learning_rate": 0.0007409532752346684, + "loss": 0.80786741, + "num_input_tokens_seen": 155137616, + "router_z_loss_mlp": 0.38769531, + "step": 1870, + "time_per_iteration": 2.6629347801208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110981, + "balance_loss_mlp": 1.07166612, + "epoch": 0.3599461331281262, + "flos": 504941506560.0, + "grad_norm": 0.06403903481871269, + "language_loss": 0.88829064, + "learning_rate": 0.0007406802480606491, + "loss": 0.89940047, + "num_input_tokens_seen": 155209248, + "router_z_loss_mlp": 0.39306641, + "step": 1871, + "time_per_iteration": 2.6200008392333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107121, + "balance_loss_mlp": 1.06835461, + "epoch": 0.36013851481338977, + "flos": 511533646848.0, + "grad_norm": 0.0729370697679506, + "language_loss": 0.90798759, + "learning_rate": 0.0007404071274462707, + "loss": 0.9190588, + "num_input_tokens_seen": 155274176, + "router_z_loss_mlp": 0.38769531, + "step": 1872, + "time_per_iteration": 2.5693628787994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111805, + "balance_loss_mlp": 1.07978415, + "epoch": 0.36033089649865335, + "flos": 547590357504.0, + "grad_norm": 0.06627703814726228, + "language_loss": 0.84024733, + "learning_rate": 0.0007401339134975682, + "loss": 0.85142779, + "num_input_tokens_seen": 155343232, + "router_z_loss_mlp": 0.38208008, + "step": 1873, + "time_per_iteration": 2.7031140327453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127585, + "balance_loss_mlp": 1.08760262, + "epoch": 0.3605232781839169, + "flos": 458655561216.0, + "grad_norm": 0.06845959531373838, + "language_loss": 0.84298885, + "learning_rate": 0.0007398606063206122, + "loss": 0.85426462, + "num_input_tokens_seen": 155410080, + "router_z_loss_mlp": 0.39990234, + "step": 1874, + "time_per_iteration": 2.6090316772460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115185, + "balance_loss_mlp": 1.07598901, + "epoch": 0.36071565986918047, + "flos": 509559455232.0, + "grad_norm": 0.06521397848462201, + "language_loss": 0.78764814, + "learning_rate": 0.0007395872060215101, + "loss": 0.79879999, + "num_input_tokens_seen": 155476240, + "router_z_loss_mlp": 0.3918457, + "step": 1875, + "time_per_iteration": 2.620976448059082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122162, + "balance_loss_mlp": 1.0831089, + "epoch": 0.360908041554444, + "flos": 559195799040.0, + "grad_norm": 0.06345733178575377, + "language_loss": 0.88705117, + "learning_rate": 0.0007393137127064056, + "loss": 0.89827275, + "num_input_tokens_seen": 155543392, + "router_z_loss_mlp": 0.39013672, + "step": 1876, + "time_per_iteration": 2.7320597171783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125006, + "balance_loss_mlp": 1.08511841, + "epoch": 0.3611004232397076, + "flos": 523845729792.0, + "grad_norm": 0.056097062255587686, + "language_loss": 0.84576774, + "learning_rate": 0.0007390401264814779, + "loss": 0.85701776, + "num_input_tokens_seen": 155613264, + "router_z_loss_mlp": 0.39868164, + "step": 1877, + "time_per_iteration": 2.605865478515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123607, + "balance_loss_mlp": 1.08503079, + "epoch": 0.3612928049249711, + "flos": 540988305408.0, + "grad_norm": 0.06159732683880817, + "language_loss": 0.84937686, + "learning_rate": 0.0007387664474529427, + "loss": 0.86061299, + "num_input_tokens_seen": 155683712, + "router_z_loss_mlp": 0.38598633, + "step": 1878, + "time_per_iteration": 2.6548514366149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138034, + "balance_loss_mlp": 1.09750319, + "epoch": 0.3614851866102347, + "flos": 552556670976.0, + "grad_norm": 0.05796680079252983, + "language_loss": 0.91768891, + "learning_rate": 0.0007384926757270518, + "loss": 0.92906928, + "num_input_tokens_seen": 155751760, + "router_z_loss_mlp": 0.40527344, + "step": 1879, + "time_per_iteration": 2.6339149475097656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137039, + "balance_loss_mlp": 1.09791493, + "epoch": 0.36167756829549824, + "flos": 772071865344.0, + "grad_norm": 0.05405313293747941, + "language_loss": 0.79881001, + "learning_rate": 0.0007382188114100924, + "loss": 0.81018037, + "num_input_tokens_seen": 155830464, + "router_z_loss_mlp": 0.39111328, + "step": 1880, + "time_per_iteration": 2.983384132385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139197, + "balance_loss_mlp": 1.09964395, + "epoch": 0.36186994998076183, + "flos": 711885086208.0, + "grad_norm": 0.12141150358978081, + "language_loss": 0.82206392, + "learning_rate": 0.0007379448546083884, + "loss": 0.83345592, + "num_input_tokens_seen": 155906208, + "router_z_loss_mlp": 0.39575195, + "step": 1881, + "time_per_iteration": 2.9186532497406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140707, + "balance_loss_mlp": 1.10127282, + "epoch": 0.3620623316660254, + "flos": 747546444288.0, + "grad_norm": 0.06284373597557333, + "language_loss": 0.88377333, + "learning_rate": 0.0007376708054282992, + "loss": 0.8951804, + "num_input_tokens_seen": 155983584, + "router_z_loss_mlp": 0.39428711, + "step": 1882, + "time_per_iteration": 2.9895970821380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144635, + "balance_loss_mlp": 1.10605919, + "epoch": 0.36225471335128895, + "flos": 482555833344.0, + "grad_norm": 0.05224621202588268, + "language_loss": 0.84316945, + "learning_rate": 0.0007373966639762201, + "loss": 0.85461575, + "num_input_tokens_seen": 156052464, + "router_z_loss_mlp": 0.38574219, + "step": 1883, + "time_per_iteration": 2.623133659362793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147786, + "balance_loss_mlp": 1.10620606, + "epoch": 0.36244709503655254, + "flos": 506905786368.0, + "grad_norm": 0.06751899300287477, + "language_loss": 0.89170045, + "learning_rate": 0.0007371224303585822, + "loss": 0.90317833, + "num_input_tokens_seen": 156121424, + "router_z_loss_mlp": 0.41577148, + "step": 1884, + "time_per_iteration": 2.628394842147827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021984, + "balance_loss_mlp": 1.01154125, + "epoch": 0.36263947672181607, + "flos": 1394050466304.0, + "grad_norm": 0.007236456832270123, + "language_loss": 0.80357069, + "learning_rate": 0.0007368481046818524, + "loss": 0.8137905, + "num_input_tokens_seen": 156346144, + "router_z_loss_mlp": 0.10449219, + "step": 1885, + "time_per_iteration": 4.717620849609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114112, + "balance_loss_mlp": 1.10049307, + "epoch": 0.36283185840707965, + "flos": 653296969728.0, + "grad_norm": 0.057116908748179596, + "language_loss": 0.82560247, + "learning_rate": 0.0007365736870525335, + "loss": 0.83701366, + "num_input_tokens_seen": 156420880, + "router_z_loss_mlp": 0.40625, + "step": 1886, + "time_per_iteration": 2.8198611736297607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132227, + "balance_loss_mlp": 1.09310222, + "epoch": 0.3630242400923432, + "flos": 488863848960.0, + "grad_norm": 0.06530442713985495, + "language_loss": 0.83123338, + "learning_rate": 0.000736299177577164, + "loss": 0.84255564, + "num_input_tokens_seen": 156485616, + "router_z_loss_mlp": 0.39135742, + "step": 1887, + "time_per_iteration": 2.613863945007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128864, + "balance_loss_mlp": 1.08992994, + "epoch": 0.3632166217776068, + "flos": 517159613952.0, + "grad_norm": 0.0666501464088242, + "language_loss": 0.84363097, + "learning_rate": 0.0007360245763623174, + "loss": 0.85491955, + "num_input_tokens_seen": 156557840, + "router_z_loss_mlp": 0.3894043, + "step": 1888, + "time_per_iteration": 2.6378068923950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115221, + "balance_loss_mlp": 1.07702661, + "epoch": 0.36340900346287036, + "flos": 646173656064.0, + "grad_norm": 0.06993226621121658, + "language_loss": 0.90142351, + "learning_rate": 0.0007357498835146039, + "loss": 0.91257572, + "num_input_tokens_seen": 156632496, + "router_z_loss_mlp": 0.38183594, + "step": 1889, + "time_per_iteration": 2.8125081062316895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128167, + "balance_loss_mlp": 1.08878016, + "epoch": 0.3636013851481339, + "flos": 553327552512.0, + "grad_norm": 0.07359030033413445, + "language_loss": 0.87316656, + "learning_rate": 0.0007354750991406684, + "loss": 0.88444823, + "num_input_tokens_seen": 156705296, + "router_z_loss_mlp": 0.39379883, + "step": 1890, + "time_per_iteration": 2.714569568634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121285, + "balance_loss_mlp": 1.0807066, + "epoch": 0.3637937668333975, + "flos": 546653919744.0, + "grad_norm": 0.07836036923074335, + "language_loss": 0.80991101, + "learning_rate": 0.0007352002233471919, + "loss": 0.8211239, + "num_input_tokens_seen": 156773376, + "router_z_loss_mlp": 0.40576172, + "step": 1891, + "time_per_iteration": 2.6287412643432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121974, + "balance_loss_mlp": 1.08180022, + "epoch": 0.363986148518661, + "flos": 538112180736.0, + "grad_norm": 0.058839902089765785, + "language_loss": 0.79524523, + "learning_rate": 0.0007349252562408906, + "loss": 0.80646491, + "num_input_tokens_seen": 156844336, + "router_z_loss_mlp": 0.40161133, + "step": 1892, + "time_per_iteration": 2.669903039932251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125098, + "balance_loss_mlp": 1.08449531, + "epoch": 0.3641785302039246, + "flos": 660217651200.0, + "grad_norm": 0.057079030651025625, + "language_loss": 0.81590033, + "learning_rate": 0.0007346501979285158, + "loss": 0.8271513, + "num_input_tokens_seen": 156918848, + "router_z_loss_mlp": 0.40600586, + "step": 1893, + "time_per_iteration": 2.9146764278411865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083448, + "balance_loss_mlp": 1.07238543, + "epoch": 0.36437091188918813, + "flos": 1468743031296.0, + "grad_norm": 0.036364529291757694, + "language_loss": 0.80539101, + "learning_rate": 0.0007343750485168551, + "loss": 0.81622547, + "num_input_tokens_seen": 157134736, + "router_z_loss_mlp": 0.11083984, + "step": 1894, + "time_per_iteration": 4.784435272216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126267, + "balance_loss_mlp": 1.08444858, + "epoch": 0.3645632935744517, + "flos": 597298281984.0, + "grad_norm": 0.06549610472034906, + "language_loss": 0.86352968, + "learning_rate": 0.0007340998081127308, + "loss": 0.87479234, + "num_input_tokens_seen": 157211920, + "router_z_loss_mlp": 0.41796875, + "step": 1895, + "time_per_iteration": 2.7702367305755615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130662, + "balance_loss_mlp": 1.09113181, + "epoch": 0.36475567525971525, + "flos": 599509610496.0, + "grad_norm": 0.06520113052193731, + "language_loss": 0.91046786, + "learning_rate": 0.0007338244768230007, + "loss": 0.92177445, + "num_input_tokens_seen": 157284224, + "router_z_loss_mlp": 0.39550781, + "step": 1896, + "time_per_iteration": 2.7612760066986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133468, + "balance_loss_mlp": 1.09315181, + "epoch": 0.36494805694497884, + "flos": 798403350528.0, + "grad_norm": 0.058734972315737245, + "language_loss": 0.89108521, + "learning_rate": 0.0007335490547545578, + "loss": 0.90241992, + "num_input_tokens_seen": 157367920, + "router_z_loss_mlp": 0.40307617, + "step": 1897, + "time_per_iteration": 3.024462938308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135084, + "balance_loss_mlp": 1.09343266, + "epoch": 0.3651404386302424, + "flos": 637313287680.0, + "grad_norm": 0.06208128991116815, + "language_loss": 0.82833707, + "learning_rate": 0.0007332735420143308, + "loss": 0.83968788, + "num_input_tokens_seen": 157438672, + "router_z_loss_mlp": 0.41650391, + "step": 1898, + "time_per_iteration": 2.725468158721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112873, + "balance_loss_mlp": 1.08669686, + "epoch": 0.36533282031550596, + "flos": 491581757952.0, + "grad_norm": 0.09645190116324148, + "language_loss": 0.86573303, + "learning_rate": 0.0007329979387092826, + "loss": 0.8770203, + "num_input_tokens_seen": 157505888, + "router_z_loss_mlp": 0.42016602, + "step": 1899, + "time_per_iteration": 2.6357531547546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133626, + "balance_loss_mlp": 1.09259379, + "epoch": 0.36552520200076954, + "flos": 855970965504.0, + "grad_norm": 0.06150604002201611, + "language_loss": 0.84294677, + "learning_rate": 0.0007327222449464124, + "loss": 0.85428298, + "num_input_tokens_seen": 157601568, + "router_z_loss_mlp": 0.41040039, + "step": 1900, + "time_per_iteration": 3.2381174564361572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136855, + "balance_loss_mlp": 1.09382069, + "epoch": 0.3657175836860331, + "flos": 483702243840.0, + "grad_norm": 0.07567830151973255, + "language_loss": 0.89052904, + "learning_rate": 0.0007324464608327538, + "loss": 0.90189761, + "num_input_tokens_seen": 157670992, + "router_z_loss_mlp": 0.4309082, + "step": 1901, + "time_per_iteration": 2.597569227218628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150007, + "balance_loss_mlp": 1.10814035, + "epoch": 0.36590996537129666, + "flos": 434792365056.0, + "grad_norm": 0.07712085030005716, + "language_loss": 0.88794601, + "learning_rate": 0.0007321705864753758, + "loss": 0.89944601, + "num_input_tokens_seen": 157743616, + "router_z_loss_mlp": 0.41870117, + "step": 1902, + "time_per_iteration": 2.6877686977386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151954, + "balance_loss_mlp": 1.11097002, + "epoch": 0.3661023470565602, + "flos": 712206286848.0, + "grad_norm": 0.05591922142148154, + "language_loss": 0.84586883, + "learning_rate": 0.0007318946219813823, + "loss": 0.85738844, + "num_input_tokens_seen": 157823520, + "router_z_loss_mlp": 0.40991211, + "step": 1903, + "time_per_iteration": 3.0283257961273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151611, + "balance_loss_mlp": 1.11341679, + "epoch": 0.3662947287418238, + "flos": 564760097280.0, + "grad_norm": 0.0702623940180467, + "language_loss": 0.90117764, + "learning_rate": 0.000731618567457912, + "loss": 0.91269374, + "num_input_tokens_seen": 157893248, + "router_z_loss_mlp": 0.38208008, + "step": 1904, + "time_per_iteration": 2.651491165161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114788, + "balance_loss_mlp": 1.10522676, + "epoch": 0.3664871104270873, + "flos": 789752954880.0, + "grad_norm": 0.07047012066076976, + "language_loss": 0.87036794, + "learning_rate": 0.000731342423012139, + "loss": 0.88184673, + "num_input_tokens_seen": 157973216, + "router_z_loss_mlp": 0.42700195, + "step": 1905, + "time_per_iteration": 3.0361618995666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143776, + "balance_loss_mlp": 1.10331631, + "epoch": 0.3666794921123509, + "flos": 752557174272.0, + "grad_norm": 0.06969182334255739, + "language_loss": 0.82982039, + "learning_rate": 0.0007310661887512722, + "loss": 0.84125817, + "num_input_tokens_seen": 158051088, + "router_z_loss_mlp": 0.40478516, + "step": 1906, + "time_per_iteration": 3.020333766937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134716, + "balance_loss_mlp": 1.09592557, + "epoch": 0.3668718737976145, + "flos": 523531869696.0, + "grad_norm": 0.056548054453958524, + "language_loss": 0.82503444, + "learning_rate": 0.0007307898647825549, + "loss": 0.83638155, + "num_input_tokens_seen": 158124368, + "router_z_loss_mlp": 0.38793945, + "step": 1907, + "time_per_iteration": 2.6819958686828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128623, + "balance_loss_mlp": 1.08568358, + "epoch": 0.367064255482878, + "flos": 571967474688.0, + "grad_norm": 0.0662764931561561, + "language_loss": 0.89910614, + "learning_rate": 0.0007305134512132659, + "loss": 0.9103924, + "num_input_tokens_seen": 158191472, + "router_z_loss_mlp": 0.42944336, + "step": 1908, + "time_per_iteration": 2.688716411590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120758, + "balance_loss_mlp": 1.08063269, + "epoch": 0.3672566371681416, + "flos": 447114359808.0, + "grad_norm": 0.07972147303822336, + "language_loss": 0.83329952, + "learning_rate": 0.0007302369481507183, + "loss": 0.8445071, + "num_input_tokens_seen": 158254384, + "router_z_loss_mlp": 0.40136719, + "step": 1909, + "time_per_iteration": 2.520551919937134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043123, + "balance_loss_mlp": 1.03272831, + "epoch": 0.36744901885340514, + "flos": 1540090713600.0, + "grad_norm": 0.028970701382128577, + "language_loss": 0.79961759, + "learning_rate": 0.00072996035570226, + "loss": 0.81004882, + "num_input_tokens_seen": 158486160, + "router_z_loss_mlp": 0.10400391, + "step": 1910, + "time_per_iteration": 4.862990140914917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111533, + "balance_loss_mlp": 1.07534695, + "epoch": 0.36764140053866873, + "flos": 563685267456.0, + "grad_norm": 0.0535153553246422, + "language_loss": 0.85860741, + "learning_rate": 0.000729683673975274, + "loss": 0.86976075, + "num_input_tokens_seen": 158555616, + "router_z_loss_mlp": 0.3996582, + "step": 1911, + "time_per_iteration": 2.6834514141082764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117796, + "balance_loss_mlp": 1.07783747, + "epoch": 0.36783378222393226, + "flos": 1216663981056.0, + "grad_norm": 0.07394300555179863, + "language_loss": 0.83108044, + "learning_rate": 0.0007294069030771774, + "loss": 0.84225845, + "num_input_tokens_seen": 158653984, + "router_z_loss_mlp": 0.39941406, + "step": 1912, + "time_per_iteration": 3.6458523273468018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124936, + "balance_loss_mlp": 1.08483398, + "epoch": 0.36802616390919585, + "flos": 498724895232.0, + "grad_norm": 0.05916806609098389, + "language_loss": 0.90897858, + "learning_rate": 0.0007291300431154224, + "loss": 0.920228, + "num_input_tokens_seen": 158719728, + "router_z_loss_mlp": 0.40112305, + "step": 1913, + "time_per_iteration": 2.5737557411193848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0103288, + "balance_loss_mlp": 1.02157927, + "epoch": 0.36821854559445943, + "flos": 1582146349056.0, + "grad_norm": 0.013681752942923219, + "language_loss": 0.70389736, + "learning_rate": 0.0007288530941974955, + "loss": 0.71422619, + "num_input_tokens_seen": 158952544, + "router_z_loss_mlp": 0.11279297, + "step": 1914, + "time_per_iteration": 5.031456232070923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113953, + "balance_loss_mlp": 1.07499564, + "epoch": 0.36841092727972297, + "flos": 835626295296.0, + "grad_norm": 0.06158754254944219, + "language_loss": 0.79961407, + "learning_rate": 0.0007285760564309179, + "loss": 0.81075364, + "num_input_tokens_seen": 159039680, + "router_z_loss_mlp": 0.38964844, + "step": 1915, + "time_per_iteration": 3.152339458465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122924, + "balance_loss_mlp": 1.08346629, + "epoch": 0.36860330896498655, + "flos": 689855118336.0, + "grad_norm": 0.10197178679971165, + "language_loss": 0.85308397, + "learning_rate": 0.0007282989299232448, + "loss": 0.86431319, + "num_input_tokens_seen": 159128128, + "router_z_loss_mlp": 0.39453125, + "step": 1916, + "time_per_iteration": 3.0152268409729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119949, + "balance_loss_mlp": 1.08013296, + "epoch": 0.3687956906502501, + "flos": 554182497792.0, + "grad_norm": 0.05980283450468872, + "language_loss": 0.8385278, + "learning_rate": 0.0007280217147820668, + "loss": 0.84972733, + "num_input_tokens_seen": 159193248, + "router_z_loss_mlp": 0.39794922, + "step": 1917, + "time_per_iteration": 2.625802755355835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114962, + "balance_loss_mlp": 1.07512259, + "epoch": 0.3689880723355137, + "flos": 576703991808.0, + "grad_norm": 0.06755957483710798, + "language_loss": 0.79489267, + "learning_rate": 0.0007277444111150079, + "loss": 0.80604231, + "num_input_tokens_seen": 159265824, + "router_z_loss_mlp": 0.3984375, + "step": 1918, + "time_per_iteration": 2.6753525733947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112846, + "balance_loss_mlp": 1.08785725, + "epoch": 0.3691804540207772, + "flos": 528868942848.0, + "grad_norm": 0.07157808177363079, + "language_loss": 0.84730321, + "learning_rate": 0.0007274670190297272, + "loss": 0.8585878, + "num_input_tokens_seen": 159332992, + "router_z_loss_mlp": 0.40576172, + "step": 1919, + "time_per_iteration": 2.6149959564208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142135, + "balance_loss_mlp": 1.09986341, + "epoch": 0.3693728357060408, + "flos": 561019115520.0, + "grad_norm": 0.05944559747374387, + "language_loss": 0.8264004, + "learning_rate": 0.0007271895386339179, + "loss": 0.83782172, + "num_input_tokens_seen": 159409808, + "router_z_loss_mlp": 0.42285156, + "step": 1920, + "time_per_iteration": 2.7611513137817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140712, + "balance_loss_mlp": 1.09970427, + "epoch": 0.3695652173913043, + "flos": 579770265600.0, + "grad_norm": 0.059089751588204814, + "language_loss": 0.83542717, + "learning_rate": 0.0007269119700353073, + "loss": 0.8468343, + "num_input_tokens_seen": 159486128, + "router_z_loss_mlp": 0.41015625, + "step": 1921, + "time_per_iteration": 2.782167911529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148229, + "balance_loss_mlp": 1.10738814, + "epoch": 0.3697575990765679, + "flos": 512914622976.0, + "grad_norm": 0.06644949508392005, + "language_loss": 0.85268104, + "learning_rate": 0.0007266343133416571, + "loss": 0.8641634, + "num_input_tokens_seen": 159562224, + "router_z_loss_mlp": 0.40844727, + "step": 1922, + "time_per_iteration": 2.7218997478485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076086, + "balance_loss_mlp": 1.06340241, + "epoch": 0.3699499807618315, + "flos": 1570640025600.0, + "grad_norm": 0.03214674667569998, + "language_loss": 0.77116919, + "learning_rate": 0.0007263565686607632, + "loss": 0.78192997, + "num_input_tokens_seen": 159784768, + "router_z_loss_mlp": 0.12695312, + "step": 1923, + "time_per_iteration": 4.837427854537964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145902, + "balance_loss_mlp": 1.1028676, + "epoch": 0.37014236244709503, + "flos": 497338776576.0, + "grad_norm": 0.07518583721861193, + "language_loss": 0.84417462, + "learning_rate": 0.0007260787361004556, + "loss": 0.85563368, + "num_input_tokens_seen": 159848608, + "router_z_loss_mlp": 0.43041992, + "step": 1924, + "time_per_iteration": 2.5874598026275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050683, + "balance_loss_mlp": 1.03880954, + "epoch": 0.3703347441323586, + "flos": 1444368485376.0, + "grad_norm": 0.023888622594867324, + "language_loss": 0.73761505, + "learning_rate": 0.0007258008157685987, + "loss": 0.74812186, + "num_input_tokens_seen": 160080928, + "router_z_loss_mlp": 0.11865234, + "step": 1925, + "time_per_iteration": 4.961286544799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137865, + "balance_loss_mlp": 1.09571242, + "epoch": 0.37052712581762215, + "flos": 563601203712.0, + "grad_norm": 0.05584746966952834, + "language_loss": 0.87657702, + "learning_rate": 0.0007255228077730903, + "loss": 0.88795567, + "num_input_tokens_seen": 160148976, + "router_z_loss_mlp": 0.42163086, + "step": 1926, + "time_per_iteration": 2.663482666015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140781, + "balance_loss_mlp": 1.09786606, + "epoch": 0.37071950750288574, + "flos": 926078261760.0, + "grad_norm": 0.05562014185368244, + "language_loss": 0.81976974, + "learning_rate": 0.0007252447122218632, + "loss": 0.83117759, + "num_input_tokens_seen": 160233504, + "router_z_loss_mlp": 0.42919922, + "step": 1927, + "time_per_iteration": 3.1484758853912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138853, + "balance_loss_mlp": 1.09655809, + "epoch": 0.37091188918814927, + "flos": 418312014336.0, + "grad_norm": 0.06601877155853234, + "language_loss": 0.88791764, + "learning_rate": 0.0007249665292228834, + "loss": 0.89930612, + "num_input_tokens_seen": 160299696, + "router_z_loss_mlp": 0.4230957, + "step": 1928, + "time_per_iteration": 2.5840864181518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140563, + "balance_loss_mlp": 1.09872091, + "epoch": 0.37110427087341286, + "flos": 463182105600.0, + "grad_norm": 0.05314866644458525, + "language_loss": 0.83534646, + "learning_rate": 0.000724688258884151, + "loss": 0.84675211, + "num_input_tokens_seen": 160367904, + "router_z_loss_mlp": 0.41845703, + "step": 1929, + "time_per_iteration": 2.6063482761383057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129765, + "balance_loss_mlp": 1.09166527, + "epoch": 0.3712966525586764, + "flos": 849658180608.0, + "grad_norm": 0.06946275153671234, + "language_loss": 0.86767673, + "learning_rate": 0.0007244099013137002, + "loss": 0.87897444, + "num_input_tokens_seen": 160453600, + "router_z_loss_mlp": 0.38085938, + "step": 1930, + "time_per_iteration": 3.0539071559906006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127594, + "balance_loss_mlp": 1.0873971, + "epoch": 0.37148903424394, + "flos": 925954550784.0, + "grad_norm": 0.05696415350586704, + "language_loss": 0.89040637, + "learning_rate": 0.0007241314566195993, + "loss": 0.90168232, + "num_input_tokens_seen": 160543472, + "router_z_loss_mlp": 0.40185547, + "step": 1931, + "time_per_iteration": 3.2625389099121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111855, + "balance_loss_mlp": 1.07861531, + "epoch": 0.37168141592920356, + "flos": 519815854080.0, + "grad_norm": 0.08463017827171934, + "language_loss": 0.85909784, + "learning_rate": 0.0007238529249099496, + "loss": 0.87028337, + "num_input_tokens_seen": 160614016, + "router_z_loss_mlp": 0.39941406, + "step": 1932, + "time_per_iteration": 2.6740944385528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101582, + "balance_loss_mlp": 1.09080601, + "epoch": 0.3718737976144671, + "flos": 1445895567360.0, + "grad_norm": 0.046016525030599324, + "language_loss": 0.77856874, + "learning_rate": 0.0007235743062928872, + "loss": 0.78958464, + "num_input_tokens_seen": 160828640, + "router_z_loss_mlp": 0.10791016, + "step": 1933, + "time_per_iteration": 4.862685203552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125745, + "balance_loss_mlp": 1.08347321, + "epoch": 0.3720661792997307, + "flos": 759564490752.0, + "grad_norm": 0.10032321862894769, + "language_loss": 0.80747449, + "learning_rate": 0.000723295600876581, + "loss": 0.81873196, + "num_input_tokens_seen": 160913088, + "router_z_loss_mlp": 0.42285156, + "step": 1934, + "time_per_iteration": 2.990391969680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125218, + "balance_loss_mlp": 1.08406699, + "epoch": 0.3722585609849942, + "flos": 516956981760.0, + "grad_norm": 0.057414096803471676, + "language_loss": 0.87956464, + "learning_rate": 0.0007230168087692344, + "loss": 0.89081681, + "num_input_tokens_seen": 160982960, + "router_z_loss_mlp": 0.41162109, + "step": 1935, + "time_per_iteration": 2.656625270843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119009, + "balance_loss_mlp": 1.07924092, + "epoch": 0.3724509426702578, + "flos": 782464084992.0, + "grad_norm": 0.060205825913767164, + "language_loss": 0.82307911, + "learning_rate": 0.0007227379300790839, + "loss": 0.83426917, + "num_input_tokens_seen": 161066000, + "router_z_loss_mlp": 0.39770508, + "step": 1936, + "time_per_iteration": 2.997037649154663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114267, + "balance_loss_mlp": 1.07218599, + "epoch": 0.37264332435552133, + "flos": 391720997376.0, + "grad_norm": 0.06128365804507508, + "language_loss": 0.86067426, + "learning_rate": 0.0007224589649143997, + "loss": 0.87181687, + "num_input_tokens_seen": 161131040, + "router_z_loss_mlp": 0.4206543, + "step": 1937, + "time_per_iteration": 2.5290677547454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124508, + "balance_loss_mlp": 1.08228397, + "epoch": 0.3728357060407849, + "flos": 542861180928.0, + "grad_norm": 0.06605047879793914, + "language_loss": 0.81297445, + "learning_rate": 0.0007221799133834861, + "loss": 0.82421947, + "num_input_tokens_seen": 161201248, + "router_z_loss_mlp": 0.42236328, + "step": 1938, + "time_per_iteration": 2.613140106201172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122203, + "balance_loss_mlp": 1.08195794, + "epoch": 0.3730280877260485, + "flos": 433571802624.0, + "grad_norm": 0.09318016716004435, + "language_loss": 0.8198092, + "learning_rate": 0.00072190077559468, + "loss": 0.83103126, + "num_input_tokens_seen": 161266288, + "router_z_loss_mlp": 0.40209961, + "step": 1939, + "time_per_iteration": 2.517237424850464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115818, + "balance_loss_mlp": 1.07578754, + "epoch": 0.37322046941131204, + "flos": 531485535744.0, + "grad_norm": 0.0553068133661429, + "language_loss": 0.8932575, + "learning_rate": 0.0007216215516563527, + "loss": 0.90441567, + "num_input_tokens_seen": 161335648, + "router_z_loss_mlp": 0.40014648, + "step": 1940, + "time_per_iteration": 2.7175915241241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111901, + "balance_loss_mlp": 1.07089305, + "epoch": 0.3734128510965756, + "flos": 531549775872.0, + "grad_norm": 0.06982995582267476, + "language_loss": 0.83827746, + "learning_rate": 0.0007213422416769083, + "loss": 0.84939647, + "num_input_tokens_seen": 161403440, + "router_z_loss_mlp": 0.41015625, + "step": 1941, + "time_per_iteration": 2.5922279357910156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116664, + "balance_loss_mlp": 1.07684803, + "epoch": 0.37360523278183916, + "flos": 500442126336.0, + "grad_norm": 0.050249137281424494, + "language_loss": 0.75479639, + "learning_rate": 0.0007210628457647849, + "loss": 0.76596296, + "num_input_tokens_seen": 161472864, + "router_z_loss_mlp": 0.39819336, + "step": 1942, + "time_per_iteration": 2.583151340484619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118751, + "balance_loss_mlp": 1.07781446, + "epoch": 0.37379761446710275, + "flos": 547943491584.0, + "grad_norm": 0.0794488438004998, + "language_loss": 0.79022861, + "learning_rate": 0.000720783364028453, + "loss": 0.8014161, + "num_input_tokens_seen": 161548096, + "router_z_loss_mlp": 0.40942383, + "step": 1943, + "time_per_iteration": 2.7737677097320557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114071, + "balance_loss_mlp": 1.07418346, + "epoch": 0.3739899961523663, + "flos": 475761060864.0, + "grad_norm": 0.05694655733140731, + "language_loss": 0.87941283, + "learning_rate": 0.0007205037965764177, + "loss": 0.89055347, + "num_input_tokens_seen": 161615600, + "router_z_loss_mlp": 0.39868164, + "step": 1944, + "time_per_iteration": 2.558089256286621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123121, + "balance_loss_mlp": 1.08430672, + "epoch": 0.37418237783762986, + "flos": 611915668992.0, + "grad_norm": 0.07621334150317126, + "language_loss": 0.85730159, + "learning_rate": 0.0007202241435172161, + "loss": 0.86853278, + "num_input_tokens_seen": 161687408, + "router_z_loss_mlp": 0.38769531, + "step": 1945, + "time_per_iteration": 2.7602779865264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125439, + "balance_loss_mlp": 1.08574176, + "epoch": 0.3743747595228934, + "flos": 766287682560.0, + "grad_norm": 0.07927003262790512, + "language_loss": 0.88465476, + "learning_rate": 0.0007199444049594198, + "loss": 0.89590919, + "num_input_tokens_seen": 161764224, + "router_z_loss_mlp": 0.39697266, + "step": 1946, + "time_per_iteration": 2.9583580493927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119027, + "balance_loss_mlp": 1.07665968, + "epoch": 0.374567141208157, + "flos": 524394155520.0, + "grad_norm": 0.055396154938164174, + "language_loss": 0.8346498, + "learning_rate": 0.0007196645810116322, + "loss": 0.8458401, + "num_input_tokens_seen": 161835520, + "router_z_loss_mlp": 0.42382812, + "step": 1947, + "time_per_iteration": 2.6851320266723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131178, + "balance_loss_mlp": 1.09045637, + "epoch": 0.37475952289342057, + "flos": 681375421440.0, + "grad_norm": 0.05889971918419499, + "language_loss": 0.84302223, + "learning_rate": 0.0007193846717824912, + "loss": 0.854334, + "num_input_tokens_seen": 161912000, + "router_z_loss_mlp": 0.40698242, + "step": 1948, + "time_per_iteration": 2.9035325050354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129423, + "balance_loss_mlp": 1.08848619, + "epoch": 0.3749519045786841, + "flos": 460291299840.0, + "grad_norm": 0.07994215642664601, + "language_loss": 0.88549483, + "learning_rate": 0.0007191046773806669, + "loss": 0.89678907, + "num_input_tokens_seen": 161977296, + "router_z_loss_mlp": 0.40942383, + "step": 1949, + "time_per_iteration": 2.574697256088257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135159, + "balance_loss_mlp": 1.09224343, + "epoch": 0.3751442862639477, + "flos": 954853443072.0, + "grad_norm": 0.07615017139071276, + "language_loss": 0.8356899, + "learning_rate": 0.0007188245979148631, + "loss": 0.84704149, + "num_input_tokens_seen": 162051888, + "router_z_loss_mlp": 0.42919922, + "step": 1950, + "time_per_iteration": 3.216397285461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137761, + "balance_loss_mlp": 1.09475029, + "epoch": 0.3753366679492112, + "flos": 527747125248.0, + "grad_norm": 0.061651705216508604, + "language_loss": 0.87894762, + "learning_rate": 0.0007185444334938157, + "loss": 0.89032525, + "num_input_tokens_seen": 162124384, + "router_z_loss_mlp": 0.43041992, + "step": 1951, + "time_per_iteration": 2.6782584190368652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127424, + "balance_loss_mlp": 1.08879972, + "epoch": 0.3755290496344748, + "flos": 521797386240.0, + "grad_norm": 0.07782676746029546, + "language_loss": 0.84900033, + "learning_rate": 0.0007182641842262947, + "loss": 0.86027455, + "num_input_tokens_seen": 162191440, + "router_z_loss_mlp": 0.38647461, + "step": 1952, + "time_per_iteration": 2.639446258544922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125752, + "balance_loss_mlp": 1.08603168, + "epoch": 0.37572143131973834, + "flos": 621121830912.0, + "grad_norm": 0.05954692469221933, + "language_loss": 0.78027642, + "learning_rate": 0.0007179838502211022, + "loss": 0.79153389, + "num_input_tokens_seen": 162268480, + "router_z_loss_mlp": 0.3972168, + "step": 1953, + "time_per_iteration": 2.84329891204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131364, + "balance_loss_mlp": 1.09028411, + "epoch": 0.37591381300500193, + "flos": 770962530816.0, + "grad_norm": 0.10232430816689406, + "language_loss": 0.86411202, + "learning_rate": 0.0007177034315870738, + "loss": 0.8754257, + "num_input_tokens_seen": 162346752, + "router_z_loss_mlp": 0.41064453, + "step": 1954, + "time_per_iteration": 2.957648992538452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124051, + "balance_loss_mlp": 1.08325803, + "epoch": 0.37610619469026546, + "flos": 520448343552.0, + "grad_norm": 0.06271313302399782, + "language_loss": 0.91398948, + "learning_rate": 0.0007174229284330773, + "loss": 0.92523003, + "num_input_tokens_seen": 162415120, + "router_z_loss_mlp": 0.40795898, + "step": 1955, + "time_per_iteration": 2.5879859924316406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128257, + "balance_loss_mlp": 1.08879828, + "epoch": 0.37629857637552905, + "flos": 598812880896.0, + "grad_norm": 0.06607511431735706, + "language_loss": 0.86850858, + "learning_rate": 0.0007171423408680141, + "loss": 0.87979114, + "num_input_tokens_seen": 162493280, + "router_z_loss_mlp": 0.39453125, + "step": 1956, + "time_per_iteration": 2.7903566360473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123297, + "balance_loss_mlp": 1.08295655, + "epoch": 0.37649095806079264, + "flos": 564952817664.0, + "grad_norm": 0.06886679209235984, + "language_loss": 0.90041375, + "learning_rate": 0.0007168616690008176, + "loss": 0.91164672, + "num_input_tokens_seen": 162560736, + "router_z_loss_mlp": 0.40356445, + "step": 1957, + "time_per_iteration": 2.6327474117279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116463, + "balance_loss_mlp": 1.07705224, + "epoch": 0.37668333974605617, + "flos": 592470360576.0, + "grad_norm": 0.062429689069725576, + "language_loss": 0.85725892, + "learning_rate": 0.0007165809129404545, + "loss": 0.86842352, + "num_input_tokens_seen": 162630688, + "router_z_loss_mlp": 0.39404297, + "step": 1958, + "time_per_iteration": 2.7385900020599365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124898, + "balance_loss_mlp": 1.08527279, + "epoch": 0.37687572143131975, + "flos": 419478248448.0, + "grad_norm": 0.05793527093847313, + "language_loss": 0.85962278, + "learning_rate": 0.0007163000727959239, + "loss": 0.87087178, + "num_input_tokens_seen": 162694304, + "router_z_loss_mlp": 0.39624023, + "step": 1959, + "time_per_iteration": 2.485438585281372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046527, + "balance_loss_mlp": 1.0320313, + "epoch": 0.3770681031165833, + "flos": 1357262148096.0, + "grad_norm": 0.027906108498427614, + "language_loss": 0.77959073, + "learning_rate": 0.0007160191486762575, + "loss": 0.79005599, + "num_input_tokens_seen": 162920336, + "router_z_loss_mlp": 0.14453125, + "step": 1960, + "time_per_iteration": 4.834578275680542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117854, + "balance_loss_mlp": 1.07865775, + "epoch": 0.3772604848018469, + "flos": 644903534592.0, + "grad_norm": 0.05325294699236946, + "language_loss": 0.84349847, + "learning_rate": 0.00071573814069052, + "loss": 0.85467696, + "num_input_tokens_seen": 163000720, + "router_z_loss_mlp": 0.39208984, + "step": 1961, + "time_per_iteration": 2.9086802005767822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120534, + "balance_loss_mlp": 1.08219612, + "epoch": 0.3774528664871104, + "flos": 901651585536.0, + "grad_norm": 0.09498383670658105, + "language_loss": 0.88074362, + "learning_rate": 0.0007154570489478081, + "loss": 0.89194894, + "num_input_tokens_seen": 163085680, + "router_z_loss_mlp": 0.38330078, + "step": 1962, + "time_per_iteration": 3.2217841148376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117183, + "balance_loss_mlp": 1.07889283, + "epoch": 0.377645248172374, + "flos": 788065459200.0, + "grad_norm": 0.05466788938828107, + "language_loss": 0.86278516, + "learning_rate": 0.0007151758735572514, + "loss": 0.87395698, + "num_input_tokens_seen": 163162224, + "router_z_loss_mlp": 0.38232422, + "step": 1963, + "time_per_iteration": 3.01104998588562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130106, + "balance_loss_mlp": 1.08921766, + "epoch": 0.3778376298576376, + "flos": 586718111232.0, + "grad_norm": 0.06218420858169212, + "language_loss": 0.81413925, + "learning_rate": 0.0007148946146280119, + "loss": 0.82544029, + "num_input_tokens_seen": 163237920, + "router_z_loss_mlp": 0.40893555, + "step": 1964, + "time_per_iteration": 2.8039112091064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026607, + "balance_loss_mlp": 1.01440012, + "epoch": 0.3780300115429011, + "flos": 1396743782400.0, + "grad_norm": 0.022738468700431315, + "language_loss": 0.72192144, + "learning_rate": 0.000714613272269284, + "loss": 0.73218751, + "num_input_tokens_seen": 163455760, + "router_z_loss_mlp": 0.12207031, + "step": 1965, + "time_per_iteration": 4.8600172996521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024213, + "balance_loss_mlp": 1.0124352, + "epoch": 0.3782223932281647, + "flos": 1357672555008.0, + "grad_norm": 0.018349030303600054, + "language_loss": 0.75341946, + "learning_rate": 0.0007143318465902943, + "loss": 0.76366156, + "num_input_tokens_seen": 163678064, + "router_z_loss_mlp": 0.11767578, + "step": 1966, + "time_per_iteration": 4.918729782104492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135372, + "balance_loss_mlp": 1.09648633, + "epoch": 0.37841477491342823, + "flos": 704151304704.0, + "grad_norm": 0.2766921299066869, + "language_loss": 0.83812642, + "learning_rate": 0.0007140503377003022, + "loss": 0.84948009, + "num_input_tokens_seen": 163764320, + "router_z_loss_mlp": 0.38891602, + "step": 1967, + "time_per_iteration": 3.015761613845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149326, + "balance_loss_mlp": 1.10862756, + "epoch": 0.3786071565986918, + "flos": 529115991552.0, + "grad_norm": 0.07158509383086724, + "language_loss": 0.8519339, + "learning_rate": 0.000713768745708599, + "loss": 0.8634271, + "num_input_tokens_seen": 163831808, + "router_z_loss_mlp": 0.40698242, + "step": 1968, + "time_per_iteration": 2.6109209060668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140905, + "balance_loss_mlp": 1.09996843, + "epoch": 0.37879953828395535, + "flos": 993277126656.0, + "grad_norm": 0.05954158443482363, + "language_loss": 0.774553, + "learning_rate": 0.0007134870707245085, + "loss": 0.78596205, + "num_input_tokens_seen": 163918128, + "router_z_loss_mlp": 0.40893555, + "step": 1969, + "time_per_iteration": 3.2631757259368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150809, + "balance_loss_mlp": 1.11008716, + "epoch": 0.37899191996921894, + "flos": 626644283904.0, + "grad_norm": 0.05763521765218817, + "language_loss": 0.84313977, + "learning_rate": 0.0007132053128573864, + "loss": 0.85464787, + "num_input_tokens_seen": 163987552, + "router_z_loss_mlp": 0.40698242, + "step": 1970, + "time_per_iteration": 2.7791051864624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143919, + "balance_loss_mlp": 1.10353041, + "epoch": 0.37918430165448247, + "flos": 686307230208.0, + "grad_norm": 0.06905446326925666, + "language_loss": 0.84168518, + "learning_rate": 0.0007129234722166211, + "loss": 0.85312432, + "num_input_tokens_seen": 164063248, + "router_z_loss_mlp": 0.40356445, + "step": 1971, + "time_per_iteration": 2.8210554122924805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149932, + "balance_loss_mlp": 1.11152232, + "epoch": 0.37937668333974606, + "flos": 475622668800.0, + "grad_norm": 0.07023460279096982, + "language_loss": 0.91057038, + "learning_rate": 0.0007126415489116328, + "loss": 0.92206967, + "num_input_tokens_seen": 164133776, + "router_z_loss_mlp": 0.3840332, + "step": 1972, + "time_per_iteration": 2.672755002975464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153021, + "balance_loss_mlp": 1.11210799, + "epoch": 0.37956906502500964, + "flos": 707580997632.0, + "grad_norm": 0.06814261110374484, + "language_loss": 0.81719398, + "learning_rate": 0.0007123595430518736, + "loss": 0.82872415, + "num_input_tokens_seen": 164206672, + "router_z_loss_mlp": 0.40917969, + "step": 1973, + "time_per_iteration": 2.8325109481811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139649, + "balance_loss_mlp": 1.10081029, + "epoch": 0.3797614467102732, + "flos": 426648549888.0, + "grad_norm": 0.06503005991167149, + "language_loss": 0.86840981, + "learning_rate": 0.0007120774547468282, + "loss": 0.87980628, + "num_input_tokens_seen": 164271968, + "router_z_loss_mlp": 0.38793945, + "step": 1974, + "time_per_iteration": 2.6115715503692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148199, + "balance_loss_mlp": 1.10781133, + "epoch": 0.37995382839553676, + "flos": 481846620672.0, + "grad_norm": 0.05441443516000103, + "language_loss": 0.81729043, + "learning_rate": 0.0007117952841060128, + "loss": 0.82877243, + "num_input_tokens_seen": 164342800, + "router_z_loss_mlp": 0.40380859, + "step": 1975, + "time_per_iteration": 2.6378135681152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135454, + "balance_loss_mlp": 1.09389758, + "epoch": 0.3801462100808003, + "flos": 560562094080.0, + "grad_norm": 0.08133175482890537, + "language_loss": 0.83869064, + "learning_rate": 0.0007115130312389756, + "loss": 0.85004514, + "num_input_tokens_seen": 164414928, + "router_z_loss_mlp": 0.41552734, + "step": 1976, + "time_per_iteration": 2.664318084716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139177, + "balance_loss_mlp": 1.0974772, + "epoch": 0.3803385917660639, + "flos": 464936412672.0, + "grad_norm": 0.06620518382871708, + "language_loss": 0.79781663, + "learning_rate": 0.0007112306962552973, + "loss": 0.80920839, + "num_input_tokens_seen": 164483312, + "router_z_loss_mlp": 0.41699219, + "step": 1977, + "time_per_iteration": 2.6198599338531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129586, + "balance_loss_mlp": 1.0891974, + "epoch": 0.3805309734513274, + "flos": 521871538176.0, + "grad_norm": 0.05972767263520316, + "language_loss": 0.85605282, + "learning_rate": 0.0007109482792645896, + "loss": 0.86734867, + "num_input_tokens_seen": 164555760, + "router_z_loss_mlp": 0.40356445, + "step": 1978, + "time_per_iteration": 2.728576898574829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132218, + "balance_loss_mlp": 1.09066188, + "epoch": 0.380723355136591, + "flos": 591412783104.0, + "grad_norm": 0.09572440125940551, + "language_loss": 0.84308225, + "learning_rate": 0.0007106657803764969, + "loss": 0.85440445, + "num_input_tokens_seen": 164626768, + "router_z_loss_mlp": 0.41552734, + "step": 1979, + "time_per_iteration": 2.7279720306396484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126537, + "balance_loss_mlp": 1.08340704, + "epoch": 0.38091573682185453, + "flos": 622685988864.0, + "grad_norm": 0.05862837672704736, + "language_loss": 0.82269728, + "learning_rate": 0.0007103831997006948, + "loss": 0.83396262, + "num_input_tokens_seen": 164698016, + "router_z_loss_mlp": 0.43164062, + "step": 1980, + "time_per_iteration": 2.746915817260742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127621, + "balance_loss_mlp": 1.08663654, + "epoch": 0.3811081185071181, + "flos": 569007286272.0, + "grad_norm": 0.05821983888794681, + "language_loss": 0.85798764, + "learning_rate": 0.0007101005373468908, + "loss": 0.86926389, + "num_input_tokens_seen": 164780320, + "router_z_loss_mlp": 0.40991211, + "step": 1981, + "time_per_iteration": 2.878394365310669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131348, + "balance_loss_mlp": 1.09060264, + "epoch": 0.3813005001923817, + "flos": 584837895168.0, + "grad_norm": 0.057148713710776886, + "language_loss": 0.86977971, + "learning_rate": 0.0007098177934248242, + "loss": 0.88109326, + "num_input_tokens_seen": 164854400, + "router_z_loss_mlp": 0.40771484, + "step": 1982, + "time_per_iteration": 2.7281908988952637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142672, + "balance_loss_mlp": 1.09918451, + "epoch": 0.38149288187764524, + "flos": 621591335424.0, + "grad_norm": 0.07304374640444197, + "language_loss": 0.85583997, + "learning_rate": 0.0007095349680442661, + "loss": 0.86726665, + "num_input_tokens_seen": 164932896, + "router_z_loss_mlp": 0.43505859, + "step": 1983, + "time_per_iteration": 2.831989288330078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132213, + "balance_loss_mlp": 1.09015596, + "epoch": 0.3816852635629088, + "flos": 570690012672.0, + "grad_norm": 0.059661631452858944, + "language_loss": 0.79073238, + "learning_rate": 0.0007092520613150188, + "loss": 0.80205452, + "num_input_tokens_seen": 165002896, + "router_z_loss_mlp": 0.4206543, + "step": 1984, + "time_per_iteration": 2.6566810607910156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136369, + "balance_loss_mlp": 1.09416926, + "epoch": 0.38187764524817236, + "flos": 565585307136.0, + "grad_norm": 0.0624399969319272, + "language_loss": 0.81395054, + "learning_rate": 0.0007089690733469165, + "loss": 0.82531422, + "num_input_tokens_seen": 165074704, + "router_z_loss_mlp": 0.42236328, + "step": 1985, + "time_per_iteration": 2.713041067123413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133128, + "balance_loss_mlp": 1.09023643, + "epoch": 0.38207002693343595, + "flos": 631225156608.0, + "grad_norm": 0.0833415836593691, + "language_loss": 0.83054602, + "learning_rate": 0.000708686004249825, + "loss": 0.84187728, + "num_input_tokens_seen": 165149136, + "router_z_loss_mlp": 0.42895508, + "step": 1986, + "time_per_iteration": 2.7708489894866943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135389, + "balance_loss_mlp": 1.09311724, + "epoch": 0.3822624086186995, + "flos": 548773843968.0, + "grad_norm": 0.050231849807362665, + "language_loss": 0.91983181, + "learning_rate": 0.0007084028541336413, + "loss": 0.93118572, + "num_input_tokens_seen": 165220864, + "router_z_loss_mlp": 0.42260742, + "step": 1987, + "time_per_iteration": 2.7049031257629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135282, + "balance_loss_mlp": 1.09205675, + "epoch": 0.38245479030396307, + "flos": 613870036992.0, + "grad_norm": 0.07987509930436443, + "language_loss": 0.86416399, + "learning_rate": 0.0007081196231082942, + "loss": 0.87551689, + "num_input_tokens_seen": 165301568, + "router_z_loss_mlp": 0.43212891, + "step": 1988, + "time_per_iteration": 2.769559860229492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143055, + "balance_loss_mlp": 1.09949565, + "epoch": 0.38264717198922665, + "flos": 668089824768.0, + "grad_norm": 0.09872496004335095, + "language_loss": 0.80492568, + "learning_rate": 0.0007078363112837436, + "loss": 0.81635618, + "num_input_tokens_seen": 165373152, + "router_z_loss_mlp": 0.43579102, + "step": 1989, + "time_per_iteration": 2.836904525756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144237, + "balance_loss_mlp": 1.10065365, + "epoch": 0.3828395536744902, + "flos": 454754165760.0, + "grad_norm": 0.05755280117815587, + "language_loss": 0.85391158, + "learning_rate": 0.000707552918769981, + "loss": 0.86535394, + "num_input_tokens_seen": 165439136, + "router_z_loss_mlp": 0.43579102, + "step": 1990, + "time_per_iteration": 2.552560806274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114164, + "balance_loss_mlp": 1.09846199, + "epoch": 0.3830319353597538, + "flos": 499448788992.0, + "grad_norm": 0.058237292508227935, + "language_loss": 0.83844453, + "learning_rate": 0.000707269445677029, + "loss": 0.84986091, + "num_input_tokens_seen": 165514624, + "router_z_loss_mlp": 0.43188477, + "step": 1991, + "time_per_iteration": 2.717240571975708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155192, + "balance_loss_mlp": 1.11270583, + "epoch": 0.3832243170450173, + "flos": 744121893888.0, + "grad_norm": 0.08345502818850435, + "language_loss": 0.85774487, + "learning_rate": 0.0007069858921149416, + "loss": 0.86929679, + "num_input_tokens_seen": 165594512, + "router_z_loss_mlp": 0.42480469, + "step": 1992, + "time_per_iteration": 2.937901496887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143498, + "balance_loss_mlp": 1.10120225, + "epoch": 0.3834166987302809, + "flos": 578218590720.0, + "grad_norm": 0.06679457573221616, + "language_loss": 0.86415881, + "learning_rate": 0.0007067022581938043, + "loss": 0.87559378, + "num_input_tokens_seen": 165673968, + "router_z_loss_mlp": 0.4230957, + "step": 1993, + "time_per_iteration": 2.8283159732818604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147458, + "balance_loss_mlp": 1.10614026, + "epoch": 0.3836090804155444, + "flos": 536476442112.0, + "grad_norm": 0.06079929242541683, + "language_loss": 0.83476102, + "learning_rate": 0.0007064185440237334, + "loss": 0.84623557, + "num_input_tokens_seen": 165747664, + "router_z_loss_mlp": 0.41333008, + "step": 1994, + "time_per_iteration": 2.738664150238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148789, + "balance_loss_mlp": 1.10627878, + "epoch": 0.383801462100808, + "flos": 601879154688.0, + "grad_norm": 0.05320553517563596, + "language_loss": 0.8495338, + "learning_rate": 0.0007061347497148764, + "loss": 0.8610217, + "num_input_tokens_seen": 165824624, + "router_z_loss_mlp": 0.42504883, + "step": 1995, + "time_per_iteration": 2.7379775047302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147554, + "balance_loss_mlp": 1.10444832, + "epoch": 0.38399384378607154, + "flos": 572701280256.0, + "grad_norm": 0.059351713178290334, + "language_loss": 0.86747766, + "learning_rate": 0.0007058508753774122, + "loss": 0.87895322, + "num_input_tokens_seen": 165896304, + "router_z_loss_mlp": 0.4309082, + "step": 1996, + "time_per_iteration": 2.6882424354553223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144268, + "balance_loss_mlp": 1.10242534, + "epoch": 0.38418622547133513, + "flos": 536765709312.0, + "grad_norm": 0.08780844300106258, + "language_loss": 0.87086272, + "learning_rate": 0.0007055669211215505, + "loss": 0.88230544, + "num_input_tokens_seen": 165961312, + "router_z_loss_mlp": 0.41870117, + "step": 1997, + "time_per_iteration": 2.5902607440948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136259, + "balance_loss_mlp": 1.09236586, + "epoch": 0.3843786071565987, + "flos": 572940988416.0, + "grad_norm": 0.0743501008638896, + "language_loss": 0.77852333, + "learning_rate": 0.0007052828870575322, + "loss": 0.78988594, + "num_input_tokens_seen": 166028064, + "router_z_loss_mlp": 0.43896484, + "step": 1998, + "time_per_iteration": 2.643887519836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113691, + "balance_loss_mlp": 1.09521055, + "epoch": 0.38457098884186225, + "flos": 728703889920.0, + "grad_norm": 0.05655172042288627, + "language_loss": 0.87035221, + "learning_rate": 0.0007049987732956291, + "loss": 0.88172132, + "num_input_tokens_seen": 166110272, + "router_z_loss_mlp": 0.41723633, + "step": 1999, + "time_per_iteration": 2.9655773639678955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132979, + "balance_loss_mlp": 1.09325886, + "epoch": 0.38476337052712584, + "flos": 583422041088.0, + "grad_norm": 0.061738893850828154, + "language_loss": 0.83046496, + "learning_rate": 0.0007047145799461439, + "loss": 0.84179473, + "num_input_tokens_seen": 166193088, + "router_z_loss_mlp": 0.39746094, + "step": 2000, + "time_per_iteration": 2.8509583473205566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132875, + "balance_loss_mlp": 1.0917958, + "epoch": 0.38495575221238937, + "flos": 553060680192.0, + "grad_norm": 0.06203375299954445, + "language_loss": 0.82530397, + "learning_rate": 0.00070443030711941, + "loss": 0.83663273, + "num_input_tokens_seen": 166271776, + "router_z_loss_mlp": 0.41088867, + "step": 2001, + "time_per_iteration": 2.759324312210083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134639, + "balance_loss_mlp": 1.09386945, + "epoch": 0.38514813389765296, + "flos": 654473115648.0, + "grad_norm": 0.05757301433327453, + "language_loss": 0.83082199, + "learning_rate": 0.0007041459549257924, + "loss": 0.84216839, + "num_input_tokens_seen": 166350000, + "router_z_loss_mlp": 0.40771484, + "step": 2002, + "time_per_iteration": 2.8542449474334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121155, + "balance_loss_mlp": 1.08014655, + "epoch": 0.3853405155829165, + "flos": 868100239872.0, + "grad_norm": 0.07528883527847323, + "language_loss": 0.78547823, + "learning_rate": 0.0007038615234756859, + "loss": 0.79668975, + "num_input_tokens_seen": 166434336, + "router_z_loss_mlp": 0.41015625, + "step": 2003, + "time_per_iteration": 3.211712598800659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125228, + "balance_loss_mlp": 1.08257461, + "epoch": 0.3855328972681801, + "flos": 546424123392.0, + "grad_norm": 0.05751633762771481, + "language_loss": 0.83558142, + "learning_rate": 0.000703577012879517, + "loss": 0.84683371, + "num_input_tokens_seen": 166503952, + "router_z_loss_mlp": 0.42651367, + "step": 2004, + "time_per_iteration": 2.628211498260498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130283, + "balance_loss_mlp": 1.08956099, + "epoch": 0.3857252789534436, + "flos": 534074964480.0, + "grad_norm": 0.08619617913051347, + "language_loss": 0.89379585, + "learning_rate": 0.0007032924232477423, + "loss": 0.90509868, + "num_input_tokens_seen": 166575168, + "router_z_loss_mlp": 0.40722656, + "step": 2005, + "time_per_iteration": 2.631619930267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128848, + "balance_loss_mlp": 1.08743477, + "epoch": 0.3859176606387072, + "flos": 491764566528.0, + "grad_norm": 0.06586843636176778, + "language_loss": 0.80831605, + "learning_rate": 0.0007030077546908493, + "loss": 0.81960452, + "num_input_tokens_seen": 166647552, + "router_z_loss_mlp": 0.4140625, + "step": 2006, + "time_per_iteration": 2.6160101890563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336479, + "balance_loss_mlp": 1.3253212, + "epoch": 0.3861100423239708, + "flos": 1487052214272.0, + "grad_norm": 0.11294410837330418, + "language_loss": 0.83064663, + "learning_rate": 0.0007027230073193561, + "loss": 0.84401143, + "num_input_tokens_seen": 166875088, + "router_z_loss_mlp": 0.11181641, + "step": 2007, + "time_per_iteration": 4.7873475551605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131514, + "balance_loss_mlp": 1.09014845, + "epoch": 0.3863024240092343, + "flos": 473732540928.0, + "grad_norm": 0.06382618687285554, + "language_loss": 0.79329109, + "learning_rate": 0.0007024381812438117, + "loss": 0.8046062, + "num_input_tokens_seen": 166939344, + "router_z_loss_mlp": 0.41381836, + "step": 2008, + "time_per_iteration": 2.5387141704559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152986, + "balance_loss_mlp": 1.11390948, + "epoch": 0.3864948056944979, + "flos": 716601779712.0, + "grad_norm": 0.0811673363837608, + "language_loss": 0.83681285, + "learning_rate": 0.0007021532765747951, + "loss": 0.84834278, + "num_input_tokens_seen": 167014992, + "router_z_loss_mlp": 0.390625, + "step": 2009, + "time_per_iteration": 2.9795420169830322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164171, + "balance_loss_mlp": 1.12082672, + "epoch": 0.38668718737976143, + "flos": 727631631360.0, + "grad_norm": 0.11123688030830275, + "language_loss": 0.7961666, + "learning_rate": 0.0007018682934229162, + "loss": 0.80780828, + "num_input_tokens_seen": 167092096, + "router_z_loss_mlp": 0.43334961, + "step": 2010, + "time_per_iteration": 2.9108352661132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164881, + "balance_loss_mlp": 1.1216315, + "epoch": 0.386879569065025, + "flos": 525471556608.0, + "grad_norm": 0.07913719393788664, + "language_loss": 0.83099723, + "learning_rate": 0.0007015832318988152, + "loss": 0.842646, + "num_input_tokens_seen": 167162144, + "router_z_loss_mlp": 0.43237305, + "step": 2011, + "time_per_iteration": 2.605280637741089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082789, + "balance_loss_mlp": 1.07096386, + "epoch": 0.38707195075028855, + "flos": 1527771663360.0, + "grad_norm": 0.024547203760462325, + "language_loss": 0.73890078, + "learning_rate": 0.000701298092113163, + "loss": 0.74972868, + "num_input_tokens_seen": 167391536, + "router_z_loss_mlp": 0.11816406, + "step": 2012, + "time_per_iteration": 4.955415964126587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161774, + "balance_loss_mlp": 1.12167192, + "epoch": 0.38726433243555214, + "flos": 557313011712.0, + "grad_norm": 0.062010894867637535, + "language_loss": 0.84259552, + "learning_rate": 0.0007010128741766604, + "loss": 0.85421324, + "num_input_tokens_seen": 167466000, + "router_z_loss_mlp": 0.40112305, + "step": 2013, + "time_per_iteration": 2.738905906677246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162675, + "balance_loss_mlp": 1.12080884, + "epoch": 0.38745671412081567, + "flos": 553695740928.0, + "grad_norm": 0.08443522979585812, + "language_loss": 0.84619504, + "learning_rate": 0.0007007275782000391, + "loss": 0.85782182, + "num_input_tokens_seen": 167536144, + "router_z_loss_mlp": 0.41870117, + "step": 2014, + "time_per_iteration": 2.6049582958221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178912, + "balance_loss_mlp": 1.13528132, + "epoch": 0.38764909580607926, + "flos": 458408512512.0, + "grad_norm": 0.05901822901260885, + "language_loss": 0.84836662, + "learning_rate": 0.0007004422042940605, + "loss": 0.8601557, + "num_input_tokens_seen": 167600064, + "router_z_loss_mlp": 0.43603516, + "step": 2015, + "time_per_iteration": 2.5449817180633545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174031, + "balance_loss_mlp": 1.13106763, + "epoch": 0.38784147749134285, + "flos": 522229814784.0, + "grad_norm": 0.07137462797198264, + "language_loss": 0.89881837, + "learning_rate": 0.0007001567525695169, + "loss": 0.9105587, + "num_input_tokens_seen": 167666576, + "router_z_loss_mlp": 0.42993164, + "step": 2016, + "time_per_iteration": 2.5804128646850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191346, + "balance_loss_mlp": 1.14921737, + "epoch": 0.3880338591766064, + "flos": 666036338688.0, + "grad_norm": 0.11416128839824946, + "language_loss": 0.84030014, + "learning_rate": 0.0006998712231372303, + "loss": 0.85221362, + "num_input_tokens_seen": 167753296, + "router_z_loss_mlp": 0.42138672, + "step": 2017, + "time_per_iteration": 2.9779462814331055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182085, + "balance_loss_mlp": 1.13845432, + "epoch": 0.38822624086186996, + "flos": 593962564608.0, + "grad_norm": 0.06300984009010882, + "language_loss": 0.86622429, + "learning_rate": 0.0006995856161080532, + "loss": 0.87804508, + "num_input_tokens_seen": 167834080, + "router_z_loss_mlp": 0.43652344, + "step": 2018, + "time_per_iteration": 2.8405675888061523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160301, + "balance_loss_mlp": 1.11588371, + "epoch": 0.3884186225471335, + "flos": 612540817920.0, + "grad_norm": 0.0764923139512956, + "language_loss": 0.8250891, + "learning_rate": 0.0006992999315928679, + "loss": 0.83669221, + "num_input_tokens_seen": 167912368, + "router_z_loss_mlp": 0.44433594, + "step": 2019, + "time_per_iteration": 2.7929439544677734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146323, + "balance_loss_mlp": 1.10407472, + "epoch": 0.3886110042323971, + "flos": 607038188544.0, + "grad_norm": 0.09156853050649941, + "language_loss": 0.86159158, + "learning_rate": 0.0006990141697025871, + "loss": 0.8730548, + "num_input_tokens_seen": 167991968, + "router_z_loss_mlp": 0.42236328, + "step": 2020, + "time_per_iteration": 2.7913589477539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137863, + "balance_loss_mlp": 1.12422562, + "epoch": 0.3888033859176606, + "flos": 1528067897856.0, + "grad_norm": 0.035838926183426385, + "language_loss": 0.76359642, + "learning_rate": 0.0006987283305481533, + "loss": 0.77497506, + "num_input_tokens_seen": 168212128, + "router_z_loss_mlp": 0.13671875, + "step": 2021, + "time_per_iteration": 4.727250576019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011348, + "balance_loss_mlp": 1.09398317, + "epoch": 0.3889957676029242, + "flos": 692449689600.0, + "grad_norm": 0.0717580829802053, + "language_loss": 0.82676983, + "learning_rate": 0.0006984424142405392, + "loss": 0.8381179, + "num_input_tokens_seen": 168287440, + "router_z_loss_mlp": 0.40771484, + "step": 2022, + "time_per_iteration": 2.810420513153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130527, + "balance_loss_mlp": 1.09006715, + "epoch": 0.3891881492881878, + "flos": 515187993600.0, + "grad_norm": 0.11474151925346394, + "language_loss": 0.8263585, + "learning_rate": 0.0006981564208907474, + "loss": 0.83766377, + "num_input_tokens_seen": 168354704, + "router_z_loss_mlp": 0.40454102, + "step": 2023, + "time_per_iteration": 2.604849100112915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139234, + "balance_loss_mlp": 1.09763026, + "epoch": 0.3893805309734513, + "flos": 629050904064.0, + "grad_norm": 0.05701984367640102, + "language_loss": 0.90312237, + "learning_rate": 0.0006978703506098102, + "loss": 0.91451472, + "num_input_tokens_seen": 168424272, + "router_z_loss_mlp": 0.41601562, + "step": 2024, + "time_per_iteration": 2.7345082759857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115758, + "balance_loss_mlp": 1.11683416, + "epoch": 0.3895729126587149, + "flos": 544155895296.0, + "grad_norm": 0.06830457595999238, + "language_loss": 0.87819719, + "learning_rate": 0.00069758420350879, + "loss": 0.88977301, + "num_input_tokens_seen": 168488912, + "router_z_loss_mlp": 0.40722656, + "step": 2025, + "time_per_iteration": 2.6252336502075195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160672, + "balance_loss_mlp": 1.11689889, + "epoch": 0.38976529434397844, + "flos": 618270672384.0, + "grad_norm": 0.07405760759256953, + "language_loss": 0.8637889, + "learning_rate": 0.000697297979698779, + "loss": 0.87539566, + "num_input_tokens_seen": 168563248, + "router_z_loss_mlp": 0.43774414, + "step": 2026, + "time_per_iteration": 2.709831476211548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151416, + "balance_loss_mlp": 1.11291099, + "epoch": 0.38995767602924203, + "flos": 834882577920.0, + "grad_norm": 0.06812366476721117, + "language_loss": 0.83983821, + "learning_rate": 0.0006970116792908992, + "loss": 0.85135239, + "num_input_tokens_seen": 168648272, + "router_z_loss_mlp": 0.38500977, + "step": 2027, + "time_per_iteration": 3.0651228427886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149777, + "balance_loss_mlp": 1.10976994, + "epoch": 0.39015005771450556, + "flos": 541603542528.0, + "grad_norm": 0.06881031116362346, + "language_loss": 0.82086015, + "learning_rate": 0.000696725302396302, + "loss": 0.832358, + "num_input_tokens_seen": 168721760, + "router_z_loss_mlp": 0.39990234, + "step": 2028, + "time_per_iteration": 2.6441640853881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134129, + "balance_loss_mlp": 1.09400284, + "epoch": 0.39034243939976915, + "flos": 1007509072896.0, + "grad_norm": 0.05768401763088921, + "language_loss": 0.86036873, + "learning_rate": 0.0006964388491261692, + "loss": 0.87171006, + "num_input_tokens_seen": 168803664, + "router_z_loss_mlp": 0.40136719, + "step": 2029, + "time_per_iteration": 3.3004355430603027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129182, + "balance_loss_mlp": 1.08941352, + "epoch": 0.3905348210850327, + "flos": 679025700864.0, + "grad_norm": 0.06928638271863855, + "language_loss": 0.87596297, + "learning_rate": 0.0006961523195917114, + "loss": 0.88725477, + "num_input_tokens_seen": 168879184, + "router_z_loss_mlp": 0.39770508, + "step": 2030, + "time_per_iteration": 2.8312549591064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112269, + "balance_loss_mlp": 1.08041883, + "epoch": 0.39072720277029627, + "flos": 548882500608.0, + "grad_norm": 0.06430070846126967, + "language_loss": 0.78209358, + "learning_rate": 0.0006958657139041696, + "loss": 0.79332048, + "num_input_tokens_seen": 168957808, + "router_z_loss_mlp": 0.4230957, + "step": 2031, + "time_per_iteration": 2.789843797683716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172125, + "balance_loss_mlp": 1.1593461, + "epoch": 0.39091958445555985, + "flos": 1547737860096.0, + "grad_norm": 0.04676690558545683, + "language_loss": 0.76712966, + "learning_rate": 0.0006955790321748136, + "loss": 0.77885091, + "num_input_tokens_seen": 169194416, + "router_z_loss_mlp": 0.12792969, + "step": 2032, + "time_per_iteration": 4.9584527015686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118419, + "balance_loss_mlp": 1.07781672, + "epoch": 0.3911119661408234, + "flos": 504002497536.0, + "grad_norm": 0.06222398192409584, + "language_loss": 0.78433788, + "learning_rate": 0.0006952922745149434, + "loss": 0.79552209, + "num_input_tokens_seen": 169263552, + "router_z_loss_mlp": 0.40600586, + "step": 2033, + "time_per_iteration": 2.6696994304656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125088, + "balance_loss_mlp": 1.08288765, + "epoch": 0.391304347826087, + "flos": 557238859776.0, + "grad_norm": 0.06080690179225973, + "language_loss": 0.88040847, + "learning_rate": 0.000695005441035888, + "loss": 0.89165938, + "num_input_tokens_seen": 169333696, + "router_z_loss_mlp": 0.421875, + "step": 2034, + "time_per_iteration": 2.675685167312622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126781, + "balance_loss_mlp": 1.11333418, + "epoch": 0.3914967295113505, + "flos": 1500034235904.0, + "grad_norm": 0.02489517999219278, + "language_loss": 0.73723435, + "learning_rate": 0.0006947185318490064, + "loss": 0.74850214, + "num_input_tokens_seen": 169556416, + "router_z_loss_mlp": 0.13476562, + "step": 2035, + "time_per_iteration": 4.8780670166015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114086, + "balance_loss_mlp": 1.10006714, + "epoch": 0.3916891111966141, + "flos": 707037341184.0, + "grad_norm": 0.09902005838056731, + "language_loss": 0.81387436, + "learning_rate": 0.0006944315470656863, + "loss": 0.82528299, + "num_input_tokens_seen": 169643312, + "router_z_loss_mlp": 0.40795898, + "step": 2036, + "time_per_iteration": 3.04048228263855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132051, + "balance_loss_mlp": 1.08858752, + "epoch": 0.3918814928818776, + "flos": 556349409792.0, + "grad_norm": 0.07431126960541347, + "language_loss": 0.91352618, + "learning_rate": 0.000694144486797345, + "loss": 0.92484671, + "num_input_tokens_seen": 169712560, + "router_z_loss_mlp": 0.43432617, + "step": 2037, + "time_per_iteration": 2.692013740539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110594, + "balance_loss_mlp": 1.09695601, + "epoch": 0.3920738745671412, + "flos": 1538610992640.0, + "grad_norm": 0.027663679576331687, + "language_loss": 0.79520434, + "learning_rate": 0.0006938573511554296, + "loss": 0.8063103, + "num_input_tokens_seen": 169914912, + "router_z_loss_mlp": 0.13671875, + "step": 2038, + "time_per_iteration": 4.626150369644165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128452, + "balance_loss_mlp": 1.08796859, + "epoch": 0.39226625625240474, + "flos": 498836123136.0, + "grad_norm": 0.06987974305662424, + "language_loss": 0.90060711, + "learning_rate": 0.0006935701402514156, + "loss": 0.91189158, + "num_input_tokens_seen": 169978848, + "router_z_loss_mlp": 0.40454102, + "step": 2039, + "time_per_iteration": 2.5738487243652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099838, + "balance_loss_mlp": 1.0864867, + "epoch": 0.39245863793766833, + "flos": 1347260138496.0, + "grad_norm": 0.03469500580229188, + "language_loss": 0.73034894, + "learning_rate": 0.0006932828541968083, + "loss": 0.74134731, + "num_input_tokens_seen": 170211488, + "router_z_loss_mlp": 0.13378906, + "step": 2040, + "time_per_iteration": 4.957871437072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140825, + "balance_loss_mlp": 1.10112846, + "epoch": 0.3926510196229319, + "flos": 1346113022976.0, + "grad_norm": 0.08036310752647091, + "language_loss": 0.84965599, + "learning_rate": 0.0006929954931031422, + "loss": 0.86106431, + "num_input_tokens_seen": 170298528, + "router_z_loss_mlp": 0.39672852, + "step": 2041, + "time_per_iteration": 4.232867956161499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129686, + "balance_loss_mlp": 1.09039509, + "epoch": 0.39284340130819545, + "flos": 499587181056.0, + "grad_norm": 0.05705738410966496, + "language_loss": 0.8864727, + "learning_rate": 0.0006927080570819805, + "loss": 0.89776957, + "num_input_tokens_seen": 170365680, + "router_z_loss_mlp": 0.39282227, + "step": 2042, + "time_per_iteration": 2.6111459732055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143323, + "balance_loss_mlp": 1.10252953, + "epoch": 0.39303578299345904, + "flos": 520329775104.0, + "grad_norm": 0.08862983476083965, + "language_loss": 0.81371272, + "learning_rate": 0.0006924205462449161, + "loss": 0.82514596, + "num_input_tokens_seen": 170432224, + "router_z_loss_mlp": 0.40795898, + "step": 2043, + "time_per_iteration": 2.6160669326782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128783, + "balance_loss_mlp": 1.08932424, + "epoch": 0.39322816467872257, + "flos": 907929865728.0, + "grad_norm": 0.06601435567751561, + "language_loss": 0.82073617, + "learning_rate": 0.0006921329607035702, + "loss": 0.83202398, + "num_input_tokens_seen": 170517920, + "router_z_loss_mlp": 0.39453125, + "step": 2044, + "time_per_iteration": 3.2338860034942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121467, + "balance_loss_mlp": 1.08441699, + "epoch": 0.39342054636398616, + "flos": 517592042496.0, + "grad_norm": 0.06846789620147704, + "language_loss": 0.88441163, + "learning_rate": 0.0006918453005695938, + "loss": 0.89562631, + "num_input_tokens_seen": 170589072, + "router_z_loss_mlp": 0.37011719, + "step": 2045, + "time_per_iteration": 2.6499555110931396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135104, + "balance_loss_mlp": 1.09426332, + "epoch": 0.3936129280492497, + "flos": 547918898688.0, + "grad_norm": 0.05142411082006327, + "language_loss": 0.84655213, + "learning_rate": 0.0006915575659546662, + "loss": 0.85790318, + "num_input_tokens_seen": 170657856, + "router_z_loss_mlp": 0.40869141, + "step": 2046, + "time_per_iteration": 2.652902364730835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133716, + "balance_loss_mlp": 1.09339929, + "epoch": 0.3938053097345133, + "flos": 526113957888.0, + "grad_norm": 0.08744808643608758, + "language_loss": 0.80837369, + "learning_rate": 0.0006912697569704959, + "loss": 0.81971085, + "num_input_tokens_seen": 170723696, + "router_z_loss_mlp": 0.40307617, + "step": 2047, + "time_per_iteration": 2.6129064559936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131331, + "balance_loss_mlp": 1.09158659, + "epoch": 0.39399769141977686, + "flos": 471629869056.0, + "grad_norm": 0.07468037026935817, + "language_loss": 0.86945641, + "learning_rate": 0.0006909818737288205, + "loss": 0.88076973, + "num_input_tokens_seen": 170789536, + "router_z_loss_mlp": 0.3972168, + "step": 2048, + "time_per_iteration": 2.5576181411743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146235, + "balance_loss_mlp": 1.10632348, + "epoch": 0.3941900731050404, + "flos": 501736840704.0, + "grad_norm": 0.07110132916922086, + "language_loss": 0.81226838, + "learning_rate": 0.000690693916341406, + "loss": 0.82373071, + "num_input_tokens_seen": 170859232, + "router_z_loss_mlp": 0.39916992, + "step": 2049, + "time_per_iteration": 2.5884814262390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154156, + "balance_loss_mlp": 1.11398268, + "epoch": 0.394382454790304, + "flos": 580862347776.0, + "grad_norm": 0.05472880535545416, + "language_loss": 0.82429487, + "learning_rate": 0.0006904058849200475, + "loss": 0.83583641, + "num_input_tokens_seen": 170931568, + "router_z_loss_mlp": 0.40185547, + "step": 2050, + "time_per_iteration": 2.7662599086761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144327, + "balance_loss_mlp": 1.10565519, + "epoch": 0.3945748364755675, + "flos": 513819127296.0, + "grad_norm": 0.06127353443593348, + "language_loss": 0.85204089, + "learning_rate": 0.0006901177795765683, + "loss": 0.86348414, + "num_input_tokens_seen": 170999856, + "router_z_loss_mlp": 0.38647461, + "step": 2051, + "time_per_iteration": 2.577353000640869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011475, + "balance_loss_mlp": 1.10768366, + "epoch": 0.3947672181608311, + "flos": 593957795328.0, + "grad_norm": 0.10882102145067868, + "language_loss": 0.81508064, + "learning_rate": 0.0006898296004228213, + "loss": 0.82655561, + "num_input_tokens_seen": 171072320, + "router_z_loss_mlp": 0.39819336, + "step": 2052, + "time_per_iteration": 2.7242588996887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118361, + "balance_loss_mlp": 1.10605848, + "epoch": 0.39495959984609463, + "flos": 1547671048704.0, + "grad_norm": 0.03880030883121314, + "language_loss": 0.7812674, + "learning_rate": 0.0006895413475706873, + "loss": 0.79245102, + "num_input_tokens_seen": 171304128, + "router_z_loss_mlp": 0.12304688, + "step": 2053, + "time_per_iteration": 4.852335691452026 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160033, + "balance_loss_mlp": 1.1204555, + "epoch": 0.3951519815313582, + "flos": 496520907264.0, + "grad_norm": 0.06533456514809383, + "language_loss": 0.79943091, + "learning_rate": 0.0006892530211320763, + "loss": 0.81103128, + "num_input_tokens_seen": 171377392, + "router_z_loss_mlp": 0.39575195, + "step": 2054, + "time_per_iteration": 2.726592779159546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163981, + "balance_loss_mlp": 1.12528563, + "epoch": 0.39534436321662175, + "flos": 531191499264.0, + "grad_norm": 0.06955061494726521, + "language_loss": 0.8399905, + "learning_rate": 0.000688964621218926, + "loss": 0.85163033, + "num_input_tokens_seen": 171447424, + "router_z_loss_mlp": 0.38696289, + "step": 2055, + "time_per_iteration": 2.6089248657226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156737, + "balance_loss_mlp": 1.11737382, + "epoch": 0.39553674490188534, + "flos": 702523279872.0, + "grad_norm": 0.06754212988294535, + "language_loss": 0.80637926, + "learning_rate": 0.0006886761479432037, + "loss": 0.81794661, + "num_input_tokens_seen": 171519920, + "router_z_loss_mlp": 0.39379883, + "step": 2056, + "time_per_iteration": 2.8334691524505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169364, + "balance_loss_mlp": 1.12866604, + "epoch": 0.3957291265871489, + "flos": 409772846592.0, + "grad_norm": 0.08783588969410645, + "language_loss": 0.85058302, + "learning_rate": 0.0006883876014169045, + "loss": 0.86227667, + "num_input_tokens_seen": 171583856, + "router_z_loss_mlp": 0.40698242, + "step": 2057, + "time_per_iteration": 2.4859981536865234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163618, + "balance_loss_mlp": 1.12344468, + "epoch": 0.39592150827241246, + "flos": 618490556928.0, + "grad_norm": 0.07066278036752763, + "language_loss": 0.90527105, + "learning_rate": 0.000688098981752052, + "loss": 0.91690719, + "num_input_tokens_seen": 171656064, + "router_z_loss_mlp": 0.40161133, + "step": 2058, + "time_per_iteration": 2.737825393676758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169191, + "balance_loss_mlp": 1.12849319, + "epoch": 0.39611388995767605, + "flos": 821332680192.0, + "grad_norm": 0.08574741875980238, + "language_loss": 0.80283022, + "learning_rate": 0.0006878102890606982, + "loss": 0.81452215, + "num_input_tokens_seen": 171738800, + "router_z_loss_mlp": 0.40722656, + "step": 2059, + "time_per_iteration": 3.0589451789855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159966, + "balance_loss_mlp": 1.12034082, + "epoch": 0.3963062716429396, + "flos": 492224159232.0, + "grad_norm": 0.07158976818793618, + "language_loss": 0.81510139, + "learning_rate": 0.0006875215234549239, + "loss": 0.8267011, + "num_input_tokens_seen": 171803664, + "router_z_loss_mlp": 0.39648438, + "step": 2060, + "time_per_iteration": 2.5404529571533203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151611, + "balance_loss_mlp": 1.11150885, + "epoch": 0.39649865332820317, + "flos": 584739150336.0, + "grad_norm": 0.11168111879418678, + "language_loss": 0.86092877, + "learning_rate": 0.0006872326850468376, + "loss": 0.87244487, + "num_input_tokens_seen": 171871968, + "router_z_loss_mlp": 0.40087891, + "step": 2061, + "time_per_iteration": 2.6653215885162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153197, + "balance_loss_mlp": 1.11133087, + "epoch": 0.3966910350134667, + "flos": 458564156928.0, + "grad_norm": 0.0731410886524803, + "language_loss": 0.79433036, + "learning_rate": 0.0006869437739485762, + "loss": 0.80586231, + "num_input_tokens_seen": 171942368, + "router_z_loss_mlp": 0.41870117, + "step": 2062, + "time_per_iteration": 2.6032299995422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147299, + "balance_loss_mlp": 1.1086272, + "epoch": 0.3968834166987303, + "flos": 508632929280.0, + "grad_norm": 0.06685158443863869, + "language_loss": 0.9296748, + "learning_rate": 0.0006866547902723053, + "loss": 0.9411478, + "num_input_tokens_seen": 172012336, + "router_z_loss_mlp": 0.38647461, + "step": 2063, + "time_per_iteration": 2.676166534423828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150184, + "balance_loss_mlp": 1.11148858, + "epoch": 0.3970757983839938, + "flos": 572627128320.0, + "grad_norm": 0.10136223850880095, + "language_loss": 0.80330342, + "learning_rate": 0.000686365734130218, + "loss": 0.81480527, + "num_input_tokens_seen": 172084640, + "router_z_loss_mlp": 0.38696289, + "step": 2064, + "time_per_iteration": 2.6844232082366943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143564, + "balance_loss_mlp": 1.10420108, + "epoch": 0.3972681800692574, + "flos": 481629307392.0, + "grad_norm": 0.06083764513088428, + "language_loss": 0.84282482, + "learning_rate": 0.000686076605634536, + "loss": 0.85426044, + "num_input_tokens_seen": 172152992, + "router_z_loss_mlp": 0.39379883, + "step": 2065, + "time_per_iteration": 2.6315250396728516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156007, + "balance_loss_mlp": 1.11704922, + "epoch": 0.397460561754521, + "flos": 487927411200.0, + "grad_norm": 0.07154960647229537, + "language_loss": 0.84777498, + "learning_rate": 0.0006857874048975088, + "loss": 0.85933506, + "num_input_tokens_seen": 172219312, + "router_z_loss_mlp": 0.38964844, + "step": 2066, + "time_per_iteration": 2.651740074157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144331, + "balance_loss_mlp": 1.10298944, + "epoch": 0.3976529434397845, + "flos": 421993525248.0, + "grad_norm": 0.06215318135177391, + "language_loss": 0.87357152, + "learning_rate": 0.0006854981320314142, + "loss": 0.88501477, + "num_input_tokens_seen": 172282112, + "router_z_loss_mlp": 0.41381836, + "step": 2067, + "time_per_iteration": 2.5062263011932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150736, + "balance_loss_mlp": 1.11089611, + "epoch": 0.3978453251250481, + "flos": 545589001728.0, + "grad_norm": 0.07144157906743025, + "language_loss": 0.87282014, + "learning_rate": 0.0006852087871485579, + "loss": 0.88432747, + "num_input_tokens_seen": 172347872, + "router_z_loss_mlp": 0.3984375, + "step": 2068, + "time_per_iteration": 2.6593010425567627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141379, + "balance_loss_mlp": 1.10206354, + "epoch": 0.39803770681031164, + "flos": 650838592512.0, + "grad_norm": 0.08492249089395289, + "language_loss": 0.82224536, + "learning_rate": 0.0006849193703612735, + "loss": 0.83365911, + "num_input_tokens_seen": 172418560, + "router_z_loss_mlp": 0.39282227, + "step": 2069, + "time_per_iteration": 2.755782127380371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137654, + "balance_loss_mlp": 1.09817159, + "epoch": 0.39823008849557523, + "flos": 740072194560.0, + "grad_norm": 0.07327967142242812, + "language_loss": 0.78054988, + "learning_rate": 0.0006846298817819225, + "loss": 0.79192644, + "num_input_tokens_seen": 172497984, + "router_z_loss_mlp": 0.39477539, + "step": 2070, + "time_per_iteration": 2.987943410873413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148271, + "balance_loss_mlp": 1.10909855, + "epoch": 0.39842247018083876, + "flos": 385037452800.0, + "grad_norm": 0.08050617332568782, + "language_loss": 0.81162381, + "learning_rate": 0.0006843403215228945, + "loss": 0.82310653, + "num_input_tokens_seen": 172560112, + "router_z_loss_mlp": 0.3918457, + "step": 2071, + "time_per_iteration": 2.4827940464019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165055, + "balance_loss_mlp": 1.12585878, + "epoch": 0.39861485186610235, + "flos": 533696864256.0, + "grad_norm": 0.07083437878036915, + "language_loss": 0.80721962, + "learning_rate": 0.0006840506896966065, + "loss": 0.81887019, + "num_input_tokens_seen": 172636192, + "router_z_loss_mlp": 0.3918457, + "step": 2072, + "time_per_iteration": 2.6827309131622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166963, + "balance_loss_mlp": 1.12621748, + "epoch": 0.39880723355136594, + "flos": 643149227520.0, + "grad_norm": 0.06725102297232902, + "language_loss": 0.8278873, + "learning_rate": 0.0006837609864155038, + "loss": 0.83955693, + "num_input_tokens_seen": 172715264, + "router_z_loss_mlp": 0.40771484, + "step": 2073, + "time_per_iteration": 2.9130313396453857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116361, + "balance_loss_mlp": 1.12584436, + "epoch": 0.39899961523662947, + "flos": 515847647232.0, + "grad_norm": 0.07471059517929624, + "language_loss": 0.8375988, + "learning_rate": 0.0006834712117920592, + "loss": 0.84923482, + "num_input_tokens_seen": 172783456, + "router_z_loss_mlp": 0.37768555, + "step": 2074, + "time_per_iteration": 2.61501145362854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162616, + "balance_loss_mlp": 1.12325335, + "epoch": 0.39919199692189306, + "flos": 464385415680.0, + "grad_norm": 0.13245970923224126, + "language_loss": 0.85901093, + "learning_rate": 0.0006831813659387729, + "loss": 0.87063706, + "num_input_tokens_seen": 172848928, + "router_z_loss_mlp": 0.39331055, + "step": 2075, + "time_per_iteration": 2.563549041748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149354, + "balance_loss_mlp": 1.11075377, + "epoch": 0.3993843786071566, + "flos": 531641180160.0, + "grad_norm": 0.06732512968880089, + "language_loss": 0.84738618, + "learning_rate": 0.0006828914489681733, + "loss": 0.85887969, + "num_input_tokens_seen": 172921152, + "router_z_loss_mlp": 0.38574219, + "step": 2076, + "time_per_iteration": 2.7011008262634277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142979, + "balance_loss_mlp": 1.10440326, + "epoch": 0.3995767602924202, + "flos": 503965421568.0, + "grad_norm": 0.050728888200014394, + "language_loss": 0.85780215, + "learning_rate": 0.0006826014609928162, + "loss": 0.86923194, + "num_input_tokens_seen": 172998864, + "router_z_loss_mlp": 0.38598633, + "step": 2077, + "time_per_iteration": 2.699880838394165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026884, + "balance_loss_mlp": 1.01472485, + "epoch": 0.3997691419776837, + "flos": 1454516600832.0, + "grad_norm": 0.012471286598558728, + "language_loss": 0.83199388, + "learning_rate": 0.0006823114021252846, + "loss": 0.84226274, + "num_input_tokens_seen": 173219216, + "router_z_loss_mlp": 0.12158203, + "step": 2078, + "time_per_iteration": 4.819272518157959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112436, + "balance_loss_mlp": 1.08549809, + "epoch": 0.3999615236629473, + "flos": 530684918784.0, + "grad_norm": 0.08765386089658693, + "language_loss": 0.80571902, + "learning_rate": 0.0006820212724781896, + "loss": 0.81696254, + "num_input_tokens_seen": 173292000, + "router_z_loss_mlp": 0.38842773, + "step": 2079, + "time_per_iteration": 2.6927945613861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112693, + "balance_loss_mlp": 1.07526088, + "epoch": 0.4001539053482108, + "flos": 695130522624.0, + "grad_norm": 0.06830833334646268, + "language_loss": 0.84229112, + "learning_rate": 0.0006817310721641694, + "loss": 0.85341799, + "num_input_tokens_seen": 173365568, + "router_z_loss_mlp": 0.37402344, + "step": 2080, + "time_per_iteration": 2.8158507347106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114446, + "balance_loss_mlp": 1.07422495, + "epoch": 0.4003462870334744, + "flos": 520356939264.0, + "grad_norm": 0.0821477508940244, + "language_loss": 0.84532309, + "learning_rate": 0.00068144080129589, + "loss": 0.85646749, + "num_input_tokens_seen": 173430144, + "router_z_loss_mlp": 0.40234375, + "step": 2081, + "time_per_iteration": 2.665823221206665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111145, + "balance_loss_mlp": 1.07206321, + "epoch": 0.400538668718738, + "flos": 492518195712.0, + "grad_norm": 0.06681211266265834, + "language_loss": 0.83178174, + "learning_rate": 0.0006811504599860441, + "loss": 0.84289622, + "num_input_tokens_seen": 173494464, + "router_z_loss_mlp": 0.39379883, + "step": 2082, + "time_per_iteration": 2.517651081085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112401, + "balance_loss_mlp": 1.07382464, + "epoch": 0.40073105040400153, + "flos": 490356052992.0, + "grad_norm": 0.04646658923847655, + "language_loss": 0.86172366, + "learning_rate": 0.0006808600483473526, + "loss": 0.87284768, + "num_input_tokens_seen": 173577168, + "router_z_loss_mlp": 0.38549805, + "step": 2083, + "time_per_iteration": 2.85060715675354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106871, + "balance_loss_mlp": 1.06743646, + "epoch": 0.4009234320892651, + "flos": 562378070016.0, + "grad_norm": 0.05030907040360332, + "language_loss": 0.86459124, + "learning_rate": 0.0006805695664925629, + "loss": 0.87565994, + "num_input_tokens_seen": 173655632, + "router_z_loss_mlp": 0.39379883, + "step": 2084, + "time_per_iteration": 2.775911808013916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117346, + "balance_loss_mlp": 1.07810271, + "epoch": 0.40111581377452865, + "flos": 425998808064.0, + "grad_norm": 0.06453737345570608, + "language_loss": 0.84040797, + "learning_rate": 0.0006802790145344506, + "loss": 0.85158145, + "num_input_tokens_seen": 173719040, + "router_z_loss_mlp": 0.39233398, + "step": 2085, + "time_per_iteration": 2.4470229148864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112227, + "balance_loss_mlp": 1.08459997, + "epoch": 0.40130819545979224, + "flos": 612446842368.0, + "grad_norm": 0.07025741726477988, + "language_loss": 0.87659204, + "learning_rate": 0.0006799883925858176, + "loss": 0.8878147, + "num_input_tokens_seen": 173796704, + "router_z_loss_mlp": 0.37646484, + "step": 2086, + "time_per_iteration": 2.861490249633789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136148, + "balance_loss_mlp": 1.09709549, + "epoch": 0.40150057714505577, + "flos": 523433124864.0, + "grad_norm": 0.06341077230687828, + "language_loss": 0.85575259, + "learning_rate": 0.0006796977007594933, + "loss": 0.86711407, + "num_input_tokens_seen": 173862352, + "router_z_loss_mlp": 0.39038086, + "step": 2087, + "time_per_iteration": 2.619633197784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150049, + "balance_loss_mlp": 1.10920811, + "epoch": 0.40169295883031936, + "flos": 561424379904.0, + "grad_norm": 0.0625455511079972, + "language_loss": 0.86956239, + "learning_rate": 0.0006794069391683345, + "loss": 0.88106287, + "num_input_tokens_seen": 173935408, + "router_z_loss_mlp": 0.40844727, + "step": 2088, + "time_per_iteration": 4.210111618041992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145795, + "balance_loss_mlp": 1.10683715, + "epoch": 0.4018853405155829, + "flos": 518997984768.0, + "grad_norm": 0.0705312667092641, + "language_loss": 0.81334388, + "learning_rate": 0.0006791161079252248, + "loss": 0.8248018, + "num_input_tokens_seen": 174007152, + "router_z_loss_mlp": 0.38916016, + "step": 2089, + "time_per_iteration": 2.614766836166382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141064, + "balance_loss_mlp": 1.10286903, + "epoch": 0.4020777222008465, + "flos": 526222614528.0, + "grad_norm": 0.084499094041807, + "language_loss": 0.82758236, + "learning_rate": 0.0006788252071430747, + "loss": 0.83899295, + "num_input_tokens_seen": 174074976, + "router_z_loss_mlp": 0.38183594, + "step": 2090, + "time_per_iteration": 2.617656707763672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135863, + "balance_loss_mlp": 1.09490228, + "epoch": 0.40227010388611006, + "flos": 525763021824.0, + "grad_norm": 0.0700927477934208, + "language_loss": 0.8703053, + "learning_rate": 0.0006785342369348222, + "loss": 0.88166392, + "num_input_tokens_seen": 174149392, + "router_z_loss_mlp": 0.40942383, + "step": 2091, + "time_per_iteration": 2.7607271671295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122396, + "balance_loss_mlp": 1.08513117, + "epoch": 0.4024624855713736, + "flos": 432304252416.0, + "grad_norm": 0.09140990562062702, + "language_loss": 0.8009733, + "learning_rate": 0.0006782431974134316, + "loss": 0.81219733, + "num_input_tokens_seen": 174214656, + "router_z_loss_mlp": 0.37280273, + "step": 2092, + "time_per_iteration": 2.5610032081604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118064, + "balance_loss_mlp": 1.07889199, + "epoch": 0.4026548672566372, + "flos": 766660640256.0, + "grad_norm": 0.054626907115785994, + "language_loss": 0.89608824, + "learning_rate": 0.0006779520886918949, + "loss": 0.90726894, + "num_input_tokens_seen": 174296064, + "router_z_loss_mlp": 0.3918457, + "step": 2093, + "time_per_iteration": 3.064581871032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110103, + "balance_loss_mlp": 1.07279015, + "epoch": 0.4028472489419007, + "flos": 642931914240.0, + "grad_norm": 0.057101365791561574, + "language_loss": 0.81741238, + "learning_rate": 0.0006776609108832301, + "loss": 0.82851338, + "num_input_tokens_seen": 174370896, + "router_z_loss_mlp": 0.37304688, + "step": 2094, + "time_per_iteration": 2.77875018119812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100298, + "balance_loss_mlp": 1.06403446, + "epoch": 0.4030396306271643, + "flos": 491838718464.0, + "grad_norm": 0.06401566733015203, + "language_loss": 0.85612595, + "learning_rate": 0.0006773696641004828, + "loss": 0.86712897, + "num_input_tokens_seen": 174438448, + "router_z_loss_mlp": 0.36254883, + "step": 2095, + "time_per_iteration": 2.5543506145477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101796, + "balance_loss_mlp": 1.06522298, + "epoch": 0.40323201231242783, + "flos": 901728308736.0, + "grad_norm": 0.06439261414673134, + "language_loss": 0.77821416, + "learning_rate": 0.0006770783484567247, + "loss": 0.78923213, + "num_input_tokens_seen": 174525952, + "router_z_loss_mlp": 0.36572266, + "step": 2096, + "time_per_iteration": 3.14194393157959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114941, + "balance_loss_mlp": 1.07862973, + "epoch": 0.4034243939976914, + "flos": 570558961152.0, + "grad_norm": 0.051673087984505275, + "language_loss": 0.86408114, + "learning_rate": 0.000676786964065055, + "loss": 0.87523055, + "num_input_tokens_seen": 174607200, + "router_z_loss_mlp": 0.36328125, + "step": 2097, + "time_per_iteration": 2.796668529510498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109226, + "balance_loss_mlp": 1.07270014, + "epoch": 0.403616775682955, + "flos": 507456783360.0, + "grad_norm": 0.07558073774647381, + "language_loss": 0.79608446, + "learning_rate": 0.0006764955110385986, + "loss": 0.80717671, + "num_input_tokens_seen": 174680976, + "router_z_loss_mlp": 0.36547852, + "step": 2098, + "time_per_iteration": 2.721588134765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117865, + "balance_loss_mlp": 1.07998002, + "epoch": 0.40380915736821854, + "flos": 519383425536.0, + "grad_norm": 0.06754969850087679, + "language_loss": 0.80409288, + "learning_rate": 0.0006762039894905083, + "loss": 0.8152715, + "num_input_tokens_seen": 174753152, + "router_z_loss_mlp": 0.37890625, + "step": 2099, + "time_per_iteration": 2.6286327838897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126862, + "balance_loss_mlp": 1.08728456, + "epoch": 0.40400153905348213, + "flos": 441925590528.0, + "grad_norm": 0.06639046911061866, + "language_loss": 0.80760598, + "learning_rate": 0.000675912399533962, + "loss": 0.8188746, + "num_input_tokens_seen": 174817184, + "router_z_loss_mlp": 0.39599609, + "step": 2100, + "time_per_iteration": 2.5150249004364014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110396, + "balance_loss_mlp": 1.07420361, + "epoch": 0.40419392073874566, + "flos": 772309002240.0, + "grad_norm": 0.05652757132031041, + "language_loss": 0.85431337, + "learning_rate": 0.0006756207412821656, + "loss": 0.86541736, + "num_input_tokens_seen": 174898128, + "router_z_loss_mlp": 0.36206055, + "step": 2101, + "time_per_iteration": 2.9816384315490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107271, + "balance_loss_mlp": 1.06962454, + "epoch": 0.40438630242400925, + "flos": 766569235968.0, + "grad_norm": 0.08079981189537652, + "language_loss": 0.81269771, + "learning_rate": 0.0006753290148483505, + "loss": 0.8237704, + "num_input_tokens_seen": 174981872, + "router_z_loss_mlp": 0.37670898, + "step": 2102, + "time_per_iteration": 3.0291824340820312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111488, + "balance_loss_mlp": 1.07458103, + "epoch": 0.4045786841092728, + "flos": 415235828736.0, + "grad_norm": 0.07115498960503684, + "language_loss": 0.79040611, + "learning_rate": 0.0006750372203457752, + "loss": 0.80152106, + "num_input_tokens_seen": 175044976, + "router_z_loss_mlp": 0.36914062, + "step": 2103, + "time_per_iteration": 2.5193490982055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111376, + "balance_loss_mlp": 1.07458746, + "epoch": 0.40477106579453637, + "flos": 539214174720.0, + "grad_norm": 0.049732783973711246, + "language_loss": 0.87039417, + "learning_rate": 0.0006747453578877242, + "loss": 0.88150793, + "num_input_tokens_seen": 175121104, + "router_z_loss_mlp": 0.36767578, + "step": 2104, + "time_per_iteration": 2.691030979156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116651, + "balance_loss_mlp": 1.07998228, + "epoch": 0.4049634474797999, + "flos": 826704258048.0, + "grad_norm": 0.06592833756650988, + "language_loss": 0.83420014, + "learning_rate": 0.0006744534275875085, + "loss": 0.8453666, + "num_input_tokens_seen": 175194512, + "router_z_loss_mlp": 0.36669922, + "step": 2105, + "time_per_iteration": 2.9842946529388428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118356, + "balance_loss_mlp": 1.08099532, + "epoch": 0.4051558291650635, + "flos": 572684027904.0, + "grad_norm": 0.07270624559080442, + "language_loss": 0.85434729, + "learning_rate": 0.0006741614295584657, + "loss": 0.86553085, + "num_input_tokens_seen": 175264176, + "router_z_loss_mlp": 0.3737793, + "step": 2106, + "time_per_iteration": 2.63811993598938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117316, + "balance_loss_mlp": 1.08057594, + "epoch": 0.4053482108503271, + "flos": 731881391616.0, + "grad_norm": 0.05922552771988275, + "language_loss": 0.78890157, + "learning_rate": 0.0006738693639139595, + "loss": 0.80007476, + "num_input_tokens_seen": 175347488, + "router_z_loss_mlp": 0.36743164, + "step": 2107, + "time_per_iteration": 2.9618351459503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116371, + "balance_loss_mlp": 1.07746077, + "epoch": 0.4055405925355906, + "flos": 1213059193344.0, + "grad_norm": 0.06915522511623486, + "language_loss": 0.77808583, + "learning_rate": 0.0006735772307673796, + "loss": 0.78924954, + "num_input_tokens_seen": 175438336, + "router_z_loss_mlp": 0.38916016, + "step": 2108, + "time_per_iteration": 3.575981855392456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111094, + "balance_loss_mlp": 1.07380557, + "epoch": 0.4057329742208542, + "flos": 715863204864.0, + "grad_norm": 0.06309901973905298, + "language_loss": 0.83742046, + "learning_rate": 0.0006732850302321421, + "loss": 0.84853137, + "num_input_tokens_seen": 175510912, + "router_z_loss_mlp": 0.37280273, + "step": 2109, + "time_per_iteration": 3.045565605163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114415, + "balance_loss_mlp": 1.0778178, + "epoch": 0.4059253559061177, + "flos": 564888577536.0, + "grad_norm": 0.060704196703692835, + "language_loss": 0.84782875, + "learning_rate": 0.00067299276242169, + "loss": 0.85897285, + "num_input_tokens_seen": 175583040, + "router_z_loss_mlp": 0.3659668, + "step": 2110, + "time_per_iteration": 2.6868693828582764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047442, + "balance_loss_mlp": 1.03666544, + "epoch": 0.4061177375913813, + "flos": 1593744450048.0, + "grad_norm": 0.029253972882140933, + "language_loss": 0.74382168, + "learning_rate": 0.0006727004274494908, + "loss": 0.75429612, + "num_input_tokens_seen": 175817952, + "router_z_loss_mlp": 0.10791016, + "step": 2111, + "time_per_iteration": 4.918604612350464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110434, + "balance_loss_mlp": 1.07281184, + "epoch": 0.40631011927664484, + "flos": 615421711872.0, + "grad_norm": 0.06207465310904933, + "language_loss": 0.78018594, + "learning_rate": 0.0006724080254290395, + "loss": 0.79129028, + "num_input_tokens_seen": 175896352, + "router_z_loss_mlp": 0.37597656, + "step": 2112, + "time_per_iteration": 2.798377752304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116483, + "balance_loss_mlp": 1.08012438, + "epoch": 0.40650250096190843, + "flos": 557661376512.0, + "grad_norm": 0.07195778929743761, + "language_loss": 0.89838338, + "learning_rate": 0.0006721155564738566, + "loss": 0.90954828, + "num_input_tokens_seen": 175967152, + "router_z_loss_mlp": 0.36401367, + "step": 2113, + "time_per_iteration": 2.721280813217163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034826, + "balance_loss_mlp": 1.02395451, + "epoch": 0.40669488264717196, + "flos": 1580147564544.0, + "grad_norm": 0.019551827625956694, + "language_loss": 0.78622639, + "learning_rate": 0.0006718230206974884, + "loss": 0.79657471, + "num_input_tokens_seen": 176205248, + "router_z_loss_mlp": 0.10888672, + "step": 2114, + "time_per_iteration": 4.956322193145752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110507, + "balance_loss_mlp": 1.07269359, + "epoch": 0.40688726433243555, + "flos": 507649503744.0, + "grad_norm": 0.052092004512661015, + "language_loss": 0.85970294, + "learning_rate": 0.0006715304182135078, + "loss": 0.87080801, + "num_input_tokens_seen": 176276208, + "router_z_loss_mlp": 0.37792969, + "step": 2115, + "time_per_iteration": 2.611116647720337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114836, + "balance_loss_mlp": 1.07611692, + "epoch": 0.40707964601769914, + "flos": 589075172352.0, + "grad_norm": 0.051206353593090614, + "language_loss": 0.89130676, + "learning_rate": 0.0006712377491355127, + "loss": 0.90245515, + "num_input_tokens_seen": 176355072, + "router_z_loss_mlp": 0.38696289, + "step": 2116, + "time_per_iteration": 2.8788397312164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120118, + "balance_loss_mlp": 1.0829246, + "epoch": 0.40727202770296267, + "flos": 580437259776.0, + "grad_norm": 0.049235441975469474, + "language_loss": 0.81475073, + "learning_rate": 0.0006709450135771274, + "loss": 0.82595193, + "num_input_tokens_seen": 176444592, + "router_z_loss_mlp": 0.37182617, + "step": 2117, + "time_per_iteration": 2.944436550140381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118262, + "balance_loss_mlp": 1.08233273, + "epoch": 0.40746440938822626, + "flos": 504076649472.0, + "grad_norm": 0.05682697017745506, + "language_loss": 0.86693907, + "learning_rate": 0.0006706522116520023, + "loss": 0.87812167, + "num_input_tokens_seen": 176516144, + "router_z_loss_mlp": 0.35913086, + "step": 2118, + "time_per_iteration": 2.6161422729492188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125881, + "balance_loss_mlp": 1.08766294, + "epoch": 0.4076567910734898, + "flos": 605600312832.0, + "grad_norm": 0.060179733914174166, + "language_loss": 0.83147317, + "learning_rate": 0.0006703593434738127, + "loss": 0.84273201, + "num_input_tokens_seen": 176585712, + "router_z_loss_mlp": 0.38208008, + "step": 2119, + "time_per_iteration": 2.719313383102417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123164, + "balance_loss_mlp": 1.0857563, + "epoch": 0.4078491727587534, + "flos": 479553799680.0, + "grad_norm": 0.06825324786035328, + "language_loss": 0.78421569, + "learning_rate": 0.0006700664091562604, + "loss": 0.79544735, + "num_input_tokens_seen": 176654736, + "router_z_loss_mlp": 0.37402344, + "step": 2120, + "time_per_iteration": 2.569246530532837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125736, + "balance_loss_mlp": 1.09090257, + "epoch": 0.4080415544440169, + "flos": 510384665088.0, + "grad_norm": 0.051920603902655335, + "language_loss": 0.85211694, + "learning_rate": 0.0006697734088130725, + "loss": 0.86337435, + "num_input_tokens_seen": 176722800, + "router_z_loss_mlp": 0.34863281, + "step": 2121, + "time_per_iteration": 2.67394757270813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124636, + "balance_loss_mlp": 1.08732319, + "epoch": 0.4082339361292805, + "flos": 734638947840.0, + "grad_norm": 0.05791753235244458, + "language_loss": 0.85750419, + "learning_rate": 0.0006694803425580018, + "loss": 0.86875051, + "num_input_tokens_seen": 176800320, + "router_z_loss_mlp": 0.37304688, + "step": 2122, + "time_per_iteration": 2.9812121391296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129477, + "balance_loss_mlp": 1.09178257, + "epoch": 0.4084263178145441, + "flos": 457472074752.0, + "grad_norm": 0.06590998571054847, + "language_loss": 0.84986377, + "learning_rate": 0.0006691872105048268, + "loss": 0.86115849, + "num_input_tokens_seen": 176867440, + "router_z_loss_mlp": 0.37646484, + "step": 2123, + "time_per_iteration": 2.56272292137146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137814, + "balance_loss_mlp": 1.10157394, + "epoch": 0.4086186994998076, + "flos": 562931638272.0, + "grad_norm": 0.05742584890727743, + "language_loss": 0.84864831, + "learning_rate": 0.0006688940127673513, + "loss": 0.86002642, + "num_input_tokens_seen": 176942048, + "router_z_loss_mlp": 0.36254883, + "step": 2124, + "time_per_iteration": 2.6935954093933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113197, + "balance_loss_mlp": 1.09642184, + "epoch": 0.4088110811850712, + "flos": 573669651456.0, + "grad_norm": 0.05672589959491125, + "language_loss": 0.85339016, + "learning_rate": 0.0006686007494594049, + "loss": 0.86470985, + "num_input_tokens_seen": 177025104, + "router_z_loss_mlp": 0.35571289, + "step": 2125, + "time_per_iteration": 2.8291172981262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128385, + "balance_loss_mlp": 1.09097719, + "epoch": 0.40900346287033473, + "flos": 456930989568.0, + "grad_norm": 0.06786502616833631, + "language_loss": 0.81025755, + "learning_rate": 0.0006683074206948425, + "loss": 0.82154143, + "num_input_tokens_seen": 177089296, + "router_z_loss_mlp": 0.37402344, + "step": 2126, + "time_per_iteration": 2.5193305015563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126549, + "balance_loss_mlp": 1.09095287, + "epoch": 0.4091958445555983, + "flos": 617395903488.0, + "grad_norm": 0.06065849070073351, + "language_loss": 0.81971312, + "learning_rate": 0.0006680140265875443, + "loss": 0.83097857, + "num_input_tokens_seen": 177163648, + "router_z_loss_mlp": 0.35595703, + "step": 2127, + "time_per_iteration": 2.8254714012145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127015, + "balance_loss_mlp": 1.09184861, + "epoch": 0.40938822624086185, + "flos": 472400750592.0, + "grad_norm": 0.054477830354085016, + "language_loss": 0.95947516, + "learning_rate": 0.0006677205672514162, + "loss": 0.97074527, + "num_input_tokens_seen": 177233856, + "router_z_loss_mlp": 0.35205078, + "step": 2128, + "time_per_iteration": 2.6226608753204346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120165, + "balance_loss_mlp": 1.0867151, + "epoch": 0.40958060792612544, + "flos": 570010535424.0, + "grad_norm": 0.047090391860463696, + "language_loss": 0.88730562, + "learning_rate": 0.000667427042800389, + "loss": 0.8985073, + "num_input_tokens_seen": 177309824, + "router_z_loss_mlp": 0.3347168, + "step": 2129, + "time_per_iteration": 2.7718160152435303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118491, + "balance_loss_mlp": 1.0833478, + "epoch": 0.40977298961138897, + "flos": 609346063872.0, + "grad_norm": 0.05934025192817406, + "language_loss": 0.83200449, + "learning_rate": 0.0006671334533484192, + "loss": 0.84318936, + "num_input_tokens_seen": 177380592, + "router_z_loss_mlp": 0.3515625, + "step": 2130, + "time_per_iteration": 2.7164061069488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126113, + "balance_loss_mlp": 1.09199548, + "epoch": 0.40996537129665256, + "flos": 581744457216.0, + "grad_norm": 0.04849724567471186, + "language_loss": 0.83465552, + "learning_rate": 0.0006668397990094881, + "loss": 0.84591663, + "num_input_tokens_seen": 177454720, + "router_z_loss_mlp": 0.34130859, + "step": 2131, + "time_per_iteration": 2.684115171432495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124098, + "balance_loss_mlp": 1.08738196, + "epoch": 0.41015775298191615, + "flos": 516546948096.0, + "grad_norm": 0.059898700524732326, + "language_loss": 0.84854865, + "learning_rate": 0.0006665460798976027, + "loss": 0.85978961, + "num_input_tokens_seen": 177528224, + "router_z_loss_mlp": 0.3671875, + "step": 2132, + "time_per_iteration": 2.748350143432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114613, + "balance_loss_mlp": 1.07899356, + "epoch": 0.4103501346671797, + "flos": 510354929664.0, + "grad_norm": 0.057665198388541644, + "language_loss": 0.81392014, + "learning_rate": 0.0006662522961267947, + "loss": 0.82506627, + "num_input_tokens_seen": 177598176, + "router_z_loss_mlp": 0.35620117, + "step": 2133, + "time_per_iteration": 2.696699619293213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117281, + "balance_loss_mlp": 1.08192313, + "epoch": 0.41054251635244327, + "flos": 549752126976.0, + "grad_norm": 0.05272213252392562, + "language_loss": 0.87773252, + "learning_rate": 0.0006659584478111211, + "loss": 0.88890535, + "num_input_tokens_seen": 177675840, + "router_z_loss_mlp": 0.35351562, + "step": 2134, + "time_per_iteration": 2.793302536010742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118998, + "balance_loss_mlp": 1.08249605, + "epoch": 0.4107348980377068, + "flos": 839898450432.0, + "grad_norm": 0.06878890228068688, + "language_loss": 0.83315176, + "learning_rate": 0.000665664535064664, + "loss": 0.84434175, + "num_input_tokens_seen": 177751376, + "router_z_loss_mlp": 0.36499023, + "step": 2135, + "time_per_iteration": 3.0627260208129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104736, + "balance_loss_mlp": 1.06987929, + "epoch": 0.4109272797229704, + "flos": 503708461056.0, + "grad_norm": 0.05984370507865806, + "language_loss": 0.83017695, + "learning_rate": 0.0006653705580015303, + "loss": 0.84122425, + "num_input_tokens_seen": 177825264, + "router_z_loss_mlp": 0.34863281, + "step": 2136, + "time_per_iteration": 2.6851253509521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103645, + "balance_loss_mlp": 1.0668807, + "epoch": 0.4111196614082339, + "flos": 610830927360.0, + "grad_norm": 0.07790160743926922, + "language_loss": 0.86554241, + "learning_rate": 0.0006650765167358523, + "loss": 0.87657887, + "num_input_tokens_seen": 177901680, + "router_z_loss_mlp": 0.36743164, + "step": 2137, + "time_per_iteration": 2.7750425338745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111084, + "balance_loss_mlp": 1.07579243, + "epoch": 0.4113120430934975, + "flos": 453165414912.0, + "grad_norm": 0.06074101962252474, + "language_loss": 0.9028185, + "learning_rate": 0.0006647824113817864, + "loss": 0.91392696, + "num_input_tokens_seen": 177965264, + "router_z_loss_mlp": 0.3503418, + "step": 2138, + "time_per_iteration": 2.5466508865356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120271, + "balance_loss_mlp": 1.08348298, + "epoch": 0.41150442477876104, + "flos": 541600971264.0, + "grad_norm": 0.0860402389983067, + "language_loss": 0.81677365, + "learning_rate": 0.000664488242053515, + "loss": 0.82797635, + "num_input_tokens_seen": 178039712, + "router_z_loss_mlp": 0.36767578, + "step": 2139, + "time_per_iteration": 2.7149875164031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114944, + "balance_loss_mlp": 1.08108878, + "epoch": 0.4116968064640246, + "flos": 576291386880.0, + "grad_norm": 0.05168082296105111, + "language_loss": 0.83784723, + "learning_rate": 0.0006641940088652445, + "loss": 0.84899676, + "num_input_tokens_seen": 178114080, + "router_z_loss_mlp": 0.33886719, + "step": 2140, + "time_per_iteration": 2.7871952056884766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118503, + "balance_loss_mlp": 1.08130932, + "epoch": 0.4118891881492882, + "flos": 496115642880.0, + "grad_norm": 0.07036618696819374, + "language_loss": 0.8248812, + "learning_rate": 0.0006638997119312065, + "loss": 0.83606619, + "num_input_tokens_seen": 178188032, + "router_z_loss_mlp": 0.37207031, + "step": 2141, + "time_per_iteration": 2.7391679286956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038617, + "balance_loss_mlp": 1.02841258, + "epoch": 0.41208156983455174, + "flos": 1538395877376.0, + "grad_norm": 0.01970513212166274, + "language_loss": 0.75063306, + "learning_rate": 0.0006636053513656568, + "loss": 0.76101923, + "num_input_tokens_seen": 178395328, + "router_z_loss_mlp": 0.10205078, + "step": 2142, + "time_per_iteration": 4.920190095901489 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113096, + "balance_loss_mlp": 1.09562647, + "epoch": 0.41227395151981533, + "flos": 584968946688.0, + "grad_norm": 0.07114532863779677, + "language_loss": 0.8524918, + "learning_rate": 0.000663310927282877, + "loss": 0.86380136, + "num_input_tokens_seen": 178471952, + "router_z_loss_mlp": 0.35327148, + "step": 2143, + "time_per_iteration": 2.762634515762329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126888, + "balance_loss_mlp": 1.09098172, + "epoch": 0.41246633320507886, + "flos": 442926268416.0, + "grad_norm": 0.06302616573108136, + "language_loss": 0.86451441, + "learning_rate": 0.000663016439797172, + "loss": 0.87578332, + "num_input_tokens_seen": 178542192, + "router_z_loss_mlp": 0.35913086, + "step": 2144, + "time_per_iteration": 2.623093366622925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117577, + "balance_loss_mlp": 1.082816, + "epoch": 0.41265871489034245, + "flos": 579962985984.0, + "grad_norm": 0.054034946771414454, + "language_loss": 0.80777407, + "learning_rate": 0.0006627218890228724, + "loss": 0.81894982, + "num_input_tokens_seen": 178622736, + "router_z_loss_mlp": 0.34765625, + "step": 2145, + "time_per_iteration": 2.79042911529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118351, + "balance_loss_mlp": 1.08373237, + "epoch": 0.412851096575606, + "flos": 761229964800.0, + "grad_norm": 0.06837741268569841, + "language_loss": 0.83587825, + "learning_rate": 0.0006624272750743326, + "loss": 0.84706175, + "num_input_tokens_seen": 178705808, + "router_z_loss_mlp": 0.34643555, + "step": 2146, + "time_per_iteration": 3.0066065788269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110844, + "balance_loss_mlp": 1.07591534, + "epoch": 0.41304347826086957, + "flos": 555353501184.0, + "grad_norm": 0.052525216454956766, + "language_loss": 0.83126348, + "learning_rate": 0.0006621325980659322, + "loss": 0.84237194, + "num_input_tokens_seen": 178781200, + "router_z_loss_mlp": 0.34912109, + "step": 2147, + "time_per_iteration": 2.77634334564209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110797, + "balance_loss_mlp": 1.07429504, + "epoch": 0.41323585994613315, + "flos": 665712940032.0, + "grad_norm": 0.06743799661442922, + "language_loss": 0.82004929, + "learning_rate": 0.000661837858112075, + "loss": 0.83115721, + "num_input_tokens_seen": 178855072, + "router_z_loss_mlp": 0.36499023, + "step": 2148, + "time_per_iteration": 2.8309879302978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108806, + "balance_loss_mlp": 1.07156515, + "epoch": 0.4134282416313967, + "flos": 548699692032.0, + "grad_norm": 0.060878143567582824, + "language_loss": 0.88845801, + "learning_rate": 0.0006615430553271888, + "loss": 0.89954603, + "num_input_tokens_seen": 178927936, + "router_z_loss_mlp": 0.37231445, + "step": 2149, + "time_per_iteration": 2.7831413745880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110838, + "balance_loss_mlp": 1.0737617, + "epoch": 0.4136206233166603, + "flos": 646262489088.0, + "grad_norm": 0.05890657915946428, + "language_loss": 0.85358977, + "learning_rate": 0.0006612481898257264, + "loss": 0.86467361, + "num_input_tokens_seen": 179007792, + "router_z_loss_mlp": 0.34619141, + "step": 2150, + "time_per_iteration": 2.8594231605529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116144, + "balance_loss_mlp": 1.08021438, + "epoch": 0.4138130050019238, + "flos": 517354905600.0, + "grad_norm": 0.0708787663681645, + "language_loss": 0.85383213, + "learning_rate": 0.000660953261722165, + "loss": 0.86499357, + "num_input_tokens_seen": 179075200, + "router_z_loss_mlp": 0.359375, + "step": 2151, + "time_per_iteration": 2.616218090057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110269, + "balance_loss_mlp": 1.07512605, + "epoch": 0.4140053866871874, + "flos": 609254659584.0, + "grad_norm": 0.05740780888166335, + "language_loss": 0.82834315, + "learning_rate": 0.0006606582711310055, + "loss": 0.83944577, + "num_input_tokens_seen": 179144448, + "router_z_loss_mlp": 0.3515625, + "step": 2152, + "time_per_iteration": 2.752922773361206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116071, + "balance_loss_mlp": 1.07918727, + "epoch": 0.4141977683724509, + "flos": 579762925056.0, + "grad_norm": 0.062483875204726216, + "language_loss": 0.83428371, + "learning_rate": 0.0006603632181667736, + "loss": 0.84544444, + "num_input_tokens_seen": 179215776, + "router_z_loss_mlp": 0.36865234, + "step": 2153, + "time_per_iteration": 2.6699299812316895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093007, + "balance_loss_mlp": 1.0828501, + "epoch": 0.4143901500577145, + "flos": 1307312317440.0, + "grad_norm": 0.03944020407638644, + "language_loss": 0.78943324, + "learning_rate": 0.0006600681029440187, + "loss": 0.8003633, + "num_input_tokens_seen": 179436688, + "router_z_loss_mlp": 0.1015625, + "step": 2154, + "time_per_iteration": 4.931839227676392 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117724, + "balance_loss_mlp": 1.0825572, + "epoch": 0.41458253174297804, + "flos": 460189983744.0, + "grad_norm": 0.08977793466970029, + "language_loss": 0.82004881, + "learning_rate": 0.0006597729255773153, + "loss": 0.83122605, + "num_input_tokens_seen": 179503264, + "router_z_loss_mlp": 0.3515625, + "step": 2155, + "time_per_iteration": 2.5100300312042236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114058, + "balance_loss_mlp": 1.07769895, + "epoch": 0.41477491342824163, + "flos": 553364628480.0, + "grad_norm": 0.24704033930451297, + "language_loss": 0.82534748, + "learning_rate": 0.0006594776861812608, + "loss": 0.83648813, + "num_input_tokens_seen": 179574864, + "router_z_loss_mlp": 0.36328125, + "step": 2156, + "time_per_iteration": 2.652275562286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124067, + "balance_loss_mlp": 1.0867784, + "epoch": 0.4149672951135052, + "flos": 697771708416.0, + "grad_norm": 0.053182178449683815, + "language_loss": 0.86615425, + "learning_rate": 0.0006591823848704776, + "loss": 0.87739491, + "num_input_tokens_seen": 179658208, + "router_z_loss_mlp": 0.37280273, + "step": 2157, + "time_per_iteration": 2.958137273788452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123111, + "balance_loss_mlp": 1.08653796, + "epoch": 0.41515967679876875, + "flos": 565750863360.0, + "grad_norm": 0.05319975052329094, + "language_loss": 0.81529272, + "learning_rate": 0.0006588870217596117, + "loss": 0.82652378, + "num_input_tokens_seen": 179732320, + "router_z_loss_mlp": 0.36572266, + "step": 2158, + "time_per_iteration": 2.739755392074585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136072, + "balance_loss_mlp": 1.09847283, + "epoch": 0.41535205848403234, + "flos": 501185843712.0, + "grad_norm": 0.06859141393857857, + "language_loss": 0.85955006, + "learning_rate": 0.0006585915969633334, + "loss": 0.87091076, + "num_input_tokens_seen": 179801616, + "router_z_loss_mlp": 0.37573242, + "step": 2159, + "time_per_iteration": 2.561397075653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138332, + "balance_loss_mlp": 1.1019969, + "epoch": 0.41554444016929587, + "flos": 607554680832.0, + "grad_norm": 0.06079365960323944, + "language_loss": 0.89314306, + "learning_rate": 0.0006582961105963366, + "loss": 0.90452635, + "num_input_tokens_seen": 179876112, + "router_z_loss_mlp": 0.36328125, + "step": 2160, + "time_per_iteration": 2.791609287261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141319, + "balance_loss_mlp": 1.10546052, + "epoch": 0.41573682185455946, + "flos": 529115991552.0, + "grad_norm": 0.06462553372591408, + "language_loss": 0.77976739, + "learning_rate": 0.0006580005627733395, + "loss": 0.79118055, + "num_input_tokens_seen": 179949936, + "router_z_loss_mlp": 0.35913086, + "step": 2161, + "time_per_iteration": 2.6615841388702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152655, + "balance_loss_mlp": 1.11536634, + "epoch": 0.415929203539823, + "flos": 504956187648.0, + "grad_norm": 0.06713934338553489, + "language_loss": 0.82142949, + "learning_rate": 0.0006577049536090838, + "loss": 0.83295602, + "num_input_tokens_seen": 180023184, + "router_z_loss_mlp": 0.37280273, + "step": 2162, + "time_per_iteration": 2.7025601863861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114864, + "balance_loss_mlp": 1.11163712, + "epoch": 0.4161215852250866, + "flos": 582737794560.0, + "grad_norm": 0.06389110494138472, + "language_loss": 0.8567937, + "learning_rate": 0.000657409283218335, + "loss": 0.86828005, + "num_input_tokens_seen": 180091728, + "router_z_loss_mlp": 0.37011719, + "step": 2163, + "time_per_iteration": 2.6993329524993896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160123, + "balance_loss_mlp": 1.12352586, + "epoch": 0.4163139669103501, + "flos": 490697077248.0, + "grad_norm": 0.058431004936437055, + "language_loss": 0.81466627, + "learning_rate": 0.0006571135517158829, + "loss": 0.82626748, + "num_input_tokens_seen": 180162096, + "router_z_loss_mlp": 0.3659668, + "step": 2164, + "time_per_iteration": 2.6519243717193604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114432, + "balance_loss_mlp": 1.13316202, + "epoch": 0.4165063485956137, + "flos": 1288158474240.0, + "grad_norm": 0.04824937130362004, + "language_loss": 0.76764059, + "learning_rate": 0.0006568177592165404, + "loss": 0.77908379, + "num_input_tokens_seen": 180380912, + "router_z_loss_mlp": 0.11181641, + "step": 2165, + "time_per_iteration": 4.770123481750488 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155561, + "balance_loss_mlp": 1.11765289, + "epoch": 0.4166987302808773, + "flos": 495263268864.0, + "grad_norm": 0.07363984603082524, + "language_loss": 0.83210087, + "learning_rate": 0.0006565219058351444, + "loss": 0.84365654, + "num_input_tokens_seen": 180447424, + "router_z_loss_mlp": 0.37866211, + "step": 2166, + "time_per_iteration": 2.6601247787475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144571, + "balance_loss_mlp": 1.10470724, + "epoch": 0.4168911119661408, + "flos": 464071555584.0, + "grad_norm": 0.06568932383648114, + "language_loss": 0.83008349, + "learning_rate": 0.0006562259916865553, + "loss": 0.84152913, + "num_input_tokens_seen": 180516336, + "router_z_loss_mlp": 0.39868164, + "step": 2167, + "time_per_iteration": 2.5785412788391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137223, + "balance_loss_mlp": 1.0999341, + "epoch": 0.4170834936514044, + "flos": 536787730944.0, + "grad_norm": 0.06458514122378838, + "language_loss": 0.79966152, + "learning_rate": 0.0006559300168856573, + "loss": 0.81103373, + "num_input_tokens_seen": 180589824, + "router_z_loss_mlp": 0.37255859, + "step": 2168, + "time_per_iteration": 2.7237303256988525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140045, + "balance_loss_mlp": 1.10316169, + "epoch": 0.41727587533666793, + "flos": 550683795456.0, + "grad_norm": 0.050633821406227124, + "language_loss": 0.86603534, + "learning_rate": 0.0006556339815473577, + "loss": 0.8774358, + "num_input_tokens_seen": 180661296, + "router_z_loss_mlp": 0.36889648, + "step": 2169, + "time_per_iteration": 2.6403653621673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140714, + "balance_loss_mlp": 1.10254359, + "epoch": 0.4174682570219315, + "flos": 631111357440.0, + "grad_norm": 0.05999280354484277, + "language_loss": 0.86559451, + "learning_rate": 0.000655337885786588, + "loss": 0.87700164, + "num_input_tokens_seen": 180744896, + "router_z_loss_mlp": 0.3815918, + "step": 2170, + "time_per_iteration": 2.927175283432007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144977, + "balance_loss_mlp": 1.10737872, + "epoch": 0.41766063870719505, + "flos": 519751613952.0, + "grad_norm": 0.06541761902088469, + "language_loss": 0.85292417, + "learning_rate": 0.0006550417297183025, + "loss": 0.86437398, + "num_input_tokens_seen": 180813008, + "router_z_loss_mlp": 0.37597656, + "step": 2171, + "time_per_iteration": 2.617203950881958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139774, + "balance_loss_mlp": 1.10174668, + "epoch": 0.41785302039245864, + "flos": 557935589376.0, + "grad_norm": 0.06470887192105082, + "language_loss": 0.81668884, + "learning_rate": 0.0006547455134574793, + "loss": 0.82808661, + "num_input_tokens_seen": 180886480, + "router_z_loss_mlp": 0.37988281, + "step": 2172, + "time_per_iteration": 2.6800732612609863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145151, + "balance_loss_mlp": 1.10817289, + "epoch": 0.41804540207772223, + "flos": 788529821184.0, + "grad_norm": 0.06060457888036509, + "language_loss": 0.84434199, + "learning_rate": 0.0006544492371191198, + "loss": 0.85579354, + "num_input_tokens_seen": 180973776, + "router_z_loss_mlp": 0.36962891, + "step": 2173, + "time_per_iteration": 3.134876251220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140851, + "balance_loss_mlp": 1.10113096, + "epoch": 0.41823778376298576, + "flos": 904332418560.0, + "grad_norm": 0.09700819760133231, + "language_loss": 0.83721489, + "learning_rate": 0.0006541529008182485, + "loss": 0.84862345, + "num_input_tokens_seen": 181062768, + "router_z_loss_mlp": 0.39697266, + "step": 2174, + "time_per_iteration": 3.1724131107330322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113477, + "balance_loss_mlp": 1.09893537, + "epoch": 0.41843016544824935, + "flos": 511560811008.0, + "grad_norm": 0.060160949925642034, + "language_loss": 0.87700981, + "learning_rate": 0.0006538565046699136, + "loss": 0.88835752, + "num_input_tokens_seen": 181129872, + "router_z_loss_mlp": 0.3581543, + "step": 2175, + "time_per_iteration": 2.5730292797088623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133428, + "balance_loss_mlp": 1.09683084, + "epoch": 0.4186225471335129, + "flos": 653077085184.0, + "grad_norm": 0.06692113802371265, + "language_loss": 0.81824857, + "learning_rate": 0.0006535600487891862, + "loss": 0.82958287, + "num_input_tokens_seen": 181208112, + "router_z_loss_mlp": 0.3659668, + "step": 2176, + "time_per_iteration": 2.7692394256591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121709, + "balance_loss_mlp": 1.08651876, + "epoch": 0.41881492881877647, + "flos": 569158161408.0, + "grad_norm": 0.07459509047969586, + "language_loss": 0.89445305, + "learning_rate": 0.0006532635332911603, + "loss": 0.90567011, + "num_input_tokens_seen": 181278736, + "router_z_loss_mlp": 0.3515625, + "step": 2177, + "time_per_iteration": 2.668281078338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122309, + "balance_loss_mlp": 1.08449602, + "epoch": 0.41900731050404, + "flos": 911878248960.0, + "grad_norm": 0.054056674099833946, + "language_loss": 0.80669487, + "learning_rate": 0.0006529669582909541, + "loss": 0.81791794, + "num_input_tokens_seen": 181362512, + "router_z_loss_mlp": 0.37744141, + "step": 2178, + "time_per_iteration": 3.234210729598999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132134, + "balance_loss_mlp": 1.0946312, + "epoch": 0.4191996921893036, + "flos": 535755119616.0, + "grad_norm": 0.13706718234639897, + "language_loss": 0.85650241, + "learning_rate": 0.0006526703239037077, + "loss": 0.86782372, + "num_input_tokens_seen": 181432080, + "router_z_loss_mlp": 0.37475586, + "step": 2179, + "time_per_iteration": 2.6495871543884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129835, + "balance_loss_mlp": 1.09094954, + "epoch": 0.4193920738745671, + "flos": 582636478464.0, + "grad_norm": 0.09871097727336539, + "language_loss": 0.86649984, + "learning_rate": 0.0006523736302445851, + "loss": 0.8777982, + "num_input_tokens_seen": 181507296, + "router_z_loss_mlp": 0.38891602, + "step": 2180, + "time_per_iteration": 2.7558817863464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133221, + "balance_loss_mlp": 1.09390545, + "epoch": 0.4195844555598307, + "flos": 1335782472192.0, + "grad_norm": 0.05706426412838818, + "language_loss": 0.77595234, + "learning_rate": 0.0006520768774287728, + "loss": 0.78728461, + "num_input_tokens_seen": 181599408, + "router_z_loss_mlp": 0.39306641, + "step": 2181, + "time_per_iteration": 3.7205944061279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143371, + "balance_loss_mlp": 1.10436535, + "epoch": 0.4197768372450943, + "flos": 598783145472.0, + "grad_norm": 0.06053658357019196, + "language_loss": 0.85689628, + "learning_rate": 0.0006517800655714806, + "loss": 0.86832994, + "num_input_tokens_seen": 181674944, + "router_z_loss_mlp": 0.39013672, + "step": 2182, + "time_per_iteration": 2.8325769901275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140717, + "balance_loss_mlp": 1.10218823, + "epoch": 0.4199692189303578, + "flos": 735261525504.0, + "grad_norm": 0.07751994631636654, + "language_loss": 0.85342467, + "learning_rate": 0.0006514831947879407, + "loss": 0.86483186, + "num_input_tokens_seen": 181756704, + "router_z_loss_mlp": 0.38500977, + "step": 2183, + "time_per_iteration": 2.930466890335083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154155, + "balance_loss_mlp": 1.11531675, + "epoch": 0.4201616006156214, + "flos": 750214794240.0, + "grad_norm": 0.061313063449444025, + "language_loss": 0.78360265, + "learning_rate": 0.0006511862651934091, + "loss": 0.7951442, + "num_input_tokens_seen": 181837952, + "router_z_loss_mlp": 0.38842773, + "step": 2184, + "time_per_iteration": 3.0874462127685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168235, + "balance_loss_mlp": 1.1299212, + "epoch": 0.42035398230088494, + "flos": 547029448704.0, + "grad_norm": 0.07362784353092817, + "language_loss": 0.820894, + "learning_rate": 0.0006508892769031638, + "loss": 0.83257627, + "num_input_tokens_seen": 181906896, + "router_z_loss_mlp": 0.3828125, + "step": 2185, + "time_per_iteration": 2.6239352226257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152043, + "balance_loss_mlp": 1.11551726, + "epoch": 0.42054636398614853, + "flos": 616911717888.0, + "grad_norm": 0.06908564705964859, + "language_loss": 0.87278891, + "learning_rate": 0.000650592230032506, + "loss": 0.88430935, + "num_input_tokens_seen": 181974976, + "router_z_loss_mlp": 0.36523438, + "step": 2186, + "time_per_iteration": 2.7282140254974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149658, + "balance_loss_mlp": 1.11079597, + "epoch": 0.42073874567141206, + "flos": 640394242560.0, + "grad_norm": 0.0823679101553184, + "language_loss": 0.85327846, + "learning_rate": 0.0006502951246967595, + "loss": 0.86477506, + "num_input_tokens_seen": 182054704, + "router_z_loss_mlp": 0.38891602, + "step": 2187, + "time_per_iteration": 2.8729426860809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154068, + "balance_loss_mlp": 1.1164453, + "epoch": 0.42093112735667565, + "flos": 493783174656.0, + "grad_norm": 0.05336445965116177, + "language_loss": 0.86749196, + "learning_rate": 0.0006499979610112706, + "loss": 0.87903261, + "num_input_tokens_seen": 182129696, + "router_z_loss_mlp": 0.3762207, + "step": 2188, + "time_per_iteration": 2.7119579315185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151369, + "balance_loss_mlp": 1.1127454, + "epoch": 0.4211235090419392, + "flos": 542364512256.0, + "grad_norm": 0.055701229884667774, + "language_loss": 0.84561181, + "learning_rate": 0.000649700739091409, + "loss": 0.85712552, + "num_input_tokens_seen": 182203792, + "router_z_loss_mlp": 0.38623047, + "step": 2189, + "time_per_iteration": 2.7023189067840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108128, + "balance_loss_mlp": 1.07126629, + "epoch": 0.42131589072720277, + "flos": 1532149530624.0, + "grad_norm": 0.037864476589066096, + "language_loss": 0.73836273, + "learning_rate": 0.0006494034590525657, + "loss": 0.74917555, + "num_input_tokens_seen": 182432080, + "router_z_loss_mlp": 0.10009766, + "step": 2190, + "time_per_iteration": 4.808679103851318 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142515, + "balance_loss_mlp": 1.10751486, + "epoch": 0.42150827241246636, + "flos": 566852857344.0, + "grad_norm": 0.07155258064415941, + "language_loss": 0.85762346, + "learning_rate": 0.0006491061210101557, + "loss": 0.8690486, + "num_input_tokens_seen": 182500256, + "router_z_loss_mlp": 0.35009766, + "step": 2191, + "time_per_iteration": 2.7315032482147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146027, + "balance_loss_mlp": 1.10880995, + "epoch": 0.4217006540977299, + "flos": 707561174016.0, + "grad_norm": 0.05430057095490736, + "language_loss": 0.84269011, + "learning_rate": 0.0006488087250796157, + "loss": 0.85415035, + "num_input_tokens_seen": 182582912, + "router_z_loss_mlp": 0.37231445, + "step": 2192, + "time_per_iteration": 2.91867995262146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140476, + "balance_loss_mlp": 1.10154223, + "epoch": 0.4218930357829935, + "flos": 627291454464.0, + "grad_norm": 0.05336306174245454, + "language_loss": 0.81998622, + "learning_rate": 0.0006485112713764049, + "loss": 0.83139098, + "num_input_tokens_seen": 182670304, + "router_z_loss_mlp": 0.38916016, + "step": 2193, + "time_per_iteration": 2.954740047454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139123, + "balance_loss_mlp": 1.10178626, + "epoch": 0.422085417468257, + "flos": 460345628160.0, + "grad_norm": 0.05416843930927548, + "language_loss": 0.83712393, + "learning_rate": 0.0006482137600160051, + "loss": 0.84851515, + "num_input_tokens_seen": 182735024, + "router_z_loss_mlp": 0.3737793, + "step": 2194, + "time_per_iteration": 2.4989676475524902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144403, + "balance_loss_mlp": 1.10573113, + "epoch": 0.4222777991535206, + "flos": 474026577408.0, + "grad_norm": 0.05184002865736912, + "language_loss": 0.8501671, + "learning_rate": 0.0006479161911139206, + "loss": 0.86161113, + "num_input_tokens_seen": 182805024, + "router_z_loss_mlp": 0.38671875, + "step": 2195, + "time_per_iteration": 2.5739963054656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135341, + "balance_loss_mlp": 1.09721804, + "epoch": 0.4224701808387841, + "flos": 470886151680.0, + "grad_norm": 0.08063840338659255, + "language_loss": 0.85617948, + "learning_rate": 0.0006476185647856778, + "loss": 0.86753291, + "num_input_tokens_seen": 182871360, + "router_z_loss_mlp": 0.38134766, + "step": 2196, + "time_per_iteration": 2.578218936920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124656, + "balance_loss_mlp": 1.08808231, + "epoch": 0.4226625625240477, + "flos": 677525783040.0, + "grad_norm": 0.05804099842364966, + "language_loss": 0.82180464, + "learning_rate": 0.0006473208811468255, + "loss": 0.8330512, + "num_input_tokens_seen": 182952912, + "router_z_loss_mlp": 0.36547852, + "step": 2197, + "time_per_iteration": 2.8833000659942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123675, + "balance_loss_mlp": 1.08707809, + "epoch": 0.4228549442093113, + "flos": 503525652480.0, + "grad_norm": 0.058050592535879256, + "language_loss": 0.84475237, + "learning_rate": 0.0006470231403129347, + "loss": 0.8559891, + "num_input_tokens_seen": 183022016, + "router_z_loss_mlp": 0.36621094, + "step": 2198, + "time_per_iteration": 2.590959072113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124319, + "balance_loss_mlp": 1.08781683, + "epoch": 0.42304732589457483, + "flos": 611848857600.0, + "grad_norm": 0.05086119187590394, + "language_loss": 0.82252729, + "learning_rate": 0.0006467253423995988, + "loss": 0.83377045, + "num_input_tokens_seen": 183101776, + "router_z_loss_mlp": 0.36499023, + "step": 2199, + "time_per_iteration": 2.8386192321777344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128105, + "balance_loss_mlp": 1.0917697, + "epoch": 0.4232397075798384, + "flos": 515570863104.0, + "grad_norm": 0.06679650853448169, + "language_loss": 0.79627949, + "learning_rate": 0.000646427487522433, + "loss": 0.8075605, + "num_input_tokens_seen": 183171392, + "router_z_loss_mlp": 0.36352539, + "step": 2200, + "time_per_iteration": 2.635103464126587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120926, + "balance_loss_mlp": 1.08423305, + "epoch": 0.42343208926510195, + "flos": 589796868096.0, + "grad_norm": 0.0831390189187338, + "language_loss": 0.83172977, + "learning_rate": 0.0006461295757970749, + "loss": 0.84293896, + "num_input_tokens_seen": 183253936, + "router_z_loss_mlp": 0.36669922, + "step": 2201, + "time_per_iteration": 2.819474697113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118378, + "balance_loss_mlp": 1.07891917, + "epoch": 0.42362447095036554, + "flos": 640636521984.0, + "grad_norm": 0.062417347947693186, + "language_loss": 0.81792694, + "learning_rate": 0.0006458316073391839, + "loss": 0.82911074, + "num_input_tokens_seen": 183333744, + "router_z_loss_mlp": 0.39428711, + "step": 2202, + "time_per_iteration": 2.871166229248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122162, + "balance_loss_mlp": 1.0872103, + "epoch": 0.42381685263562907, + "flos": 512680057344.0, + "grad_norm": 0.05500893378921445, + "language_loss": 0.88072616, + "learning_rate": 0.0006455335822644422, + "loss": 0.89194781, + "num_input_tokens_seen": 183401904, + "router_z_loss_mlp": 0.34936523, + "step": 2203, + "time_per_iteration": 2.6111316680908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123624, + "balance_loss_mlp": 1.08683574, + "epoch": 0.42400923432089266, + "flos": 546782400000.0, + "grad_norm": 0.06843699867702463, + "language_loss": 0.78204858, + "learning_rate": 0.0006452355006885527, + "loss": 0.79328489, + "num_input_tokens_seen": 183471312, + "router_z_loss_mlp": 0.36791992, + "step": 2204, + "time_per_iteration": 2.6248953342437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119975, + "balance_loss_mlp": 1.08209014, + "epoch": 0.4242016160061562, + "flos": 622154815488.0, + "grad_norm": 0.07209183527246785, + "language_loss": 0.87310261, + "learning_rate": 0.0006449373627272412, + "loss": 0.88430238, + "num_input_tokens_seen": 183539184, + "router_z_loss_mlp": 0.37866211, + "step": 2205, + "time_per_iteration": 2.703838348388672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119116, + "balance_loss_mlp": 1.08197045, + "epoch": 0.4243939976914198, + "flos": 571913146368.0, + "grad_norm": 0.07114514004539872, + "language_loss": 0.82698691, + "learning_rate": 0.0006446391684962553, + "loss": 0.8381781, + "num_input_tokens_seen": 183607504, + "router_z_loss_mlp": 0.37158203, + "step": 2206, + "time_per_iteration": 2.6619176864624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115514, + "balance_loss_mlp": 1.08022797, + "epoch": 0.42458637937668336, + "flos": 448740186624.0, + "grad_norm": 0.05684297237550015, + "language_loss": 0.83361518, + "learning_rate": 0.000644340918111364, + "loss": 0.84477031, + "num_input_tokens_seen": 183674720, + "router_z_loss_mlp": 0.3527832, + "step": 2207, + "time_per_iteration": 2.5367140769958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126169, + "balance_loss_mlp": 1.09016824, + "epoch": 0.4247787610619469, + "flos": 435407602176.0, + "grad_norm": 0.07504639835111325, + "language_loss": 0.8513602, + "learning_rate": 0.0006440426116883585, + "loss": 0.8626219, + "num_input_tokens_seen": 183740448, + "router_z_loss_mlp": 0.36010742, + "step": 2208, + "time_per_iteration": 2.5879015922546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118301, + "balance_loss_mlp": 1.08129835, + "epoch": 0.4249711427472105, + "flos": 496078566912.0, + "grad_norm": 0.06421639244231503, + "language_loss": 0.86279738, + "learning_rate": 0.0006437442493430519, + "loss": 0.8739804, + "num_input_tokens_seen": 183812640, + "router_z_loss_mlp": 0.37011719, + "step": 2209, + "time_per_iteration": 2.6396701335906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114919, + "balance_loss_mlp": 1.07741535, + "epoch": 0.425163524432474, + "flos": 655819587072.0, + "grad_norm": 0.06478280605491378, + "language_loss": 0.87082028, + "learning_rate": 0.000643445831191278, + "loss": 0.88196945, + "num_input_tokens_seen": 183895312, + "router_z_loss_mlp": 0.37524414, + "step": 2210, + "time_per_iteration": 2.902726173400879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109225, + "balance_loss_mlp": 1.07265139, + "epoch": 0.4253559061177376, + "flos": 650608796160.0, + "grad_norm": 0.0604627940505335, + "language_loss": 0.81718135, + "learning_rate": 0.0006431473573488937, + "loss": 0.82827359, + "num_input_tokens_seen": 183966384, + "router_z_loss_mlp": 0.3659668, + "step": 2211, + "time_per_iteration": 2.756131887435913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101882, + "balance_loss_mlp": 1.06492758, + "epoch": 0.42554828780300114, + "flos": 554155333632.0, + "grad_norm": 0.0751061946408966, + "language_loss": 0.84961367, + "learning_rate": 0.0006428488279317765, + "loss": 0.86063254, + "num_input_tokens_seen": 184031728, + "router_z_loss_mlp": 0.36938477, + "step": 2212, + "time_per_iteration": 2.6532113552093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100031, + "balance_loss_mlp": 1.06541276, + "epoch": 0.4257406694882647, + "flos": 514407200256.0, + "grad_norm": 0.06889274289933833, + "language_loss": 0.87372804, + "learning_rate": 0.0006425502430558259, + "loss": 0.88472843, + "num_input_tokens_seen": 184096160, + "router_z_loss_mlp": 0.34619141, + "step": 2213, + "time_per_iteration": 2.6332669258117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105988, + "balance_loss_mlp": 1.06874728, + "epoch": 0.42593305117352825, + "flos": 515645015040.0, + "grad_norm": 0.08165118310272598, + "language_loss": 0.84992635, + "learning_rate": 0.0006422516028369628, + "loss": 0.86098623, + "num_input_tokens_seen": 184169664, + "router_z_loss_mlp": 0.37231445, + "step": 2214, + "time_per_iteration": 2.618557929992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098802, + "balance_loss_mlp": 1.06237185, + "epoch": 0.42612543285879184, + "flos": 588059813376.0, + "grad_norm": 0.05512742279801928, + "language_loss": 0.8369562, + "learning_rate": 0.0006419529073911296, + "loss": 0.84794426, + "num_input_tokens_seen": 184249152, + "router_z_loss_mlp": 0.36425781, + "step": 2215, + "time_per_iteration": 2.833543062210083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095611, + "balance_loss_mlp": 1.06166017, + "epoch": 0.42631781454405543, + "flos": 635472345600.0, + "grad_norm": 0.0818108199754697, + "language_loss": 0.85651129, + "learning_rate": 0.0006416541568342901, + "loss": 0.8674674, + "num_input_tokens_seen": 184326816, + "router_z_loss_mlp": 0.33935547, + "step": 2216, + "time_per_iteration": 2.8430728912353516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097148, + "balance_loss_mlp": 1.0622437, + "epoch": 0.42651019622931896, + "flos": 541161202176.0, + "grad_norm": 0.05864229124252446, + "language_loss": 0.84272695, + "learning_rate": 0.0006413553512824297, + "loss": 0.85369843, + "num_input_tokens_seen": 184404336, + "router_z_loss_mlp": 0.34912109, + "step": 2217, + "time_per_iteration": 2.7368276119232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095005, + "balance_loss_mlp": 1.05943322, + "epoch": 0.42670257791458255, + "flos": 558158045184.0, + "grad_norm": 0.06419705252846208, + "language_loss": 0.84589773, + "learning_rate": 0.0006410564908515549, + "loss": 0.85684776, + "num_input_tokens_seen": 184472320, + "router_z_loss_mlp": 0.35595703, + "step": 2218, + "time_per_iteration": 2.650841236114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096054, + "balance_loss_mlp": 1.06052935, + "epoch": 0.4268949595998461, + "flos": 621309782016.0, + "grad_norm": 0.06892642653628764, + "language_loss": 0.85406113, + "learning_rate": 0.0006407575756576935, + "loss": 0.86502165, + "num_input_tokens_seen": 184544704, + "router_z_loss_mlp": 0.35546875, + "step": 2219, + "time_per_iteration": 2.7199461460113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103421, + "balance_loss_mlp": 1.0681113, + "epoch": 0.42708734128510967, + "flos": 537919460352.0, + "grad_norm": 0.055123892223664483, + "language_loss": 0.88112384, + "learning_rate": 0.0006404586058168951, + "loss": 0.89215803, + "num_input_tokens_seen": 184622544, + "router_z_loss_mlp": 0.35327148, + "step": 2220, + "time_per_iteration": 2.7125062942504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106703, + "balance_loss_mlp": 1.07129836, + "epoch": 0.4272797229703732, + "flos": 502865998848.0, + "grad_norm": 0.06740071030395202, + "language_loss": 0.86848915, + "learning_rate": 0.0006401595814452296, + "loss": 0.87955624, + "num_input_tokens_seen": 184692544, + "router_z_loss_mlp": 0.35424805, + "step": 2221, + "time_per_iteration": 2.6037752628326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108589, + "balance_loss_mlp": 1.07349372, + "epoch": 0.4274721046556368, + "flos": 492453955584.0, + "grad_norm": 0.06763062065124635, + "language_loss": 0.81391692, + "learning_rate": 0.000639860502658789, + "loss": 0.82500279, + "num_input_tokens_seen": 184760480, + "router_z_loss_mlp": 0.35131836, + "step": 2222, + "time_per_iteration": 2.620530366897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110206, + "balance_loss_mlp": 1.07475281, + "epoch": 0.4276644863409004, + "flos": 568367456256.0, + "grad_norm": 0.07514934658842116, + "language_loss": 0.85168004, + "learning_rate": 0.0006395613695736853, + "loss": 0.86278212, + "num_input_tokens_seen": 184834080, + "router_z_loss_mlp": 0.35449219, + "step": 2223, + "time_per_iteration": 2.67494797706604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106777, + "balance_loss_mlp": 1.07015634, + "epoch": 0.4278568680261639, + "flos": 607436112384.0, + "grad_norm": 0.06258659729032073, + "language_loss": 0.81998539, + "learning_rate": 0.0006392621823060529, + "loss": 0.83105314, + "num_input_tokens_seen": 184905872, + "router_z_loss_mlp": 0.36621094, + "step": 2224, + "time_per_iteration": 2.729048490524292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107838, + "balance_loss_mlp": 1.07197976, + "epoch": 0.4280492497114275, + "flos": 560527589376.0, + "grad_norm": 0.07791132694448914, + "language_loss": 0.85259843, + "learning_rate": 0.0006389629409720465, + "loss": 0.86367679, + "num_input_tokens_seen": 184972320, + "router_z_loss_mlp": 0.35839844, + "step": 2225, + "time_per_iteration": 2.6461989879608154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102526, + "balance_loss_mlp": 1.06836081, + "epoch": 0.428241631396691, + "flos": 720646709760.0, + "grad_norm": 0.06694393428490365, + "language_loss": 0.88831687, + "learning_rate": 0.0006386636456878417, + "loss": 0.89934212, + "num_input_tokens_seen": 185051040, + "router_z_loss_mlp": 0.34155273, + "step": 2226, + "time_per_iteration": 2.8701326847076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106393, + "balance_loss_mlp": 1.07091641, + "epoch": 0.4284340130819546, + "flos": 429467774976.0, + "grad_norm": 0.07990341915486338, + "language_loss": 0.92087269, + "learning_rate": 0.0006383642965696353, + "loss": 0.93193656, + "num_input_tokens_seen": 185113552, + "router_z_loss_mlp": 0.35473633, + "step": 2227, + "time_per_iteration": 2.4640464782714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102589, + "balance_loss_mlp": 1.06544292, + "epoch": 0.42862639476721814, + "flos": 525016733184.0, + "grad_norm": 0.053395147694407376, + "language_loss": 0.82962096, + "learning_rate": 0.000638064893733645, + "loss": 0.84064686, + "num_input_tokens_seen": 185185056, + "router_z_loss_mlp": 0.37158203, + "step": 2228, + "time_per_iteration": 2.783597946166992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117505, + "balance_loss_mlp": 1.08198094, + "epoch": 0.42881877645248173, + "flos": 465346446336.0, + "grad_norm": 0.07356604001224937, + "language_loss": 0.89838171, + "learning_rate": 0.000637765437296109, + "loss": 0.90955675, + "num_input_tokens_seen": 185257248, + "router_z_loss_mlp": 0.35522461, + "step": 2229, + "time_per_iteration": 2.6639621257781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112327, + "balance_loss_mlp": 1.07644475, + "epoch": 0.42901115813774526, + "flos": 560297793024.0, + "grad_norm": 0.05563237387214821, + "language_loss": 0.85128897, + "learning_rate": 0.000637465927373287, + "loss": 0.86241227, + "num_input_tokens_seen": 185324800, + "router_z_loss_mlp": 0.35913086, + "step": 2230, + "time_per_iteration": 2.6883277893066406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107316, + "balance_loss_mlp": 1.07253075, + "epoch": 0.42920353982300885, + "flos": 561454115328.0, + "grad_norm": 0.06522010118943229, + "language_loss": 0.78980476, + "learning_rate": 0.000637166364081459, + "loss": 0.80087787, + "num_input_tokens_seen": 185393408, + "router_z_loss_mlp": 0.34790039, + "step": 2231, + "time_per_iteration": 2.711379051208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111807, + "balance_loss_mlp": 1.07814288, + "epoch": 0.42939592150827244, + "flos": 556248093696.0, + "grad_norm": 0.06512604260411947, + "language_loss": 0.84616333, + "learning_rate": 0.0006368667475369256, + "loss": 0.85728139, + "num_input_tokens_seen": 185467968, + "router_z_loss_mlp": 0.33666992, + "step": 2232, + "time_per_iteration": 2.7521519660949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083825, + "balance_loss_mlp": 1.07271492, + "epoch": 0.42958830319353597, + "flos": 1521623688192.0, + "grad_norm": 0.03367734377341464, + "language_loss": 0.78527778, + "learning_rate": 0.0006365670778560084, + "loss": 0.79611605, + "num_input_tokens_seen": 185705232, + "router_z_loss_mlp": 0.11132812, + "step": 2233, + "time_per_iteration": 4.941352605819702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106502, + "balance_loss_mlp": 1.05414832, + "epoch": 0.42978068487879956, + "flos": 1495813837824.0, + "grad_norm": 0.027928692850204096, + "language_loss": 0.78895426, + "learning_rate": 0.0006362673551550494, + "loss": 0.79960448, + "num_input_tokens_seen": 185932672, + "router_z_loss_mlp": 0.10888672, + "step": 2234, + "time_per_iteration": 4.825460910797119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117593, + "balance_loss_mlp": 1.08302259, + "epoch": 0.4299730665640631, + "flos": 546992372736.0, + "grad_norm": 0.05150642259295507, + "language_loss": 0.86345804, + "learning_rate": 0.0006359675795504112, + "loss": 0.87463403, + "num_input_tokens_seen": 186006288, + "router_z_loss_mlp": 0.34619141, + "step": 2235, + "time_per_iteration": 2.662977695465088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127167, + "balance_loss_mlp": 1.09099901, + "epoch": 0.4301654482493267, + "flos": 1129293342720.0, + "grad_norm": 0.07348370683515035, + "language_loss": 0.74711537, + "learning_rate": 0.0006356677511584775, + "loss": 0.75838703, + "num_input_tokens_seen": 186097168, + "router_z_loss_mlp": 0.36181641, + "step": 2236, + "time_per_iteration": 3.51220965385437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127234, + "balance_loss_mlp": 1.09337878, + "epoch": 0.4303578299345902, + "flos": 495750025728.0, + "grad_norm": 0.061045373266899905, + "language_loss": 0.86476523, + "learning_rate": 0.0006353678700956511, + "loss": 0.8760376, + "num_input_tokens_seen": 186163904, + "router_z_loss_mlp": 0.33886719, + "step": 2237, + "time_per_iteration": 2.60677170753479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125139, + "balance_loss_mlp": 1.09085464, + "epoch": 0.4305502116198538, + "flos": 615762736128.0, + "grad_norm": 0.06413862374233106, + "language_loss": 0.83745819, + "learning_rate": 0.0006350679364783569, + "loss": 0.84870958, + "num_input_tokens_seen": 186233888, + "router_z_loss_mlp": 0.34326172, + "step": 2238, + "time_per_iteration": 2.7771050930023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117438, + "balance_loss_mlp": 1.08212781, + "epoch": 0.4307425933051173, + "flos": 559260039168.0, + "grad_norm": 0.057478588831895126, + "language_loss": 0.85746336, + "learning_rate": 0.0006347679504230393, + "loss": 0.86863768, + "num_input_tokens_seen": 186301168, + "router_z_loss_mlp": 0.35351562, + "step": 2239, + "time_per_iteration": 2.6826984882354736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120648, + "balance_loss_mlp": 1.08405077, + "epoch": 0.4309349749903809, + "flos": 972166344192.0, + "grad_norm": 0.0566935574955873, + "language_loss": 0.76113296, + "learning_rate": 0.0006344679120461632, + "loss": 0.7723394, + "num_input_tokens_seen": 186392096, + "router_z_loss_mlp": 0.36621094, + "step": 2240, + "time_per_iteration": 3.3756330013275146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122418, + "balance_loss_mlp": 1.0843904, + "epoch": 0.4311273566756445, + "flos": 541924743168.0, + "grad_norm": 0.06383187448999712, + "language_loss": 0.80362582, + "learning_rate": 0.0006341678214642134, + "loss": 0.81484997, + "num_input_tokens_seen": 186458000, + "router_z_loss_mlp": 0.38037109, + "step": 2241, + "time_per_iteration": 2.6837639808654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121026, + "balance_loss_mlp": 1.08633661, + "epoch": 0.43131973836090803, + "flos": 761674503168.0, + "grad_norm": 0.06213603676301435, + "language_loss": 0.82894886, + "learning_rate": 0.0006338676787936963, + "loss": 0.84015912, + "num_input_tokens_seen": 186544992, + "router_z_loss_mlp": 0.34716797, + "step": 2242, + "time_per_iteration": 3.0835442543029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126247, + "balance_loss_mlp": 1.09019864, + "epoch": 0.4315121200461716, + "flos": 554530862592.0, + "grad_norm": 0.06026893794725229, + "language_loss": 0.83885002, + "learning_rate": 0.0006335674841511367, + "loss": 0.85011244, + "num_input_tokens_seen": 186614960, + "router_z_loss_mlp": 0.36035156, + "step": 2243, + "time_per_iteration": 2.6649861335754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054007, + "balance_loss_mlp": 1.04466057, + "epoch": 0.43170450173143515, + "flos": 1485334609920.0, + "grad_norm": 0.029651379922801115, + "language_loss": 0.7918117, + "learning_rate": 0.000633267237653081, + "loss": 0.80235171, + "num_input_tokens_seen": 186854288, + "router_z_loss_mlp": 0.09326172, + "step": 2244, + "time_per_iteration": 5.015843868255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043427, + "balance_loss_mlp": 1.03412855, + "epoch": 0.43189688341669874, + "flos": 1473697234944.0, + "grad_norm": 0.025217175998849217, + "language_loss": 0.77365553, + "learning_rate": 0.0006329669394160953, + "loss": 0.7840898, + "num_input_tokens_seen": 187090272, + "router_z_loss_mlp": 0.09277344, + "step": 2245, + "time_per_iteration": 4.923234939575195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118877, + "balance_loss_mlp": 1.08282828, + "epoch": 0.43208926510196227, + "flos": 492938141184.0, + "grad_norm": 0.05723795681410829, + "language_loss": 0.83027297, + "learning_rate": 0.0006326665895567652, + "loss": 0.84146178, + "num_input_tokens_seen": 187157584, + "router_z_loss_mlp": 0.3605957, + "step": 2246, + "time_per_iteration": 2.6065175533294678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112241, + "balance_loss_mlp": 1.08652771, + "epoch": 0.43228164678722586, + "flos": 520235799552.0, + "grad_norm": 0.06570844887047847, + "language_loss": 0.87358153, + "learning_rate": 0.0006323661881916976, + "loss": 0.88480568, + "num_input_tokens_seen": 187229408, + "router_z_loss_mlp": 0.35864258, + "step": 2247, + "time_per_iteration": 2.682924509048462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124487, + "balance_loss_mlp": 1.08996427, + "epoch": 0.4324740284724894, + "flos": 796056201216.0, + "grad_norm": 0.05864327339271887, + "language_loss": 0.8139447, + "learning_rate": 0.0006320657354375179, + "loss": 0.82518953, + "num_input_tokens_seen": 187304384, + "router_z_loss_mlp": 0.34521484, + "step": 2248, + "time_per_iteration": 2.9315433502197266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112819, + "balance_loss_mlp": 1.09125865, + "epoch": 0.432666410157753, + "flos": 482153140224.0, + "grad_norm": 0.05550733837968219, + "language_loss": 0.87244421, + "learning_rate": 0.0006317652314108726, + "loss": 0.88372612, + "num_input_tokens_seen": 187368064, + "router_z_loss_mlp": 0.36938477, + "step": 2249, + "time_per_iteration": 2.5357820987701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125911, + "balance_loss_mlp": 1.09186506, + "epoch": 0.43285879184301657, + "flos": 500212329984.0, + "grad_norm": 0.06944226399680122, + "language_loss": 0.91745955, + "learning_rate": 0.0006314646762284277, + "loss": 0.92871869, + "num_input_tokens_seen": 187436320, + "router_z_loss_mlp": 0.34057617, + "step": 2250, + "time_per_iteration": 2.650629997253418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010844, + "balance_loss_mlp": 1.00116396, + "epoch": 0.4330511735282801, + "flos": 1510448103936.0, + "grad_norm": 0.012503035455709091, + "language_loss": 0.75425828, + "learning_rate": 0.0006311640700068691, + "loss": 0.76436675, + "num_input_tokens_seen": 187670912, + "router_z_loss_mlp": 0.09667969, + "step": 2251, + "time_per_iteration": 4.895758867263794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118011, + "balance_loss_mlp": 1.08341658, + "epoch": 0.4332435552135437, + "flos": 699582915072.0, + "grad_norm": 0.05843208138947643, + "language_loss": 0.77784407, + "learning_rate": 0.0006308634128629022, + "loss": 0.78902417, + "num_input_tokens_seen": 187746432, + "router_z_loss_mlp": 0.34570312, + "step": 2252, + "time_per_iteration": 2.916623592376709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112901, + "balance_loss_mlp": 1.09289002, + "epoch": 0.4334359368988072, + "flos": 592292321280.0, + "grad_norm": 0.0729174620046303, + "language_loss": 0.87908506, + "learning_rate": 0.0006305627049132531, + "loss": 0.89037514, + "num_input_tokens_seen": 187820032, + "router_z_loss_mlp": 0.36132812, + "step": 2253, + "time_per_iteration": 2.741239070892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121412, + "balance_loss_mlp": 1.08660293, + "epoch": 0.4336283185840708, + "flos": 842806508544.0, + "grad_norm": 0.05583951255628595, + "language_loss": 0.8599245, + "learning_rate": 0.0006302619462746662, + "loss": 0.87113857, + "num_input_tokens_seen": 187904400, + "router_z_loss_mlp": 0.34814453, + "step": 2254, + "time_per_iteration": 3.1628546714782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123282, + "balance_loss_mlp": 1.08966494, + "epoch": 0.43382070026933434, + "flos": 626258843136.0, + "grad_norm": 0.05704174545577272, + "language_loss": 0.90291667, + "learning_rate": 0.0006299611370639069, + "loss": 0.91414952, + "num_input_tokens_seen": 187973264, + "router_z_loss_mlp": 0.33618164, + "step": 2255, + "time_per_iteration": 2.7106690406799316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125975, + "balance_loss_mlp": 1.09157157, + "epoch": 0.4340130819545979, + "flos": 591111406080.0, + "grad_norm": 0.06008787734976465, + "language_loss": 0.79589838, + "learning_rate": 0.0006296602773977593, + "loss": 0.80715805, + "num_input_tokens_seen": 188039984, + "router_z_loss_mlp": 0.34423828, + "step": 2256, + "time_per_iteration": 2.673064947128296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121887, + "balance_loss_mlp": 1.08652973, + "epoch": 0.4342054636398615, + "flos": 490889797632.0, + "grad_norm": 0.05906133720876415, + "language_loss": 0.87730187, + "learning_rate": 0.0006293593673930277, + "loss": 0.88852072, + "num_input_tokens_seen": 188113456, + "router_z_loss_mlp": 0.35400391, + "step": 2257, + "time_per_iteration": 2.6278131008148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115203, + "balance_loss_mlp": 1.08010745, + "epoch": 0.43439784532512504, + "flos": 698994842112.0, + "grad_norm": 0.07846710421999975, + "language_loss": 0.7888447, + "learning_rate": 0.0006290584071665358, + "loss": 0.79999673, + "num_input_tokens_seen": 188192480, + "router_z_loss_mlp": 0.35107422, + "step": 2258, + "time_per_iteration": 2.8708009719848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112425, + "balance_loss_mlp": 1.07709181, + "epoch": 0.43459022701038863, + "flos": 485824739328.0, + "grad_norm": 0.06520269446334741, + "language_loss": 0.82244253, + "learning_rate": 0.0006287573968351266, + "loss": 0.83356678, + "num_input_tokens_seen": 188258784, + "router_z_loss_mlp": 0.35351562, + "step": 2259, + "time_per_iteration": 2.5682222843170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113367, + "balance_loss_mlp": 1.07729471, + "epoch": 0.43478260869565216, + "flos": 643107382272.0, + "grad_norm": 0.07246583855612315, + "language_loss": 0.82777989, + "learning_rate": 0.0006284563365156626, + "loss": 0.83891356, + "num_input_tokens_seen": 188331312, + "router_z_loss_mlp": 0.3605957, + "step": 2260, + "time_per_iteration": 2.827087879180908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108747, + "balance_loss_mlp": 1.07148242, + "epoch": 0.43497499038091575, + "flos": 426097552896.0, + "grad_norm": 0.12125557864683041, + "language_loss": 0.87600839, + "learning_rate": 0.0006281552263250261, + "loss": 0.88709581, + "num_input_tokens_seen": 188393712, + "router_z_loss_mlp": 0.37255859, + "step": 2261, + "time_per_iteration": 2.479753017425537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01031263, + "balance_loss_mlp": 1.02072453, + "epoch": 0.4351673720661793, + "flos": 1538378625024.0, + "grad_norm": 0.029168664611412945, + "language_loss": 0.80691534, + "learning_rate": 0.000627854066380118, + "loss": 0.81722796, + "num_input_tokens_seen": 188621152, + "router_z_loss_mlp": 0.10546875, + "step": 2262, + "time_per_iteration": 4.812009334564209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106503, + "balance_loss_mlp": 1.07104969, + "epoch": 0.43535975375144287, + "flos": 749155018752.0, + "grad_norm": 0.06614620097740347, + "language_loss": 0.81361771, + "learning_rate": 0.0006275528567978593, + "loss": 0.82468277, + "num_input_tokens_seen": 188697120, + "router_z_loss_mlp": 0.35449219, + "step": 2263, + "time_per_iteration": 2.903029203414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115264, + "balance_loss_mlp": 1.07923913, + "epoch": 0.4355521354367064, + "flos": 861280874496.0, + "grad_norm": 0.07895665669601973, + "language_loss": 0.82951373, + "learning_rate": 0.0006272515976951898, + "loss": 0.84066635, + "num_input_tokens_seen": 188778480, + "router_z_loss_mlp": 0.3605957, + "step": 2264, + "time_per_iteration": 3.066096544265747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109411, + "balance_loss_mlp": 1.07300496, + "epoch": 0.43574451712197, + "flos": 734527719936.0, + "grad_norm": 0.06560373300441709, + "language_loss": 0.79299462, + "learning_rate": 0.0006269502891890687, + "loss": 0.80408877, + "num_input_tokens_seen": 188863616, + "router_z_loss_mlp": 0.36425781, + "step": 2265, + "time_per_iteration": 3.036302089691162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098467, + "balance_loss_mlp": 1.06504071, + "epoch": 0.4359368988072336, + "flos": 570578784768.0, + "grad_norm": 0.05296436812265497, + "language_loss": 0.88411891, + "learning_rate": 0.0006266489313964743, + "loss": 0.89510357, + "num_input_tokens_seen": 188933984, + "router_z_loss_mlp": 0.33447266, + "step": 2266, + "time_per_iteration": 2.766963481903076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105293, + "balance_loss_mlp": 1.06907725, + "epoch": 0.4361292804924971, + "flos": 555528969216.0, + "grad_norm": 0.057339134399699385, + "language_loss": 0.85443783, + "learning_rate": 0.0006263475244344041, + "loss": 0.86549073, + "num_input_tokens_seen": 189012976, + "router_z_loss_mlp": 0.36230469, + "step": 2267, + "time_per_iteration": 2.8397552967071533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104848, + "balance_loss_mlp": 1.0681076, + "epoch": 0.4363216621777607, + "flos": 557285847552.0, + "grad_norm": 0.06097162500725226, + "language_loss": 0.84725475, + "learning_rate": 0.0006260460684198746, + "loss": 0.85830331, + "num_input_tokens_seen": 189079664, + "router_z_loss_mlp": 0.36743164, + "step": 2268, + "time_per_iteration": 2.725037097930908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105446, + "balance_loss_mlp": 1.06901538, + "epoch": 0.4365140438630242, + "flos": 478222009344.0, + "grad_norm": 0.07238177879654556, + "language_loss": 0.84404624, + "learning_rate": 0.0006257445634699213, + "loss": 0.85510075, + "num_input_tokens_seen": 189144688, + "router_z_loss_mlp": 0.36425781, + "step": 2269, + "time_per_iteration": 2.623194456100464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105034, + "balance_loss_mlp": 1.06855631, + "epoch": 0.4367064255482878, + "flos": 578917891584.0, + "grad_norm": 0.060050482587473634, + "language_loss": 0.83212304, + "learning_rate": 0.0006254430097015993, + "loss": 0.84317344, + "num_input_tokens_seen": 189213984, + "router_z_loss_mlp": 0.36499023, + "step": 2270, + "time_per_iteration": 2.6570417881011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037679, + "balance_loss_mlp": 1.02752221, + "epoch": 0.43689880723355135, + "flos": 1458946225152.0, + "grad_norm": 0.021802814945167073, + "language_loss": 0.76479089, + "learning_rate": 0.0006251414072319815, + "loss": 0.7751677, + "num_input_tokens_seen": 189434416, + "router_z_loss_mlp": 0.1015625, + "step": 2271, + "time_per_iteration": 4.800662517547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109594, + "balance_loss_mlp": 1.07299662, + "epoch": 0.43709118891881493, + "flos": 667610408448.0, + "grad_norm": 0.08079345415457889, + "language_loss": 0.85730046, + "learning_rate": 0.0006248397561781609, + "loss": 0.8683964, + "num_input_tokens_seen": 189513248, + "router_z_loss_mlp": 0.3659668, + "step": 2272, + "time_per_iteration": 2.879779815673828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110506, + "balance_loss_mlp": 1.07312167, + "epoch": 0.43728357060407846, + "flos": 544872448512.0, + "grad_norm": 0.06456885574264018, + "language_loss": 0.86308181, + "learning_rate": 0.0006245380566572482, + "loss": 0.87418681, + "num_input_tokens_seen": 189585392, + "router_z_loss_mlp": 0.37402344, + "step": 2273, + "time_per_iteration": 2.671515703201294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108969, + "balance_loss_mlp": 1.07227635, + "epoch": 0.43747595228934205, + "flos": 746839802880.0, + "grad_norm": 0.07977356034675265, + "language_loss": 0.76295209, + "learning_rate": 0.0006242363087863744, + "loss": 0.77404177, + "num_input_tokens_seen": 189667552, + "router_z_loss_mlp": 0.36669922, + "step": 2274, + "time_per_iteration": 3.0036468505859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110424, + "balance_loss_mlp": 1.07430363, + "epoch": 0.43766833397460564, + "flos": 631353636864.0, + "grad_norm": 0.06387432282930158, + "language_loss": 0.86488557, + "learning_rate": 0.0006239345126826878, + "loss": 0.87598979, + "num_input_tokens_seen": 189742048, + "router_z_loss_mlp": 0.36132812, + "step": 2275, + "time_per_iteration": 2.8046963214874268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113113, + "balance_loss_mlp": 1.07551455, + "epoch": 0.43786071565986917, + "flos": 530986295808.0, + "grad_norm": 0.06304446482372832, + "language_loss": 0.84217036, + "learning_rate": 0.0006236326684633561, + "loss": 0.85330147, + "num_input_tokens_seen": 189817968, + "router_z_loss_mlp": 0.37597656, + "step": 2276, + "time_per_iteration": 2.8238136768341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113871, + "balance_loss_mlp": 1.07725024, + "epoch": 0.43805309734513276, + "flos": 538547180544.0, + "grad_norm": 0.07202298424456109, + "language_loss": 0.75335848, + "learning_rate": 0.0006233307762455658, + "loss": 0.76449716, + "num_input_tokens_seen": 189882608, + "router_z_loss_mlp": 0.36645508, + "step": 2277, + "time_per_iteration": 2.6191978454589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121381, + "balance_loss_mlp": 1.08576083, + "epoch": 0.4382454790303963, + "flos": 864542439936.0, + "grad_norm": 0.053405108271766075, + "language_loss": 0.8389169, + "learning_rate": 0.0006230288361465216, + "loss": 0.85013068, + "num_input_tokens_seen": 189960608, + "router_z_loss_mlp": 0.35644531, + "step": 2278, + "time_per_iteration": 3.0405595302581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113147, + "balance_loss_mlp": 1.09399056, + "epoch": 0.4384378607156599, + "flos": 765499548672.0, + "grad_norm": 0.06317085407877503, + "language_loss": 0.85187429, + "learning_rate": 0.0006227268482834473, + "loss": 0.86318898, + "num_input_tokens_seen": 190035472, + "router_z_loss_mlp": 0.37475586, + "step": 2279, + "time_per_iteration": 2.884791135787964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140517, + "balance_loss_mlp": 1.10272789, + "epoch": 0.4386302424009234, + "flos": 668566669824.0, + "grad_norm": 0.08374351035766264, + "language_loss": 0.87551039, + "learning_rate": 0.000622424812773585, + "loss": 0.88691556, + "num_input_tokens_seen": 190109312, + "router_z_loss_mlp": 0.37768555, + "step": 2280, + "time_per_iteration": 2.790846824645996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129266, + "balance_loss_mlp": 1.09150028, + "epoch": 0.438822624086187, + "flos": 485182338048.0, + "grad_norm": 0.07881944372222376, + "language_loss": 0.79747838, + "learning_rate": 0.000622122729734195, + "loss": 0.80877101, + "num_input_tokens_seen": 190174176, + "router_z_loss_mlp": 0.37744141, + "step": 2281, + "time_per_iteration": 2.5392401218414307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130236, + "balance_loss_mlp": 1.09404397, + "epoch": 0.4390150057714506, + "flos": 499218992640.0, + "grad_norm": 0.06512890224106707, + "language_loss": 0.87574816, + "learning_rate": 0.0006218205992825566, + "loss": 0.88705051, + "num_input_tokens_seen": 190243888, + "router_z_loss_mlp": 0.36206055, + "step": 2282, + "time_per_iteration": 2.6409003734588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130264, + "balance_loss_mlp": 1.09304714, + "epoch": 0.4392073874567141, + "flos": 558219714048.0, + "grad_norm": 0.058092029820517505, + "language_loss": 0.82094592, + "learning_rate": 0.0006215184215359671, + "loss": 0.83224851, + "num_input_tokens_seen": 190317504, + "router_z_loss_mlp": 0.37207031, + "step": 2283, + "time_per_iteration": 2.798405647277832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112171, + "balance_loss_mlp": 1.08506513, + "epoch": 0.4393997691419777, + "flos": 605306276352.0, + "grad_norm": 0.06799742884418125, + "language_loss": 0.86864793, + "learning_rate": 0.0006212161966117425, + "loss": 0.87986505, + "num_input_tokens_seen": 190390160, + "router_z_loss_mlp": 0.36669922, + "step": 2284, + "time_per_iteration": 2.7305543422698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120327, + "balance_loss_mlp": 1.0823704, + "epoch": 0.43959215082724123, + "flos": 804145688064.0, + "grad_norm": 0.0718064317498989, + "language_loss": 0.81899178, + "learning_rate": 0.0006209139246272164, + "loss": 0.83019507, + "num_input_tokens_seen": 190467600, + "router_z_loss_mlp": 0.37939453, + "step": 2285, + "time_per_iteration": 2.9496707916259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114672, + "balance_loss_mlp": 1.07569027, + "epoch": 0.4397845325125048, + "flos": 487643286528.0, + "grad_norm": 0.0666339573323591, + "language_loss": 0.81558084, + "learning_rate": 0.0006206116056997421, + "loss": 0.82672757, + "num_input_tokens_seen": 190534192, + "router_z_loss_mlp": 0.38964844, + "step": 2286, + "time_per_iteration": 2.56559681892395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112208, + "balance_loss_mlp": 1.08414793, + "epoch": 0.43997691419776835, + "flos": 480811438080.0, + "grad_norm": 0.07939984369379535, + "language_loss": 0.82495737, + "learning_rate": 0.0006203092399466892, + "loss": 0.83617818, + "num_input_tokens_seen": 190601440, + "router_z_loss_mlp": 0.37915039, + "step": 2287, + "time_per_iteration": 2.614211082458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119992, + "balance_loss_mlp": 1.08368051, + "epoch": 0.44016929588303194, + "flos": 483124082688.0, + "grad_norm": 0.05953237575059506, + "language_loss": 0.85318255, + "learning_rate": 0.0006200068274854473, + "loss": 0.86438239, + "num_input_tokens_seen": 190672528, + "router_z_loss_mlp": 0.36303711, + "step": 2288, + "time_per_iteration": 2.6718688011169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123012, + "balance_loss_mlp": 1.08679628, + "epoch": 0.4403616775682955, + "flos": 571853675520.0, + "grad_norm": 0.0828196201385275, + "language_loss": 0.86406159, + "learning_rate": 0.0006197043684334229, + "loss": 0.87529171, + "num_input_tokens_seen": 190750704, + "router_z_loss_mlp": 0.36230469, + "step": 2289, + "time_per_iteration": 2.7540907859802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128468, + "balance_loss_mlp": 1.09158421, + "epoch": 0.44055405925355906, + "flos": 630849627648.0, + "grad_norm": 0.11266642339430595, + "language_loss": 0.79650962, + "learning_rate": 0.0006194018629080411, + "loss": 0.80779433, + "num_input_tokens_seen": 190821664, + "router_z_loss_mlp": 0.36865234, + "step": 2290, + "time_per_iteration": 2.7200653553009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127999, + "balance_loss_mlp": 1.09099627, + "epoch": 0.44074644093882265, + "flos": 536782961664.0, + "grad_norm": 0.0658560511601545, + "language_loss": 0.81793892, + "learning_rate": 0.0006190993110267451, + "loss": 0.82921886, + "num_input_tokens_seen": 190893888, + "router_z_loss_mlp": 0.36987305, + "step": 2291, + "time_per_iteration": 2.709512233734131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130933, + "balance_loss_mlp": 1.09311938, + "epoch": 0.4409388226240862, + "flos": 463229093376.0, + "grad_norm": 0.0787223425712205, + "language_loss": 0.84518313, + "learning_rate": 0.0006187967129069958, + "loss": 0.85649246, + "num_input_tokens_seen": 190956800, + "router_z_loss_mlp": 0.37792969, + "step": 2292, + "time_per_iteration": 2.4924299716949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124784, + "balance_loss_mlp": 1.08935523, + "epoch": 0.44113120430934977, + "flos": 566005252608.0, + "grad_norm": 0.07162475848736369, + "language_loss": 0.87490463, + "learning_rate": 0.0006184940686662722, + "loss": 0.88615251, + "num_input_tokens_seen": 191032048, + "router_z_loss_mlp": 0.35449219, + "step": 2293, + "time_per_iteration": 2.751055955886841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119268, + "balance_loss_mlp": 1.08445859, + "epoch": 0.4413235859946133, + "flos": 543585074688.0, + "grad_norm": 0.06340812224100711, + "language_loss": 0.9041853, + "learning_rate": 0.0006181913784220714, + "loss": 0.91537791, + "num_input_tokens_seen": 191099952, + "router_z_loss_mlp": 0.34838867, + "step": 2294, + "time_per_iteration": 2.64821457862854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037423, + "balance_loss_mlp": 1.0290786, + "epoch": 0.4415159676798769, + "flos": 1569871342080.0, + "grad_norm": 0.025861242717412188, + "language_loss": 0.80553782, + "learning_rate": 0.0006178886422919078, + "loss": 0.81591213, + "num_input_tokens_seen": 191335968, + "router_z_loss_mlp": 0.08349609, + "step": 2295, + "time_per_iteration": 4.885660171508789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119684, + "balance_loss_mlp": 1.08537531, + "epoch": 0.4417083493651404, + "flos": 658740128256.0, + "grad_norm": 0.10155164806079009, + "language_loss": 0.80041152, + "learning_rate": 0.0006175858603933146, + "loss": 0.81160837, + "num_input_tokens_seen": 191410112, + "router_z_loss_mlp": 0.34326172, + "step": 2296, + "time_per_iteration": 2.881615400314331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129433, + "balance_loss_mlp": 1.09393275, + "epoch": 0.441900731050404, + "flos": 740457635328.0, + "grad_norm": 0.0685445546464461, + "language_loss": 0.81208229, + "learning_rate": 0.0006172830328438416, + "loss": 0.82337666, + "num_input_tokens_seen": 191491552, + "router_z_loss_mlp": 0.35498047, + "step": 2297, + "time_per_iteration": 2.940401315689087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123665, + "balance_loss_mlp": 1.08680558, + "epoch": 0.44209311273566754, + "flos": 539441399808.0, + "grad_norm": 0.09103818832157724, + "language_loss": 0.87286425, + "learning_rate": 0.0006169801597610572, + "loss": 0.88410091, + "num_input_tokens_seen": 191567872, + "router_z_loss_mlp": 0.36889648, + "step": 2298, + "time_per_iteration": 2.7739408016204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115616, + "balance_loss_mlp": 1.08195138, + "epoch": 0.4422854944209311, + "flos": 621613730304.0, + "grad_norm": 0.1052787532551667, + "language_loss": 0.9040001, + "learning_rate": 0.0006166772412625469, + "loss": 0.91515625, + "num_input_tokens_seen": 191638032, + "router_z_loss_mlp": 0.33666992, + "step": 2299, + "time_per_iteration": 2.734384298324585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112173, + "balance_loss_mlp": 1.07710147, + "epoch": 0.4424778761061947, + "flos": 658824192000.0, + "grad_norm": 0.07592361192988976, + "language_loss": 0.81779516, + "learning_rate": 0.0006163742774659141, + "loss": 0.82891691, + "num_input_tokens_seen": 191709104, + "router_z_loss_mlp": 0.35107422, + "step": 2300, + "time_per_iteration": 2.8436357975006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107602, + "balance_loss_mlp": 1.07410455, + "epoch": 0.44267025779145824, + "flos": 568577428992.0, + "grad_norm": 0.0790889900730028, + "language_loss": 0.86033177, + "learning_rate": 0.0006160712684887801, + "loss": 0.87140775, + "num_input_tokens_seen": 191787072, + "router_z_loss_mlp": 0.33496094, + "step": 2301, + "time_per_iteration": 2.816479206085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104306, + "balance_loss_mlp": 1.07118952, + "epoch": 0.44286263947672183, + "flos": 496738220544.0, + "grad_norm": 0.0554513610730849, + "language_loss": 0.82599401, + "learning_rate": 0.0006157682144487832, + "loss": 0.83703709, + "num_input_tokens_seen": 191863040, + "router_z_loss_mlp": 0.33129883, + "step": 2302, + "time_per_iteration": 2.7371127605438232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112573, + "balance_loss_mlp": 1.07776368, + "epoch": 0.44305502116198536, + "flos": 609397820928.0, + "grad_norm": 0.08617173815320239, + "language_loss": 0.83484352, + "learning_rate": 0.0006154651154635793, + "loss": 0.84596926, + "num_input_tokens_seen": 191940352, + "router_z_loss_mlp": 0.34838867, + "step": 2303, + "time_per_iteration": 2.822388172149658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122213, + "balance_loss_mlp": 1.08664048, + "epoch": 0.44324740284724895, + "flos": 470794747392.0, + "grad_norm": 0.06891313471916412, + "language_loss": 0.85087454, + "learning_rate": 0.0006151619716508421, + "loss": 0.86209667, + "num_input_tokens_seen": 192006896, + "router_z_loss_mlp": 0.35571289, + "step": 2304, + "time_per_iteration": 2.5669307708740234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113601, + "balance_loss_mlp": 1.07905424, + "epoch": 0.4434397845325125, + "flos": 578725171200.0, + "grad_norm": 0.0676174746334525, + "language_loss": 0.87354678, + "learning_rate": 0.0006148587831282625, + "loss": 0.88468277, + "num_input_tokens_seen": 192075312, + "router_z_loss_mlp": 0.34545898, + "step": 2305, + "time_per_iteration": 2.7296478748321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042108, + "balance_loss_mlp": 1.03257155, + "epoch": 0.44363216621777607, + "flos": 1496608939008.0, + "grad_norm": 0.03035679683037383, + "language_loss": 0.79176068, + "learning_rate": 0.0006145555500135483, + "loss": 0.80218178, + "num_input_tokens_seen": 192304816, + "router_z_loss_mlp": 0.09521484, + "step": 2306, + "time_per_iteration": 4.932115077972412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132739, + "balance_loss_mlp": 1.09490204, + "epoch": 0.44382454790303966, + "flos": 477322647552.0, + "grad_norm": 0.0708853860960667, + "language_loss": 0.87972111, + "learning_rate": 0.0006142522724244255, + "loss": 0.89104849, + "num_input_tokens_seen": 192369232, + "router_z_loss_mlp": 0.37817383, + "step": 2307, + "time_per_iteration": 2.5106770992279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027391, + "balance_loss_mlp": 1.01785433, + "epoch": 0.4440169295883032, + "flos": 1544115820032.0, + "grad_norm": 0.02287011405410123, + "language_loss": 0.76484716, + "learning_rate": 0.0006139489504786368, + "loss": 0.77512109, + "num_input_tokens_seen": 192600176, + "router_z_loss_mlp": 0.09521484, + "step": 2308, + "time_per_iteration": 4.842617034912109 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120011, + "balance_loss_mlp": 1.08405757, + "epoch": 0.4442093112735668, + "flos": 591089011200.0, + "grad_norm": 0.07624843376245131, + "language_loss": 0.77539825, + "learning_rate": 0.000613645584293942, + "loss": 0.78659838, + "num_input_tokens_seen": 192675424, + "router_z_loss_mlp": 0.35986328, + "step": 2309, + "time_per_iteration": 2.8661446571350098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125005, + "balance_loss_mlp": 1.08933806, + "epoch": 0.4444016929588303, + "flos": 530272313856.0, + "grad_norm": 0.0700550632478262, + "language_loss": 0.83505249, + "learning_rate": 0.0006133421739881185, + "loss": 0.84630251, + "num_input_tokens_seen": 192747552, + "router_z_loss_mlp": 0.35693359, + "step": 2310, + "time_per_iteration": 2.6644127368927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118668, + "balance_loss_mlp": 1.08319092, + "epoch": 0.4445940746440939, + "flos": 620234952192.0, + "grad_norm": 0.11928760190169391, + "language_loss": 0.83116257, + "learning_rate": 0.0006130387196789605, + "loss": 0.84234929, + "num_input_tokens_seen": 192819984, + "router_z_loss_mlp": 0.35522461, + "step": 2311, + "time_per_iteration": 2.7157018184661865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111828, + "balance_loss_mlp": 1.07699549, + "epoch": 0.4447864563293574, + "flos": 629100089856.0, + "grad_norm": 0.05741887786628051, + "language_loss": 0.84819949, + "learning_rate": 0.0006127352214842795, + "loss": 0.85931778, + "num_input_tokens_seen": 192906080, + "router_z_loss_mlp": 0.34838867, + "step": 2312, + "time_per_iteration": 2.9459052085876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118701, + "balance_loss_mlp": 1.08293796, + "epoch": 0.444978838014621, + "flos": 650838592512.0, + "grad_norm": 0.07350143541661519, + "language_loss": 0.85691726, + "learning_rate": 0.0006124316795219041, + "loss": 0.86810434, + "num_input_tokens_seen": 192972336, + "router_z_loss_mlp": 0.35742188, + "step": 2313, + "time_per_iteration": 2.772299289703369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131037, + "balance_loss_mlp": 1.0956552, + "epoch": 0.44517121969988455, + "flos": 612439501824.0, + "grad_norm": 0.06263706285199609, + "language_loss": 0.82505524, + "learning_rate": 0.0006121280939096794, + "loss": 0.83636558, + "num_input_tokens_seen": 193045744, + "router_z_loss_mlp": 0.35424805, + "step": 2314, + "time_per_iteration": 2.7951674461364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114668, + "balance_loss_mlp": 1.11020195, + "epoch": 0.44536360138514813, + "flos": 488735368704.0, + "grad_norm": 0.0720052818844606, + "language_loss": 0.88360798, + "learning_rate": 0.000611824464765468, + "loss": 0.89507478, + "num_input_tokens_seen": 193115248, + "router_z_loss_mlp": 0.36499023, + "step": 2315, + "time_per_iteration": 2.5895602703094482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067507, + "balance_loss_mlp": 1.05682635, + "epoch": 0.4455559830704117, + "flos": 1516148969472.0, + "grad_norm": 0.0344692196546668, + "language_loss": 0.78594941, + "learning_rate": 0.0006115207922071492, + "loss": 0.79662448, + "num_input_tokens_seen": 193330816, + "router_z_loss_mlp": 0.10693359, + "step": 2316, + "time_per_iteration": 4.6560447216033936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137929, + "balance_loss_mlp": 1.1022377, + "epoch": 0.44574836475567525, + "flos": 615614432256.0, + "grad_norm": 0.06826351361083724, + "language_loss": 0.85665047, + "learning_rate": 0.000611217076352619, + "loss": 0.86802971, + "num_input_tokens_seen": 193407616, + "router_z_loss_mlp": 0.35693359, + "step": 2317, + "time_per_iteration": 2.7965078353881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132041, + "balance_loss_mlp": 1.09835279, + "epoch": 0.44594074644093884, + "flos": 506342306304.0, + "grad_norm": 0.06652231411845559, + "language_loss": 0.83542907, + "learning_rate": 0.0006109133173197905, + "loss": 0.84674948, + "num_input_tokens_seen": 193482624, + "router_z_loss_mlp": 0.33691406, + "step": 2318, + "time_per_iteration": 2.678832769393921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124047, + "balance_loss_mlp": 1.08897519, + "epoch": 0.44613312812620237, + "flos": 726979318272.0, + "grad_norm": 0.06811942389724822, + "language_loss": 0.85992062, + "learning_rate": 0.0006106095152265935, + "loss": 0.8711611, + "num_input_tokens_seen": 193555952, + "router_z_loss_mlp": 0.35107422, + "step": 2319, + "time_per_iteration": 2.9018518924713135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111639, + "balance_loss_mlp": 1.08060324, + "epoch": 0.44632550981146596, + "flos": 635746558464.0, + "grad_norm": 0.06308491230142964, + "language_loss": 0.85126555, + "learning_rate": 0.0006103056701909739, + "loss": 0.8624295, + "num_input_tokens_seen": 193636672, + "router_z_loss_mlp": 0.3581543, + "step": 2320, + "time_per_iteration": 2.927619218826294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111434, + "balance_loss_mlp": 1.07869673, + "epoch": 0.4465178914967295, + "flos": 827074644480.0, + "grad_norm": 0.08034132862269446, + "language_loss": 0.83192152, + "learning_rate": 0.0006100017823308956, + "loss": 0.8430649, + "num_input_tokens_seen": 193721728, + "router_z_loss_mlp": 0.35644531, + "step": 2321, + "time_per_iteration": 3.1759355068206787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111575, + "balance_loss_mlp": 1.07645655, + "epoch": 0.4467102731819931, + "flos": 665831508480.0, + "grad_norm": 0.0688182521177716, + "language_loss": 0.79684091, + "learning_rate": 0.0006096978517643377, + "loss": 0.8079567, + "num_input_tokens_seen": 193795456, + "router_z_loss_mlp": 0.35131836, + "step": 2322, + "time_per_iteration": 2.791020154953003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109472, + "balance_loss_mlp": 1.07337499, + "epoch": 0.4469026548672566, + "flos": 512946929664.0, + "grad_norm": 0.08831218810897808, + "language_loss": 0.83671057, + "learning_rate": 0.0006093938786092968, + "loss": 0.84780538, + "num_input_tokens_seen": 193865520, + "router_z_loss_mlp": 0.36108398, + "step": 2323, + "time_per_iteration": 2.614248037338257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107948, + "balance_loss_mlp": 1.0734967, + "epoch": 0.4470950365525202, + "flos": 684076078080.0, + "grad_norm": 0.06554008035854059, + "language_loss": 0.90401232, + "learning_rate": 0.0006090898629837857, + "loss": 0.91509175, + "num_input_tokens_seen": 193935040, + "router_z_loss_mlp": 0.34448242, + "step": 2324, + "time_per_iteration": 2.7988476753234863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114598, + "balance_loss_mlp": 1.07950234, + "epoch": 0.4472874182377838, + "flos": 627321189888.0, + "grad_norm": 0.05596676685861875, + "language_loss": 0.87779921, + "learning_rate": 0.0006087858050058337, + "loss": 0.88894522, + "num_input_tokens_seen": 194009120, + "router_z_loss_mlp": 0.35083008, + "step": 2325, + "time_per_iteration": 2.8598742485046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106952, + "balance_loss_mlp": 1.07309675, + "epoch": 0.4474797999230473, + "flos": 547204916736.0, + "grad_norm": 0.08404177014968839, + "language_loss": 0.82489681, + "learning_rate": 0.0006084817047934866, + "loss": 0.83596623, + "num_input_tokens_seen": 194076672, + "router_z_loss_mlp": 0.33886719, + "step": 2326, + "time_per_iteration": 2.6458888053894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112181, + "balance_loss_mlp": 1.07780075, + "epoch": 0.4476721816083109, + "flos": 455819083776.0, + "grad_norm": 0.07155239810176077, + "language_loss": 0.89966661, + "learning_rate": 0.0006081775624648066, + "loss": 0.91078842, + "num_input_tokens_seen": 194142320, + "router_z_loss_mlp": 0.34399414, + "step": 2327, + "time_per_iteration": 2.580366373062134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120962, + "balance_loss_mlp": 1.08689189, + "epoch": 0.44786456329357444, + "flos": 481518079488.0, + "grad_norm": 0.06301539333332261, + "language_loss": 0.83119273, + "learning_rate": 0.0006078733781378721, + "loss": 0.8424024, + "num_input_tokens_seen": 194208560, + "router_z_loss_mlp": 0.34082031, + "step": 2328, + "time_per_iteration": 2.54127836227417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110302, + "balance_loss_mlp": 1.07594562, + "epoch": 0.448056944978838, + "flos": 552104418816.0, + "grad_norm": 0.057204005558127505, + "language_loss": 0.82213807, + "learning_rate": 0.0006075691519307781, + "loss": 0.83324105, + "num_input_tokens_seen": 194288080, + "router_z_loss_mlp": 0.34375, + "step": 2329, + "time_per_iteration": 2.8602964878082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117341, + "balance_loss_mlp": 1.08193612, + "epoch": 0.44824932666410156, + "flos": 550839439872.0, + "grad_norm": 0.055534005363494426, + "language_loss": 0.81606597, + "learning_rate": 0.0006072648839616356, + "loss": 0.82723939, + "num_input_tokens_seen": 194358464, + "router_z_loss_mlp": 0.35400391, + "step": 2330, + "time_per_iteration": 2.662810802459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119722, + "balance_loss_mlp": 1.08565211, + "epoch": 0.44844170834936514, + "flos": 988582454784.0, + "grad_norm": 0.050779766652796585, + "language_loss": 0.82901573, + "learning_rate": 0.0006069605743485718, + "loss": 0.84021294, + "num_input_tokens_seen": 194456112, + "router_z_loss_mlp": 0.34057617, + "step": 2331, + "time_per_iteration": 3.3678483963012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128598, + "balance_loss_mlp": 1.0950762, + "epoch": 0.44863409003462873, + "flos": 591321378816.0, + "grad_norm": 0.04918059080846435, + "language_loss": 0.83280981, + "learning_rate": 0.0006066562232097303, + "loss": 0.84409571, + "num_input_tokens_seen": 194526880, + "router_z_loss_mlp": 0.33544922, + "step": 2332, + "time_per_iteration": 2.7449440956115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123187, + "balance_loss_mlp": 1.08785367, + "epoch": 0.44882647171989226, + "flos": 724646850048.0, + "grad_norm": 0.052836841401222294, + "language_loss": 0.86161315, + "learning_rate": 0.0006063518306632708, + "loss": 0.87284505, + "num_input_tokens_seen": 194606800, + "router_z_loss_mlp": 0.35375977, + "step": 2333, + "time_per_iteration": 2.9690473079681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127444, + "balance_loss_mlp": 1.09220576, + "epoch": 0.44901885340515585, + "flos": 534927338496.0, + "grad_norm": 0.06707958703687776, + "language_loss": 0.82286978, + "learning_rate": 0.0006060473968273688, + "loss": 0.83414423, + "num_input_tokens_seen": 194679856, + "router_z_loss_mlp": 0.35229492, + "step": 2334, + "time_per_iteration": 2.665539026260376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142277, + "balance_loss_mlp": 1.13331211, + "epoch": 0.4492112350904194, + "flos": 1555300942848.0, + "grad_norm": 0.036352477885187424, + "language_loss": 0.77879542, + "learning_rate": 0.000605742921820216, + "loss": 0.79021817, + "num_input_tokens_seen": 194906320, + "router_z_loss_mlp": 0.08984375, + "step": 2335, + "time_per_iteration": 4.888899326324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115384, + "balance_loss_mlp": 1.10641909, + "epoch": 0.44940361677568297, + "flos": 1523358171648.0, + "grad_norm": 0.027581232823365703, + "language_loss": 0.81005216, + "learning_rate": 0.0006054384057600202, + "loss": 0.82120597, + "num_input_tokens_seen": 195129152, + "router_z_loss_mlp": 0.08984375, + "step": 2336, + "time_per_iteration": 4.835580348968506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126219, + "balance_loss_mlp": 1.09155297, + "epoch": 0.4495959984609465, + "flos": 382495011840.0, + "grad_norm": 0.06484007747623576, + "language_loss": 0.88115394, + "learning_rate": 0.0006051338487650047, + "loss": 0.89241612, + "num_input_tokens_seen": 195189792, + "router_z_loss_mlp": 0.34667969, + "step": 2337, + "time_per_iteration": 2.4327657222747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125299, + "balance_loss_mlp": 1.08846319, + "epoch": 0.4497883801462101, + "flos": 497879861760.0, + "grad_norm": 0.06762371666749806, + "language_loss": 0.82857472, + "learning_rate": 0.0006048292509534095, + "loss": 0.83982766, + "num_input_tokens_seen": 195258640, + "router_z_loss_mlp": 0.3684082, + "step": 2338, + "time_per_iteration": 2.583315372467041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122898, + "balance_loss_mlp": 1.08851767, + "epoch": 0.4499807618314736, + "flos": 614450769408.0, + "grad_norm": 0.06288042140328122, + "language_loss": 0.78114402, + "learning_rate": 0.0006045246124434895, + "loss": 0.792373, + "num_input_tokens_seen": 195327984, + "router_z_loss_mlp": 0.34350586, + "step": 2339, + "time_per_iteration": 2.718944787979126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111671, + "balance_loss_mlp": 1.08223438, + "epoch": 0.4501731435167372, + "flos": 1005510288384.0, + "grad_norm": 0.06455240115792976, + "language_loss": 0.86995041, + "learning_rate": 0.0006042199333535162, + "loss": 0.88111752, + "num_input_tokens_seen": 195409504, + "router_z_loss_mlp": 0.3449707, + "step": 2340, + "time_per_iteration": 3.280731439590454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120556, + "balance_loss_mlp": 1.08803582, + "epoch": 0.4503655252020008, + "flos": 820880428032.0, + "grad_norm": 0.06119421780994794, + "language_loss": 0.83960807, + "learning_rate": 0.0006039152138017763, + "loss": 0.85081363, + "num_input_tokens_seen": 195489424, + "router_z_loss_mlp": 0.32519531, + "step": 2341, + "time_per_iteration": 3.042808771133423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116997, + "balance_loss_mlp": 1.08285511, + "epoch": 0.4505579068872643, + "flos": 486373165056.0, + "grad_norm": 0.06181422787629511, + "language_loss": 0.83835328, + "learning_rate": 0.0006036104539065726, + "loss": 0.84952325, + "num_input_tokens_seen": 195562128, + "router_z_loss_mlp": 0.34155273, + "step": 2342, + "time_per_iteration": 2.671872138977051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117893, + "balance_loss_mlp": 1.08208227, + "epoch": 0.4507502885725279, + "flos": 884803046400.0, + "grad_norm": 0.05413998463628708, + "language_loss": 0.84596831, + "learning_rate": 0.000603305653786223, + "loss": 0.85714728, + "num_input_tokens_seen": 195646800, + "router_z_loss_mlp": 0.3581543, + "step": 2343, + "time_per_iteration": 3.153627395629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116847, + "balance_loss_mlp": 1.08182287, + "epoch": 0.45094267025779144, + "flos": 578339730432.0, + "grad_norm": 0.06019885466307642, + "language_loss": 0.84242773, + "learning_rate": 0.0006030008135590622, + "loss": 0.85359621, + "num_input_tokens_seen": 195719648, + "router_z_loss_mlp": 0.35058594, + "step": 2344, + "time_per_iteration": 2.724281072616577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109974, + "balance_loss_mlp": 1.07564187, + "epoch": 0.45113505194305503, + "flos": 525387492864.0, + "grad_norm": 0.06173385406680834, + "language_loss": 0.80783409, + "learning_rate": 0.0006026959333434387, + "loss": 0.81893378, + "num_input_tokens_seen": 195794800, + "router_z_loss_mlp": 0.34350586, + "step": 2345, + "time_per_iteration": 2.7752277851104736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107914, + "balance_loss_mlp": 1.07336736, + "epoch": 0.45132743362831856, + "flos": 502055470080.0, + "grad_norm": 0.04677974400708639, + "language_loss": 0.77811158, + "learning_rate": 0.0006023910132577181, + "loss": 0.78919077, + "num_input_tokens_seen": 195866848, + "router_z_loss_mlp": 0.34545898, + "step": 2346, + "time_per_iteration": 2.663447141647339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101638, + "balance_loss_mlp": 1.06802082, + "epoch": 0.45151981531358215, + "flos": 431918811648.0, + "grad_norm": 0.060558646022808645, + "language_loss": 0.85310882, + "learning_rate": 0.0006020860534202806, + "loss": 0.86412525, + "num_input_tokens_seen": 195930640, + "router_z_loss_mlp": 0.33618164, + "step": 2347, + "time_per_iteration": 2.480811595916748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108815, + "balance_loss_mlp": 1.07388651, + "epoch": 0.4517121969988457, + "flos": 712159299072.0, + "grad_norm": 0.06606096221098971, + "language_loss": 0.81316173, + "learning_rate": 0.0006017810539495224, + "loss": 0.82424992, + "num_input_tokens_seen": 196014240, + "router_z_loss_mlp": 0.34960938, + "step": 2348, + "time_per_iteration": 2.9476070404052734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098582, + "balance_loss_mlp": 1.06415427, + "epoch": 0.45190457868410927, + "flos": 579468888576.0, + "grad_norm": 0.0571113923067653, + "language_loss": 0.82774842, + "learning_rate": 0.0006014760149638547, + "loss": 0.83873427, + "num_input_tokens_seen": 196083296, + "router_z_loss_mlp": 0.34423828, + "step": 2349, + "time_per_iteration": 2.6655263900756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103718, + "balance_loss_mlp": 1.07005334, + "epoch": 0.45209696036937286, + "flos": 482657149440.0, + "grad_norm": 0.06475243948679671, + "language_loss": 0.88831103, + "learning_rate": 0.000601170936581704, + "loss": 0.89934826, + "num_input_tokens_seen": 196147840, + "router_z_loss_mlp": 0.33666992, + "step": 2350, + "time_per_iteration": 2.5269417762756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097788, + "balance_loss_mlp": 1.06343222, + "epoch": 0.4522893420546364, + "flos": 540207512064.0, + "grad_norm": 0.06432650174878703, + "language_loss": 0.84562814, + "learning_rate": 0.0006008658189215121, + "loss": 0.85660601, + "num_input_tokens_seen": 196219008, + "router_z_loss_mlp": 0.34399414, + "step": 2351, + "time_per_iteration": 2.621596097946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110179, + "balance_loss_mlp": 1.07267594, + "epoch": 0.4524817237399, + "flos": 496676551680.0, + "grad_norm": 0.3016755485520666, + "language_loss": 0.8046757, + "learning_rate": 0.0006005606621017366, + "loss": 0.81577748, + "num_input_tokens_seen": 196287792, + "router_z_loss_mlp": 0.375, + "step": 2352, + "time_per_iteration": 2.561138153076172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111286, + "balance_loss_mlp": 1.07564211, + "epoch": 0.4526741054251635, + "flos": 652550681088.0, + "grad_norm": 0.055264843638134026, + "language_loss": 0.80770934, + "learning_rate": 0.0006002554662408496, + "loss": 0.81882215, + "num_input_tokens_seen": 196371776, + "router_z_loss_mlp": 0.35644531, + "step": 2353, + "time_per_iteration": 2.87947940826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118454, + "balance_loss_mlp": 1.08180928, + "epoch": 0.4528664871104271, + "flos": 570939632640.0, + "grad_norm": 0.06003231312298175, + "language_loss": 0.91710508, + "learning_rate": 0.0005999502314573388, + "loss": 0.92828965, + "num_input_tokens_seen": 196441840, + "router_z_loss_mlp": 0.36645508, + "step": 2354, + "time_per_iteration": 2.703589916229248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127767, + "balance_loss_mlp": 1.09119391, + "epoch": 0.45305886879569063, + "flos": 458719801344.0, + "grad_norm": 0.06522748471040672, + "language_loss": 0.86741221, + "learning_rate": 0.0005996449578697066, + "loss": 0.87868989, + "num_input_tokens_seen": 196510464, + "router_z_loss_mlp": 0.36547852, + "step": 2355, + "time_per_iteration": 2.6407227516174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114254, + "balance_loss_mlp": 1.10627651, + "epoch": 0.4532512504809542, + "flos": 505178643456.0, + "grad_norm": 0.05645244306136207, + "language_loss": 0.81587362, + "learning_rate": 0.0005993396455964709, + "loss": 0.827299, + "num_input_tokens_seen": 196583888, + "router_z_loss_mlp": 0.36279297, + "step": 2356, + "time_per_iteration": 2.7260916233062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159041, + "balance_loss_mlp": 1.12263405, + "epoch": 0.4534436321662178, + "flos": 582213961728.0, + "grad_norm": 0.0574643396084849, + "language_loss": 0.81904489, + "learning_rate": 0.0005990342947561647, + "loss": 0.83063525, + "num_input_tokens_seen": 196652816, + "router_z_loss_mlp": 0.36401367, + "step": 2357, + "time_per_iteration": 2.763461112976074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158122, + "balance_loss_mlp": 1.12109542, + "epoch": 0.45363601385148133, + "flos": 549720193536.0, + "grad_norm": 0.06627350558163068, + "language_loss": 0.78124607, + "learning_rate": 0.0005987289054673351, + "loss": 0.79282725, + "num_input_tokens_seen": 196720208, + "router_z_loss_mlp": 0.37011719, + "step": 2358, + "time_per_iteration": 2.7317159175872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172658, + "balance_loss_mlp": 1.16121387, + "epoch": 0.4538283955367449, + "flos": 1474559520768.0, + "grad_norm": 0.05600708096364228, + "language_loss": 0.76575738, + "learning_rate": 0.0005984234778485451, + "loss": 0.77748394, + "num_input_tokens_seen": 196947696, + "router_z_loss_mlp": 0.11425781, + "step": 2359, + "time_per_iteration": 4.815205335617065 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168714, + "balance_loss_mlp": 1.13257003, + "epoch": 0.45402077722200845, + "flos": 584711986176.0, + "grad_norm": 0.0832511333205401, + "language_loss": 0.91429126, + "learning_rate": 0.0005981180120183722, + "loss": 0.92597842, + "num_input_tokens_seen": 197015712, + "router_z_loss_mlp": 0.36206055, + "step": 2360, + "time_per_iteration": 2.675994873046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154784, + "balance_loss_mlp": 1.11825836, + "epoch": 0.45421315890727204, + "flos": 531747265536.0, + "grad_norm": 0.06456101952662723, + "language_loss": 0.85450256, + "learning_rate": 0.0005978125080954089, + "loss": 0.86605042, + "num_input_tokens_seen": 197094880, + "router_z_loss_mlp": 0.36523438, + "step": 2361, + "time_per_iteration": 2.844592332839966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134938, + "balance_loss_mlp": 1.0997715, + "epoch": 0.4544055405925356, + "flos": 785221641216.0, + "grad_norm": 0.06943573222196867, + "language_loss": 0.77225572, + "learning_rate": 0.000597506966198262, + "loss": 0.7836051, + "num_input_tokens_seen": 197176448, + "router_z_loss_mlp": 0.35180664, + "step": 2362, + "time_per_iteration": 2.990652322769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127189, + "balance_loss_mlp": 1.09216547, + "epoch": 0.45459792227779916, + "flos": 518199939072.0, + "grad_norm": 0.07387250459530678, + "language_loss": 0.84014916, + "learning_rate": 0.0005972013864455536, + "loss": 0.85142106, + "num_input_tokens_seen": 197243520, + "router_z_loss_mlp": 0.3503418, + "step": 2363, + "time_per_iteration": 2.589594841003418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124933, + "balance_loss_mlp": 1.09141088, + "epoch": 0.4547903039630627, + "flos": 537563755008.0, + "grad_norm": 0.06451639193106218, + "language_loss": 0.85711533, + "learning_rate": 0.0005968957689559203, + "loss": 0.86836469, + "num_input_tokens_seen": 197311536, + "router_z_loss_mlp": 0.33544922, + "step": 2364, + "time_per_iteration": 2.6682167053222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119758, + "balance_loss_mlp": 1.08585453, + "epoch": 0.4549826856483263, + "flos": 528676222464.0, + "grad_norm": 0.06239355206550831, + "language_loss": 0.89508158, + "learning_rate": 0.0005965901138480131, + "loss": 0.90627909, + "num_input_tokens_seen": 197382752, + "router_z_loss_mlp": 0.33911133, + "step": 2365, + "time_per_iteration": 2.6365487575531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125783, + "balance_loss_mlp": 1.08816087, + "epoch": 0.45517506733358987, + "flos": 520915276800.0, + "grad_norm": 0.07086256306792152, + "language_loss": 0.87331104, + "learning_rate": 0.0005962844212404982, + "loss": 0.88456881, + "num_input_tokens_seen": 197456592, + "router_z_loss_mlp": 0.37597656, + "step": 2366, + "time_per_iteration": 2.6617612838745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123043, + "balance_loss_mlp": 1.08763838, + "epoch": 0.4553674490188534, + "flos": 451052831232.0, + "grad_norm": 0.05743086206543283, + "language_loss": 0.87604624, + "learning_rate": 0.0005959786912520558, + "loss": 0.88727665, + "num_input_tokens_seen": 197525408, + "router_z_loss_mlp": 0.35400391, + "step": 2367, + "time_per_iteration": 2.5842456817626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112429, + "balance_loss_mlp": 1.08878994, + "epoch": 0.455559830704117, + "flos": 546594448896.0, + "grad_norm": 0.05541530908978363, + "language_loss": 0.84261698, + "learning_rate": 0.0005956729240013806, + "loss": 0.8538599, + "num_input_tokens_seen": 197608480, + "router_z_loss_mlp": 0.35522461, + "step": 2368, + "time_per_iteration": 2.8338305950164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131752, + "balance_loss_mlp": 1.09880257, + "epoch": 0.4557522123893805, + "flos": 583765636608.0, + "grad_norm": 0.06117437276065272, + "language_loss": 0.91673207, + "learning_rate": 0.0005953671196071824, + "loss": 0.92804956, + "num_input_tokens_seen": 197678416, + "router_z_loss_mlp": 0.32958984, + "step": 2369, + "time_per_iteration": 2.6954920291900635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140492, + "balance_loss_mlp": 1.10089099, + "epoch": 0.4559445940746441, + "flos": 526415334912.0, + "grad_norm": 0.05874804832244865, + "language_loss": 0.80540514, + "learning_rate": 0.0005950612781881846, + "loss": 0.81681007, + "num_input_tokens_seen": 197753424, + "router_z_loss_mlp": 0.39575195, + "step": 2370, + "time_per_iteration": 2.695518732070923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133052, + "balance_loss_mlp": 1.09526241, + "epoch": 0.45613697575990764, + "flos": 652120823808.0, + "grad_norm": 0.054922750315337415, + "language_loss": 0.76194978, + "learning_rate": 0.0005947553998631259, + "loss": 0.77328038, + "num_input_tokens_seen": 197832080, + "router_z_loss_mlp": 0.37792969, + "step": 2371, + "time_per_iteration": 2.854757070541382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133988, + "balance_loss_mlp": 1.09777188, + "epoch": 0.4563293574451712, + "flos": 867119385600.0, + "grad_norm": 0.04850294692755014, + "language_loss": 0.79227567, + "learning_rate": 0.000594449484750758, + "loss": 0.80361551, + "num_input_tokens_seen": 197919536, + "router_z_loss_mlp": 0.36206055, + "step": 2372, + "time_per_iteration": 3.2277348041534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128775, + "balance_loss_mlp": 1.09263051, + "epoch": 0.45652173913043476, + "flos": 498079922688.0, + "grad_norm": 0.06286219474212958, + "language_loss": 0.83387208, + "learning_rate": 0.0005941435329698484, + "loss": 0.84515989, + "num_input_tokens_seen": 197991872, + "router_z_loss_mlp": 0.36132812, + "step": 2373, + "time_per_iteration": 2.676492929458618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126424, + "balance_loss_mlp": 1.09025562, + "epoch": 0.45671412081569834, + "flos": 560856130560.0, + "grad_norm": 0.05768590484176838, + "language_loss": 0.83615124, + "learning_rate": 0.0005938375446391778, + "loss": 0.84741557, + "num_input_tokens_seen": 198063392, + "router_z_loss_mlp": 0.36181641, + "step": 2374, + "time_per_iteration": 2.7465567588806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137671, + "balance_loss_mlp": 1.09969115, + "epoch": 0.45690650250096193, + "flos": 503122959360.0, + "grad_norm": 0.05745321957635053, + "language_loss": 0.89048398, + "learning_rate": 0.0005935315198775415, + "loss": 0.90186071, + "num_input_tokens_seen": 198131232, + "router_z_loss_mlp": 0.38012695, + "step": 2375, + "time_per_iteration": 2.6580095291137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128206, + "balance_loss_mlp": 1.09320593, + "epoch": 0.45709888418622546, + "flos": 430698249216.0, + "grad_norm": 0.06107240600749233, + "language_loss": 0.87175268, + "learning_rate": 0.0005932254588037486, + "loss": 0.88303471, + "num_input_tokens_seen": 198194944, + "router_z_loss_mlp": 0.35009766, + "step": 2376, + "time_per_iteration": 2.488588333129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121963, + "balance_loss_mlp": 1.08600903, + "epoch": 0.45729126587148905, + "flos": 525654365184.0, + "grad_norm": 0.05478508122440065, + "language_loss": 0.86331463, + "learning_rate": 0.000592919361536623, + "loss": 0.87453431, + "num_input_tokens_seen": 198265728, + "router_z_loss_mlp": 0.35961914, + "step": 2377, + "time_per_iteration": 2.644374132156372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127537, + "balance_loss_mlp": 1.09196472, + "epoch": 0.4574836475567526, + "flos": 638002676736.0, + "grad_norm": 0.05713052679154174, + "language_loss": 0.89246452, + "learning_rate": 0.0005926132281950017, + "loss": 0.90373993, + "num_input_tokens_seen": 198336640, + "router_z_loss_mlp": 0.35571289, + "step": 2378, + "time_per_iteration": 2.7563676834106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121403, + "balance_loss_mlp": 1.08406663, + "epoch": 0.45767602924201617, + "flos": 649588294656.0, + "grad_norm": 0.05503863795363348, + "language_loss": 0.85310149, + "learning_rate": 0.0005923070588977367, + "loss": 0.86431557, + "num_input_tokens_seen": 198413552, + "router_z_loss_mlp": 0.37329102, + "step": 2379, + "time_per_iteration": 2.8923282623291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123743, + "balance_loss_mlp": 1.087098, + "epoch": 0.4578684109272797, + "flos": 746676817920.0, + "grad_norm": 0.05441682742417314, + "language_loss": 0.86308765, + "learning_rate": 0.0005920008537636931, + "loss": 0.8743251, + "num_input_tokens_seen": 198490864, + "router_z_loss_mlp": 0.3659668, + "step": 2380, + "time_per_iteration": 2.8928191661834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121741, + "balance_loss_mlp": 1.0852387, + "epoch": 0.4580607926125433, + "flos": 641469072384.0, + "grad_norm": 0.0522540937039379, + "language_loss": 0.86756825, + "learning_rate": 0.0005916946129117504, + "loss": 0.87878567, + "num_input_tokens_seen": 198571200, + "router_z_loss_mlp": 0.36523438, + "step": 2381, + "time_per_iteration": 2.9031155109405518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129507, + "balance_loss_mlp": 1.09281409, + "epoch": 0.4582531742978069, + "flos": 801857636352.0, + "grad_norm": 0.055637229661903514, + "language_loss": 0.80852348, + "learning_rate": 0.0005913883364608017, + "loss": 0.8198185, + "num_input_tokens_seen": 198658624, + "router_z_loss_mlp": 0.36694336, + "step": 2382, + "time_per_iteration": 3.0779874324798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123213, + "balance_loss_mlp": 1.088094, + "epoch": 0.4584455559830704, + "flos": 684295962624.0, + "grad_norm": 0.05906328885450196, + "language_loss": 0.88737094, + "learning_rate": 0.0005910820245297542, + "loss": 0.89860308, + "num_input_tokens_seen": 198731312, + "router_z_loss_mlp": 0.35131836, + "step": 2383, + "time_per_iteration": 2.889805555343628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119246, + "balance_loss_mlp": 1.0824585, + "epoch": 0.458637937668334, + "flos": 518177544192.0, + "grad_norm": 0.06990697064707628, + "language_loss": 0.80825961, + "learning_rate": 0.000590775677237529, + "loss": 0.81945217, + "num_input_tokens_seen": 198805296, + "router_z_loss_mlp": 0.36791992, + "step": 2384, + "time_per_iteration": 2.7286477088928223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011127, + "balance_loss_mlp": 1.07562566, + "epoch": 0.4588303193535975, + "flos": 505499844096.0, + "grad_norm": 0.06044507930671915, + "language_loss": 0.80186594, + "learning_rate": 0.0005904692947030601, + "loss": 0.81299293, + "num_input_tokens_seen": 198872112, + "router_z_loss_mlp": 0.37084961, + "step": 2385, + "time_per_iteration": 2.6249661445617676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112584, + "balance_loss_mlp": 1.07446146, + "epoch": 0.4590227010388611, + "flos": 495905670144.0, + "grad_norm": 0.06266023003425206, + "language_loss": 0.89858609, + "learning_rate": 0.0005901628770452963, + "loss": 0.90971196, + "num_input_tokens_seen": 198938480, + "router_z_loss_mlp": 0.38110352, + "step": 2386, + "time_per_iteration": 2.628052234649658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106229, + "balance_loss_mlp": 1.06925035, + "epoch": 0.45921508272412465, + "flos": 493620189696.0, + "grad_norm": 0.05741151930163357, + "language_loss": 0.87304425, + "learning_rate": 0.000589856424383199, + "loss": 0.88410658, + "num_input_tokens_seen": 199008608, + "router_z_loss_mlp": 0.36987305, + "step": 2387, + "time_per_iteration": 2.6852517127990723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116091, + "balance_loss_mlp": 1.07863569, + "epoch": 0.45940746440938823, + "flos": 691394683392.0, + "grad_norm": 0.06606538590283985, + "language_loss": 0.83553612, + "learning_rate": 0.000589549936835744, + "loss": 0.84669703, + "num_input_tokens_seen": 199084592, + "router_z_loss_mlp": 0.37451172, + "step": 2388, + "time_per_iteration": 2.8861043453216553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106236, + "balance_loss_mlp": 1.07135534, + "epoch": 0.45959984609465176, + "flos": 503738196480.0, + "grad_norm": 0.06160096974470471, + "language_loss": 0.79523546, + "learning_rate": 0.0005892434145219202, + "loss": 0.80629778, + "num_input_tokens_seen": 199151504, + "router_z_loss_mlp": 0.34912109, + "step": 2389, + "time_per_iteration": 2.6016130447387695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097243, + "balance_loss_mlp": 1.06219506, + "epoch": 0.45979222777991535, + "flos": 676638904320.0, + "grad_norm": 0.07218042116864783, + "language_loss": 0.82768381, + "learning_rate": 0.0005889368575607303, + "loss": 0.83865625, + "num_input_tokens_seen": 199224528, + "router_z_loss_mlp": 0.35058594, + "step": 2390, + "time_per_iteration": 2.806382894515991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102796, + "balance_loss_mlp": 1.06791568, + "epoch": 0.45998460946517894, + "flos": 777653415936.0, + "grad_norm": 0.06321076421250729, + "language_loss": 0.78347147, + "learning_rate": 0.00058863026607119, + "loss": 0.7944994, + "num_input_tokens_seen": 199312512, + "router_z_loss_mlp": 0.34912109, + "step": 2391, + "time_per_iteration": 3.0679373741149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102101, + "balance_loss_mlp": 1.06800711, + "epoch": 0.46017699115044247, + "flos": 851461673472.0, + "grad_norm": 0.07981135891264553, + "language_loss": 0.80153728, + "learning_rate": 0.0005883236401723287, + "loss": 0.81255829, + "num_input_tokens_seen": 199397216, + "router_z_loss_mlp": 0.34130859, + "step": 2392, + "time_per_iteration": 3.178016185760498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102518, + "balance_loss_mlp": 1.06830466, + "epoch": 0.46036937283570606, + "flos": 575878781952.0, + "grad_norm": 0.05809686694512272, + "language_loss": 0.8436439, + "learning_rate": 0.0005880169799831893, + "loss": 0.85466909, + "num_input_tokens_seen": 199464288, + "router_z_loss_mlp": 0.34204102, + "step": 2393, + "time_per_iteration": 2.7394168376922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099974, + "balance_loss_mlp": 1.06537914, + "epoch": 0.4605617545209696, + "flos": 611866109952.0, + "grad_norm": 0.05496993027151255, + "language_loss": 0.81652063, + "learning_rate": 0.0005877102856228278, + "loss": 0.82752037, + "num_input_tokens_seen": 199538096, + "router_z_loss_mlp": 0.34594727, + "step": 2394, + "time_per_iteration": 2.857044219970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107902, + "balance_loss_mlp": 1.07225823, + "epoch": 0.4607541362062332, + "flos": 533138526720.0, + "grad_norm": 0.0685378240754912, + "language_loss": 0.84987622, + "learning_rate": 0.0005874035572103133, + "loss": 0.86095524, + "num_input_tokens_seen": 199609504, + "router_z_loss_mlp": 0.35644531, + "step": 2395, + "time_per_iteration": 2.6805660724639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102808, + "balance_loss_mlp": 1.06699777, + "epoch": 0.4609465178914967, + "flos": 647312726016.0, + "grad_norm": 0.07818612590839771, + "language_loss": 0.82504952, + "learning_rate": 0.0005870967948647288, + "loss": 0.83607757, + "num_input_tokens_seen": 199678960, + "router_z_loss_mlp": 0.35839844, + "step": 2396, + "time_per_iteration": 2.7740094661712646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114998, + "balance_loss_mlp": 1.13801181, + "epoch": 0.4611388995767603, + "flos": 1466287225344.0, + "grad_norm": 0.06620078890509219, + "language_loss": 0.743083, + "learning_rate": 0.0005867899987051693, + "loss": 0.75458288, + "num_input_tokens_seen": 199903568, + "router_z_loss_mlp": 0.11962891, + "step": 2397, + "time_per_iteration": 5.407956838607788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106991, + "balance_loss_mlp": 1.07158542, + "epoch": 0.46133128126202383, + "flos": 723112427520.0, + "grad_norm": 0.05578291602549768, + "language_loss": 0.85959148, + "learning_rate": 0.0005864831688507443, + "loss": 0.87066138, + "num_input_tokens_seen": 199988672, + "router_z_loss_mlp": 0.35424805, + "step": 2398, + "time_per_iteration": 3.000498056411743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108167, + "balance_loss_mlp": 1.07342887, + "epoch": 0.4615236629472874, + "flos": 548010302976.0, + "grad_norm": 0.0567470157783756, + "language_loss": 0.7555595, + "learning_rate": 0.0005861763054205754, + "loss": 0.7666412, + "num_input_tokens_seen": 200062304, + "router_z_loss_mlp": 0.34765625, + "step": 2399, + "time_per_iteration": 2.7206692695617676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108701, + "balance_loss_mlp": 1.07303381, + "epoch": 0.461716044632551, + "flos": 602244771840.0, + "grad_norm": 0.054446102099669776, + "language_loss": 0.80056608, + "learning_rate": 0.0005858694085337976, + "loss": 0.81165302, + "num_input_tokens_seen": 200138464, + "router_z_loss_mlp": 0.35668945, + "step": 2400, + "time_per_iteration": 2.8272197246551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107355, + "balance_loss_mlp": 1.07090116, + "epoch": 0.46190842631781454, + "flos": 474476258304.0, + "grad_norm": 0.06783884534527172, + "language_loss": 0.83774948, + "learning_rate": 0.0005855624783095589, + "loss": 0.84882307, + "num_input_tokens_seen": 200205728, + "router_z_loss_mlp": 0.36425781, + "step": 2401, + "time_per_iteration": 2.6019625663757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102889, + "balance_loss_mlp": 1.06812799, + "epoch": 0.4621008080030781, + "flos": 437483109888.0, + "grad_norm": 0.05559222161472476, + "language_loss": 0.8541491, + "learning_rate": 0.00058525551486702, + "loss": 0.86517805, + "num_input_tokens_seen": 200269824, + "router_z_loss_mlp": 0.34790039, + "step": 2402, + "time_per_iteration": 2.5166754722595215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106757, + "balance_loss_mlp": 1.07058895, + "epoch": 0.46229318968834165, + "flos": 525461644800.0, + "grad_norm": 0.07030933336499708, + "language_loss": 0.80856764, + "learning_rate": 0.0005849485183253548, + "loss": 0.81963521, + "num_input_tokens_seen": 200341264, + "router_z_loss_mlp": 0.36206055, + "step": 2403, + "time_per_iteration": 2.6906049251556396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110586, + "balance_loss_mlp": 1.07090759, + "epoch": 0.46248557137360524, + "flos": 439622857728.0, + "grad_norm": 0.057304610397081915, + "language_loss": 0.87811077, + "learning_rate": 0.0005846414888037501, + "loss": 0.88916934, + "num_input_tokens_seen": 200405632, + "router_z_loss_mlp": 0.34960938, + "step": 2404, + "time_per_iteration": 2.488797426223755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102205, + "balance_loss_mlp": 1.06899309, + "epoch": 0.4626779530588688, + "flos": 617608447488.0, + "grad_norm": 0.05034114049250231, + "language_loss": 0.82261539, + "learning_rate": 0.0005843344264214049, + "loss": 0.83363742, + "num_input_tokens_seen": 200479312, + "router_z_loss_mlp": 0.33203125, + "step": 2405, + "time_per_iteration": 2.746372938156128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110347, + "balance_loss_mlp": 1.07068777, + "epoch": 0.46287033474413236, + "flos": 670108432896.0, + "grad_norm": 0.10060755415937467, + "language_loss": 0.85092008, + "learning_rate": 0.0005840273312975317, + "loss": 0.86195481, + "num_input_tokens_seen": 200552976, + "router_z_loss_mlp": 0.32788086, + "step": 2406, + "time_per_iteration": 2.834230661392212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112626, + "balance_loss_mlp": 1.07829416, + "epoch": 0.46306271642939595, + "flos": 480233276928.0, + "grad_norm": 0.06610522075480575, + "language_loss": 0.90376371, + "learning_rate": 0.0005837202035513555, + "loss": 0.91489005, + "num_input_tokens_seen": 200621088, + "router_z_loss_mlp": 0.34326172, + "step": 2407, + "time_per_iteration": 2.577099084854126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112101, + "balance_loss_mlp": 1.07693422, + "epoch": 0.4632550981146595, + "flos": 580686879744.0, + "grad_norm": 0.06799718927162632, + "language_loss": 0.81987119, + "learning_rate": 0.0005834130433021136, + "loss": 0.83099222, + "num_input_tokens_seen": 200698400, + "router_z_loss_mlp": 0.3515625, + "step": 2408, + "time_per_iteration": 2.751481771469116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111259, + "balance_loss_mlp": 1.07537687, + "epoch": 0.46344747979992307, + "flos": 523964298240.0, + "grad_norm": 0.07576984187058394, + "language_loss": 0.73707795, + "learning_rate": 0.0005831058506690563, + "loss": 0.74819058, + "num_input_tokens_seen": 200767264, + "router_z_loss_mlp": 0.359375, + "step": 2409, + "time_per_iteration": 2.6351587772369385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104428, + "balance_loss_mlp": 1.0719074, + "epoch": 0.4636398614851866, + "flos": 746501349888.0, + "grad_norm": 0.06066453040155937, + "language_loss": 0.86246306, + "learning_rate": 0.0005827986257714464, + "loss": 0.87350732, + "num_input_tokens_seen": 200841440, + "router_z_loss_mlp": 0.32519531, + "step": 2410, + "time_per_iteration": 2.9171712398529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106385, + "balance_loss_mlp": 1.07334006, + "epoch": 0.4638322431704502, + "flos": 596547224064.0, + "grad_norm": 0.05632663018450853, + "language_loss": 0.8897202, + "learning_rate": 0.0005824913687285591, + "loss": 0.90078408, + "num_input_tokens_seen": 200911296, + "router_z_loss_mlp": 0.33032227, + "step": 2411, + "time_per_iteration": 2.6863625049591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104253, + "balance_loss_mlp": 1.07056427, + "epoch": 0.4640246248557137, + "flos": 539443971072.0, + "grad_norm": 0.09102731097831396, + "language_loss": 0.81903768, + "learning_rate": 0.0005821840796596821, + "loss": 0.83008015, + "num_input_tokens_seen": 200981920, + "router_z_loss_mlp": 0.3371582, + "step": 2412, + "time_per_iteration": 2.658602714538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108642, + "balance_loss_mlp": 1.07605052, + "epoch": 0.4642170065409773, + "flos": 562625118720.0, + "grad_norm": 0.04905521047169809, + "language_loss": 0.8043226, + "learning_rate": 0.0005818767586841158, + "loss": 0.81540906, + "num_input_tokens_seen": 201059392, + "router_z_loss_mlp": 0.32592773, + "step": 2413, + "time_per_iteration": 2.7577285766601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108976, + "balance_loss_mlp": 1.07655096, + "epoch": 0.46440938822624084, + "flos": 530959131648.0, + "grad_norm": 0.06302213894221746, + "language_loss": 0.865412, + "learning_rate": 0.0005815694059211726, + "loss": 0.8765018, + "num_input_tokens_seen": 201130192, + "router_z_loss_mlp": 0.32421875, + "step": 2414, + "time_per_iteration": 2.6655328273773193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174358, + "balance_loss_mlp": 1.16362953, + "epoch": 0.4646017699115044, + "flos": 1526325700608.0, + "grad_norm": 0.06384975588330166, + "language_loss": 0.80873632, + "learning_rate": 0.0005812620214901778, + "loss": 0.82047987, + "num_input_tokens_seen": 201354720, + "router_z_loss_mlp": 0.10742188, + "step": 2415, + "time_per_iteration": 4.795905828475952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101653, + "balance_loss_mlp": 1.09135294, + "epoch": 0.464794151596768, + "flos": 1540831859712.0, + "grad_norm": 0.035706806463564576, + "language_loss": 0.7694506, + "learning_rate": 0.000580954605510468, + "loss": 0.78046715, + "num_input_tokens_seen": 201592096, + "router_z_loss_mlp": 0.10302734, + "step": 2416, + "time_per_iteration": 4.964730978012085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100793, + "balance_loss_mlp": 1.06910706, + "epoch": 0.46498653328203154, + "flos": 501467397120.0, + "grad_norm": 0.054161288123553565, + "language_loss": 0.8669647, + "learning_rate": 0.0005806471581013931, + "loss": 0.8779726, + "num_input_tokens_seen": 201666160, + "router_z_loss_mlp": 0.31640625, + "step": 2417, + "time_per_iteration": 2.7034828662872314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106596, + "balance_loss_mlp": 1.07221591, + "epoch": 0.46517891496729513, + "flos": 676144806912.0, + "grad_norm": 0.05684649238509572, + "language_loss": 0.78830767, + "learning_rate": 0.0005803396793823146, + "loss": 0.79937363, + "num_input_tokens_seen": 201733552, + "router_z_loss_mlp": 0.34375, + "step": 2418, + "time_per_iteration": 2.810929536819458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112917, + "balance_loss_mlp": 1.07848907, + "epoch": 0.46537129665255866, + "flos": 585351816192.0, + "grad_norm": 0.07858966703970842, + "language_loss": 0.86256903, + "learning_rate": 0.0005800321694726065, + "loss": 0.87369823, + "num_input_tokens_seen": 201806128, + "router_z_loss_mlp": 0.34423828, + "step": 2419, + "time_per_iteration": 2.797091484069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113087, + "balance_loss_mlp": 1.07880187, + "epoch": 0.46556367833782225, + "flos": 587704108032.0, + "grad_norm": 0.06627504844203173, + "language_loss": 0.86954433, + "learning_rate": 0.0005797246284916545, + "loss": 0.8806752, + "num_input_tokens_seen": 201874224, + "router_z_loss_mlp": 0.34277344, + "step": 2420, + "time_per_iteration": 2.689190149307251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103612, + "balance_loss_mlp": 1.09355068, + "epoch": 0.4657560600230858, + "flos": 1485453551616.0, + "grad_norm": 0.047662019725998206, + "language_loss": 0.77505189, + "learning_rate": 0.0005794170565588569, + "loss": 0.78608793, + "num_input_tokens_seen": 202111648, + "router_z_loss_mlp": 0.10058594, + "step": 2421, + "time_per_iteration": 6.38897705078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112649, + "balance_loss_mlp": 1.09318316, + "epoch": 0.46594844170834937, + "flos": 580247110656.0, + "grad_norm": 0.06710074217369558, + "language_loss": 0.88096154, + "learning_rate": 0.0005791094537936233, + "loss": 0.8922264, + "num_input_tokens_seen": 202183344, + "router_z_loss_mlp": 0.33325195, + "step": 2422, + "time_per_iteration": 4.209144353866577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126187, + "balance_loss_mlp": 1.09340453, + "epoch": 0.4661408233936129, + "flos": 512571400704.0, + "grad_norm": 0.0626199173608307, + "language_loss": 0.82125473, + "learning_rate": 0.0005788018203153762, + "loss": 0.83251661, + "num_input_tokens_seen": 202252512, + "router_z_loss_mlp": 0.32788086, + "step": 2423, + "time_per_iteration": 2.583918333053589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138161, + "balance_loss_mlp": 1.10540235, + "epoch": 0.4663332050788765, + "flos": 491077748736.0, + "grad_norm": 0.07666207610831233, + "language_loss": 0.85944337, + "learning_rate": 0.000578494156243549, + "loss": 0.87082505, + "num_input_tokens_seen": 202320096, + "router_z_loss_mlp": 0.32763672, + "step": 2424, + "time_per_iteration": 2.582838296890259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142028, + "balance_loss_mlp": 1.10779119, + "epoch": 0.4665255867641401, + "flos": 512623157760.0, + "grad_norm": 0.11745148991984863, + "language_loss": 0.89446878, + "learning_rate": 0.0005781864616975878, + "loss": 0.90588903, + "num_input_tokens_seen": 202391552, + "router_z_loss_mlp": 0.3425293, + "step": 2425, + "time_per_iteration": 2.6464650630950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135149, + "balance_loss_mlp": 1.10081649, + "epoch": 0.4667179684494036, + "flos": 424812750336.0, + "grad_norm": 0.07242740344873133, + "language_loss": 0.84278369, + "learning_rate": 0.0005778787367969502, + "loss": 0.85413516, + "num_input_tokens_seen": 202457328, + "router_z_loss_mlp": 0.34375, + "step": 2426, + "time_per_iteration": 2.5785605907440186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131581, + "balance_loss_mlp": 1.09822595, + "epoch": 0.4669103501346672, + "flos": 707956526592.0, + "grad_norm": 0.06251358549871673, + "language_loss": 0.81181312, + "learning_rate": 0.0005775709816611053, + "loss": 0.82312894, + "num_input_tokens_seen": 202535888, + "router_z_loss_mlp": 0.33374023, + "step": 2427, + "time_per_iteration": 2.9622879028320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125428, + "balance_loss_mlp": 1.09100056, + "epoch": 0.4671027318199307, + "flos": 554832239616.0, + "grad_norm": 0.06013841542134278, + "language_loss": 0.83607411, + "learning_rate": 0.0005772631964095346, + "loss": 0.84732836, + "num_input_tokens_seen": 202608400, + "router_z_loss_mlp": 0.34448242, + "step": 2428, + "time_per_iteration": 2.681161403656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123616, + "balance_loss_mlp": 1.08990407, + "epoch": 0.4672951135051943, + "flos": 567109817856.0, + "grad_norm": 0.05815575913312505, + "language_loss": 0.85975552, + "learning_rate": 0.000576955381161731, + "loss": 0.87099165, + "num_input_tokens_seen": 202677712, + "router_z_loss_mlp": 0.3371582, + "step": 2429, + "time_per_iteration": 2.670814275741577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122337, + "balance_loss_mlp": 1.08843446, + "epoch": 0.46748749519045785, + "flos": 424518713856.0, + "grad_norm": 0.07250877112671852, + "language_loss": 0.86541677, + "learning_rate": 0.0005766475360371985, + "loss": 0.8766402, + "num_input_tokens_seen": 202743824, + "router_z_loss_mlp": 0.33935547, + "step": 2430, + "time_per_iteration": 2.5907814502716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118824, + "balance_loss_mlp": 1.08368063, + "epoch": 0.46767987687572143, + "flos": 538344548352.0, + "grad_norm": 0.0946745942266809, + "language_loss": 0.84659714, + "learning_rate": 0.0005763396611554536, + "loss": 0.85778534, + "num_input_tokens_seen": 202813072, + "router_z_loss_mlp": 0.3515625, + "step": 2431, + "time_per_iteration": 2.679352045059204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123862, + "balance_loss_mlp": 1.0890286, + "epoch": 0.467872258560985, + "flos": 823702224384.0, + "grad_norm": 0.06880905442669231, + "language_loss": 0.80567783, + "learning_rate": 0.0005760317566360237, + "loss": 0.81691647, + "num_input_tokens_seen": 202886576, + "router_z_loss_mlp": 0.34838867, + "step": 2432, + "time_per_iteration": 3.0134341716766357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116239, + "balance_loss_mlp": 1.08090591, + "epoch": 0.46806464024624855, + "flos": 661663240704.0, + "grad_norm": 0.09211359876128772, + "language_loss": 0.85498667, + "learning_rate": 0.000575723822598448, + "loss": 0.86614907, + "num_input_tokens_seen": 202956736, + "router_z_loss_mlp": 0.35375977, + "step": 2433, + "time_per_iteration": 2.807387351989746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113775, + "balance_loss_mlp": 1.07882285, + "epoch": 0.46825702193151214, + "flos": 755700171264.0, + "grad_norm": 0.07984033993726149, + "language_loss": 0.81515086, + "learning_rate": 0.0005754158591622773, + "loss": 0.82628858, + "num_input_tokens_seen": 203036432, + "router_z_loss_mlp": 0.35009766, + "step": 2434, + "time_per_iteration": 2.9610190391540527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108856, + "balance_loss_mlp": 1.07335579, + "epoch": 0.4684494036167757, + "flos": 439393061376.0, + "grad_norm": 0.08173781127815187, + "language_loss": 0.83058012, + "learning_rate": 0.0005751078664470732, + "loss": 0.84166867, + "num_input_tokens_seen": 203101904, + "router_z_loss_mlp": 0.35522461, + "step": 2435, + "time_per_iteration": 2.5381393432617188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105873, + "balance_loss_mlp": 1.07125473, + "epoch": 0.46864178530203926, + "flos": 532706098176.0, + "grad_norm": 0.06625067078188727, + "language_loss": 0.86156499, + "learning_rate": 0.0005747998445724094, + "loss": 0.87262368, + "num_input_tokens_seen": 203170272, + "router_z_loss_mlp": 0.34643555, + "step": 2436, + "time_per_iteration": 2.5991244316101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110631, + "balance_loss_mlp": 1.0730263, + "epoch": 0.4688341669873028, + "flos": 576627268608.0, + "grad_norm": 0.06922366477490534, + "language_loss": 0.8967731, + "learning_rate": 0.0005744917936578707, + "loss": 0.90783614, + "num_input_tokens_seen": 203243920, + "router_z_loss_mlp": 0.33276367, + "step": 2437, + "time_per_iteration": 2.7876076698303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110478, + "balance_loss_mlp": 1.07087731, + "epoch": 0.4690265486725664, + "flos": 539579791872.0, + "grad_norm": 0.05346939801811538, + "language_loss": 0.83987176, + "learning_rate": 0.0005741837138230526, + "loss": 0.8509196, + "num_input_tokens_seen": 203321760, + "router_z_loss_mlp": 0.33911133, + "step": 2438, + "time_per_iteration": 2.7089829444885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110533, + "balance_loss_mlp": 1.07063985, + "epoch": 0.4692189303578299, + "flos": 770510278656.0, + "grad_norm": 0.06113216144822436, + "language_loss": 0.8632471, + "learning_rate": 0.0005738756051875627, + "loss": 0.87430036, + "num_input_tokens_seen": 203409088, + "router_z_loss_mlp": 0.34692383, + "step": 2439, + "time_per_iteration": 3.10072922706604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106031, + "balance_loss_mlp": 1.07031631, + "epoch": 0.4694113120430935, + "flos": 571396654080.0, + "grad_norm": 0.054040954813727636, + "language_loss": 0.83196378, + "learning_rate": 0.0005735674678710192, + "loss": 0.84302408, + "num_input_tokens_seen": 203481680, + "router_z_loss_mlp": 0.35668945, + "step": 2440, + "time_per_iteration": 2.6844449043273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101989, + "balance_loss_mlp": 1.06644058, + "epoch": 0.4696036937283571, + "flos": 748816565760.0, + "grad_norm": 0.06378034204188901, + "language_loss": 0.81315678, + "learning_rate": 0.0005732593019930517, + "loss": 0.82417667, + "num_input_tokens_seen": 203554848, + "router_z_loss_mlp": 0.35571289, + "step": 2441, + "time_per_iteration": 2.8945391178131104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113581, + "balance_loss_mlp": 1.0766257, + "epoch": 0.4697960754136206, + "flos": 493454633472.0, + "grad_norm": 0.0589509513637404, + "language_loss": 0.88047123, + "learning_rate": 0.0005729511076733008, + "loss": 0.89160711, + "num_input_tokens_seen": 203624816, + "router_z_loss_mlp": 0.36962891, + "step": 2442, + "time_per_iteration": 2.6688244342803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119163, + "balance_loss_mlp": 1.08199334, + "epoch": 0.4699884570988842, + "flos": 725118925824.0, + "grad_norm": 0.06849073497169517, + "language_loss": 0.84747314, + "learning_rate": 0.000572642885031418, + "loss": 0.85866475, + "num_input_tokens_seen": 203698256, + "router_z_loss_mlp": 0.37207031, + "step": 2443, + "time_per_iteration": 2.9179134368896484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108245, + "balance_loss_mlp": 1.07276881, + "epoch": 0.47018083878414774, + "flos": 555427653120.0, + "grad_norm": 0.0584848920178353, + "language_loss": 0.80748844, + "learning_rate": 0.0005723346341870662, + "loss": 0.81857085, + "num_input_tokens_seen": 203772672, + "router_z_loss_mlp": 0.35522461, + "step": 2444, + "time_per_iteration": 2.701399087905884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129757, + "balance_loss_mlp": 1.09277797, + "epoch": 0.4703732204694113, + "flos": 424069032960.0, + "grad_norm": 0.11865712100152984, + "language_loss": 0.86692929, + "learning_rate": 0.0005720263552599188, + "loss": 0.87822688, + "num_input_tokens_seen": 203835904, + "router_z_loss_mlp": 0.36962891, + "step": 2445, + "time_per_iteration": 2.4486730098724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121458, + "balance_loss_mlp": 1.08500421, + "epoch": 0.47056560215467486, + "flos": 703494222336.0, + "grad_norm": 0.08366602087356424, + "language_loss": 0.79955238, + "learning_rate": 0.0005717180483696604, + "loss": 0.81076699, + "num_input_tokens_seen": 203914704, + "router_z_loss_mlp": 0.36499023, + "step": 2446, + "time_per_iteration": 2.8785839080810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120985, + "balance_loss_mlp": 1.08486462, + "epoch": 0.47075798383993844, + "flos": 554963291136.0, + "grad_norm": 0.0682417361382486, + "language_loss": 0.83352333, + "learning_rate": 0.0005714097136359862, + "loss": 0.84473318, + "num_input_tokens_seen": 203985072, + "router_z_loss_mlp": 0.36157227, + "step": 2447, + "time_per_iteration": 2.6363351345062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118201, + "balance_loss_mlp": 1.08296275, + "epoch": 0.470950365525202, + "flos": 564305273856.0, + "grad_norm": 0.051381339811927676, + "language_loss": 0.86498094, + "learning_rate": 0.0005711013511786027, + "loss": 0.87616301, + "num_input_tokens_seen": 204061904, + "router_z_loss_mlp": 0.35253906, + "step": 2448, + "time_per_iteration": 2.762845993041992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111685, + "balance_loss_mlp": 1.08170676, + "epoch": 0.47114274721046556, + "flos": 534450493440.0, + "grad_norm": 0.058854412729412026, + "language_loss": 0.84082228, + "learning_rate": 0.0005707929611172263, + "loss": 0.85199082, + "num_input_tokens_seen": 204137392, + "router_z_loss_mlp": 0.3515625, + "step": 2449, + "time_per_iteration": 2.7246243953704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115864, + "balance_loss_mlp": 1.08007717, + "epoch": 0.47133512889572915, + "flos": 473117303808.0, + "grad_norm": 0.11039935923903105, + "language_loss": 0.84227139, + "learning_rate": 0.000570484543571585, + "loss": 0.85343003, + "num_input_tokens_seen": 204202752, + "router_z_loss_mlp": 0.35791016, + "step": 2450, + "time_per_iteration": 2.610919237136841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113904, + "balance_loss_mlp": 1.0777123, + "epoch": 0.4715275105809927, + "flos": 459013837824.0, + "grad_norm": 0.0667594391398321, + "language_loss": 0.82813287, + "learning_rate": 0.0005701760986614171, + "loss": 0.8392719, + "num_input_tokens_seen": 204266960, + "router_z_loss_mlp": 0.36181641, + "step": 2451, + "time_per_iteration": 2.5151522159576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120641, + "balance_loss_mlp": 1.08590317, + "epoch": 0.47171989226625627, + "flos": 422003437056.0, + "grad_norm": 0.0603467987943219, + "language_loss": 0.87650943, + "learning_rate": 0.0005698676265064714, + "loss": 0.88771582, + "num_input_tokens_seen": 204331216, + "router_z_loss_mlp": 0.34765625, + "step": 2452, + "time_per_iteration": 2.5722150802612305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114487, + "balance_loss_mlp": 1.07920074, + "epoch": 0.4719122739515198, + "flos": 457434998784.0, + "grad_norm": 0.07549274937771847, + "language_loss": 0.89053345, + "learning_rate": 0.0005695591272265074, + "loss": 0.90167832, + "num_input_tokens_seen": 204397216, + "router_z_loss_mlp": 0.35327148, + "step": 2453, + "time_per_iteration": 2.5431923866271973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109778, + "balance_loss_mlp": 1.07384801, + "epoch": 0.4721046556367834, + "flos": 514975449600.0, + "grad_norm": 0.05406998074400625, + "language_loss": 0.82143486, + "learning_rate": 0.0005692506009412954, + "loss": 0.83253264, + "num_input_tokens_seen": 204469952, + "router_z_loss_mlp": 0.359375, + "step": 2454, + "time_per_iteration": 2.6976101398468018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176153, + "balance_loss_mlp": 1.16375494, + "epoch": 0.4722970373220469, + "flos": 1572258138624.0, + "grad_norm": 0.047894752053778404, + "language_loss": 0.7755127, + "learning_rate": 0.0005689420477706156, + "loss": 0.78727424, + "num_input_tokens_seen": 204701152, + "router_z_loss_mlp": 0.12402344, + "step": 2455, + "time_per_iteration": 5.006427049636841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103739, + "balance_loss_mlp": 1.07000232, + "epoch": 0.4724894190073105, + "flos": 586214102016.0, + "grad_norm": 0.07748007609747588, + "language_loss": 0.89838475, + "learning_rate": 0.0005686334678342593, + "loss": 0.90942216, + "num_input_tokens_seen": 204778144, + "router_z_loss_mlp": 0.3371582, + "step": 2456, + "time_per_iteration": 2.88089919090271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110083, + "balance_loss_mlp": 1.07586968, + "epoch": 0.4726818006925741, + "flos": 867645789696.0, + "grad_norm": 0.053591450648947214, + "language_loss": 0.81747675, + "learning_rate": 0.0005683248612520274, + "loss": 0.82857764, + "num_input_tokens_seen": 204853376, + "router_z_loss_mlp": 0.34204102, + "step": 2457, + "time_per_iteration": 3.0411272048950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111421, + "balance_loss_mlp": 1.07811391, + "epoch": 0.4728741823778376, + "flos": 752967581184.0, + "grad_norm": 0.10239407628225645, + "language_loss": 0.84273934, + "learning_rate": 0.0005680162281437321, + "loss": 0.85388148, + "num_input_tokens_seen": 204925280, + "router_z_loss_mlp": 0.36083984, + "step": 2458, + "time_per_iteration": 2.8898301124572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120752, + "balance_loss_mlp": 1.08608592, + "epoch": 0.4730665640631012, + "flos": 538571773440.0, + "grad_norm": 0.0555075738071769, + "language_loss": 0.85104299, + "learning_rate": 0.000567707568629195, + "loss": 0.86225057, + "num_input_tokens_seen": 205000592, + "router_z_loss_mlp": 0.34692383, + "step": 2459, + "time_per_iteration": 2.706040143966675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122742, + "balance_loss_mlp": 1.08778977, + "epoch": 0.47325894574836475, + "flos": 491653338624.0, + "grad_norm": 0.06127780861136823, + "language_loss": 0.82619834, + "learning_rate": 0.0005673988828282486, + "loss": 0.83742571, + "num_input_tokens_seen": 205073968, + "router_z_loss_mlp": 0.34985352, + "step": 2460, + "time_per_iteration": 2.674525499343872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111568, + "balance_loss_mlp": 1.07668757, + "epoch": 0.47345132743362833, + "flos": 764459223552.0, + "grad_norm": 0.05574274236604154, + "language_loss": 0.81308633, + "learning_rate": 0.0005670901708607352, + "loss": 0.82420194, + "num_input_tokens_seen": 205153536, + "router_z_loss_mlp": 0.34912109, + "step": 2461, + "time_per_iteration": 2.982827663421631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109707, + "balance_loss_mlp": 1.0753746, + "epoch": 0.47364370911889186, + "flos": 540173007360.0, + "grad_norm": 0.15434207723638854, + "language_loss": 0.84411561, + "learning_rate": 0.0005667814328465076, + "loss": 0.85521269, + "num_input_tokens_seen": 205220944, + "router_z_loss_mlp": 0.34350586, + "step": 2462, + "time_per_iteration": 2.639051914215088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106499, + "balance_loss_mlp": 1.07245243, + "epoch": 0.47383609080415545, + "flos": 406219815936.0, + "grad_norm": 0.07072772635633937, + "language_loss": 0.81988347, + "learning_rate": 0.0005664726689054285, + "loss": 0.83094847, + "num_input_tokens_seen": 205282688, + "router_z_loss_mlp": 0.34033203, + "step": 2463, + "time_per_iteration": 2.4655356407165527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112663, + "balance_loss_mlp": 1.07973766, + "epoch": 0.474028472489419, + "flos": 453476703744.0, + "grad_norm": 0.06987107232693553, + "language_loss": 0.8107388, + "learning_rate": 0.0005661638791573704, + "loss": 0.82186544, + "num_input_tokens_seen": 205357360, + "router_z_loss_mlp": 0.32958984, + "step": 2464, + "time_per_iteration": 2.7433135509490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111686, + "balance_loss_mlp": 1.07742512, + "epoch": 0.47422085417468257, + "flos": 492177171456.0, + "grad_norm": 0.060845328276789123, + "language_loss": 0.87247777, + "learning_rate": 0.0005658550637222164, + "loss": 0.88359463, + "num_input_tokens_seen": 205424352, + "router_z_loss_mlp": 0.34277344, + "step": 2465, + "time_per_iteration": 2.615755558013916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113074, + "balance_loss_mlp": 1.07762074, + "epoch": 0.47441323585994616, + "flos": 738854203392.0, + "grad_norm": 0.05153784391367151, + "language_loss": 0.82349539, + "learning_rate": 0.0005655462227198592, + "loss": 0.83462608, + "num_input_tokens_seen": 205502912, + "router_z_loss_mlp": 0.35473633, + "step": 2466, + "time_per_iteration": 2.91003680229187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109891, + "balance_loss_mlp": 1.07460487, + "epoch": 0.4746056175452097, + "flos": 484685669376.0, + "grad_norm": 0.055186067432112955, + "language_loss": 0.84493053, + "learning_rate": 0.0005652373562702016, + "loss": 0.85602945, + "num_input_tokens_seen": 205571168, + "router_z_loss_mlp": 0.3527832, + "step": 2467, + "time_per_iteration": 2.6209630966186523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117567, + "balance_loss_mlp": 1.07982516, + "epoch": 0.4747979992304733, + "flos": 461052269568.0, + "grad_norm": 0.06952200013405305, + "language_loss": 0.88642848, + "learning_rate": 0.000564928464493156, + "loss": 0.89760423, + "num_input_tokens_seen": 205639648, + "router_z_loss_mlp": 0.37744141, + "step": 2468, + "time_per_iteration": 2.609154224395752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117938, + "balance_loss_mlp": 1.0807451, + "epoch": 0.4749903809157368, + "flos": 864431212032.0, + "grad_norm": 0.05705018138682977, + "language_loss": 0.81856024, + "learning_rate": 0.000564619547508645, + "loss": 0.82973957, + "num_input_tokens_seen": 205721536, + "router_z_loss_mlp": 0.37158203, + "step": 2469, + "time_per_iteration": 3.041351556777954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117314, + "balance_loss_mlp": 1.07849944, + "epoch": 0.4751827626010004, + "flos": 505546831872.0, + "grad_norm": 0.08036472839994792, + "language_loss": 0.83256048, + "learning_rate": 0.0005643106054366008, + "loss": 0.84373355, + "num_input_tokens_seen": 205788512, + "router_z_loss_mlp": 0.38818359, + "step": 2470, + "time_per_iteration": 2.5631182193756104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108966, + "balance_loss_mlp": 1.07258332, + "epoch": 0.47537514428626393, + "flos": 559388519424.0, + "grad_norm": 0.05805518051262763, + "language_loss": 0.79916292, + "learning_rate": 0.000564001638396965, + "loss": 0.81025255, + "num_input_tokens_seen": 205863104, + "router_z_loss_mlp": 0.36376953, + "step": 2471, + "time_per_iteration": 2.7381579875946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110653, + "balance_loss_mlp": 1.0717926, + "epoch": 0.4755675259715275, + "flos": 834260000256.0, + "grad_norm": 0.0665112346682766, + "language_loss": 0.82313401, + "learning_rate": 0.0005636926465096897, + "loss": 0.83419931, + "num_input_tokens_seen": 205940688, + "router_z_loss_mlp": 0.34741211, + "step": 2472, + "time_per_iteration": 3.0346837043762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111103, + "balance_loss_mlp": 1.07622218, + "epoch": 0.47575990765679105, + "flos": 508237576704.0, + "grad_norm": 0.06532220540392095, + "language_loss": 0.87808621, + "learning_rate": 0.0005633836298947363, + "loss": 0.88919711, + "num_input_tokens_seen": 206008352, + "router_z_loss_mlp": 0.34912109, + "step": 2473, + "time_per_iteration": 2.587581157684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122307, + "balance_loss_mlp": 1.08716393, + "epoch": 0.47595228934205464, + "flos": 591845211648.0, + "grad_norm": 0.09099011339346055, + "language_loss": 0.70947754, + "learning_rate": 0.000563074588672075, + "loss": 0.72070062, + "num_input_tokens_seen": 206078240, + "router_z_loss_mlp": 0.3515625, + "step": 2474, + "time_per_iteration": 2.7112982273101807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125507, + "balance_loss_mlp": 1.09012604, + "epoch": 0.4761446710273182, + "flos": 580607958528.0, + "grad_norm": 0.06360669353624634, + "language_loss": 0.85420531, + "learning_rate": 0.0005627655229616868, + "loss": 0.8654604, + "num_input_tokens_seen": 206148896, + "router_z_loss_mlp": 0.35400391, + "step": 2475, + "time_per_iteration": 2.7166192531585693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131445, + "balance_loss_mlp": 1.09532499, + "epoch": 0.47633705271258175, + "flos": 672893153280.0, + "grad_norm": 0.05566651470752815, + "language_loss": 0.90219474, + "learning_rate": 0.0005624564328835616, + "loss": 0.91350919, + "num_input_tokens_seen": 206223792, + "router_z_loss_mlp": 0.36132812, + "step": 2476, + "time_per_iteration": 2.8342158794403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119281, + "balance_loss_mlp": 1.08339906, + "epoch": 0.47652943439784534, + "flos": 541857931776.0, + "grad_norm": 0.06751222051526788, + "language_loss": 0.8450973, + "learning_rate": 0.0005621473185576986, + "loss": 0.85629016, + "num_input_tokens_seen": 206299376, + "router_z_loss_mlp": 0.35913086, + "step": 2477, + "time_per_iteration": 2.727320432662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126891, + "balance_loss_mlp": 1.0915097, + "epoch": 0.4767218160831089, + "flos": 524819243520.0, + "grad_norm": 0.06498777385437565, + "language_loss": 0.87181318, + "learning_rate": 0.0005618381801041068, + "loss": 0.88308215, + "num_input_tokens_seen": 206367936, + "router_z_loss_mlp": 0.35400391, + "step": 2478, + "time_per_iteration": 2.622197389602661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136775, + "balance_loss_mlp": 1.09965336, + "epoch": 0.47691419776837246, + "flos": 568056167424.0, + "grad_norm": 0.0693017023966873, + "language_loss": 0.83176625, + "learning_rate": 0.0005615290176428044, + "loss": 0.84313405, + "num_input_tokens_seen": 206438864, + "router_z_loss_mlp": 0.37084961, + "step": 2479, + "time_per_iteration": 2.6874895095825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137347, + "balance_loss_mlp": 1.10275292, + "epoch": 0.477106579453636, + "flos": 530931967488.0, + "grad_norm": 0.06633902685884922, + "language_loss": 0.85015559, + "learning_rate": 0.0005612198312938187, + "loss": 0.86152905, + "num_input_tokens_seen": 206516656, + "router_z_loss_mlp": 0.34619141, + "step": 2480, + "time_per_iteration": 2.7283356189727783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143197, + "balance_loss_mlp": 1.10717165, + "epoch": 0.4772989611388996, + "flos": 594283765248.0, + "grad_norm": 0.08700724997250119, + "language_loss": 0.79558903, + "learning_rate": 0.0005609106211771868, + "loss": 0.80702102, + "num_input_tokens_seen": 206595040, + "router_z_loss_mlp": 0.36035156, + "step": 2481, + "time_per_iteration": 2.8008668422698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155105, + "balance_loss_mlp": 1.11857891, + "epoch": 0.4774913428241631, + "flos": 544622828544.0, + "grad_norm": 0.07115217474866456, + "language_loss": 0.89249581, + "learning_rate": 0.0005606013874129543, + "loss": 0.90404689, + "num_input_tokens_seen": 206670192, + "router_z_loss_mlp": 0.36523438, + "step": 2482, + "time_per_iteration": 2.746906280517578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146439, + "balance_loss_mlp": 1.11027122, + "epoch": 0.4776837245094267, + "flos": 540079031808.0, + "grad_norm": 0.052135079835272054, + "language_loss": 0.80459106, + "learning_rate": 0.0005602921301211768, + "loss": 0.81605548, + "num_input_tokens_seen": 206746992, + "router_z_loss_mlp": 0.36181641, + "step": 2483, + "time_per_iteration": 2.760091543197632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133668, + "balance_loss_mlp": 1.09895456, + "epoch": 0.4778761061946903, + "flos": 471785513472.0, + "grad_norm": 0.06775745953777351, + "language_loss": 0.82220864, + "learning_rate": 0.0005599828494219185, + "loss": 0.83354533, + "num_input_tokens_seen": 206813584, + "router_z_loss_mlp": 0.34716797, + "step": 2484, + "time_per_iteration": 2.5458662509918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113545, + "balance_loss_mlp": 1.10004473, + "epoch": 0.4780684878799538, + "flos": 726082527744.0, + "grad_norm": 0.08200141457856946, + "language_loss": 0.89550984, + "learning_rate": 0.0005596735454352527, + "loss": 0.90686429, + "num_input_tokens_seen": 206885840, + "router_z_loss_mlp": 0.35424805, + "step": 2485, + "time_per_iteration": 2.8570785522460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143886, + "balance_loss_mlp": 1.1075511, + "epoch": 0.4782608695652174, + "flos": 548922147840.0, + "grad_norm": 0.07792091337932193, + "language_loss": 0.85635722, + "learning_rate": 0.0005593642182812619, + "loss": 0.86779606, + "num_input_tokens_seen": 206955104, + "router_z_loss_mlp": 0.36352539, + "step": 2486, + "time_per_iteration": 2.630213975906372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139867, + "balance_loss_mlp": 1.10369921, + "epoch": 0.47845325125048094, + "flos": 829923604992.0, + "grad_norm": 0.06102595686098437, + "language_loss": 0.83692348, + "learning_rate": 0.0005590548680800378, + "loss": 0.84832209, + "num_input_tokens_seen": 207039792, + "router_z_loss_mlp": 0.36206055, + "step": 2487, + "time_per_iteration": 3.1342179775238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139371, + "balance_loss_mlp": 1.10389483, + "epoch": 0.4786456329357445, + "flos": 514164920832.0, + "grad_norm": 0.0657277256500081, + "language_loss": 0.76383913, + "learning_rate": 0.0005587454949516804, + "loss": 0.77523285, + "num_input_tokens_seen": 207115632, + "router_z_loss_mlp": 0.35498047, + "step": 2488, + "time_per_iteration": 2.6958112716674805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145548, + "balance_loss_mlp": 1.10833097, + "epoch": 0.47883801462100806, + "flos": 564658781184.0, + "grad_norm": 0.061160167550216256, + "language_loss": 0.88185161, + "learning_rate": 0.0005584360990162993, + "loss": 0.89330709, + "num_input_tokens_seen": 207184336, + "router_z_loss_mlp": 0.37255859, + "step": 2489, + "time_per_iteration": 2.61667537689209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133811, + "balance_loss_mlp": 1.09881115, + "epoch": 0.47903039630627164, + "flos": 579577545216.0, + "grad_norm": 0.0507120714137282, + "language_loss": 0.85551566, + "learning_rate": 0.0005581266803940124, + "loss": 0.86685371, + "num_input_tokens_seen": 207258720, + "router_z_loss_mlp": 0.35009766, + "step": 2490, + "time_per_iteration": 2.7139766216278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133471, + "balance_loss_mlp": 1.09649253, + "epoch": 0.47922277799153523, + "flos": 618950149632.0, + "grad_norm": 0.0583035541715914, + "language_loss": 0.87154239, + "learning_rate": 0.0005578172392049471, + "loss": 0.88287711, + "num_input_tokens_seen": 207329216, + "router_z_loss_mlp": 0.36987305, + "step": 2491, + "time_per_iteration": 2.7481577396392822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134521, + "balance_loss_mlp": 1.09918737, + "epoch": 0.47941515967679876, + "flos": 639653096448.0, + "grad_norm": 0.08141144255217014, + "language_loss": 0.84311044, + "learning_rate": 0.0005575077755692386, + "loss": 0.85445559, + "num_input_tokens_seen": 207403712, + "router_z_loss_mlp": 0.35351562, + "step": 2492, + "time_per_iteration": 2.7962934970855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132576, + "balance_loss_mlp": 1.09793389, + "epoch": 0.47960754136206235, + "flos": 519823194624.0, + "grad_norm": 0.053456927876726165, + "language_loss": 0.86199152, + "learning_rate": 0.0005571982896070316, + "loss": 0.87331724, + "num_input_tokens_seen": 207477120, + "router_z_loss_mlp": 0.34692383, + "step": 2493, + "time_per_iteration": 2.6755988597869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131371, + "balance_loss_mlp": 1.09534633, + "epoch": 0.4797999230473259, + "flos": 475044507648.0, + "grad_norm": 0.059320296473078654, + "language_loss": 0.89793247, + "learning_rate": 0.0005568887814384792, + "loss": 0.90924621, + "num_input_tokens_seen": 207544592, + "router_z_loss_mlp": 0.36035156, + "step": 2494, + "time_per_iteration": 2.5790021419525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139931, + "balance_loss_mlp": 1.1042639, + "epoch": 0.47999230473258947, + "flos": 532026620928.0, + "grad_norm": 0.061123462827233396, + "language_loss": 0.87048668, + "learning_rate": 0.000556579251183743, + "loss": 0.88188601, + "num_input_tokens_seen": 207613808, + "router_z_loss_mlp": 0.35693359, + "step": 2495, + "time_per_iteration": 2.6916205883026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134229, + "balance_loss_mlp": 1.0992769, + "epoch": 0.480184686417853, + "flos": 601486373376.0, + "grad_norm": 0.05789705924573782, + "language_loss": 0.80256224, + "learning_rate": 0.0005562696989629936, + "loss": 0.81390452, + "num_input_tokens_seen": 207684464, + "router_z_loss_mlp": 0.34960938, + "step": 2496, + "time_per_iteration": 2.690638542175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133544, + "balance_loss_mlp": 1.0990684, + "epoch": 0.4803770681031166, + "flos": 528196806144.0, + "grad_norm": 0.06114364023526716, + "language_loss": 0.82642174, + "learning_rate": 0.0005559601248964095, + "loss": 0.83775711, + "num_input_tokens_seen": 207754016, + "router_z_loss_mlp": 0.34521484, + "step": 2497, + "time_per_iteration": 2.6249618530273438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135944, + "balance_loss_mlp": 1.10249412, + "epoch": 0.4805694497883801, + "flos": 511192622592.0, + "grad_norm": 0.06899971908711858, + "language_loss": 0.85956562, + "learning_rate": 0.0005556505291041783, + "loss": 0.87092507, + "num_input_tokens_seen": 207827104, + "router_z_loss_mlp": 0.33447266, + "step": 2498, + "time_per_iteration": 2.7098748683929443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135161, + "balance_loss_mlp": 1.10097158, + "epoch": 0.4807618314736437, + "flos": 600342160896.0, + "grad_norm": 0.055207166893370456, + "language_loss": 0.84689957, + "learning_rate": 0.0005553409117064954, + "loss": 0.85825121, + "num_input_tokens_seen": 207907824, + "router_z_loss_mlp": 0.34228516, + "step": 2499, + "time_per_iteration": 2.8708267211914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136874, + "balance_loss_mlp": 1.10242295, + "epoch": 0.4809542131589073, + "flos": 568965441024.0, + "grad_norm": 0.06687134527330599, + "language_loss": 0.8476308, + "learning_rate": 0.0005550312728235654, + "loss": 0.85899949, + "num_input_tokens_seen": 207975632, + "router_z_loss_mlp": 0.34448242, + "step": 2500, + "time_per_iteration": 2.6980721950531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128863, + "balance_loss_mlp": 1.09500802, + "epoch": 0.4811465948441708, + "flos": 575994779136.0, + "grad_norm": 0.07829313389837793, + "language_loss": 0.83860761, + "learning_rate": 0.0005547216125756003, + "loss": 0.84989619, + "num_input_tokens_seen": 208048000, + "router_z_loss_mlp": 0.33862305, + "step": 2501, + "time_per_iteration": 2.737539291381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140174, + "balance_loss_mlp": 1.10729611, + "epoch": 0.4813389765294344, + "flos": 823865209344.0, + "grad_norm": 0.06644553638954338, + "language_loss": 0.82266629, + "learning_rate": 0.0005544119310828211, + "loss": 0.83406806, + "num_input_tokens_seen": 208132592, + "router_z_loss_mlp": 0.32885742, + "step": 2502, + "time_per_iteration": 3.082392930984497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125836, + "balance_loss_mlp": 1.09245706, + "epoch": 0.48153135821469795, + "flos": 635531816448.0, + "grad_norm": 0.061244964440333595, + "language_loss": 0.85365945, + "learning_rate": 0.0005541022284654568, + "loss": 0.86491781, + "num_input_tokens_seen": 208215824, + "router_z_loss_mlp": 0.33398438, + "step": 2503, + "time_per_iteration": 2.9372761249542236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125034, + "balance_loss_mlp": 1.09189391, + "epoch": 0.48172373989996153, + "flos": 503701120512.0, + "grad_norm": 0.06168262746563105, + "language_loss": 0.84156048, + "learning_rate": 0.0005537925048437446, + "loss": 0.8528108, + "num_input_tokens_seen": 208284304, + "router_z_loss_mlp": 0.33154297, + "step": 2504, + "time_per_iteration": 2.589538097381592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052506, + "balance_loss_mlp": 1.04296899, + "epoch": 0.48191612158522507, + "flos": 1532362074624.0, + "grad_norm": 0.02361726537833674, + "language_loss": 0.75751472, + "learning_rate": 0.00055348276033793, + "loss": 0.7680397, + "num_input_tokens_seen": 208510224, + "router_z_loss_mlp": 0.09521484, + "step": 2505, + "time_per_iteration": 4.908772230148315 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111219, + "balance_loss_mlp": 1.07819104, + "epoch": 0.48210850327048865, + "flos": 702424161792.0, + "grad_norm": 0.056356974386017084, + "language_loss": 0.88423991, + "learning_rate": 0.0005531729950682664, + "loss": 0.89536178, + "num_input_tokens_seen": 208596816, + "router_z_loss_mlp": 0.34008789, + "step": 2506, + "time_per_iteration": 3.003096580505371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108061, + "balance_loss_mlp": 1.0739913, + "epoch": 0.4823008849557522, + "flos": 439778502144.0, + "grad_norm": 0.08388532833554185, + "language_loss": 0.85083711, + "learning_rate": 0.000552863209155015, + "loss": 0.86191773, + "num_input_tokens_seen": 208659616, + "router_z_loss_mlp": 0.34082031, + "step": 2507, + "time_per_iteration": 2.511463165283203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106022, + "balance_loss_mlp": 1.07331145, + "epoch": 0.48249326664101577, + "flos": 471859665408.0, + "grad_norm": 0.05856414722035687, + "language_loss": 0.82348502, + "learning_rate": 0.0005525534027184461, + "loss": 0.83454525, + "num_input_tokens_seen": 208728080, + "router_z_loss_mlp": 0.32714844, + "step": 2508, + "time_per_iteration": 2.6477487087249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102119, + "balance_loss_mlp": 1.06993294, + "epoch": 0.48268564832627936, + "flos": 563225674752.0, + "grad_norm": 0.054304228935087996, + "language_loss": 0.83357495, + "learning_rate": 0.0005522435758788365, + "loss": 0.84459615, + "num_input_tokens_seen": 208803376, + "router_z_loss_mlp": 0.32177734, + "step": 2509, + "time_per_iteration": 2.715082883834839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102008, + "balance_loss_mlp": 1.06741309, + "epoch": 0.4828780300115429, + "flos": 629606670336.0, + "grad_norm": 0.07316920081788965, + "language_loss": 0.80354846, + "learning_rate": 0.0005519337287564721, + "loss": 0.81456852, + "num_input_tokens_seen": 208876656, + "router_z_loss_mlp": 0.34594727, + "step": 2510, + "time_per_iteration": 2.8216280937194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103913, + "balance_loss_mlp": 1.07225132, + "epoch": 0.4830704116968065, + "flos": 631850305536.0, + "grad_norm": 0.07052632360826482, + "language_loss": 0.83703697, + "learning_rate": 0.000551623861471646, + "loss": 0.84807611, + "num_input_tokens_seen": 208950224, + "router_z_loss_mlp": 0.31640625, + "step": 2511, + "time_per_iteration": 2.7521867752075195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028829, + "balance_loss_mlp": 1.01886296, + "epoch": 0.48326279338207, + "flos": 1569268588032.0, + "grad_norm": 0.02307493847576384, + "language_loss": 0.78818834, + "learning_rate": 0.0005513139741446594, + "loss": 0.79847658, + "num_input_tokens_seen": 209173984, + "router_z_loss_mlp": 0.09960938, + "step": 2512, + "time_per_iteration": 4.850410461425781 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110192, + "balance_loss_mlp": 1.06877947, + "epoch": 0.4834551750673336, + "flos": 509238254592.0, + "grad_norm": 0.060960940408773784, + "language_loss": 0.86943817, + "learning_rate": 0.0005510040668958211, + "loss": 0.88045734, + "num_input_tokens_seen": 209242832, + "router_z_loss_mlp": 0.33154297, + "step": 2513, + "time_per_iteration": 2.5581674575805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011471, + "balance_loss_mlp": 1.00145698, + "epoch": 0.48364755675259713, + "flos": 1528663311360.0, + "grad_norm": 0.01573295897448314, + "language_loss": 0.77760583, + "learning_rate": 0.0005506941398454483, + "loss": 0.78772056, + "num_input_tokens_seen": 209473520, + "router_z_loss_mlp": 0.10009766, + "step": 2514, + "time_per_iteration": 4.821207523345947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101449, + "balance_loss_mlp": 1.06876206, + "epoch": 0.4838399384378607, + "flos": 564989893632.0, + "grad_norm": 0.06635931409503217, + "language_loss": 0.8316704, + "learning_rate": 0.0005503841931138645, + "loss": 0.84268492, + "num_input_tokens_seen": 209544208, + "router_z_loss_mlp": 0.3269043, + "step": 2515, + "time_per_iteration": 2.6826930046081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109492, + "balance_loss_mlp": 1.06247151, + "epoch": 0.4840323201231243, + "flos": 387691121664.0, + "grad_norm": 0.07963111819885421, + "language_loss": 0.81975293, + "learning_rate": 0.0005500742268214025, + "loss": 0.83070219, + "num_input_tokens_seen": 209607408, + "router_z_loss_mlp": 0.32446289, + "step": 2516, + "time_per_iteration": 2.4913811683654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109084, + "balance_loss_mlp": 1.07763672, + "epoch": 0.48422470180838784, + "flos": 630995360256.0, + "grad_norm": 0.057140457991015275, + "language_loss": 0.85559756, + "learning_rate": 0.0005497642410884014, + "loss": 0.86668837, + "num_input_tokens_seen": 209683392, + "router_z_loss_mlp": 0.31420898, + "step": 2517, + "time_per_iteration": 2.7807135581970215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101598, + "balance_loss_mlp": 1.06855321, + "epoch": 0.4844170834936514, + "flos": 499226333184.0, + "grad_norm": 0.05176470538316484, + "language_loss": 0.85257566, + "learning_rate": 0.0005494542360352085, + "loss": 0.86359167, + "num_input_tokens_seen": 209753184, + "router_z_loss_mlp": 0.33056641, + "step": 2518, + "time_per_iteration": 2.653507947921753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115114, + "balance_loss_mlp": 1.08285642, + "epoch": 0.48460946517891496, + "flos": 551076576768.0, + "grad_norm": 0.0599313447084905, + "language_loss": 0.85512084, + "learning_rate": 0.0005491442117821783, + "loss": 0.86627203, + "num_input_tokens_seen": 209829568, + "router_z_loss_mlp": 0.32226562, + "step": 2519, + "time_per_iteration": 2.717984676361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116939, + "balance_loss_mlp": 1.08325005, + "epoch": 0.48480184686417854, + "flos": 529390204416.0, + "grad_norm": 0.0649010079315795, + "language_loss": 0.87622237, + "learning_rate": 0.0005488341684496732, + "loss": 0.88739175, + "num_input_tokens_seen": 209902176, + "router_z_loss_mlp": 0.33691406, + "step": 2520, + "time_per_iteration": 2.652135133743286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108566, + "balance_loss_mlp": 1.07606971, + "epoch": 0.4849942285494421, + "flos": 531912821760.0, + "grad_norm": 0.06559854904132026, + "language_loss": 0.92200404, + "learning_rate": 0.0005485241061580624, + "loss": 0.93308973, + "num_input_tokens_seen": 209969168, + "router_z_loss_mlp": 0.32495117, + "step": 2521, + "time_per_iteration": 2.7108826637268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102755, + "balance_loss_mlp": 1.07037747, + "epoch": 0.48518661023470566, + "flos": 722578682880.0, + "grad_norm": 0.055876909605250345, + "language_loss": 0.84836948, + "learning_rate": 0.0005482140250277228, + "loss": 0.85939705, + "num_input_tokens_seen": 210049616, + "router_z_loss_mlp": 0.32373047, + "step": 2522, + "time_per_iteration": 2.997586965560913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105629, + "balance_loss_mlp": 1.07408667, + "epoch": 0.4853789919199692, + "flos": 506105169408.0, + "grad_norm": 0.07027884549034326, + "language_loss": 0.87641776, + "learning_rate": 0.0005479039251790387, + "loss": 0.88747412, + "num_input_tokens_seen": 210118512, + "router_z_loss_mlp": 0.31518555, + "step": 2523, + "time_per_iteration": 2.611929416656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096851, + "balance_loss_mlp": 1.06478369, + "epoch": 0.4855713736052328, + "flos": 660487094784.0, + "grad_norm": 0.061509725516535926, + "language_loss": 0.8502717, + "learning_rate": 0.0005475938067324014, + "loss": 0.86124021, + "num_input_tokens_seen": 210193728, + "router_z_loss_mlp": 0.32055664, + "step": 2524, + "time_per_iteration": 2.8200628757476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105888, + "balance_loss_mlp": 1.07339168, + "epoch": 0.48576375529049637, + "flos": 436959277056.0, + "grad_norm": 0.064836171654712, + "language_loss": 0.83736813, + "learning_rate": 0.0005472836698082098, + "loss": 0.84842694, + "num_input_tokens_seen": 210258832, + "router_z_loss_mlp": 0.32495117, + "step": 2525, + "time_per_iteration": 2.4986329078674316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100608, + "balance_loss_mlp": 1.06763458, + "epoch": 0.4859561369757599, + "flos": 581707381248.0, + "grad_norm": 0.05406459595211624, + "language_loss": 0.8394289, + "learning_rate": 0.0005469735145268694, + "loss": 0.8504349, + "num_input_tokens_seen": 210335280, + "router_z_loss_mlp": 0.32983398, + "step": 2526, + "time_per_iteration": 2.7246296405792236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107934, + "balance_loss_mlp": 1.07455492, + "epoch": 0.4861485186610235, + "flos": 487964487168.0, + "grad_norm": 0.0623071528474554, + "language_loss": 0.8099308, + "learning_rate": 0.0005466633410087933, + "loss": 0.82101017, + "num_input_tokens_seen": 210407072, + "router_z_loss_mlp": 0.33398438, + "step": 2527, + "time_per_iteration": 2.660274028778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049375, + "balance_loss_mlp": 1.03955197, + "epoch": 0.486340900346287, + "flos": 1557734727168.0, + "grad_norm": 0.029762737629489368, + "language_loss": 0.77260822, + "learning_rate": 0.0005463531493744017, + "loss": 0.78310198, + "num_input_tokens_seen": 210644544, + "router_z_loss_mlp": 0.09814453, + "step": 2528, + "time_per_iteration": 4.886114835739136 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098205, + "balance_loss_mlp": 1.06663859, + "epoch": 0.4865332820315506, + "flos": 483005514240.0, + "grad_norm": 0.05348067523581763, + "language_loss": 0.88341582, + "learning_rate": 0.0005460429397441214, + "loss": 0.89439785, + "num_input_tokens_seen": 210711760, + "router_z_loss_mlp": 0.31542969, + "step": 2529, + "time_per_iteration": 2.556168794631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096764, + "balance_loss_mlp": 1.06572175, + "epoch": 0.48672566371681414, + "flos": 535809447936.0, + "grad_norm": 0.07361694113297405, + "language_loss": 0.86787206, + "learning_rate": 0.0005457327122383866, + "loss": 0.87883973, + "num_input_tokens_seen": 210783040, + "router_z_loss_mlp": 0.31030273, + "step": 2530, + "time_per_iteration": 2.6101198196411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032017, + "balance_loss_mlp": 1.02248013, + "epoch": 0.4869180454020777, + "flos": 1412665422336.0, + "grad_norm": 0.016416545513431694, + "language_loss": 0.74636483, + "learning_rate": 0.0005454224669776385, + "loss": 0.75668502, + "num_input_tokens_seen": 211002128, + "router_z_loss_mlp": 0.09521484, + "step": 2531, + "time_per_iteration": 4.807017803192139 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102878, + "balance_loss_mlp": 1.071383, + "epoch": 0.48711042708734126, + "flos": 573113885184.0, + "grad_norm": 0.061169122564006716, + "language_loss": 0.75803703, + "learning_rate": 0.0005451122040823244, + "loss": 0.7690658, + "num_input_tokens_seen": 211080080, + "router_z_loss_mlp": 0.31469727, + "step": 2532, + "time_per_iteration": 2.778230667114258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110046, + "balance_loss_mlp": 1.07611895, + "epoch": 0.48730280877260485, + "flos": 626547737088.0, + "grad_norm": 0.05283553568044795, + "language_loss": 0.77404439, + "learning_rate": 0.0005448019236728997, + "loss": 0.78514493, + "num_input_tokens_seen": 211162944, + "router_z_loss_mlp": 0.33959961, + "step": 2533, + "time_per_iteration": 2.8531336784362793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106047, + "balance_loss_mlp": 1.07521987, + "epoch": 0.48749519045786843, + "flos": 512479996416.0, + "grad_norm": 0.06480756266699016, + "language_loss": 0.84952033, + "learning_rate": 0.0005444916258698255, + "loss": 0.8605808, + "num_input_tokens_seen": 211230448, + "router_z_loss_mlp": 0.30810547, + "step": 2534, + "time_per_iteration": 2.5989930629730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108181, + "balance_loss_mlp": 1.07701969, + "epoch": 0.48768757214313196, + "flos": 525414657024.0, + "grad_norm": 0.058540646847924545, + "language_loss": 0.8623631, + "learning_rate": 0.0005441813107935704, + "loss": 0.87344491, + "num_input_tokens_seen": 211301248, + "router_z_loss_mlp": 0.31152344, + "step": 2535, + "time_per_iteration": 2.6970572471618652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115883, + "balance_loss_mlp": 1.0836966, + "epoch": 0.48787995382839555, + "flos": 505032910848.0, + "grad_norm": 0.06249509461195645, + "language_loss": 0.85908329, + "learning_rate": 0.0005438709785646091, + "loss": 0.87024212, + "num_input_tokens_seen": 211369888, + "router_z_loss_mlp": 0.32177734, + "step": 2536, + "time_per_iteration": 2.5461835861206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109072, + "balance_loss_mlp": 1.07688498, + "epoch": 0.4880723355136591, + "flos": 575172140544.0, + "grad_norm": 0.06859245202813889, + "language_loss": 0.87149572, + "learning_rate": 0.0005435606293034234, + "loss": 0.88258648, + "num_input_tokens_seen": 211441808, + "router_z_loss_mlp": 0.32177734, + "step": 2537, + "time_per_iteration": 2.6585540771484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100316, + "balance_loss_mlp": 1.07018018, + "epoch": 0.48826471719892267, + "flos": 561444203520.0, + "grad_norm": 0.07107602922960535, + "language_loss": 0.84916604, + "learning_rate": 0.0005432502631305016, + "loss": 0.86016917, + "num_input_tokens_seen": 211511216, + "router_z_loss_mlp": 0.30126953, + "step": 2538, + "time_per_iteration": 2.6976583003997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103363, + "balance_loss_mlp": 1.07055688, + "epoch": 0.4884570988841862, + "flos": 726188613120.0, + "grad_norm": 0.04961852862161645, + "language_loss": 0.83663356, + "learning_rate": 0.0005429398801663386, + "loss": 0.84766722, + "num_input_tokens_seen": 211589264, + "router_z_loss_mlp": 0.32788086, + "step": 2539, + "time_per_iteration": 2.9294815063476562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101134, + "balance_loss_mlp": 1.06916165, + "epoch": 0.4886494805694498, + "flos": 431019449856.0, + "grad_norm": 0.06193008336457455, + "language_loss": 0.83023834, + "learning_rate": 0.0005426294805314355, + "loss": 0.84124964, + "num_input_tokens_seen": 211652928, + "router_z_loss_mlp": 0.31958008, + "step": 2540, + "time_per_iteration": 2.5207223892211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099167, + "balance_loss_mlp": 1.06593108, + "epoch": 0.4888418622547134, + "flos": 673006579200.0, + "grad_norm": 0.0603925409034683, + "language_loss": 0.80357647, + "learning_rate": 0.0005423190643463003, + "loss": 0.8145681, + "num_input_tokens_seen": 211741664, + "router_z_loss_mlp": 0.33251953, + "step": 2541, + "time_per_iteration": 3.0720365047454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101012, + "balance_loss_mlp": 1.06915879, + "epoch": 0.4890342439399769, + "flos": 541897579008.0, + "grad_norm": 0.0609118032347285, + "language_loss": 0.83149743, + "learning_rate": 0.0005420086317314473, + "loss": 0.84250748, + "num_input_tokens_seen": 211809136, + "router_z_loss_mlp": 0.31835938, + "step": 2542, + "time_per_iteration": 2.7291080951690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098222, + "balance_loss_mlp": 1.06470084, + "epoch": 0.4892266256252405, + "flos": 590676406272.0, + "grad_norm": 0.056070719415307675, + "language_loss": 0.81426919, + "learning_rate": 0.0005416981828073971, + "loss": 0.8252514, + "num_input_tokens_seen": 211883136, + "router_z_loss_mlp": 0.33544922, + "step": 2543, + "time_per_iteration": 2.7784368991851807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033858, + "balance_loss_mlp": 1.02441669, + "epoch": 0.48941900731050403, + "flos": 1516296526848.0, + "grad_norm": 0.02316516352555082, + "language_loss": 0.77115011, + "learning_rate": 0.0005413877176946765, + "loss": 0.78148878, + "num_input_tokens_seen": 212117488, + "router_z_loss_mlp": 0.09423828, + "step": 2544, + "time_per_iteration": 4.838131666183472 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102685, + "balance_loss_mlp": 1.07023609, + "epoch": 0.4896113889957676, + "flos": 470564951040.0, + "grad_norm": 0.07449943721079477, + "language_loss": 0.85317016, + "learning_rate": 0.000541077236513819, + "loss": 0.86419702, + "num_input_tokens_seen": 212181952, + "router_z_loss_mlp": 0.32446289, + "step": 2545, + "time_per_iteration": 2.5264503955841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101803, + "balance_loss_mlp": 1.07071328, + "epoch": 0.48980377068103115, + "flos": 496557983232.0, + "grad_norm": 0.056060473734182076, + "language_loss": 0.82499588, + "learning_rate": 0.0005407667393853638, + "loss": 0.83601391, + "num_input_tokens_seen": 212252608, + "router_z_loss_mlp": 0.31054688, + "step": 2546, + "time_per_iteration": 2.66180157661438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099582, + "balance_loss_mlp": 1.06699038, + "epoch": 0.48999615236629473, + "flos": 692852382720.0, + "grad_norm": 0.06590134685442105, + "language_loss": 0.83337891, + "learning_rate": 0.0005404562264298569, + "loss": 0.84437472, + "num_input_tokens_seen": 212328560, + "router_z_loss_mlp": 0.32592773, + "step": 2547, + "time_per_iteration": 2.8525304794311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098759, + "balance_loss_mlp": 1.06390238, + "epoch": 0.49018853405155827, + "flos": 541694946816.0, + "grad_norm": 0.05425762766620139, + "language_loss": 0.83855128, + "learning_rate": 0.0005401456977678498, + "loss": 0.84953886, + "num_input_tokens_seen": 212399616, + "router_z_loss_mlp": 0.34838867, + "step": 2548, + "time_per_iteration": 2.6519198417663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098656, + "balance_loss_mlp": 1.06561112, + "epoch": 0.49038091573682185, + "flos": 695663894016.0, + "grad_norm": 0.06384769679028596, + "language_loss": 0.77718782, + "learning_rate": 0.0005398351535199008, + "loss": 0.78817439, + "num_input_tokens_seen": 212482352, + "router_z_loss_mlp": 0.33056641, + "step": 2549, + "time_per_iteration": 3.0877339839935303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096351, + "balance_loss_mlp": 1.06499887, + "epoch": 0.49057329742208544, + "flos": 596902929408.0, + "grad_norm": 0.053089286344054805, + "language_loss": 0.83930391, + "learning_rate": 0.0005395245938065735, + "loss": 0.85026741, + "num_input_tokens_seen": 212559504, + "router_z_loss_mlp": 0.31347656, + "step": 2550, + "time_per_iteration": 2.8241264820098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099597, + "balance_loss_mlp": 1.06669557, + "epoch": 0.490765679107349, + "flos": 513406522368.0, + "grad_norm": 0.0641036113016549, + "language_loss": 0.82636213, + "learning_rate": 0.0005392140187484379, + "loss": 0.83735812, + "num_input_tokens_seen": 212625664, + "router_z_loss_mlp": 0.32885742, + "step": 2551, + "time_per_iteration": 2.593710422515869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105531, + "balance_loss_mlp": 1.07332087, + "epoch": 0.49095806079261256, + "flos": 629606670336.0, + "grad_norm": 0.06156906510059403, + "language_loss": 0.89866853, + "learning_rate": 0.0005389034284660701, + "loss": 0.90972388, + "num_input_tokens_seen": 212702000, + "router_z_loss_mlp": 0.32202148, + "step": 2552, + "time_per_iteration": 2.8167800903320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112217, + "balance_loss_mlp": 1.07957709, + "epoch": 0.4911504424778761, + "flos": 915307941888.0, + "grad_norm": 0.06543971253041776, + "language_loss": 0.82440078, + "learning_rate": 0.000538592823080052, + "loss": 0.83552289, + "num_input_tokens_seen": 212785376, + "router_z_loss_mlp": 0.32641602, + "step": 2553, + "time_per_iteration": 3.190459966659546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110985, + "balance_loss_mlp": 1.07817876, + "epoch": 0.4913428241631397, + "flos": 438943380480.0, + "grad_norm": 0.061393832790464745, + "language_loss": 0.85407627, + "learning_rate": 0.000538282202710971, + "loss": 0.8651861, + "num_input_tokens_seen": 212848176, + "router_z_loss_mlp": 0.328125, + "step": 2554, + "time_per_iteration": 2.5911953449249268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111409, + "balance_loss_mlp": 1.07907963, + "epoch": 0.4915352058484032, + "flos": 636092725248.0, + "grad_norm": 0.06886607309109279, + "language_loss": 0.82350785, + "learning_rate": 0.000537971567479421, + "loss": 0.83462197, + "num_input_tokens_seen": 212917888, + "router_z_loss_mlp": 0.32324219, + "step": 2555, + "time_per_iteration": 2.7882654666900635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110188, + "balance_loss_mlp": 1.07783484, + "epoch": 0.4917275875336668, + "flos": 504518989824.0, + "grad_norm": 0.07781814230506547, + "language_loss": 0.87956369, + "learning_rate": 0.0005376609175060011, + "loss": 0.89066565, + "num_input_tokens_seen": 212986288, + "router_z_loss_mlp": 0.32348633, + "step": 2556, + "time_per_iteration": 2.6131739616394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121597, + "balance_loss_mlp": 1.08850408, + "epoch": 0.49191996921893033, + "flos": 654547267584.0, + "grad_norm": 0.07736545907619681, + "language_loss": 0.80871999, + "learning_rate": 0.0005373502529113162, + "loss": 0.81993598, + "num_input_tokens_seen": 213059504, + "router_z_loss_mlp": 0.33105469, + "step": 2557, + "time_per_iteration": 2.8115434646606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125925, + "balance_loss_mlp": 1.09154499, + "epoch": 0.4921123509041939, + "flos": 492359980032.0, + "grad_norm": 0.06369400741363575, + "language_loss": 0.81534445, + "learning_rate": 0.0005370395738159773, + "loss": 0.82660365, + "num_input_tokens_seen": 213129984, + "router_z_loss_mlp": 0.34375, + "step": 2558, + "time_per_iteration": 2.645482063293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134081, + "balance_loss_mlp": 1.10120285, + "epoch": 0.4923047325894575, + "flos": 546167162880.0, + "grad_norm": 0.06840745530844954, + "language_loss": 0.83582544, + "learning_rate": 0.0005367288803406003, + "loss": 0.84716624, + "num_input_tokens_seen": 213199184, + "router_z_loss_mlp": 0.32885742, + "step": 2559, + "time_per_iteration": 2.6290056705474854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113348, + "balance_loss_mlp": 1.09895754, + "epoch": 0.49249711427472104, + "flos": 596473072128.0, + "grad_norm": 0.06026988921967747, + "language_loss": 0.81393933, + "learning_rate": 0.0005364181726058073, + "loss": 0.82527417, + "num_input_tokens_seen": 213272480, + "router_z_loss_mlp": 0.34545898, + "step": 2560, + "time_per_iteration": 2.683072805404663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113164, + "balance_loss_mlp": 1.09771323, + "epoch": 0.4926894959599846, + "flos": 497825533440.0, + "grad_norm": 0.10364093826622443, + "language_loss": 0.8257041, + "learning_rate": 0.0005361074507322261, + "loss": 0.83702052, + "num_input_tokens_seen": 213338704, + "router_z_loss_mlp": 0.33935547, + "step": 2561, + "time_per_iteration": 2.5988388061523438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127176, + "balance_loss_mlp": 1.09420276, + "epoch": 0.49288187764524816, + "flos": 536130648576.0, + "grad_norm": 0.08607714934124724, + "language_loss": 0.81995922, + "learning_rate": 0.000535796714840489, + "loss": 0.831231, + "num_input_tokens_seen": 213406016, + "router_z_loss_mlp": 0.32983398, + "step": 2562, + "time_per_iteration": 2.617560625076294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124157, + "balance_loss_mlp": 1.09099317, + "epoch": 0.49307425933051174, + "flos": 641555707392.0, + "grad_norm": 0.06602924000575079, + "language_loss": 0.84137893, + "learning_rate": 0.0005354859650512348, + "loss": 0.85262048, + "num_input_tokens_seen": 213474016, + "router_z_loss_mlp": 0.33154297, + "step": 2563, + "time_per_iteration": 2.7547245025634766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118883, + "balance_loss_mlp": 1.08707833, + "epoch": 0.4932666410157753, + "flos": 516252911616.0, + "grad_norm": 0.060127327089604984, + "language_loss": 0.87529951, + "learning_rate": 0.0005351752014851074, + "loss": 0.88648832, + "num_input_tokens_seen": 213539696, + "router_z_loss_mlp": 0.31787109, + "step": 2564, + "time_per_iteration": 2.5543923377990723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115017, + "balance_loss_mlp": 1.08199644, + "epoch": 0.49345902270103886, + "flos": 601503625728.0, + "grad_norm": 0.057267908508465526, + "language_loss": 0.83867848, + "learning_rate": 0.0005348644242627553, + "loss": 0.84982872, + "num_input_tokens_seen": 213609504, + "router_z_loss_mlp": 0.33032227, + "step": 2565, + "time_per_iteration": 2.7361738681793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074248, + "balance_loss_mlp": 1.06585574, + "epoch": 0.49365140438630245, + "flos": 1493673716736.0, + "grad_norm": 0.028047824457769776, + "language_loss": 0.75286627, + "learning_rate": 0.0005345536335048336, + "loss": 0.76360869, + "num_input_tokens_seen": 213846064, + "router_z_loss_mlp": 0.08398438, + "step": 2566, + "time_per_iteration": 4.955476760864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126385, + "balance_loss_mlp": 1.09605825, + "epoch": 0.493843786071566, + "flos": 629599329792.0, + "grad_norm": 0.0818104780923525, + "language_loss": 0.81442422, + "learning_rate": 0.0005342428293320013, + "loss": 0.825688, + "num_input_tokens_seen": 213923216, + "router_z_loss_mlp": 0.30297852, + "step": 2567, + "time_per_iteration": 2.7417242527008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133289, + "balance_loss_mlp": 1.10160363, + "epoch": 0.49403616775682957, + "flos": 617564030976.0, + "grad_norm": 0.06602747501781048, + "language_loss": 0.83786738, + "learning_rate": 0.0005339320118649238, + "loss": 0.84920025, + "num_input_tokens_seen": 213994096, + "router_z_loss_mlp": 0.31665039, + "step": 2568, + "time_per_iteration": 2.6943705081939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141867, + "balance_loss_mlp": 1.11111128, + "epoch": 0.4942285494420931, + "flos": 577647770112.0, + "grad_norm": 0.08080827100230976, + "language_loss": 0.86562729, + "learning_rate": 0.000533621181224271, + "loss": 0.87704599, + "num_input_tokens_seen": 214069104, + "router_z_loss_mlp": 0.30737305, + "step": 2569, + "time_per_iteration": 2.7706520557403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140818, + "balance_loss_mlp": 1.10748696, + "epoch": 0.4944209311273567, + "flos": 630211995648.0, + "grad_norm": 0.0686138609954652, + "language_loss": 0.81810164, + "learning_rate": 0.0005333103375307182, + "loss": 0.82950985, + "num_input_tokens_seen": 214150368, + "router_z_loss_mlp": 0.33349609, + "step": 2570, + "time_per_iteration": 2.86440372467041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114456, + "balance_loss_mlp": 1.11196864, + "epoch": 0.4946133128126202, + "flos": 587612703744.0, + "grad_norm": 0.06689740684779927, + "language_loss": 0.86211395, + "learning_rate": 0.0005329994809049451, + "loss": 0.87355959, + "num_input_tokens_seen": 214220112, + "router_z_loss_mlp": 0.32592773, + "step": 2571, + "time_per_iteration": 2.693652629852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137339, + "balance_loss_mlp": 1.10243487, + "epoch": 0.4948056944978838, + "flos": 583718648832.0, + "grad_norm": 0.10119095251173513, + "language_loss": 0.87867194, + "learning_rate": 0.0005326886114676375, + "loss": 0.89004534, + "num_input_tokens_seen": 214294480, + "router_z_loss_mlp": 0.34936523, + "step": 2572, + "time_per_iteration": 2.7414114475250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122524, + "balance_loss_mlp": 1.09017086, + "epoch": 0.49499807618314734, + "flos": 481822027776.0, + "grad_norm": 0.06560191593845013, + "language_loss": 0.8820219, + "learning_rate": 0.0005323777293394854, + "loss": 0.89324713, + "num_input_tokens_seen": 214359568, + "router_z_loss_mlp": 0.32348633, + "step": 2573, + "time_per_iteration": 2.5354294776916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120926, + "balance_loss_mlp": 1.08838177, + "epoch": 0.4951904578684109, + "flos": 518978161152.0, + "grad_norm": 0.057507807941180766, + "language_loss": 0.8235743, + "learning_rate": 0.000532066834641184, + "loss": 0.83478361, + "num_input_tokens_seen": 214432032, + "router_z_loss_mlp": 0.32543945, + "step": 2574, + "time_per_iteration": 2.6555819511413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110618, + "balance_loss_mlp": 1.07401729, + "epoch": 0.4953828395536745, + "flos": 535505499648.0, + "grad_norm": 0.06325814646706406, + "language_loss": 0.85261214, + "learning_rate": 0.0005317559274934334, + "loss": 0.86367393, + "num_input_tokens_seen": 214504096, + "router_z_loss_mlp": 0.3215332, + "step": 2575, + "time_per_iteration": 2.7056500911712646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109208, + "balance_loss_mlp": 1.07559085, + "epoch": 0.49557522123893805, + "flos": 528564994560.0, + "grad_norm": 0.06593319291759459, + "language_loss": 0.81090045, + "learning_rate": 0.0005314450080169382, + "loss": 0.82199252, + "num_input_tokens_seen": 214575920, + "router_z_loss_mlp": 0.33642578, + "step": 2576, + "time_per_iteration": 2.6029012203216553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096802, + "balance_loss_mlp": 1.06544995, + "epoch": 0.49576760292420163, + "flos": 428007504384.0, + "grad_norm": 0.07692863745295915, + "language_loss": 0.80917549, + "learning_rate": 0.0005311340763324083, + "loss": 0.82014352, + "num_input_tokens_seen": 214641664, + "router_z_loss_mlp": 0.31323242, + "step": 2577, + "time_per_iteration": 2.5615715980529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092477, + "balance_loss_mlp": 1.06081462, + "epoch": 0.49595998460946517, + "flos": 565236942336.0, + "grad_norm": 0.06627487899009786, + "language_loss": 0.82433712, + "learning_rate": 0.0005308231325605578, + "loss": 0.83526182, + "num_input_tokens_seen": 214711744, + "router_z_loss_mlp": 0.31665039, + "step": 2578, + "time_per_iteration": 2.7060065269470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096329, + "balance_loss_mlp": 1.06473827, + "epoch": 0.49615236629472875, + "flos": 702490973184.0, + "grad_norm": 0.053568999050238396, + "language_loss": 0.77453893, + "learning_rate": 0.0005305121768221061, + "loss": 0.7855022, + "num_input_tokens_seen": 214802256, + "router_z_loss_mlp": 0.31542969, + "step": 2579, + "time_per_iteration": 3.0817010402679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008957, + "balance_loss_mlp": 1.00046897, + "epoch": 0.4963447479799923, + "flos": 1441665630720.0, + "grad_norm": 0.016247003132607515, + "language_loss": 0.75038326, + "learning_rate": 0.000530201209237777, + "loss": 0.76047277, + "num_input_tokens_seen": 215023648, + "router_z_loss_mlp": 0.08496094, + "step": 2580, + "time_per_iteration": 4.813999176025391 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082898, + "balance_loss_mlp": 1.05099821, + "epoch": 0.49653712966525587, + "flos": 537627995136.0, + "grad_norm": 0.06693938938040958, + "language_loss": 0.92087269, + "learning_rate": 0.0005298902299282984, + "loss": 0.93170166, + "num_input_tokens_seen": 215094080, + "router_z_loss_mlp": 0.3190918, + "step": 2581, + "time_per_iteration": 2.622823715209961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096273, + "balance_loss_mlp": 1.0638243, + "epoch": 0.4967295113505194, + "flos": 607280467968.0, + "grad_norm": 0.06032323910602905, + "language_loss": 0.84543586, + "learning_rate": 0.0005295792390144033, + "loss": 0.85639858, + "num_input_tokens_seen": 215165456, + "router_z_loss_mlp": 0.32446289, + "step": 2582, + "time_per_iteration": 2.68511962890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110589, + "balance_loss_mlp": 1.07236862, + "epoch": 0.496921893035783, + "flos": 474577574400.0, + "grad_norm": 0.06277392630469315, + "language_loss": 0.84023589, + "learning_rate": 0.0005292682366168294, + "loss": 0.85129476, + "num_input_tokens_seen": 215229344, + "router_z_loss_mlp": 0.33544922, + "step": 2583, + "time_per_iteration": 2.5309059619903564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095632, + "balance_loss_mlp": 1.06256378, + "epoch": 0.4971142747210466, + "flos": 597463838208.0, + "grad_norm": 0.06727867389441711, + "language_loss": 0.79973817, + "learning_rate": 0.0005289572228563181, + "loss": 0.81069446, + "num_input_tokens_seen": 215305616, + "router_z_loss_mlp": 0.33081055, + "step": 2584, + "time_per_iteration": 4.178269386291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095977, + "balance_loss_mlp": 1.06362402, + "epoch": 0.4973066564063101, + "flos": 599603586048.0, + "grad_norm": 0.05530053735156927, + "language_loss": 0.83410299, + "learning_rate": 0.000528646197853616, + "loss": 0.84506273, + "num_input_tokens_seen": 215378128, + "router_z_loss_mlp": 0.32373047, + "step": 2585, + "time_per_iteration": 2.706878900527954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101894, + "balance_loss_mlp": 1.07032776, + "epoch": 0.4974990380915737, + "flos": 649474495488.0, + "grad_norm": 0.05706454291548468, + "language_loss": 0.86111611, + "learning_rate": 0.0005283351617294735, + "loss": 0.87213504, + "num_input_tokens_seen": 215453536, + "router_z_loss_mlp": 0.31567383, + "step": 2586, + "time_per_iteration": 2.9042582511901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017241, + "balance_loss_mlp": 1.00732255, + "epoch": 0.49769141977683723, + "flos": 1529278548480.0, + "grad_norm": 0.020630801148902787, + "language_loss": 0.7663666, + "learning_rate": 0.0005280241146046456, + "loss": 0.77653909, + "num_input_tokens_seen": 215689440, + "router_z_loss_mlp": 0.09912109, + "step": 2587, + "time_per_iteration": 4.9974682331085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099928, + "balance_loss_mlp": 1.06676388, + "epoch": 0.4978838014621008, + "flos": 536370356736.0, + "grad_norm": 0.07253805127360792, + "language_loss": 0.86542678, + "learning_rate": 0.0005277130565998916, + "loss": 0.87642598, + "num_input_tokens_seen": 215759600, + "router_z_loss_mlp": 0.33178711, + "step": 2588, + "time_per_iteration": 2.7639453411102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092536, + "balance_loss_mlp": 1.06144667, + "epoch": 0.49807618314736435, + "flos": 539616867840.0, + "grad_norm": 0.05247127963424023, + "language_loss": 0.82351577, + "learning_rate": 0.0005274019878359748, + "loss": 0.83444113, + "num_input_tokens_seen": 215833920, + "router_z_loss_mlp": 0.31054688, + "step": 2589, + "time_per_iteration": 2.706843137741089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109391, + "balance_loss_mlp": 1.05943429, + "epoch": 0.49826856483262794, + "flos": 542475740160.0, + "grad_norm": 0.06499700543891603, + "language_loss": 0.87299156, + "learning_rate": 0.0005270909084336628, + "loss": 0.88393074, + "num_input_tokens_seen": 215903616, + "router_z_loss_mlp": 0.34472656, + "step": 2590, + "time_per_iteration": 2.627092123031616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095191, + "balance_loss_mlp": 1.06174052, + "epoch": 0.4984609465178915, + "flos": 522321219072.0, + "grad_norm": 0.06358626343280155, + "language_loss": 0.89192379, + "learning_rate": 0.0005267798185137276, + "loss": 0.90287566, + "num_input_tokens_seen": 215974832, + "router_z_loss_mlp": 0.33447266, + "step": 2591, + "time_per_iteration": 2.6053519248962402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098275, + "balance_loss_mlp": 1.06434834, + "epoch": 0.49865332820315506, + "flos": 574544420352.0, + "grad_norm": 0.06851868017892651, + "language_loss": 0.89230084, + "learning_rate": 0.0005264687181969444, + "loss": 0.9032836, + "num_input_tokens_seen": 216045024, + "router_z_loss_mlp": 0.33959961, + "step": 2592, + "time_per_iteration": 2.7227771282196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097456, + "balance_loss_mlp": 1.06255198, + "epoch": 0.49884570988841864, + "flos": 1013607115776.0, + "grad_norm": 0.06920907227035335, + "language_loss": 0.75419706, + "learning_rate": 0.0005261576076040937, + "loss": 0.76517165, + "num_input_tokens_seen": 216129024, + "router_z_loss_mlp": 0.34936523, + "step": 2593, + "time_per_iteration": 3.2559545040130615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096802, + "balance_loss_mlp": 1.06430554, + "epoch": 0.4990380915736822, + "flos": 559581239808.0, + "grad_norm": 0.06727068797895107, + "language_loss": 0.84462249, + "learning_rate": 0.0005258464868559591, + "loss": 0.85559052, + "num_input_tokens_seen": 216197648, + "router_z_loss_mlp": 0.32519531, + "step": 2594, + "time_per_iteration": 2.6402478218078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096341, + "balance_loss_mlp": 1.06432104, + "epoch": 0.49923047325894576, + "flos": 498954691584.0, + "grad_norm": 0.05920105575352037, + "language_loss": 0.88943779, + "learning_rate": 0.0005255353560733284, + "loss": 0.90040118, + "num_input_tokens_seen": 216263904, + "router_z_loss_mlp": 0.32006836, + "step": 2595, + "time_per_iteration": 2.5696520805358887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038056, + "balance_loss_mlp": 1.02894819, + "epoch": 0.4994228549442093, + "flos": 1496636476416.0, + "grad_norm": 0.021649763717819466, + "language_loss": 0.75578642, + "learning_rate": 0.0005252242153769931, + "loss": 0.76616704, + "num_input_tokens_seen": 216493152, + "router_z_loss_mlp": 0.09130859, + "step": 2596, + "time_per_iteration": 4.785402059555054 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096305, + "balance_loss_mlp": 1.06354642, + "epoch": 0.4996152366294729, + "flos": 557374680576.0, + "grad_norm": 0.055871474183400556, + "language_loss": 0.83429074, + "learning_rate": 0.0005249130648877492, + "loss": 0.84525383, + "num_input_tokens_seen": 216567216, + "router_z_loss_mlp": 0.32763672, + "step": 2597, + "time_per_iteration": 2.768077850341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096166, + "balance_loss_mlp": 1.0628823, + "epoch": 0.4998076183147364, + "flos": 415594105344.0, + "grad_norm": 0.06479225622172138, + "language_loss": 0.85305572, + "learning_rate": 0.0005246019047263953, + "loss": 0.86401737, + "num_input_tokens_seen": 216630624, + "router_z_loss_mlp": 0.33300781, + "step": 2598, + "time_per_iteration": 2.4575202465057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109757, + "balance_loss_mlp": 1.06471562, + "epoch": 0.5, + "flos": 467350373376.0, + "grad_norm": 0.06552285864087816, + "language_loss": 0.82716942, + "learning_rate": 0.0005242907350137353, + "loss": 0.83814514, + "num_input_tokens_seen": 216696576, + "router_z_loss_mlp": 0.32836914, + "step": 2599, + "time_per_iteration": 2.545402765274048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109995, + "balance_loss_mlp": 1.06773996, + "epoch": 0.5001923816852636, + "flos": 482718818304.0, + "grad_norm": 0.060184934170799446, + "language_loss": 0.79316103, + "learning_rate": 0.0005239795558705754, + "loss": 0.80416048, + "num_input_tokens_seen": 216767584, + "router_z_loss_mlp": 0.32202148, + "step": 2600, + "time_per_iteration": 2.6259560585021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094505, + "balance_loss_mlp": 1.06279588, + "epoch": 0.5003847633705272, + "flos": 533798180352.0, + "grad_norm": 0.07180292739942261, + "language_loss": 0.89506614, + "learning_rate": 0.0005236683674177264, + "loss": 0.90601116, + "num_input_tokens_seen": 216834320, + "router_z_loss_mlp": 0.31713867, + "step": 2601, + "time_per_iteration": 2.6216633319854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098766, + "balance_loss_mlp": 1.06531632, + "epoch": 0.5005771450557907, + "flos": 737789285376.0, + "grad_norm": 0.05820446715743302, + "language_loss": 0.82377899, + "learning_rate": 0.0005233571697760021, + "loss": 0.83476663, + "num_input_tokens_seen": 216907312, + "router_z_loss_mlp": 0.3347168, + "step": 2602, + "time_per_iteration": 2.8286540508270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107785, + "balance_loss_mlp": 1.07447851, + "epoch": 0.5007695267410542, + "flos": 778977865728.0, + "grad_norm": 0.06262770013006937, + "language_loss": 0.83391154, + "learning_rate": 0.0005230459630662203, + "loss": 0.84498942, + "num_input_tokens_seen": 216979872, + "router_z_loss_mlp": 0.33325195, + "step": 2603, + "time_per_iteration": 2.9667811393737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107928, + "balance_loss_mlp": 1.07562184, + "epoch": 0.5009619084263178, + "flos": 623476694016.0, + "grad_norm": 0.06520686758548196, + "language_loss": 0.81425881, + "learning_rate": 0.0005227347474092022, + "loss": 0.82533813, + "num_input_tokens_seen": 217054000, + "router_z_loss_mlp": 0.32250977, + "step": 2604, + "time_per_iteration": 2.7840375900268555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109186, + "balance_loss_mlp": 1.07616544, + "epoch": 0.5011542901115814, + "flos": 531087611904.0, + "grad_norm": 0.04693444517106987, + "language_loss": 0.83613992, + "learning_rate": 0.0005224235229257724, + "loss": 0.84723175, + "num_input_tokens_seen": 217126784, + "router_z_loss_mlp": 0.33032227, + "step": 2605, + "time_per_iteration": 2.6730735301971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101481, + "balance_loss_mlp": 1.06970012, + "epoch": 0.5013466717968449, + "flos": 527534581248.0, + "grad_norm": 0.05305580167320912, + "language_loss": 0.87095463, + "learning_rate": 0.0005221122897367589, + "loss": 0.88196945, + "num_input_tokens_seen": 217203056, + "router_z_loss_mlp": 0.31762695, + "step": 2606, + "time_per_iteration": 2.804161310195923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106275, + "balance_loss_mlp": 1.07384968, + "epoch": 0.5015390534821085, + "flos": 566017735680.0, + "grad_norm": 0.07402045106641765, + "language_loss": 0.81512845, + "learning_rate": 0.0005218010479629932, + "loss": 0.82619125, + "num_input_tokens_seen": 217273280, + "router_z_loss_mlp": 0.32421875, + "step": 2607, + "time_per_iteration": 2.6673223972320557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111463, + "balance_loss_mlp": 1.0777508, + "epoch": 0.5017314351673721, + "flos": 566697212928.0, + "grad_norm": 0.06695708261577327, + "language_loss": 0.82331049, + "learning_rate": 0.0005214897977253102, + "loss": 0.83442515, + "num_input_tokens_seen": 217345568, + "router_z_loss_mlp": 0.33740234, + "step": 2608, + "time_per_iteration": 2.641615390777588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109683, + "balance_loss_mlp": 1.06538224, + "epoch": 0.5019238168526357, + "flos": 522291483648.0, + "grad_norm": 0.057424183285493445, + "language_loss": 0.84299719, + "learning_rate": 0.0005211785391445473, + "loss": 0.85396552, + "num_input_tokens_seen": 217422848, + "router_z_loss_mlp": 0.31445312, + "step": 2609, + "time_per_iteration": 2.736565589904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098049, + "balance_loss_mlp": 1.06381226, + "epoch": 0.5021161985378992, + "flos": 641434567680.0, + "grad_norm": 0.15505754048194495, + "language_loss": 0.79028511, + "learning_rate": 0.0005208672723415467, + "loss": 0.8012656, + "num_input_tokens_seen": 217502896, + "router_z_loss_mlp": 0.3425293, + "step": 2610, + "time_per_iteration": 2.7740700244903564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098403, + "balance_loss_mlp": 1.06371355, + "epoch": 0.5023085802231627, + "flos": 591284302848.0, + "grad_norm": 0.06293902841757802, + "language_loss": 0.79232705, + "learning_rate": 0.0005205559974371525, + "loss": 0.80331105, + "num_input_tokens_seen": 217575072, + "router_z_loss_mlp": 0.34716797, + "step": 2611, + "time_per_iteration": 2.7674527168273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096957, + "balance_loss_mlp": 1.06455564, + "epoch": 0.5025009619084263, + "flos": 472373586432.0, + "grad_norm": 0.06270244311506845, + "language_loss": 0.82445353, + "learning_rate": 0.0005202447145522123, + "loss": 0.83542311, + "num_input_tokens_seen": 217644976, + "router_z_loss_mlp": 0.32397461, + "step": 2612, + "time_per_iteration": 2.6602847576141357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100141, + "balance_loss_mlp": 1.06700087, + "epoch": 0.5026933435936899, + "flos": 455139606528.0, + "grad_norm": 0.1708463718003921, + "language_loss": 0.79453385, + "learning_rate": 0.0005199334238075769, + "loss": 0.80553526, + "num_input_tokens_seen": 217712816, + "router_z_loss_mlp": 0.33154297, + "step": 2613, + "time_per_iteration": 2.5568900108337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101548, + "balance_loss_mlp": 1.06802678, + "epoch": 0.5028857252789535, + "flos": 491747314176.0, + "grad_norm": 0.0528689770317124, + "language_loss": 0.92217171, + "learning_rate": 0.0005196221253241, + "loss": 0.93318725, + "num_input_tokens_seen": 217780256, + "router_z_loss_mlp": 0.3347168, + "step": 2614, + "time_per_iteration": 2.6126556396484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099044, + "balance_loss_mlp": 1.06490254, + "epoch": 0.503078106964217, + "flos": 625569454080.0, + "grad_norm": 0.060608661488991786, + "language_loss": 0.83149332, + "learning_rate": 0.0005193108192226383, + "loss": 0.84248376, + "num_input_tokens_seen": 217848496, + "router_z_loss_mlp": 0.34155273, + "step": 2615, + "time_per_iteration": 2.74265456199646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099568, + "balance_loss_mlp": 1.06599879, + "epoch": 0.5032704886494805, + "flos": 579046371840.0, + "grad_norm": 0.05036532075051116, + "language_loss": 0.87427437, + "learning_rate": 0.000518999505624052, + "loss": 0.88527, + "num_input_tokens_seen": 217919216, + "router_z_loss_mlp": 0.33569336, + "step": 2616, + "time_per_iteration": 2.6870973110198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098357, + "balance_loss_mlp": 1.06483543, + "epoch": 0.5034628703347441, + "flos": 471753206784.0, + "grad_norm": 0.047696485592571475, + "language_loss": 0.83320528, + "learning_rate": 0.000518688184649203, + "loss": 0.84418881, + "num_input_tokens_seen": 217996096, + "router_z_loss_mlp": 0.33544922, + "step": 2617, + "time_per_iteration": 2.8016743659973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097701, + "balance_loss_mlp": 1.06434643, + "epoch": 0.5036552520200077, + "flos": 489837362688.0, + "grad_norm": 0.046578345586746416, + "language_loss": 0.83902323, + "learning_rate": 0.0005183768564189577, + "loss": 0.85000026, + "num_input_tokens_seen": 218063072, + "router_z_loss_mlp": 0.33374023, + "step": 2618, + "time_per_iteration": 2.5473384857177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103149, + "balance_loss_mlp": 1.07158208, + "epoch": 0.5038476337052713, + "flos": 494235426816.0, + "grad_norm": 0.06435350107251939, + "language_loss": 0.81610096, + "learning_rate": 0.0005180655210541838, + "loss": 0.82713246, + "num_input_tokens_seen": 218131056, + "router_z_loss_mlp": 0.31542969, + "step": 2619, + "time_per_iteration": 2.6063601970672607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109319, + "balance_loss_mlp": 1.07362747, + "epoch": 0.5040400153905348, + "flos": 600604263936.0, + "grad_norm": 0.07554849641883571, + "language_loss": 0.83428431, + "learning_rate": 0.0005177541786757527, + "loss": 0.8453775, + "num_input_tokens_seen": 218203536, + "router_z_loss_mlp": 0.35693359, + "step": 2620, + "time_per_iteration": 2.7651278972625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109868, + "balance_loss_mlp": 1.07589293, + "epoch": 0.5042323970757984, + "flos": 811525962240.0, + "grad_norm": 0.07801269652965341, + "language_loss": 0.8344717, + "learning_rate": 0.000517442829404538, + "loss": 0.84557039, + "num_input_tokens_seen": 218283008, + "router_z_loss_mlp": 0.33959961, + "step": 2621, + "time_per_iteration": 2.991288661956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110723, + "balance_loss_mlp": 1.07401848, + "epoch": 0.504424778761062, + "flos": 627308706816.0, + "grad_norm": 0.07509105999805234, + "language_loss": 0.87522292, + "learning_rate": 0.0005171314733614166, + "loss": 0.8862952, + "num_input_tokens_seen": 218362096, + "router_z_loss_mlp": 0.33227539, + "step": 2622, + "time_per_iteration": 2.8980941772460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107621, + "balance_loss_mlp": 1.07357442, + "epoch": 0.5046171604463255, + "flos": 515911887360.0, + "grad_norm": 0.05402993794527385, + "language_loss": 0.78464615, + "learning_rate": 0.0005168201106672671, + "loss": 0.79572237, + "num_input_tokens_seen": 218439440, + "router_z_loss_mlp": 0.34057617, + "step": 2623, + "time_per_iteration": 2.7572929859161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106537, + "balance_loss_mlp": 1.07394505, + "epoch": 0.504809542131589, + "flos": 527831188992.0, + "grad_norm": 0.0666138467605724, + "language_loss": 0.85413206, + "learning_rate": 0.0005165087414429717, + "loss": 0.86519742, + "num_input_tokens_seen": 218505936, + "router_z_loss_mlp": 0.32592773, + "step": 2624, + "time_per_iteration": 2.6197690963745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104325, + "balance_loss_mlp": 1.07178128, + "epoch": 0.5050019238168526, + "flos": 554118257664.0, + "grad_norm": 0.0890371890087988, + "language_loss": 0.83553296, + "learning_rate": 0.0005161973658094144, + "loss": 0.84657621, + "num_input_tokens_seen": 218573824, + "router_z_loss_mlp": 0.32543945, + "step": 2625, + "time_per_iteration": 2.688664436340332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114503, + "balance_loss_mlp": 1.08188796, + "epoch": 0.5051943055021162, + "flos": 574774216704.0, + "grad_norm": 0.10293664596100507, + "language_loss": 0.82534152, + "learning_rate": 0.000515885983887482, + "loss": 0.83648658, + "num_input_tokens_seen": 218648016, + "router_z_loss_mlp": 0.32592773, + "step": 2626, + "time_per_iteration": 2.7382290363311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117287, + "balance_loss_mlp": 1.08467126, + "epoch": 0.5053866871873798, + "flos": 496686463488.0, + "grad_norm": 0.06112991005583596, + "language_loss": 0.84654796, + "learning_rate": 0.0005155745957980636, + "loss": 0.85772085, + "num_input_tokens_seen": 218714128, + "router_z_loss_mlp": 0.32617188, + "step": 2627, + "time_per_iteration": 2.5833873748779297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117288, + "balance_loss_mlp": 1.0852921, + "epoch": 0.5055790688726434, + "flos": 502213685760.0, + "grad_norm": 0.05493055898841465, + "language_loss": 0.88454115, + "learning_rate": 0.000515263201662051, + "loss": 0.89571404, + "num_input_tokens_seen": 218784800, + "router_z_loss_mlp": 0.31982422, + "step": 2628, + "time_per_iteration": 2.6362485885620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112533, + "balance_loss_mlp": 1.09264278, + "epoch": 0.5057714505579068, + "flos": 845227809792.0, + "grad_norm": 0.05313724215790835, + "language_loss": 0.8271699, + "learning_rate": 0.0005149518016003378, + "loss": 0.83842319, + "num_input_tokens_seen": 218868256, + "router_z_loss_mlp": 0.3269043, + "step": 2629, + "time_per_iteration": 3.1579666137695312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121904, + "balance_loss_mlp": 1.09109998, + "epoch": 0.5059638322431704, + "flos": 497825533440.0, + "grad_norm": 0.05858406869857789, + "language_loss": 0.82627785, + "learning_rate": 0.0005146403957338206, + "loss": 0.83749688, + "num_input_tokens_seen": 218932496, + "router_z_loss_mlp": 0.30786133, + "step": 2630, + "time_per_iteration": 2.5554275512695312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128543, + "balance_loss_mlp": 1.09664297, + "epoch": 0.506156213928434, + "flos": 617843013120.0, + "grad_norm": 0.05139775445636508, + "language_loss": 0.82087231, + "learning_rate": 0.0005143289841833975, + "loss": 0.83215779, + "num_input_tokens_seen": 219010672, + "router_z_loss_mlp": 0.31884766, + "step": 2631, + "time_per_iteration": 2.866208076477051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136223, + "balance_loss_mlp": 1.10332084, + "epoch": 0.5063485956136976, + "flos": 424857166848.0, + "grad_norm": 0.07049680225310351, + "language_loss": 0.82485932, + "learning_rate": 0.0005140175670699696, + "loss": 0.83622158, + "num_input_tokens_seen": 219077104, + "router_z_loss_mlp": 0.32885742, + "step": 2632, + "time_per_iteration": 2.589662551879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136954, + "balance_loss_mlp": 1.10464883, + "epoch": 0.5065409772989612, + "flos": 569926471680.0, + "grad_norm": 0.04937719013853961, + "language_loss": 0.83023763, + "learning_rate": 0.0005137061445144395, + "loss": 0.84160721, + "num_input_tokens_seen": 219164880, + "router_z_loss_mlp": 0.32299805, + "step": 2633, + "time_per_iteration": 2.907914161682129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145225, + "balance_loss_mlp": 1.11308646, + "epoch": 0.5067333589842247, + "flos": 628801284096.0, + "grad_norm": 0.06298038708728138, + "language_loss": 0.87351924, + "learning_rate": 0.000513394716637712, + "loss": 0.8849715, + "num_input_tokens_seen": 219237376, + "router_z_loss_mlp": 0.32128906, + "step": 2634, + "time_per_iteration": 2.7392778396606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064259, + "balance_loss_mlp": 1.05677319, + "epoch": 0.5069257406694883, + "flos": 1447867187712.0, + "grad_norm": 0.03015814476855984, + "language_loss": 0.79191709, + "learning_rate": 0.0005130832835606946, + "loss": 0.80255967, + "num_input_tokens_seen": 219467632, + "router_z_loss_mlp": 0.07470703, + "step": 2635, + "time_per_iteration": 4.8476762771606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138418, + "balance_loss_mlp": 1.10549188, + "epoch": 0.5071181223547518, + "flos": 638835227136.0, + "grad_norm": 0.0660835824728649, + "language_loss": 0.80952996, + "learning_rate": 0.0005127718454042958, + "loss": 0.82091409, + "num_input_tokens_seen": 219545392, + "router_z_loss_mlp": 0.3293457, + "step": 2636, + "time_per_iteration": 2.801945447921753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122948, + "balance_loss_mlp": 1.09083319, + "epoch": 0.5073105040400154, + "flos": 713565241344.0, + "grad_norm": 0.06804864770708682, + "language_loss": 0.8454951, + "learning_rate": 0.0005124604022894269, + "loss": 0.85672456, + "num_input_tokens_seen": 219623104, + "router_z_loss_mlp": 0.32104492, + "step": 2637, + "time_per_iteration": 2.9412965774536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039176, + "balance_loss_mlp": 1.0316422, + "epoch": 0.5075028857252789, + "flos": 1436447126016.0, + "grad_norm": 0.020904454547095577, + "language_loss": 0.77188224, + "learning_rate": 0.000512148954337001, + "loss": 0.78227401, + "num_input_tokens_seen": 219853328, + "router_z_loss_mlp": 0.07519531, + "step": 2638, + "time_per_iteration": 4.857941389083862 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127594, + "balance_loss_mlp": 1.09507418, + "epoch": 0.5076952674105425, + "flos": 571147034112.0, + "grad_norm": 0.058859738864391845, + "language_loss": 0.83504963, + "learning_rate": 0.0005118375016679325, + "loss": 0.84632552, + "num_input_tokens_seen": 219925024, + "router_z_loss_mlp": 0.32495117, + "step": 2639, + "time_per_iteration": 2.7467126846313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115219, + "balance_loss_mlp": 1.08169687, + "epoch": 0.5078876490958061, + "flos": 516712504320.0, + "grad_norm": 0.06748446003243579, + "language_loss": 0.80393875, + "learning_rate": 0.0005115260444031382, + "loss": 0.81509095, + "num_input_tokens_seen": 219992752, + "router_z_loss_mlp": 0.33544922, + "step": 2640, + "time_per_iteration": 2.5831897258758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021123, + "balance_loss_mlp": 1.01354098, + "epoch": 0.5080800307810697, + "flos": 1584224428032.0, + "grad_norm": 0.011909310640322752, + "language_loss": 0.78731823, + "learning_rate": 0.000511214582663537, + "loss": 0.79752946, + "num_input_tokens_seen": 220224160, + "router_z_loss_mlp": 0.07568359, + "step": 2641, + "time_per_iteration": 4.96182656288147 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118257, + "balance_loss_mlp": 1.08506942, + "epoch": 0.5082724124663333, + "flos": 485209502208.0, + "grad_norm": 0.06566448453374539, + "language_loss": 0.87279713, + "learning_rate": 0.0005109031165700483, + "loss": 0.88397968, + "num_input_tokens_seen": 220289504, + "router_z_loss_mlp": 0.33178711, + "step": 2642, + "time_per_iteration": 2.5608396530151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114089, + "balance_loss_mlp": 1.08228409, + "epoch": 0.5084647941515967, + "flos": 682230366720.0, + "grad_norm": 0.07470030174236865, + "language_loss": 0.83423924, + "learning_rate": 0.0005105916462435945, + "loss": 0.84538019, + "num_input_tokens_seen": 220361376, + "router_z_loss_mlp": 0.31787109, + "step": 2643, + "time_per_iteration": 2.840092420578003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114248, + "balance_loss_mlp": 1.08272934, + "epoch": 0.5086571758368603, + "flos": 548736768000.0, + "grad_norm": 0.0540496938056118, + "language_loss": 0.8565858, + "learning_rate": 0.0005102801718050989, + "loss": 0.86772823, + "num_input_tokens_seen": 220434720, + "router_z_loss_mlp": 0.31494141, + "step": 2644, + "time_per_iteration": 2.687993288040161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111024, + "balance_loss_mlp": 1.08024383, + "epoch": 0.5088495575221239, + "flos": 564016379904.0, + "grad_norm": 0.0657522571772089, + "language_loss": 0.89181781, + "learning_rate": 0.0005099686933754867, + "loss": 0.90292799, + "num_input_tokens_seen": 220506208, + "router_z_loss_mlp": 0.30737305, + "step": 2645, + "time_per_iteration": 2.676555633544922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110963, + "balance_loss_mlp": 1.07589364, + "epoch": 0.5090419392073875, + "flos": 551407689216.0, + "grad_norm": 0.06525501329559952, + "language_loss": 0.84646904, + "learning_rate": 0.0005096572110756845, + "loss": 0.85756534, + "num_input_tokens_seen": 220577456, + "router_z_loss_mlp": 0.33740234, + "step": 2646, + "time_per_iteration": 2.722046136856079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098497, + "balance_loss_mlp": 1.06502318, + "epoch": 0.509234320892651, + "flos": 567779383296.0, + "grad_norm": 0.055343813231999515, + "language_loss": 0.85733652, + "learning_rate": 0.0005093457250266205, + "loss": 0.86832154, + "num_input_tokens_seen": 220649648, + "router_z_loss_mlp": 0.33496094, + "step": 2647, + "time_per_iteration": 2.726637363433838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105884, + "balance_loss_mlp": 1.07260132, + "epoch": 0.5094267025779146, + "flos": 582609314304.0, + "grad_norm": 0.07566246752622155, + "language_loss": 0.83174831, + "learning_rate": 0.000509034235349224, + "loss": 0.84280717, + "num_input_tokens_seen": 220721168, + "router_z_loss_mlp": 0.33276367, + "step": 2648, + "time_per_iteration": 2.7163400650024414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098588, + "balance_loss_mlp": 1.06480372, + "epoch": 0.5096190842631781, + "flos": 591990944256.0, + "grad_norm": 0.05726246002698667, + "language_loss": 0.81403017, + "learning_rate": 0.0005087227421644266, + "loss": 0.82501602, + "num_input_tokens_seen": 220796464, + "router_z_loss_mlp": 0.33813477, + "step": 2649, + "time_per_iteration": 2.753593683242798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090769, + "balance_loss_mlp": 1.05836821, + "epoch": 0.5098114659484417, + "flos": 513562166784.0, + "grad_norm": 0.062073163804743356, + "language_loss": 0.86567879, + "learning_rate": 0.0005084112455931602, + "loss": 0.87658644, + "num_input_tokens_seen": 220862976, + "router_z_loss_mlp": 0.32397461, + "step": 2650, + "time_per_iteration": 2.6115548610687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109227, + "balance_loss_mlp": 1.05986929, + "epoch": 0.5100038476337053, + "flos": 484631341056.0, + "grad_norm": 0.07224314043681272, + "language_loss": 0.85185993, + "learning_rate": 0.0005080997457563586, + "loss": 0.8627826, + "num_input_tokens_seen": 220926432, + "router_z_loss_mlp": 0.32397461, + "step": 2651, + "time_per_iteration": 2.562626361846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091424, + "balance_loss_mlp": 1.05797434, + "epoch": 0.5101962293189688, + "flos": 461603266560.0, + "grad_norm": 0.12059659360832554, + "language_loss": 0.79420835, + "learning_rate": 0.0005077882427749569, + "loss": 0.80512255, + "num_input_tokens_seen": 220993008, + "router_z_loss_mlp": 0.3347168, + "step": 2652, + "time_per_iteration": 2.532801866531372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094958, + "balance_loss_mlp": 1.06072092, + "epoch": 0.5103886110042324, + "flos": 587034542592.0, + "grad_norm": 0.09167141678281196, + "language_loss": 0.85065627, + "learning_rate": 0.0005074767367698913, + "loss": 0.86160588, + "num_input_tokens_seen": 221059248, + "router_z_loss_mlp": 0.34277344, + "step": 2653, + "time_per_iteration": 2.718952178955078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094006, + "balance_loss_mlp": 1.06184387, + "epoch": 0.510580992689496, + "flos": 845260116480.0, + "grad_norm": 0.05265423612140712, + "language_loss": 0.83726275, + "learning_rate": 0.0005071652278620988, + "loss": 0.84820282, + "num_input_tokens_seen": 221133712, + "router_z_loss_mlp": 0.3215332, + "step": 2654, + "time_per_iteration": 3.0578973293304443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093541, + "balance_loss_mlp": 1.06082976, + "epoch": 0.5107733743747596, + "flos": 658624131072.0, + "grad_norm": 0.057781922950613636, + "language_loss": 0.8368457, + "learning_rate": 0.0005068537161725186, + "loss": 0.84778106, + "num_input_tokens_seen": 221202192, + "router_z_loss_mlp": 0.32714844, + "step": 2655, + "time_per_iteration": 2.763050079345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109333, + "balance_loss_mlp": 1.06035662, + "epoch": 0.510965756060023, + "flos": 701732574720.0, + "grad_norm": 0.06748478853261292, + "language_loss": 0.84411526, + "learning_rate": 0.0005065422018220893, + "loss": 0.85504854, + "num_input_tokens_seen": 221277104, + "router_z_loss_mlp": 0.32983398, + "step": 2656, + "time_per_iteration": 2.8346335887908936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099653, + "balance_loss_mlp": 1.06744266, + "epoch": 0.5111581377452866, + "flos": 559731741696.0, + "grad_norm": 0.05948045399752535, + "language_loss": 0.80220234, + "learning_rate": 0.0005062306849317521, + "loss": 0.8131988, + "num_input_tokens_seen": 221352320, + "router_z_loss_mlp": 0.32226562, + "step": 2657, + "time_per_iteration": 2.8443868160247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011092, + "balance_loss_mlp": 1.07832527, + "epoch": 0.5113505194305502, + "flos": 609024863232.0, + "grad_norm": 0.06625791562361402, + "language_loss": 0.83381897, + "learning_rate": 0.0005059191656224487, + "loss": 0.84491098, + "num_input_tokens_seen": 221421056, + "router_z_loss_mlp": 0.30859375, + "step": 2658, + "time_per_iteration": 2.7093002796173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110636, + "balance_loss_mlp": 1.07883072, + "epoch": 0.5115429011158138, + "flos": 534477657600.0, + "grad_norm": 0.06672155578926672, + "language_loss": 0.88962573, + "learning_rate": 0.0005056076440151212, + "loss": 0.90073204, + "num_input_tokens_seen": 221492064, + "router_z_loss_mlp": 0.31787109, + "step": 2659, + "time_per_iteration": 2.6441903114318848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072549, + "balance_loss_mlp": 1.06272602, + "epoch": 0.5117352828010774, + "flos": 1362213780480.0, + "grad_norm": 0.032966871601824974, + "language_loss": 0.76288116, + "learning_rate": 0.0005052961202307133, + "loss": 0.77360666, + "num_input_tokens_seen": 221724672, + "router_z_loss_mlp": 0.09814453, + "step": 2660, + "time_per_iteration": 4.922346353530884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124135, + "balance_loss_mlp": 1.09111381, + "epoch": 0.5119276644863409, + "flos": 633740433408.0, + "grad_norm": 0.06875691586697516, + "language_loss": 0.87086922, + "learning_rate": 0.0005049845943901691, + "loss": 0.8821106, + "num_input_tokens_seen": 221800144, + "router_z_loss_mlp": 0.33032227, + "step": 2661, + "time_per_iteration": 2.8344130516052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107104, + "balance_loss_mlp": 1.07703924, + "epoch": 0.5121200461716044, + "flos": 585598864896.0, + "grad_norm": 0.06167047048505293, + "language_loss": 0.86829108, + "learning_rate": 0.0005046730666144338, + "loss": 0.87936211, + "num_input_tokens_seen": 221877168, + "router_z_loss_mlp": 0.30078125, + "step": 2662, + "time_per_iteration": 2.7832746505737305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110467, + "balance_loss_mlp": 1.07780349, + "epoch": 0.512312427856868, + "flos": 1032508767744.0, + "grad_norm": 0.05618387348962469, + "language_loss": 0.8811537, + "learning_rate": 0.0005043615370244532, + "loss": 0.89225835, + "num_input_tokens_seen": 221964208, + "router_z_loss_mlp": 0.32666016, + "step": 2663, + "time_per_iteration": 3.3585264682769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035614, + "balance_loss_mlp": 1.02664995, + "epoch": 0.5125048095421316, + "flos": 1537983645696.0, + "grad_norm": 0.02051261915929333, + "language_loss": 0.78244388, + "learning_rate": 0.0005040500057411736, + "loss": 0.79279995, + "num_input_tokens_seen": 222179264, + "router_z_loss_mlp": 0.08984375, + "step": 2664, + "time_per_iteration": 4.639116048812866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106628, + "balance_loss_mlp": 1.07670689, + "epoch": 0.5126971912273951, + "flos": 591116175360.0, + "grad_norm": 0.057959232824292994, + "language_loss": 0.85514903, + "learning_rate": 0.0005037384728855425, + "loss": 0.86621535, + "num_input_tokens_seen": 222259504, + "router_z_loss_mlp": 0.29882812, + "step": 2665, + "time_per_iteration": 2.7972493171691895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106635, + "balance_loss_mlp": 1.07456732, + "epoch": 0.5128895729126587, + "flos": 551657309184.0, + "grad_norm": 0.08985416920229425, + "language_loss": 0.84974313, + "learning_rate": 0.0005034269385785075, + "loss": 0.86080956, + "num_input_tokens_seen": 222330512, + "router_z_loss_mlp": 0.3203125, + "step": 2666, + "time_per_iteration": 2.6164255142211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113422, + "balance_loss_mlp": 1.08135498, + "epoch": 0.5130819545979223, + "flos": 481271030784.0, + "grad_norm": 0.09072509808708462, + "language_loss": 0.85031348, + "learning_rate": 0.0005031154029410168, + "loss": 0.86144769, + "num_input_tokens_seen": 222394000, + "router_z_loss_mlp": 0.32055664, + "step": 2667, + "time_per_iteration": 2.5188395977020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112022, + "balance_loss_mlp": 1.07873833, + "epoch": 0.5132743362831859, + "flos": 475798136832.0, + "grad_norm": 0.08345403251216076, + "language_loss": 0.86623496, + "learning_rate": 0.0005028038660940197, + "loss": 0.87735522, + "num_input_tokens_seen": 222459344, + "router_z_loss_mlp": 0.33300781, + "step": 2668, + "time_per_iteration": 2.5099217891693115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104597, + "balance_loss_mlp": 1.07360303, + "epoch": 0.5134667179684494, + "flos": 503827029504.0, + "grad_norm": 0.051835294009996306, + "language_loss": 0.8459934, + "learning_rate": 0.0005024923281584648, + "loss": 0.85703939, + "num_input_tokens_seen": 222528912, + "router_z_loss_mlp": 0.30981445, + "step": 2669, + "time_per_iteration": 2.6409177780151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113407, + "balance_loss_mlp": 1.08103013, + "epoch": 0.5136590996537129, + "flos": 503918433792.0, + "grad_norm": 0.05618222104131465, + "language_loss": 0.82660598, + "learning_rate": 0.0005021807892553026, + "loss": 0.83774006, + "num_input_tokens_seen": 222604704, + "router_z_loss_mlp": 0.32397461, + "step": 2670, + "time_per_iteration": 2.7168080806732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105439, + "balance_loss_mlp": 1.07458735, + "epoch": 0.5138514813389765, + "flos": 624623104512.0, + "grad_norm": 0.052268384876698444, + "language_loss": 0.84909296, + "learning_rate": 0.0005018692495054828, + "loss": 0.86014736, + "num_input_tokens_seen": 222677888, + "router_z_loss_mlp": 0.30834961, + "step": 2671, + "time_per_iteration": 2.769845485687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100504, + "balance_loss_mlp": 1.07063007, + "epoch": 0.5140438630242401, + "flos": 583545752064.0, + "grad_norm": 0.059994941655344296, + "language_loss": 0.80935681, + "learning_rate": 0.0005015577090299561, + "loss": 0.82036185, + "num_input_tokens_seen": 222751936, + "router_z_loss_mlp": 0.29833984, + "step": 2672, + "time_per_iteration": 2.681316375732422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110216, + "balance_loss_mlp": 1.07245326, + "epoch": 0.5142362447095037, + "flos": 487927411200.0, + "grad_norm": 0.05683100055240327, + "language_loss": 0.86631596, + "learning_rate": 0.0005012461679496729, + "loss": 0.87733757, + "num_input_tokens_seen": 222819616, + "router_z_loss_mlp": 0.29711914, + "step": 2673, + "time_per_iteration": 2.5961544513702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100162, + "balance_loss_mlp": 1.06883335, + "epoch": 0.5144286263947672, + "flos": 526857675264.0, + "grad_norm": 0.05638845856922635, + "language_loss": 0.88303345, + "learning_rate": 0.0005009346263855848, + "loss": 0.8940351, + "num_input_tokens_seen": 222888448, + "router_z_loss_mlp": 0.31323242, + "step": 2674, + "time_per_iteration": 2.607531785964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100725, + "balance_loss_mlp": 1.06903887, + "epoch": 0.5146210080800308, + "flos": 486518897664.0, + "grad_norm": 0.05523698149533188, + "language_loss": 0.84251857, + "learning_rate": 0.0005006230844586422, + "loss": 0.85352582, + "num_input_tokens_seen": 222964736, + "router_z_loss_mlp": 0.31665039, + "step": 2675, + "time_per_iteration": 2.766676664352417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106245, + "balance_loss_mlp": 1.07384396, + "epoch": 0.5148133897652943, + "flos": 515892063744.0, + "grad_norm": 0.054179282011379754, + "language_loss": 0.79421759, + "learning_rate": 0.0005003115422897968, + "loss": 0.80528009, + "num_input_tokens_seen": 223040944, + "router_z_loss_mlp": 0.32397461, + "step": 2676, + "time_per_iteration": 2.7511518001556396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101702, + "balance_loss_mlp": 1.0696342, + "epoch": 0.5150057714505579, + "flos": 511212446208.0, + "grad_norm": 0.06371145669365144, + "language_loss": 0.86998433, + "learning_rate": 0.0005, + "loss": 0.88100135, + "num_input_tokens_seen": 223109632, + "router_z_loss_mlp": 0.32055664, + "step": 2677, + "time_per_iteration": 2.6361911296844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096983, + "balance_loss_mlp": 1.06508231, + "epoch": 0.5151981531358215, + "flos": 910909877760.0, + "grad_norm": 0.06720272484805691, + "language_loss": 0.79773581, + "learning_rate": 0.0004996884577102033, + "loss": 0.80870569, + "num_input_tokens_seen": 223191648, + "router_z_loss_mlp": 0.3190918, + "step": 2678, + "time_per_iteration": 3.078381299972534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101165, + "balance_loss_mlp": 1.06726193, + "epoch": 0.515390534821085, + "flos": 471864434688.0, + "grad_norm": 0.05338815308435362, + "language_loss": 0.84963048, + "learning_rate": 0.000499376915541358, + "loss": 0.86064208, + "num_input_tokens_seen": 223265920, + "router_z_loss_mlp": 0.33911133, + "step": 2679, + "time_per_iteration": 2.6979198455810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096582, + "balance_loss_mlp": 1.06506324, + "epoch": 0.5155829165063486, + "flos": 650119468032.0, + "grad_norm": 0.0530977146452018, + "language_loss": 0.8140825, + "learning_rate": 0.0004990653736144155, + "loss": 0.82504833, + "num_input_tokens_seen": 223340688, + "router_z_loss_mlp": 0.31494141, + "step": 2680, + "time_per_iteration": 2.8514578342437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098157, + "balance_loss_mlp": 1.06547022, + "epoch": 0.5157752981916122, + "flos": 414262315008.0, + "grad_norm": 0.091547983778046, + "language_loss": 0.86229038, + "learning_rate": 0.0004987538320503271, + "loss": 0.87327194, + "num_input_tokens_seen": 223404064, + "router_z_loss_mlp": 0.3269043, + "step": 2681, + "time_per_iteration": 2.478638172149658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100486, + "balance_loss_mlp": 1.06798983, + "epoch": 0.5159676798768758, + "flos": 553841473536.0, + "grad_norm": 0.07018643811750969, + "language_loss": 0.83312553, + "learning_rate": 0.0004984422909700442, + "loss": 0.8441304, + "num_input_tokens_seen": 223476784, + "router_z_loss_mlp": 0.32495117, + "step": 2682, + "time_per_iteration": 2.6546084880828857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099092, + "balance_loss_mlp": 1.06783557, + "epoch": 0.5161600615621393, + "flos": 586510709760.0, + "grad_norm": 0.15069020701750013, + "language_loss": 0.84435642, + "learning_rate": 0.0004981307504945173, + "loss": 0.85534728, + "num_input_tokens_seen": 223542832, + "router_z_loss_mlp": 0.31225586, + "step": 2683, + "time_per_iteration": 2.71260929107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110147, + "balance_loss_mlp": 1.06914032, + "epoch": 0.5163524432474028, + "flos": 588843177984.0, + "grad_norm": 0.0559262102608404, + "language_loss": 0.89665949, + "learning_rate": 0.0004978192107446976, + "loss": 0.90767419, + "num_input_tokens_seen": 223617968, + "router_z_loss_mlp": 0.32324219, + "step": 2684, + "time_per_iteration": 2.767662763595581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097385, + "balance_loss_mlp": 1.06650972, + "epoch": 0.5165448249326664, + "flos": 503893840896.0, + "grad_norm": 0.06901755479732997, + "language_loss": 0.87345654, + "learning_rate": 0.0004975076718415353, + "loss": 0.88443041, + "num_input_tokens_seen": 223689504, + "router_z_loss_mlp": 0.30834961, + "step": 2685, + "time_per_iteration": 2.6287574768066406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110088, + "balance_loss_mlp": 1.06988525, + "epoch": 0.51673720661793, + "flos": 416760339456.0, + "grad_norm": 0.05502113672837593, + "language_loss": 0.91023147, + "learning_rate": 0.0004971961339059806, + "loss": 0.92124021, + "num_input_tokens_seen": 223752288, + "router_z_loss_mlp": 0.30957031, + "step": 2686, + "time_per_iteration": 2.4755725860595703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105178, + "balance_loss_mlp": 1.07256198, + "epoch": 0.5169295883031936, + "flos": 598971096576.0, + "grad_norm": 0.06476684801011888, + "language_loss": 0.84195554, + "learning_rate": 0.0004968845970589832, + "loss": 0.85300732, + "num_input_tokens_seen": 223822304, + "router_z_loss_mlp": 0.32617188, + "step": 2687, + "time_per_iteration": 2.6715877056121826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102414, + "balance_loss_mlp": 1.06896389, + "epoch": 0.517121969988457, + "flos": 556816343040.0, + "grad_norm": 0.0648303600022088, + "language_loss": 0.84613401, + "learning_rate": 0.0004965730614214926, + "loss": 0.85715812, + "num_input_tokens_seen": 223888592, + "router_z_loss_mlp": 0.3347168, + "step": 2688, + "time_per_iteration": 2.6734025478363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099959, + "balance_loss_mlp": 1.06720066, + "epoch": 0.5173143516737206, + "flos": 469445704704.0, + "grad_norm": 0.05675548235902804, + "language_loss": 0.85410345, + "learning_rate": 0.0004962615271144576, + "loss": 0.86510307, + "num_input_tokens_seen": 223952880, + "router_z_loss_mlp": 0.32739258, + "step": 2689, + "time_per_iteration": 2.4930050373077393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101842, + "balance_loss_mlp": 1.0703702, + "epoch": 0.5175067333589842, + "flos": 720065977344.0, + "grad_norm": 0.06418610502647971, + "language_loss": 0.82956815, + "learning_rate": 0.0004959499942588264, + "loss": 0.8405866, + "num_input_tokens_seen": 224030000, + "router_z_loss_mlp": 0.31469727, + "step": 2690, + "time_per_iteration": 2.904674768447876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078772, + "balance_loss_mlp": 1.070189, + "epoch": 0.5176991150442478, + "flos": 1466188480512.0, + "grad_norm": 0.04799778536167862, + "language_loss": 0.78200024, + "learning_rate": 0.0004956384629755469, + "loss": 0.79278797, + "num_input_tokens_seen": 224252384, + "router_z_loss_mlp": 0.0859375, + "step": 2691, + "time_per_iteration": 4.761531591415405 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105601, + "balance_loss_mlp": 1.07255602, + "epoch": 0.5178914967295114, + "flos": 612632222208.0, + "grad_norm": 0.051278898576550616, + "language_loss": 0.85877872, + "learning_rate": 0.0004953269333855661, + "loss": 0.86983472, + "num_input_tokens_seen": 224324640, + "router_z_loss_mlp": 0.33032227, + "step": 2692, + "time_per_iteration": 2.729318857192993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104257, + "balance_loss_mlp": 1.07328665, + "epoch": 0.5180838784147749, + "flos": 500926311936.0, + "grad_norm": 0.05911618517599564, + "language_loss": 0.84474307, + "learning_rate": 0.0004950154056098309, + "loss": 0.85578561, + "num_input_tokens_seen": 224398368, + "router_z_loss_mlp": 0.30932617, + "step": 2693, + "time_per_iteration": 2.6833436489105225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124437, + "balance_loss_mlp": 1.09158325, + "epoch": 0.5182762601000385, + "flos": 688832418816.0, + "grad_norm": 0.059128614865360495, + "language_loss": 0.83972096, + "learning_rate": 0.0004947038797692867, + "loss": 0.85096538, + "num_input_tokens_seen": 224465456, + "router_z_loss_mlp": 0.32861328, + "step": 2694, + "time_per_iteration": 2.82362961769104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119523, + "balance_loss_mlp": 1.08635902, + "epoch": 0.518468641785302, + "flos": 665611623936.0, + "grad_norm": 0.05692767589933962, + "language_loss": 0.77609885, + "learning_rate": 0.0004943923559848789, + "loss": 0.78729415, + "num_input_tokens_seen": 224540960, + "router_z_loss_mlp": 0.33178711, + "step": 2695, + "time_per_iteration": 2.7919468879699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123482, + "balance_loss_mlp": 1.09112859, + "epoch": 0.5186610234705656, + "flos": 566714465280.0, + "grad_norm": 0.06299979408052762, + "language_loss": 0.90267843, + "learning_rate": 0.0004940808343775515, + "loss": 0.91391325, + "num_input_tokens_seen": 224613200, + "router_z_loss_mlp": 0.32348633, + "step": 2696, + "time_per_iteration": 2.6863224506378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112015, + "balance_loss_mlp": 1.08748627, + "epoch": 0.5188534051558291, + "flos": 428879702016.0, + "grad_norm": 0.06289973384355804, + "language_loss": 0.82184958, + "learning_rate": 0.0004937693150682479, + "loss": 0.83305109, + "num_input_tokens_seen": 224677456, + "router_z_loss_mlp": 0.32666016, + "step": 2697, + "time_per_iteration": 2.5169589519500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124428, + "balance_loss_mlp": 1.09109747, + "epoch": 0.5190457868410927, + "flos": 546349971456.0, + "grad_norm": 0.0748090565006246, + "language_loss": 0.76575571, + "learning_rate": 0.0004934577981779107, + "loss": 0.77699995, + "num_input_tokens_seen": 224745600, + "router_z_loss_mlp": 0.33325195, + "step": 2698, + "time_per_iteration": 2.65891432762146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111403, + "balance_loss_mlp": 1.08103275, + "epoch": 0.5192381685263563, + "flos": 548605716480.0, + "grad_norm": 0.34709701447359415, + "language_loss": 0.81575179, + "learning_rate": 0.0004931462838274817, + "loss": 0.82689214, + "num_input_tokens_seen": 224826944, + "router_z_loss_mlp": 0.33007812, + "step": 2699, + "time_per_iteration": 2.829094648361206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113032, + "balance_loss_mlp": 1.09694147, + "epoch": 0.5194305502116199, + "flos": 575263544832.0, + "grad_norm": 0.06002024337813523, + "language_loss": 0.84538823, + "learning_rate": 0.0004928347721379011, + "loss": 0.85669148, + "num_input_tokens_seen": 224895280, + "router_z_loss_mlp": 0.33398438, + "step": 2700, + "time_per_iteration": 2.685887098312378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128853, + "balance_loss_mlp": 1.09499812, + "epoch": 0.5196229318968835, + "flos": 434258620416.0, + "grad_norm": 0.07280089907997458, + "language_loss": 0.82063133, + "learning_rate": 0.0004925232632301089, + "loss": 0.83191985, + "num_input_tokens_seen": 224961632, + "router_z_loss_mlp": 0.33886719, + "step": 2701, + "time_per_iteration": 2.5586745738983154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139592, + "balance_loss_mlp": 1.10711944, + "epoch": 0.5198153135821469, + "flos": 558881938944.0, + "grad_norm": 0.05869071142497867, + "language_loss": 0.7981168, + "learning_rate": 0.0004922117572250431, + "loss": 0.80951279, + "num_input_tokens_seen": 225032816, + "router_z_loss_mlp": 0.32495117, + "step": 2702, + "time_per_iteration": 2.652883768081665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154601, + "balance_loss_mlp": 1.12041199, + "epoch": 0.5200076952674105, + "flos": 565684051968.0, + "grad_norm": 0.08372395695209851, + "language_loss": 0.80792272, + "learning_rate": 0.0004919002542436414, + "loss": 0.8194688, + "num_input_tokens_seen": 225112736, + "router_z_loss_mlp": 0.34155273, + "step": 2703, + "time_per_iteration": 2.8069591522216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156311, + "balance_loss_mlp": 1.12131107, + "epoch": 0.5202000769526741, + "flos": 571186681344.0, + "grad_norm": 0.06918407740604555, + "language_loss": 0.81692028, + "learning_rate": 0.0004915887544068399, + "loss": 0.82848334, + "num_input_tokens_seen": 225182672, + "router_z_loss_mlp": 0.35009766, + "step": 2704, + "time_per_iteration": 2.6484997272491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159384, + "balance_loss_mlp": 1.12228656, + "epoch": 0.5203924586379377, + "flos": 694211337216.0, + "grad_norm": 0.0754612517988151, + "language_loss": 0.78528553, + "learning_rate": 0.0004912772578355736, + "loss": 0.79687935, + "num_input_tokens_seen": 225260272, + "router_z_loss_mlp": 0.37084961, + "step": 2705, + "time_per_iteration": 2.889177083969116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115407, + "balance_loss_mlp": 1.11825967, + "epoch": 0.5205848403232012, + "flos": 566509261824.0, + "grad_norm": 0.06509959827239385, + "language_loss": 0.83146906, + "learning_rate": 0.000490965764650776, + "loss": 0.84300983, + "num_input_tokens_seen": 225337120, + "router_z_loss_mlp": 0.3581543, + "step": 2706, + "time_per_iteration": 2.885923385620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115916, + "balance_loss_mlp": 1.12346911, + "epoch": 0.5207772220084648, + "flos": 1214259932160.0, + "grad_norm": 0.06296986612889613, + "language_loss": 0.82775491, + "learning_rate": 0.0004906542749733798, + "loss": 0.83934653, + "num_input_tokens_seen": 225433984, + "router_z_loss_mlp": 0.35693359, + "step": 2707, + "time_per_iteration": 3.6151185035705566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152032, + "balance_loss_mlp": 1.11653161, + "epoch": 0.5209696036937284, + "flos": 592843318272.0, + "grad_norm": 0.046885737032271585, + "language_loss": 0.85312223, + "learning_rate": 0.0004903427889243156, + "loss": 0.86464256, + "num_input_tokens_seen": 225512112, + "router_z_loss_mlp": 0.35498047, + "step": 2708, + "time_per_iteration": 2.8592212200164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169903, + "balance_loss_mlp": 1.13335371, + "epoch": 0.5211619853789919, + "flos": 522889468416.0, + "grad_norm": 0.07702072033180815, + "language_loss": 0.85470927, + "learning_rate": 0.0004900313066245134, + "loss": 0.86640829, + "num_input_tokens_seen": 225586944, + "router_z_loss_mlp": 0.36547852, + "step": 2709, + "time_per_iteration": 2.7046992778778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155719, + "balance_loss_mlp": 1.12145817, + "epoch": 0.5213543670642555, + "flos": 502799187456.0, + "grad_norm": 0.049948125939344834, + "language_loss": 0.80970949, + "learning_rate": 0.0004897198281949012, + "loss": 0.82126665, + "num_input_tokens_seen": 225657184, + "router_z_loss_mlp": 0.34277344, + "step": 2710, + "time_per_iteration": 2.728750228881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164738, + "balance_loss_mlp": 1.12837923, + "epoch": 0.521546748749519, + "flos": 585959712768.0, + "grad_norm": 0.06520397862885238, + "language_loss": 0.77818954, + "learning_rate": 0.0004894083537564057, + "loss": 0.78983688, + "num_input_tokens_seen": 225729968, + "router_z_loss_mlp": 0.36352539, + "step": 2711, + "time_per_iteration": 2.7362277507781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163972, + "balance_loss_mlp": 1.12913883, + "epoch": 0.5217391304347826, + "flos": 570119192064.0, + "grad_norm": 0.051241123094768644, + "language_loss": 0.81174654, + "learning_rate": 0.0004890968834299519, + "loss": 0.82338625, + "num_input_tokens_seen": 225801808, + "router_z_loss_mlp": 0.34838867, + "step": 2712, + "time_per_iteration": 2.768146514892578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156146, + "balance_loss_mlp": 1.12026405, + "epoch": 0.5219315121200462, + "flos": 542784457728.0, + "grad_norm": 0.05945211160457726, + "language_loss": 0.78877795, + "learning_rate": 0.0004887854173364633, + "loss": 0.80033934, + "num_input_tokens_seen": 225878576, + "router_z_loss_mlp": 0.35913086, + "step": 2713, + "time_per_iteration": 2.8356804847717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149792, + "balance_loss_mlp": 1.1157217, + "epoch": 0.5221238938053098, + "flos": 550310464512.0, + "grad_norm": 0.05274159181021226, + "language_loss": 0.81621301, + "learning_rate": 0.0004884739555968617, + "loss": 0.82771093, + "num_input_tokens_seen": 225960096, + "router_z_loss_mlp": 0.34057617, + "step": 2714, + "time_per_iteration": 2.831137180328369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102187, + "balance_loss_mlp": 1.09369898, + "epoch": 0.5223162754905732, + "flos": 1355174157312.0, + "grad_norm": 0.02923312307597506, + "language_loss": 0.78977054, + "learning_rate": 0.0004881624983320676, + "loss": 0.8007924, + "num_input_tokens_seen": 226184960, + "router_z_loss_mlp": 0.08496094, + "step": 2715, + "time_per_iteration": 4.95891547203064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149677, + "balance_loss_mlp": 1.11534512, + "epoch": 0.5225086571758368, + "flos": 567747076608.0, + "grad_norm": 0.06614932878153669, + "language_loss": 0.86865598, + "learning_rate": 0.0004878510456629992, + "loss": 0.88015276, + "num_input_tokens_seen": 226271328, + "router_z_loss_mlp": 0.34326172, + "step": 2716, + "time_per_iteration": 2.968658924102783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145792, + "balance_loss_mlp": 1.1120801, + "epoch": 0.5227010388611004, + "flos": 500158001664.0, + "grad_norm": 0.05224698034347332, + "language_loss": 0.8526777, + "learning_rate": 0.00048753959771057314, + "loss": 0.86413562, + "num_input_tokens_seen": 226340080, + "router_z_loss_mlp": 0.33740234, + "step": 2717, + "time_per_iteration": 2.6395833492279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140286, + "balance_loss_mlp": 1.10736012, + "epoch": 0.522893420546364, + "flos": 597656558592.0, + "grad_norm": 0.0584811227693513, + "language_loss": 0.83152837, + "learning_rate": 0.0004872281545957044, + "loss": 0.84293115, + "num_input_tokens_seen": 226415120, + "router_z_loss_mlp": 0.3293457, + "step": 2718, + "time_per_iteration": 2.7039849758148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135383, + "balance_loss_mlp": 1.10069275, + "epoch": 0.5230858022316276, + "flos": 664605803520.0, + "grad_norm": 0.050310473622198856, + "language_loss": 0.85946554, + "learning_rate": 0.0004869167164393055, + "loss": 0.87081933, + "num_input_tokens_seen": 226501200, + "router_z_loss_mlp": 0.34692383, + "step": 2719, + "time_per_iteration": 2.91475510597229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132518, + "balance_loss_mlp": 1.10028338, + "epoch": 0.5232781839168911, + "flos": 603843434496.0, + "grad_norm": 0.0697291023285212, + "language_loss": 0.89398658, + "learning_rate": 0.00048660528336228793, + "loss": 0.90531176, + "num_input_tokens_seen": 226582064, + "router_z_loss_mlp": 0.32226562, + "step": 2720, + "time_per_iteration": 2.792276620864868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124032, + "balance_loss_mlp": 1.09115386, + "epoch": 0.5234705656021547, + "flos": 550718300160.0, + "grad_norm": 0.05026677719306565, + "language_loss": 0.90367562, + "learning_rate": 0.0004862938554855606, + "loss": 0.91491592, + "num_input_tokens_seen": 226656448, + "router_z_loss_mlp": 0.32885742, + "step": 2721, + "time_per_iteration": 2.7964749336242676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129388, + "balance_loss_mlp": 1.09643817, + "epoch": 0.5236629472874182, + "flos": 504279281664.0, + "grad_norm": 0.0663768296863652, + "language_loss": 0.86310339, + "learning_rate": 0.0004859824329300304, + "loss": 0.87439728, + "num_input_tokens_seen": 226725568, + "router_z_loss_mlp": 0.32958984, + "step": 2722, + "time_per_iteration": 2.6039419174194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128053, + "balance_loss_mlp": 1.09403062, + "epoch": 0.5238553289726818, + "flos": 547654597632.0, + "grad_norm": 0.0581387375581185, + "language_loss": 0.84092689, + "learning_rate": 0.00048567101581660244, + "loss": 0.85220736, + "num_input_tokens_seen": 226795728, + "router_z_loss_mlp": 0.34033203, + "step": 2723, + "time_per_iteration": 2.5987517833709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125005, + "balance_loss_mlp": 1.09227037, + "epoch": 0.5240477106579453, + "flos": 531962380800.0, + "grad_norm": 0.06184026942262611, + "language_loss": 0.87479013, + "learning_rate": 0.00048535960426617956, + "loss": 0.88604021, + "num_input_tokens_seen": 226865344, + "router_z_loss_mlp": 0.32739258, + "step": 2724, + "time_per_iteration": 2.6038565635681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121549, + "balance_loss_mlp": 1.08724082, + "epoch": 0.5242400923432089, + "flos": 617939559936.0, + "grad_norm": 0.05825945935903347, + "language_loss": 0.81925243, + "learning_rate": 0.0004850481983996621, + "loss": 0.83046794, + "num_input_tokens_seen": 226936800, + "router_z_loss_mlp": 0.34350586, + "step": 2725, + "time_per_iteration": 2.7633490562438965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122144, + "balance_loss_mlp": 1.08907521, + "epoch": 0.5244324740284725, + "flos": 416686187520.0, + "grad_norm": 0.06367267368201004, + "language_loss": 0.88101065, + "learning_rate": 0.0004847367983379492, + "loss": 0.89223206, + "num_input_tokens_seen": 226998448, + "router_z_loss_mlp": 0.33081055, + "step": 2726, + "time_per_iteration": 2.520050287246704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119938, + "balance_loss_mlp": 1.08837104, + "epoch": 0.5246248557137361, + "flos": 626436509184.0, + "grad_norm": 0.059069616726974465, + "language_loss": 0.79169118, + "learning_rate": 0.00048442540420193643, + "loss": 0.80289054, + "num_input_tokens_seen": 227081872, + "router_z_loss_mlp": 0.31567383, + "step": 2727, + "time_per_iteration": 2.9363925457000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125304, + "balance_loss_mlp": 1.09278345, + "epoch": 0.5248172373989997, + "flos": 1248463590912.0, + "grad_norm": 0.06091521023817234, + "language_loss": 0.7936945, + "learning_rate": 0.0004841140161125182, + "loss": 0.8049475, + "num_input_tokens_seen": 227167744, + "router_z_loss_mlp": 0.32543945, + "step": 2728, + "time_per_iteration": 3.5786640644073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127178, + "balance_loss_mlp": 1.09666038, + "epoch": 0.5250096190842631, + "flos": 506868710400.0, + "grad_norm": 0.054648351094499156, + "language_loss": 0.85262787, + "learning_rate": 0.0004838026341905857, + "loss": 0.86389971, + "num_input_tokens_seen": 227239136, + "router_z_loss_mlp": 0.30517578, + "step": 2729, + "time_per_iteration": 2.7021641731262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113035, + "balance_loss_mlp": 1.09909368, + "epoch": 0.5252020007695267, + "flos": 611317684224.0, + "grad_norm": 0.06068419443661206, + "language_loss": 0.85131037, + "learning_rate": 0.00048349125855702844, + "loss": 0.8626138, + "num_input_tokens_seen": 227311968, + "router_z_loss_mlp": 0.3125, + "step": 2730, + "time_per_iteration": 2.794691562652588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129298, + "balance_loss_mlp": 1.09754109, + "epoch": 0.5253943824547903, + "flos": 539233998336.0, + "grad_norm": 0.0500444792759443, + "language_loss": 0.81508827, + "learning_rate": 0.00048317988933273287, + "loss": 0.82638121, + "num_input_tokens_seen": 227385248, + "router_z_loss_mlp": 0.31738281, + "step": 2731, + "time_per_iteration": 2.7251734733581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124341, + "balance_loss_mlp": 1.09291768, + "epoch": 0.5255867641400539, + "flos": 698038580736.0, + "grad_norm": 0.06596294225314246, + "language_loss": 0.82520533, + "learning_rate": 0.00048286852663858367, + "loss": 0.83644867, + "num_input_tokens_seen": 227464640, + "router_z_loss_mlp": 0.31420898, + "step": 2732, + "time_per_iteration": 2.972963571548462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120516, + "balance_loss_mlp": 1.0889498, + "epoch": 0.5257791458253175, + "flos": 667289207808.0, + "grad_norm": 0.055500139325311094, + "language_loss": 0.84107697, + "learning_rate": 0.000482557170595462, + "loss": 0.85228211, + "num_input_tokens_seen": 227542192, + "router_z_loss_mlp": 0.31542969, + "step": 2733, + "time_per_iteration": 2.858245849609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112503, + "balance_loss_mlp": 1.09401202, + "epoch": 0.525971527510581, + "flos": 483620751360.0, + "grad_norm": 0.13743136293517658, + "language_loss": 0.87933344, + "learning_rate": 0.0004822458213242475, + "loss": 0.89058375, + "num_input_tokens_seen": 227606096, + "router_z_loss_mlp": 0.31005859, + "step": 2734, + "time_per_iteration": 2.522383213043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112386, + "balance_loss_mlp": 1.08115363, + "epoch": 0.5261639091958445, + "flos": 829916264448.0, + "grad_norm": 0.05651199089550523, + "language_loss": 0.86197513, + "learning_rate": 0.00048193447894581627, + "loss": 0.87309897, + "num_input_tokens_seen": 227689552, + "router_z_loss_mlp": 0.31201172, + "step": 2735, + "time_per_iteration": 3.0866682529449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111368, + "balance_loss_mlp": 1.08235216, + "epoch": 0.5263562908811081, + "flos": 520715215872.0, + "grad_norm": 0.06879211849592783, + "language_loss": 0.88187921, + "learning_rate": 0.00048162314358104243, + "loss": 0.89301598, + "num_input_tokens_seen": 227760784, + "router_z_loss_mlp": 0.31298828, + "step": 2736, + "time_per_iteration": 2.5985138416290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108841, + "balance_loss_mlp": 1.07713127, + "epoch": 0.5265486725663717, + "flos": 574996672512.0, + "grad_norm": 0.05778047820427569, + "language_loss": 0.83687961, + "learning_rate": 0.0004813118153507969, + "loss": 0.84796798, + "num_input_tokens_seen": 227834304, + "router_z_loss_mlp": 0.31713867, + "step": 2737, + "time_per_iteration": 2.73371958732605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022846, + "balance_loss_mlp": 1.01416731, + "epoch": 0.5267410542516352, + "flos": 1547261015040.0, + "grad_norm": 0.01810308130118829, + "language_loss": 0.82447124, + "learning_rate": 0.0004810004943759482, + "loss": 0.83469975, + "num_input_tokens_seen": 228057232, + "router_z_loss_mlp": 0.08691406, + "step": 2738, + "time_per_iteration": 4.790890216827393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097772, + "balance_loss_mlp": 1.06670594, + "epoch": 0.5269334359368988, + "flos": 929952493056.0, + "grad_norm": 0.05745954748436515, + "language_loss": 0.83672923, + "learning_rate": 0.00048068918077736163, + "loss": 0.84770691, + "num_input_tokens_seen": 228140816, + "router_z_loss_mlp": 0.31030273, + "step": 2739, + "time_per_iteration": 3.239821195602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094061, + "balance_loss_mlp": 1.06309009, + "epoch": 0.5271258176221624, + "flos": 655389729792.0, + "grad_norm": 0.06477195420820829, + "language_loss": 0.81728363, + "learning_rate": 0.0004803778746759001, + "loss": 0.82822424, + "num_input_tokens_seen": 228216208, + "router_z_loss_mlp": 0.30932617, + "step": 2740, + "time_per_iteration": 2.942760944366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096968, + "balance_loss_mlp": 1.06614065, + "epoch": 0.527318199307426, + "flos": 543036648960.0, + "grad_norm": 0.05799868370730736, + "language_loss": 0.81935298, + "learning_rate": 0.00048006657619242317, + "loss": 0.83032262, + "num_input_tokens_seen": 228283184, + "router_z_loss_mlp": 0.30810547, + "step": 2741, + "time_per_iteration": 2.6397995948791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098549, + "balance_loss_mlp": 1.06550419, + "epoch": 0.5275105809926895, + "flos": 447882670080.0, + "grad_norm": 0.07558439368734231, + "language_loss": 0.78591353, + "learning_rate": 0.00047975528544778775, + "loss": 0.79689896, + "num_input_tokens_seen": 228351328, + "router_z_loss_mlp": 0.33056641, + "step": 2742, + "time_per_iteration": 2.694324493408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091173, + "balance_loss_mlp": 1.06058371, + "epoch": 0.527702962677953, + "flos": 578935143936.0, + "grad_norm": 0.06405052151098177, + "language_loss": 0.88749677, + "learning_rate": 0.00047944400256284754, + "loss": 0.89840853, + "num_input_tokens_seen": 228423632, + "router_z_loss_mlp": 0.30566406, + "step": 2743, + "time_per_iteration": 2.6816787719726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098065, + "balance_loss_mlp": 1.06809616, + "epoch": 0.5278953443632166, + "flos": 652773136896.0, + "grad_norm": 0.07088810283562207, + "language_loss": 0.80461031, + "learning_rate": 0.0004791327276584532, + "loss": 0.81559092, + "num_input_tokens_seen": 228498736, + "router_z_loss_mlp": 0.29956055, + "step": 2744, + "time_per_iteration": 2.8708317279815674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098246, + "balance_loss_mlp": 1.06596446, + "epoch": 0.5280877260484802, + "flos": 514001935872.0, + "grad_norm": 0.06685009455993486, + "language_loss": 0.8087393, + "learning_rate": 0.00047882146085545264, + "loss": 0.81972182, + "num_input_tokens_seen": 228569056, + "router_z_loss_mlp": 0.32250977, + "step": 2745, + "time_per_iteration": 2.610027551651001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021006, + "balance_loss_mlp": 1.01204121, + "epoch": 0.5282801077337438, + "flos": 1445460567552.0, + "grad_norm": 0.008936429220158798, + "language_loss": 0.75402379, + "learning_rate": 0.00047851020227469, + "loss": 0.76423383, + "num_input_tokens_seen": 228800560, + "router_z_loss_mlp": 0.08984375, + "step": 2746, + "time_per_iteration": 5.000555038452148 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097643, + "balance_loss_mlp": 1.06767416, + "epoch": 0.5284724894190073, + "flos": 604856595456.0, + "grad_norm": 0.06348628312729114, + "language_loss": 0.79553157, + "learning_rate": 0.00047819895203700684, + "loss": 0.806508, + "num_input_tokens_seen": 228869216, + "router_z_loss_mlp": 0.29907227, + "step": 2747, + "time_per_iteration": 2.7115635871887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017614, + "balance_loss_mlp": 1.0085541, + "epoch": 0.5286648711042709, + "flos": 1494956321280.0, + "grad_norm": 0.007776557121409109, + "language_loss": 0.75512433, + "learning_rate": 0.0004778877102632412, + "loss": 0.76530045, + "num_input_tokens_seen": 229085520, + "router_z_loss_mlp": 0.09082031, + "step": 2748, + "time_per_iteration": 4.672155141830444 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092605, + "balance_loss_mlp": 1.06263614, + "epoch": 0.5288572527895344, + "flos": 597616911360.0, + "grad_norm": 0.06781776650114792, + "language_loss": 0.8852309, + "learning_rate": 0.0004775764770742277, + "loss": 0.89615691, + "num_input_tokens_seen": 229160912, + "router_z_loss_mlp": 0.29931641, + "step": 2749, + "time_per_iteration": 2.8029801845550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097703, + "balance_loss_mlp": 1.06542146, + "epoch": 0.529049634474798, + "flos": 557320352256.0, + "grad_norm": 0.07126893850665976, + "language_loss": 0.86776084, + "learning_rate": 0.00047726525259079777, + "loss": 0.87873781, + "num_input_tokens_seen": 229235792, + "router_z_loss_mlp": 0.32299805, + "step": 2750, + "time_per_iteration": 2.7803709506988525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097184, + "balance_loss_mlp": 1.06568849, + "epoch": 0.5292420161600616, + "flos": 581274952704.0, + "grad_norm": 0.07487878206236488, + "language_loss": 0.88641649, + "learning_rate": 0.0004769540369337798, + "loss": 0.89738834, + "num_input_tokens_seen": 229309984, + "router_z_loss_mlp": 0.31469727, + "step": 2751, + "time_per_iteration": 2.7477662563323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103352, + "balance_loss_mlp": 1.07166588, + "epoch": 0.5294343978453251, + "flos": 608303167488.0, + "grad_norm": 0.06303354467879724, + "language_loss": 0.86111081, + "learning_rate": 0.00047664283022399794, + "loss": 0.87214434, + "num_input_tokens_seen": 229394000, + "router_z_loss_mlp": 0.31665039, + "step": 2752, + "time_per_iteration": 2.8321616649627686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111513, + "balance_loss_mlp": 1.08142424, + "epoch": 0.5296267795305887, + "flos": 646522020864.0, + "grad_norm": 0.1009265551294561, + "language_loss": 0.81372654, + "learning_rate": 0.00047633163258227376, + "loss": 0.82484162, + "num_input_tokens_seen": 229474320, + "router_z_loss_mlp": 0.30053711, + "step": 2753, + "time_per_iteration": 2.866710662841797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107072, + "balance_loss_mlp": 1.07536244, + "epoch": 0.5298191612158523, + "flos": 559746796032.0, + "grad_norm": 0.06597410250171662, + "language_loss": 0.85720521, + "learning_rate": 0.0004760204441294247, + "loss": 0.86827588, + "num_input_tokens_seen": 229543072, + "router_z_loss_mlp": 0.31689453, + "step": 2754, + "time_per_iteration": 2.635411500930786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123028, + "balance_loss_mlp": 1.09172344, + "epoch": 0.5300115429011159, + "flos": 514046352384.0, + "grad_norm": 0.06814428712155127, + "language_loss": 0.86859232, + "learning_rate": 0.00047570926498626486, + "loss": 0.87982261, + "num_input_tokens_seen": 229615296, + "router_z_loss_mlp": 0.31274414, + "step": 2755, + "time_per_iteration": 2.678027629852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139797, + "balance_loss_mlp": 1.10846841, + "epoch": 0.5302039245863793, + "flos": 672789265920.0, + "grad_norm": 0.05259166917973927, + "language_loss": 0.8179211, + "learning_rate": 0.00047539809527360474, + "loss": 0.82931906, + "num_input_tokens_seen": 229693728, + "router_z_loss_mlp": 0.31298828, + "step": 2756, + "time_per_iteration": 2.8505630493164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139868, + "balance_loss_mlp": 1.1087544, + "epoch": 0.5303963062716429, + "flos": 730836297216.0, + "grad_norm": 0.23589307030508885, + "language_loss": 0.82282543, + "learning_rate": 0.0004750869351122511, + "loss": 0.83422416, + "num_input_tokens_seen": 229772144, + "router_z_loss_mlp": 0.31079102, + "step": 2757, + "time_per_iteration": 3.007599353790283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114789, + "balance_loss_mlp": 1.11598992, + "epoch": 0.5305886879569065, + "flos": 573435085824.0, + "grad_norm": 0.06932827369161218, + "language_loss": 0.81883401, + "learning_rate": 0.00047477578462300685, + "loss": 0.83031291, + "num_input_tokens_seen": 229847024, + "router_z_loss_mlp": 0.31884766, + "step": 2758, + "time_per_iteration": 2.7112765312194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144814, + "balance_loss_mlp": 1.11215043, + "epoch": 0.5307810696421701, + "flos": 695335352832.0, + "grad_norm": 0.060390901611552056, + "language_loss": 0.79751188, + "learning_rate": 0.0004744646439266718, + "loss": 0.80895996, + "num_input_tokens_seen": 229932416, + "router_z_loss_mlp": 0.32641602, + "step": 2759, + "time_per_iteration": 2.9956624507904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141525, + "balance_loss_mlp": 1.10905194, + "epoch": 0.5309734513274337, + "flos": 648943322112.0, + "grad_norm": 0.0692957942514688, + "language_loss": 0.92371601, + "learning_rate": 0.000474153513144041, + "loss": 0.93513119, + "num_input_tokens_seen": 230010976, + "router_z_loss_mlp": 0.32470703, + "step": 2760, + "time_per_iteration": 2.902304172515869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114025, + "balance_loss_mlp": 1.10756326, + "epoch": 0.5311658330126972, + "flos": 604824288768.0, + "grad_norm": 0.06953135792158749, + "language_loss": 0.87208283, + "learning_rate": 0.00047384239239590633, + "loss": 0.88348538, + "num_input_tokens_seen": 230093344, + "router_z_loss_mlp": 0.3269043, + "step": 2761, + "time_per_iteration": 2.9197542667388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127002, + "balance_loss_mlp": 1.09414792, + "epoch": 0.5313582146979607, + "flos": 558259361280.0, + "grad_norm": 0.06520154041113266, + "language_loss": 0.89041948, + "learning_rate": 0.0004735312818030556, + "loss": 0.90168953, + "num_input_tokens_seen": 230165520, + "router_z_loss_mlp": 0.32861328, + "step": 2762, + "time_per_iteration": 2.699882745742798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128123, + "balance_loss_mlp": 1.0964613, + "epoch": 0.5315505963832243, + "flos": 508410473472.0, + "grad_norm": 0.0963196289257929, + "language_loss": 0.83125454, + "learning_rate": 0.0004732201814862727, + "loss": 0.84253573, + "num_input_tokens_seen": 230237808, + "router_z_loss_mlp": 0.31640625, + "step": 2763, + "time_per_iteration": 2.7439024448394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113884, + "balance_loss_mlp": 1.08155453, + "epoch": 0.5317429780684879, + "flos": 626439080448.0, + "grad_norm": 0.058489246415432364, + "language_loss": 0.81845987, + "learning_rate": 0.0004729090915663373, + "loss": 0.82959872, + "num_input_tokens_seen": 230321568, + "router_z_loss_mlp": 0.32324219, + "step": 2764, + "time_per_iteration": 2.880218029022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112044, + "balance_loss_mlp": 1.07930923, + "epoch": 0.5319353597537514, + "flos": 476744486400.0, + "grad_norm": 0.08176902294326427, + "language_loss": 0.85593212, + "learning_rate": 0.00047259801216402534, + "loss": 0.86705256, + "num_input_tokens_seen": 230385376, + "router_z_loss_mlp": 0.32739258, + "step": 2765, + "time_per_iteration": 2.5215423107147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113516, + "balance_loss_mlp": 1.0809716, + "epoch": 0.532127741439015, + "flos": 501635524608.0, + "grad_norm": 0.0984419544464696, + "language_loss": 0.86589384, + "learning_rate": 0.00047228694340010845, + "loss": 0.87702894, + "num_input_tokens_seen": 230449760, + "router_z_loss_mlp": 0.32543945, + "step": 2766, + "time_per_iteration": 2.615323781967163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106832, + "balance_loss_mlp": 1.07288122, + "epoch": 0.5323201231242786, + "flos": 1164586512384.0, + "grad_norm": 0.06857994992356635, + "language_loss": 0.85894436, + "learning_rate": 0.0004719758853953544, + "loss": 0.87001264, + "num_input_tokens_seen": 230536592, + "router_z_loss_mlp": 0.33984375, + "step": 2767, + "time_per_iteration": 3.580965042114258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109799, + "balance_loss_mlp": 1.07475162, + "epoch": 0.5325125048095422, + "flos": 378702273024.0, + "grad_norm": 0.07966941077078553, + "language_loss": 0.84403044, + "learning_rate": 0.00047166483827052645, + "loss": 0.85512847, + "num_input_tokens_seen": 230596688, + "router_z_loss_mlp": 0.35083008, + "step": 2768, + "time_per_iteration": 2.3937976360321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112761, + "balance_loss_mlp": 1.09797895, + "epoch": 0.5327048864948057, + "flos": 1541353121280.0, + "grad_norm": 0.05218838233145069, + "language_loss": 0.77078491, + "learning_rate": 0.00047135380214638413, + "loss": 0.78191251, + "num_input_tokens_seen": 230829408, + "router_z_loss_mlp": 0.14746094, + "step": 2769, + "time_per_iteration": 4.980372905731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112083, + "balance_loss_mlp": 1.07910919, + "epoch": 0.5328972681800692, + "flos": 911272923648.0, + "grad_norm": 0.05422451751257763, + "language_loss": 0.8393681, + "learning_rate": 0.000471042777143682, + "loss": 0.8504889, + "num_input_tokens_seen": 230912528, + "router_z_loss_mlp": 0.32958984, + "step": 2770, + "time_per_iteration": 3.2559990882873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109219, + "balance_loss_mlp": 1.07576907, + "epoch": 0.5330896498653328, + "flos": 473898097152.0, + "grad_norm": 0.05619534531580183, + "language_loss": 0.79500479, + "learning_rate": 0.0004707317633831707, + "loss": 0.80609697, + "num_input_tokens_seen": 230979424, + "router_z_loss_mlp": 0.3347168, + "step": 2771, + "time_per_iteration": 2.580369472503662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113861, + "balance_loss_mlp": 1.07976723, + "epoch": 0.5332820315505964, + "flos": 501635524608.0, + "grad_norm": 0.07426752742264173, + "language_loss": 0.78140616, + "learning_rate": 0.00047042076098559673, + "loss": 0.79254484, + "num_input_tokens_seen": 231046416, + "router_z_loss_mlp": 0.34130859, + "step": 2772, + "time_per_iteration": 2.656357765197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115026, + "balance_loss_mlp": 1.08131373, + "epoch": 0.53347441323586, + "flos": 924439951872.0, + "grad_norm": 0.07148667655520102, + "language_loss": 0.74185407, + "learning_rate": 0.00047010977007170174, + "loss": 0.75300431, + "num_input_tokens_seen": 231136064, + "router_z_loss_mlp": 0.3371582, + "step": 2773, + "time_per_iteration": 3.2167580127716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103553, + "balance_loss_mlp": 1.07079434, + "epoch": 0.5336667949211235, + "flos": 574455587328.0, + "grad_norm": 0.05649801417476766, + "language_loss": 0.82702589, + "learning_rate": 0.00046979879076222334, + "loss": 0.83806139, + "num_input_tokens_seen": 231203616, + "router_z_loss_mlp": 0.32763672, + "step": 2774, + "time_per_iteration": 2.6618025302886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109156, + "balance_loss_mlp": 1.07689798, + "epoch": 0.533859176606387, + "flos": 1064664082944.0, + "grad_norm": 0.05944272870619304, + "language_loss": 0.85247773, + "learning_rate": 0.0004694878231778939, + "loss": 0.86356932, + "num_input_tokens_seen": 231287008, + "router_z_loss_mlp": 0.32250977, + "step": 2775, + "time_per_iteration": 3.381577968597412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105801, + "balance_loss_mlp": 1.07459164, + "epoch": 0.5340515582916506, + "flos": 746602665984.0, + "grad_norm": 0.05869389504796052, + "language_loss": 0.84721255, + "learning_rate": 0.0004691768674394423, + "loss": 0.85827059, + "num_input_tokens_seen": 231365296, + "router_z_loss_mlp": 0.31176758, + "step": 2776, + "time_per_iteration": 2.9549882411956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041745, + "balance_loss_mlp": 1.03230345, + "epoch": 0.5342439399769142, + "flos": 1445685594624.0, + "grad_norm": 0.020468065913813137, + "language_loss": 0.84484011, + "learning_rate": 0.0004688659236675918, + "loss": 0.85525757, + "num_input_tokens_seen": 231579040, + "router_z_loss_mlp": 0.09423828, + "step": 2777, + "time_per_iteration": 4.780264139175415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039247, + "balance_loss_mlp": 1.03013933, + "epoch": 0.5344363216621778, + "flos": 1427569505280.0, + "grad_norm": 0.02045845897293101, + "language_loss": 0.76653534, + "learning_rate": 0.00046855499198306187, + "loss": 0.77692783, + "num_input_tokens_seen": 231812736, + "router_z_loss_mlp": 0.09130859, + "step": 2778, + "time_per_iteration": 5.030272960662842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101269, + "balance_loss_mlp": 1.06870127, + "epoch": 0.5346287033474413, + "flos": 527618644992.0, + "grad_norm": 0.06089610481991967, + "language_loss": 0.7961477, + "learning_rate": 0.00046824407250656676, + "loss": 0.80716044, + "num_input_tokens_seen": 231883840, + "router_z_loss_mlp": 0.32568359, + "step": 2779, + "time_per_iteration": 2.6681063175201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096395, + "balance_loss_mlp": 1.06537652, + "epoch": 0.5348210850327049, + "flos": 510762765312.0, + "grad_norm": 0.04990324067280663, + "language_loss": 0.83819127, + "learning_rate": 0.0004679331653588161, + "loss": 0.84915525, + "num_input_tokens_seen": 231955360, + "router_z_loss_mlp": 0.30981445, + "step": 2780, + "time_per_iteration": 2.635774612426758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092346, + "balance_loss_mlp": 1.05999231, + "epoch": 0.5350134667179685, + "flos": 462668184576.0, + "grad_norm": 0.06684745885443293, + "language_loss": 0.85806221, + "learning_rate": 0.0004676222706605147, + "loss": 0.86898565, + "num_input_tokens_seen": 232027088, + "router_z_loss_mlp": 0.32348633, + "step": 2781, + "time_per_iteration": 2.6137733459472656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092057, + "balance_loss_mlp": 1.05886936, + "epoch": 0.535205848403232, + "flos": 708875712000.0, + "grad_norm": 0.08426708268962642, + "language_loss": 0.85464495, + "learning_rate": 0.0004673113885323626, + "loss": 0.86556554, + "num_input_tokens_seen": 232099472, + "router_z_loss_mlp": 0.33203125, + "step": 2782, + "time_per_iteration": 2.861581802368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083804, + "balance_loss_mlp": 1.05083072, + "epoch": 0.5353982300884956, + "flos": 894241575936.0, + "grad_norm": 0.060311716473253056, + "language_loss": 0.78792584, + "learning_rate": 0.00046700051909505494, + "loss": 0.79876387, + "num_input_tokens_seen": 232182528, + "router_z_loss_mlp": 0.32983398, + "step": 2783, + "time_per_iteration": 3.182298183441162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089723, + "balance_loss_mlp": 1.05407953, + "epoch": 0.5355906117737591, + "flos": 535965092352.0, + "grad_norm": 0.06678042842361867, + "language_loss": 0.84239137, + "learning_rate": 0.000466689662469282, + "loss": 0.85328859, + "num_input_tokens_seen": 232253344, + "router_z_loss_mlp": 0.35644531, + "step": 2784, + "time_per_iteration": 2.6519503593444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082012, + "balance_loss_mlp": 1.04891968, + "epoch": 0.5357829934590227, + "flos": 868846528512.0, + "grad_norm": 0.06002174049054728, + "language_loss": 0.83905756, + "learning_rate": 0.00046637881877572917, + "loss": 0.84987772, + "num_input_tokens_seen": 232337232, + "router_z_loss_mlp": 0.33105469, + "step": 2785, + "time_per_iteration": 3.1058127880096436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084703, + "balance_loss_mlp": 1.051754, + "epoch": 0.5359753751442863, + "flos": 553287905280.0, + "grad_norm": 0.0580195679012457, + "language_loss": 0.8490684, + "learning_rate": 0.0004660679881350764, + "loss": 0.85991538, + "num_input_tokens_seen": 232412864, + "router_z_loss_mlp": 0.32958984, + "step": 2786, + "time_per_iteration": 2.77021861076355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053559, + "balance_loss_mlp": 1.0447371, + "epoch": 0.5361677568295499, + "flos": 1480499347968.0, + "grad_norm": 0.032864625150969516, + "language_loss": 0.75608146, + "learning_rate": 0.0004657571706679988, + "loss": 0.76661706, + "num_input_tokens_seen": 232639888, + "router_z_loss_mlp": 0.08837891, + "step": 2787, + "time_per_iteration": 5.029211044311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087215, + "balance_loss_mlp": 1.05335903, + "epoch": 0.5363601385148133, + "flos": 806255700480.0, + "grad_norm": 0.07679411484967892, + "language_loss": 0.77928644, + "learning_rate": 0.0004654463664951667, + "loss": 0.79015857, + "num_input_tokens_seen": 232719248, + "router_z_loss_mlp": 0.33886719, + "step": 2788, + "time_per_iteration": 2.9762089252471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088019, + "balance_loss_mlp": 1.05464029, + "epoch": 0.5365525202000769, + "flos": 507879300096.0, + "grad_norm": 0.06025701653165108, + "language_loss": 0.83150423, + "learning_rate": 0.0004651355757372447, + "loss": 0.84238434, + "num_input_tokens_seen": 232788464, + "router_z_loss_mlp": 0.33398438, + "step": 2789, + "time_per_iteration": 2.5971946716308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089252, + "balance_loss_mlp": 1.05604005, + "epoch": 0.5367449018853405, + "flos": 528930611712.0, + "grad_norm": 0.08338964083328992, + "language_loss": 0.8607617, + "learning_rate": 0.00046482479851489274, + "loss": 0.87165421, + "num_input_tokens_seen": 232859792, + "router_z_loss_mlp": 0.33227539, + "step": 2790, + "time_per_iteration": 2.6431193351745605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109405, + "balance_loss_mlp": 1.06119633, + "epoch": 0.5369372835706041, + "flos": 649934088192.0, + "grad_norm": 0.07763218475438792, + "language_loss": 0.77609432, + "learning_rate": 0.00046451403494876525, + "loss": 0.78703481, + "num_input_tokens_seen": 232941472, + "router_z_loss_mlp": 0.32861328, + "step": 2791, + "time_per_iteration": 2.860164165496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092942, + "balance_loss_mlp": 1.05918157, + "epoch": 0.5371296652558677, + "flos": 584489530368.0, + "grad_norm": 0.06279789357775317, + "language_loss": 0.84532517, + "learning_rate": 0.0004642032851595111, + "loss": 0.85625458, + "num_input_tokens_seen": 233017120, + "router_z_loss_mlp": 0.33789062, + "step": 2792, + "time_per_iteration": 2.7511003017425537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106074, + "balance_loss_mlp": 1.07081246, + "epoch": 0.5373220469411312, + "flos": 595872516096.0, + "grad_norm": 0.05029896863334896, + "language_loss": 0.85103881, + "learning_rate": 0.00046389254926777404, + "loss": 0.86209953, + "num_input_tokens_seen": 233095408, + "router_z_loss_mlp": 0.35253906, + "step": 2793, + "time_per_iteration": 2.7946324348449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105034, + "balance_loss_mlp": 1.07229924, + "epoch": 0.5375144286263948, + "flos": 1114426335744.0, + "grad_norm": 0.05473465194283574, + "language_loss": 0.78127646, + "learning_rate": 0.0004635818273941926, + "loss": 0.79232681, + "num_input_tokens_seen": 233191056, + "router_z_loss_mlp": 0.32739258, + "step": 2794, + "time_per_iteration": 3.5742921829223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109863, + "balance_loss_mlp": 1.07641304, + "epoch": 0.5377068103116583, + "flos": 595608215040.0, + "grad_norm": 0.07615315185796866, + "language_loss": 0.82079315, + "learning_rate": 0.0004632711196593997, + "loss": 0.83189178, + "num_input_tokens_seen": 233265536, + "router_z_loss_mlp": 0.3347168, + "step": 2795, + "time_per_iteration": 2.7694544792175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110741, + "balance_loss_mlp": 1.07907939, + "epoch": 0.5378991919969219, + "flos": 884200292352.0, + "grad_norm": 0.07020702036152926, + "language_loss": 0.85457337, + "learning_rate": 0.00046296042618402297, + "loss": 0.86568069, + "num_input_tokens_seen": 233348224, + "router_z_loss_mlp": 0.31640625, + "step": 2796, + "time_per_iteration": 3.0587034225463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109738, + "balance_loss_mlp": 1.07883883, + "epoch": 0.5380915736821854, + "flos": 710664523776.0, + "grad_norm": 0.06759922925686453, + "language_loss": 0.7969842, + "learning_rate": 0.0004626497470886839, + "loss": 0.80808163, + "num_input_tokens_seen": 233429344, + "router_z_loss_mlp": 0.30883789, + "step": 2797, + "time_per_iteration": 3.002824068069458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105945, + "balance_loss_mlp": 1.07299602, + "epoch": 0.538283955367449, + "flos": 556999151616.0, + "grad_norm": 0.07466819588637175, + "language_loss": 0.82158947, + "learning_rate": 0.00046233908249399897, + "loss": 0.83264899, + "num_input_tokens_seen": 233504944, + "router_z_loss_mlp": 0.32958984, + "step": 2798, + "time_per_iteration": 2.7746241092681885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097876, + "balance_loss_mlp": 1.06559372, + "epoch": 0.5384763370527126, + "flos": 513470762496.0, + "grad_norm": 0.05453000178981586, + "language_loss": 0.78238356, + "learning_rate": 0.00046202843252057905, + "loss": 0.79336226, + "num_input_tokens_seen": 233573072, + "router_z_loss_mlp": 0.32275391, + "step": 2799, + "time_per_iteration": 2.581350803375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097308, + "balance_loss_mlp": 1.06478727, + "epoch": 0.5386687187379762, + "flos": 489736046592.0, + "grad_norm": 0.06584834464031906, + "language_loss": 0.84020996, + "learning_rate": 0.00046171779728902896, + "loss": 0.85118306, + "num_input_tokens_seen": 233640896, + "router_z_loss_mlp": 0.32495117, + "step": 2800, + "time_per_iteration": 2.577760934829712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092142, + "balance_loss_mlp": 1.05988431, + "epoch": 0.5388611004232398, + "flos": 482657149440.0, + "grad_norm": 0.0769580423168035, + "language_loss": 0.85918987, + "learning_rate": 0.000461407176919948, + "loss": 0.87011129, + "num_input_tokens_seen": 233703904, + "router_z_loss_mlp": 0.32250977, + "step": 2801, + "time_per_iteration": 2.5490942001342773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093913, + "balance_loss_mlp": 1.06189322, + "epoch": 0.5390534821085032, + "flos": 560984610816.0, + "grad_norm": 0.05361052263899676, + "language_loss": 0.85168314, + "learning_rate": 0.00046109657153392997, + "loss": 0.86262226, + "num_input_tokens_seen": 233779248, + "router_z_loss_mlp": 0.32006836, + "step": 2802, + "time_per_iteration": 2.7699196338653564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095158, + "balance_loss_mlp": 1.06132686, + "epoch": 0.5392458637937668, + "flos": 488377092096.0, + "grad_norm": 0.07003946535384918, + "language_loss": 0.82877356, + "learning_rate": 0.0004607859812515622, + "loss": 0.83972514, + "num_input_tokens_seen": 233847520, + "router_z_loss_mlp": 0.33862305, + "step": 2803, + "time_per_iteration": 2.6007485389709473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093716, + "balance_loss_mlp": 1.06198251, + "epoch": 0.5394382454790304, + "flos": 512057479680.0, + "grad_norm": 0.06322278970979951, + "language_loss": 0.88066649, + "learning_rate": 0.00046047540619342667, + "loss": 0.89160359, + "num_input_tokens_seen": 233911328, + "router_z_loss_mlp": 0.31713867, + "step": 2804, + "time_per_iteration": 2.5943124294281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092603, + "balance_loss_mlp": 1.06163239, + "epoch": 0.539630627164294, + "flos": 567586662912.0, + "grad_norm": 0.060964528711389604, + "language_loss": 0.80115181, + "learning_rate": 0.00046016484648009933, + "loss": 0.81207782, + "num_input_tokens_seen": 233987104, + "router_z_loss_mlp": 0.30957031, + "step": 2805, + "time_per_iteration": 2.707387924194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096878, + "balance_loss_mlp": 1.0659312, + "epoch": 0.5398230088495575, + "flos": 526462322688.0, + "grad_norm": 0.05960799154457967, + "language_loss": 0.80838758, + "learning_rate": 0.0004598543022321501, + "loss": 0.81935638, + "num_input_tokens_seen": 234057216, + "router_z_loss_mlp": 0.30908203, + "step": 2806, + "time_per_iteration": 2.606360673904419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103257, + "balance_loss_mlp": 1.07080865, + "epoch": 0.5400153905348211, + "flos": 538764493824.0, + "grad_norm": 0.059370042319646085, + "language_loss": 0.80030453, + "learning_rate": 0.0004595437735701433, + "loss": 0.81133705, + "num_input_tokens_seen": 234129984, + "router_z_loss_mlp": 0.32446289, + "step": 2807, + "time_per_iteration": 2.674914836883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096509, + "balance_loss_mlp": 1.06448901, + "epoch": 0.5402077722200846, + "flos": 513539771904.0, + "grad_norm": 0.07129928038264445, + "language_loss": 0.83467567, + "learning_rate": 0.00045923326061463623, + "loss": 0.84564078, + "num_input_tokens_seen": 234203920, + "router_z_loss_mlp": 0.32006836, + "step": 2808, + "time_per_iteration": 2.7732136249542236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093728, + "balance_loss_mlp": 1.0615654, + "epoch": 0.5404001539053482, + "flos": 676258232832.0, + "grad_norm": 0.061183409959599915, + "language_loss": 0.81861985, + "learning_rate": 0.00045892276348618113, + "loss": 0.82955706, + "num_input_tokens_seen": 234285440, + "router_z_loss_mlp": 0.3215332, + "step": 2809, + "time_per_iteration": 2.9496963024139404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041005, + "balance_loss_mlp": 1.03318524, + "epoch": 0.5405925355906118, + "flos": 1554834009600.0, + "grad_norm": 0.03295349175272743, + "language_loss": 0.78260827, + "learning_rate": 0.0004586122823053235, + "loss": 0.79301834, + "num_input_tokens_seen": 234521424, + "router_z_loss_mlp": 0.078125, + "step": 2810, + "time_per_iteration": 4.980771064758301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095175, + "balance_loss_mlp": 1.06375122, + "epoch": 0.5407849172758753, + "flos": 647310154752.0, + "grad_norm": 0.048089637178950914, + "language_loss": 0.80807102, + "learning_rate": 0.000458301817192603, + "loss": 0.81902277, + "num_input_tokens_seen": 234601632, + "router_z_loss_mlp": 0.31396484, + "step": 2811, + "time_per_iteration": 2.819394111633301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014174, + "balance_loss_mlp": 1.00659227, + "epoch": 0.5409772989611389, + "flos": 1407407643648.0, + "grad_norm": 0.018125943247431338, + "language_loss": 0.8084178, + "learning_rate": 0.00045799136826855263, + "loss": 0.81855953, + "num_input_tokens_seen": 234825776, + "router_z_loss_mlp": 0.07568359, + "step": 2812, + "time_per_iteration": 4.830869197845459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094625, + "balance_loss_mlp": 1.06312966, + "epoch": 0.5411696806464025, + "flos": 554389899264.0, + "grad_norm": 0.07142535441885249, + "language_loss": 0.8774603, + "learning_rate": 0.00045768093565369983, + "loss": 0.88840652, + "num_input_tokens_seen": 234901504, + "router_z_loss_mlp": 0.31494141, + "step": 2813, + "time_per_iteration": 2.7351324558258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101251, + "balance_loss_mlp": 1.06911242, + "epoch": 0.5413620623316661, + "flos": 528122654208.0, + "grad_norm": 0.0566212514723048, + "language_loss": 0.82215679, + "learning_rate": 0.0004573705194685646, + "loss": 0.83316934, + "num_input_tokens_seen": 234970288, + "router_z_loss_mlp": 0.32128906, + "step": 2814, + "time_per_iteration": 2.6945576667785645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100716, + "balance_loss_mlp": 1.06860089, + "epoch": 0.5415544440169295, + "flos": 598741300224.0, + "grad_norm": 0.06333436634677812, + "language_loss": 0.85428321, + "learning_rate": 0.00045706011983366157, + "loss": 0.86529034, + "num_input_tokens_seen": 235039984, + "router_z_loss_mlp": 0.32080078, + "step": 2815, + "time_per_iteration": 2.681619882583618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108066, + "balance_loss_mlp": 1.07623768, + "epoch": 0.5417468257021931, + "flos": 470757671424.0, + "grad_norm": 0.068256039366798, + "language_loss": 0.8269453, + "learning_rate": 0.00045674973686949847, + "loss": 0.83802599, + "num_input_tokens_seen": 235105232, + "router_z_loss_mlp": 0.31835938, + "step": 2816, + "time_per_iteration": 2.5405073165893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109346, + "balance_loss_mlp": 1.07830381, + "epoch": 0.5419392073874567, + "flos": 680819281920.0, + "grad_norm": 0.0555657817841838, + "language_loss": 0.85590029, + "learning_rate": 0.0004564393706965766, + "loss": 0.86699367, + "num_input_tokens_seen": 235192560, + "router_z_loss_mlp": 0.31005859, + "step": 2817, + "time_per_iteration": 2.9834089279174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102481, + "balance_loss_mlp": 1.07079506, + "epoch": 0.5421315890727203, + "flos": 462374148096.0, + "grad_norm": 0.052731051534337416, + "language_loss": 0.81111342, + "learning_rate": 0.00045612902143539116, + "loss": 0.82213825, + "num_input_tokens_seen": 235258448, + "router_z_loss_mlp": 0.31665039, + "step": 2818, + "time_per_iteration": 2.5867249965667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099209, + "balance_loss_mlp": 1.06935942, + "epoch": 0.5423239707579839, + "flos": 436959277056.0, + "grad_norm": 0.08027643777933474, + "language_loss": 0.82169372, + "learning_rate": 0.00045581868920642986, + "loss": 0.83268583, + "num_input_tokens_seen": 235322176, + "router_z_loss_mlp": 0.29833984, + "step": 2819, + "time_per_iteration": 2.538219928741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100605, + "balance_loss_mlp": 1.06968212, + "epoch": 0.5425163524432474, + "flos": 458314536960.0, + "grad_norm": 0.056746529630016036, + "language_loss": 0.79290533, + "learning_rate": 0.00045550837413017457, + "loss": 0.80391139, + "num_input_tokens_seen": 235390960, + "router_z_loss_mlp": 0.30883789, + "step": 2820, + "time_per_iteration": 2.6461877822875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100089, + "balance_loss_mlp": 1.06995249, + "epoch": 0.542708734128511, + "flos": 419495500800.0, + "grad_norm": 0.06471497165860861, + "language_loss": 0.85196662, + "learning_rate": 0.0004551980763271005, + "loss": 0.86296749, + "num_input_tokens_seen": 235460976, + "router_z_loss_mlp": 0.30102539, + "step": 2821, + "time_per_iteration": 2.6883745193481445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102076, + "balance_loss_mlp": 1.07015133, + "epoch": 0.5429011158137745, + "flos": 678454880256.0, + "grad_norm": 0.058885459141671155, + "language_loss": 0.84080005, + "learning_rate": 0.0004548877959176756, + "loss": 0.85182083, + "num_input_tokens_seen": 235540912, + "router_z_loss_mlp": 0.3190918, + "step": 2822, + "time_per_iteration": 2.861867666244507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096714, + "balance_loss_mlp": 1.06595802, + "epoch": 0.5430934974990381, + "flos": 540924065280.0, + "grad_norm": 0.06589540393120931, + "language_loss": 0.86233151, + "learning_rate": 0.00045457753302236166, + "loss": 0.87329865, + "num_input_tokens_seen": 235608736, + "router_z_loss_mlp": 0.30737305, + "step": 2823, + "time_per_iteration": 2.687164068222046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097063, + "balance_loss_mlp": 1.06685555, + "epoch": 0.5432858791843016, + "flos": 658468486656.0, + "grad_norm": 0.07425338305054356, + "language_loss": 0.87034917, + "learning_rate": 0.00045426728776161353, + "loss": 0.88131976, + "num_input_tokens_seen": 235678720, + "router_z_loss_mlp": 0.30175781, + "step": 2824, + "time_per_iteration": 2.7938835620880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104052, + "balance_loss_mlp": 1.07224679, + "epoch": 0.5434782608695652, + "flos": 531935216640.0, + "grad_norm": 0.05711338707468448, + "language_loss": 0.81608665, + "learning_rate": 0.00045395706025587863, + "loss": 0.82712722, + "num_input_tokens_seen": 235748704, + "router_z_loss_mlp": 0.31787109, + "step": 2825, + "time_per_iteration": 2.6212074756622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099159, + "balance_loss_mlp": 1.06907105, + "epoch": 0.5436706425548288, + "flos": 608501030400.0, + "grad_norm": 0.07865669635555295, + "language_loss": 0.8299852, + "learning_rate": 0.00045364685062559843, + "loss": 0.84097683, + "num_input_tokens_seen": 235828224, + "router_z_loss_mlp": 0.30078125, + "step": 2826, + "time_per_iteration": 2.8868184089660645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104022, + "balance_loss_mlp": 1.07505381, + "epoch": 0.5438630242400924, + "flos": 705418854912.0, + "grad_norm": 0.06023434626032839, + "language_loss": 0.91765273, + "learning_rate": 0.0004533366589912067, + "loss": 0.92869294, + "num_input_tokens_seen": 235909392, + "router_z_loss_mlp": 0.28955078, + "step": 2827, + "time_per_iteration": 2.9981062412261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105445, + "balance_loss_mlp": 1.07557106, + "epoch": 0.544055405925356, + "flos": 856425788928.0, + "grad_norm": 0.06990968055660145, + "language_loss": 0.78070033, + "learning_rate": 0.0004530264854731306, + "loss": 0.79175478, + "num_input_tokens_seen": 235983888, + "router_z_loss_mlp": 0.29858398, + "step": 2828, + "time_per_iteration": 3.0054330825805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107215, + "balance_loss_mlp": 1.07605386, + "epoch": 0.5442477876106194, + "flos": 571779523584.0, + "grad_norm": 0.05020371190449787, + "language_loss": 0.84383601, + "learning_rate": 0.00045271633019179034, + "loss": 0.85490811, + "num_input_tokens_seen": 236063056, + "router_z_loss_mlp": 0.3112793, + "step": 2829, + "time_per_iteration": 2.775956630706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107068, + "balance_loss_mlp": 1.07605028, + "epoch": 0.544440169295883, + "flos": 625556971008.0, + "grad_norm": 0.05805566098722391, + "language_loss": 0.88203323, + "learning_rate": 0.0004524061932675986, + "loss": 0.8931039, + "num_input_tokens_seen": 236141104, + "router_z_loss_mlp": 0.30981445, + "step": 2830, + "time_per_iteration": 2.8221793174743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106595, + "balance_loss_mlp": 1.07555294, + "epoch": 0.5446325509811466, + "flos": 836244103680.0, + "grad_norm": 0.0740029895366448, + "language_loss": 0.87459874, + "learning_rate": 0.00045209607482096125, + "loss": 0.8856647, + "num_input_tokens_seen": 236220320, + "router_z_loss_mlp": 0.31005859, + "step": 2831, + "time_per_iteration": 3.0393142700195312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102187, + "balance_loss_mlp": 1.0710969, + "epoch": 0.5448249326664102, + "flos": 483381043200.0, + "grad_norm": 0.08209208283258153, + "language_loss": 0.84651136, + "learning_rate": 0.0004517859749722772, + "loss": 0.85753322, + "num_input_tokens_seen": 236288208, + "router_z_loss_mlp": 0.31054688, + "step": 2832, + "time_per_iteration": 2.6821095943450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105232, + "balance_loss_mlp": 1.07368898, + "epoch": 0.5450173143516738, + "flos": 561107948544.0, + "grad_norm": 0.07359331276456935, + "language_loss": 0.79572821, + "learning_rate": 0.0004514758938419376, + "loss": 0.80678058, + "num_input_tokens_seen": 236366864, + "router_z_loss_mlp": 0.31518555, + "step": 2833, + "time_per_iteration": 2.8375093936920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080375, + "balance_loss_mlp": 1.07288861, + "epoch": 0.5452096960369373, + "flos": 1470420988416.0, + "grad_norm": 0.03314547284214794, + "language_loss": 0.76920587, + "learning_rate": 0.0004511658315503268, + "loss": 0.78000963, + "num_input_tokens_seen": 236597120, + "router_z_loss_mlp": 0.07470703, + "step": 2834, + "time_per_iteration": 4.963228225708008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099746, + "balance_loss_mlp": 1.06930006, + "epoch": 0.5454020777222008, + "flos": 465064892928.0, + "grad_norm": 0.057187543491433894, + "language_loss": 0.83827722, + "learning_rate": 0.00045085578821782175, + "loss": 0.84927469, + "num_input_tokens_seen": 236664192, + "router_z_loss_mlp": 0.3046875, + "step": 2835, + "time_per_iteration": 2.5562217235565186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054355, + "balance_loss_mlp": 1.04696393, + "epoch": 0.5455944594074644, + "flos": 1469657820672.0, + "grad_norm": 0.02358753311446476, + "language_loss": 0.76134741, + "learning_rate": 0.0004505457639647917, + "loss": 0.77189088, + "num_input_tokens_seen": 236888784, + "router_z_loss_mlp": 0.07373047, + "step": 2836, + "time_per_iteration": 4.959676742553711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100094, + "balance_loss_mlp": 1.06983829, + "epoch": 0.545786841092728, + "flos": 533180371968.0, + "grad_norm": 0.0408398110042356, + "language_loss": 0.80949795, + "learning_rate": 0.00045023575891159866, + "loss": 0.82049894, + "num_input_tokens_seen": 236962528, + "router_z_loss_mlp": 0.30200195, + "step": 2837, + "time_per_iteration": 2.74700665473938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102638, + "balance_loss_mlp": 1.01894093, + "epoch": 0.5459792227779915, + "flos": 1352389810176.0, + "grad_norm": 0.01524116386105569, + "language_loss": 0.74763811, + "learning_rate": 0.00044992577317859764, + "loss": 0.75790191, + "num_input_tokens_seen": 237179360, + "router_z_loss_mlp": 0.07421875, + "step": 2838, + "time_per_iteration": 4.9733850955963135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103445, + "balance_loss_mlp": 1.07366681, + "epoch": 0.5461716044632551, + "flos": 637881537024.0, + "grad_norm": 0.05292635351535042, + "language_loss": 0.78244042, + "learning_rate": 0.0004496158068861354, + "loss": 0.79347491, + "num_input_tokens_seen": 237256240, + "router_z_loss_mlp": 0.29760742, + "step": 2839, + "time_per_iteration": 2.8023805618286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110962, + "balance_loss_mlp": 1.08010423, + "epoch": 0.5463639861485187, + "flos": 602751352320.0, + "grad_norm": 0.0535580092110964, + "language_loss": 0.80844593, + "learning_rate": 0.00044930586015455207, + "loss": 0.81954211, + "num_input_tokens_seen": 237334272, + "router_z_loss_mlp": 0.29492188, + "step": 2840, + "time_per_iteration": 2.816567897796631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118684, + "balance_loss_mlp": 1.08804703, + "epoch": 0.5465563678337823, + "flos": 642516738048.0, + "grad_norm": 0.06541969342762931, + "language_loss": 0.89212978, + "learning_rate": 0.000448995933104179, + "loss": 0.90331668, + "num_input_tokens_seen": 237415408, + "router_z_loss_mlp": 0.3059082, + "step": 2841, + "time_per_iteration": 2.903371810913086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115887, + "balance_loss_mlp": 1.08601356, + "epoch": 0.5467487495190458, + "flos": 614154161664.0, + "grad_norm": 0.06848140377985366, + "language_loss": 0.80388117, + "learning_rate": 0.00044868602585534077, + "loss": 0.81504011, + "num_input_tokens_seen": 237493232, + "router_z_loss_mlp": 0.29833984, + "step": 2842, + "time_per_iteration": 2.870833396911621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104882, + "balance_loss_mlp": 1.07519853, + "epoch": 0.5469411312043093, + "flos": 461190661632.0, + "grad_norm": 0.06871095275450309, + "language_loss": 0.89058006, + "learning_rate": 0.0004483761385283541, + "loss": 0.90162885, + "num_input_tokens_seen": 237556624, + "router_z_loss_mlp": 0.29663086, + "step": 2843, + "time_per_iteration": 2.5367324352264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099916, + "balance_loss_mlp": 1.06863523, + "epoch": 0.5471335128895729, + "flos": 561197154816.0, + "grad_norm": 0.05633892340966096, + "language_loss": 0.81610817, + "learning_rate": 0.0004480662712435281, + "loss": 0.82710731, + "num_input_tokens_seen": 237632048, + "router_z_loss_mlp": 0.3125, + "step": 2844, + "time_per_iteration": 2.8301496505737305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092142, + "balance_loss_mlp": 1.0627687, + "epoch": 0.5473258945748365, + "flos": 518686695936.0, + "grad_norm": 0.05986468354955699, + "language_loss": 0.88694894, + "learning_rate": 0.0004477564241211635, + "loss": 0.89787042, + "num_input_tokens_seen": 237699840, + "router_z_loss_mlp": 0.2935791, + "step": 2845, + "time_per_iteration": 2.5813820362091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086916, + "balance_loss_mlp": 1.05787718, + "epoch": 0.5475182762601001, + "flos": 433828763136.0, + "grad_norm": 0.059098326299960216, + "language_loss": 0.87329561, + "learning_rate": 0.0004474465972815541, + "loss": 0.88416475, + "num_input_tokens_seen": 237762560, + "router_z_loss_mlp": 0.2902832, + "step": 2846, + "time_per_iteration": 2.494866132736206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085322, + "balance_loss_mlp": 1.05730796, + "epoch": 0.5477106579453636, + "flos": 511560811008.0, + "grad_norm": 0.05595262091427783, + "language_loss": 0.87812984, + "learning_rate": 0.000447136790844985, + "loss": 0.88898313, + "num_input_tokens_seen": 237837152, + "router_z_loss_mlp": 0.28027344, + "step": 2847, + "time_per_iteration": 2.698451042175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086514, + "balance_loss_mlp": 1.05726016, + "epoch": 0.5479030396306271, + "flos": 675912439296.0, + "grad_norm": 0.06538513229207209, + "language_loss": 0.81294727, + "learning_rate": 0.00044682700493173385, + "loss": 0.82381248, + "num_input_tokens_seen": 237909488, + "router_z_loss_mlp": 0.29223633, + "step": 2848, + "time_per_iteration": 2.8252742290496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085079, + "balance_loss_mlp": 1.05441868, + "epoch": 0.5480954213158907, + "flos": 876090981888.0, + "grad_norm": 0.06259253721450928, + "language_loss": 0.80796725, + "learning_rate": 0.00044651723966207004, + "loss": 0.81881809, + "num_input_tokens_seen": 237991056, + "router_z_loss_mlp": 0.30639648, + "step": 2849, + "time_per_iteration": 3.093806505203247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083876, + "balance_loss_mlp": 1.05424023, + "epoch": 0.5482878030011543, + "flos": 622006511616.0, + "grad_norm": 0.05680096345280931, + "language_loss": 0.78538483, + "learning_rate": 0.00044620749515625536, + "loss": 0.79622364, + "num_input_tokens_seen": 238064576, + "router_z_loss_mlp": 0.29614258, + "step": 2850, + "time_per_iteration": 2.759477376937866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083552, + "balance_loss_mlp": 1.0532248, + "epoch": 0.5484801846864179, + "flos": 497207725056.0, + "grad_norm": 0.054672764420471885, + "language_loss": 0.85281622, + "learning_rate": 0.00044589777153454334, + "loss": 0.86365175, + "num_input_tokens_seen": 238136464, + "router_z_loss_mlp": 0.30297852, + "step": 2851, + "time_per_iteration": 2.7247886657714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082527, + "balance_loss_mlp": 1.0519855, + "epoch": 0.5486725663716814, + "flos": 442432171008.0, + "grad_norm": 0.05927586181396917, + "language_loss": 0.83792317, + "learning_rate": 0.00044558806891717895, + "loss": 0.84874845, + "num_input_tokens_seen": 238198912, + "router_z_loss_mlp": 0.30493164, + "step": 2852, + "time_per_iteration": 2.480499267578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078563, + "balance_loss_mlp": 1.04847419, + "epoch": 0.548864948056945, + "flos": 655162504704.0, + "grad_norm": 0.06995220773511122, + "language_loss": 0.79820019, + "learning_rate": 0.0004452783874243998, + "loss": 0.80898583, + "num_input_tokens_seen": 238275184, + "router_z_loss_mlp": 0.30053711, + "step": 2853, + "time_per_iteration": 2.815159559249878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083773, + "balance_loss_mlp": 1.05354142, + "epoch": 0.5490573297422086, + "flos": 546036111360.0, + "grad_norm": 0.0871194319773747, + "language_loss": 0.8473509, + "learning_rate": 0.00044496872717643475, + "loss": 0.85818863, + "num_input_tokens_seen": 238348496, + "router_z_loss_mlp": 0.30200195, + "step": 2854, + "time_per_iteration": 2.671760320663452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01029437, + "balance_loss_mlp": 1.02099681, + "epoch": 0.5492497114274721, + "flos": 1590309987840.0, + "grad_norm": 0.022692984636718958, + "language_loss": 0.77089292, + "learning_rate": 0.00044465908829350453, + "loss": 0.7811873, + "num_input_tokens_seen": 238578464, + "router_z_loss_mlp": 0.08447266, + "step": 2855, + "time_per_iteration": 4.943760633468628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080585, + "balance_loss_mlp": 1.05152166, + "epoch": 0.5494420931127356, + "flos": 750906754560.0, + "grad_norm": 0.08481580298187671, + "language_loss": 0.82385266, + "learning_rate": 0.0004443494708958217, + "loss": 0.83465844, + "num_input_tokens_seen": 238660256, + "router_z_loss_mlp": 0.2902832, + "step": 2856, + "time_per_iteration": 2.9592692852020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081194, + "balance_loss_mlp": 1.05131996, + "epoch": 0.5496344747979992, + "flos": 626023904256.0, + "grad_norm": 0.054737825906261944, + "language_loss": 0.81019336, + "learning_rate": 0.0004440398751035906, + "loss": 0.82100528, + "num_input_tokens_seen": 238745856, + "router_z_loss_mlp": 0.29858398, + "step": 2857, + "time_per_iteration": 2.8660449981689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086545, + "balance_loss_mlp": 1.05612314, + "epoch": 0.5498268564832628, + "flos": 523111924224.0, + "grad_norm": 0.07506614425197558, + "language_loss": 0.84203708, + "learning_rate": 0.00044373030103700645, + "loss": 0.85290253, + "num_input_tokens_seen": 238813888, + "router_z_loss_mlp": 0.30395508, + "step": 2858, + "time_per_iteration": 2.589571475982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086143, + "balance_loss_mlp": 1.05769968, + "epoch": 0.5500192381685264, + "flos": 604587151872.0, + "grad_norm": 0.06400511299844665, + "language_loss": 0.80211353, + "learning_rate": 0.000443420748816257, + "loss": 0.81297493, + "num_input_tokens_seen": 238885440, + "router_z_loss_mlp": 0.28442383, + "step": 2859, + "time_per_iteration": 2.775573492050171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089751, + "balance_loss_mlp": 1.05894732, + "epoch": 0.55021161985379, + "flos": 520527264768.0, + "grad_norm": 0.05990515883462961, + "language_loss": 0.78525764, + "learning_rate": 0.0004431112185615208, + "loss": 0.7961551, + "num_input_tokens_seen": 238960944, + "router_z_loss_mlp": 0.30786133, + "step": 2860, + "time_per_iteration": 2.79428768157959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099065, + "balance_loss_mlp": 1.06942964, + "epoch": 0.5504040015390534, + "flos": 489671806464.0, + "grad_norm": 0.08012396807897051, + "language_loss": 0.80142951, + "learning_rate": 0.00044280171039296845, + "loss": 0.81242013, + "num_input_tokens_seen": 239030592, + "router_z_loss_mlp": 0.29589844, + "step": 2861, + "time_per_iteration": 2.6075713634490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097414, + "balance_loss_mlp": 1.06808829, + "epoch": 0.550596383224317, + "flos": 575787377664.0, + "grad_norm": 0.055527438655266555, + "language_loss": 0.88317382, + "learning_rate": 0.0004424922244307616, + "loss": 0.89414799, + "num_input_tokens_seen": 239097440, + "router_z_loss_mlp": 0.29321289, + "step": 2862, + "time_per_iteration": 2.6453704833984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093253, + "balance_loss_mlp": 1.06306958, + "epoch": 0.5507887649095806, + "flos": 642445157376.0, + "grad_norm": 0.0988044596240084, + "language_loss": 0.82273299, + "learning_rate": 0.00044218276079505315, + "loss": 0.83366549, + "num_input_tokens_seen": 239179872, + "router_z_loss_mlp": 0.30151367, + "step": 2863, + "time_per_iteration": 2.8583548069000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093494, + "balance_loss_mlp": 1.0636915, + "epoch": 0.5509811465948442, + "flos": 531843812352.0, + "grad_norm": 0.15366013773450377, + "language_loss": 0.74783754, + "learning_rate": 0.0004418733196059876, + "loss": 0.75877243, + "num_input_tokens_seen": 239251264, + "router_z_loss_mlp": 0.29760742, + "step": 2864, + "time_per_iteration": 2.6593546867370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092739, + "balance_loss_mlp": 1.06398571, + "epoch": 0.5511735282801077, + "flos": 654747328512.0, + "grad_norm": 0.05392307081741782, + "language_loss": 0.80104017, + "learning_rate": 0.0004415639009837008, + "loss": 0.81196761, + "num_input_tokens_seen": 239326688, + "router_z_loss_mlp": 0.28759766, + "step": 2865, + "time_per_iteration": 2.8184585571289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096337, + "balance_loss_mlp": 1.06660628, + "epoch": 0.5513659099653713, + "flos": 529498861056.0, + "grad_norm": 0.0621710106813525, + "language_loss": 0.8235333, + "learning_rate": 0.00044125450504831955, + "loss": 0.83449662, + "num_input_tokens_seen": 239401248, + "router_z_loss_mlp": 0.29711914, + "step": 2866, + "time_per_iteration": 2.734349489212036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086542, + "balance_loss_mlp": 1.05592918, + "epoch": 0.5515582916506349, + "flos": 554869315584.0, + "grad_norm": 0.06271512147953057, + "language_loss": 0.82752901, + "learning_rate": 0.0004409451319199622, + "loss": 0.83839446, + "num_input_tokens_seen": 239471600, + "router_z_loss_mlp": 0.30566406, + "step": 2867, + "time_per_iteration": 2.683742046356201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095264, + "balance_loss_mlp": 1.06417394, + "epoch": 0.5517506733358984, + "flos": 735407258112.0, + "grad_norm": 0.07258101504897169, + "language_loss": 0.84457368, + "learning_rate": 0.0004406357817187381, + "loss": 0.85552633, + "num_input_tokens_seen": 239548592, + "router_z_loss_mlp": 0.31054688, + "step": 2868, + "time_per_iteration": 3.0147883892059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103312, + "balance_loss_mlp": 1.07379591, + "epoch": 0.551943055021162, + "flos": 1115325697536.0, + "grad_norm": 0.05164398294731223, + "language_loss": 0.81673765, + "learning_rate": 0.0004403264545647474, + "loss": 0.82777071, + "num_input_tokens_seen": 239644432, + "router_z_loss_mlp": 0.29492188, + "step": 2869, + "time_per_iteration": 3.5095975399017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107006, + "balance_loss_mlp": 1.07603574, + "epoch": 0.5521354367064255, + "flos": 544373208576.0, + "grad_norm": 0.04919714399659635, + "language_loss": 0.85006267, + "learning_rate": 0.00044001715057808154, + "loss": 0.86113274, + "num_input_tokens_seen": 239723392, + "router_z_loss_mlp": 0.30932617, + "step": 2870, + "time_per_iteration": 2.759791851043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114514, + "balance_loss_mlp": 1.08330536, + "epoch": 0.5523278183916891, + "flos": 936285101568.0, + "grad_norm": 0.06727866309699267, + "language_loss": 0.81942332, + "learning_rate": 0.0004397078698788232, + "loss": 0.83056843, + "num_input_tokens_seen": 239806896, + "router_z_loss_mlp": 0.31176758, + "step": 2871, + "time_per_iteration": 3.21431040763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104908, + "balance_loss_mlp": 1.09441757, + "epoch": 0.5525202000769527, + "flos": 1465911696384.0, + "grad_norm": 0.04310408533027141, + "language_loss": 0.80442369, + "learning_rate": 0.0004393986125870456, + "loss": 0.81547272, + "num_input_tokens_seen": 240037824, + "router_z_loss_mlp": 0.10498047, + "step": 2872, + "time_per_iteration": 4.941087484359741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114234, + "balance_loss_mlp": 1.082739, + "epoch": 0.5527125817622163, + "flos": 489800286720.0, + "grad_norm": 0.05898932962157328, + "language_loss": 0.78340954, + "learning_rate": 0.00043908937882281343, + "loss": 0.79455185, + "num_input_tokens_seen": 240107952, + "router_z_loss_mlp": 0.31469727, + "step": 2873, + "time_per_iteration": 2.577866554260254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116939, + "balance_loss_mlp": 1.08501506, + "epoch": 0.5529049634474797, + "flos": 634914008064.0, + "grad_norm": 0.05969066171006231, + "language_loss": 0.82846034, + "learning_rate": 0.0004387801687061814, + "loss": 0.83962971, + "num_input_tokens_seen": 240183824, + "router_z_loss_mlp": 0.3190918, + "step": 2874, + "time_per_iteration": 2.8184196949005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117603, + "balance_loss_mlp": 1.08489251, + "epoch": 0.5530973451327433, + "flos": 581274952704.0, + "grad_norm": 0.05481480847886404, + "language_loss": 0.80685902, + "learning_rate": 0.0004384709823571958, + "loss": 0.81803501, + "num_input_tokens_seen": 240259296, + "router_z_loss_mlp": 0.32714844, + "step": 2875, + "time_per_iteration": 2.7496426105499268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105595, + "balance_loss_mlp": 1.07519674, + "epoch": 0.5532897268180069, + "flos": 1122488658432.0, + "grad_norm": 0.0703745986604158, + "language_loss": 0.83230788, + "learning_rate": 0.0004381618198958932, + "loss": 0.84336388, + "num_input_tokens_seen": 240346768, + "router_z_loss_mlp": 0.30371094, + "step": 2876, + "time_per_iteration": 3.4905495643615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110669, + "balance_loss_mlp": 1.07662511, + "epoch": 0.5534821085032705, + "flos": 637273640448.0, + "grad_norm": 0.06448913307816859, + "language_loss": 0.84021735, + "learning_rate": 0.00043785268144230137, + "loss": 0.85128427, + "num_input_tokens_seen": 240429344, + "router_z_loss_mlp": 0.30029297, + "step": 2877, + "time_per_iteration": 2.907133102416992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102032, + "balance_loss_mlp": 1.07203865, + "epoch": 0.5536744901885341, + "flos": 571112529408.0, + "grad_norm": 0.0731230974557418, + "language_loss": 0.82496381, + "learning_rate": 0.00043754356711643837, + "loss": 0.83598411, + "num_input_tokens_seen": 240497008, + "router_z_loss_mlp": 0.29980469, + "step": 2878, + "time_per_iteration": 2.715023994445801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097061, + "balance_loss_mlp": 1.06609011, + "epoch": 0.5538668718737976, + "flos": 595716871680.0, + "grad_norm": 0.0760081782140183, + "language_loss": 0.83909559, + "learning_rate": 0.0004372344770383132, + "loss": 0.85006618, + "num_input_tokens_seen": 240578432, + "router_z_loss_mlp": 0.30932617, + "step": 2879, + "time_per_iteration": 2.822368621826172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097203, + "balance_loss_mlp": 1.06756735, + "epoch": 0.5540592535590612, + "flos": 532602210816.0, + "grad_norm": 0.06372253861737541, + "language_loss": 0.83305293, + "learning_rate": 0.00043692541132792507, + "loss": 0.84402496, + "num_input_tokens_seen": 240649136, + "router_z_loss_mlp": 0.29614258, + "step": 2880, + "time_per_iteration": 2.7154414653778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093507, + "balance_loss_mlp": 1.06349051, + "epoch": 0.5542516352443247, + "flos": 412619235840.0, + "grad_norm": 0.057885594640944824, + "language_loss": 0.83464789, + "learning_rate": 0.00043661637010526384, + "loss": 0.84558296, + "num_input_tokens_seen": 240714240, + "router_z_loss_mlp": 0.30004883, + "step": 2881, + "time_per_iteration": 2.507059097290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092859, + "balance_loss_mlp": 1.06255555, + "epoch": 0.5544440169295883, + "flos": 547607609856.0, + "grad_norm": 0.08329174894233551, + "language_loss": 0.83249325, + "learning_rate": 0.00043630735349031025, + "loss": 0.84342188, + "num_input_tokens_seen": 240786928, + "router_z_loss_mlp": 0.30273438, + "step": 2882, + "time_per_iteration": 2.644418478012085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090659, + "balance_loss_mlp": 1.06216836, + "epoch": 0.5546363986148518, + "flos": 621821131776.0, + "grad_norm": 0.047753182436236, + "language_loss": 0.81861913, + "learning_rate": 0.00043599836160303495, + "loss": 0.82952571, + "num_input_tokens_seen": 240865328, + "router_z_loss_mlp": 0.28491211, + "step": 2883, + "time_per_iteration": 2.8971407413482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090292, + "balance_loss_mlp": 1.06160986, + "epoch": 0.5548287803001154, + "flos": 705292945920.0, + "grad_norm": 0.057456379562134556, + "language_loss": 0.77759755, + "learning_rate": 0.0004356893945633995, + "loss": 0.78850043, + "num_input_tokens_seen": 240945680, + "router_z_loss_mlp": 0.28649902, + "step": 2884, + "time_per_iteration": 2.937133312225342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094576, + "balance_loss_mlp": 1.06620383, + "epoch": 0.555021161985379, + "flos": 504197789184.0, + "grad_norm": 0.05754228747661135, + "language_loss": 0.81617516, + "learning_rate": 0.0004353804524913551, + "loss": 0.82712096, + "num_input_tokens_seen": 241010800, + "router_z_loss_mlp": 0.28344727, + "step": 2885, + "time_per_iteration": 2.579535722732544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109656, + "balance_loss_mlp": 1.08028293, + "epoch": 0.5552135436706426, + "flos": 616066684416.0, + "grad_norm": 0.06485446309889223, + "language_loss": 0.81926423, + "learning_rate": 0.0004350715355068441, + "loss": 0.83036083, + "num_input_tokens_seen": 241085328, + "router_z_loss_mlp": 0.29345703, + "step": 2886, + "time_per_iteration": 2.709717273712158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111013, + "balance_loss_mlp": 1.08142567, + "epoch": 0.5554059253559062, + "flos": 463871494656.0, + "grad_norm": 0.066893347852213, + "language_loss": 0.7961694, + "learning_rate": 0.00043476264372979847, + "loss": 0.80727959, + "num_input_tokens_seen": 241149600, + "router_z_loss_mlp": 0.2956543, + "step": 2887, + "time_per_iteration": 2.5216078758239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113, + "balance_loss_mlp": 1.08441329, + "epoch": 0.5555983070411696, + "flos": 1562512384512.0, + "grad_norm": 0.0640996529430707, + "language_loss": 0.78604692, + "learning_rate": 0.0004344537772801408, + "loss": 0.7971769, + "num_input_tokens_seen": 241244832, + "router_z_loss_mlp": 0.28540039, + "step": 2888, + "time_per_iteration": 3.8132436275482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065874, + "balance_loss_mlp": 1.05838752, + "epoch": 0.5557906887264332, + "flos": 1467917821440.0, + "grad_norm": 0.028482200170008867, + "language_loss": 0.73422456, + "learning_rate": 0.0004341449362777836, + "loss": 0.7448833, + "num_input_tokens_seen": 241479728, + "router_z_loss_mlp": 0.07470703, + "step": 2889, + "time_per_iteration": 4.947216987609863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117814, + "balance_loss_mlp": 1.08801198, + "epoch": 0.5559830704116968, + "flos": 529832544768.0, + "grad_norm": 0.06792095354006551, + "language_loss": 0.83771884, + "learning_rate": 0.0004338361208426298, + "loss": 0.84889698, + "num_input_tokens_seen": 241545616, + "router_z_loss_mlp": 0.29760742, + "step": 2890, + "time_per_iteration": 2.631476879119873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113406, + "balance_loss_mlp": 1.08350825, + "epoch": 0.5561754520969604, + "flos": 651218890752.0, + "grad_norm": 0.05967481099781226, + "language_loss": 0.81602627, + "learning_rate": 0.00043352733109457164, + "loss": 0.82716036, + "num_input_tokens_seen": 241629040, + "router_z_loss_mlp": 0.29858398, + "step": 2891, + "time_per_iteration": 2.907500743865967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111722, + "balance_loss_mlp": 1.08194315, + "epoch": 0.556367833782224, + "flos": 734297923584.0, + "grad_norm": 0.04670195587242621, + "language_loss": 0.84789026, + "learning_rate": 0.00043321856715349244, + "loss": 0.85900748, + "num_input_tokens_seen": 241706272, + "router_z_loss_mlp": 0.29760742, + "step": 2892, + "time_per_iteration": 2.9401984214782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110696, + "balance_loss_mlp": 1.0810132, + "epoch": 0.5565602154674875, + "flos": 672423648768.0, + "grad_norm": 0.05439165995621742, + "language_loss": 0.80422115, + "learning_rate": 0.00043290982913926466, + "loss": 0.81532812, + "num_input_tokens_seen": 241782304, + "router_z_loss_mlp": 0.29614258, + "step": 2893, + "time_per_iteration": 2.7956430912017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113402, + "balance_loss_mlp": 1.08312285, + "epoch": 0.556752597152751, + "flos": 586228783104.0, + "grad_norm": 0.09922355360532673, + "language_loss": 0.8448714, + "learning_rate": 0.0004326011171717514, + "loss": 0.85600543, + "num_input_tokens_seen": 241868576, + "router_z_loss_mlp": 0.30297852, + "step": 2894, + "time_per_iteration": 2.8997769355773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108454, + "balance_loss_mlp": 1.07676816, + "epoch": 0.5569449788380146, + "flos": 437777146368.0, + "grad_norm": 0.06224988402754836, + "language_loss": 0.81240308, + "learning_rate": 0.0004322924313708051, + "loss": 0.82348764, + "num_input_tokens_seen": 241933696, + "router_z_loss_mlp": 0.31689453, + "step": 2895, + "time_per_iteration": 2.511643648147583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107604, + "balance_loss_mlp": 1.07825518, + "epoch": 0.5571373605232782, + "flos": 502250761728.0, + "grad_norm": 0.0621054083596477, + "language_loss": 0.84500259, + "learning_rate": 0.0004319837718562681, + "loss": 0.85607862, + "num_input_tokens_seen": 242003056, + "router_z_loss_mlp": 0.29321289, + "step": 2896, + "time_per_iteration": 2.580003023147583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106887, + "balance_loss_mlp": 1.07667959, + "epoch": 0.5573297422085417, + "flos": 577417973760.0, + "grad_norm": 0.05844968671659234, + "language_loss": 0.83570629, + "learning_rate": 0.0004316751387479726, + "loss": 0.84677517, + "num_input_tokens_seen": 242076368, + "router_z_loss_mlp": 0.30175781, + "step": 2897, + "time_per_iteration": 2.7676987648010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122549, + "balance_loss_mlp": 1.0925082, + "epoch": 0.5575221238938053, + "flos": 1344037515264.0, + "grad_norm": 0.06543352873326957, + "language_loss": 0.82800293, + "learning_rate": 0.0004313665321657409, + "loss": 0.83922845, + "num_input_tokens_seen": 242161600, + "router_z_loss_mlp": 0.30004883, + "step": 2898, + "time_per_iteration": 3.7584402561187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120576, + "balance_loss_mlp": 1.08917618, + "epoch": 0.5577145055790689, + "flos": 601963218432.0, + "grad_norm": 0.06787906742385669, + "language_loss": 0.80272007, + "learning_rate": 0.00043105795222938436, + "loss": 0.81392586, + "num_input_tokens_seen": 242237904, + "router_z_loss_mlp": 0.31396484, + "step": 2899, + "time_per_iteration": 2.718045711517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109661, + "balance_loss_mlp": 1.07795143, + "epoch": 0.5579068872643325, + "flos": 562620349440.0, + "grad_norm": 0.06698960298708169, + "language_loss": 0.78827435, + "learning_rate": 0.00043074939905870467, + "loss": 0.79937094, + "num_input_tokens_seen": 242306736, + "router_z_loss_mlp": 0.31713867, + "step": 2900, + "time_per_iteration": 2.639775514602661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111441, + "balance_loss_mlp": 1.08230579, + "epoch": 0.558099268949596, + "flos": 544551247872.0, + "grad_norm": 0.09759490745534659, + "language_loss": 0.80356312, + "learning_rate": 0.0004304408727734927, + "loss": 0.81467754, + "num_input_tokens_seen": 242376000, + "router_z_loss_mlp": 0.29125977, + "step": 2901, + "time_per_iteration": 2.6272940635681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107959, + "balance_loss_mlp": 1.07889545, + "epoch": 0.5582916506348595, + "flos": 552786467328.0, + "grad_norm": 0.06875313821095587, + "language_loss": 0.89200485, + "learning_rate": 0.0004301323734935288, + "loss": 0.9030844, + "num_input_tokens_seen": 242447056, + "router_z_loss_mlp": 0.29052734, + "step": 2902, + "time_per_iteration": 2.652219533920288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100657, + "balance_loss_mlp": 1.07164121, + "epoch": 0.5584840323201231, + "flos": 543385013760.0, + "grad_norm": 0.05706751216847301, + "language_loss": 0.87298477, + "learning_rate": 0.000429823901338583, + "loss": 0.8839913, + "num_input_tokens_seen": 242514400, + "router_z_loss_mlp": 0.2902832, + "step": 2903, + "time_per_iteration": 2.611798048019409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099623, + "balance_loss_mlp": 1.06872356, + "epoch": 0.5586764140053867, + "flos": 815573090304.0, + "grad_norm": 0.053536411753063035, + "language_loss": 0.87032712, + "learning_rate": 0.00042951545642841513, + "loss": 0.88132328, + "num_input_tokens_seen": 242601616, + "router_z_loss_mlp": 0.30883789, + "step": 2904, + "time_per_iteration": 3.067237377166748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099073, + "balance_loss_mlp": 1.06979561, + "epoch": 0.5588687956906503, + "flos": 486439976448.0, + "grad_norm": 0.04987560026618122, + "language_loss": 0.86746645, + "learning_rate": 0.0004292070388827737, + "loss": 0.87845719, + "num_input_tokens_seen": 242669648, + "router_z_loss_mlp": 0.29272461, + "step": 2905, + "time_per_iteration": 2.5981948375701904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093183, + "balance_loss_mlp": 1.06426287, + "epoch": 0.5590611773759138, + "flos": 452060849664.0, + "grad_norm": 0.06265536693897518, + "language_loss": 0.81292248, + "learning_rate": 0.00042889864882139753, + "loss": 0.82385433, + "num_input_tokens_seen": 242737456, + "router_z_loss_mlp": 0.2890625, + "step": 2906, + "time_per_iteration": 2.581113338470459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107989, + "balance_loss_mlp": 1.07811511, + "epoch": 0.5592535590611774, + "flos": 520945012224.0, + "grad_norm": 0.06493240221059006, + "language_loss": 0.81897962, + "learning_rate": 0.0004285902863640139, + "loss": 0.83005953, + "num_input_tokens_seen": 242807008, + "router_z_loss_mlp": 0.29858398, + "step": 2907, + "time_per_iteration": 2.6115305423736572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109856, + "balance_loss_mlp": 1.06973481, + "epoch": 0.5594459407464409, + "flos": 552519595008.0, + "grad_norm": 0.07849480564056018, + "language_loss": 0.8626982, + "learning_rate": 0.00042828195163033966, + "loss": 0.87368375, + "num_input_tokens_seen": 242877328, + "router_z_loss_mlp": 0.28833008, + "step": 2908, + "time_per_iteration": 2.6564390659332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110723, + "balance_loss_mlp": 1.07654572, + "epoch": 0.5596383224317045, + "flos": 484833973248.0, + "grad_norm": 0.07707498056388652, + "language_loss": 0.79454792, + "learning_rate": 0.0004279736447400812, + "loss": 0.80562025, + "num_input_tokens_seen": 242943152, + "router_z_loss_mlp": 0.30664062, + "step": 2909, + "time_per_iteration": 2.580448627471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102668, + "balance_loss_mlp": 1.07343817, + "epoch": 0.5598307041169681, + "flos": 611256015360.0, + "grad_norm": 0.055339920225342294, + "language_loss": 0.78979003, + "learning_rate": 0.00042766536581293385, + "loss": 0.80081677, + "num_input_tokens_seen": 243014656, + "router_z_loss_mlp": 0.29223633, + "step": 2910, + "time_per_iteration": 2.714306116104126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112106, + "balance_loss_mlp": 1.09004188, + "epoch": 0.5600230858022316, + "flos": 488851365888.0, + "grad_norm": 0.06660982321180627, + "language_loss": 0.79863673, + "learning_rate": 0.0004273571149685819, + "loss": 0.80984735, + "num_input_tokens_seen": 243089040, + "router_z_loss_mlp": 0.30981445, + "step": 2911, + "time_per_iteration": 2.738189220428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117749, + "balance_loss_mlp": 1.08794653, + "epoch": 0.5602154674874952, + "flos": 598869780480.0, + "grad_norm": 0.07453286241806684, + "language_loss": 0.83875954, + "learning_rate": 0.00042704889232669937, + "loss": 0.84993702, + "num_input_tokens_seen": 243162480, + "router_z_loss_mlp": 0.29785156, + "step": 2912, + "time_per_iteration": 2.7153878211975098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119265, + "balance_loss_mlp": 1.09003508, + "epoch": 0.5604078491727588, + "flos": 585969624576.0, + "grad_norm": 0.06505374842280261, + "language_loss": 0.85808718, + "learning_rate": 0.0004267406980069484, + "loss": 0.8692798, + "num_input_tokens_seen": 243232880, + "router_z_loss_mlp": 0.29248047, + "step": 2913, + "time_per_iteration": 2.7438042163848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105259, + "balance_loss_mlp": 1.07490873, + "epoch": 0.5606002308580224, + "flos": 541205618688.0, + "grad_norm": 0.045730944132966495, + "language_loss": 0.79707301, + "learning_rate": 0.0004264325321289808, + "loss": 0.80812562, + "num_input_tokens_seen": 243309168, + "router_z_loss_mlp": 0.30322266, + "step": 2914, + "time_per_iteration": 2.787429094314575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101375, + "balance_loss_mlp": 1.07131052, + "epoch": 0.5607926125432858, + "flos": 583938533376.0, + "grad_norm": 0.05941371213730478, + "language_loss": 0.8624413, + "learning_rate": 0.00042612439481243736, + "loss": 0.87345505, + "num_input_tokens_seen": 243382064, + "router_z_loss_mlp": 0.30078125, + "step": 2915, + "time_per_iteration": 2.7993295192718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090837, + "balance_loss_mlp": 1.06113064, + "epoch": 0.5609849942285494, + "flos": 627489317376.0, + "grad_norm": 0.06435914288601326, + "language_loss": 0.90124059, + "learning_rate": 0.00042581628617694735, + "loss": 0.91214895, + "num_input_tokens_seen": 243452064, + "router_z_loss_mlp": 0.296875, + "step": 2916, + "time_per_iteration": 2.744046449661255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089958, + "balance_loss_mlp": 1.06032228, + "epoch": 0.561177375913813, + "flos": 588366332928.0, + "grad_norm": 0.05771140503361017, + "language_loss": 0.81953394, + "learning_rate": 0.0004255082063421296, + "loss": 0.83043355, + "num_input_tokens_seen": 243525600, + "router_z_loss_mlp": 0.29638672, + "step": 2917, + "time_per_iteration": 2.705963134765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095529, + "balance_loss_mlp": 1.0655117, + "epoch": 0.5613697575990766, + "flos": 527047824384.0, + "grad_norm": 0.0764674514791775, + "language_loss": 0.84947777, + "learning_rate": 0.00042520015542759065, + "loss": 0.86043298, + "num_input_tokens_seen": 243605536, + "router_z_loss_mlp": 0.29980469, + "step": 2918, + "time_per_iteration": 2.9078075885772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085954, + "balance_loss_mlp": 1.05662882, + "epoch": 0.5615621392843402, + "flos": 642655130112.0, + "grad_norm": 0.049198929687541054, + "language_loss": 0.88353539, + "learning_rate": 0.00042489213355292687, + "loss": 0.89439487, + "num_input_tokens_seen": 243684208, + "router_z_loss_mlp": 0.29296875, + "step": 2919, + "time_per_iteration": 2.862194776535034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093265, + "balance_loss_mlp": 1.06300998, + "epoch": 0.5617545209696037, + "flos": 427750543872.0, + "grad_norm": 0.0619251317266344, + "language_loss": 0.81301886, + "learning_rate": 0.00042458414083772276, + "loss": 0.82395148, + "num_input_tokens_seen": 243749376, + "router_z_loss_mlp": 0.30224609, + "step": 2920, + "time_per_iteration": 2.5329933166503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095136, + "balance_loss_mlp": 1.0651195, + "epoch": 0.5619469026548672, + "flos": 568429125120.0, + "grad_norm": 0.05517349890350355, + "language_loss": 0.8525691, + "learning_rate": 0.000424276177401552, + "loss": 0.86352038, + "num_input_tokens_seen": 243828096, + "router_z_loss_mlp": 0.29956055, + "step": 2921, + "time_per_iteration": 2.787318468093872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092355, + "balance_loss_mlp": 1.06200445, + "epoch": 0.5621392843401308, + "flos": 505205807616.0, + "grad_norm": 0.06500569481536145, + "language_loss": 0.85831988, + "learning_rate": 0.0004239682433639763, + "loss": 0.86924338, + "num_input_tokens_seen": 243896752, + "router_z_loss_mlp": 0.3034668, + "step": 2922, + "time_per_iteration": 2.697091817855835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093283, + "balance_loss_mlp": 1.06386256, + "epoch": 0.5623316660253944, + "flos": 516996628992.0, + "grad_norm": 0.08309086608315261, + "language_loss": 0.85596514, + "learning_rate": 0.0004236603388445467, + "loss": 0.86689794, + "num_input_tokens_seen": 243964592, + "router_z_loss_mlp": 0.29394531, + "step": 2923, + "time_per_iteration": 2.5720105171203613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097102, + "balance_loss_mlp": 1.0683012, + "epoch": 0.5625240477106579, + "flos": 606012917760.0, + "grad_norm": 0.07246274776201297, + "language_loss": 0.82229364, + "learning_rate": 0.00042335246396280166, + "loss": 0.83326471, + "num_input_tokens_seen": 244036656, + "router_z_loss_mlp": 0.28808594, + "step": 2924, + "time_per_iteration": 2.7669975757598877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093178, + "balance_loss_mlp": 1.06320906, + "epoch": 0.5627164293959215, + "flos": 450430253568.0, + "grad_norm": 0.06414999121973448, + "language_loss": 0.90646857, + "learning_rate": 0.0004230446188382693, + "loss": 0.91740036, + "num_input_tokens_seen": 244102704, + "router_z_loss_mlp": 0.29956055, + "step": 2925, + "time_per_iteration": 2.5662741661071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088889, + "balance_loss_mlp": 1.0595876, + "epoch": 0.5629088110811851, + "flos": 742073550336.0, + "grad_norm": 0.05389869275215176, + "language_loss": 0.80918074, + "learning_rate": 0.0004227368035904654, + "loss": 0.82006967, + "num_input_tokens_seen": 244186640, + "router_z_loss_mlp": 0.29296875, + "step": 2926, + "time_per_iteration": 2.964599370956421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092675, + "balance_loss_mlp": 1.06249142, + "epoch": 0.5631011927664487, + "flos": 496970588160.0, + "grad_norm": 0.06261422618617216, + "language_loss": 0.82895541, + "learning_rate": 0.00042242901833889474, + "loss": 0.83988214, + "num_input_tokens_seen": 244257680, + "router_z_loss_mlp": 0.30151367, + "step": 2927, + "time_per_iteration": 2.6312665939331055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093424, + "balance_loss_mlp": 1.06376481, + "epoch": 0.5632935744517122, + "flos": 886137408000.0, + "grad_norm": 0.06041695665754469, + "language_loss": 0.86030155, + "learning_rate": 0.0004221212632030501, + "loss": 0.87123579, + "num_input_tokens_seen": 244331248, + "router_z_loss_mlp": 0.29614258, + "step": 2928, + "time_per_iteration": 3.0977063179016113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094061, + "balance_loss_mlp": 1.06351972, + "epoch": 0.5634859561369757, + "flos": 604792355328.0, + "grad_norm": 0.06366283736150324, + "language_loss": 0.80857551, + "learning_rate": 0.0004218135383024124, + "loss": 0.81951618, + "num_input_tokens_seen": 244403920, + "router_z_loss_mlp": 0.30541992, + "step": 2929, + "time_per_iteration": 2.749244213104248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088519, + "balance_loss_mlp": 1.0590266, + "epoch": 0.5636783378222393, + "flos": 453916472832.0, + "grad_norm": 0.12143433952472552, + "language_loss": 0.85715157, + "learning_rate": 0.0004215058437564511, + "loss": 0.86803675, + "num_input_tokens_seen": 244470464, + "router_z_loss_mlp": 0.29467773, + "step": 2930, + "time_per_iteration": 2.593238115310669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084288, + "balance_loss_mlp": 1.05512953, + "epoch": 0.5638707195075029, + "flos": 518456899584.0, + "grad_norm": 0.056033125460513485, + "language_loss": 0.82132083, + "learning_rate": 0.00042119817968462397, + "loss": 0.83216375, + "num_input_tokens_seen": 244536864, + "router_z_loss_mlp": 0.29125977, + "step": 2931, + "time_per_iteration": 2.591958522796631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092676, + "balance_loss_mlp": 1.06275427, + "epoch": 0.5640631011927665, + "flos": 564873896448.0, + "grad_norm": 0.07812059497351068, + "language_loss": 0.87152535, + "learning_rate": 0.0004208905462063766, + "loss": 0.88245207, + "num_input_tokens_seen": 244603344, + "router_z_loss_mlp": 0.29907227, + "step": 2932, + "time_per_iteration": 2.6288535594940186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086522, + "balance_loss_mlp": 1.0571723, + "epoch": 0.56425548287803, + "flos": 517033704960.0, + "grad_norm": 0.06389283518633071, + "language_loss": 0.84869772, + "learning_rate": 0.00042058294344114315, + "loss": 0.85956293, + "num_input_tokens_seen": 244671984, + "router_z_loss_mlp": 0.29345703, + "step": 2933, + "time_per_iteration": 2.6064674854278564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086595, + "balance_loss_mlp": 1.05672109, + "epoch": 0.5644478645632935, + "flos": 854258876928.0, + "grad_norm": 0.05718901807458546, + "language_loss": 0.77749109, + "learning_rate": 0.0004202753715083456, + "loss": 0.78835702, + "num_input_tokens_seen": 244754000, + "router_z_loss_mlp": 0.29858398, + "step": 2934, + "time_per_iteration": 3.075186014175415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093891, + "balance_loss_mlp": 1.0630157, + "epoch": 0.5646402462485571, + "flos": 553438780416.0, + "grad_norm": 0.07168087831316133, + "language_loss": 0.81911719, + "learning_rate": 0.0004199678305273936, + "loss": 0.83005607, + "num_input_tokens_seen": 244820896, + "router_z_loss_mlp": 0.30883789, + "step": 2935, + "time_per_iteration": 2.6289923191070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091967, + "balance_loss_mlp": 1.06316626, + "epoch": 0.5648326279338207, + "flos": 685990798848.0, + "grad_norm": 0.0664481148229904, + "language_loss": 0.81315005, + "learning_rate": 0.0004196603206176854, + "loss": 0.82406974, + "num_input_tokens_seen": 244904464, + "router_z_loss_mlp": 0.28808594, + "step": 2936, + "time_per_iteration": 2.941150426864624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093274, + "balance_loss_mlp": 1.06404424, + "epoch": 0.5650250096190843, + "flos": 803327818752.0, + "grad_norm": 0.07427925135014142, + "language_loss": 0.83779049, + "learning_rate": 0.000419352841898607, + "loss": 0.84872323, + "num_input_tokens_seen": 244983760, + "router_z_loss_mlp": 0.29199219, + "step": 2937, + "time_per_iteration": 2.977189302444458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092016, + "balance_loss_mlp": 1.06273842, + "epoch": 0.5652173913043478, + "flos": 582058317312.0, + "grad_norm": 0.061049572757767595, + "language_loss": 0.77780819, + "learning_rate": 0.000419045394489532, + "loss": 0.78872836, + "num_input_tokens_seen": 245053184, + "router_z_loss_mlp": 0.29296875, + "step": 2938, + "time_per_iteration": 2.6722819805145264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086194, + "balance_loss_mlp": 1.05622458, + "epoch": 0.5654097729896114, + "flos": 820648060416.0, + "grad_norm": 0.05727154642915785, + "language_loss": 0.77326584, + "learning_rate": 0.0004187379785098224, + "loss": 0.78412783, + "num_input_tokens_seen": 245137408, + "router_z_loss_mlp": 0.29931641, + "step": 2939, + "time_per_iteration": 3.100283622741699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086461, + "balance_loss_mlp": 1.05665886, + "epoch": 0.565602154674875, + "flos": 784156723200.0, + "grad_norm": 0.06949350877551969, + "language_loss": 0.83849806, + "learning_rate": 0.00041843059407882744, + "loss": 0.84936267, + "num_input_tokens_seen": 245215504, + "router_z_loss_mlp": 0.29785156, + "step": 2940, + "time_per_iteration": 2.9837162494659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082258, + "balance_loss_mlp": 1.05257499, + "epoch": 0.5657945363601385, + "flos": 549683117568.0, + "grad_norm": 0.068553917777786, + "language_loss": 0.82768112, + "learning_rate": 0.0004181232413158842, + "loss": 0.83850372, + "num_input_tokens_seen": 245286032, + "router_z_loss_mlp": 0.29638672, + "step": 2941, + "time_per_iteration": 2.636819839477539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083664, + "balance_loss_mlp": 1.05371857, + "epoch": 0.5659869180454021, + "flos": 668126900736.0, + "grad_norm": 0.06960240931548377, + "language_loss": 0.82932127, + "learning_rate": 0.0004178159203403179, + "loss": 0.84015793, + "num_input_tokens_seen": 245359040, + "router_z_loss_mlp": 0.29931641, + "step": 2942, + "time_per_iteration": 2.822134494781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083227, + "balance_loss_mlp": 1.0547837, + "epoch": 0.5661792997306656, + "flos": 499955369472.0, + "grad_norm": 0.05318601865014104, + "language_loss": 0.81807715, + "learning_rate": 0.0004175086312714409, + "loss": 0.8289094, + "num_input_tokens_seen": 245426384, + "router_z_loss_mlp": 0.28442383, + "step": 2943, + "time_per_iteration": 2.571985960006714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086509, + "balance_loss_mlp": 1.05625343, + "epoch": 0.5663716814159292, + "flos": 601209589248.0, + "grad_norm": 0.05713331418457596, + "language_loss": 0.84120524, + "learning_rate": 0.00041720137422855366, + "loss": 0.85207033, + "num_input_tokens_seen": 245501216, + "router_z_loss_mlp": 0.30224609, + "step": 2944, + "time_per_iteration": 2.7213711738586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086525, + "balance_loss_mlp": 1.05758142, + "epoch": 0.5665640631011928, + "flos": 540988305408.0, + "grad_norm": 0.1661240742061477, + "language_loss": 0.79230917, + "learning_rate": 0.00041689414933094383, + "loss": 0.80317438, + "num_input_tokens_seen": 245571600, + "router_z_loss_mlp": 0.28930664, + "step": 2945, + "time_per_iteration": 2.628525733947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088063, + "balance_loss_mlp": 1.05954862, + "epoch": 0.5667564447864564, + "flos": 601936054272.0, + "grad_norm": 0.06338169436240754, + "language_loss": 0.81427538, + "learning_rate": 0.00041658695669788653, + "loss": 0.82515597, + "num_input_tokens_seen": 245645632, + "router_z_loss_mlp": 0.28515625, + "step": 2946, + "time_per_iteration": 2.736955404281616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084859, + "balance_loss_mlp": 1.0541029, + "epoch": 0.5669488264717198, + "flos": 659523492864.0, + "grad_norm": 0.0612697940531113, + "language_loss": 0.81293368, + "learning_rate": 0.00041627979644864453, + "loss": 0.82378221, + "num_input_tokens_seen": 245715776, + "router_z_loss_mlp": 0.30712891, + "step": 2947, + "time_per_iteration": 2.780796766281128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083023, + "balance_loss_mlp": 1.05436563, + "epoch": 0.5671412081569834, + "flos": 485402222592.0, + "grad_norm": 0.06710047446863547, + "language_loss": 0.81410027, + "learning_rate": 0.0004159726687024683, + "loss": 0.82493049, + "num_input_tokens_seen": 245785328, + "router_z_loss_mlp": 0.28662109, + "step": 2948, + "time_per_iteration": 2.6072115898132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108621, + "balance_loss_mlp": 1.05757558, + "epoch": 0.567333589842247, + "flos": 729801114624.0, + "grad_norm": 0.06141378811636639, + "language_loss": 0.79485345, + "learning_rate": 0.00041566557357859506, + "loss": 0.80571556, + "num_input_tokens_seen": 245858000, + "router_z_loss_mlp": 0.28613281, + "step": 2949, + "time_per_iteration": 2.911865234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085023, + "balance_loss_mlp": 1.05443358, + "epoch": 0.5675259715275106, + "flos": 968887526400.0, + "grad_norm": 0.052257384193144164, + "language_loss": 0.79611081, + "learning_rate": 0.0004153585111962502, + "loss": 0.806961, + "num_input_tokens_seen": 245950640, + "router_z_loss_mlp": 0.30566406, + "step": 2950, + "time_per_iteration": 3.2808187007904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086724, + "balance_loss_mlp": 1.05606341, + "epoch": 0.5677183532127742, + "flos": 565145538048.0, + "grad_norm": 0.06672147261233864, + "language_loss": 0.84739614, + "learning_rate": 0.0004150514816746453, + "loss": 0.85826337, + "num_input_tokens_seen": 246019568, + "router_z_loss_mlp": 0.30639648, + "step": 2951, + "time_per_iteration": 2.680326461791992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089698, + "balance_loss_mlp": 1.0602051, + "epoch": 0.5679107348980377, + "flos": 551694385152.0, + "grad_norm": 0.06944116544696582, + "language_loss": 0.85944223, + "learning_rate": 0.0004147444851329802, + "loss": 0.87033927, + "num_input_tokens_seen": 246089520, + "router_z_loss_mlp": 0.29443359, + "step": 2952, + "time_per_iteration": 2.6477670669555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086586, + "balance_loss_mlp": 1.05680704, + "epoch": 0.5681031165833013, + "flos": 819459804672.0, + "grad_norm": 0.054427499920313586, + "language_loss": 0.86026949, + "learning_rate": 0.00041443752169044126, + "loss": 0.87113535, + "num_input_tokens_seen": 246165920, + "router_z_loss_mlp": 0.29736328, + "step": 2953, + "time_per_iteration": 2.997781276702881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092153, + "balance_loss_mlp": 1.061993, + "epoch": 0.5682954982685648, + "flos": 618013711872.0, + "grad_norm": 0.055407826164880256, + "language_loss": 0.84948021, + "learning_rate": 0.0004141305914662025, + "loss": 0.86040175, + "num_input_tokens_seen": 246238672, + "router_z_loss_mlp": 0.30126953, + "step": 2954, + "time_per_iteration": 2.704019069671631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087398, + "balance_loss_mlp": 1.05688024, + "epoch": 0.5684878799538284, + "flos": 647949984768.0, + "grad_norm": 0.0673052072270573, + "language_loss": 0.80911326, + "learning_rate": 0.0004138236945794246, + "loss": 0.81998718, + "num_input_tokens_seen": 246320208, + "router_z_loss_mlp": 0.30493164, + "step": 2955, + "time_per_iteration": 2.88403058052063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108918, + "balance_loss_mlp": 1.05911565, + "epoch": 0.5686802616390919, + "flos": 805961664000.0, + "grad_norm": 0.06799730214965168, + "language_loss": 0.8379457, + "learning_rate": 0.00041351683114925576, + "loss": 0.84883749, + "num_input_tokens_seen": 246406464, + "router_z_loss_mlp": 0.30053711, + "step": 2956, + "time_per_iteration": 3.0439462661743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087169, + "balance_loss_mlp": 1.0562458, + "epoch": 0.5688726433243555, + "flos": 547140676608.0, + "grad_norm": 0.06948923214023794, + "language_loss": 0.86469889, + "learning_rate": 0.0004132100012948308, + "loss": 0.87557054, + "num_input_tokens_seen": 246477456, + "router_z_loss_mlp": 0.30883789, + "step": 2957, + "time_per_iteration": 2.6431198120117188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090494, + "balance_loss_mlp": 1.05959463, + "epoch": 0.5690650250096191, + "flos": 486568456704.0, + "grad_norm": 0.0655655158566539, + "language_loss": 0.84699452, + "learning_rate": 0.00041290320513527145, + "loss": 0.85789943, + "num_input_tokens_seen": 246541744, + "router_z_loss_mlp": 0.30883789, + "step": 2958, + "time_per_iteration": 2.5978519916534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085083, + "balance_loss_mlp": 1.05528057, + "epoch": 0.5692574066948827, + "flos": 577457620992.0, + "grad_norm": 0.05333030562061355, + "language_loss": 0.8519215, + "learning_rate": 0.0004125964427896867, + "loss": 0.86277229, + "num_input_tokens_seen": 246611440, + "router_z_loss_mlp": 0.29760742, + "step": 2959, + "time_per_iteration": 2.671543836593628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084341, + "balance_loss_mlp": 1.05468178, + "epoch": 0.5694497883801463, + "flos": 454247585280.0, + "grad_norm": 0.06459683266000829, + "language_loss": 0.79222417, + "learning_rate": 0.0004122897143771723, + "loss": 0.80306756, + "num_input_tokens_seen": 246676496, + "router_z_loss_mlp": 0.29663086, + "step": 2960, + "time_per_iteration": 2.5457372665405273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087464, + "balance_loss_mlp": 1.05713725, + "epoch": 0.5696421700654097, + "flos": 559516999680.0, + "grad_norm": 0.057309213891239566, + "language_loss": 0.81961918, + "learning_rate": 0.0004119830200168109, + "loss": 0.83049381, + "num_input_tokens_seen": 246746464, + "router_z_loss_mlp": 0.30297852, + "step": 2961, + "time_per_iteration": 2.66658091545105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080369, + "balance_loss_mlp": 1.05180621, + "epoch": 0.5698345517506733, + "flos": 465551649792.0, + "grad_norm": 0.0578679247611712, + "language_loss": 0.88614476, + "learning_rate": 0.0004116763598276714, + "loss": 0.89694846, + "num_input_tokens_seen": 246811808, + "router_z_loss_mlp": 0.28564453, + "step": 2962, + "time_per_iteration": 2.5355417728424072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083269, + "balance_loss_mlp": 1.05394387, + "epoch": 0.5700269334359369, + "flos": 605953446912.0, + "grad_norm": 0.05524318032555551, + "language_loss": 0.81030452, + "learning_rate": 0.00041136973392881017, + "loss": 0.82113719, + "num_input_tokens_seen": 246890432, + "router_z_loss_mlp": 0.29345703, + "step": 2963, + "time_per_iteration": 2.8497612476348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085542, + "balance_loss_mlp": 1.05540633, + "epoch": 0.5702193151212005, + "flos": 562709182464.0, + "grad_norm": 0.06477225886122127, + "language_loss": 0.82179135, + "learning_rate": 0.00041106314243926983, + "loss": 0.83264679, + "num_input_tokens_seen": 246959616, + "router_z_loss_mlp": 0.30102539, + "step": 2964, + "time_per_iteration": 2.735269069671631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080389, + "balance_loss_mlp": 1.05103993, + "epoch": 0.570411696806464, + "flos": 523247745024.0, + "grad_norm": 0.05516182837620622, + "language_loss": 0.87329233, + "learning_rate": 0.0004107565854780798, + "loss": 0.88409621, + "num_input_tokens_seen": 247030656, + "router_z_loss_mlp": 0.29296875, + "step": 2965, + "time_per_iteration": 2.6157355308532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085364, + "balance_loss_mlp": 1.05596685, + "epoch": 0.5706040784917276, + "flos": 718222837248.0, + "grad_norm": 0.07414316825555053, + "language_loss": 0.81466991, + "learning_rate": 0.000410450063164256, + "loss": 0.82552361, + "num_input_tokens_seen": 247105872, + "router_z_loss_mlp": 0.29370117, + "step": 2966, + "time_per_iteration": 2.8378820419311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083362, + "balance_loss_mlp": 1.05291581, + "epoch": 0.5707964601769911, + "flos": 476707410432.0, + "grad_norm": 0.06746080357230834, + "language_loss": 0.82004952, + "learning_rate": 0.00041014357561680115, + "loss": 0.83088315, + "num_input_tokens_seen": 247170448, + "router_z_loss_mlp": 0.30395508, + "step": 2967, + "time_per_iteration": 2.51119065284729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085478, + "balance_loss_mlp": 1.05519855, + "epoch": 0.5709888418622547, + "flos": 580101378048.0, + "grad_norm": 0.053142332405834165, + "language_loss": 0.86128843, + "learning_rate": 0.0004098371229547039, + "loss": 0.87214315, + "num_input_tokens_seen": 247240400, + "router_z_loss_mlp": 0.30249023, + "step": 2968, + "time_per_iteration": 2.6994621753692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022253, + "balance_loss_mlp": 1.01390862, + "epoch": 0.5711812235475183, + "flos": 1579922910720.0, + "grad_norm": 0.025900339106917806, + "language_loss": 0.80010808, + "learning_rate": 0.0004095307052969399, + "loss": 0.81033063, + "num_input_tokens_seen": 247469136, + "router_z_loss_mlp": 0.08349609, + "step": 2969, + "time_per_iteration": 4.718291997909546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092241, + "balance_loss_mlp": 1.06179523, + "epoch": 0.5713736052327818, + "flos": 468506695680.0, + "grad_norm": 0.05366083523781242, + "language_loss": 0.80585647, + "learning_rate": 0.00040922432276247107, + "loss": 0.8167789, + "num_input_tokens_seen": 247537712, + "router_z_loss_mlp": 0.30419922, + "step": 2970, + "time_per_iteration": 2.55259108543396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091948, + "balance_loss_mlp": 1.0609777, + "epoch": 0.5715659869180454, + "flos": 537662499840.0, + "grad_norm": 0.049420251796361614, + "language_loss": 0.84874177, + "learning_rate": 0.0004089179754702457, + "loss": 0.85966122, + "num_input_tokens_seen": 247613872, + "router_z_loss_mlp": 0.30932617, + "step": 2971, + "time_per_iteration": 2.771068572998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109211, + "balance_loss_mlp": 1.06090152, + "epoch": 0.571758368603309, + "flos": 656071778304.0, + "grad_norm": 0.06283275659801735, + "language_loss": 0.7981565, + "learning_rate": 0.00040861166353919843, + "loss": 0.80907762, + "num_input_tokens_seen": 247686064, + "router_z_loss_mlp": 0.31176758, + "step": 2972, + "time_per_iteration": 2.7827725410461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091117, + "balance_loss_mlp": 1.06069493, + "epoch": 0.5719507502885726, + "flos": 667907016192.0, + "grad_norm": 0.06507135137823726, + "language_loss": 0.818784, + "learning_rate": 0.00040830538708824983, + "loss": 0.82969517, + "num_input_tokens_seen": 247760384, + "router_z_loss_mlp": 0.30395508, + "step": 2973, + "time_per_iteration": 2.845456600189209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108959, + "balance_loss_mlp": 1.05966854, + "epoch": 0.572143131973836, + "flos": 476321969664.0, + "grad_norm": 0.07493195148818688, + "language_loss": 0.81968939, + "learning_rate": 0.000407999146236307, + "loss": 0.8305853, + "num_input_tokens_seen": 247824768, + "router_z_loss_mlp": 0.29882812, + "step": 2974, + "time_per_iteration": 2.531430244445801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093173, + "balance_loss_mlp": 1.06284618, + "epoch": 0.5723355136590996, + "flos": 539510782464.0, + "grad_norm": 0.06121365308687838, + "language_loss": 0.8362776, + "learning_rate": 0.0004076929411022634, + "loss": 0.84720927, + "num_input_tokens_seen": 247894448, + "router_z_loss_mlp": 0.30322266, + "step": 2975, + "time_per_iteration": 2.645341634750366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096437, + "balance_loss_mlp": 1.06591964, + "epoch": 0.5725278953443632, + "flos": 824156674560.0, + "grad_norm": 0.05509159729755976, + "language_loss": 0.79606473, + "learning_rate": 0.0004073867718049982, + "loss": 0.80702913, + "num_input_tokens_seen": 247976432, + "router_z_loss_mlp": 0.30493164, + "step": 2976, + "time_per_iteration": 3.085145950317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102446, + "balance_loss_mlp": 1.07137978, + "epoch": 0.5727202770296268, + "flos": 587437235712.0, + "grad_norm": 0.06232705756749319, + "language_loss": 0.82691067, + "learning_rate": 0.00040708063846337704, + "loss": 0.83793509, + "num_input_tokens_seen": 248048800, + "router_z_loss_mlp": 0.31054688, + "step": 2977, + "time_per_iteration": 2.738443613052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099181, + "balance_loss_mlp": 1.06813931, + "epoch": 0.5729126587148904, + "flos": 446966055936.0, + "grad_norm": 0.061703964741206326, + "language_loss": 0.81214464, + "learning_rate": 0.00040677454119625143, + "loss": 0.82313639, + "num_input_tokens_seen": 248116496, + "router_z_loss_mlp": 0.31005859, + "step": 2978, + "time_per_iteration": 2.6232175827026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108887, + "balance_loss_mlp": 1.07758296, + "epoch": 0.5731050404001539, + "flos": 519457577472.0, + "grad_norm": 0.07073355508195153, + "language_loss": 0.83247018, + "learning_rate": 0.0004064684801224587, + "loss": 0.84355903, + "num_input_tokens_seen": 248184960, + "router_z_loss_mlp": 0.31274414, + "step": 2979, + "time_per_iteration": 2.577918767929077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101163, + "balance_loss_mlp": 1.07085991, + "epoch": 0.5732974220854175, + "flos": 504775950336.0, + "grad_norm": 0.05699497583041508, + "language_loss": 0.80492741, + "learning_rate": 0.00040616245536082224, + "loss": 0.81593907, + "num_input_tokens_seen": 248252208, + "router_z_loss_mlp": 0.30273438, + "step": 2980, + "time_per_iteration": 2.6298904418945312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101523, + "balance_loss_mlp": 1.07167256, + "epoch": 0.573489803770681, + "flos": 592485041664.0, + "grad_norm": 0.04979780559516064, + "language_loss": 0.81357765, + "learning_rate": 0.00040585646703015165, + "loss": 0.82459289, + "num_input_tokens_seen": 248333312, + "router_z_loss_mlp": 0.29833984, + "step": 2981, + "time_per_iteration": 2.8170647621154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102826, + "balance_loss_mlp": 1.07118809, + "epoch": 0.5736821854559446, + "flos": 489911514624.0, + "grad_norm": 0.07486213422343042, + "language_loss": 0.78689104, + "learning_rate": 0.0004055505152492419, + "loss": 0.79791927, + "num_input_tokens_seen": 248403808, + "router_z_loss_mlp": 0.31616211, + "step": 2982, + "time_per_iteration": 2.6379241943359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100032, + "balance_loss_mlp": 1.06825066, + "epoch": 0.5738745671412081, + "flos": 458156321280.0, + "grad_norm": 0.05681665302183781, + "language_loss": 0.74231875, + "learning_rate": 0.00040524460013687425, + "loss": 0.75331908, + "num_input_tokens_seen": 248477184, + "router_z_loss_mlp": 0.31762695, + "step": 2983, + "time_per_iteration": 2.7545318603515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097699, + "balance_loss_mlp": 1.0663712, + "epoch": 0.5740669488264717, + "flos": 580333372416.0, + "grad_norm": 0.04476617807489617, + "language_loss": 0.81250238, + "learning_rate": 0.0004049387218118155, + "loss": 0.82347941, + "num_input_tokens_seen": 248565552, + "router_z_loss_mlp": 0.31298828, + "step": 2984, + "time_per_iteration": 2.9756665229797363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108902, + "balance_loss_mlp": 1.05816841, + "epoch": 0.5742593305117353, + "flos": 524438572032.0, + "grad_norm": 0.07928255171477795, + "language_loss": 0.85245347, + "learning_rate": 0.00040463288039281777, + "loss": 0.8633436, + "num_input_tokens_seen": 248635456, + "router_z_loss_mlp": 0.30810547, + "step": 2985, + "time_per_iteration": 2.706669807434082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034493, + "balance_loss_mlp": 1.02681565, + "epoch": 0.5744517121969989, + "flos": 1553877748224.0, + "grad_norm": 0.02538869827055974, + "language_loss": 0.77876419, + "learning_rate": 0.0004043270759986194, + "loss": 0.78910911, + "num_input_tokens_seen": 248870160, + "router_z_loss_mlp": 0.07666016, + "step": 2986, + "time_per_iteration": 4.949368953704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108476, + "balance_loss_mlp": 1.05462396, + "epoch": 0.5746440938822625, + "flos": 751919915520.0, + "grad_norm": 0.060127228305881374, + "language_loss": 0.82366645, + "learning_rate": 0.0004040213087479444, + "loss": 0.83451408, + "num_input_tokens_seen": 248946960, + "router_z_loss_mlp": 0.30102539, + "step": 2987, + "time_per_iteration": 2.9205455780029297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086178, + "balance_loss_mlp": 1.05723405, + "epoch": 0.5748364755675259, + "flos": 501865320960.0, + "grad_norm": 0.05965622667733625, + "language_loss": 0.85328299, + "learning_rate": 0.0004037155787595018, + "loss": 0.86414474, + "num_input_tokens_seen": 249014128, + "router_z_loss_mlp": 0.2890625, + "step": 2988, + "time_per_iteration": 2.574509859085083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088474, + "balance_loss_mlp": 1.0593158, + "epoch": 0.5750288572527895, + "flos": 504044342784.0, + "grad_norm": 0.05784717048255493, + "language_loss": 0.80869853, + "learning_rate": 0.000403409886151987, + "loss": 0.8195833, + "num_input_tokens_seen": 249090016, + "router_z_loss_mlp": 0.29125977, + "step": 2989, + "time_per_iteration": 2.945080041885376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016432, + "balance_loss_mlp": 1.00894582, + "epoch": 0.5752212389380531, + "flos": 1541365604352.0, + "grad_norm": 0.009946927491071988, + "language_loss": 0.81999105, + "learning_rate": 0.0004031042310440799, + "loss": 0.83015537, + "num_input_tokens_seen": 249305552, + "router_z_loss_mlp": 0.07470703, + "step": 2990, + "time_per_iteration": 4.807205677032471 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015586, + "balance_loss_mlp": 1.00809932, + "epoch": 0.5754136206233167, + "flos": 1567331472384.0, + "grad_norm": 0.009078458393910433, + "language_loss": 0.781986, + "learning_rate": 0.00040279861355444656, + "loss": 0.79214191, + "num_input_tokens_seen": 249523408, + "router_z_loss_mlp": 0.07470703, + "step": 2991, + "time_per_iteration": 4.805190563201904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084133, + "balance_loss_mlp": 1.05380619, + "epoch": 0.5756060023085803, + "flos": 798156301824.0, + "grad_norm": 0.05637441563568418, + "language_loss": 0.76644433, + "learning_rate": 0.00040249303380173807, + "loss": 0.77728564, + "num_input_tokens_seen": 249616624, + "router_z_loss_mlp": 0.30322266, + "step": 2992, + "time_per_iteration": 3.049729108810425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090631, + "balance_loss_mlp": 1.05780125, + "epoch": 0.5757983839938438, + "flos": 587877004800.0, + "grad_norm": 0.06616333205601678, + "language_loss": 0.79290402, + "learning_rate": 0.00040218749190459126, + "loss": 0.80381036, + "num_input_tokens_seen": 249689936, + "router_z_loss_mlp": 0.32836914, + "step": 2993, + "time_per_iteration": 2.7314000129699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087925, + "balance_loss_mlp": 1.05795622, + "epoch": 0.5759907656791073, + "flos": 516831072768.0, + "grad_norm": 0.06422497492556134, + "language_loss": 0.82827115, + "learning_rate": 0.00040188198798162775, + "loss": 0.83915043, + "num_input_tokens_seen": 249759984, + "router_z_loss_mlp": 0.29956055, + "step": 2994, + "time_per_iteration": 2.605794668197632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089955, + "balance_loss_mlp": 1.06022453, + "epoch": 0.5761831473643709, + "flos": 587133287424.0, + "grad_norm": 0.05264744908201922, + "language_loss": 0.85358101, + "learning_rate": 0.000401576522151455, + "loss": 0.8644805, + "num_input_tokens_seen": 249837888, + "router_z_loss_mlp": 0.29711914, + "step": 2995, + "time_per_iteration": 2.8504650592803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085925, + "balance_loss_mlp": 1.05664682, + "epoch": 0.5763755290496345, + "flos": 543896363520.0, + "grad_norm": 0.05051873290535222, + "language_loss": 0.83133811, + "learning_rate": 0.0004012710945326651, + "loss": 0.8421973, + "num_input_tokens_seen": 249913584, + "router_z_loss_mlp": 0.29248047, + "step": 2996, + "time_per_iteration": 2.7823193073272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094348, + "balance_loss_mlp": 1.06545174, + "epoch": 0.576567910734898, + "flos": 626229107712.0, + "grad_norm": 0.0711371625716349, + "language_loss": 0.81093514, + "learning_rate": 0.0004009657052438355, + "loss": 0.82187867, + "num_input_tokens_seen": 249992144, + "router_z_loss_mlp": 0.28881836, + "step": 2997, + "time_per_iteration": 2.7743020057678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091289, + "balance_loss_mlp": 1.06184435, + "epoch": 0.5767602924201616, + "flos": 538243232256.0, + "grad_norm": 0.06367440987852575, + "language_loss": 0.85650682, + "learning_rate": 0.00040066035440352904, + "loss": 0.86741972, + "num_input_tokens_seen": 250060736, + "router_z_loss_mlp": 0.29418945, + "step": 2998, + "time_per_iteration": 2.6359331607818604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014946, + "balance_loss_mlp": 1.0071255, + "epoch": 0.5769526741054252, + "flos": 1559778301440.0, + "grad_norm": 0.01828635150904939, + "language_loss": 0.79293132, + "learning_rate": 0.0004003550421302934, + "loss": 0.8030808, + "num_input_tokens_seen": 250296864, + "router_z_loss_mlp": 0.078125, + "step": 2999, + "time_per_iteration": 4.881432056427002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104047, + "balance_loss_mlp": 1.07417345, + "epoch": 0.5771450557906888, + "flos": 468185495040.0, + "grad_norm": 0.0709915390631299, + "language_loss": 0.76451176, + "learning_rate": 0.00040004976854266145, + "loss": 0.77555221, + "num_input_tokens_seen": 250362528, + "router_z_loss_mlp": 0.2980957, + "step": 3000, + "time_per_iteration": 2.5374131202697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101005, + "balance_loss_mlp": 1.07017779, + "epoch": 0.5773374374759523, + "flos": 574556903424.0, + "grad_norm": 0.051209677129469174, + "language_loss": 0.81337965, + "learning_rate": 0.0003997445337591505, + "loss": 0.8243897, + "num_input_tokens_seen": 250432768, + "router_z_loss_mlp": 0.30810547, + "step": 3001, + "time_per_iteration": 2.647610902786255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102438, + "balance_loss_mlp": 1.07351804, + "epoch": 0.5775298191612158, + "flos": 528473590272.0, + "grad_norm": 0.0611265357111255, + "language_loss": 0.74261576, + "learning_rate": 0.0003994393378982635, + "loss": 0.75364017, + "num_input_tokens_seen": 250501504, + "router_z_loss_mlp": 0.28979492, + "step": 3002, + "time_per_iteration": 2.602245330810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013935, + "balance_loss_mlp": 1.00611448, + "epoch": 0.5777222008464794, + "flos": 1303919700480.0, + "grad_norm": 0.01032263408282017, + "language_loss": 0.79538, + "learning_rate": 0.00039913418107848786, + "loss": 0.80551934, + "num_input_tokens_seen": 250733632, + "router_z_loss_mlp": 0.078125, + "step": 3003, + "time_per_iteration": 4.818480968475342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104089, + "balance_loss_mlp": 1.07304692, + "epoch": 0.577914582531743, + "flos": 603633461760.0, + "grad_norm": 0.0604287320481862, + "language_loss": 0.88041145, + "learning_rate": 0.0003988290634182961, + "loss": 0.89145231, + "num_input_tokens_seen": 250809152, + "router_z_loss_mlp": 0.31005859, + "step": 3004, + "time_per_iteration": 2.7484169006347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102413, + "balance_loss_mlp": 1.07284904, + "epoch": 0.5781069642170066, + "flos": 486795681792.0, + "grad_norm": 0.06655998832299866, + "language_loss": 0.80592918, + "learning_rate": 0.0003985239850361453, + "loss": 0.81695324, + "num_input_tokens_seen": 250879152, + "router_z_loss_mlp": 0.29541016, + "step": 3005, + "time_per_iteration": 2.6148018836975098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102049, + "balance_loss_mlp": 1.07281876, + "epoch": 0.5782993459022701, + "flos": 506295318528.0, + "grad_norm": 0.0659443256400084, + "language_loss": 0.84734911, + "learning_rate": 0.0003982189460504777, + "loss": 0.85836959, + "num_input_tokens_seen": 250949904, + "router_z_loss_mlp": 0.29199219, + "step": 3006, + "time_per_iteration": 2.7011501789093018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105808, + "balance_loss_mlp": 1.07540917, + "epoch": 0.5784917275875336, + "flos": 602155938816.0, + "grad_norm": 0.06531961229333205, + "language_loss": 0.7939682, + "learning_rate": 0.00039791394657971935, + "loss": 0.80502629, + "num_input_tokens_seen": 251020976, + "router_z_loss_mlp": 0.30371094, + "step": 3007, + "time_per_iteration": 2.7082760334014893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102193, + "balance_loss_mlp": 1.07234263, + "epoch": 0.5786841092727972, + "flos": 521540425728.0, + "grad_norm": 0.06476760562978502, + "language_loss": 0.8421638, + "learning_rate": 0.00039760898674228205, + "loss": 0.85318571, + "num_input_tokens_seen": 251093280, + "router_z_loss_mlp": 0.29858398, + "step": 3008, + "time_per_iteration": 2.650878429412842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105056, + "balance_loss_mlp": 1.07475293, + "epoch": 0.5788764909580608, + "flos": 767404357632.0, + "grad_norm": 0.05525540739637584, + "language_loss": 0.80765337, + "learning_rate": 0.0003973040666565613, + "loss": 0.81870395, + "num_input_tokens_seen": 251181376, + "router_z_loss_mlp": 0.30273438, + "step": 3009, + "time_per_iteration": 3.1226985454559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100227, + "balance_loss_mlp": 1.07030547, + "epoch": 0.5790688726433244, + "flos": 599094434304.0, + "grad_norm": 0.06024611276807751, + "language_loss": 0.82195163, + "learning_rate": 0.000396999186440938, + "loss": 0.83295393, + "num_input_tokens_seen": 251256176, + "router_z_loss_mlp": 0.29882812, + "step": 3010, + "time_per_iteration": 2.844270944595337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096543, + "balance_loss_mlp": 1.06533396, + "epoch": 0.5792612543285879, + "flos": 523064936448.0, + "grad_norm": 0.06262665363935188, + "language_loss": 0.85208702, + "learning_rate": 0.000396694346213777, + "loss": 0.86305249, + "num_input_tokens_seen": 251325344, + "router_z_loss_mlp": 0.31176758, + "step": 3011, + "time_per_iteration": 2.613032817840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109368, + "balance_loss_mlp": 1.06492627, + "epoch": 0.5794536360138515, + "flos": 876557915136.0, + "grad_norm": 0.05937601459412264, + "language_loss": 0.83947617, + "learning_rate": 0.0003963895460934276, + "loss": 0.85041296, + "num_input_tokens_seen": 251406656, + "router_z_loss_mlp": 0.28735352, + "step": 3012, + "time_per_iteration": 3.124514102935791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091421, + "balance_loss_mlp": 1.05992579, + "epoch": 0.5796460176991151, + "flos": 401436311040.0, + "grad_norm": 0.07020347624432877, + "language_loss": 0.8493948, + "learning_rate": 0.00039608478619822376, + "loss": 0.86030906, + "num_input_tokens_seen": 251467760, + "router_z_loss_mlp": 0.31494141, + "step": 3013, + "time_per_iteration": 2.411346912384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084005, + "balance_loss_mlp": 1.05544281, + "epoch": 0.5798383993843786, + "flos": 618517721088.0, + "grad_norm": 0.05715104374994747, + "language_loss": 0.826662, + "learning_rate": 0.00039578006664648394, + "loss": 0.83750206, + "num_input_tokens_seen": 251542272, + "router_z_loss_mlp": 0.28564453, + "step": 3014, + "time_per_iteration": 2.7363553047180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082695, + "balance_loss_mlp": 1.05310702, + "epoch": 0.5800307810696421, + "flos": 844331019264.0, + "grad_norm": 0.06684904609650524, + "language_loss": 0.81588256, + "learning_rate": 0.0003954753875565105, + "loss": 0.82670951, + "num_input_tokens_seen": 251625584, + "router_z_loss_mlp": 0.2956543, + "step": 3015, + "time_per_iteration": 3.089769124984741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107747, + "balance_loss_mlp": 1.04890752, + "epoch": 0.5802231627549057, + "flos": 569276729856.0, + "grad_norm": 0.06478579772376787, + "language_loss": 0.82758343, + "learning_rate": 0.00039517074904659057, + "loss": 0.83835804, + "num_input_tokens_seen": 251696704, + "router_z_loss_mlp": 0.28564453, + "step": 3016, + "time_per_iteration": 2.7099101543426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084798, + "balance_loss_mlp": 1.05454302, + "epoch": 0.5804155444401693, + "flos": 660459930624.0, + "grad_norm": 0.05410468367994604, + "language_loss": 0.84939837, + "learning_rate": 0.00039486615123499535, + "loss": 0.8602463, + "num_input_tokens_seen": 251774784, + "router_z_loss_mlp": 0.30224609, + "step": 3017, + "time_per_iteration": 2.8504526615142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085654, + "balance_loss_mlp": 1.05532694, + "epoch": 0.5806079261254329, + "flos": 513992024064.0, + "grad_norm": 0.05526317953318916, + "language_loss": 0.85137427, + "learning_rate": 0.00039456159423997996, + "loss": 0.86223084, + "num_input_tokens_seen": 251844768, + "router_z_loss_mlp": 0.30297852, + "step": 3018, + "time_per_iteration": 2.633484363555908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082353, + "balance_loss_mlp": 1.0523833, + "epoch": 0.5808003078106965, + "flos": 528646487040.0, + "grad_norm": 0.07104600615119407, + "language_loss": 0.8999185, + "learning_rate": 0.00039425707817978406, + "loss": 0.91074204, + "num_input_tokens_seen": 251912736, + "router_z_loss_mlp": 0.29956055, + "step": 3019, + "time_per_iteration": 2.6299033164978027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082814, + "balance_loss_mlp": 1.05241609, + "epoch": 0.58099268949596, + "flos": 477028611072.0, + "grad_norm": 0.05724387536038855, + "language_loss": 0.83951199, + "learning_rate": 0.00039395260317263124, + "loss": 0.85034013, + "num_input_tokens_seen": 251979328, + "router_z_loss_mlp": 0.30395508, + "step": 3020, + "time_per_iteration": 2.5456759929656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080319, + "balance_loss_mlp": 1.04996824, + "epoch": 0.5811850711812235, + "flos": 517609294848.0, + "grad_norm": 0.07612516842687451, + "language_loss": 0.85048491, + "learning_rate": 0.0003936481693367291, + "loss": 0.86128807, + "num_input_tokens_seen": 252050928, + "router_z_loss_mlp": 0.3034668, + "step": 3021, + "time_per_iteration": 2.7192864418029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094567, + "balance_loss_mlp": 1.06259549, + "epoch": 0.5813774528664871, + "flos": 616422389760.0, + "grad_norm": 0.08707963459833061, + "language_loss": 0.882092, + "learning_rate": 0.0003933437767902697, + "loss": 0.89303768, + "num_input_tokens_seen": 252126496, + "router_z_loss_mlp": 0.31958008, + "step": 3022, + "time_per_iteration": 2.7938294410705566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088206, + "balance_loss_mlp": 1.05792677, + "epoch": 0.5815698345517507, + "flos": 567475435008.0, + "grad_norm": 0.07541432505918821, + "language_loss": 0.7834546, + "learning_rate": 0.00039303942565142825, + "loss": 0.79433668, + "num_input_tokens_seen": 252203008, + "router_z_loss_mlp": 0.30249023, + "step": 3023, + "time_per_iteration": 2.7417471408843994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091198, + "balance_loss_mlp": 1.06089532, + "epoch": 0.5817622162370142, + "flos": 563168775168.0, + "grad_norm": 0.05482425315239383, + "language_loss": 0.76731157, + "learning_rate": 0.0003927351160383644, + "loss": 0.77822357, + "num_input_tokens_seen": 252283440, + "router_z_loss_mlp": 0.30249023, + "step": 3024, + "time_per_iteration": 2.804474353790283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091546, + "balance_loss_mlp": 1.06193483, + "epoch": 0.5819545979222778, + "flos": 459216470016.0, + "grad_norm": 0.05202928961884776, + "language_loss": 0.77983212, + "learning_rate": 0.000392430848069222, + "loss": 0.79074758, + "num_input_tokens_seen": 252351760, + "router_z_loss_mlp": 0.29589844, + "step": 3025, + "time_per_iteration": 2.530200958251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097141, + "balance_loss_mlp": 1.06814933, + "epoch": 0.5821469796075414, + "flos": 541475062272.0, + "grad_norm": 0.058580785743037773, + "language_loss": 0.82503867, + "learning_rate": 0.00039212662186212795, + "loss": 0.8360101, + "num_input_tokens_seen": 252418480, + "router_z_loss_mlp": 0.28979492, + "step": 3026, + "time_per_iteration": 2.592423677444458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094902, + "balance_loss_mlp": 1.06676841, + "epoch": 0.582339361292805, + "flos": 552262634496.0, + "grad_norm": 0.04855017878997747, + "language_loss": 0.7719928, + "learning_rate": 0.0003918224375351934, + "loss": 0.78294182, + "num_input_tokens_seen": 252493712, + "router_z_loss_mlp": 0.28149414, + "step": 3027, + "time_per_iteration": 2.7347710132598877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101546, + "balance_loss_mlp": 1.0716958, + "epoch": 0.5825317429780685, + "flos": 496399767552.0, + "grad_norm": 0.05175541468331668, + "language_loss": 0.7881335, + "learning_rate": 0.0003915182952065135, + "loss": 0.79914892, + "num_input_tokens_seen": 252566096, + "router_z_loss_mlp": 0.29858398, + "step": 3028, + "time_per_iteration": 2.698678493499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105793, + "balance_loss_mlp": 1.07684946, + "epoch": 0.582724124663332, + "flos": 564162112512.0, + "grad_norm": 0.051679899573834884, + "language_loss": 0.87814313, + "learning_rate": 0.0003912141949941664, + "loss": 0.88920105, + "num_input_tokens_seen": 252639424, + "router_z_loss_mlp": 0.2890625, + "step": 3029, + "time_per_iteration": 2.703824520111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107968, + "balance_loss_mlp": 1.07675922, + "epoch": 0.5829165063485956, + "flos": 492132754944.0, + "grad_norm": 0.07311487113166662, + "language_loss": 0.82985795, + "learning_rate": 0.0003909101370162143, + "loss": 0.84093761, + "num_input_tokens_seen": 252706672, + "router_z_loss_mlp": 0.31201172, + "step": 3030, + "time_per_iteration": 2.601590633392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101355, + "balance_loss_mlp": 1.00611103, + "epoch": 0.5831088880338592, + "flos": 1528880997888.0, + "grad_norm": 0.01566462127280147, + "language_loss": 0.72433889, + "learning_rate": 0.00039060612139070326, + "loss": 0.73447442, + "num_input_tokens_seen": 252932464, + "router_z_loss_mlp": 0.07421875, + "step": 3031, + "time_per_iteration": 4.907916307449341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103812, + "balance_loss_mlp": 1.07403314, + "epoch": 0.5833012697191228, + "flos": 618011140608.0, + "grad_norm": 0.05748921462157389, + "language_loss": 0.8307178, + "learning_rate": 0.0003903021482356622, + "loss": 0.84175599, + "num_input_tokens_seen": 253011920, + "router_z_loss_mlp": 0.29760742, + "step": 3032, + "time_per_iteration": 2.8251240253448486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104484, + "balance_loss_mlp": 1.07525432, + "epoch": 0.5834936514043862, + "flos": 767920849920.0, + "grad_norm": 0.054780146703337314, + "language_loss": 0.82722723, + "learning_rate": 0.00038999821766910465, + "loss": 0.83827209, + "num_input_tokens_seen": 253091552, + "router_z_loss_mlp": 0.29248047, + "step": 3033, + "time_per_iteration": 2.9882729053497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108478, + "balance_loss_mlp": 1.07996285, + "epoch": 0.5836860330896498, + "flos": 458371436544.0, + "grad_norm": 0.08031037628307693, + "language_loss": 0.86154497, + "learning_rate": 0.00038969432980902606, + "loss": 0.87262976, + "num_input_tokens_seen": 253158608, + "router_z_loss_mlp": 0.28540039, + "step": 3034, + "time_per_iteration": 2.597313642501831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018692, + "balance_loss_mlp": 1.01149189, + "epoch": 0.5838784147749134, + "flos": 1361225585664.0, + "grad_norm": 0.013503469394203483, + "language_loss": 0.79784501, + "learning_rate": 0.0003893904847734068, + "loss": 0.80803192, + "num_input_tokens_seen": 253381184, + "router_z_loss_mlp": 0.07177734, + "step": 3035, + "time_per_iteration": 4.801652669906616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113026, + "balance_loss_mlp": 1.08374798, + "epoch": 0.584070796460177, + "flos": 567211133952.0, + "grad_norm": 0.0646542819028206, + "language_loss": 0.82506442, + "learning_rate": 0.00038908668268020953, + "loss": 0.83619463, + "num_input_tokens_seen": 253452880, + "router_z_loss_mlp": 0.29223633, + "step": 3036, + "time_per_iteration": 2.6857457160949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112518, + "balance_loss_mlp": 1.08381224, + "epoch": 0.5842631781454406, + "flos": 611483240448.0, + "grad_norm": 0.21422512196310703, + "language_loss": 0.85166728, + "learning_rate": 0.00038878292364738097, + "loss": 0.86279243, + "num_input_tokens_seen": 253530000, + "router_z_loss_mlp": 0.28662109, + "step": 3037, + "time_per_iteration": 2.776686191558838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106641, + "balance_loss_mlp": 1.07726789, + "epoch": 0.5844555598307041, + "flos": 463384737792.0, + "grad_norm": 0.0719771880124652, + "language_loss": 0.87355781, + "learning_rate": 0.0003884792077928508, + "loss": 0.88462424, + "num_input_tokens_seen": 253593504, + "router_z_loss_mlp": 0.29345703, + "step": 3038, + "time_per_iteration": 2.5682616233825684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102228, + "balance_loss_mlp": 1.07304573, + "epoch": 0.5846479415159677, + "flos": 410215186944.0, + "grad_norm": 0.06153670645771429, + "language_loss": 0.7661767, + "learning_rate": 0.0003881755352345322, + "loss": 0.77719897, + "num_input_tokens_seen": 253657904, + "router_z_loss_mlp": 0.29174805, + "step": 3039, + "time_per_iteration": 2.5531814098358154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104927, + "balance_loss_mlp": 1.07560194, + "epoch": 0.5848403232012312, + "flos": 491297633280.0, + "grad_norm": 0.05739173880603102, + "language_loss": 0.86896229, + "learning_rate": 0.0003878719060903207, + "loss": 0.88001162, + "num_input_tokens_seen": 253725280, + "router_z_loss_mlp": 0.29296875, + "step": 3040, + "time_per_iteration": 2.593386650085449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098868, + "balance_loss_mlp": 1.06908977, + "epoch": 0.5850327048864948, + "flos": 584417949696.0, + "grad_norm": 0.068924296543817, + "language_loss": 0.84256113, + "learning_rate": 0.0003875683204780961, + "loss": 0.85354984, + "num_input_tokens_seen": 253795040, + "router_z_loss_mlp": 0.29785156, + "step": 3041, + "time_per_iteration": 2.6921916007995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100828, + "balance_loss_mlp": 1.07145464, + "epoch": 0.5852250865717584, + "flos": 651545233920.0, + "grad_norm": 0.07404975426077917, + "language_loss": 0.85055083, + "learning_rate": 0.00038726477851572043, + "loss": 0.86155903, + "num_input_tokens_seen": 253866384, + "router_z_loss_mlp": 0.29394531, + "step": 3042, + "time_per_iteration": 2.76772403717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090937, + "balance_loss_mlp": 1.06249356, + "epoch": 0.5854174682570219, + "flos": 534588885504.0, + "grad_norm": 0.06423863125550561, + "language_loss": 0.80573255, + "learning_rate": 0.0003869612803210395, + "loss": 0.81664193, + "num_input_tokens_seen": 253935712, + "router_z_loss_mlp": 0.28442383, + "step": 3043, + "time_per_iteration": 2.6271820068359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092493, + "balance_loss_mlp": 1.06314421, + "epoch": 0.5856098499422855, + "flos": 509752175616.0, + "grad_norm": 0.07232729129784332, + "language_loss": 0.83455092, + "learning_rate": 0.0003866578260118817, + "loss": 0.84547591, + "num_input_tokens_seen": 254003152, + "router_z_loss_mlp": 0.29345703, + "step": 3044, + "time_per_iteration": 2.583698272705078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084561, + "balance_loss_mlp": 1.05616593, + "epoch": 0.5858022316275491, + "flos": 593893555200.0, + "grad_norm": 0.059856611418728146, + "language_loss": 0.83175647, + "learning_rate": 0.0003863544157060581, + "loss": 0.84260201, + "num_input_tokens_seen": 254072816, + "router_z_loss_mlp": 0.28369141, + "step": 3045, + "time_per_iteration": 2.656282663345337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090685, + "balance_loss_mlp": 1.06166923, + "epoch": 0.5859946133128127, + "flos": 559126416384.0, + "grad_norm": 0.05199684229497746, + "language_loss": 0.82254589, + "learning_rate": 0.0003860510495213634, + "loss": 0.8334527, + "num_input_tokens_seen": 254152800, + "router_z_loss_mlp": 0.28979492, + "step": 3046, + "time_per_iteration": 2.7998342514038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090034, + "balance_loss_mlp": 1.05946922, + "epoch": 0.5861869949980761, + "flos": 553695740928.0, + "grad_norm": 0.08208062967584176, + "language_loss": 0.78349328, + "learning_rate": 0.0003857477275755746, + "loss": 0.7943936, + "num_input_tokens_seen": 254224384, + "router_z_loss_mlp": 0.30517578, + "step": 3047, + "time_per_iteration": 2.6120448112487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088733, + "balance_loss_mlp": 1.05940795, + "epoch": 0.5863793766833397, + "flos": 718667375616.0, + "grad_norm": 0.0525859268526321, + "language_loss": 0.83988523, + "learning_rate": 0.00038544444998645167, + "loss": 0.8507725, + "num_input_tokens_seen": 254310960, + "router_z_loss_mlp": 0.29296875, + "step": 3048, + "time_per_iteration": 2.9847609996795654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085173, + "balance_loss_mlp": 1.0563724, + "epoch": 0.5865717583686033, + "flos": 472289522688.0, + "grad_norm": 0.06739730522499447, + "language_loss": 0.82059789, + "learning_rate": 0.00038514121687173767, + "loss": 0.83144969, + "num_input_tokens_seen": 254378336, + "router_z_loss_mlp": 0.28808594, + "step": 3049, + "time_per_iteration": 2.619170904159546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081984, + "balance_loss_mlp": 1.0529443, + "epoch": 0.5867641400538669, + "flos": 813482901504.0, + "grad_norm": 0.07072588382777995, + "language_loss": 0.82076973, + "learning_rate": 0.00038483802834915807, + "loss": 0.83158958, + "num_input_tokens_seen": 254454352, + "router_z_loss_mlp": 0.29003906, + "step": 3050, + "time_per_iteration": 2.9947521686553955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076603, + "balance_loss_mlp": 1.04742062, + "epoch": 0.5869565217391305, + "flos": 486531380736.0, + "grad_norm": 0.0556694240307722, + "language_loss": 0.7980268, + "learning_rate": 0.00038453488453642074, + "loss": 0.80879277, + "num_input_tokens_seen": 254526352, + "router_z_loss_mlp": 0.29174805, + "step": 3051, + "time_per_iteration": 2.659647226333618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081993, + "balance_loss_mlp": 1.05133235, + "epoch": 0.587148903424394, + "flos": 569385386496.0, + "grad_norm": 0.055022006168623364, + "language_loss": 0.8682425, + "learning_rate": 0.00038423178555121697, + "loss": 0.87906241, + "num_input_tokens_seen": 254598720, + "router_z_loss_mlp": 0.30664062, + "step": 3052, + "time_per_iteration": 2.682971954345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078016, + "balance_loss_mlp": 1.0489769, + "epoch": 0.5873412851096576, + "flos": 747296824320.0, + "grad_norm": 0.05776598371070369, + "language_loss": 0.85701603, + "learning_rate": 0.00038392873151121994, + "loss": 0.86779618, + "num_input_tokens_seen": 254683664, + "router_z_loss_mlp": 0.29052734, + "step": 3053, + "time_per_iteration": 3.060055732727051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077537, + "balance_loss_mlp": 1.04883146, + "epoch": 0.5875336667949211, + "flos": 528142477824.0, + "grad_norm": 0.06401371867882108, + "language_loss": 0.83262593, + "learning_rate": 0.0003836257225340859, + "loss": 0.84340131, + "num_input_tokens_seen": 254754688, + "router_z_loss_mlp": 0.28686523, + "step": 3054, + "time_per_iteration": 2.680649995803833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079048, + "balance_loss_mlp": 1.04853082, + "epoch": 0.5877260484801847, + "flos": 824166586368.0, + "grad_norm": 0.058869654242756926, + "language_loss": 0.82344568, + "learning_rate": 0.00038332275873745336, + "loss": 0.83423615, + "num_input_tokens_seen": 254838976, + "router_z_loss_mlp": 0.3046875, + "step": 3055, + "time_per_iteration": 3.036266565322876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108387, + "balance_loss_mlp": 1.05485463, + "epoch": 0.5879184301654482, + "flos": 591598162944.0, + "grad_norm": 0.05256953045681507, + "language_loss": 0.83349717, + "learning_rate": 0.0003830198402389431, + "loss": 0.84433585, + "num_input_tokens_seen": 254912912, + "router_z_loss_mlp": 0.2902832, + "step": 3056, + "time_per_iteration": 2.68835711479187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069074, + "balance_loss_mlp": 1.06163549, + "epoch": 0.5881108118507118, + "flos": 1545805513728.0, + "grad_norm": 0.04626706953255302, + "language_loss": 0.77348936, + "learning_rate": 0.0003827169671561585, + "loss": 0.78418016, + "num_input_tokens_seen": 255151488, + "router_z_loss_mlp": 0.07421875, + "step": 3057, + "time_per_iteration": 4.978636026382446 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082198, + "balance_loss_mlp": 1.05349255, + "epoch": 0.5883031935359754, + "flos": 489597654528.0, + "grad_norm": 0.07448060145489646, + "language_loss": 0.83136308, + "learning_rate": 0.0003824141396066855, + "loss": 0.84218502, + "num_input_tokens_seen": 255218896, + "router_z_loss_mlp": 0.28710938, + "step": 3058, + "time_per_iteration": 2.5531108379364014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088619, + "balance_loss_mlp": 1.05910254, + "epoch": 0.588495575221239, + "flos": 582836539392.0, + "grad_norm": 0.059082946906010764, + "language_loss": 0.82999164, + "learning_rate": 0.000382111357708092, + "loss": 0.84087777, + "num_input_tokens_seen": 255287408, + "router_z_loss_mlp": 0.29541016, + "step": 3059, + "time_per_iteration": 2.699920654296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088385, + "balance_loss_mlp": 1.05917883, + "epoch": 0.5886879569065026, + "flos": 661048003584.0, + "grad_norm": 0.071653907002528, + "language_loss": 0.84021831, + "learning_rate": 0.00038180862157792864, + "loss": 0.85110211, + "num_input_tokens_seen": 255358432, + "router_z_loss_mlp": 0.29174805, + "step": 3060, + "time_per_iteration": 2.8073549270629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084579, + "balance_loss_mlp": 1.05642152, + "epoch": 0.588880338591766, + "flos": 562657425408.0, + "grad_norm": 0.05679216094879844, + "language_loss": 0.82328987, + "learning_rate": 0.0003815059313337279, + "loss": 0.83413565, + "num_input_tokens_seen": 255425744, + "router_z_loss_mlp": 0.28198242, + "step": 3061, + "time_per_iteration": 2.6649534702301025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086963, + "balance_loss_mlp": 1.05906773, + "epoch": 0.5890727202770296, + "flos": 554730923520.0, + "grad_norm": 0.07322136366051005, + "language_loss": 0.78155029, + "learning_rate": 0.00038120328709300436, + "loss": 0.79241997, + "num_input_tokens_seen": 255505808, + "router_z_loss_mlp": 0.27905273, + "step": 3062, + "time_per_iteration": 2.9070422649383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091769, + "balance_loss_mlp": 1.06191885, + "epoch": 0.5892651019622932, + "flos": 655520781312.0, + "grad_norm": 0.07246450050077374, + "language_loss": 0.83913672, + "learning_rate": 0.0003809006889732549, + "loss": 0.85005438, + "num_input_tokens_seen": 255580160, + "router_z_loss_mlp": 0.29833984, + "step": 3063, + "time_per_iteration": 2.803724527359009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092624, + "balance_loss_mlp": 1.06420445, + "epoch": 0.5894574836475568, + "flos": 453202490880.0, + "grad_norm": 0.05969034427320992, + "language_loss": 0.88370293, + "learning_rate": 0.0003805981370919589, + "loss": 0.89462918, + "num_input_tokens_seen": 255644016, + "router_z_loss_mlp": 0.28442383, + "step": 3064, + "time_per_iteration": 2.495248556137085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086784, + "balance_loss_mlp": 1.05877018, + "epoch": 0.5896498653328203, + "flos": 519032489472.0, + "grad_norm": 0.05081424319280643, + "language_loss": 0.83982229, + "learning_rate": 0.0003802956315665771, + "loss": 0.85069013, + "num_input_tokens_seen": 255718192, + "router_z_loss_mlp": 0.28027344, + "step": 3065, + "time_per_iteration": 2.6511592864990234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091365, + "balance_loss_mlp": 1.06182539, + "epoch": 0.5898422470180839, + "flos": 549050628096.0, + "grad_norm": 0.06728201091458674, + "language_loss": 0.81791949, + "learning_rate": 0.0003799931725145529, + "loss": 0.8288331, + "num_input_tokens_seen": 255787696, + "router_z_loss_mlp": 0.29516602, + "step": 3066, + "time_per_iteration": 2.6066951751708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095665, + "balance_loss_mlp": 1.06729341, + "epoch": 0.5900346287033474, + "flos": 524312663040.0, + "grad_norm": 0.05193283223246739, + "language_loss": 0.86020327, + "learning_rate": 0.00037969076005331083, + "loss": 0.87115991, + "num_input_tokens_seen": 255862992, + "router_z_loss_mlp": 0.28369141, + "step": 3067, + "time_per_iteration": 2.763853073120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096344, + "balance_loss_mlp": 1.06713736, + "epoch": 0.590227010388611, + "flos": 567156805632.0, + "grad_norm": 0.05663918686290471, + "language_loss": 0.88129491, + "learning_rate": 0.00037938839430025817, + "loss": 0.89225829, + "num_input_tokens_seen": 255931872, + "router_z_loss_mlp": 0.29248047, + "step": 3068, + "time_per_iteration": 2.6258280277252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089417, + "balance_loss_mlp": 1.06092644, + "epoch": 0.5904193920738746, + "flos": 583333208064.0, + "grad_norm": 0.05275324094783275, + "language_loss": 0.85889924, + "learning_rate": 0.0003790860753727835, + "loss": 0.86979342, + "num_input_tokens_seen": 256004656, + "router_z_loss_mlp": 0.28491211, + "step": 3069, + "time_per_iteration": 2.7926387786865234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086892, + "balance_loss_mlp": 1.05799568, + "epoch": 0.5906117737591381, + "flos": 529701493248.0, + "grad_norm": 0.0573953914976859, + "language_loss": 0.8280952, + "learning_rate": 0.00037878380338825766, + "loss": 0.83896416, + "num_input_tokens_seen": 256076944, + "router_z_loss_mlp": 0.28881836, + "step": 3070, + "time_per_iteration": 2.6791534423828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089394, + "balance_loss_mlp": 1.06209493, + "epoch": 0.5908041554444017, + "flos": 684229151232.0, + "grad_norm": 0.054269754776710775, + "language_loss": 0.81082213, + "learning_rate": 0.00037848157846403287, + "loss": 0.82171613, + "num_input_tokens_seen": 256154768, + "router_z_loss_mlp": 0.2734375, + "step": 3071, + "time_per_iteration": 2.897139549255371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095364, + "balance_loss_mlp": 1.06792235, + "epoch": 0.5909965371296653, + "flos": 550001746944.0, + "grad_norm": 0.0725138562855444, + "language_loss": 0.83259237, + "learning_rate": 0.0003781794007174435, + "loss": 0.84354603, + "num_input_tokens_seen": 256230896, + "router_z_loss_mlp": 0.2746582, + "step": 3072, + "time_per_iteration": 2.724810838699341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0103656, + "balance_loss_mlp": 1.02988398, + "epoch": 0.5911889188149289, + "flos": 1492361750016.0, + "grad_norm": 0.01939748854391394, + "language_loss": 0.74074531, + "learning_rate": 0.0003778772702658051, + "loss": 0.75111091, + "num_input_tokens_seen": 256462336, + "router_z_loss_mlp": 0.06689453, + "step": 3073, + "time_per_iteration": 4.9330198764801025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090512, + "balance_loss_mlp": 1.06285512, + "epoch": 0.5913813005001923, + "flos": 487880423424.0, + "grad_norm": 0.048822002095482486, + "language_loss": 0.81208611, + "learning_rate": 0.0003775751872264152, + "loss": 0.82299125, + "num_input_tokens_seen": 256539376, + "router_z_loss_mlp": 0.27661133, + "step": 3074, + "time_per_iteration": 2.7631497383117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084721, + "balance_loss_mlp": 1.05599189, + "epoch": 0.5915736821854559, + "flos": 573331198464.0, + "grad_norm": 0.06348444489710649, + "language_loss": 0.86787391, + "learning_rate": 0.0003772731517165527, + "loss": 0.87872112, + "num_input_tokens_seen": 256617728, + "router_z_loss_mlp": 0.28710938, + "step": 3075, + "time_per_iteration": 2.7517099380493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089134, + "balance_loss_mlp": 1.06069052, + "epoch": 0.5917660638707195, + "flos": 789518389248.0, + "grad_norm": 0.059695821747375526, + "language_loss": 0.83545357, + "learning_rate": 0.0003769711638534784, + "loss": 0.84634489, + "num_input_tokens_seen": 256696032, + "router_z_loss_mlp": 0.28466797, + "step": 3076, + "time_per_iteration": 2.9352333545684814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090964, + "balance_loss_mlp": 1.06209183, + "epoch": 0.5919584455559831, + "flos": 528740462592.0, + "grad_norm": 0.08879190082108672, + "language_loss": 0.79118001, + "learning_rate": 0.00037666922375443446, + "loss": 0.80208963, + "num_input_tokens_seen": 256767360, + "router_z_loss_mlp": 0.28857422, + "step": 3077, + "time_per_iteration": 2.5947184562683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093967, + "balance_loss_mlp": 1.06578577, + "epoch": 0.5921508272412467, + "flos": 560606510592.0, + "grad_norm": 0.06374349472109522, + "language_loss": 0.81828058, + "learning_rate": 0.00037636733153664396, + "loss": 0.82922018, + "num_input_tokens_seen": 256844848, + "router_z_loss_mlp": 0.28149414, + "step": 3078, + "time_per_iteration": 2.8191051483154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109303, + "balance_loss_mlp": 1.0645864, + "epoch": 0.5923432089265102, + "flos": 563272662528.0, + "grad_norm": 0.06406278721713668, + "language_loss": 0.80298102, + "learning_rate": 0.0003760654873173124, + "loss": 0.81391132, + "num_input_tokens_seen": 256916688, + "router_z_loss_mlp": 0.28466797, + "step": 3079, + "time_per_iteration": 2.656822919845581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089541, + "balance_loss_mlp": 1.06081128, + "epoch": 0.5925355906117737, + "flos": 495740113920.0, + "grad_norm": 0.04854482848269962, + "language_loss": 0.82530022, + "learning_rate": 0.00037576369121362566, + "loss": 0.83619559, + "num_input_tokens_seen": 256985520, + "router_z_loss_mlp": 0.28759766, + "step": 3080, + "time_per_iteration": 2.589050531387329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097703, + "balance_loss_mlp": 1.06840181, + "epoch": 0.5927279722970373, + "flos": 566249730048.0, + "grad_norm": 0.05673956944694001, + "language_loss": 0.82090509, + "learning_rate": 0.0003754619433427516, + "loss": 0.83188212, + "num_input_tokens_seen": 257067552, + "router_z_loss_mlp": 0.29272461, + "step": 3081, + "time_per_iteration": 2.8826987743377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086639, + "balance_loss_mlp": 1.05845797, + "epoch": 0.5929203539823009, + "flos": 666970578432.0, + "grad_norm": 0.06493823771045844, + "language_loss": 0.78039849, + "learning_rate": 0.0003751602438218392, + "loss": 0.79126489, + "num_input_tokens_seen": 257138896, + "router_z_loss_mlp": 0.28222656, + "step": 3082, + "time_per_iteration": 2.815852642059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087731, + "balance_loss_mlp": 1.05952644, + "epoch": 0.5931127356675644, + "flos": 555744084480.0, + "grad_norm": 0.08102695368832301, + "language_loss": 0.83818078, + "learning_rate": 0.0003748585927680186, + "loss": 0.84905803, + "num_input_tokens_seen": 257210592, + "router_z_loss_mlp": 0.28198242, + "step": 3083, + "time_per_iteration": 2.6566061973571777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084121, + "balance_loss_mlp": 1.05651248, + "epoch": 0.593305117352828, + "flos": 535194210816.0, + "grad_norm": 0.0619003043193751, + "language_loss": 0.8314001, + "learning_rate": 0.00037455699029840086, + "loss": 0.84224129, + "num_input_tokens_seen": 257276208, + "router_z_loss_mlp": 0.27612305, + "step": 3084, + "time_per_iteration": 2.609382152557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081588, + "balance_loss_mlp": 1.05436099, + "epoch": 0.5934974990380916, + "flos": 593957795328.0, + "grad_norm": 0.05433571826648474, + "language_loss": 0.84684891, + "learning_rate": 0.0003742554365300787, + "loss": 0.85766476, + "num_input_tokens_seen": 257351920, + "router_z_loss_mlp": 0.27270508, + "step": 3085, + "time_per_iteration": 2.725409746170044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086595, + "balance_loss_mlp": 1.05927253, + "epoch": 0.5936898807233552, + "flos": 712673220096.0, + "grad_norm": 0.05832989485618193, + "language_loss": 0.79031849, + "learning_rate": 0.0003739539315801255, + "loss": 0.80118442, + "num_input_tokens_seen": 257430016, + "router_z_loss_mlp": 0.27331543, + "step": 3086, + "time_per_iteration": 2.9751360416412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092425, + "balance_loss_mlp": 1.06493533, + "epoch": 0.5938822624086187, + "flos": 391896465408.0, + "grad_norm": 0.05988774460659005, + "language_loss": 0.9182803, + "learning_rate": 0.000373652475565596, + "loss": 0.92920458, + "num_input_tokens_seen": 257492224, + "router_z_loss_mlp": 0.27490234, + "step": 3087, + "time_per_iteration": 2.535181999206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090266, + "balance_loss_mlp": 1.06144142, + "epoch": 0.5940746440938822, + "flos": 480285033984.0, + "grad_norm": 0.07303028521355714, + "language_loss": 0.81608456, + "learning_rate": 0.00037335106860352587, + "loss": 0.82698727, + "num_input_tokens_seen": 257567824, + "router_z_loss_mlp": 0.28808594, + "step": 3088, + "time_per_iteration": 2.6671407222747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094636, + "balance_loss_mlp": 1.06545377, + "epoch": 0.5942670257791458, + "flos": 483336626688.0, + "grad_norm": 0.0577260245362681, + "language_loss": 0.83174306, + "learning_rate": 0.00037304971081093146, + "loss": 0.84268945, + "num_input_tokens_seen": 257635488, + "router_z_loss_mlp": 0.29199219, + "step": 3089, + "time_per_iteration": 2.5568172931671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091399, + "balance_loss_mlp": 1.06479192, + "epoch": 0.5944594074644094, + "flos": 547936151040.0, + "grad_norm": 0.05440667028717182, + "language_loss": 0.80792761, + "learning_rate": 0.00037274840230481024, + "loss": 0.81884158, + "num_input_tokens_seen": 257709552, + "router_z_loss_mlp": 0.26635742, + "step": 3090, + "time_per_iteration": 2.7040512561798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089877, + "balance_loss_mlp": 1.06152868, + "epoch": 0.594651789149673, + "flos": 449179955712.0, + "grad_norm": 0.07197994401815008, + "language_loss": 0.79483205, + "learning_rate": 0.00037244714320214077, + "loss": 0.80573082, + "num_input_tokens_seen": 257775520, + "router_z_loss_mlp": 0.28369141, + "step": 3091, + "time_per_iteration": 2.527803659439087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091312, + "balance_loss_mlp": 1.06317902, + "epoch": 0.5948441708349365, + "flos": 596267868672.0, + "grad_norm": 0.06270949928795992, + "language_loss": 0.83166003, + "learning_rate": 0.000372145933619882, + "loss": 0.84257317, + "num_input_tokens_seen": 257858560, + "router_z_loss_mlp": 0.28137207, + "step": 3092, + "time_per_iteration": 2.869267225265503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092404, + "balance_loss_mlp": 1.06455636, + "epoch": 0.5950365525202, + "flos": 548516883456.0, + "grad_norm": 0.059066436199884755, + "language_loss": 0.82841283, + "learning_rate": 0.000371844773674974, + "loss": 0.83933693, + "num_input_tokens_seen": 257928048, + "router_z_loss_mlp": 0.27856445, + "step": 3093, + "time_per_iteration": 2.6301257610321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097042, + "balance_loss_mlp": 1.06793106, + "epoch": 0.5952289342054636, + "flos": 654700340736.0, + "grad_norm": 0.06442112613973276, + "language_loss": 0.82118666, + "learning_rate": 0.0003715436634843375, + "loss": 0.83215708, + "num_input_tokens_seen": 258003088, + "router_z_loss_mlp": 0.29101562, + "step": 3094, + "time_per_iteration": 2.8569583892822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091347, + "balance_loss_mlp": 1.06466842, + "epoch": 0.5954213158907272, + "flos": 603364018176.0, + "grad_norm": 0.04641072419683149, + "language_loss": 0.80758119, + "learning_rate": 0.00037124260316487355, + "loss": 0.81849468, + "num_input_tokens_seen": 258084880, + "router_z_loss_mlp": 0.26708984, + "step": 3095, + "time_per_iteration": 2.8417470455169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095419, + "balance_loss_mlp": 1.06838274, + "epoch": 0.5956136975759908, + "flos": 486331319808.0, + "grad_norm": 0.05475651988922655, + "language_loss": 0.89790189, + "learning_rate": 0.0003709415928334643, + "loss": 0.90885603, + "num_input_tokens_seen": 258152032, + "router_z_loss_mlp": 0.27075195, + "step": 3096, + "time_per_iteration": 2.5519328117370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092318, + "balance_loss_mlp": 1.06382728, + "epoch": 0.5958060792612543, + "flos": 658777204224.0, + "grad_norm": 0.09831894239475095, + "language_loss": 0.80721879, + "learning_rate": 0.00037064063260697233, + "loss": 0.818142, + "num_input_tokens_seen": 258228896, + "router_z_loss_mlp": 0.28491211, + "step": 3097, + "time_per_iteration": 2.8612656593322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099184, + "balance_loss_mlp": 1.07157493, + "epoch": 0.5959984609465179, + "flos": 723559537152.0, + "grad_norm": 0.058836420710008684, + "language_loss": 0.78798771, + "learning_rate": 0.0003703397226022407, + "loss": 0.79897952, + "num_input_tokens_seen": 258311152, + "router_z_loss_mlp": 0.27612305, + "step": 3098, + "time_per_iteration": 3.069542169570923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039258, + "balance_loss_mlp": 1.03243947, + "epoch": 0.5961908426317815, + "flos": 1519849557504.0, + "grad_norm": 0.024027627375554906, + "language_loss": 0.75499874, + "learning_rate": 0.00037003886293609335, + "loss": 0.76539135, + "num_input_tokens_seen": 258540656, + "router_z_loss_mlp": 0.06835938, + "step": 3099, + "time_per_iteration": 4.940065860748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109756, + "balance_loss_mlp": 1.06966519, + "epoch": 0.596383224317045, + "flos": 532614693888.0, + "grad_norm": 0.059128365336986094, + "language_loss": 0.83247489, + "learning_rate": 0.0003697380537253339, + "loss": 0.84345049, + "num_input_tokens_seen": 258608960, + "router_z_loss_mlp": 0.27929688, + "step": 3100, + "time_per_iteration": 2.638352632522583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098347, + "balance_loss_mlp": 1.06973624, + "epoch": 0.5965756060023086, + "flos": 591210150912.0, + "grad_norm": 0.05513129923941457, + "language_loss": 0.82084006, + "learning_rate": 0.0003694372950867471, + "loss": 0.83182353, + "num_input_tokens_seen": 258684304, + "router_z_loss_mlp": 0.28637695, + "step": 3101, + "time_per_iteration": 2.7355875968933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101199, + "balance_loss_mlp": 1.07282722, + "epoch": 0.5967679876875721, + "flos": 862054327296.0, + "grad_norm": 0.05863829677079808, + "language_loss": 0.77766848, + "learning_rate": 0.0003691365871370976, + "loss": 0.78868043, + "num_input_tokens_seen": 258769472, + "router_z_loss_mlp": 0.28393555, + "step": 3102, + "time_per_iteration": 3.0227084159851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110316, + "balance_loss_mlp": 1.07533622, + "epoch": 0.5969603693728357, + "flos": 553834132992.0, + "grad_norm": 0.06404713166930852, + "language_loss": 0.85323572, + "learning_rate": 0.00036883592999313093, + "loss": 0.86426735, + "num_input_tokens_seen": 258841696, + "router_z_loss_mlp": 0.27832031, + "step": 3103, + "time_per_iteration": 2.659637689590454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097572, + "balance_loss_mlp": 1.0700587, + "epoch": 0.5971527510580993, + "flos": 718662606336.0, + "grad_norm": 0.05340010645713243, + "language_loss": 0.79008019, + "learning_rate": 0.0003685353237715722, + "loss": 0.80105591, + "num_input_tokens_seen": 258915616, + "router_z_loss_mlp": 0.27563477, + "step": 3104, + "time_per_iteration": 2.9019625186920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109062, + "balance_loss_mlp": 1.06272471, + "epoch": 0.5973451327433629, + "flos": 647631355392.0, + "grad_norm": 0.053396202956180965, + "language_loss": 0.81746447, + "learning_rate": 0.0003682347685891274, + "loss": 0.82837057, + "num_input_tokens_seen": 258994080, + "router_z_loss_mlp": 0.27893066, + "step": 3105, + "time_per_iteration": 2.8479247093200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093319, + "balance_loss_mlp": 1.06535256, + "epoch": 0.5975375144286263, + "flos": 721716397056.0, + "grad_norm": 0.061940030050424234, + "language_loss": 0.80626607, + "learning_rate": 0.0003679342645624822, + "loss": 0.81719923, + "num_input_tokens_seen": 259075968, + "router_z_loss_mlp": 0.2800293, + "step": 3106, + "time_per_iteration": 2.988600015640259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088181, + "balance_loss_mlp": 1.06088209, + "epoch": 0.5977298961138899, + "flos": 750961082880.0, + "grad_norm": 0.06552701347411696, + "language_loss": 0.82154477, + "learning_rate": 0.0003676338118083025, + "loss": 0.83242655, + "num_input_tokens_seen": 259162512, + "router_z_loss_mlp": 0.2734375, + "step": 3107, + "time_per_iteration": 3.0211057662963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091455, + "balance_loss_mlp": 1.06372714, + "epoch": 0.5979222777991535, + "flos": 530961702912.0, + "grad_norm": 0.05808577111452716, + "language_loss": 0.79585344, + "learning_rate": 0.0003673334104432347, + "loss": 0.806768, + "num_input_tokens_seen": 259228752, + "router_z_loss_mlp": 0.27758789, + "step": 3108, + "time_per_iteration": 2.6277918815612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109043, + "balance_loss_mlp": 1.06255877, + "epoch": 0.5981146594844171, + "flos": 621749551104.0, + "grad_norm": 0.05782699460566696, + "language_loss": 0.83817154, + "learning_rate": 0.0003670330605839048, + "loss": 0.84907585, + "num_input_tokens_seen": 259303440, + "router_z_loss_mlp": 0.27856445, + "step": 3109, + "time_per_iteration": 2.786181926727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094155, + "balance_loss_mlp": 1.06685627, + "epoch": 0.5983070411696807, + "flos": 603589045248.0, + "grad_norm": 0.06234839208499282, + "language_loss": 0.76878405, + "learning_rate": 0.0003667327623469191, + "loss": 0.77972555, + "num_input_tokens_seen": 259378752, + "router_z_loss_mlp": 0.27319336, + "step": 3110, + "time_per_iteration": 2.731876850128174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089583, + "balance_loss_mlp": 1.0621767, + "epoch": 0.5984994228549442, + "flos": 633483472896.0, + "grad_norm": 0.06451414709321307, + "language_loss": 0.78028917, + "learning_rate": 0.00036643251584886333, + "loss": 0.79118496, + "num_input_tokens_seen": 259454336, + "router_z_loss_mlp": 0.27429199, + "step": 3111, + "time_per_iteration": 2.796886682510376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088313, + "balance_loss_mlp": 1.06072783, + "epoch": 0.5986918045402078, + "flos": 525278836224.0, + "grad_norm": 0.06854484980093518, + "language_loss": 0.82222939, + "learning_rate": 0.00036613232120630393, + "loss": 0.83311254, + "num_input_tokens_seen": 259518960, + "router_z_loss_mlp": 0.27587891, + "step": 3112, + "time_per_iteration": 2.6065847873687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084107, + "balance_loss_mlp": 1.05594933, + "epoch": 0.5988841862254713, + "flos": 483180982272.0, + "grad_norm": 0.06819300023171558, + "language_loss": 0.80318254, + "learning_rate": 0.00036583217853578643, + "loss": 0.81402361, + "num_input_tokens_seen": 259584352, + "router_z_loss_mlp": 0.28173828, + "step": 3113, + "time_per_iteration": 2.5723838806152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088563, + "balance_loss_mlp": 1.06200337, + "epoch": 0.5990765679107349, + "flos": 1140149924352.0, + "grad_norm": 0.05495468357602656, + "language_loss": 0.77783948, + "learning_rate": 0.000365532087953837, + "loss": 0.78872508, + "num_input_tokens_seen": 259693152, + "router_z_loss_mlp": 0.26586914, + "step": 3114, + "time_per_iteration": 3.622190475463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081565, + "balance_loss_mlp": 1.05359864, + "epoch": 0.5992689495959984, + "flos": 516986717184.0, + "grad_norm": 0.07841874273871757, + "language_loss": 0.89431345, + "learning_rate": 0.00036523204957696065, + "loss": 0.90512908, + "num_input_tokens_seen": 259762048, + "router_z_loss_mlp": 0.27978516, + "step": 3115, + "time_per_iteration": 2.6414806842803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084485, + "balance_loss_mlp": 1.05627978, + "epoch": 0.599461331281262, + "flos": 744618562560.0, + "grad_norm": 0.0586823821525485, + "language_loss": 0.80958188, + "learning_rate": 0.00036493206352164324, + "loss": 0.8204267, + "num_input_tokens_seen": 259843184, + "router_z_loss_mlp": 0.28222656, + "step": 3116, + "time_per_iteration": 2.896613121032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080524, + "balance_loss_mlp": 1.05184269, + "epoch": 0.5996537129665256, + "flos": 592359132672.0, + "grad_norm": 0.05558165654665051, + "language_loss": 0.85426074, + "learning_rate": 0.000364632129904349, + "loss": 0.86506593, + "num_input_tokens_seen": 259912720, + "router_z_loss_mlp": 0.28662109, + "step": 3117, + "time_per_iteration": 2.7053070068359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079719, + "balance_loss_mlp": 1.05215788, + "epoch": 0.5998460946517892, + "flos": 559010419200.0, + "grad_norm": 0.05806752486487043, + "language_loss": 0.78326154, + "learning_rate": 0.00036433224884152283, + "loss": 0.79405868, + "num_input_tokens_seen": 259985472, + "router_z_loss_mlp": 0.27587891, + "step": 3118, + "time_per_iteration": 2.6854429244995117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083264, + "balance_loss_mlp": 1.0547967, + "epoch": 0.6000384763370528, + "flos": 484567100928.0, + "grad_norm": 0.06710995797512392, + "language_loss": 0.78089821, + "learning_rate": 0.00036403242044958875, + "loss": 0.79173082, + "num_input_tokens_seen": 260050336, + "router_z_loss_mlp": 0.28466797, + "step": 3119, + "time_per_iteration": 2.53751540184021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077727, + "balance_loss_mlp": 1.04949808, + "epoch": 0.6002308580223162, + "flos": 596767108608.0, + "grad_norm": 0.059219046094812676, + "language_loss": 0.91922826, + "learning_rate": 0.0003637326448449507, + "loss": 0.93000555, + "num_input_tokens_seen": 260120304, + "router_z_loss_mlp": 0.28222656, + "step": 3120, + "time_per_iteration": 2.7070553302764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075191, + "balance_loss_mlp": 1.04855967, + "epoch": 0.6004232397075798, + "flos": 545146661376.0, + "grad_norm": 0.05784920643643932, + "language_loss": 0.86244148, + "learning_rate": 0.00036343292214399177, + "loss": 0.87319338, + "num_input_tokens_seen": 260198304, + "router_z_loss_mlp": 0.2668457, + "step": 3121, + "time_per_iteration": 2.790273904800415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108187, + "balance_loss_mlp": 1.05368924, + "epoch": 0.6006156213928434, + "flos": 629947694592.0, + "grad_norm": 0.061762558273937264, + "language_loss": 0.77535498, + "learning_rate": 0.00036313325246307456, + "loss": 0.78617358, + "num_input_tokens_seen": 260277664, + "router_z_loss_mlp": 0.28149414, + "step": 3122, + "time_per_iteration": 2.8160674571990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085554, + "balance_loss_mlp": 1.05804014, + "epoch": 0.600808003078107, + "flos": 582315277824.0, + "grad_norm": 0.06096373394010022, + "language_loss": 0.8757152, + "learning_rate": 0.0003628336359185411, + "loss": 0.88657075, + "num_input_tokens_seen": 260350096, + "router_z_loss_mlp": 0.27539062, + "step": 3123, + "time_per_iteration": 2.6819381713867188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083737, + "balance_loss_mlp": 1.05708146, + "epoch": 0.6010003847633705, + "flos": 635274855936.0, + "grad_norm": 0.07022869927973763, + "language_loss": 0.75776213, + "learning_rate": 0.000362534072626713, + "loss": 0.76859951, + "num_input_tokens_seen": 260421888, + "router_z_loss_mlp": 0.2668457, + "step": 3124, + "time_per_iteration": 2.740907907485962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083476, + "balance_loss_mlp": 1.05572367, + "epoch": 0.6011927664486341, + "flos": 718763922432.0, + "grad_norm": 0.05823250121923288, + "language_loss": 0.81532884, + "learning_rate": 0.00036223456270389093, + "loss": 0.82616365, + "num_input_tokens_seen": 260499616, + "router_z_loss_mlp": 0.27758789, + "step": 3125, + "time_per_iteration": 2.9345879554748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090578, + "balance_loss_mlp": 1.06254041, + "epoch": 0.6013851481338977, + "flos": 499036184064.0, + "grad_norm": 0.05438265607227417, + "language_loss": 0.81106913, + "learning_rate": 0.00036193510626635517, + "loss": 0.82197487, + "num_input_tokens_seen": 260572048, + "router_z_loss_mlp": 0.28076172, + "step": 3126, + "time_per_iteration": 2.719505786895752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092388, + "balance_loss_mlp": 1.06581664, + "epoch": 0.6015775298191612, + "flos": 749587447296.0, + "grad_norm": 0.06352965026992909, + "language_loss": 0.8166849, + "learning_rate": 0.0003616357034303649, + "loss": 0.82760876, + "num_input_tokens_seen": 260644720, + "router_z_loss_mlp": 0.26623535, + "step": 3127, + "time_per_iteration": 2.917137861251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101882, + "balance_loss_mlp": 1.0748688, + "epoch": 0.6017699115044248, + "flos": 593063202816.0, + "grad_norm": 0.06152140222119449, + "language_loss": 0.7902928, + "learning_rate": 0.0003613363543121584, + "loss": 0.80131161, + "num_input_tokens_seen": 260724864, + "router_z_loss_mlp": 0.27050781, + "step": 3128, + "time_per_iteration": 2.8336853981018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098033, + "balance_loss_mlp": 1.07082987, + "epoch": 0.6019622931896883, + "flos": 515111270400.0, + "grad_norm": 0.1105531777946672, + "language_loss": 0.85000741, + "learning_rate": 0.00036103705902795357, + "loss": 0.86098778, + "num_input_tokens_seen": 260800896, + "router_z_loss_mlp": 0.2722168, + "step": 3129, + "time_per_iteration": 2.6958324909210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107326, + "balance_loss_mlp": 1.07933569, + "epoch": 0.6021546748749519, + "flos": 490469852160.0, + "grad_norm": 0.08057315277867966, + "language_loss": 0.79751796, + "learning_rate": 0.0003607378176939471, + "loss": 0.80859125, + "num_input_tokens_seen": 260872736, + "router_z_loss_mlp": 0.2800293, + "step": 3130, + "time_per_iteration": 2.609400510787964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109547, + "balance_loss_mlp": 1.06817079, + "epoch": 0.6023470565602155, + "flos": 541032721920.0, + "grad_norm": 0.0756423011045038, + "language_loss": 0.82227194, + "learning_rate": 0.00036043863042631465, + "loss": 0.83322662, + "num_input_tokens_seen": 260943264, + "router_z_loss_mlp": 0.2734375, + "step": 3131, + "time_per_iteration": 2.6571097373962402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090865, + "balance_loss_mlp": 1.06409097, + "epoch": 0.6025394382454791, + "flos": 845020408320.0, + "grad_norm": 0.07645469837417121, + "language_loss": 0.76662207, + "learning_rate": 0.00036013949734121133, + "loss": 0.77753073, + "num_input_tokens_seen": 261030064, + "router_z_loss_mlp": 0.26782227, + "step": 3132, + "time_per_iteration": 3.118265390396118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096725, + "balance_loss_mlp": 1.06995106, + "epoch": 0.6027318199307425, + "flos": 577173496320.0, + "grad_norm": 0.0687931043319398, + "language_loss": 0.82291925, + "learning_rate": 0.00035984041855477043, + "loss": 0.83388644, + "num_input_tokens_seen": 261106496, + "router_z_loss_mlp": 0.26794434, + "step": 3133, + "time_per_iteration": 2.777459144592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019209, + "balance_loss_mlp": 1.01186562, + "epoch": 0.6029242016160061, + "flos": 1470976754688.0, + "grad_norm": 0.01616325084905853, + "language_loss": 0.78709894, + "learning_rate": 0.00035954139418310495, + "loss": 0.79729104, + "num_input_tokens_seen": 261343248, + "router_z_loss_mlp": 0.07324219, + "step": 3134, + "time_per_iteration": 4.925475597381592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083762, + "balance_loss_mlp": 1.05736887, + "epoch": 0.6031165833012697, + "flos": 480744626688.0, + "grad_norm": 0.06318710690497562, + "language_loss": 0.79746044, + "learning_rate": 0.00035924242434230637, + "loss": 0.80829811, + "num_input_tokens_seen": 261416704, + "router_z_loss_mlp": 0.2644043, + "step": 3135, + "time_per_iteration": 2.7011537551879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085192, + "balance_loss_mlp": 1.05767858, + "epoch": 0.6033089649865333, + "flos": 499468612608.0, + "grad_norm": 0.07716145908862651, + "language_loss": 0.79063201, + "learning_rate": 0.00035894350914844516, + "loss": 0.80148399, + "num_input_tokens_seen": 261486688, + "router_z_loss_mlp": 0.27514648, + "step": 3136, + "time_per_iteration": 2.6126935482025146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088376, + "balance_loss_mlp": 1.05995679, + "epoch": 0.6035013466717969, + "flos": 556613710848.0, + "grad_norm": 0.06075860838457364, + "language_loss": 0.827613, + "learning_rate": 0.0003586446487175703, + "loss": 0.83849669, + "num_input_tokens_seen": 261557344, + "router_z_loss_mlp": 0.28417969, + "step": 3137, + "time_per_iteration": 2.675171375274658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088716, + "balance_loss_mlp": 1.06041527, + "epoch": 0.6036937283570604, + "flos": 594827421696.0, + "grad_norm": 0.0544690611172434, + "language_loss": 0.85478795, + "learning_rate": 0.0003583458431657099, + "loss": 0.86567509, + "num_input_tokens_seen": 261626240, + "router_z_loss_mlp": 0.28320312, + "step": 3138, + "time_per_iteration": 2.7620253562927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089567, + "balance_loss_mlp": 1.06336451, + "epoch": 0.603886110042324, + "flos": 540958569984.0, + "grad_norm": 0.07515995216766168, + "language_loss": 0.83139801, + "learning_rate": 0.00035804709260887056, + "loss": 0.84229362, + "num_input_tokens_seen": 261696368, + "router_z_loss_mlp": 0.26220703, + "step": 3139, + "time_per_iteration": 2.6879465579986572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087318, + "balance_loss_mlp": 1.05985248, + "epoch": 0.6040784917275875, + "flos": 518582808576.0, + "grad_norm": 0.052915045266918946, + "language_loss": 0.89835536, + "learning_rate": 0.0003577483971630373, + "loss": 0.90922856, + "num_input_tokens_seen": 261769104, + "router_z_loss_mlp": 0.27514648, + "step": 3140, + "time_per_iteration": 2.6586039066314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091399, + "balance_loss_mlp": 1.06398129, + "epoch": 0.6042708734128511, + "flos": 660751395840.0, + "grad_norm": 0.045195992370632855, + "language_loss": 0.85010505, + "learning_rate": 0.00035744975694417414, + "loss": 0.86101902, + "num_input_tokens_seen": 261844880, + "router_z_loss_mlp": 0.27416992, + "step": 3141, + "time_per_iteration": 2.8448941707611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084418, + "balance_loss_mlp": 1.05757236, + "epoch": 0.6044632550981146, + "flos": 572330520576.0, + "grad_norm": 0.07912966064455412, + "language_loss": 0.82233572, + "learning_rate": 0.00035715117206822344, + "loss": 0.83317983, + "num_input_tokens_seen": 261923280, + "router_z_loss_mlp": 0.26867676, + "step": 3142, + "time_per_iteration": 2.7542483806610107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087563, + "balance_loss_mlp": 1.06026399, + "epoch": 0.6046556367833782, + "flos": 546681083904.0, + "grad_norm": 0.0701313453845953, + "language_loss": 0.80890429, + "learning_rate": 0.0003568526426511065, + "loss": 0.81977987, + "num_input_tokens_seen": 261990832, + "router_z_loss_mlp": 0.27331543, + "step": 3143, + "time_per_iteration": 2.6046767234802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081818, + "balance_loss_mlp": 1.05658114, + "epoch": 0.6048480184686418, + "flos": 776838117888.0, + "grad_norm": 0.07379330487049819, + "language_loss": 0.83015585, + "learning_rate": 0.000356554168808722, + "loss": 0.84097409, + "num_input_tokens_seen": 262063760, + "router_z_loss_mlp": 0.25244141, + "step": 3144, + "time_per_iteration": 2.9466705322265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087357, + "balance_loss_mlp": 1.06141686, + "epoch": 0.6050404001539054, + "flos": 657144036864.0, + "grad_norm": 0.06250797721947925, + "language_loss": 0.84944713, + "learning_rate": 0.00035625575065694837, + "loss": 0.86032069, + "num_input_tokens_seen": 262137968, + "router_z_loss_mlp": 0.25952148, + "step": 3145, + "time_per_iteration": 2.9049606323242188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083147, + "balance_loss_mlp": 1.05501366, + "epoch": 0.605232781839169, + "flos": 548983816704.0, + "grad_norm": 0.05947586112106144, + "language_loss": 0.77504069, + "learning_rate": 0.0003559573883116415, + "loss": 0.78587222, + "num_input_tokens_seen": 262211264, + "router_z_loss_mlp": 0.28125, + "step": 3146, + "time_per_iteration": 2.70141339302063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095869, + "balance_loss_mlp": 1.06964314, + "epoch": 0.6054251635244324, + "flos": 605402449920.0, + "grad_norm": 0.050725714839995426, + "language_loss": 0.85750544, + "learning_rate": 0.00035565908188863604, + "loss": 0.86846411, + "num_input_tokens_seen": 262289648, + "router_z_loss_mlp": 0.26269531, + "step": 3147, + "time_per_iteration": 2.822096586227417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097988, + "balance_loss_mlp": 1.07181001, + "epoch": 0.605617545209696, + "flos": 613679887872.0, + "grad_norm": 0.06536005907217222, + "language_loss": 0.79801714, + "learning_rate": 0.00035536083150374464, + "loss": 0.80899704, + "num_input_tokens_seen": 262362704, + "router_z_loss_mlp": 0.26220703, + "step": 3148, + "time_per_iteration": 2.883934736251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027241, + "balance_loss_mlp": 1.01980209, + "epoch": 0.6058099268949596, + "flos": 1498301577216.0, + "grad_norm": 0.01728788780398527, + "language_loss": 0.74747956, + "learning_rate": 0.00035506263727275893, + "loss": 0.75775194, + "num_input_tokens_seen": 262596864, + "router_z_loss_mlp": 0.07421875, + "step": 3149, + "time_per_iteration": 4.850924015045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105096, + "balance_loss_mlp": 1.07784474, + "epoch": 0.6060023085802232, + "flos": 670476621312.0, + "grad_norm": 0.06213460160212929, + "language_loss": 0.85822916, + "learning_rate": 0.0003547644993114475, + "loss": 0.8692801, + "num_input_tokens_seen": 262671088, + "router_z_loss_mlp": 0.27246094, + "step": 3150, + "time_per_iteration": 2.8107762336730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102124, + "balance_loss_mlp": 1.0744915, + "epoch": 0.6061946902654868, + "flos": 606168562176.0, + "grad_norm": 0.06674612399311457, + "language_loss": 0.79958618, + "learning_rate": 0.00035446641773555806, + "loss": 0.81060743, + "num_input_tokens_seen": 262743888, + "router_z_loss_mlp": 0.27636719, + "step": 3151, + "time_per_iteration": 2.7216579914093018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101102, + "balance_loss_mlp": 1.07332611, + "epoch": 0.6063870719507503, + "flos": 557844185088.0, + "grad_norm": 0.052040510091589255, + "language_loss": 0.87343258, + "learning_rate": 0.000354168392660816, + "loss": 0.88444364, + "num_input_tokens_seen": 262819616, + "router_z_loss_mlp": 0.27758789, + "step": 3152, + "time_per_iteration": 2.726529836654663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091719, + "balance_loss_mlp": 1.06484938, + "epoch": 0.6065794536360138, + "flos": 557154796032.0, + "grad_norm": 0.05990276634138019, + "language_loss": 0.82825845, + "learning_rate": 0.0003538704242029252, + "loss": 0.83917564, + "num_input_tokens_seen": 262893984, + "router_z_loss_mlp": 0.26879883, + "step": 3153, + "time_per_iteration": 2.695416212081909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109717, + "balance_loss_mlp": 1.06884539, + "epoch": 0.6067718353212774, + "flos": 690144385536.0, + "grad_norm": 0.07600523103772844, + "language_loss": 0.77972901, + "learning_rate": 0.0003535725124775672, + "loss": 0.79070067, + "num_input_tokens_seen": 262969648, + "router_z_loss_mlp": 0.28320312, + "step": 3154, + "time_per_iteration": 2.8397514820098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094038, + "balance_loss_mlp": 1.0649513, + "epoch": 0.606964217006541, + "flos": 521804726784.0, + "grad_norm": 0.058609076283542554, + "language_loss": 0.86659074, + "learning_rate": 0.00035327465760040126, + "loss": 0.87753117, + "num_input_tokens_seen": 263042048, + "router_z_loss_mlp": 0.29077148, + "step": 3155, + "time_per_iteration": 2.6624228954315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083273, + "balance_loss_mlp": 1.05640316, + "epoch": 0.6071565986918045, + "flos": 641555707392.0, + "grad_norm": 0.09292554424112281, + "language_loss": 0.84951353, + "learning_rate": 0.00035297685968706526, + "loss": 0.8603462, + "num_input_tokens_seen": 263108032, + "router_z_loss_mlp": 0.26867676, + "step": 3156, + "time_per_iteration": 2.7303812503814697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084205, + "balance_loss_mlp": 1.05590463, + "epoch": 0.6073489803770681, + "flos": 560581917696.0, + "grad_norm": 0.06445223486110697, + "language_loss": 0.83064741, + "learning_rate": 0.00035267911885317454, + "loss": 0.84148943, + "num_input_tokens_seen": 263175184, + "router_z_loss_mlp": 0.28271484, + "step": 3157, + "time_per_iteration": 2.6405463218688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088001, + "balance_loss_mlp": 1.06032109, + "epoch": 0.6075413620623317, + "flos": 586088193024.0, + "grad_norm": 0.05575059306658705, + "language_loss": 0.81712598, + "learning_rate": 0.0003523814352143222, + "loss": 0.82800603, + "num_input_tokens_seen": 263252768, + "router_z_loss_mlp": 0.27709961, + "step": 3158, + "time_per_iteration": 2.830343723297119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093502, + "balance_loss_mlp": 1.06400919, + "epoch": 0.6077337437475953, + "flos": 630812551680.0, + "grad_norm": 0.06682067437398732, + "language_loss": 0.91622639, + "learning_rate": 0.00035208380888607937, + "loss": 0.9271614, + "num_input_tokens_seen": 263328720, + "router_z_loss_mlp": 0.29455566, + "step": 3159, + "time_per_iteration": 2.796640634536743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026691, + "balance_loss_mlp": 1.01944304, + "epoch": 0.6079261254328588, + "flos": 1468503696384.0, + "grad_norm": 0.020734540297120695, + "language_loss": 0.79461986, + "learning_rate": 0.000351786239983995, + "loss": 0.80488676, + "num_input_tokens_seen": 263554656, + "router_z_loss_mlp": 0.07226562, + "step": 3160, + "time_per_iteration": 4.843371391296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021138, + "balance_loss_mlp": 1.01393795, + "epoch": 0.6081185071181223, + "flos": 1523024861184.0, + "grad_norm": 0.018390389893633168, + "language_loss": 0.7569223, + "learning_rate": 0.00035148872862359517, + "loss": 0.76713371, + "num_input_tokens_seen": 263791600, + "router_z_loss_mlp": 0.07177734, + "step": 3161, + "time_per_iteration": 5.065373659133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093496, + "balance_loss_mlp": 1.06481421, + "epoch": 0.6083108888033859, + "flos": 556319674368.0, + "grad_norm": 0.06046903146728731, + "language_loss": 0.81903481, + "learning_rate": 0.00035119127492038446, + "loss": 0.82996982, + "num_input_tokens_seen": 263869744, + "router_z_loss_mlp": 0.28637695, + "step": 3162, + "time_per_iteration": 2.7967278957366943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083192, + "balance_loss_mlp": 1.0550108, + "epoch": 0.6085032704886495, + "flos": 841166000640.0, + "grad_norm": 0.05880363430465999, + "language_loss": 0.82486427, + "learning_rate": 0.00035089387898984436, + "loss": 0.83569616, + "num_input_tokens_seen": 263946624, + "router_z_loss_mlp": 0.28198242, + "step": 3163, + "time_per_iteration": 3.0665948390960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089596, + "balance_loss_mlp": 1.06079483, + "epoch": 0.6086956521739131, + "flos": 684792631296.0, + "grad_norm": 0.064612412597244, + "language_loss": 0.8164137, + "learning_rate": 0.0003505965409474343, + "loss": 0.82730967, + "num_input_tokens_seen": 264022064, + "router_z_loss_mlp": 0.28808594, + "step": 3164, + "time_per_iteration": 2.9265527725219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078635, + "balance_loss_mlp": 1.05164599, + "epoch": 0.6088880338591766, + "flos": 535799536128.0, + "grad_norm": 0.0535577439830692, + "language_loss": 0.86276996, + "learning_rate": 0.0003502992609085913, + "loss": 0.87355632, + "num_input_tokens_seen": 264089520, + "router_z_loss_mlp": 0.27001953, + "step": 3165, + "time_per_iteration": 2.6794493198394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082317, + "balance_loss_mlp": 1.05463672, + "epoch": 0.6090804155444401, + "flos": 731533026816.0, + "grad_norm": 0.05150827346349905, + "language_loss": 0.82492924, + "learning_rate": 0.00035000203898872954, + "loss": 0.83575243, + "num_input_tokens_seen": 264173056, + "router_z_loss_mlp": 0.27734375, + "step": 3166, + "time_per_iteration": 2.9775314331054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081161, + "balance_loss_mlp": 1.0533855, + "epoch": 0.6092727972297037, + "flos": 699014665728.0, + "grad_norm": 0.0631204311292361, + "language_loss": 0.84789312, + "learning_rate": 0.0003497048753032406, + "loss": 0.85870469, + "num_input_tokens_seen": 264250912, + "router_z_loss_mlp": 0.27783203, + "step": 3167, + "time_per_iteration": 2.8659260272979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082152, + "balance_loss_mlp": 1.05567539, + "epoch": 0.6094651789149673, + "flos": 1051946735616.0, + "grad_norm": 0.05504676322369481, + "language_loss": 0.80827415, + "learning_rate": 0.000349407769967494, + "loss": 0.81909573, + "num_input_tokens_seen": 264342800, + "router_z_loss_mlp": 0.26525879, + "step": 3168, + "time_per_iteration": 3.3787014484405518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081057, + "balance_loss_mlp": 1.05447292, + "epoch": 0.6096575606002309, + "flos": 503085883392.0, + "grad_norm": 0.05919699008213893, + "language_loss": 0.84490019, + "learning_rate": 0.0003491107230968361, + "loss": 0.85571074, + "num_input_tokens_seen": 264413664, + "router_z_loss_mlp": 0.26611328, + "step": 3169, + "time_per_iteration": 2.6599555015563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078954, + "balance_loss_mlp": 1.05251288, + "epoch": 0.6098499422854944, + "flos": 585643281408.0, + "grad_norm": 0.05367575554300243, + "language_loss": 0.81929743, + "learning_rate": 0.00034881373480659085, + "loss": 0.83008707, + "num_input_tokens_seen": 264494944, + "router_z_loss_mlp": 0.26489258, + "step": 3170, + "time_per_iteration": 2.828599214553833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089157, + "balance_loss_mlp": 1.06092811, + "epoch": 0.610042323970758, + "flos": 469205996544.0, + "grad_norm": 0.07372507054287164, + "language_loss": 0.77562344, + "learning_rate": 0.0003485168052120594, + "loss": 0.78651506, + "num_input_tokens_seen": 264561664, + "router_z_loss_mlp": 0.28198242, + "step": 3171, + "time_per_iteration": 2.55070161819458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092713, + "balance_loss_mlp": 1.06579578, + "epoch": 0.6102347056560216, + "flos": 514177403904.0, + "grad_norm": 0.06238864549227849, + "language_loss": 0.80073476, + "learning_rate": 0.00034821993442851973, + "loss": 0.81166196, + "num_input_tokens_seen": 264626256, + "router_z_loss_mlp": 0.26940918, + "step": 3172, + "time_per_iteration": 2.585115909576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092343, + "balance_loss_mlp": 1.06593776, + "epoch": 0.6104270873412851, + "flos": 469013276160.0, + "grad_norm": 0.07089619767063425, + "language_loss": 0.82434714, + "learning_rate": 0.00034792312257122735, + "loss": 0.83527064, + "num_input_tokens_seen": 264692768, + "router_z_loss_mlp": 0.26428223, + "step": 3173, + "time_per_iteration": 2.6483352184295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109217, + "balance_loss_mlp": 1.06535971, + "epoch": 0.6106194690265486, + "flos": 549875837952.0, + "grad_norm": 0.06191738998776062, + "language_loss": 0.8083055, + "learning_rate": 0.00034762636975541506, + "loss": 0.81922722, + "num_input_tokens_seen": 264764816, + "router_z_loss_mlp": 0.26843262, + "step": 3174, + "time_per_iteration": 2.661529779434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097622, + "balance_loss_mlp": 1.07096648, + "epoch": 0.6108118507118122, + "flos": 472857772032.0, + "grad_norm": 0.07934443203127389, + "language_loss": 0.81213707, + "learning_rate": 0.0003473296760962923, + "loss": 0.82311332, + "num_input_tokens_seen": 264837968, + "router_z_loss_mlp": 0.2668457, + "step": 3175, + "time_per_iteration": 2.730571746826172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105783, + "balance_loss_mlp": 1.05005765, + "epoch": 0.6110042323970758, + "flos": 1445166904320.0, + "grad_norm": 0.03785121855584389, + "language_loss": 0.78533739, + "learning_rate": 0.00034703304170904617, + "loss": 0.79591566, + "num_input_tokens_seen": 265058336, + "router_z_loss_mlp": 0.07763672, + "step": 3176, + "time_per_iteration": 4.720510959625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094316, + "balance_loss_mlp": 1.06782722, + "epoch": 0.6111966140823394, + "flos": 794153590272.0, + "grad_norm": 0.05949259191251309, + "language_loss": 0.81672812, + "learning_rate": 0.00034673646670883976, + "loss": 0.82767129, + "num_input_tokens_seen": 265135920, + "router_z_loss_mlp": 0.26538086, + "step": 3177, + "time_per_iteration": 3.025146722793579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035513, + "balance_loss_mlp": 1.02812171, + "epoch": 0.611388995767603, + "flos": 1557650663424.0, + "grad_norm": 0.027049018431207196, + "language_loss": 0.75715023, + "learning_rate": 0.0003464399512108141, + "loss": 0.76750535, + "num_input_tokens_seen": 265374464, + "router_z_loss_mlp": 0.07373047, + "step": 3178, + "time_per_iteration": 5.000125885009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085963, + "balance_loss_mlp": 1.05978417, + "epoch": 0.6115813774528664, + "flos": 712169210880.0, + "grad_norm": 0.07013069416081287, + "language_loss": 0.81980824, + "learning_rate": 0.0003461434953300865, + "loss": 0.83066785, + "num_input_tokens_seen": 265450112, + "router_z_loss_mlp": 0.26220703, + "step": 3179, + "time_per_iteration": 2.922963857650757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081362, + "balance_loss_mlp": 1.05501699, + "epoch": 0.61177375913813, + "flos": 684308072448.0, + "grad_norm": 0.06339313471396843, + "language_loss": 0.81228697, + "learning_rate": 0.0003458470991817515, + "loss": 0.82310063, + "num_input_tokens_seen": 265534336, + "router_z_loss_mlp": 0.2635498, + "step": 3180, + "time_per_iteration": 2.9837453365325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088437, + "balance_loss_mlp": 1.06111443, + "epoch": 0.6119661408233936, + "flos": 511662127104.0, + "grad_norm": 0.05673755911203457, + "language_loss": 0.84994721, + "learning_rate": 0.0003455507628808802, + "loss": 0.86083156, + "num_input_tokens_seen": 265604480, + "router_z_loss_mlp": 0.27319336, + "step": 3181, + "time_per_iteration": 2.6381750106811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088853, + "balance_loss_mlp": 1.06133974, + "epoch": 0.6121585225086572, + "flos": 556809002496.0, + "grad_norm": 0.08338525943087875, + "language_loss": 0.85169065, + "learning_rate": 0.00034525448654252076, + "loss": 0.86257923, + "num_input_tokens_seen": 265670848, + "router_z_loss_mlp": 0.27539062, + "step": 3182, + "time_per_iteration": 2.6688461303710938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089639, + "balance_loss_mlp": 1.06263769, + "epoch": 0.6123509041939207, + "flos": 561849467904.0, + "grad_norm": 0.09017686395034887, + "language_loss": 0.83182716, + "learning_rate": 0.0003449582702816976, + "loss": 0.84272361, + "num_input_tokens_seen": 265739584, + "router_z_loss_mlp": 0.2701416, + "step": 3183, + "time_per_iteration": 2.6863620281219482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091522, + "balance_loss_mlp": 1.06479537, + "epoch": 0.6125432858791843, + "flos": 558056729088.0, + "grad_norm": 0.0548908554977987, + "language_loss": 0.82581168, + "learning_rate": 0.0003446621142134122, + "loss": 0.8367269, + "num_input_tokens_seen": 265810368, + "router_z_loss_mlp": 0.26757812, + "step": 3184, + "time_per_iteration": 2.673337459564209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093536, + "balance_loss_mlp": 1.06649971, + "epoch": 0.6127356675644479, + "flos": 415015944192.0, + "grad_norm": 0.06227229540090399, + "language_loss": 0.84346098, + "learning_rate": 0.0003443660184526424, + "loss": 0.85439634, + "num_input_tokens_seen": 265871616, + "router_z_loss_mlp": 0.27050781, + "step": 3185, + "time_per_iteration": 2.4706175327301025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092866, + "balance_loss_mlp": 1.06506586, + "epoch": 0.6129280492497114, + "flos": 603843434496.0, + "grad_norm": 0.05120570826610392, + "language_loss": 0.86619818, + "learning_rate": 0.0003440699831143429, + "loss": 0.87712687, + "num_input_tokens_seen": 265946672, + "router_z_loss_mlp": 0.27832031, + "step": 3186, + "time_per_iteration": 2.778033971786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095882, + "balance_loss_mlp": 1.06884551, + "epoch": 0.613120430934975, + "flos": 519766295040.0, + "grad_norm": 0.05392794478467523, + "language_loss": 0.82370943, + "learning_rate": 0.0003437740083134449, + "loss": 0.83466822, + "num_input_tokens_seen": 266020640, + "router_z_loss_mlp": 0.27050781, + "step": 3187, + "time_per_iteration": 2.6768150329589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091695, + "balance_loss_mlp": 1.06453919, + "epoch": 0.6133128126202385, + "flos": 511083965952.0, + "grad_norm": 0.07437759552236513, + "language_loss": 0.8353374, + "learning_rate": 0.00034347809416485574, + "loss": 0.84625435, + "num_input_tokens_seen": 266085776, + "router_z_loss_mlp": 0.27197266, + "step": 3188, + "time_per_iteration": 2.6008822917938232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085287, + "balance_loss_mlp": 1.05835748, + "epoch": 0.6135051943055021, + "flos": 607562021376.0, + "grad_norm": 0.053009634337547046, + "language_loss": 0.81880438, + "learning_rate": 0.0003431822407834597, + "loss": 0.82965726, + "num_input_tokens_seen": 266157104, + "router_z_loss_mlp": 0.26940918, + "step": 3189, + "time_per_iteration": 2.8121964931488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090882, + "balance_loss_mlp": 1.06315422, + "epoch": 0.6136975759907657, + "flos": 1160200931328.0, + "grad_norm": 0.06178667045305147, + "language_loss": 0.84739751, + "learning_rate": 0.00034288644828411706, + "loss": 0.85830629, + "num_input_tokens_seen": 266244144, + "router_z_loss_mlp": 0.27758789, + "step": 3190, + "time_per_iteration": 3.4740066528320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087763, + "balance_loss_mlp": 1.06052327, + "epoch": 0.6138899576760293, + "flos": 706938596352.0, + "grad_norm": 0.08089532706522883, + "language_loss": 0.75991279, + "learning_rate": 0.0003425907167816649, + "loss": 0.77079034, + "num_input_tokens_seen": 266319040, + "router_z_loss_mlp": 0.27258301, + "step": 3191, + "time_per_iteration": 2.8420307636260986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084533, + "balance_loss_mlp": 1.05866492, + "epoch": 0.6140823393612928, + "flos": 586443898368.0, + "grad_norm": 0.06652830958672214, + "language_loss": 0.84765488, + "learning_rate": 0.00034229504639091623, + "loss": 0.85850024, + "num_input_tokens_seen": 266390784, + "router_z_loss_mlp": 0.2590332, + "step": 3192, + "time_per_iteration": 2.805717945098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079151, + "balance_loss_mlp": 1.05240059, + "epoch": 0.6142747210465563, + "flos": 804130633728.0, + "grad_norm": 0.06825133592780937, + "language_loss": 0.80015457, + "learning_rate": 0.0003419994372266606, + "loss": 0.81094611, + "num_input_tokens_seen": 266483216, + "router_z_loss_mlp": 0.26782227, + "step": 3193, + "time_per_iteration": 3.0882303714752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084925, + "balance_loss_mlp": 1.05800796, + "epoch": 0.6144671027318199, + "flos": 529434620928.0, + "grad_norm": 0.061422659425354964, + "language_loss": 0.82002676, + "learning_rate": 0.00034170388940366335, + "loss": 0.83087599, + "num_input_tokens_seen": 266557344, + "router_z_loss_mlp": 0.26953125, + "step": 3194, + "time_per_iteration": 2.68253755569458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085529, + "balance_loss_mlp": 1.0581584, + "epoch": 0.6146594844170835, + "flos": 805425348096.0, + "grad_norm": 0.0574427380686639, + "language_loss": 0.801368, + "learning_rate": 0.0003414084030366667, + "loss": 0.81222332, + "num_input_tokens_seen": 266639488, + "router_z_loss_mlp": 0.27380371, + "step": 3195, + "time_per_iteration": 3.079050302505493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081131, + "balance_loss_mlp": 1.05421329, + "epoch": 0.6148518661023471, + "flos": 501697193472.0, + "grad_norm": 0.05079595978056556, + "language_loss": 0.83029908, + "learning_rate": 0.0003411129782403883, + "loss": 0.84111041, + "num_input_tokens_seen": 266711168, + "router_z_loss_mlp": 0.26953125, + "step": 3196, + "time_per_iteration": 2.632840871810913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086159, + "balance_loss_mlp": 1.05881214, + "epoch": 0.6150442477876106, + "flos": 510688613376.0, + "grad_norm": 0.06298738141067967, + "language_loss": 0.85384542, + "learning_rate": 0.0003408176151295225, + "loss": 0.86470699, + "num_input_tokens_seen": 266777632, + "router_z_loss_mlp": 0.27392578, + "step": 3197, + "time_per_iteration": 2.5977203845977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079258, + "balance_loss_mlp": 1.05186343, + "epoch": 0.6152366294728742, + "flos": 527005979136.0, + "grad_norm": 0.07239010053944613, + "language_loss": 0.77357507, + "learning_rate": 0.00034052231381873944, + "loss": 0.78436762, + "num_input_tokens_seen": 266842880, + "router_z_loss_mlp": 0.2746582, + "step": 3198, + "time_per_iteration": 2.604996919631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078982, + "balance_loss_mlp": 1.05149233, + "epoch": 0.6154290111581378, + "flos": 473300112384.0, + "grad_norm": 0.060831146063345755, + "language_loss": 0.85285568, + "learning_rate": 0.00034022707442268494, + "loss": 0.86364555, + "num_input_tokens_seen": 266909504, + "router_z_loss_mlp": 0.27514648, + "step": 3199, + "time_per_iteration": 2.6032421588897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079605, + "balance_loss_mlp": 1.05297375, + "epoch": 0.6156213928434013, + "flos": 550819616256.0, + "grad_norm": 0.04692312170218308, + "language_loss": 0.82051641, + "learning_rate": 0.0003399318970559813, + "loss": 0.83131248, + "num_input_tokens_seen": 266988880, + "router_z_loss_mlp": 0.26660156, + "step": 3200, + "time_per_iteration": 2.8085906505584717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074239, + "balance_loss_mlp": 1.0479418, + "epoch": 0.6158137745286649, + "flos": 750941259264.0, + "grad_norm": 0.057177124175777416, + "language_loss": 0.8485775, + "learning_rate": 0.00033963678183322656, + "loss": 0.85931993, + "num_input_tokens_seen": 267074512, + "router_z_loss_mlp": 0.26330566, + "step": 3201, + "time_per_iteration": 3.032761335372925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084727, + "balance_loss_mlp": 1.05809593, + "epoch": 0.6160061562139284, + "flos": 555815665152.0, + "grad_norm": 0.053866229864627496, + "language_loss": 0.82829523, + "learning_rate": 0.0003393417288689945, + "loss": 0.8391425, + "num_input_tokens_seen": 267147952, + "router_z_loss_mlp": 0.26623535, + "step": 3202, + "time_per_iteration": 2.6627390384674072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084716, + "balance_loss_mlp": 1.05858481, + "epoch": 0.616198537899192, + "flos": 742177437696.0, + "grad_norm": 0.08436696100910436, + "language_loss": 0.76289904, + "learning_rate": 0.00033904673827783504, + "loss": 0.77374619, + "num_input_tokens_seen": 267224368, + "router_z_loss_mlp": 0.26171875, + "step": 3203, + "time_per_iteration": 2.914370059967041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082648, + "balance_loss_mlp": 1.05615926, + "epoch": 0.6163909195844556, + "flos": 478810082304.0, + "grad_norm": 0.06562773431598554, + "language_loss": 0.81864727, + "learning_rate": 0.00033875181017427357, + "loss": 0.82947373, + "num_input_tokens_seen": 267292688, + "router_z_loss_mlp": 0.26501465, + "step": 3204, + "time_per_iteration": 2.5992236137390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078319, + "balance_loss_mlp": 1.05155659, + "epoch": 0.6165833012697192, + "flos": 531517469184.0, + "grad_norm": 0.05911238695185789, + "language_loss": 0.8101759, + "learning_rate": 0.00033845694467281133, + "loss": 0.82095909, + "num_input_tokens_seen": 267371888, + "router_z_loss_mlp": 0.26782227, + "step": 3205, + "time_per_iteration": 2.857226848602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079906, + "balance_loss_mlp": 1.05366778, + "epoch": 0.6167756829549826, + "flos": 807765156864.0, + "grad_norm": 0.056333384320929165, + "language_loss": 0.83590877, + "learning_rate": 0.00033816214188792516, + "loss": 0.84670782, + "num_input_tokens_seen": 267458784, + "router_z_loss_mlp": 0.26281738, + "step": 3206, + "time_per_iteration": 3.133683443069458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108227, + "balance_loss_mlp": 1.05523372, + "epoch": 0.6169680646402462, + "flos": 488928089088.0, + "grad_norm": 0.06870835299002895, + "language_loss": 0.85362953, + "learning_rate": 0.00033786740193406784, + "loss": 0.86445218, + "num_input_tokens_seen": 267528528, + "router_z_loss_mlp": 0.27050781, + "step": 3207, + "time_per_iteration": 2.5766866207122803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083932, + "balance_loss_mlp": 1.05775416, + "epoch": 0.6171604463255098, + "flos": 618954918912.0, + "grad_norm": 0.16525433487855157, + "language_loss": 0.81557286, + "learning_rate": 0.00033757272492566736, + "loss": 0.82641208, + "num_input_tokens_seen": 267611152, + "router_z_loss_mlp": 0.26184082, + "step": 3208, + "time_per_iteration": 2.8717997074127197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081434, + "balance_loss_mlp": 1.05363393, + "epoch": 0.6173528280107734, + "flos": 528859031040.0, + "grad_norm": 0.050446978523455026, + "language_loss": 0.8752228, + "learning_rate": 0.0003372781109771278, + "loss": 0.88603711, + "num_input_tokens_seen": 267681520, + "router_z_loss_mlp": 0.27832031, + "step": 3209, + "time_per_iteration": 2.740673303604126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087371, + "balance_loss_mlp": 1.05973852, + "epoch": 0.617545209696037, + "flos": 596581728768.0, + "grad_norm": 0.060596341147957054, + "language_loss": 0.76554525, + "learning_rate": 0.0003369835602028281, + "loss": 0.77641892, + "num_input_tokens_seen": 267758768, + "router_z_loss_mlp": 0.27661133, + "step": 3210, + "time_per_iteration": 2.813253164291382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078855, + "balance_loss_mlp": 1.05222404, + "epoch": 0.6177375913813005, + "flos": 475098835968.0, + "grad_norm": 0.060877692494739295, + "language_loss": 0.7966795, + "learning_rate": 0.0003366890727171232, + "loss": 0.80746806, + "num_input_tokens_seen": 267831056, + "router_z_loss_mlp": 0.26647949, + "step": 3211, + "time_per_iteration": 2.7572054862976074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083411, + "balance_loss_mlp": 1.05717349, + "epoch": 0.617929973066564, + "flos": 529812721152.0, + "grad_norm": 0.07113437774281188, + "language_loss": 0.78650188, + "learning_rate": 0.00033639464863434313, + "loss": 0.79733604, + "num_input_tokens_seen": 267898416, + "router_z_loss_mlp": 0.26257324, + "step": 3212, + "time_per_iteration": 2.616605520248413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035231, + "balance_loss_mlp": 1.0275538, + "epoch": 0.6181223547518276, + "flos": 1420053783552.0, + "grad_norm": 0.020977694975075144, + "language_loss": 0.78442466, + "learning_rate": 0.00033610028806879363, + "loss": 0.79477704, + "num_input_tokens_seen": 268112864, + "router_z_loss_mlp": 0.07666016, + "step": 3213, + "time_per_iteration": 4.706260919570923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077401, + "balance_loss_mlp": 1.05035281, + "epoch": 0.6183147364370912, + "flos": 740319243264.0, + "grad_norm": 0.055780003903401355, + "language_loss": 0.79908657, + "learning_rate": 0.00033580599113475543, + "loss": 0.80986065, + "num_input_tokens_seen": 268198368, + "router_z_loss_mlp": 0.27087402, + "step": 3214, + "time_per_iteration": 2.976040840148926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068298, + "balance_loss_mlp": 1.04207242, + "epoch": 0.6185071181223547, + "flos": 381649978368.0, + "grad_norm": 0.06538485262612419, + "language_loss": 0.86450571, + "learning_rate": 0.00033551175794648507, + "loss": 0.87518871, + "num_input_tokens_seen": 268260704, + "router_z_loss_mlp": 0.2623291, + "step": 3215, + "time_per_iteration": 2.5857200622558594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074795, + "balance_loss_mlp": 1.0478301, + "epoch": 0.6186994998076183, + "flos": 463347661824.0, + "grad_norm": 0.05115792818317019, + "language_loss": 0.81974953, + "learning_rate": 0.00033521758861821365, + "loss": 0.8304975, + "num_input_tokens_seen": 268328256, + "router_z_loss_mlp": 0.27001953, + "step": 3216, + "time_per_iteration": 2.6541965007781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070889, + "balance_loss_mlp": 1.04368544, + "epoch": 0.6188918814928819, + "flos": 485273742336.0, + "grad_norm": 0.053233870679950265, + "language_loss": 0.89132476, + "learning_rate": 0.0003349234832641479, + "loss": 0.90203357, + "num_input_tokens_seen": 268394016, + "router_z_loss_mlp": 0.27246094, + "step": 3217, + "time_per_iteration": 2.5898375511169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072387, + "balance_loss_mlp": 1.04567194, + "epoch": 0.6190842631781455, + "flos": 657307021824.0, + "grad_norm": 0.06188675281587152, + "language_loss": 0.81109393, + "learning_rate": 0.00033462944199846975, + "loss": 0.82181776, + "num_input_tokens_seen": 268478512, + "router_z_loss_mlp": 0.26721191, + "step": 3218, + "time_per_iteration": 3.049302101135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068592, + "balance_loss_mlp": 1.04186571, + "epoch": 0.619276644863409, + "flos": 403603223040.0, + "grad_norm": 0.07980114958498462, + "language_loss": 0.86682892, + "learning_rate": 0.00033433546493533606, + "loss": 0.87751484, + "num_input_tokens_seen": 268540304, + "router_z_loss_mlp": 0.26757812, + "step": 3219, + "time_per_iteration": 2.4988718032836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072803, + "balance_loss_mlp": 1.04562318, + "epoch": 0.6194690265486725, + "flos": 583093499904.0, + "grad_norm": 0.06437622918216304, + "language_loss": 0.84503907, + "learning_rate": 0.00033404155218887897, + "loss": 0.85576707, + "num_input_tokens_seen": 268611136, + "router_z_loss_mlp": 0.27246094, + "step": 3220, + "time_per_iteration": 2.755687952041626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069297, + "balance_loss_mlp": 1.04323733, + "epoch": 0.6196614082339361, + "flos": 504246974976.0, + "grad_norm": 0.054816937967161604, + "language_loss": 0.87677366, + "learning_rate": 0.00033374770387320534, + "loss": 0.88746661, + "num_input_tokens_seen": 268684992, + "router_z_loss_mlp": 0.26074219, + "step": 3221, + "time_per_iteration": 2.806687831878662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073987, + "balance_loss_mlp": 1.0476656, + "epoch": 0.6198537899191997, + "flos": 575409277440.0, + "grad_norm": 0.05319951203525016, + "language_loss": 0.85096419, + "learning_rate": 0.00033345392010239737, + "loss": 0.86170411, + "num_input_tokens_seen": 268758096, + "router_z_loss_mlp": 0.2635498, + "step": 3222, + "time_per_iteration": 2.726924419403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078737, + "balance_loss_mlp": 1.05248737, + "epoch": 0.6200461716044633, + "flos": 593157178368.0, + "grad_norm": 0.06204822794999188, + "language_loss": 0.82752097, + "learning_rate": 0.0003331602009905118, + "loss": 0.83830827, + "num_input_tokens_seen": 268834432, + "router_z_loss_mlp": 0.26245117, + "step": 3223, + "time_per_iteration": 2.8067080974578857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074324, + "balance_loss_mlp": 1.04770494, + "epoch": 0.6202385532897268, + "flos": 666093238272.0, + "grad_norm": 0.06248384558092708, + "language_loss": 0.83894855, + "learning_rate": 0.00033286654665158085, + "loss": 0.84969175, + "num_input_tokens_seen": 268921168, + "router_z_loss_mlp": 0.26635742, + "step": 3224, + "time_per_iteration": 2.973839044570923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071798, + "balance_loss_mlp": 1.04578674, + "epoch": 0.6204309349749904, + "flos": 484952541696.0, + "grad_norm": 0.058715923927156195, + "language_loss": 0.87385452, + "learning_rate": 0.0003325729571996109, + "loss": 0.88457251, + "num_input_tokens_seen": 268991440, + "router_z_loss_mlp": 0.26037598, + "step": 3225, + "time_per_iteration": 2.6299448013305664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079949, + "balance_loss_mlp": 1.05295992, + "epoch": 0.6206233166602539, + "flos": 584057101824.0, + "grad_norm": 0.05622680554800681, + "language_loss": 0.84078681, + "learning_rate": 0.000332279432748584, + "loss": 0.85158628, + "num_input_tokens_seen": 269061024, + "router_z_loss_mlp": 0.27001953, + "step": 3226, + "time_per_iteration": 2.713651180267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079454, + "balance_loss_mlp": 1.05334759, + "epoch": 0.6208156983455175, + "flos": 476917383168.0, + "grad_norm": 0.05334260963219639, + "language_loss": 0.8767364, + "learning_rate": 0.00033198597341245576, + "loss": 0.88753092, + "num_input_tokens_seen": 269130560, + "router_z_loss_mlp": 0.26147461, + "step": 3227, + "time_per_iteration": 2.5617635250091553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078643, + "balance_loss_mlp": 1.05208337, + "epoch": 0.6210080800307811, + "flos": 789066137088.0, + "grad_norm": 0.05016111197588362, + "language_loss": 0.82129073, + "learning_rate": 0.00033169257930515763, + "loss": 0.83207709, + "num_input_tokens_seen": 269213280, + "router_z_loss_mlp": 0.26611328, + "step": 3228, + "time_per_iteration": 3.025502920150757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080729, + "balance_loss_mlp": 1.0543834, + "epoch": 0.6212004617160446, + "flos": 607794388992.0, + "grad_norm": 0.08161989388785439, + "language_loss": 0.82274306, + "learning_rate": 0.0003313992505405951, + "loss": 0.83355033, + "num_input_tokens_seen": 269286384, + "router_z_loss_mlp": 0.26367188, + "step": 3229, + "time_per_iteration": 2.705948829650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083196, + "balance_loss_mlp": 1.05582547, + "epoch": 0.6213928434013082, + "flos": 586520621568.0, + "grad_norm": 0.06417417083544033, + "language_loss": 0.81270546, + "learning_rate": 0.0003311059872326487, + "loss": 0.82353741, + "num_input_tokens_seen": 269353296, + "router_z_loss_mlp": 0.27368164, + "step": 3230, + "time_per_iteration": 2.6827783584594727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080325, + "balance_loss_mlp": 1.05426574, + "epoch": 0.6215852250865718, + "flos": 536076320256.0, + "grad_norm": 0.060558133954529886, + "language_loss": 0.79513329, + "learning_rate": 0.0003308127894951734, + "loss": 0.80593657, + "num_input_tokens_seen": 269422304, + "router_z_loss_mlp": 0.26074219, + "step": 3231, + "time_per_iteration": 2.621156692504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079972, + "balance_loss_mlp": 1.05295873, + "epoch": 0.6217776067718354, + "flos": 618169356288.0, + "grad_norm": 0.05872122895707264, + "language_loss": 0.86601388, + "learning_rate": 0.00033051965744199834, + "loss": 0.87681365, + "num_input_tokens_seen": 269498784, + "router_z_loss_mlp": 0.27075195, + "step": 3232, + "time_per_iteration": 2.7616896629333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089938, + "balance_loss_mlp": 1.06414127, + "epoch": 0.6219699884570988, + "flos": 545875324416.0, + "grad_norm": 0.05765951293021458, + "language_loss": 0.90613365, + "learning_rate": 0.0003302265911869276, + "loss": 0.91703308, + "num_input_tokens_seen": 269581264, + "router_z_loss_mlp": 0.25830078, + "step": 3233, + "time_per_iteration": 2.911309242248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107614, + "balance_loss_mlp": 1.04950833, + "epoch": 0.6221623701423624, + "flos": 481149891072.0, + "grad_norm": 0.0568406918617455, + "language_loss": 0.84234416, + "learning_rate": 0.0003299335908437397, + "loss": 0.8531056, + "num_input_tokens_seen": 269649408, + "router_z_loss_mlp": 0.26660156, + "step": 3234, + "time_per_iteration": 2.5690464973449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083439, + "balance_loss_mlp": 1.05614042, + "epoch": 0.622354751827626, + "flos": 380024151552.0, + "grad_norm": 0.08458123774573062, + "language_loss": 0.79892743, + "learning_rate": 0.0003296406565261873, + "loss": 0.80976182, + "num_input_tokens_seen": 269711648, + "router_z_loss_mlp": 0.27294922, + "step": 3235, + "time_per_iteration": 2.519242763519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082513, + "balance_loss_mlp": 1.05619192, + "epoch": 0.6225471335128896, + "flos": 667869940224.0, + "grad_norm": 0.04986850206195379, + "language_loss": 0.85312378, + "learning_rate": 0.0003293477883479978, + "loss": 0.86394894, + "num_input_tokens_seen": 269787376, + "router_z_loss_mlp": 0.26367188, + "step": 3236, + "time_per_iteration": 2.8095037937164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083699, + "balance_loss_mlp": 1.05704379, + "epoch": 0.6227395151981532, + "flos": 771320807424.0, + "grad_norm": 0.105420899843356, + "language_loss": 0.79857445, + "learning_rate": 0.0003290549864228727, + "loss": 0.80941153, + "num_input_tokens_seen": 269863008, + "router_z_loss_mlp": 0.2668457, + "step": 3237, + "time_per_iteration": 2.9599437713623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092317, + "balance_loss_mlp": 1.0648514, + "epoch": 0.6229318968834167, + "flos": 484354556928.0, + "grad_norm": 0.05485346042827634, + "language_loss": 0.86677277, + "learning_rate": 0.0003287622508644875, + "loss": 0.87769592, + "num_input_tokens_seen": 269939552, + "router_z_loss_mlp": 0.27514648, + "step": 3238, + "time_per_iteration": 2.7735140323638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108562, + "balance_loss_mlp": 1.05971575, + "epoch": 0.6231242785686802, + "flos": 462935056896.0, + "grad_norm": 0.06697855581394702, + "language_loss": 0.86312807, + "learning_rate": 0.0003284695817864923, + "loss": 0.87398434, + "num_input_tokens_seen": 270002752, + "router_z_loss_mlp": 0.25939941, + "step": 3239, + "time_per_iteration": 2.5213680267333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086258, + "balance_loss_mlp": 1.05822039, + "epoch": 0.6233166602539438, + "flos": 609089103360.0, + "grad_norm": 0.0670229685198235, + "language_loss": 0.84466362, + "learning_rate": 0.0003281769793025116, + "loss": 0.85552621, + "num_input_tokens_seen": 270075696, + "router_z_loss_mlp": 0.28051758, + "step": 3240, + "time_per_iteration": 2.7121944427490234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084136, + "balance_loss_mlp": 1.05725467, + "epoch": 0.6235090419392074, + "flos": 439200340992.0, + "grad_norm": 0.0702959592195009, + "language_loss": 0.89746368, + "learning_rate": 0.00032788444352614346, + "loss": 0.90830505, + "num_input_tokens_seen": 270139872, + "router_z_loss_mlp": 0.2689209, + "step": 3241, + "time_per_iteration": 2.5015249252319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082571, + "balance_loss_mlp": 1.05672646, + "epoch": 0.6237014236244709, + "flos": 504904430592.0, + "grad_norm": 0.06846716492041297, + "language_loss": 0.80880868, + "learning_rate": 0.0003275919745709606, + "loss": 0.81963438, + "num_input_tokens_seen": 270206752, + "router_z_loss_mlp": 0.25793457, + "step": 3242, + "time_per_iteration": 2.5576865673065186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108516, + "balance_loss_mlp": 1.05925632, + "epoch": 0.6238938053097345, + "flos": 512917194240.0, + "grad_norm": 0.07943089105939449, + "language_loss": 0.82206035, + "learning_rate": 0.00032729957255050936, + "loss": 0.83291197, + "num_input_tokens_seen": 270275472, + "router_z_loss_mlp": 0.25939941, + "step": 3243, + "time_per_iteration": 2.6432876586914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088614, + "balance_loss_mlp": 1.06160164, + "epoch": 0.6240861869949981, + "flos": 736751531520.0, + "grad_norm": 0.05697537119999913, + "language_loss": 0.81798017, + "learning_rate": 0.0003270072375783102, + "loss": 0.82886636, + "num_input_tokens_seen": 270348336, + "router_z_loss_mlp": 0.2701416, + "step": 3244, + "time_per_iteration": 2.8988003730773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087163, + "balance_loss_mlp": 1.06048417, + "epoch": 0.6242785686802617, + "flos": 494712271872.0, + "grad_norm": 0.06396151885165319, + "language_loss": 0.79621661, + "learning_rate": 0.00032671496976785774, + "loss": 0.80708826, + "num_input_tokens_seen": 270416496, + "router_z_loss_mlp": 0.2668457, + "step": 3245, + "time_per_iteration": 2.619020938873291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079657, + "balance_loss_mlp": 1.054075, + "epoch": 0.6244709503655252, + "flos": 745846465536.0, + "grad_norm": 0.06315966353761295, + "language_loss": 0.75718981, + "learning_rate": 0.0003264227692326205, + "loss": 0.76798642, + "num_input_tokens_seen": 270501680, + "router_z_loss_mlp": 0.25610352, + "step": 3246, + "time_per_iteration": 3.0977470874786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092736, + "balance_loss_mlp": 1.0656991, + "epoch": 0.6246633320507887, + "flos": 492602259456.0, + "grad_norm": 0.05900529395790117, + "language_loss": 0.86342973, + "learning_rate": 0.00032613063608604055, + "loss": 0.8743571, + "num_input_tokens_seen": 270568656, + "router_z_loss_mlp": 0.27075195, + "step": 3247, + "time_per_iteration": 2.535694122314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088316, + "balance_loss_mlp": 1.06239939, + "epoch": 0.6248557137360523, + "flos": 517391981568.0, + "grad_norm": 0.06304682930858534, + "language_loss": 0.83798397, + "learning_rate": 0.0003258385704415343, + "loss": 0.84886706, + "num_input_tokens_seen": 270636160, + "router_z_loss_mlp": 0.25952148, + "step": 3248, + "time_per_iteration": 2.5745623111724854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108485, + "balance_loss_mlp": 1.05835032, + "epoch": 0.6250480954213159, + "flos": 519363601920.0, + "grad_norm": 0.05590667245526839, + "language_loss": 0.83388865, + "learning_rate": 0.0003255465724124915, + "loss": 0.84473717, + "num_input_tokens_seen": 270708816, + "router_z_loss_mlp": 0.26550293, + "step": 3249, + "time_per_iteration": 2.6889073848724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088994, + "balance_loss_mlp": 1.06236219, + "epoch": 0.6252404771065795, + "flos": 516060191232.0, + "grad_norm": 0.05421846052548684, + "language_loss": 0.83201844, + "learning_rate": 0.00032525464211227587, + "loss": 0.84290838, + "num_input_tokens_seen": 270778016, + "router_z_loss_mlp": 0.2668457, + "step": 3250, + "time_per_iteration": 2.579265594482422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089648, + "balance_loss_mlp": 1.0646019, + "epoch": 0.6254328587918431, + "flos": 576916535808.0, + "grad_norm": 0.05949618394649944, + "language_loss": 0.85687059, + "learning_rate": 0.0003249627796542249, + "loss": 0.8677671, + "num_input_tokens_seen": 270847072, + "router_z_loss_mlp": 0.25048828, + "step": 3251, + "time_per_iteration": 2.657060384750366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086593, + "balance_loss_mlp": 1.06070042, + "epoch": 0.6256252404771065, + "flos": 597930771456.0, + "grad_norm": 0.06427979506588448, + "language_loss": 0.84404004, + "learning_rate": 0.00032467098515164943, + "loss": 0.85490596, + "num_input_tokens_seen": 270926320, + "router_z_loss_mlp": 0.25927734, + "step": 3252, + "time_per_iteration": 2.849217414855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095705, + "balance_loss_mlp": 1.06928802, + "epoch": 0.6258176221623701, + "flos": 508299245568.0, + "grad_norm": 0.07156536364550367, + "language_loss": 0.8424539, + "learning_rate": 0.00032437925871783456, + "loss": 0.85341096, + "num_input_tokens_seen": 270997904, + "router_z_loss_mlp": 0.26428223, + "step": 3253, + "time_per_iteration": 2.6556756496429443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089818, + "balance_loss_mlp": 1.06392598, + "epoch": 0.6260100038476337, + "flos": 639645755904.0, + "grad_norm": 0.06713167353527402, + "language_loss": 0.84369826, + "learning_rate": 0.00032408760046603803, + "loss": 0.85459638, + "num_input_tokens_seen": 271074256, + "router_z_loss_mlp": 0.25915527, + "step": 3254, + "time_per_iteration": 2.8115572929382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088289, + "balance_loss_mlp": 1.06096649, + "epoch": 0.6262023855328973, + "flos": 841007784960.0, + "grad_norm": 0.057744790831179095, + "language_loss": 0.77781522, + "learning_rate": 0.00032379601050949193, + "loss": 0.78869808, + "num_input_tokens_seen": 271155152, + "router_z_loss_mlp": 0.27319336, + "step": 3255, + "time_per_iteration": 3.076742649078369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086608, + "balance_loss_mlp": 1.06120479, + "epoch": 0.6263947672181608, + "flos": 522138410496.0, + "grad_norm": 0.07189629851165658, + "language_loss": 0.88155556, + "learning_rate": 0.0003235044889614013, + "loss": 0.8924216, + "num_input_tokens_seen": 271224784, + "router_z_loss_mlp": 0.25390625, + "step": 3256, + "time_per_iteration": 2.5873968601226807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089869, + "balance_loss_mlp": 1.06373787, + "epoch": 0.6265871489034244, + "flos": 607055440896.0, + "grad_norm": 0.05771783178096878, + "language_loss": 0.83524096, + "learning_rate": 0.0003232130359349451, + "loss": 0.84613967, + "num_input_tokens_seen": 271303584, + "router_z_loss_mlp": 0.26147461, + "step": 3257, + "time_per_iteration": 2.819540500640869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079515, + "balance_loss_mlp": 1.05381322, + "epoch": 0.626779530588688, + "flos": 588484901376.0, + "grad_norm": 0.06862538521016108, + "language_loss": 0.81873524, + "learning_rate": 0.0003229216515432751, + "loss": 0.82953036, + "num_input_tokens_seen": 271379632, + "router_z_loss_mlp": 0.25732422, + "step": 3258, + "time_per_iteration": 2.7515103816986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081926, + "balance_loss_mlp": 1.05611742, + "epoch": 0.6269719122739515, + "flos": 438612268032.0, + "grad_norm": 0.0620904280551254, + "language_loss": 0.79984063, + "learning_rate": 0.0003226303358995174, + "loss": 0.81065989, + "num_input_tokens_seen": 271447808, + "router_z_loss_mlp": 0.25805664, + "step": 3259, + "time_per_iteration": 2.601327896118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108309, + "balance_loss_mlp": 1.05641103, + "epoch": 0.6271642939592151, + "flos": 562874738688.0, + "grad_norm": 0.06264498249495759, + "language_loss": 0.88746321, + "learning_rate": 0.00032233908911677, + "loss": 0.89829409, + "num_input_tokens_seen": 271526768, + "router_z_loss_mlp": 0.26672363, + "step": 3260, + "time_per_iteration": 2.8746490478515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108161, + "balance_loss_mlp": 1.0554074, + "epoch": 0.6273566756444786, + "flos": 514560273408.0, + "grad_norm": 0.05524835690099731, + "language_loss": 0.81054789, + "learning_rate": 0.0003220479113081053, + "loss": 0.82136405, + "num_input_tokens_seen": 271597840, + "router_z_loss_mlp": 0.26245117, + "step": 3261, + "time_per_iteration": 2.7250542640686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086528, + "balance_loss_mlp": 1.06051612, + "epoch": 0.6275490573297422, + "flos": 585472955904.0, + "grad_norm": 0.07333417495650836, + "language_loss": 0.79077941, + "learning_rate": 0.00032175680258656836, + "loss": 0.80164468, + "num_input_tokens_seen": 271668352, + "router_z_loss_mlp": 0.26049805, + "step": 3262, + "time_per_iteration": 2.7318856716156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084833, + "balance_loss_mlp": 1.0588572, + "epoch": 0.6277414390150058, + "flos": 559423024128.0, + "grad_norm": 0.054494688128012655, + "language_loss": 0.80530143, + "learning_rate": 0.00032146576306517794, + "loss": 0.81614971, + "num_input_tokens_seen": 271743936, + "router_z_loss_mlp": 0.26000977, + "step": 3263, + "time_per_iteration": 2.7811925411224365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080063, + "balance_loss_mlp": 1.05290699, + "epoch": 0.6279338207002694, + "flos": 612706374144.0, + "grad_norm": 0.056666242848552414, + "language_loss": 0.81309682, + "learning_rate": 0.0003211747928569255, + "loss": 0.82389748, + "num_input_tokens_seen": 271817008, + "router_z_loss_mlp": 0.27197266, + "step": 3264, + "time_per_iteration": 2.7700881958007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109074, + "balance_loss_mlp": 1.06416845, + "epoch": 0.6281262023855329, + "flos": 625685451264.0, + "grad_norm": 0.05464471596038141, + "language_loss": 0.82094646, + "learning_rate": 0.0003208838920747754, + "loss": 0.83185387, + "num_input_tokens_seen": 271896960, + "router_z_loss_mlp": 0.26599121, + "step": 3265, + "time_per_iteration": 2.8446507453918457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090884, + "balance_loss_mlp": 1.06463385, + "epoch": 0.6283185840707964, + "flos": 1123600564224.0, + "grad_norm": 0.056349937520850824, + "language_loss": 0.77076876, + "learning_rate": 0.0003205930608316656, + "loss": 0.7816776, + "num_input_tokens_seen": 271985008, + "router_z_loss_mlp": 0.26269531, + "step": 3266, + "time_per_iteration": 3.491666555404663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010984, + "balance_loss_mlp": 1.07074392, + "epoch": 0.62851096575606, + "flos": 515239750656.0, + "grad_norm": 0.06651261940051902, + "language_loss": 0.84897095, + "learning_rate": 0.00032030229924050673, + "loss": 0.85995495, + "num_input_tokens_seen": 272056368, + "router_z_loss_mlp": 0.27661133, + "step": 3267, + "time_per_iteration": 2.647298812866211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089439, + "balance_loss_mlp": 1.06264114, + "epoch": 0.6287033474413236, + "flos": 404171472384.0, + "grad_norm": 0.055917272399638666, + "language_loss": 0.8022815, + "learning_rate": 0.00032001160741418247, + "loss": 0.81317586, + "num_input_tokens_seen": 272123424, + "router_z_loss_mlp": 0.26843262, + "step": 3268, + "time_per_iteration": 2.652388334274292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094844, + "balance_loss_mlp": 1.06809378, + "epoch": 0.6288957291265872, + "flos": 525718605312.0, + "grad_norm": 0.059838942630291256, + "language_loss": 0.82543945, + "learning_rate": 0.0003197209854655494, + "loss": 0.83638787, + "num_input_tokens_seen": 272193008, + "router_z_loss_mlp": 0.26757812, + "step": 3269, + "time_per_iteration": 2.6375179290771484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099916, + "balance_loss_mlp": 1.07439375, + "epoch": 0.6290881108118507, + "flos": 603722294784.0, + "grad_norm": 0.061513094819297384, + "language_loss": 0.74642974, + "learning_rate": 0.0003194304335074371, + "loss": 0.75742888, + "num_input_tokens_seen": 272275328, + "router_z_loss_mlp": 0.25537109, + "step": 3270, + "time_per_iteration": 2.8767266273498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093664, + "balance_loss_mlp": 1.06736612, + "epoch": 0.6292804924971143, + "flos": 437675830272.0, + "grad_norm": 0.08816092137491774, + "language_loss": 0.8863402, + "learning_rate": 0.0003191399516526475, + "loss": 0.89727688, + "num_input_tokens_seen": 272339328, + "router_z_loss_mlp": 0.26342773, + "step": 3271, + "time_per_iteration": 2.4882290363311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103501, + "balance_loss_mlp": 1.07775187, + "epoch": 0.6294728741823779, + "flos": 606662659584.0, + "grad_norm": 0.05301391071022918, + "language_loss": 0.80040759, + "learning_rate": 0.0003188495400139559, + "loss": 0.81144261, + "num_input_tokens_seen": 272416336, + "router_z_loss_mlp": 0.25732422, + "step": 3272, + "time_per_iteration": 2.755535364151001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109861, + "balance_loss_mlp": 1.0714066, + "epoch": 0.6296652558676414, + "flos": 701529942528.0, + "grad_norm": 0.06865914840158399, + "language_loss": 0.84610647, + "learning_rate": 0.00031855919870411013, + "loss": 0.85709262, + "num_input_tokens_seen": 272490368, + "router_z_loss_mlp": 0.27246094, + "step": 3273, + "time_per_iteration": 2.8569116592407227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093778, + "balance_loss_mlp": 1.06794524, + "epoch": 0.6298576375529049, + "flos": 523909969920.0, + "grad_norm": 0.05727346843797417, + "language_loss": 0.84962982, + "learning_rate": 0.0003182689278358305, + "loss": 0.86056757, + "num_input_tokens_seen": 272562992, + "router_z_loss_mlp": 0.25866699, + "step": 3274, + "time_per_iteration": 2.690037727355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104729, + "balance_loss_mlp": 1.07783532, + "epoch": 0.6300500192381685, + "flos": 475963693056.0, + "grad_norm": 0.06020653166460469, + "language_loss": 0.80404747, + "learning_rate": 0.0003179787275218105, + "loss": 0.81509471, + "num_input_tokens_seen": 272629456, + "router_z_loss_mlp": 0.26928711, + "step": 3275, + "time_per_iteration": 2.5266408920288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100504, + "balance_loss_mlp": 1.07448089, + "epoch": 0.6302424009234321, + "flos": 520880772096.0, + "grad_norm": 0.052538589715391014, + "language_loss": 0.84480745, + "learning_rate": 0.0003176885978747155, + "loss": 0.85581249, + "num_input_tokens_seen": 272697440, + "router_z_loss_mlp": 0.26037598, + "step": 3276, + "time_per_iteration": 2.639855146408081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097356, + "balance_loss_mlp": 1.07041466, + "epoch": 0.6304347826086957, + "flos": 694596777984.0, + "grad_norm": 0.060305073155881905, + "language_loss": 0.82594693, + "learning_rate": 0.0003173985390071839, + "loss": 0.83692044, + "num_input_tokens_seen": 272774080, + "router_z_loss_mlp": 0.26977539, + "step": 3277, + "time_per_iteration": 2.860373020172119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033944, + "balance_loss_mlp": 1.02755451, + "epoch": 0.6306271642939593, + "flos": 1466858045952.0, + "grad_norm": 0.022211191249075446, + "language_loss": 0.77900457, + "learning_rate": 0.00031710855103182675, + "loss": 0.78934395, + "num_input_tokens_seen": 272998512, + "router_z_loss_mlp": 0.06396484, + "step": 3278, + "time_per_iteration": 4.8053810596466064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109346, + "balance_loss_mlp": 1.06688833, + "epoch": 0.6308195459792227, + "flos": 601740762624.0, + "grad_norm": 0.06392036419346926, + "language_loss": 0.8159709, + "learning_rate": 0.00031681863406122704, + "loss": 0.82690549, + "num_input_tokens_seen": 273074672, + "router_z_loss_mlp": 0.26574707, + "step": 3279, + "time_per_iteration": 2.7899298667907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090056, + "balance_loss_mlp": 1.06425917, + "epoch": 0.6310119276644863, + "flos": 726858178560.0, + "grad_norm": 0.08623088614213353, + "language_loss": 0.85931206, + "learning_rate": 0.00031652878820794087, + "loss": 0.87021261, + "num_input_tokens_seen": 273157904, + "router_z_loss_mlp": 0.25817871, + "step": 3280, + "time_per_iteration": 2.9887900352478027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099524, + "balance_loss_mlp": 1.07296467, + "epoch": 0.6312043093497499, + "flos": 519749042688.0, + "grad_norm": 0.06411033205686746, + "language_loss": 0.85853314, + "learning_rate": 0.00031623901358449627, + "loss": 0.86952841, + "num_input_tokens_seen": 273228160, + "router_z_loss_mlp": 0.26574707, + "step": 3281, + "time_per_iteration": 2.638303756713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088871, + "balance_loss_mlp": 1.06183434, + "epoch": 0.6313966910350135, + "flos": 531191499264.0, + "grad_norm": 0.058317756156366925, + "language_loss": 0.88884354, + "learning_rate": 0.0003159493103033936, + "loss": 0.89973223, + "num_input_tokens_seen": 273295872, + "router_z_loss_mlp": 0.27038574, + "step": 3282, + "time_per_iteration": 2.577678918838501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021333, + "balance_loss_mlp": 1.01494348, + "epoch": 0.631589072720277, + "flos": 1379887529472.0, + "grad_norm": 0.01678733827998209, + "language_loss": 0.79919052, + "learning_rate": 0.00031565967847710564, + "loss": 0.8094039, + "num_input_tokens_seen": 273524320, + "router_z_loss_mlp": 0.06396484, + "step": 3283, + "time_per_iteration": 4.869993209838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086941, + "balance_loss_mlp": 1.06035781, + "epoch": 0.6317814544055406, + "flos": 624677432832.0, + "grad_norm": 0.060116799925982296, + "language_loss": 0.82495177, + "learning_rate": 0.0003153701182180776, + "loss": 0.83582127, + "num_input_tokens_seen": 273598544, + "router_z_loss_mlp": 0.26611328, + "step": 3284, + "time_per_iteration": 2.792370319366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108947, + "balance_loss_mlp": 1.06271982, + "epoch": 0.6319738360908042, + "flos": 498119569920.0, + "grad_norm": 0.05700688218578944, + "language_loss": 0.81939638, + "learning_rate": 0.00031508062963872655, + "loss": 0.83029103, + "num_input_tokens_seen": 273666000, + "router_z_loss_mlp": 0.26757812, + "step": 3285, + "time_per_iteration": 2.5983989238739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080002, + "balance_loss_mlp": 1.05334699, + "epoch": 0.6321662177760677, + "flos": 579760353792.0, + "grad_norm": 0.06791630533273198, + "language_loss": 0.79373753, + "learning_rate": 0.0003147912128514423, + "loss": 0.80453753, + "num_input_tokens_seen": 273742672, + "router_z_loss_mlp": 0.2668457, + "step": 3286, + "time_per_iteration": 2.7027578353881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085262, + "balance_loss_mlp": 1.05848765, + "epoch": 0.6323585994613313, + "flos": 601486373376.0, + "grad_norm": 0.061011344504073056, + "language_loss": 0.87480241, + "learning_rate": 0.0003145018679685859, + "loss": 0.88565505, + "num_input_tokens_seen": 273813984, + "router_z_loss_mlp": 0.26831055, + "step": 3287, + "time_per_iteration": 2.7283802032470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081421, + "balance_loss_mlp": 1.05552864, + "epoch": 0.6325509811465948, + "flos": 528535259136.0, + "grad_norm": 0.05025789787573444, + "language_loss": 0.87796986, + "learning_rate": 0.00031421259510249134, + "loss": 0.88878405, + "num_input_tokens_seen": 273892848, + "router_z_loss_mlp": 0.25927734, + "step": 3288, + "time_per_iteration": 2.879518985748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089164, + "balance_loss_mlp": 1.06193662, + "epoch": 0.6327433628318584, + "flos": 574262866944.0, + "grad_norm": 0.06343698340560998, + "language_loss": 0.81597275, + "learning_rate": 0.00031392339436546414, + "loss": 0.82686442, + "num_input_tokens_seen": 273971696, + "router_z_loss_mlp": 0.27246094, + "step": 3289, + "time_per_iteration": 2.8542826175689697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083038, + "balance_loss_mlp": 1.05521417, + "epoch": 0.632935744517122, + "flos": 517088033280.0, + "grad_norm": 0.06408220142950623, + "language_loss": 0.83260751, + "learning_rate": 0.00031363426586978205, + "loss": 0.84343785, + "num_input_tokens_seen": 274048096, + "router_z_loss_mlp": 0.27832031, + "step": 3290, + "time_per_iteration": 2.79167103767395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075181, + "balance_loss_mlp": 1.04847813, + "epoch": 0.6331281262023856, + "flos": 617462714880.0, + "grad_norm": 0.05376353557308444, + "language_loss": 0.84848088, + "learning_rate": 0.0003133452097276947, + "loss": 0.85923266, + "num_input_tokens_seen": 274122848, + "router_z_loss_mlp": 0.26708984, + "step": 3291, + "time_per_iteration": 2.751204252243042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108266, + "balance_loss_mlp": 1.05583799, + "epoch": 0.633320507887649, + "flos": 592954546176.0, + "grad_norm": 0.07438043439458697, + "language_loss": 0.84737223, + "learning_rate": 0.0003130562260514238, + "loss": 0.85819882, + "num_input_tokens_seen": 274198320, + "router_z_loss_mlp": 0.26831055, + "step": 3292, + "time_per_iteration": 2.7716188430786133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083518, + "balance_loss_mlp": 1.05695808, + "epoch": 0.6335128895729126, + "flos": 582349782528.0, + "grad_norm": 0.050395454055006096, + "language_loss": 0.81929183, + "learning_rate": 0.0003127673149531626, + "loss": 0.830127, + "num_input_tokens_seen": 274274944, + "router_z_loss_mlp": 0.26550293, + "step": 3293, + "time_per_iteration": 2.7863051891326904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084421, + "balance_loss_mlp": 1.05757475, + "epoch": 0.6337052712581762, + "flos": 453036934656.0, + "grad_norm": 0.05747867938132279, + "language_loss": 0.8319236, + "learning_rate": 0.0003124784765450762, + "loss": 0.84276778, + "num_input_tokens_seen": 274342384, + "router_z_loss_mlp": 0.26867676, + "step": 3294, + "time_per_iteration": 2.551786184310913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109352, + "balance_loss_mlp": 1.0665071, + "epoch": 0.6338976529434398, + "flos": 573407921664.0, + "grad_norm": 0.0638400369710873, + "language_loss": 0.80339384, + "learning_rate": 0.0003121897109393017, + "loss": 0.81432903, + "num_input_tokens_seen": 274417568, + "router_z_loss_mlp": 0.27050781, + "step": 3295, + "time_per_iteration": 2.7408554553985596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010907, + "balance_loss_mlp": 1.06406879, + "epoch": 0.6340900346287034, + "flos": 508758838272.0, + "grad_norm": 0.05476078823788279, + "language_loss": 0.89262557, + "learning_rate": 0.0003119010182479481, + "loss": 0.90353251, + "num_input_tokens_seen": 274488960, + "router_z_loss_mlp": 0.26623535, + "step": 3296, + "time_per_iteration": 2.658127784729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088039, + "balance_loss_mlp": 1.06214714, + "epoch": 0.6342824163139669, + "flos": 479746520064.0, + "grad_norm": 0.062377346698915814, + "language_loss": 0.82762587, + "learning_rate": 0.00031161239858309563, + "loss": 0.83850628, + "num_input_tokens_seen": 274556880, + "router_z_loss_mlp": 0.25915527, + "step": 3297, + "time_per_iteration": 2.5747482776641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092867, + "balance_loss_mlp": 1.06669998, + "epoch": 0.6344747979992305, + "flos": 572031714816.0, + "grad_norm": 0.0650323770737515, + "language_loss": 0.83421898, + "learning_rate": 0.0003113238520567964, + "loss": 0.84514761, + "num_input_tokens_seen": 274624944, + "router_z_loss_mlp": 0.26208496, + "step": 3298, + "time_per_iteration": 2.6627304553985596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089642, + "balance_loss_mlp": 1.06351149, + "epoch": 0.634667179684494, + "flos": 605911601664.0, + "grad_norm": 0.06322814562663621, + "language_loss": 0.81827015, + "learning_rate": 0.00031103537878107403, + "loss": 0.82916659, + "num_input_tokens_seen": 274695152, + "router_z_loss_mlp": 0.26147461, + "step": 3299, + "time_per_iteration": 2.7386014461517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091865, + "balance_loss_mlp": 1.06578207, + "epoch": 0.6348595613697576, + "flos": 646944537600.0, + "grad_norm": 0.11045697323578996, + "language_loss": 0.80332845, + "learning_rate": 0.0003107469788679238, + "loss": 0.81424707, + "num_input_tokens_seen": 274767840, + "router_z_loss_mlp": 0.26086426, + "step": 3300, + "time_per_iteration": 2.7655692100524902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084258, + "balance_loss_mlp": 1.05724525, + "epoch": 0.6350519430550212, + "flos": 639074935296.0, + "grad_norm": 0.06273525226286222, + "language_loss": 0.8685059, + "learning_rate": 0.00031045865242931267, + "loss": 0.8793484, + "num_input_tokens_seen": 274839312, + "router_z_loss_mlp": 0.27026367, + "step": 3301, + "time_per_iteration": 2.8187057971954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092787, + "balance_loss_mlp": 1.06582153, + "epoch": 0.6352443247402847, + "flos": 686437908480.0, + "grad_norm": 0.06022790921544637, + "language_loss": 0.82959229, + "learning_rate": 0.00031017039957717877, + "loss": 0.84052014, + "num_input_tokens_seen": 274922704, + "router_z_loss_mlp": 0.27001953, + "step": 3302, + "time_per_iteration": 2.994527578353882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088849, + "balance_loss_mlp": 1.0623126, + "epoch": 0.6354367064255483, + "flos": 559442847744.0, + "grad_norm": 0.2662546903518702, + "language_loss": 0.8874619, + "learning_rate": 0.0003098822204234318, + "loss": 0.89835036, + "num_input_tokens_seen": 274992848, + "router_z_loss_mlp": 0.265625, + "step": 3303, + "time_per_iteration": 2.6759462356567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086338, + "balance_loss_mlp": 1.06104219, + "epoch": 0.6356290881108119, + "flos": 979487520768.0, + "grad_norm": 0.06306835331817585, + "language_loss": 0.87812388, + "learning_rate": 0.00030959411507995273, + "loss": 0.88898724, + "num_input_tokens_seen": 275071456, + "router_z_loss_mlp": 0.25317383, + "step": 3304, + "time_per_iteration": 3.2179057598114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089067, + "balance_loss_mlp": 1.06150627, + "epoch": 0.6358214697960755, + "flos": 528278298624.0, + "grad_norm": 0.09855035049223494, + "language_loss": 0.81458223, + "learning_rate": 0.00030930608365859407, + "loss": 0.82547283, + "num_input_tokens_seen": 275140512, + "router_z_loss_mlp": 0.27563477, + "step": 3305, + "time_per_iteration": 2.743131399154663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093486, + "balance_loss_mlp": 1.06724787, + "epoch": 0.6360138514813389, + "flos": 516811249152.0, + "grad_norm": 0.08448546978670643, + "language_loss": 0.87924969, + "learning_rate": 0.00030901812627117943, + "loss": 0.89018464, + "num_input_tokens_seen": 275210896, + "router_z_loss_mlp": 0.26257324, + "step": 3306, + "time_per_iteration": 2.6397995948791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090032, + "balance_loss_mlp": 1.06258953, + "epoch": 0.6362062331666025, + "flos": 466525163520.0, + "grad_norm": 0.06217165595181868, + "language_loss": 0.85291284, + "learning_rate": 0.000308730243029504, + "loss": 0.86381316, + "num_input_tokens_seen": 275279888, + "router_z_loss_mlp": 0.27416992, + "step": 3307, + "time_per_iteration": 2.604104995727539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091578, + "balance_loss_mlp": 1.06420732, + "epoch": 0.6363986148518661, + "flos": 549720193536.0, + "grad_norm": 0.05998324584382658, + "language_loss": 0.79413563, + "learning_rate": 0.0003084424340453339, + "loss": 0.80505145, + "num_input_tokens_seen": 275357056, + "router_z_loss_mlp": 0.27392578, + "step": 3308, + "time_per_iteration": 2.808955192565918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093812, + "balance_loss_mlp": 1.06555986, + "epoch": 0.6365909965371297, + "flos": 583049083392.0, + "grad_norm": 0.06232682903729, + "language_loss": 0.82260096, + "learning_rate": 0.0003081546994304064, + "loss": 0.83353913, + "num_input_tokens_seen": 275428240, + "router_z_loss_mlp": 0.28222656, + "step": 3309, + "time_per_iteration": 2.786863327026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090344, + "balance_loss_mlp": 1.06326008, + "epoch": 0.6367833782223933, + "flos": 531255739392.0, + "grad_norm": 0.059865329496528966, + "language_loss": 0.82539266, + "learning_rate": 0.0003078670392964298, + "loss": 0.83629608, + "num_input_tokens_seen": 275497568, + "router_z_loss_mlp": 0.27148438, + "step": 3310, + "time_per_iteration": 2.5970752239227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096243, + "balance_loss_mlp": 1.06832409, + "epoch": 0.6369757599076568, + "flos": 569506526208.0, + "grad_norm": 0.060559947779739796, + "language_loss": 0.82883835, + "learning_rate": 0.00030757945375508406, + "loss": 0.83980078, + "num_input_tokens_seen": 275569616, + "router_z_loss_mlp": 0.27929688, + "step": 3311, + "time_per_iteration": 2.6342813968658447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102084, + "balance_loss_mlp": 1.07375956, + "epoch": 0.6371681415929203, + "flos": 539957892096.0, + "grad_norm": 0.06259292774484726, + "language_loss": 0.81409383, + "learning_rate": 0.00030729194291801944, + "loss": 0.82511473, + "num_input_tokens_seen": 275641408, + "router_z_loss_mlp": 0.28283691, + "step": 3312, + "time_per_iteration": 2.6879191398620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102141, + "balance_loss_mlp": 1.07455623, + "epoch": 0.6373605232781839, + "flos": 483566423040.0, + "grad_norm": 0.07257562907286343, + "language_loss": 0.77341402, + "learning_rate": 0.00030700450689685787, + "loss": 0.78443545, + "num_input_tokens_seen": 275706608, + "router_z_loss_mlp": 0.27636719, + "step": 3313, + "time_per_iteration": 2.5379741191864014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093269, + "balance_loss_mlp": 1.06732869, + "epoch": 0.6375529049634475, + "flos": 578581636608.0, + "grad_norm": 0.05810286920956277, + "language_loss": 0.85484838, + "learning_rate": 0.00030671714580319186, + "loss": 0.86578107, + "num_input_tokens_seen": 275785952, + "router_z_loss_mlp": 0.25952148, + "step": 3314, + "time_per_iteration": 2.800306797027588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095409, + "balance_loss_mlp": 1.06806278, + "epoch": 0.637745286648711, + "flos": 682257530880.0, + "grad_norm": 0.07119187429341393, + "language_loss": 0.83300906, + "learning_rate": 0.0003064298597485846, + "loss": 0.84396315, + "num_input_tokens_seen": 275866240, + "router_z_loss_mlp": 0.27392578, + "step": 3315, + "time_per_iteration": 2.822500467300415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089294, + "balance_loss_mlp": 1.06213832, + "epoch": 0.6379376683339746, + "flos": 504637558272.0, + "grad_norm": 0.07085511575360878, + "language_loss": 0.84058923, + "learning_rate": 0.00030614264884457054, + "loss": 0.85148215, + "num_input_tokens_seen": 275936176, + "router_z_loss_mlp": 0.27197266, + "step": 3316, + "time_per_iteration": 2.670797348022461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090907, + "balance_loss_mlp": 1.06443071, + "epoch": 0.6381300500192382, + "flos": 502020965376.0, + "grad_norm": 0.0775113841286029, + "language_loss": 0.77307498, + "learning_rate": 0.000305855513202655, + "loss": 0.78398407, + "num_input_tokens_seen": 276004608, + "router_z_loss_mlp": 0.26477051, + "step": 3317, + "time_per_iteration": 2.585374355316162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088816, + "balance_loss_mlp": 1.06235111, + "epoch": 0.6383224317045018, + "flos": 400489961472.0, + "grad_norm": 0.06790961033266373, + "language_loss": 0.77846622, + "learning_rate": 0.0003055684529343138, + "loss": 0.78935432, + "num_input_tokens_seen": 276066688, + "router_z_loss_mlp": 0.26501465, + "step": 3318, + "time_per_iteration": 2.4445385932922363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085331, + "balance_loss_mlp": 1.0597012, + "epoch": 0.6385148133897653, + "flos": 499377208320.0, + "grad_norm": 0.06232442900596772, + "language_loss": 0.78594166, + "learning_rate": 0.00030528146815099374, + "loss": 0.79679501, + "num_input_tokens_seen": 276140000, + "router_z_loss_mlp": 0.25634766, + "step": 3319, + "time_per_iteration": 2.654273509979248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085377, + "balance_loss_mlp": 1.06078434, + "epoch": 0.6387071950750288, + "flos": 527665632768.0, + "grad_norm": 0.06473309855040241, + "language_loss": 0.72311449, + "learning_rate": 0.00030499455896411203, + "loss": 0.73396826, + "num_input_tokens_seen": 276209840, + "router_z_loss_mlp": 0.24597168, + "step": 3320, + "time_per_iteration": 2.60524320602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043431, + "balance_loss_mlp": 1.03561127, + "epoch": 0.6388995767602924, + "flos": 1455979069440.0, + "grad_norm": 0.03712674177895302, + "language_loss": 0.76300812, + "learning_rate": 0.0003047077254850568, + "loss": 0.77344245, + "num_input_tokens_seen": 276444784, + "router_z_loss_mlp": 0.078125, + "step": 3321, + "time_per_iteration": 4.941630601882935 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091368, + "balance_loss_mlp": 1.06535614, + "epoch": 0.639091958445556, + "flos": 603895191552.0, + "grad_norm": 0.06705543469002004, + "language_loss": 0.7662977, + "learning_rate": 0.0003044209678251865, + "loss": 0.77721143, + "num_input_tokens_seen": 276522768, + "router_z_loss_mlp": 0.26013184, + "step": 3322, + "time_per_iteration": 2.877448320388794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091614, + "balance_loss_mlp": 1.06602025, + "epoch": 0.6392843401308196, + "flos": 584516694528.0, + "grad_norm": 0.07788084148223126, + "language_loss": 0.84920502, + "learning_rate": 0.0003041342860958306, + "loss": 0.86012113, + "num_input_tokens_seen": 276597104, + "router_z_loss_mlp": 0.25610352, + "step": 3323, + "time_per_iteration": 2.8169727325439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093622, + "balance_loss_mlp": 1.06809974, + "epoch": 0.6394767218160831, + "flos": 514681413120.0, + "grad_norm": 0.09386152906491808, + "language_loss": 0.91524851, + "learning_rate": 0.00030384768040828857, + "loss": 0.92618477, + "num_input_tokens_seen": 276670256, + "router_z_loss_mlp": 0.25537109, + "step": 3324, + "time_per_iteration": 2.6935789585113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087009, + "balance_loss_mlp": 1.06294096, + "epoch": 0.6396691035013466, + "flos": 541732022784.0, + "grad_norm": 0.06024043560940697, + "language_loss": 0.85838866, + "learning_rate": 0.00030356115087383094, + "loss": 0.86925876, + "num_input_tokens_seen": 276737680, + "router_z_loss_mlp": 0.24047852, + "step": 3325, + "time_per_iteration": 2.645054340362549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108735, + "balance_loss_mlp": 1.06102872, + "epoch": 0.6398614851866102, + "flos": 525535796736.0, + "grad_norm": 0.054064191473810044, + "language_loss": 0.84931785, + "learning_rate": 0.00030327469760369803, + "loss": 0.86019135, + "num_input_tokens_seen": 276803808, + "router_z_loss_mlp": 0.26367188, + "step": 3326, + "time_per_iteration": 2.563873767852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085961, + "balance_loss_mlp": 1.05992579, + "epoch": 0.6400538668718738, + "flos": 622989937152.0, + "grad_norm": 0.06028685713056784, + "language_loss": 0.85342407, + "learning_rate": 0.0003029883207091009, + "loss": 0.86428368, + "num_input_tokens_seen": 276874752, + "router_z_loss_mlp": 0.26074219, + "step": 3327, + "time_per_iteration": 2.705343723297119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079972, + "balance_loss_mlp": 1.05283976, + "epoch": 0.6402462485571374, + "flos": 503367436800.0, + "grad_norm": 0.06637165202459654, + "language_loss": 0.78691089, + "learning_rate": 0.00030270202030122095, + "loss": 0.7977106, + "num_input_tokens_seen": 276947200, + "router_z_loss_mlp": 0.27172852, + "step": 3328, + "time_per_iteration": 2.708845853805542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081887, + "balance_loss_mlp": 1.05516016, + "epoch": 0.6404386302424009, + "flos": 819247260672.0, + "grad_norm": 0.06780948867889915, + "language_loss": 0.86619353, + "learning_rate": 0.00030241579649121, + "loss": 0.87701237, + "num_input_tokens_seen": 277025712, + "router_z_loss_mlp": 0.26782227, + "step": 3329, + "time_per_iteration": 2.9923856258392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084819, + "balance_loss_mlp": 1.05859339, + "epoch": 0.6406310119276645, + "flos": 471812677632.0, + "grad_norm": 0.052278869794255514, + "language_loss": 0.79563975, + "learning_rate": 0.00030212964939018994, + "loss": 0.80648792, + "num_input_tokens_seen": 277091264, + "router_z_loss_mlp": 0.26220703, + "step": 3330, + "time_per_iteration": 2.5270252227783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091276, + "balance_loss_mlp": 1.06483507, + "epoch": 0.6408233936129281, + "flos": 425583631872.0, + "grad_norm": 0.06193541615368343, + "language_loss": 0.85849935, + "learning_rate": 0.0003018435791092527, + "loss": 0.86941212, + "num_input_tokens_seen": 277154608, + "router_z_loss_mlp": 0.26489258, + "step": 3331, + "time_per_iteration": 2.4754018783569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081779, + "balance_loss_mlp": 1.05531454, + "epoch": 0.6410157752981916, + "flos": 549784433664.0, + "grad_norm": 0.08536903731672153, + "language_loss": 0.81342864, + "learning_rate": 0.00030155758575946083, + "loss": 0.82424641, + "num_input_tokens_seen": 277222176, + "router_z_loss_mlp": 0.26489258, + "step": 3332, + "time_per_iteration": 2.626554489135742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087465, + "balance_loss_mlp": 1.06135845, + "epoch": 0.6412081569834551, + "flos": 475899452928.0, + "grad_norm": 0.05880203513690982, + "language_loss": 0.83905303, + "learning_rate": 0.0003012716694518467, + "loss": 0.84992766, + "num_input_tokens_seen": 277289600, + "router_z_loss_mlp": 0.26135254, + "step": 3333, + "time_per_iteration": 2.563870906829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088918, + "balance_loss_mlp": 1.06233454, + "epoch": 0.6414005386687187, + "flos": 540921494016.0, + "grad_norm": 0.060655550998304664, + "language_loss": 0.85066408, + "learning_rate": 0.000300985830297413, + "loss": 0.86155331, + "num_input_tokens_seen": 277362784, + "router_z_loss_mlp": 0.26635742, + "step": 3334, + "time_per_iteration": 2.720207691192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085977, + "balance_loss_mlp": 1.05846334, + "epoch": 0.6415929203539823, + "flos": 1041317379072.0, + "grad_norm": 0.0660382374422698, + "language_loss": 0.87618732, + "learning_rate": 0.00030070006840713205, + "loss": 0.88704705, + "num_input_tokens_seen": 277449728, + "router_z_loss_mlp": 0.27563477, + "step": 3335, + "time_per_iteration": 3.3882405757904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086202, + "balance_loss_mlp": 1.06003511, + "epoch": 0.6417853020392459, + "flos": 648337996800.0, + "grad_norm": 0.05326551396050189, + "language_loss": 0.738437, + "learning_rate": 0.000300414383891947, + "loss": 0.74929905, + "num_input_tokens_seen": 277527552, + "router_z_loss_mlp": 0.26184082, + "step": 3336, + "time_per_iteration": 2.841377019882202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089325, + "balance_loss_mlp": 1.06317008, + "epoch": 0.6419776837245095, + "flos": 500899147776.0, + "grad_norm": 0.05652358101135248, + "language_loss": 0.88883501, + "learning_rate": 0.00030012877686276973, + "loss": 0.89972824, + "num_input_tokens_seen": 277603568, + "router_z_loss_mlp": 0.26196289, + "step": 3337, + "time_per_iteration": 2.729287624359131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109448, + "balance_loss_mlp": 1.06825364, + "epoch": 0.642170065409773, + "flos": 620620392960.0, + "grad_norm": 0.05602708574683237, + "language_loss": 0.87052727, + "learning_rate": 0.0002998432474304832, + "loss": 0.88147211, + "num_input_tokens_seen": 277679696, + "router_z_loss_mlp": 0.26269531, + "step": 3338, + "time_per_iteration": 2.763936996459961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033085, + "balance_loss_mlp": 1.02664769, + "epoch": 0.6423624470950365, + "flos": 1423539629568.0, + "grad_norm": 0.022262190661506177, + "language_loss": 0.79237342, + "learning_rate": 0.0002995577957059395, + "loss": 0.80270433, + "num_input_tokens_seen": 277913056, + "router_z_loss_mlp": 0.06445312, + "step": 3339, + "time_per_iteration": 4.899634838104248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085698, + "balance_loss_mlp": 1.06067634, + "epoch": 0.6425548287803001, + "flos": 562353477120.0, + "grad_norm": 0.05329063171196326, + "language_loss": 0.88739842, + "learning_rate": 0.00029927242179996107, + "loss": 0.89825541, + "num_input_tokens_seen": 277983168, + "router_z_loss_mlp": 0.25036621, + "step": 3340, + "time_per_iteration": 2.6731433868408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084705, + "balance_loss_mlp": 1.05887282, + "epoch": 0.6427472104655637, + "flos": 585443220480.0, + "grad_norm": 0.05323225781899137, + "language_loss": 0.83480287, + "learning_rate": 0.0002989871258233398, + "loss": 0.84564984, + "num_input_tokens_seen": 278057600, + "router_z_loss_mlp": 0.25830078, + "step": 3341, + "time_per_iteration": 2.7728755474090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092018, + "balance_loss_mlp": 1.06558967, + "epoch": 0.6429395921508272, + "flos": 404282700288.0, + "grad_norm": 0.07706425942828801, + "language_loss": 0.82514536, + "learning_rate": 0.0002987019078868373, + "loss": 0.83606553, + "num_input_tokens_seen": 278119232, + "router_z_loss_mlp": 0.26477051, + "step": 3342, + "time_per_iteration": 2.4401304721832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087083, + "balance_loss_mlp": 1.06178701, + "epoch": 0.6431319738360908, + "flos": 548783755776.0, + "grad_norm": 0.05844856820656981, + "language_loss": 0.81969512, + "learning_rate": 0.00029841676810118484, + "loss": 0.83056593, + "num_input_tokens_seen": 278187456, + "router_z_loss_mlp": 0.25317383, + "step": 3343, + "time_per_iteration": 2.662538766860962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081725, + "balance_loss_mlp": 1.05664337, + "epoch": 0.6433243555213544, + "flos": 793375368192.0, + "grad_norm": 0.059827459557400715, + "language_loss": 0.8744089, + "learning_rate": 0.0002981317065770839, + "loss": 0.88522613, + "num_input_tokens_seen": 278262176, + "router_z_loss_mlp": 0.25097656, + "step": 3344, + "time_per_iteration": 3.0547289848327637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084317, + "balance_loss_mlp": 1.05733991, + "epoch": 0.643516737206618, + "flos": 583031831040.0, + "grad_norm": 0.06660327590373825, + "language_loss": 0.80995148, + "learning_rate": 0.00029784672342520493, + "loss": 0.8207947, + "num_input_tokens_seen": 278328816, + "router_z_loss_mlp": 0.2701416, + "step": 3345, + "time_per_iteration": 2.665701389312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086547, + "balance_loss_mlp": 1.05967772, + "epoch": 0.6437091188918815, + "flos": 518750936064.0, + "grad_norm": 0.06646117456198827, + "language_loss": 0.83675933, + "learning_rate": 0.00029756181875618834, + "loss": 0.84762478, + "num_input_tokens_seen": 278395824, + "router_z_loss_mlp": 0.26904297, + "step": 3346, + "time_per_iteration": 2.5859789848327637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087095, + "balance_loss_mlp": 1.06036818, + "epoch": 0.643901500577145, + "flos": 384946048512.0, + "grad_norm": 0.0635179791741207, + "language_loss": 0.83513415, + "learning_rate": 0.0002972769926806439, + "loss": 0.84600508, + "num_input_tokens_seen": 278457696, + "router_z_loss_mlp": 0.26757812, + "step": 3347, + "time_per_iteration": 2.4656190872192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087894, + "balance_loss_mlp": 1.06159616, + "epoch": 0.6440938822624086, + "flos": 483722067456.0, + "grad_norm": 0.0627778117475219, + "language_loss": 0.89043599, + "learning_rate": 0.0002969922453091508, + "loss": 0.90131485, + "num_input_tokens_seen": 278526992, + "router_z_loss_mlp": 0.26342773, + "step": 3348, + "time_per_iteration": 2.5913443565368652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084136, + "balance_loss_mlp": 1.05721855, + "epoch": 0.6442862639476722, + "flos": 540469241856.0, + "grad_norm": 0.05415378993081624, + "language_loss": 0.85013533, + "learning_rate": 0.00029670757675225777, + "loss": 0.8609767, + "num_input_tokens_seen": 278601120, + "router_z_loss_mlp": 0.26953125, + "step": 3349, + "time_per_iteration": 2.739000082015991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085102, + "balance_loss_mlp": 1.05906665, + "epoch": 0.6444786456329358, + "flos": 526912003584.0, + "grad_norm": 0.06396799690402781, + "language_loss": 0.79375887, + "learning_rate": 0.0002964229871204831, + "loss": 0.80460995, + "num_input_tokens_seen": 278668208, + "router_z_loss_mlp": 0.26049805, + "step": 3350, + "time_per_iteration": 2.6291356086730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079691, + "balance_loss_mlp": 1.0546335, + "epoch": 0.6446710273181993, + "flos": 697892848128.0, + "grad_norm": 0.05949012862270097, + "language_loss": 0.83774936, + "learning_rate": 0.00029613847652431403, + "loss": 0.84854627, + "num_input_tokens_seen": 278742832, + "router_z_loss_mlp": 0.25073242, + "step": 3351, + "time_per_iteration": 2.839716672897339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077701, + "balance_loss_mlp": 1.05226183, + "epoch": 0.6448634090034628, + "flos": 625023226368.0, + "grad_norm": 0.056904070769954795, + "language_loss": 0.79438174, + "learning_rate": 0.0002958540450742078, + "loss": 0.80515873, + "num_input_tokens_seen": 278829744, + "router_z_loss_mlp": 0.2545166, + "step": 3352, + "time_per_iteration": 2.913639545440674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077873, + "balance_loss_mlp": 1.05242181, + "epoch": 0.6450557906887264, + "flos": 600950057472.0, + "grad_norm": 0.058859243742432434, + "language_loss": 0.77210569, + "learning_rate": 0.0002955696928805901, + "loss": 0.78288442, + "num_input_tokens_seen": 278908592, + "router_z_loss_mlp": 0.2545166, + "step": 3353, + "time_per_iteration": 2.923433780670166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081125, + "balance_loss_mlp": 1.05607951, + "epoch": 0.64524817237399, + "flos": 646200820224.0, + "grad_norm": 0.061599648682054316, + "language_loss": 0.8637355, + "learning_rate": 0.0002952854200538563, + "loss": 0.87454677, + "num_input_tokens_seen": 278986960, + "router_z_loss_mlp": 0.25061035, + "step": 3354, + "time_per_iteration": 2.8201682567596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082059, + "balance_loss_mlp": 1.05513, + "epoch": 0.6454405540592536, + "flos": 473411340288.0, + "grad_norm": 0.055453256805671876, + "language_loss": 0.82204401, + "learning_rate": 0.000295001226704371, + "loss": 0.83286464, + "num_input_tokens_seen": 279054896, + "router_z_loss_mlp": 0.26965332, + "step": 3355, + "time_per_iteration": 2.555814743041992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073686, + "balance_loss_mlp": 1.04755521, + "epoch": 0.6456329357445171, + "flos": 611841517056.0, + "grad_norm": 0.07998397222578815, + "language_loss": 0.83098918, + "learning_rate": 0.00029471711294246783, + "loss": 0.84172606, + "num_input_tokens_seen": 279126816, + "router_z_loss_mlp": 0.26171875, + "step": 3356, + "time_per_iteration": 2.7683768272399902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010734, + "balance_loss_mlp": 1.04853272, + "epoch": 0.6458253174297807, + "flos": 731683901952.0, + "grad_norm": 0.06636958548337468, + "language_loss": 0.82041395, + "learning_rate": 0.0002944330788784494, + "loss": 0.83114803, + "num_input_tokens_seen": 279197552, + "router_z_loss_mlp": 0.24865723, + "step": 3357, + "time_per_iteration": 2.9053289890289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070626, + "balance_loss_mlp": 1.04476953, + "epoch": 0.6460176991150443, + "flos": 570413228544.0, + "grad_norm": 0.05791600567825564, + "language_loss": 0.84893548, + "learning_rate": 0.00029414912462258786, + "loss": 0.85964179, + "num_input_tokens_seen": 279275440, + "router_z_loss_mlp": 0.25878906, + "step": 3358, + "time_per_iteration": 2.811368227005005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074683, + "balance_loss_mlp": 1.04814672, + "epoch": 0.6462100808003078, + "flos": 583160311296.0, + "grad_norm": 0.06332395198444683, + "language_loss": 0.81536913, + "learning_rate": 0.00029386525028512366, + "loss": 0.82611591, + "num_input_tokens_seen": 279349168, + "router_z_loss_mlp": 0.265625, + "step": 3359, + "time_per_iteration": 2.7373340129852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074991, + "balance_loss_mlp": 1.04820502, + "epoch": 0.6464024624855714, + "flos": 483919557120.0, + "grad_norm": 0.06353277324280042, + "language_loss": 0.87003738, + "learning_rate": 0.0002935814559762666, + "loss": 0.88078725, + "num_input_tokens_seen": 279427600, + "router_z_loss_mlp": 0.26794434, + "step": 3360, + "time_per_iteration": 2.7775824069976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071101, + "balance_loss_mlp": 1.04590034, + "epoch": 0.6465948441708349, + "flos": 527774289408.0, + "grad_norm": 0.05137930427454231, + "language_loss": 0.79679728, + "learning_rate": 0.0002932977418061957, + "loss": 0.80750829, + "num_input_tokens_seen": 279496608, + "router_z_loss_mlp": 0.25183105, + "step": 3361, + "time_per_iteration": 2.6293880939483643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073227, + "balance_loss_mlp": 1.04677427, + "epoch": 0.6467872258560985, + "flos": 669421615104.0, + "grad_norm": 0.06432809284202623, + "language_loss": 0.80709672, + "learning_rate": 0.00029301410788505833, + "loss": 0.81782901, + "num_input_tokens_seen": 279568448, + "router_z_loss_mlp": 0.26489258, + "step": 3362, + "time_per_iteration": 2.772700071334839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073962, + "balance_loss_mlp": 1.04715228, + "epoch": 0.6469796075413621, + "flos": 432101620224.0, + "grad_norm": 0.06908164950227988, + "language_loss": 0.81278014, + "learning_rate": 0.00029273055432297126, + "loss": 0.82351977, + "num_input_tokens_seen": 279631952, + "router_z_loss_mlp": 0.26782227, + "step": 3363, + "time_per_iteration": 2.479120969772339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068037, + "balance_loss_mlp": 1.04115558, + "epoch": 0.6471719892266257, + "flos": 803750335488.0, + "grad_norm": 0.06524076988807553, + "language_loss": 0.80934191, + "learning_rate": 0.00029244708123001917, + "loss": 0.82002234, + "num_input_tokens_seen": 279706880, + "router_z_loss_mlp": 0.26916504, + "step": 3364, + "time_per_iteration": 2.9441330432891846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068068, + "balance_loss_mlp": 1.04217577, + "epoch": 0.6473643709118891, + "flos": 577208001024.0, + "grad_norm": 0.06372584124812569, + "language_loss": 0.84562182, + "learning_rate": 0.0002921636887162565, + "loss": 0.8563025, + "num_input_tokens_seen": 279778864, + "router_z_loss_mlp": 0.25927734, + "step": 3365, + "time_per_iteration": 2.732980489730835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067865, + "balance_loss_mlp": 1.04277182, + "epoch": 0.6475567525971527, + "flos": 761420113920.0, + "grad_norm": 0.0749500155659675, + "language_loss": 0.83798963, + "learning_rate": 0.00029188037689170595, + "loss": 0.84866834, + "num_input_tokens_seen": 279853328, + "router_z_loss_mlp": 0.25109863, + "step": 3366, + "time_per_iteration": 2.9474096298217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068033, + "balance_loss_mlp": 1.04130602, + "epoch": 0.6477491342824163, + "flos": 843103116288.0, + "grad_norm": 0.06502471406083535, + "language_loss": 0.84043062, + "learning_rate": 0.0002915971458663586, + "loss": 0.85111088, + "num_input_tokens_seen": 279928464, + "router_z_loss_mlp": 0.26782227, + "step": 3367, + "time_per_iteration": 3.0719544887542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069519, + "balance_loss_mlp": 1.04331708, + "epoch": 0.6479415159676799, + "flos": 884820298752.0, + "grad_norm": 0.05257796695915082, + "language_loss": 0.81762713, + "learning_rate": 0.00029131399575017494, + "loss": 0.82832229, + "num_input_tokens_seen": 280015680, + "router_z_loss_mlp": 0.26245117, + "step": 3368, + "time_per_iteration": 3.195772171020508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071807, + "balance_loss_mlp": 1.04481828, + "epoch": 0.6481338976529435, + "flos": 615513116160.0, + "grad_norm": 0.05387315925396133, + "language_loss": 0.86003518, + "learning_rate": 0.0002910309266530836, + "loss": 0.87075323, + "num_input_tokens_seen": 280093904, + "router_z_loss_mlp": 0.27026367, + "step": 3369, + "time_per_iteration": 2.790093421936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075463, + "balance_loss_mlp": 1.04854584, + "epoch": 0.648326279338207, + "flos": 510009136128.0, + "grad_norm": 0.057981542205969905, + "language_loss": 0.85403055, + "learning_rate": 0.0002907479386849814, + "loss": 0.86478519, + "num_input_tokens_seen": 280161584, + "router_z_loss_mlp": 0.26977539, + "step": 3370, + "time_per_iteration": 2.628838062286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074904, + "balance_loss_mlp": 1.04926252, + "epoch": 0.6485186610234706, + "flos": 702498313728.0, + "grad_norm": 0.05712160703015161, + "language_loss": 0.80363882, + "learning_rate": 0.0002904650319557339, + "loss": 0.81438786, + "num_input_tokens_seen": 280248016, + "router_z_loss_mlp": 0.2565918, + "step": 3371, + "time_per_iteration": 2.9755005836486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073278, + "balance_loss_mlp": 1.04574049, + "epoch": 0.6487110427087341, + "flos": 560683233792.0, + "grad_norm": 0.07266117839515142, + "language_loss": 0.81511021, + "learning_rate": 0.0002901822065751758, + "loss": 0.82584298, + "num_input_tokens_seen": 280319024, + "router_z_loss_mlp": 0.27539062, + "step": 3372, + "time_per_iteration": 2.646740198135376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079407, + "balance_loss_mlp": 1.05310917, + "epoch": 0.6489034243939977, + "flos": 680100530688.0, + "grad_norm": 0.060084455172548096, + "language_loss": 0.85821176, + "learning_rate": 0.0002898994626531093, + "loss": 0.8690058, + "num_input_tokens_seen": 280393200, + "router_z_loss_mlp": 0.26318359, + "step": 3373, + "time_per_iteration": 2.8307554721832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079841, + "balance_loss_mlp": 1.05368662, + "epoch": 0.6490958060792612, + "flos": 474412018176.0, + "grad_norm": 0.06412256257505489, + "language_loss": 0.88422716, + "learning_rate": 0.00028961680029930526, + "loss": 0.89502561, + "num_input_tokens_seen": 280456944, + "router_z_loss_mlp": 0.26196289, + "step": 3374, + "time_per_iteration": 2.5427072048187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078278, + "balance_loss_mlp": 1.05246949, + "epoch": 0.6492881877645248, + "flos": 588850518528.0, + "grad_norm": 0.05984187516424017, + "language_loss": 0.77025837, + "learning_rate": 0.00028933421962350317, + "loss": 0.78104115, + "num_input_tokens_seen": 280534352, + "router_z_loss_mlp": 0.25830078, + "step": 3375, + "time_per_iteration": 2.732372999191284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076768, + "balance_loss_mlp": 1.05101824, + "epoch": 0.6494805694497884, + "flos": 642427905024.0, + "grad_norm": 0.06098588343511283, + "language_loss": 0.8395189, + "learning_rate": 0.0002890517207354104, + "loss": 0.8502866, + "num_input_tokens_seen": 280608912, + "router_z_loss_mlp": 0.2578125, + "step": 3376, + "time_per_iteration": 2.8559377193450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108488, + "balance_loss_mlp": 1.05872583, + "epoch": 0.649672951135052, + "flos": 531806736384.0, + "grad_norm": 0.061051185041057866, + "language_loss": 0.81743991, + "learning_rate": 0.0002887693037447029, + "loss": 0.82828867, + "num_input_tokens_seen": 280678848, + "router_z_loss_mlp": 0.26196289, + "step": 3377, + "time_per_iteration": 2.5842373371124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081133, + "balance_loss_mlp": 1.0550499, + "epoch": 0.6498653328203156, + "flos": 547387725312.0, + "grad_norm": 0.06328579672333946, + "language_loss": 0.82031405, + "learning_rate": 0.00028848696876102443, + "loss": 0.83112538, + "num_input_tokens_seen": 280750224, + "router_z_loss_mlp": 0.26086426, + "step": 3378, + "time_per_iteration": 2.6148552894592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085126, + "balance_loss_mlp": 1.05910289, + "epoch": 0.650057714505579, + "flos": 462228415488.0, + "grad_norm": 0.06296964534395977, + "language_loss": 0.83665496, + "learning_rate": 0.00028820471589398723, + "loss": 0.84750628, + "num_input_tokens_seen": 280817488, + "router_z_loss_mlp": 0.26062012, + "step": 3379, + "time_per_iteration": 2.5984256267547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087258, + "balance_loss_mlp": 1.06153309, + "epoch": 0.6502500961908426, + "flos": 510172121088.0, + "grad_norm": 0.06986995614305117, + "language_loss": 0.77549016, + "learning_rate": 0.00028792254525317196, + "loss": 0.78636277, + "num_input_tokens_seen": 280887440, + "router_z_loss_mlp": 0.25732422, + "step": 3380, + "time_per_iteration": 2.6670660972595215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091352, + "balance_loss_mlp": 1.06519723, + "epoch": 0.6504424778761062, + "flos": 579827165184.0, + "grad_norm": 0.07163565487878029, + "language_loss": 0.81605381, + "learning_rate": 0.00028764045694812645, + "loss": 0.82696736, + "num_input_tokens_seen": 280959072, + "router_z_loss_mlp": 0.26159668, + "step": 3381, + "time_per_iteration": 2.7534923553466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108911, + "balance_loss_mlp": 1.06213295, + "epoch": 0.6506348595613698, + "flos": 519457577472.0, + "grad_norm": 0.07829383117608732, + "language_loss": 0.76753044, + "learning_rate": 0.0002873584510883671, + "loss": 0.77842152, + "num_input_tokens_seen": 281025376, + "router_z_loss_mlp": 0.26989746, + "step": 3382, + "time_per_iteration": 2.5738234519958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089393, + "balance_loss_mlp": 1.0616889, + "epoch": 0.6508272412466333, + "flos": 510310513152.0, + "grad_norm": 0.05561178380226362, + "language_loss": 0.86494762, + "learning_rate": 0.0002870765277833788, + "loss": 0.87584156, + "num_input_tokens_seen": 281097616, + "router_z_loss_mlp": 0.27709961, + "step": 3383, + "time_per_iteration": 2.6669375896453857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080554, + "balance_loss_mlp": 1.05552006, + "epoch": 0.6510196229318969, + "flos": 625623782400.0, + "grad_norm": 0.06569130604090773, + "language_loss": 0.80749148, + "learning_rate": 0.00028679468714261347, + "loss": 0.81829703, + "num_input_tokens_seen": 281170192, + "router_z_loss_mlp": 0.25048828, + "step": 3384, + "time_per_iteration": 2.7443134784698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078779, + "balance_loss_mlp": 1.05354261, + "epoch": 0.6512120046171604, + "flos": 474696142848.0, + "grad_norm": 0.06683297733149338, + "language_loss": 0.76978695, + "learning_rate": 0.0002865129292754918, + "loss": 0.7805748, + "num_input_tokens_seen": 281238832, + "router_z_loss_mlp": 0.25256348, + "step": 3385, + "time_per_iteration": 2.553633213043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077167, + "balance_loss_mlp": 1.05206108, + "epoch": 0.651404386302424, + "flos": 551854798848.0, + "grad_norm": 0.07067523232573529, + "language_loss": 0.81812489, + "learning_rate": 0.00028623125429140105, + "loss": 0.82889658, + "num_input_tokens_seen": 281319472, + "router_z_loss_mlp": 0.25097656, + "step": 3386, + "time_per_iteration": 2.8174142837524414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081783, + "balance_loss_mlp": 1.05555665, + "epoch": 0.6515967679876876, + "flos": 523311985152.0, + "grad_norm": 0.06558978791095729, + "language_loss": 0.8706044, + "learning_rate": 0.00028594966229969785, + "loss": 0.88142228, + "num_input_tokens_seen": 281391168, + "router_z_loss_mlp": 0.2623291, + "step": 3387, + "time_per_iteration": 2.680281639099121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078521, + "balance_loss_mlp": 1.05267668, + "epoch": 0.6517891496729511, + "flos": 573874854912.0, + "grad_norm": 0.06492635522068706, + "language_loss": 0.81586945, + "learning_rate": 0.00028566815340970577, + "loss": 0.82665467, + "num_input_tokens_seen": 281465664, + "router_z_loss_mlp": 0.25878906, + "step": 3388, + "time_per_iteration": 2.732487916946411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075271, + "balance_loss_mlp": 1.05048704, + "epoch": 0.6519815313582147, + "flos": 555926893056.0, + "grad_norm": 0.06387919000258871, + "language_loss": 0.81219792, + "learning_rate": 0.0002853867277307162, + "loss": 0.8229506, + "num_input_tokens_seen": 281532928, + "router_z_loss_mlp": 0.2479248, + "step": 3389, + "time_per_iteration": 2.6404130458831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081113, + "balance_loss_mlp": 1.05424297, + "epoch": 0.6521739130434783, + "flos": 480487666176.0, + "grad_norm": 0.06082372499882378, + "language_loss": 0.82760382, + "learning_rate": 0.00028510538537198824, + "loss": 0.83841497, + "num_input_tokens_seen": 281601680, + "router_z_loss_mlp": 0.26928711, + "step": 3390, + "time_per_iteration": 2.5929770469665527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079433, + "balance_loss_mlp": 1.05408919, + "epoch": 0.6523662947287419, + "flos": 665707797504.0, + "grad_norm": 0.055684590981588886, + "language_loss": 0.86515808, + "learning_rate": 0.00028482412644274867, + "loss": 0.87595236, + "num_input_tokens_seen": 281679488, + "router_z_loss_mlp": 0.25366211, + "step": 3391, + "time_per_iteration": 2.9085311889648438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074445, + "balance_loss_mlp": 1.04809964, + "epoch": 0.6525586764140053, + "flos": 548655275520.0, + "grad_norm": 0.061522898278110257, + "language_loss": 0.74154258, + "learning_rate": 0.00028454295105219207, + "loss": 0.75228703, + "num_input_tokens_seen": 281751056, + "router_z_loss_mlp": 0.26367188, + "step": 3392, + "time_per_iteration": 2.604851245880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075224, + "balance_loss_mlp": 1.05011857, + "epoch": 0.6527510580992689, + "flos": 802900159488.0, + "grad_norm": 0.04678981860923424, + "language_loss": 0.79518068, + "learning_rate": 0.0002842618593094802, + "loss": 0.805933, + "num_input_tokens_seen": 281841008, + "router_z_loss_mlp": 0.25134277, + "step": 3393, + "time_per_iteration": 3.0968527793884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073633, + "balance_loss_mlp": 1.04734683, + "epoch": 0.6529434397845325, + "flos": 671166010368.0, + "grad_norm": 0.08516397934584916, + "language_loss": 0.80839396, + "learning_rate": 0.00028398085132374243, + "loss": 0.8191303, + "num_input_tokens_seen": 281908016, + "router_z_loss_mlp": 0.26306152, + "step": 3394, + "time_per_iteration": 2.802588701248169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071874, + "balance_loss_mlp": 1.04662573, + "epoch": 0.6531358214697961, + "flos": 828409006080.0, + "grad_norm": 0.059849085460161155, + "language_loss": 0.84382617, + "learning_rate": 0.0002836999272040761, + "loss": 0.85454488, + "num_input_tokens_seen": 281989072, + "router_z_loss_mlp": 0.25268555, + "step": 3395, + "time_per_iteration": 3.1209001541137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073974, + "balance_loss_mlp": 1.04781914, + "epoch": 0.6533282031550597, + "flos": 487403578368.0, + "grad_norm": 0.07079508853897194, + "language_loss": 0.84454936, + "learning_rate": 0.00028341908705954575, + "loss": 0.8552891, + "num_input_tokens_seen": 282053152, + "router_z_loss_mlp": 0.26196289, + "step": 3396, + "time_per_iteration": 2.5430474281311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014992, + "balance_loss_mlp": 1.00736308, + "epoch": 0.6535205848403232, + "flos": 1557744638976.0, + "grad_norm": 0.020137853963587818, + "language_loss": 0.81761813, + "learning_rate": 0.00028313833099918265, + "loss": 0.82776797, + "num_input_tokens_seen": 282283984, + "router_z_loss_mlp": 0.07617188, + "step": 3397, + "time_per_iteration": 4.857236862182617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073644, + "balance_loss_mlp": 1.04739439, + "epoch": 0.6537129665255867, + "flos": 493711593984.0, + "grad_norm": 0.05698390619804648, + "language_loss": 0.78328836, + "learning_rate": 0.00028285765913198604, + "loss": 0.79402483, + "num_input_tokens_seen": 282353008, + "router_z_loss_mlp": 0.26269531, + "step": 3398, + "time_per_iteration": 2.542471408843994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076746, + "balance_loss_mlp": 1.05030537, + "epoch": 0.6539053482108503, + "flos": 605002328064.0, + "grad_norm": 0.05420440820194427, + "language_loss": 0.821926, + "learning_rate": 0.0002825770715669227, + "loss": 0.83269352, + "num_input_tokens_seen": 282427648, + "router_z_loss_mlp": 0.26489258, + "step": 3399, + "time_per_iteration": 2.718555450439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106936, + "balance_loss_mlp": 1.04285991, + "epoch": 0.6540977298961139, + "flos": 577778821632.0, + "grad_norm": 0.06072932855544304, + "language_loss": 0.81462443, + "learning_rate": 0.00028229656841292634, + "loss": 0.82531804, + "num_input_tokens_seen": 282502128, + "router_z_loss_mlp": 0.26525879, + "step": 3400, + "time_per_iteration": 2.6755053997039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074211, + "balance_loss_mlp": 1.04766357, + "epoch": 0.6542901115813774, + "flos": 511753531392.0, + "grad_norm": 0.06986785605378391, + "language_loss": 0.762591, + "learning_rate": 0.0002820161497788979, + "loss": 0.77333307, + "num_input_tokens_seen": 282569360, + "router_z_loss_mlp": 0.265625, + "step": 3401, + "time_per_iteration": 2.56740140914917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076961, + "balance_loss_mlp": 1.05193925, + "epoch": 0.654482493266641, + "flos": 625495302144.0, + "grad_norm": 0.05855571008796804, + "language_loss": 0.87057543, + "learning_rate": 0.00028173581577370545, + "loss": 0.88134497, + "num_input_tokens_seen": 282645472, + "router_z_loss_mlp": 0.25036621, + "step": 3402, + "time_per_iteration": 2.7579104900360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074514, + "balance_loss_mlp": 1.04957581, + "epoch": 0.6546748749519046, + "flos": 523981550592.0, + "grad_norm": 0.05140393354142716, + "language_loss": 0.79220372, + "learning_rate": 0.0002814555665061844, + "loss": 0.80294883, + "num_input_tokens_seen": 282717568, + "router_z_loss_mlp": 0.24938965, + "step": 3403, + "time_per_iteration": 2.7005770206451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078214, + "balance_loss_mlp": 1.05273879, + "epoch": 0.6548672566371682, + "flos": 479210204160.0, + "grad_norm": 0.06470448826772422, + "language_loss": 0.77704948, + "learning_rate": 0.00028117540208513715, + "loss": 0.7878316, + "num_input_tokens_seen": 282791408, + "router_z_loss_mlp": 0.25476074, + "step": 3404, + "time_per_iteration": 2.6598384380340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078927, + "balance_loss_mlp": 1.05403566, + "epoch": 0.6550596383224317, + "flos": 616012356096.0, + "grad_norm": 0.06510521460794984, + "language_loss": 0.84932673, + "learning_rate": 0.00028089532261933313, + "loss": 0.860116, + "num_input_tokens_seen": 282862992, + "router_z_loss_mlp": 0.24890137, + "step": 3405, + "time_per_iteration": 2.693470001220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107722, + "balance_loss_mlp": 1.05179238, + "epoch": 0.6552520200076952, + "flos": 488836684800.0, + "grad_norm": 0.06574306959894075, + "language_loss": 0.85646415, + "learning_rate": 0.0002806153282175087, + "loss": 0.86723638, + "num_input_tokens_seen": 282930448, + "router_z_loss_mlp": 0.25439453, + "step": 3406, + "time_per_iteration": 2.5597920417785645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079604, + "balance_loss_mlp": 1.05415273, + "epoch": 0.6554444016929588, + "flos": 687619196928.0, + "grad_norm": 0.06979692390704297, + "language_loss": 0.83091819, + "learning_rate": 0.0002803354189883679, + "loss": 0.84171414, + "num_input_tokens_seen": 283010864, + "router_z_loss_mlp": 0.2545166, + "step": 3407, + "time_per_iteration": 2.8204212188720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078804, + "balance_loss_mlp": 1.05349612, + "epoch": 0.6556367833782224, + "flos": 543051330048.0, + "grad_norm": 0.05468628056475838, + "language_loss": 0.85987842, + "learning_rate": 0.00028005559504058053, + "loss": 0.8706665, + "num_input_tokens_seen": 283082240, + "router_z_loss_mlp": 0.2532959, + "step": 3408, + "time_per_iteration": 2.693559408187866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076206, + "balance_loss_mlp": 1.05038548, + "epoch": 0.655829165063486, + "flos": 673535554560.0, + "grad_norm": 0.07417771883494789, + "language_loss": 0.7684713, + "learning_rate": 0.0002797758564827838, + "loss": 0.77923334, + "num_input_tokens_seen": 283156656, + "router_z_loss_mlp": 0.25842285, + "step": 3409, + "time_per_iteration": 2.802828788757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084728, + "balance_loss_mlp": 1.05920529, + "epoch": 0.6560215467487496, + "flos": 531806736384.0, + "grad_norm": 0.06335346821862926, + "language_loss": 0.83560646, + "learning_rate": 0.0002794962034235824, + "loss": 0.84645367, + "num_input_tokens_seen": 283223584, + "router_z_loss_mlp": 0.25537109, + "step": 3410, + "time_per_iteration": 2.6147637367248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108148, + "balance_loss_mlp": 1.05519438, + "epoch": 0.656213928434013, + "flos": 591311467008.0, + "grad_norm": 0.06069626440640027, + "language_loss": 0.74793261, + "learning_rate": 0.00027921663597154695, + "loss": 0.7587474, + "num_input_tokens_seen": 283297680, + "router_z_loss_mlp": 0.26281738, + "step": 3411, + "time_per_iteration": 2.7347841262817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081407, + "balance_loss_mlp": 1.05633736, + "epoch": 0.6564063101192766, + "flos": 415786825728.0, + "grad_norm": 0.07186540610549816, + "language_loss": 0.81030178, + "learning_rate": 0.00027893715423521525, + "loss": 0.82111579, + "num_input_tokens_seen": 283359744, + "router_z_loss_mlp": 0.25085449, + "step": 3412, + "time_per_iteration": 2.4426064491271973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090629, + "balance_loss_mlp": 1.06429613, + "epoch": 0.6565986918045402, + "flos": 453321059328.0, + "grad_norm": 0.057164257181416274, + "language_loss": 0.83953196, + "learning_rate": 0.00027865775832309163, + "loss": 0.85043824, + "num_input_tokens_seen": 283430688, + "router_z_loss_mlp": 0.26379395, + "step": 3413, + "time_per_iteration": 2.661008358001709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089453, + "balance_loss_mlp": 1.06320286, + "epoch": 0.6567910734898038, + "flos": 547746001920.0, + "grad_norm": 0.059355909745470246, + "language_loss": 0.86547339, + "learning_rate": 0.00027837844834364733, + "loss": 0.87636793, + "num_input_tokens_seen": 283498048, + "router_z_loss_mlp": 0.26269531, + "step": 3414, + "time_per_iteration": 2.6107146739959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108986, + "balance_loss_mlp": 1.06451583, + "epoch": 0.6569834551750673, + "flos": 655518210048.0, + "grad_norm": 0.058864717061538036, + "language_loss": 0.86578488, + "learning_rate": 0.00027809922440532, + "loss": 0.87668347, + "num_input_tokens_seen": 283573040, + "router_z_loss_mlp": 0.25366211, + "step": 3415, + "time_per_iteration": 2.8214099407196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085792, + "balance_loss_mlp": 1.05929208, + "epoch": 0.6571758368603309, + "flos": 539681107968.0, + "grad_norm": 0.06707421916858435, + "language_loss": 0.80825239, + "learning_rate": 0.00027782008661651406, + "loss": 0.81911027, + "num_input_tokens_seen": 283651696, + "router_z_loss_mlp": 0.26513672, + "step": 3416, + "time_per_iteration": 2.772441864013672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087765, + "balance_loss_mlp": 1.06200361, + "epoch": 0.6573682185455945, + "flos": 497346117120.0, + "grad_norm": 0.054600094461814935, + "language_loss": 0.87535822, + "learning_rate": 0.00027754103508560013, + "loss": 0.88623583, + "num_input_tokens_seen": 283721824, + "router_z_loss_mlp": 0.25769043, + "step": 3417, + "time_per_iteration": 2.5883491039276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108732, + "balance_loss_mlp": 1.06016374, + "epoch": 0.657560600230858, + "flos": 447465295872.0, + "grad_norm": 0.057346286211937464, + "language_loss": 0.83059859, + "learning_rate": 0.0002772620699209163, + "loss": 0.84147179, + "num_input_tokens_seen": 283786960, + "router_z_loss_mlp": 0.27197266, + "step": 3418, + "time_per_iteration": 2.560173988342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080511, + "balance_loss_mlp": 1.05552435, + "epoch": 0.6577529819161216, + "flos": 481940596224.0, + "grad_norm": 0.07342594011001312, + "language_loss": 0.80011356, + "learning_rate": 0.0002769831912307658, + "loss": 0.81091869, + "num_input_tokens_seen": 283853808, + "router_z_loss_mlp": 0.24987793, + "step": 3419, + "time_per_iteration": 2.5090081691741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077943, + "balance_loss_mlp": 1.05116832, + "epoch": 0.6579453636013851, + "flos": 530843134464.0, + "grad_norm": 0.15397597060543888, + "language_loss": 0.80397606, + "learning_rate": 0.00027670439912341917, + "loss": 0.81475556, + "num_input_tokens_seen": 283920960, + "router_z_loss_mlp": 0.26782227, + "step": 3420, + "time_per_iteration": 2.6002025604248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107857, + "balance_loss_mlp": 1.05198634, + "epoch": 0.6581377452866487, + "flos": 628037743104.0, + "grad_norm": 0.05399899267227409, + "language_loss": 0.83793807, + "learning_rate": 0.0002764256937071129, + "loss": 0.84872377, + "num_input_tokens_seen": 283992416, + "router_z_loss_mlp": 0.26611328, + "step": 3421, + "time_per_iteration": 2.7873942852020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074524, + "balance_loss_mlp": 1.04920375, + "epoch": 0.6583301269719123, + "flos": 548618199552.0, + "grad_norm": 0.0598445160882451, + "language_loss": 0.87503046, + "learning_rate": 0.00027614707509005036, + "loss": 0.88577569, + "num_input_tokens_seen": 284061760, + "router_z_loss_mlp": 0.25341797, + "step": 3422, + "time_per_iteration": 2.659196615219116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079222, + "balance_loss_mlp": 1.05353248, + "epoch": 0.6585225086571759, + "flos": 427493583360.0, + "grad_norm": 0.05796849455801806, + "language_loss": 0.79051846, + "learning_rate": 0.0002758685433804008, + "loss": 0.80131066, + "num_input_tokens_seen": 284124848, + "router_z_loss_mlp": 0.25695801, + "step": 3423, + "time_per_iteration": 2.5024282932281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074298, + "balance_loss_mlp": 1.04835773, + "epoch": 0.6587148903424394, + "flos": 859620542976.0, + "grad_norm": 0.06008115307148776, + "language_loss": 0.79408616, + "learning_rate": 0.00027559009868630005, + "loss": 0.80482912, + "num_input_tokens_seen": 284206272, + "router_z_loss_mlp": 0.25964355, + "step": 3424, + "time_per_iteration": 3.0929386615753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073444, + "balance_loss_mlp": 1.0477066, + "epoch": 0.6589072720277029, + "flos": 805630551552.0, + "grad_norm": 0.05902981727550509, + "language_loss": 0.80511308, + "learning_rate": 0.0002753117411158491, + "loss": 0.81584746, + "num_input_tokens_seen": 284293696, + "router_z_loss_mlp": 0.25744629, + "step": 3425, + "time_per_iteration": 3.0452723503112793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083204, + "balance_loss_mlp": 1.05611944, + "epoch": 0.6590996537129665, + "flos": 548618199552.0, + "grad_norm": 0.053958914804285704, + "language_loss": 0.8972351, + "learning_rate": 0.0002750334707771168, + "loss": 0.90806711, + "num_input_tokens_seen": 284360192, + "router_z_loss_mlp": 0.27124023, + "step": 3426, + "time_per_iteration": 2.626776695251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108887, + "balance_loss_mlp": 1.06247699, + "epoch": 0.6592920353982301, + "flos": 454166092800.0, + "grad_norm": 0.06696403596262077, + "language_loss": 0.81474262, + "learning_rate": 0.0002747552877781369, + "loss": 0.82563138, + "num_input_tokens_seen": 284423680, + "router_z_loss_mlp": 0.26367188, + "step": 3427, + "time_per_iteration": 2.49870228767395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082041, + "balance_loss_mlp": 1.05622029, + "epoch": 0.6594844170834937, + "flos": 567174057984.0, + "grad_norm": 0.056641096462852314, + "language_loss": 0.82350707, + "learning_rate": 0.0002744771922269097, + "loss": 0.83432746, + "num_input_tokens_seen": 284495712, + "router_z_loss_mlp": 0.25805664, + "step": 3428, + "time_per_iteration": 2.76737117767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083165, + "balance_loss_mlp": 1.05777287, + "epoch": 0.6596767987687572, + "flos": 1187911194624.0, + "grad_norm": 0.05792922212348718, + "language_loss": 0.82232559, + "learning_rate": 0.0002741991842314015, + "loss": 0.83315718, + "num_input_tokens_seen": 284583440, + "router_z_loss_mlp": 0.25415039, + "step": 3429, + "time_per_iteration": 3.4959795475006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082387, + "balance_loss_mlp": 1.05617321, + "epoch": 0.6598691804540208, + "flos": 503491147776.0, + "grad_norm": 0.05913775342689391, + "language_loss": 0.86208242, + "learning_rate": 0.0002739212638995445, + "loss": 0.87290633, + "num_input_tokens_seen": 284649168, + "router_z_loss_mlp": 0.26220703, + "step": 3430, + "time_per_iteration": 2.552647113800049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091567, + "balance_loss_mlp": 1.06441104, + "epoch": 0.6600615621392844, + "flos": 531337231872.0, + "grad_norm": 0.06592703083279383, + "language_loss": 0.83386678, + "learning_rate": 0.00027364343133923696, + "loss": 0.84478247, + "num_input_tokens_seen": 284723136, + "router_z_loss_mlp": 0.27172852, + "step": 3431, + "time_per_iteration": 2.639110565185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080701, + "balance_loss_mlp": 1.05480886, + "epoch": 0.6602539438245479, + "flos": 565446915072.0, + "grad_norm": 0.06195217340834915, + "language_loss": 0.8308382, + "learning_rate": 0.0002733656866583431, + "loss": 0.84164518, + "num_input_tokens_seen": 284792752, + "router_z_loss_mlp": 0.25927734, + "step": 3432, + "time_per_iteration": 2.6898815631866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091031, + "balance_loss_mlp": 1.0637325, + "epoch": 0.6604463255098114, + "flos": 857159594496.0, + "grad_norm": 0.07646297806496907, + "language_loss": 0.83208609, + "learning_rate": 0.0002730880299646927, + "loss": 0.84299648, + "num_input_tokens_seen": 284871008, + "router_z_loss_mlp": 0.27307129, + "step": 3433, + "time_per_iteration": 3.0324153900146484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086317, + "balance_loss_mlp": 1.06028199, + "epoch": 0.660638707195075, + "flos": 674462080512.0, + "grad_norm": 0.09642118703773885, + "language_loss": 0.85385412, + "learning_rate": 0.0002728104613660821, + "loss": 0.8647173, + "num_input_tokens_seen": 284945184, + "router_z_loss_mlp": 0.26074219, + "step": 3434, + "time_per_iteration": 2.8242013454437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082794, + "balance_loss_mlp": 1.0578196, + "epoch": 0.6608310888803386, + "flos": 888961402368.0, + "grad_norm": 0.06046346369252319, + "language_loss": 0.83065814, + "learning_rate": 0.0002725329809702729, + "loss": 0.8414861, + "num_input_tokens_seen": 285029296, + "router_z_loss_mlp": 0.25012207, + "step": 3435, + "time_per_iteration": 3.208373546600342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086877, + "balance_loss_mlp": 1.06015027, + "epoch": 0.6610234705656022, + "flos": 1136347646976.0, + "grad_norm": 0.06729202687842574, + "language_loss": 0.76439357, + "learning_rate": 0.0002722555888849921, + "loss": 0.77526236, + "num_input_tokens_seen": 285124720, + "router_z_loss_mlp": 0.26757812, + "step": 3436, + "time_per_iteration": 3.455219030380249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108327, + "balance_loss_mlp": 1.05748534, + "epoch": 0.6612158522508658, + "flos": 468012598272.0, + "grad_norm": 0.06326694519745679, + "language_loss": 0.80694687, + "learning_rate": 0.00027197828521793334, + "loss": 0.8177796, + "num_input_tokens_seen": 285191360, + "router_z_loss_mlp": 0.25793457, + "step": 3437, + "time_per_iteration": 2.500117301940918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086414, + "balance_loss_mlp": 1.06089163, + "epoch": 0.6614082339361292, + "flos": 571653614592.0, + "grad_norm": 0.06352548474841713, + "language_loss": 0.84948212, + "learning_rate": 0.0002717010700767552, + "loss": 0.86034626, + "num_input_tokens_seen": 285262624, + "router_z_loss_mlp": 0.25549316, + "step": 3438, + "time_per_iteration": 2.7301025390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088348, + "balance_loss_mlp": 1.06205106, + "epoch": 0.6616006156213928, + "flos": 498467934720.0, + "grad_norm": 0.06533223637533662, + "language_loss": 0.75988388, + "learning_rate": 0.00027142394356908226, + "loss": 0.77076733, + "num_input_tokens_seen": 285328512, + "router_z_loss_mlp": 0.26318359, + "step": 3439, + "time_per_iteration": 2.5677285194396973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086811, + "balance_loss_mlp": 1.06116903, + "epoch": 0.6617929973066564, + "flos": 602420239872.0, + "grad_norm": 0.0569940621311471, + "language_loss": 0.85089839, + "learning_rate": 0.00027114690580250456, + "loss": 0.86176658, + "num_input_tokens_seen": 285406128, + "router_z_loss_mlp": 0.25646973, + "step": 3440, + "time_per_iteration": 2.738121509552002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093124, + "balance_loss_mlp": 1.06724405, + "epoch": 0.66198537899192, + "flos": 522983443968.0, + "grad_norm": 0.05472871432656112, + "language_loss": 0.86912161, + "learning_rate": 0.0002708699568845776, + "loss": 0.88005286, + "num_input_tokens_seen": 285474704, + "router_z_loss_mlp": 0.25891113, + "step": 3441, + "time_per_iteration": 2.611889600753784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041326, + "balance_loss_mlp": 1.03374481, + "epoch": 0.6621777606771835, + "flos": 1566256642560.0, + "grad_norm": 0.021890830835033067, + "language_loss": 0.79287779, + "learning_rate": 0.00027059309692282265, + "loss": 0.80329108, + "num_input_tokens_seen": 285698704, + "router_z_loss_mlp": 0.07568359, + "step": 3442, + "time_per_iteration": 4.8971052169799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090068, + "balance_loss_mlp": 1.06495047, + "epoch": 0.6623701423624471, + "flos": 526664954880.0, + "grad_norm": 0.064050238945667, + "language_loss": 0.83170366, + "learning_rate": 0.0002703163260247261, + "loss": 0.8426044, + "num_input_tokens_seen": 285767936, + "router_z_loss_mlp": 0.25134277, + "step": 3443, + "time_per_iteration": 2.5994081497192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109146, + "balance_loss_mlp": 1.06699824, + "epoch": 0.6625625240477107, + "flos": 528179553792.0, + "grad_norm": 0.06534456788919288, + "language_loss": 0.81938642, + "learning_rate": 0.0002700396442977399, + "loss": 0.83030105, + "num_input_tokens_seen": 285839456, + "router_z_loss_mlp": 0.24462891, + "step": 3444, + "time_per_iteration": 2.6017937660217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091879, + "balance_loss_mlp": 1.06627333, + "epoch": 0.6627549057329742, + "flos": 473122073088.0, + "grad_norm": 0.06451262067496133, + "language_loss": 0.84422678, + "learning_rate": 0.0002697630518492817, + "loss": 0.85514563, + "num_input_tokens_seen": 285905904, + "router_z_loss_mlp": 0.25634766, + "step": 3445, + "time_per_iteration": 2.628159523010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094697, + "balance_loss_mlp": 1.06956816, + "epoch": 0.6629472874182378, + "flos": 527996745216.0, + "grad_norm": 0.05416253097531709, + "language_loss": 0.85508287, + "learning_rate": 0.0002694865487867343, + "loss": 0.8660298, + "num_input_tokens_seen": 285975520, + "router_z_loss_mlp": 0.25134277, + "step": 3446, + "time_per_iteration": 2.604813814163208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088071, + "balance_loss_mlp": 1.06316853, + "epoch": 0.6631396691035013, + "flos": 613200471552.0, + "grad_norm": 0.052847331110623744, + "language_loss": 0.84946668, + "learning_rate": 0.0002692101352174453, + "loss": 0.86034739, + "num_input_tokens_seen": 286050320, + "router_z_loss_mlp": 0.24914551, + "step": 3447, + "time_per_iteration": 2.768223285675049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109622, + "balance_loss_mlp": 1.06981492, + "epoch": 0.6633320507887649, + "flos": 609318899712.0, + "grad_norm": 0.058874726069321814, + "language_loss": 0.8497262, + "learning_rate": 0.00026893381124872787, + "loss": 0.86068839, + "num_input_tokens_seen": 286120672, + "router_z_loss_mlp": 0.26452637, + "step": 3448, + "time_per_iteration": 2.6762025356292725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090166, + "balance_loss_mlp": 1.06560886, + "epoch": 0.6635244324740285, + "flos": 749700873216.0, + "grad_norm": 0.057817999010546496, + "language_loss": 0.80621779, + "learning_rate": 0.00026865757698786097, + "loss": 0.81711942, + "num_input_tokens_seen": 286201152, + "router_z_loss_mlp": 0.24584961, + "step": 3449, + "time_per_iteration": 3.0353593826293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088537, + "balance_loss_mlp": 1.06256163, + "epoch": 0.6637168141592921, + "flos": 664526882304.0, + "grad_norm": 0.06325061387502293, + "language_loss": 0.81828356, + "learning_rate": 0.000268381432542088, + "loss": 0.82916903, + "num_input_tokens_seen": 286274512, + "router_z_loss_mlp": 0.26000977, + "step": 3450, + "time_per_iteration": 2.8381845951080322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085731, + "balance_loss_mlp": 1.05967212, + "epoch": 0.6639091958445555, + "flos": 606783799296.0, + "grad_norm": 0.06107082028806233, + "language_loss": 0.80140352, + "learning_rate": 0.00026810537801861807, + "loss": 0.81226087, + "num_input_tokens_seen": 286349808, + "router_z_loss_mlp": 0.26074219, + "step": 3451, + "time_per_iteration": 2.755697727203369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091691, + "balance_loss_mlp": 1.06455863, + "epoch": 0.6641015775298191, + "flos": 476697498624.0, + "grad_norm": 0.05182534623872074, + "language_loss": 0.8148368, + "learning_rate": 0.0002678294135246243, + "loss": 0.82575375, + "num_input_tokens_seen": 286422912, + "router_z_loss_mlp": 0.27087402, + "step": 3452, + "time_per_iteration": 2.7235701084136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077401, + "balance_loss_mlp": 1.05224776, + "epoch": 0.6642939592150827, + "flos": 904115105280.0, + "grad_norm": 0.07490988727173932, + "language_loss": 0.86671561, + "learning_rate": 0.0002675535391672463, + "loss": 0.87748969, + "num_input_tokens_seen": 286501072, + "router_z_loss_mlp": 0.25170898, + "step": 3453, + "time_per_iteration": 3.0891692638397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080209, + "balance_loss_mlp": 1.05430508, + "epoch": 0.6644863409003463, + "flos": 581808697344.0, + "grad_norm": 0.05695144440492774, + "language_loss": 0.86551011, + "learning_rate": 0.0002672777550535877, + "loss": 0.8763122, + "num_input_tokens_seen": 286580480, + "router_z_loss_mlp": 0.25939941, + "step": 3454, + "time_per_iteration": 2.7647364139556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078097, + "balance_loss_mlp": 1.05288386, + "epoch": 0.6646787225856099, + "flos": 479002802688.0, + "grad_norm": 0.06003914399103326, + "language_loss": 0.85505843, + "learning_rate": 0.00026700206129071747, + "loss": 0.86583936, + "num_input_tokens_seen": 286646208, + "router_z_loss_mlp": 0.25231934, + "step": 3455, + "time_per_iteration": 2.5821306705474854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078808, + "balance_loss_mlp": 1.05316663, + "epoch": 0.6648711042708734, + "flos": 449906420736.0, + "grad_norm": 0.06471391174754697, + "language_loss": 0.88815629, + "learning_rate": 0.00026672645798566925, + "loss": 0.89894438, + "num_input_tokens_seen": 286710624, + "router_z_loss_mlp": 0.25671387, + "step": 3456, + "time_per_iteration": 2.536905288696289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073004, + "balance_loss_mlp": 1.04708791, + "epoch": 0.665063485956137, + "flos": 858960516096.0, + "grad_norm": 0.06322098786419635, + "language_loss": 0.79450369, + "learning_rate": 0.00026645094524544225, + "loss": 0.80523372, + "num_input_tokens_seen": 286799472, + "router_z_loss_mlp": 0.25927734, + "step": 3457, + "time_per_iteration": 3.346942663192749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080341, + "balance_loss_mlp": 1.05416238, + "epoch": 0.6652558676414005, + "flos": 604312939008.0, + "grad_norm": 0.07380509782774128, + "language_loss": 0.75270724, + "learning_rate": 0.00026617552317699945, + "loss": 0.76351058, + "num_input_tokens_seen": 286874752, + "router_z_loss_mlp": 0.26220703, + "step": 3458, + "time_per_iteration": 2.8174753189086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075769, + "balance_loss_mlp": 1.05087817, + "epoch": 0.6654482493266641, + "flos": 510394576896.0, + "grad_norm": 0.06466167118068906, + "language_loss": 0.87317026, + "learning_rate": 0.0002659001918872693, + "loss": 0.88392794, + "num_input_tokens_seen": 286943312, + "router_z_loss_mlp": 0.24890137, + "step": 3459, + "time_per_iteration": 2.620330810546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078289, + "balance_loss_mlp": 1.0529331, + "epoch": 0.6656406310119277, + "flos": 565605130752.0, + "grad_norm": 0.06328415655418428, + "language_loss": 0.81001127, + "learning_rate": 0.0002656249514831449, + "loss": 0.82079417, + "num_input_tokens_seen": 287010000, + "router_z_loss_mlp": 0.25378418, + "step": 3460, + "time_per_iteration": 2.6549599170684814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079023, + "balance_loss_mlp": 1.05377483, + "epoch": 0.6658330126971912, + "flos": 1024298141184.0, + "grad_norm": 0.054463111692168976, + "language_loss": 0.86972237, + "learning_rate": 0.00026534980207148416, + "loss": 0.8805126, + "num_input_tokens_seen": 287101456, + "router_z_loss_mlp": 0.25256348, + "step": 3461, + "time_per_iteration": 3.424241065979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086135, + "balance_loss_mlp": 1.05996895, + "epoch": 0.6660253943824548, + "flos": 816823388160.0, + "grad_norm": 0.06786500256083805, + "language_loss": 0.7389307, + "learning_rate": 0.0002650747437591097, + "loss": 0.7497921, + "num_input_tokens_seen": 287182848, + "router_z_loss_mlp": 0.26208496, + "step": 3462, + "time_per_iteration": 3.037792921066284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020491, + "balance_loss_mlp": 1.01310015, + "epoch": 0.6662177760677184, + "flos": 1496169169920.0, + "grad_norm": 0.010691660665593496, + "language_loss": 0.8187958, + "learning_rate": 0.00026479977665280806, + "loss": 0.82900071, + "num_input_tokens_seen": 287417920, + "router_z_loss_mlp": 0.07373047, + "step": 3463, + "time_per_iteration": 5.019932985305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077653, + "balance_loss_mlp": 1.05172443, + "epoch": 0.666410157752982, + "flos": 500120925696.0, + "grad_norm": 0.0677151355970307, + "language_loss": 0.86401796, + "learning_rate": 0.00026452490085933155, + "loss": 0.87479448, + "num_input_tokens_seen": 287483776, + "router_z_loss_mlp": 0.25952148, + "step": 3464, + "time_per_iteration": 2.577608346939087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010794, + "balance_loss_mlp": 1.05381727, + "epoch": 0.6666025394382454, + "flos": 481169714688.0, + "grad_norm": 0.06950705493870243, + "language_loss": 0.90135396, + "learning_rate": 0.00026425011648539614, + "loss": 0.91214788, + "num_input_tokens_seen": 287548176, + "router_z_loss_mlp": 0.25622559, + "step": 3465, + "time_per_iteration": 2.5207860469818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078468, + "balance_loss_mlp": 1.0527184, + "epoch": 0.666794921123509, + "flos": 546653919744.0, + "grad_norm": 0.06360289256438866, + "language_loss": 0.83105028, + "learning_rate": 0.00026397542363768267, + "loss": 0.84183496, + "num_input_tokens_seen": 287618496, + "router_z_loss_mlp": 0.25769043, + "step": 3466, + "time_per_iteration": 2.662781238555908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081527, + "balance_loss_mlp": 1.05476463, + "epoch": 0.6669873028087726, + "flos": 471988145664.0, + "grad_norm": 0.11778132677194894, + "language_loss": 0.8209849, + "learning_rate": 0.0002637008224228362, + "loss": 0.83180016, + "num_input_tokens_seen": 287684032, + "router_z_loss_mlp": 0.26794434, + "step": 3467, + "time_per_iteration": 2.5543577671051025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084486, + "balance_loss_mlp": 1.05868888, + "epoch": 0.6671796844940362, + "flos": 547395065856.0, + "grad_norm": 0.04775421920110858, + "language_loss": 0.8469578, + "learning_rate": 0.00026342631294746653, + "loss": 0.85780263, + "num_input_tokens_seen": 287757680, + "router_z_loss_mlp": 0.25842285, + "step": 3468, + "time_per_iteration": 2.7040185928344727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086094, + "balance_loss_mlp": 1.06041682, + "epoch": 0.6673720661792998, + "flos": 1070317214208.0, + "grad_norm": 0.049080807880720057, + "language_loss": 0.81080979, + "learning_rate": 0.0002631518953181476, + "loss": 0.82167077, + "num_input_tokens_seen": 287848992, + "router_z_loss_mlp": 0.25671387, + "step": 3469, + "time_per_iteration": 3.493414878845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011784, + "balance_loss_mlp": 1.00391626, + "epoch": 0.6675644478645633, + "flos": 1523790600192.0, + "grad_norm": 0.010939757170187329, + "language_loss": 0.76325285, + "learning_rate": 0.000262877569641418, + "loss": 0.77337068, + "num_input_tokens_seen": 288085680, + "router_z_loss_mlp": 0.07861328, + "step": 3470, + "time_per_iteration": 4.9387853145599365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087371, + "balance_loss_mlp": 1.06110907, + "epoch": 0.6677568295498268, + "flos": 579696113664.0, + "grad_norm": 0.0606952460981544, + "language_loss": 0.80340272, + "learning_rate": 0.00026260333602377985, + "loss": 0.81427646, + "num_input_tokens_seen": 288161568, + "router_z_loss_mlp": 0.26281738, + "step": 3471, + "time_per_iteration": 2.838916063308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109892, + "balance_loss_mlp": 1.0729208, + "epoch": 0.6679492112350904, + "flos": 383935458816.0, + "grad_norm": 0.06496239585891986, + "language_loss": 0.87351251, + "learning_rate": 0.0002623291945717007, + "loss": 0.88450176, + "num_input_tokens_seen": 288224032, + "router_z_loss_mlp": 0.26000977, + "step": 3472, + "time_per_iteration": 2.4870412349700928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097292, + "balance_loss_mlp": 1.07054186, + "epoch": 0.668141592920354, + "flos": 1150759830528.0, + "grad_norm": 0.04982364311813806, + "language_loss": 0.84127951, + "learning_rate": 0.00026205514539161175, + "loss": 0.85225236, + "num_input_tokens_seen": 288312912, + "router_z_loss_mlp": 0.26782227, + "step": 3473, + "time_per_iteration": 3.565732479095459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102422, + "balance_loss_mlp": 1.07651806, + "epoch": 0.6683339746056175, + "flos": 561100608000.0, + "grad_norm": 0.06841158179572154, + "language_loss": 0.84113353, + "learning_rate": 0.00026178118858990773, + "loss": 0.85215771, + "num_input_tokens_seen": 288394224, + "router_z_loss_mlp": 0.2590332, + "step": 3474, + "time_per_iteration": 2.8573057651519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087697, + "balance_loss_mlp": 1.0619719, + "epoch": 0.6685263562908811, + "flos": 514305884160.0, + "grad_norm": 0.07905158602596217, + "language_loss": 0.84220064, + "learning_rate": 0.0002615073242729483, + "loss": 0.85307765, + "num_input_tokens_seen": 288462976, + "router_z_loss_mlp": 0.25732422, + "step": 3475, + "time_per_iteration": 2.6173481941223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090195, + "balance_loss_mlp": 1.06363511, + "epoch": 0.6687187379761447, + "flos": 629772226560.0, + "grad_norm": 0.04794889281343623, + "language_loss": 0.84776723, + "learning_rate": 0.0002612335525470573, + "loss": 0.85866916, + "num_input_tokens_seen": 288542032, + "router_z_loss_mlp": 0.26586914, + "step": 3476, + "time_per_iteration": 2.819981575012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108965, + "balance_loss_mlp": 1.06361461, + "epoch": 0.6689111196614083, + "flos": 535586992128.0, + "grad_norm": 0.06414606112589924, + "language_loss": 0.7840637, + "learning_rate": 0.0002609598735185221, + "loss": 0.79496014, + "num_input_tokens_seen": 288610704, + "router_z_loss_mlp": 0.26062012, + "step": 3477, + "time_per_iteration": 2.6392619609832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085761, + "balance_loss_mlp": 1.0593915, + "epoch": 0.6691035013466718, + "flos": 603038048256.0, + "grad_norm": 0.054041595090679226, + "language_loss": 0.83408946, + "learning_rate": 0.00026068628729359445, + "loss": 0.8449471, + "num_input_tokens_seen": 288686080, + "router_z_loss_mlp": 0.26379395, + "step": 3478, + "time_per_iteration": 2.766197919845581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108263, + "balance_loss_mlp": 1.05621278, + "epoch": 0.6692958830319353, + "flos": 632855752704.0, + "grad_norm": 0.059772967228533376, + "language_loss": 0.76451987, + "learning_rate": 0.00026041279397848996, + "loss": 0.77534616, + "num_input_tokens_seen": 288764944, + "router_z_loss_mlp": 0.2644043, + "step": 3479, + "time_per_iteration": 2.8584389686584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077924, + "balance_loss_mlp": 1.05261552, + "epoch": 0.6694882647171989, + "flos": 645471783936.0, + "grad_norm": 0.051702403588613846, + "language_loss": 0.82616276, + "learning_rate": 0.00026013939367938797, + "loss": 0.83694196, + "num_input_tokens_seen": 288847856, + "router_z_loss_mlp": 0.25317383, + "step": 3480, + "time_per_iteration": 2.891376495361328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074221, + "balance_loss_mlp": 1.04828119, + "epoch": 0.6696806464024625, + "flos": 569585447424.0, + "grad_norm": 0.05419828241435922, + "language_loss": 0.81335235, + "learning_rate": 0.00025986608650243204, + "loss": 0.82409453, + "num_input_tokens_seen": 288929360, + "router_z_loss_mlp": 0.25952148, + "step": 3481, + "time_per_iteration": 2.77876353263855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073761, + "balance_loss_mlp": 1.04680765, + "epoch": 0.6698730280877261, + "flos": 622700669952.0, + "grad_norm": 0.051697162904794, + "language_loss": 0.79773414, + "learning_rate": 0.0002595928725537293, + "loss": 0.8084718, + "num_input_tokens_seen": 289010160, + "router_z_loss_mlp": 0.26965332, + "step": 3482, + "time_per_iteration": 2.8413639068603516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073841, + "balance_loss_mlp": 1.04836571, + "epoch": 0.6700654097729896, + "flos": 502507722240.0, + "grad_norm": 0.05767199414491062, + "language_loss": 0.88867986, + "learning_rate": 0.0002593197519393509, + "loss": 0.89941823, + "num_input_tokens_seen": 289077392, + "router_z_loss_mlp": 0.25500488, + "step": 3483, + "time_per_iteration": 2.603405475616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069253, + "balance_loss_mlp": 1.04446936, + "epoch": 0.6702577914582531, + "flos": 623876815872.0, + "grad_norm": 0.06697980614257329, + "language_loss": 0.79532218, + "learning_rate": 0.00025904672476533165, + "loss": 0.80601466, + "num_input_tokens_seen": 289157248, + "router_z_loss_mlp": 0.2479248, + "step": 3484, + "time_per_iteration": 2.84698224067688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070985, + "balance_loss_mlp": 1.04459202, + "epoch": 0.6704501731435167, + "flos": 456268764672.0, + "grad_norm": 0.05331322450394034, + "language_loss": 0.82924032, + "learning_rate": 0.0002587737911376704, + "loss": 0.83995014, + "num_input_tokens_seen": 289224864, + "router_z_loss_mlp": 0.26416016, + "step": 3485, + "time_per_iteration": 2.585921049118042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074192, + "balance_loss_mlp": 1.04729843, + "epoch": 0.6706425548287803, + "flos": 543229369344.0, + "grad_norm": 0.06756987561009595, + "language_loss": 0.84183806, + "learning_rate": 0.00025850095116232885, + "loss": 0.85257995, + "num_input_tokens_seen": 289293488, + "router_z_loss_mlp": 0.26953125, + "step": 3486, + "time_per_iteration": 2.7065019607543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075208, + "balance_loss_mlp": 1.04840994, + "epoch": 0.6708349365140439, + "flos": 633940494336.0, + "grad_norm": 0.05801175058434062, + "language_loss": 0.77675766, + "learning_rate": 0.000258228204945233, + "loss": 0.7875098, + "num_input_tokens_seen": 289370560, + "router_z_loss_mlp": 0.2677002, + "step": 3487, + "time_per_iteration": 2.8951704502105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071909, + "balance_loss_mlp": 1.04588532, + "epoch": 0.6710273181993074, + "flos": 640747749888.0, + "grad_norm": 0.05899101310847367, + "language_loss": 0.84739226, + "learning_rate": 0.00025795555259227254, + "loss": 0.85811132, + "num_input_tokens_seen": 289440096, + "router_z_loss_mlp": 0.26062012, + "step": 3488, + "time_per_iteration": 2.777141571044922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072765, + "balance_loss_mlp": 1.04677725, + "epoch": 0.671219699884571, + "flos": 553942789632.0, + "grad_norm": 0.0454202058547125, + "language_loss": 0.84104466, + "learning_rate": 0.00025768299420930046, + "loss": 0.85177231, + "num_input_tokens_seen": 289515808, + "router_z_loss_mlp": 0.2598877, + "step": 3489, + "time_per_iteration": 2.720435857772827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073548, + "balance_loss_mlp": 1.04736936, + "epoch": 0.6714120815698346, + "flos": 731508433920.0, + "grad_norm": 0.052981388085366045, + "language_loss": 0.83523858, + "learning_rate": 0.0002574105299021332, + "loss": 0.84597409, + "num_input_tokens_seen": 289591344, + "router_z_loss_mlp": 0.26220703, + "step": 3490, + "time_per_iteration": 2.874335289001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072546, + "balance_loss_mlp": 1.04605818, + "epoch": 0.6716044632550981, + "flos": 688664291328.0, + "grad_norm": 0.05653925915184199, + "language_loss": 0.84515595, + "learning_rate": 0.00025713815977655084, + "loss": 0.85588139, + "num_input_tokens_seen": 289672032, + "router_z_loss_mlp": 0.26501465, + "step": 3491, + "time_per_iteration": 2.857795000076294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107473, + "balance_loss_mlp": 1.04900455, + "epoch": 0.6717968449403616, + "flos": 460629752832.0, + "grad_norm": 0.0648375250519464, + "language_loss": 0.84809422, + "learning_rate": 0.0002568658839382969, + "loss": 0.85884148, + "num_input_tokens_seen": 289738304, + "router_z_loss_mlp": 0.25683594, + "step": 3492, + "time_per_iteration": 2.5480034351348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072741, + "balance_loss_mlp": 1.04731405, + "epoch": 0.6719892266256252, + "flos": 501608360448.0, + "grad_norm": 0.06366513295568171, + "language_loss": 0.84661782, + "learning_rate": 0.00025659370249307814, + "loss": 0.85734528, + "num_input_tokens_seen": 289804304, + "router_z_loss_mlp": 0.25439453, + "step": 3493, + "time_per_iteration": 2.602646589279175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072937, + "balance_loss_mlp": 1.04722357, + "epoch": 0.6721816083108888, + "flos": 683525081088.0, + "grad_norm": 0.05297099433671679, + "language_loss": 0.85274851, + "learning_rate": 0.00025632161554656473, + "loss": 0.86347795, + "num_input_tokens_seen": 289877696, + "router_z_loss_mlp": 0.25732422, + "step": 3494, + "time_per_iteration": 2.867612838745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071204, + "balance_loss_mlp": 1.04509759, + "epoch": 0.6723739899961524, + "flos": 585813980160.0, + "grad_norm": 0.05583877885035688, + "language_loss": 0.81951666, + "learning_rate": 0.00025604962320439017, + "loss": 0.83022875, + "num_input_tokens_seen": 289947296, + "router_z_loss_mlp": 0.26147461, + "step": 3495, + "time_per_iteration": 2.7493131160736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107172, + "balance_loss_mlp": 1.04625738, + "epoch": 0.672566371681416, + "flos": 506616519168.0, + "grad_norm": 0.056464737234764244, + "language_loss": 0.82197464, + "learning_rate": 0.0002557777255721516, + "loss": 0.83269185, + "num_input_tokens_seen": 290020080, + "router_z_loss_mlp": 0.2545166, + "step": 3496, + "time_per_iteration": 2.712113857269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068431, + "balance_loss_mlp": 1.04230046, + "epoch": 0.6727587533666795, + "flos": 535671055872.0, + "grad_norm": 0.0673285818829442, + "language_loss": 0.80758643, + "learning_rate": 0.0002555059227554087, + "loss": 0.8182708, + "num_input_tokens_seen": 290094544, + "router_z_loss_mlp": 0.26171875, + "step": 3497, + "time_per_iteration": 2.6871681213378906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107222, + "balance_loss_mlp": 1.04655433, + "epoch": 0.672951135051943, + "flos": 602832844800.0, + "grad_norm": 0.05408032607546607, + "language_loss": 0.7786265, + "learning_rate": 0.00025523421485968453, + "loss": 0.78934866, + "num_input_tokens_seen": 290173520, + "router_z_loss_mlp": 0.25695801, + "step": 3498, + "time_per_iteration": 2.822655439376831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072016, + "balance_loss_mlp": 1.04613543, + "epoch": 0.6731435167372066, + "flos": 811315989504.0, + "grad_norm": 0.05805760425239871, + "language_loss": 0.85567248, + "learning_rate": 0.00025496260199046585, + "loss": 0.86639267, + "num_input_tokens_seen": 290248240, + "router_z_loss_mlp": 0.25891113, + "step": 3499, + "time_per_iteration": 2.9368207454681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073883, + "balance_loss_mlp": 1.04759765, + "epoch": 0.6733358984224702, + "flos": 611594468352.0, + "grad_norm": 0.05807897060622665, + "language_loss": 0.84593326, + "learning_rate": 0.000254691084253202, + "loss": 0.85667205, + "num_input_tokens_seen": 290326288, + "router_z_loss_mlp": 0.26293945, + "step": 3500, + "time_per_iteration": 2.812175750732422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069637, + "balance_loss_mlp": 1.04343474, + "epoch": 0.6735282801077337, + "flos": 558901762560.0, + "grad_norm": 0.06730887087818041, + "language_loss": 0.77490008, + "learning_rate": 0.00025441966175330567, + "loss": 0.78559649, + "num_input_tokens_seen": 290395984, + "router_z_loss_mlp": 0.2623291, + "step": 3501, + "time_per_iteration": 2.6858127117156982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074737, + "balance_loss_mlp": 1.04904723, + "epoch": 0.6737206617929973, + "flos": 672433560576.0, + "grad_norm": 0.05973627548594562, + "language_loss": 0.7990756, + "learning_rate": 0.00025414833459615183, + "loss": 0.80982292, + "num_input_tokens_seen": 290470224, + "router_z_loss_mlp": 0.2565918, + "step": 3502, + "time_per_iteration": 2.792283296585083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079245, + "balance_loss_mlp": 1.05329359, + "epoch": 0.6739130434782609, + "flos": 633446396928.0, + "grad_norm": 0.054401429492937234, + "language_loss": 0.80582958, + "learning_rate": 0.0002538771028870796, + "loss": 0.81662202, + "num_input_tokens_seen": 290542864, + "router_z_loss_mlp": 0.2598877, + "step": 3503, + "time_per_iteration": 2.7585413455963135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073955, + "balance_loss_mlp": 1.04783654, + "epoch": 0.6741054251635245, + "flos": 531445888512.0, + "grad_norm": 0.064846065362636, + "language_loss": 0.81689268, + "learning_rate": 0.0002536059667313903, + "loss": 0.82763219, + "num_input_tokens_seen": 290617248, + "router_z_loss_mlp": 0.2611084, + "step": 3504, + "time_per_iteration": 2.71769118309021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074293, + "balance_loss_mlp": 1.04768562, + "epoch": 0.674297806848788, + "flos": 542604220416.0, + "grad_norm": 0.06348051765573881, + "language_loss": 0.89503717, + "learning_rate": 0.0002533349262343483, + "loss": 0.90578014, + "num_input_tokens_seen": 290690112, + "router_z_loss_mlp": 0.26635742, + "step": 3505, + "time_per_iteration": 2.660651445388794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079263, + "balance_loss_mlp": 1.05396676, + "epoch": 0.6744901885340515, + "flos": 463523129856.0, + "grad_norm": 0.07580313985305334, + "language_loss": 0.81963527, + "learning_rate": 0.0002530639815011807, + "loss": 0.83042789, + "num_input_tokens_seen": 290756352, + "router_z_loss_mlp": 0.25317383, + "step": 3506, + "time_per_iteration": 2.4884142875671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107741, + "balance_loss_mlp": 1.05192339, + "epoch": 0.6746825702193151, + "flos": 631830481920.0, + "grad_norm": 0.07059145793354948, + "language_loss": 0.85113943, + "learning_rate": 0.0002527931326370781, + "loss": 0.86191356, + "num_input_tokens_seen": 290829776, + "router_z_loss_mlp": 0.25512695, + "step": 3507, + "time_per_iteration": 2.7946653366088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078834, + "balance_loss_mlp": 1.05275106, + "epoch": 0.6748749519045787, + "flos": 671146186752.0, + "grad_norm": 0.06075343684572694, + "language_loss": 0.83284092, + "learning_rate": 0.00025252237974719276, + "loss": 0.84362924, + "num_input_tokens_seen": 290900736, + "router_z_loss_mlp": 0.26098633, + "step": 3508, + "time_per_iteration": 2.8548471927642822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108079, + "balance_loss_mlp": 1.05530286, + "epoch": 0.6750673335898423, + "flos": 767102980608.0, + "grad_norm": 0.06110839735898087, + "language_loss": 0.80529547, + "learning_rate": 0.00025225172293664056, + "loss": 0.81610334, + "num_input_tokens_seen": 290981696, + "router_z_loss_mlp": 0.25500488, + "step": 3509, + "time_per_iteration": 3.0396220684051514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013373, + "balance_loss_mlp": 1.00583911, + "epoch": 0.6752597152751059, + "flos": 1512607675392.0, + "grad_norm": 0.007570597102939453, + "language_loss": 0.76933134, + "learning_rate": 0.00025198116231049954, + "loss": 0.77946508, + "num_input_tokens_seen": 291217888, + "router_z_loss_mlp": 0.07519531, + "step": 3510, + "time_per_iteration": 4.9238317012786865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081964, + "balance_loss_mlp": 1.05588078, + "epoch": 0.6754520969603693, + "flos": 687297996288.0, + "grad_norm": 0.06266149701009033, + "language_loss": 0.85147846, + "learning_rate": 0.00025171069797381106, + "loss": 0.86229801, + "num_input_tokens_seen": 291287856, + "router_z_loss_mlp": 0.26123047, + "step": 3511, + "time_per_iteration": 2.842026948928833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107574, + "balance_loss_mlp": 1.05036068, + "epoch": 0.6756444786456329, + "flos": 500577947136.0, + "grad_norm": 0.05295851129049709, + "language_loss": 0.82071269, + "learning_rate": 0.00025144033003157864, + "loss": 0.83147007, + "num_input_tokens_seen": 291354912, + "router_z_loss_mlp": 0.25402832, + "step": 3512, + "time_per_iteration": 2.5853493213653564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087572, + "balance_loss_mlp": 1.06216824, + "epoch": 0.6758368603308965, + "flos": 492616940544.0, + "grad_norm": 0.10878048166540129, + "language_loss": 0.78940082, + "learning_rate": 0.00025117005858876806, + "loss": 0.80027652, + "num_input_tokens_seen": 291426816, + "router_z_loss_mlp": 0.25402832, + "step": 3513, + "time_per_iteration": 2.683076858520508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081602, + "balance_loss_mlp": 1.05658007, + "epoch": 0.6760292420161601, + "flos": 555934233600.0, + "grad_norm": 0.062477123984618736, + "language_loss": 0.8580628, + "learning_rate": 0.000250899883750308, + "loss": 0.86887884, + "num_input_tokens_seen": 291497648, + "router_z_loss_mlp": 0.25036621, + "step": 3514, + "time_per_iteration": 2.7132656574249268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081057, + "balance_loss_mlp": 1.05621386, + "epoch": 0.6762216237014236, + "flos": 607601668608.0, + "grad_norm": 0.06208222280166845, + "language_loss": 0.82150948, + "learning_rate": 0.00025062980562109006, + "loss": 0.83232003, + "num_input_tokens_seen": 291568080, + "router_z_loss_mlp": 0.24841309, + "step": 3515, + "time_per_iteration": 4.169267177581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080203, + "balance_loss_mlp": 1.0545373, + "epoch": 0.6764140053866872, + "flos": 533785697280.0, + "grad_norm": 0.06255733263360135, + "language_loss": 0.83099926, + "learning_rate": 0.0002503598243059677, + "loss": 0.84180129, + "num_input_tokens_seen": 291644896, + "router_z_loss_mlp": 0.25683594, + "step": 3516, + "time_per_iteration": 2.7749977111816406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085607, + "balance_loss_mlp": 1.05966699, + "epoch": 0.6766063870719508, + "flos": 504810455040.0, + "grad_norm": 0.06025675944988047, + "language_loss": 0.8034898, + "learning_rate": 0.0002500899399097568, + "loss": 0.8143459, + "num_input_tokens_seen": 291716864, + "router_z_loss_mlp": 0.25976562, + "step": 3517, + "time_per_iteration": 2.638920307159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087679, + "balance_loss_mlp": 1.06179833, + "epoch": 0.6767987687572143, + "flos": 513176726016.0, + "grad_norm": 0.06061041390288269, + "language_loss": 0.85528451, + "learning_rate": 0.0002498201525372359, + "loss": 0.86616129, + "num_input_tokens_seen": 291786000, + "router_z_loss_mlp": 0.25915527, + "step": 3518, + "time_per_iteration": 2.6280837059020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090365, + "balance_loss_mlp": 1.06465113, + "epoch": 0.6769911504424779, + "flos": 525039128064.0, + "grad_norm": 0.05678341042479038, + "language_loss": 0.83314502, + "learning_rate": 0.00024955046229314584, + "loss": 0.84404874, + "num_input_tokens_seen": 291854768, + "router_z_loss_mlp": 0.25732422, + "step": 3519, + "time_per_iteration": 2.598114013671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090498, + "balance_loss_mlp": 1.06486833, + "epoch": 0.6771835321277414, + "flos": 449896508928.0, + "grad_norm": 0.06076117053087645, + "language_loss": 0.87566268, + "learning_rate": 0.00024928086928218947, + "loss": 0.88656765, + "num_input_tokens_seen": 291918096, + "router_z_loss_mlp": 0.25646973, + "step": 3520, + "time_per_iteration": 2.4903347492218018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088735, + "balance_loss_mlp": 1.06373692, + "epoch": 0.677375913813005, + "flos": 709349985792.0, + "grad_norm": 0.07287675105407085, + "language_loss": 0.76298815, + "learning_rate": 0.00024901137360903216, + "loss": 0.77387547, + "num_input_tokens_seen": 291998752, + "router_z_loss_mlp": 0.25012207, + "step": 3521, + "time_per_iteration": 2.957127332687378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095619, + "balance_loss_mlp": 1.07063317, + "epoch": 0.6775682954982686, + "flos": 428420109312.0, + "grad_norm": 0.06312793336661301, + "language_loss": 0.80923325, + "learning_rate": 0.00024874197537830115, + "loss": 0.82018942, + "num_input_tokens_seen": 292065056, + "router_z_loss_mlp": 0.25, + "step": 3522, + "time_per_iteration": 2.5331904888153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088383, + "balance_loss_mlp": 1.06340837, + "epoch": 0.6777606771835322, + "flos": 437905626624.0, + "grad_norm": 0.06755999829243825, + "language_loss": 0.83245486, + "learning_rate": 0.00024847267469458684, + "loss": 0.84333861, + "num_input_tokens_seen": 292129248, + "router_z_loss_mlp": 0.24987793, + "step": 3523, + "time_per_iteration": 2.525132417678833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087699, + "balance_loss_mlp": 1.06227136, + "epoch": 0.6779530588687956, + "flos": 775442087424.0, + "grad_norm": 0.06413222868120108, + "language_loss": 0.7768755, + "learning_rate": 0.00024820347166244034, + "loss": 0.78775245, + "num_input_tokens_seen": 292206080, + "router_z_loss_mlp": 0.2545166, + "step": 3524, + "time_per_iteration": 2.981156826019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086699, + "balance_loss_mlp": 1.06202292, + "epoch": 0.6781454405540592, + "flos": 571782094848.0, + "grad_norm": 0.05504268755714505, + "language_loss": 0.85045242, + "learning_rate": 0.0002479343663863755, + "loss": 0.86131942, + "num_input_tokens_seen": 292280192, + "router_z_loss_mlp": 0.24682617, + "step": 3525, + "time_per_iteration": 2.8227763175964355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085391, + "balance_loss_mlp": 1.05880737, + "epoch": 0.6783378222393228, + "flos": 485026693632.0, + "grad_norm": 0.05863991257689852, + "language_loss": 0.76910073, + "learning_rate": 0.00024766535897086876, + "loss": 0.77995467, + "num_input_tokens_seen": 292347792, + "router_z_loss_mlp": 0.26623535, + "step": 3526, + "time_per_iteration": 2.5773653984069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088289, + "balance_loss_mlp": 1.06144333, + "epoch": 0.6785302039245864, + "flos": 482839958016.0, + "grad_norm": 0.09784293796140163, + "language_loss": 0.78738832, + "learning_rate": 0.0002473964495203578, + "loss": 0.79827124, + "num_input_tokens_seen": 292420032, + "router_z_loss_mlp": 0.26879883, + "step": 3527, + "time_per_iteration": 2.6880078315734863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084335, + "balance_loss_mlp": 1.0582881, + "epoch": 0.67872258560985, + "flos": 524732608512.0, + "grad_norm": 0.057535616480669176, + "language_loss": 0.85700953, + "learning_rate": 0.0002471276381392425, + "loss": 0.86785293, + "num_input_tokens_seen": 292497792, + "router_z_loss_mlp": 0.26062012, + "step": 3528, + "time_per_iteration": 2.7601518630981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028211, + "balance_loss_mlp": 1.02067733, + "epoch": 0.6789149672951135, + "flos": 1552605428736.0, + "grad_norm": 0.014996437557936866, + "language_loss": 0.78188634, + "learning_rate": 0.0002468589249318848, + "loss": 0.79216838, + "num_input_tokens_seen": 292726704, + "router_z_loss_mlp": 0.07519531, + "step": 3529, + "time_per_iteration": 4.95120096206665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088756, + "balance_loss_mlp": 1.06375766, + "epoch": 0.6791073489803771, + "flos": 741406556160.0, + "grad_norm": 0.06208247419260481, + "language_loss": 0.84420717, + "learning_rate": 0.00024659031000260826, + "loss": 0.85509473, + "num_input_tokens_seen": 292802320, + "router_z_loss_mlp": 0.25, + "step": 3530, + "time_per_iteration": 2.8619091510772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085263, + "balance_loss_mlp": 1.05816674, + "epoch": 0.6792997306656406, + "flos": 576365538816.0, + "grad_norm": 0.0739213834175869, + "language_loss": 0.80927098, + "learning_rate": 0.0002463217934556985, + "loss": 0.82012367, + "num_input_tokens_seen": 292870480, + "router_z_loss_mlp": 0.27111816, + "step": 3531, + "time_per_iteration": 2.6372668743133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015203, + "balance_loss_mlp": 1.00790787, + "epoch": 0.6794921123509042, + "flos": 1503337273344.0, + "grad_norm": 0.011067583088495437, + "language_loss": 0.7653209, + "learning_rate": 0.000246053375395403, + "loss": 0.77547294, + "num_input_tokens_seen": 293100752, + "router_z_loss_mlp": 0.07275391, + "step": 3532, + "time_per_iteration": 4.7275402545928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089408, + "balance_loss_mlp": 1.06364703, + "epoch": 0.6796844940361677, + "flos": 698923261440.0, + "grad_norm": 0.07509562800064129, + "language_loss": 0.83719718, + "learning_rate": 0.0002457850559259306, + "loss": 0.84809136, + "num_input_tokens_seen": 293178192, + "router_z_loss_mlp": 0.25769043, + "step": 3533, + "time_per_iteration": 2.9546730518341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082617, + "balance_loss_mlp": 1.05801249, + "epoch": 0.6798768757214313, + "flos": 552759303168.0, + "grad_norm": 0.058098360832657354, + "language_loss": 0.82016122, + "learning_rate": 0.00024551683515145275, + "loss": 0.83098733, + "num_input_tokens_seen": 293246368, + "router_z_loss_mlp": 0.24597168, + "step": 3534, + "time_per_iteration": 2.675198793411255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080796, + "balance_loss_mlp": 1.05546427, + "epoch": 0.6800692574066949, + "flos": 522936456192.0, + "grad_norm": 0.05760567747955486, + "language_loss": 0.8703866, + "learning_rate": 0.0002452487131761014, + "loss": 0.88119459, + "num_input_tokens_seen": 293320656, + "router_z_loss_mlp": 0.25354004, + "step": 3535, + "time_per_iteration": 2.7560551166534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080116, + "balance_loss_mlp": 1.0540328, + "epoch": 0.6802616390919585, + "flos": 574023158784.0, + "grad_norm": 0.06295067828117173, + "language_loss": 0.80308378, + "learning_rate": 0.00024498069010397093, + "loss": 0.81388497, + "num_input_tokens_seen": 293388592, + "router_z_loss_mlp": 0.26123047, + "step": 3536, + "time_per_iteration": 2.7834858894348145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081714, + "balance_loss_mlp": 1.05659688, + "epoch": 0.6804540207772221, + "flos": 488157207552.0, + "grad_norm": 0.05311413665555526, + "language_loss": 0.85467112, + "learning_rate": 0.00024471276603911697, + "loss": 0.86548829, + "num_input_tokens_seen": 293453936, + "router_z_loss_mlp": 0.2512207, + "step": 3537, + "time_per_iteration": 2.644085645675659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086182, + "balance_loss_mlp": 1.06095743, + "epoch": 0.6806464024624855, + "flos": 578594119680.0, + "grad_norm": 0.0668547033198753, + "language_loss": 0.79341853, + "learning_rate": 0.0002444449410855572, + "loss": 0.80428034, + "num_input_tokens_seen": 293527664, + "router_z_loss_mlp": 0.25231934, + "step": 3538, + "time_per_iteration": 2.790034532546997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083109, + "balance_loss_mlp": 1.0583849, + "epoch": 0.6808387841477491, + "flos": 553722905088.0, + "grad_norm": 0.056899188287429556, + "language_loss": 0.84389639, + "learning_rate": 0.00024417721534727033, + "loss": 0.85472751, + "num_input_tokens_seen": 293599344, + "router_z_loss_mlp": 0.24731445, + "step": 3539, + "time_per_iteration": 2.703143358230591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081155, + "balance_loss_mlp": 1.0562042, + "epoch": 0.6810311658330127, + "flos": 426841270272.0, + "grad_norm": 0.06562679569248508, + "language_loss": 0.83345222, + "learning_rate": 0.00024390958892819687, + "loss": 0.84426379, + "num_input_tokens_seen": 293663088, + "router_z_loss_mlp": 0.24938965, + "step": 3540, + "time_per_iteration": 2.5123190879821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083792, + "balance_loss_mlp": 1.0574708, + "epoch": 0.6812235475182763, + "flos": 572256368640.0, + "grad_norm": 0.0704351751694786, + "language_loss": 0.80845803, + "learning_rate": 0.0002436420619322381, + "loss": 0.81929594, + "num_input_tokens_seen": 293741296, + "router_z_loss_mlp": 0.26367188, + "step": 3541, + "time_per_iteration": 2.8810999393463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080954, + "balance_loss_mlp": 1.05532384, + "epoch": 0.6814159292035398, + "flos": 501917078016.0, + "grad_norm": 0.05740970422005706, + "language_loss": 0.82921457, + "learning_rate": 0.0002433746344632577, + "loss": 0.84002411, + "num_input_tokens_seen": 293815840, + "router_z_loss_mlp": 0.25634766, + "step": 3542, + "time_per_iteration": 2.7135009765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085031, + "balance_loss_mlp": 1.0591507, + "epoch": 0.6816083108888034, + "flos": 765531482112.0, + "grad_norm": 0.09305819117462581, + "language_loss": 0.80352092, + "learning_rate": 0.00024310730662508006, + "loss": 0.81437123, + "num_input_tokens_seen": 293896368, + "router_z_loss_mlp": 0.25891113, + "step": 3543, + "time_per_iteration": 3.061795949935913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080318, + "balance_loss_mlp": 1.05509281, + "epoch": 0.681800692574067, + "flos": 479459824128.0, + "grad_norm": 0.05668741815102704, + "language_loss": 0.87538439, + "learning_rate": 0.0002428400785214911, + "loss": 0.88618755, + "num_input_tokens_seen": 293963344, + "router_z_loss_mlp": 0.25231934, + "step": 3544, + "time_per_iteration": 2.600311279296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077375, + "balance_loss_mlp": 1.05138755, + "epoch": 0.6819930742593305, + "flos": 691604656128.0, + "grad_norm": 0.05461889595804736, + "language_loss": 0.8282584, + "learning_rate": 0.00024257295025623794, + "loss": 0.83903217, + "num_input_tokens_seen": 294035440, + "router_z_loss_mlp": 0.26025391, + "step": 3545, + "time_per_iteration": 2.9303810596466064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076909, + "balance_loss_mlp": 1.05181503, + "epoch": 0.6821854559445941, + "flos": 678096603648.0, + "grad_norm": 0.05463357395047058, + "language_loss": 0.80816013, + "learning_rate": 0.00024230592193302892, + "loss": 0.8189292, + "num_input_tokens_seen": 294116944, + "router_z_loss_mlp": 0.25085449, + "step": 3546, + "time_per_iteration": 3.0259780883789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108338, + "balance_loss_mlp": 1.05730915, + "epoch": 0.6823778376298576, + "flos": 462191339520.0, + "grad_norm": 0.061332624341889866, + "language_loss": 0.84813237, + "learning_rate": 0.00024203899365553372, + "loss": 0.85896623, + "num_input_tokens_seen": 294178976, + "router_z_loss_mlp": 0.2611084, + "step": 3547, + "time_per_iteration": 2.5990257263183594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101826, + "balance_loss_mlp": 1.01120329, + "epoch": 0.6825702193151212, + "flos": 1475298842112.0, + "grad_norm": 0.024302183931920462, + "language_loss": 0.76734358, + "learning_rate": 0.00024177216552738302, + "loss": 0.7775262, + "num_input_tokens_seen": 294384960, + "router_z_loss_mlp": 0.07080078, + "step": 3548, + "time_per_iteration": 4.529210090637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082512, + "balance_loss_mlp": 1.05796695, + "epoch": 0.6827626010003848, + "flos": 723114998784.0, + "grad_norm": 0.06743291659407481, + "language_loss": 0.83211255, + "learning_rate": 0.00024150543765216848, + "loss": 0.84293771, + "num_input_tokens_seen": 294461408, + "router_z_loss_mlp": 0.2454834, + "step": 3549, + "time_per_iteration": 2.9848315715789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079547, + "balance_loss_mlp": 1.05444109, + "epoch": 0.6829549826856484, + "flos": 558864686592.0, + "grad_norm": 0.06339760092568236, + "language_loss": 0.83246768, + "learning_rate": 0.00024123881013344352, + "loss": 0.84326315, + "num_input_tokens_seen": 294530624, + "router_z_loss_mlp": 0.25109863, + "step": 3550, + "time_per_iteration": 2.757277011871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078185, + "balance_loss_mlp": 1.05236471, + "epoch": 0.6831473643709118, + "flos": 624934393344.0, + "grad_norm": 0.05786884638385198, + "language_loss": 0.79739892, + "learning_rate": 0.00024097228307472202, + "loss": 0.80818081, + "num_input_tokens_seen": 294606784, + "router_z_loss_mlp": 0.25854492, + "step": 3551, + "time_per_iteration": 2.8328561782836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078138, + "balance_loss_mlp": 1.0525794, + "epoch": 0.6833397460561754, + "flos": 713861849088.0, + "grad_norm": 0.06566140613628157, + "language_loss": 0.81969666, + "learning_rate": 0.00024070585657947846, + "loss": 0.83047807, + "num_input_tokens_seen": 294686960, + "router_z_loss_mlp": 0.25585938, + "step": 3552, + "time_per_iteration": 2.962819814682007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081884, + "balance_loss_mlp": 1.05676627, + "epoch": 0.683532127741439, + "flos": 464704045056.0, + "grad_norm": 0.0534920389978937, + "language_loss": 0.8565321, + "learning_rate": 0.00024043953075114934, + "loss": 0.86735094, + "num_input_tokens_seen": 294759712, + "router_z_loss_mlp": 0.2512207, + "step": 3553, + "time_per_iteration": 2.638843059539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075297, + "balance_loss_mlp": 1.04947591, + "epoch": 0.6837245094267026, + "flos": 582251037696.0, + "grad_norm": 0.05485764052076591, + "language_loss": 0.88990396, + "learning_rate": 0.00024017330569313128, + "loss": 0.90065694, + "num_input_tokens_seen": 294830592, + "router_z_loss_mlp": 0.25842285, + "step": 3554, + "time_per_iteration": 2.7616748809814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078527, + "balance_loss_mlp": 1.05215812, + "epoch": 0.6839168911119662, + "flos": 794173413888.0, + "grad_norm": 0.07669249148194994, + "language_loss": 0.75058365, + "learning_rate": 0.0002399071815087821, + "loss": 0.76136887, + "num_input_tokens_seen": 294907504, + "router_z_loss_mlp": 0.26391602, + "step": 3555, + "time_per_iteration": 3.047292470932007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080363, + "balance_loss_mlp": 1.05511451, + "epoch": 0.6841092727972297, + "flos": 580009973760.0, + "grad_norm": 0.0595534971161133, + "language_loss": 0.84028351, + "learning_rate": 0.00023964115830142025, + "loss": 0.85108721, + "num_input_tokens_seen": 294977600, + "router_z_loss_mlp": 0.25256348, + "step": 3556, + "time_per_iteration": 2.708983898162842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074295, + "balance_loss_mlp": 1.05035782, + "epoch": 0.6843016544824932, + "flos": 383742738432.0, + "grad_norm": 0.0757977451950182, + "language_loss": 0.88028133, + "learning_rate": 0.00023937523617432522, + "loss": 0.89102429, + "num_input_tokens_seen": 295039408, + "router_z_loss_mlp": 0.23950195, + "step": 3557, + "time_per_iteration": 2.454397201538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077509, + "balance_loss_mlp": 1.05258226, + "epoch": 0.6844940361677568, + "flos": 1439035476480.0, + "grad_norm": 0.08760866739293877, + "language_loss": 0.87423909, + "learning_rate": 0.00023910941523073705, + "loss": 0.88501424, + "num_input_tokens_seen": 295142928, + "router_z_loss_mlp": 0.24938965, + "step": 3558, + "time_per_iteration": 3.9113569259643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083945, + "balance_loss_mlp": 1.05796981, + "epoch": 0.6846864178530204, + "flos": 520870860288.0, + "grad_norm": 0.053991545736228864, + "language_loss": 0.86934376, + "learning_rate": 0.0002388436955738566, + "loss": 0.88018322, + "num_input_tokens_seen": 295215504, + "router_z_loss_mlp": 0.2598877, + "step": 3559, + "time_per_iteration": 2.837038040161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080181, + "balance_loss_mlp": 1.05512345, + "epoch": 0.6848787995382839, + "flos": 717946053120.0, + "grad_norm": 0.06078167941241102, + "language_loss": 0.81248534, + "learning_rate": 0.00023857807730684523, + "loss": 0.82328713, + "num_input_tokens_seen": 295291024, + "router_z_loss_mlp": 0.25061035, + "step": 3560, + "time_per_iteration": 2.892477035522461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084991, + "balance_loss_mlp": 1.05795407, + "epoch": 0.6850711812235475, + "flos": 511061571072.0, + "grad_norm": 0.06645470458229728, + "language_loss": 0.82908154, + "learning_rate": 0.00023831256053282547, + "loss": 0.83993149, + "num_input_tokens_seen": 295363248, + "router_z_loss_mlp": 0.27050781, + "step": 3561, + "time_per_iteration": 2.724573850631714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081036, + "balance_loss_mlp": 1.05547762, + "epoch": 0.6852635629088111, + "flos": 668151493632.0, + "grad_norm": 0.06597218498580906, + "language_loss": 0.78399622, + "learning_rate": 0.00023804714535488003, + "loss": 0.7948066, + "num_input_tokens_seen": 295442032, + "router_z_loss_mlp": 0.25561523, + "step": 3562, + "time_per_iteration": 2.95060133934021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019571, + "balance_loss_mlp": 1.01251411, + "epoch": 0.6854559445940747, + "flos": 1522980071424.0, + "grad_norm": 0.015166594487017694, + "language_loss": 0.7980963, + "learning_rate": 0.0002377818318760519, + "loss": 0.80829203, + "num_input_tokens_seen": 295680560, + "router_z_loss_mlp": 0.07080078, + "step": 3563, + "time_per_iteration": 4.933622360229492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087969, + "balance_loss_mlp": 1.0631851, + "epoch": 0.6856483262793382, + "flos": 454203168768.0, + "grad_norm": 0.058645114783078524, + "language_loss": 0.81150877, + "learning_rate": 0.00023751662019934488, + "loss": 0.82238841, + "num_input_tokens_seen": 295745712, + "router_z_loss_mlp": 0.2479248, + "step": 3564, + "time_per_iteration": 2.551375150680542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080841, + "balance_loss_mlp": 1.05612862, + "epoch": 0.6858407079646017, + "flos": 615552763392.0, + "grad_norm": 0.05683958550718021, + "language_loss": 0.79323113, + "learning_rate": 0.00023725151042772364, + "loss": 0.80403948, + "num_input_tokens_seen": 295815104, + "router_z_loss_mlp": 0.24719238, + "step": 3565, + "time_per_iteration": 2.8488030433654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081595, + "balance_loss_mlp": 1.05563116, + "epoch": 0.6860330896498653, + "flos": 466053087744.0, + "grad_norm": 0.06643768922422526, + "language_loss": 0.83425218, + "learning_rate": 0.00023698650266411276, + "loss": 0.84506816, + "num_input_tokens_seen": 295882928, + "router_z_loss_mlp": 0.26000977, + "step": 3566, + "time_per_iteration": 2.704754590988159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079538, + "balance_loss_mlp": 1.05554175, + "epoch": 0.6862254713351289, + "flos": 864270425088.0, + "grad_norm": 0.06089372321072988, + "language_loss": 0.83402336, + "learning_rate": 0.00023672159701139755, + "loss": 0.84481871, + "num_input_tokens_seen": 295970960, + "router_z_loss_mlp": 0.23986816, + "step": 3567, + "time_per_iteration": 3.2112581729888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084436, + "balance_loss_mlp": 1.05952144, + "epoch": 0.6864178530203925, + "flos": 447141523968.0, + "grad_norm": 0.06475688467901158, + "language_loss": 0.86233699, + "learning_rate": 0.00023645679357242296, + "loss": 0.87318128, + "num_input_tokens_seen": 296036128, + "router_z_loss_mlp": 0.24890137, + "step": 3568, + "time_per_iteration": 2.618299961090088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077978, + "balance_loss_mlp": 1.05325365, + "epoch": 0.6866102347056561, + "flos": 424269093888.0, + "grad_norm": 0.06930985258360142, + "language_loss": 0.84079957, + "learning_rate": 0.00023619209244999534, + "loss": 0.85157931, + "num_input_tokens_seen": 296101440, + "router_z_loss_mlp": 0.24694824, + "step": 3569, + "time_per_iteration": 2.5762784481048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082729, + "balance_loss_mlp": 1.05775487, + "epoch": 0.6868026163909196, + "flos": 472373586432.0, + "grad_norm": 0.07239946064246126, + "language_loss": 0.84962302, + "learning_rate": 0.0002359274937468806, + "loss": 0.86045027, + "num_input_tokens_seen": 296165504, + "router_z_loss_mlp": 0.24975586, + "step": 3570, + "time_per_iteration": 2.507097005844116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080008, + "balance_loss_mlp": 1.0555582, + "epoch": 0.6869949980761831, + "flos": 464190124032.0, + "grad_norm": 0.052246818326421945, + "language_loss": 0.78233075, + "learning_rate": 0.00023566299756580512, + "loss": 0.79313087, + "num_input_tokens_seen": 296236880, + "router_z_loss_mlp": 0.2442627, + "step": 3571, + "time_per_iteration": 2.6490540504455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081651, + "balance_loss_mlp": 1.05523372, + "epoch": 0.6871873797614467, + "flos": 426235944960.0, + "grad_norm": 0.06589873086425142, + "language_loss": 0.78497767, + "learning_rate": 0.0002353986040094551, + "loss": 0.79579425, + "num_input_tokens_seen": 296299776, + "router_z_loss_mlp": 0.2644043, + "step": 3572, + "time_per_iteration": 2.525590419769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079016, + "balance_loss_mlp": 1.05405378, + "epoch": 0.6873797614467103, + "flos": 443625569280.0, + "grad_norm": 0.058453848630334905, + "language_loss": 0.79833031, + "learning_rate": 0.00023513431318047796, + "loss": 0.80912042, + "num_input_tokens_seen": 296365408, + "router_z_loss_mlp": 0.24975586, + "step": 3573, + "time_per_iteration": 2.5652148723602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081007, + "balance_loss_mlp": 1.0563786, + "epoch": 0.6875721431319738, + "flos": 992323436544.0, + "grad_norm": 0.12934714491167457, + "language_loss": 0.77343333, + "learning_rate": 0.00023487012518147977, + "loss": 0.78424335, + "num_input_tokens_seen": 296445488, + "router_z_loss_mlp": 0.24621582, + "step": 3574, + "time_per_iteration": 3.2728779315948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082155, + "balance_loss_mlp": 1.05660903, + "epoch": 0.6877645248172374, + "flos": 1285513638912.0, + "grad_norm": 0.06788347581923994, + "language_loss": 0.8458752, + "learning_rate": 0.00023460604011502772, + "loss": 0.85669678, + "num_input_tokens_seen": 296529936, + "router_z_loss_mlp": 0.25549316, + "step": 3575, + "time_per_iteration": 3.650050163269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071019, + "balance_loss_mlp": 1.04640222, + "epoch": 0.687956906502501, + "flos": 876733383168.0, + "grad_norm": 0.06594699265094836, + "language_loss": 0.85666633, + "learning_rate": 0.00023434205808364845, + "loss": 0.86737645, + "num_input_tokens_seen": 296607488, + "router_z_loss_mlp": 0.24621582, + "step": 3576, + "time_per_iteration": 3.2174363136291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081926, + "balance_loss_mlp": 1.05646336, + "epoch": 0.6881492881877646, + "flos": 563324419584.0, + "grad_norm": 0.073624827285274, + "language_loss": 0.85645866, + "learning_rate": 0.00023407817918982932, + "loss": 0.86727792, + "num_input_tokens_seen": 296678672, + "router_z_loss_mlp": 0.25488281, + "step": 3577, + "time_per_iteration": 2.8009090423583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088022, + "balance_loss_mlp": 1.06271362, + "epoch": 0.6883416698730281, + "flos": 795127104000.0, + "grad_norm": 0.06549349473125507, + "language_loss": 0.79113662, + "learning_rate": 0.00023381440353601718, + "loss": 0.80201685, + "num_input_tokens_seen": 296758896, + "router_z_loss_mlp": 0.2532959, + "step": 3578, + "time_per_iteration": 3.023149251937866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080761, + "balance_loss_mlp": 1.05627584, + "epoch": 0.6885340515582916, + "flos": 723621579264.0, + "grad_norm": 0.05959315999492073, + "language_loss": 0.86070436, + "learning_rate": 0.00023355073122461822, + "loss": 0.87151194, + "num_input_tokens_seen": 296830736, + "router_z_loss_mlp": 0.24487305, + "step": 3579, + "time_per_iteration": 2.9520890712738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084384, + "balance_loss_mlp": 1.05880141, + "epoch": 0.6887264332435552, + "flos": 1010926282752.0, + "grad_norm": 0.06355756593191678, + "language_loss": 0.82827502, + "learning_rate": 0.00023328716235799973, + "loss": 0.83911884, + "num_input_tokens_seen": 296911504, + "router_z_loss_mlp": 0.25598145, + "step": 3580, + "time_per_iteration": 3.351285219192505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080812, + "balance_loss_mlp": 1.05680299, + "epoch": 0.6889188149288188, + "flos": 585262983168.0, + "grad_norm": 0.05871142943590934, + "language_loss": 0.84103072, + "learning_rate": 0.00023302369703848803, + "loss": 0.85183883, + "num_input_tokens_seen": 296981488, + "router_z_loss_mlp": 0.24023438, + "step": 3581, + "time_per_iteration": 2.7034530639648438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088103, + "balance_loss_mlp": 1.06281841, + "epoch": 0.6891111966140824, + "flos": 636119889408.0, + "grad_norm": 0.05872811421519248, + "language_loss": 0.80432433, + "learning_rate": 0.00023276033536836937, + "loss": 0.81520534, + "num_input_tokens_seen": 297054896, + "router_z_loss_mlp": 0.25305176, + "step": 3582, + "time_per_iteration": 2.933551073074341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077987, + "balance_loss_mlp": 1.05369234, + "epoch": 0.6893035782993459, + "flos": 495270609408.0, + "grad_norm": 0.06546577273757126, + "language_loss": 0.84750611, + "learning_rate": 0.00023249707744988984, + "loss": 0.85828596, + "num_input_tokens_seen": 297128224, + "router_z_loss_mlp": 0.24279785, + "step": 3583, + "time_per_iteration": 2.694974184036255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083842, + "balance_loss_mlp": 1.05804539, + "epoch": 0.6894959599846094, + "flos": 458215792128.0, + "grad_norm": 0.07473355522869814, + "language_loss": 0.82210362, + "learning_rate": 0.00023223392338525529, + "loss": 0.83294201, + "num_input_tokens_seen": 297191312, + "router_z_loss_mlp": 0.25830078, + "step": 3584, + "time_per_iteration": 2.5522758960723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078352, + "balance_loss_mlp": 1.05215001, + "epoch": 0.689688341669873, + "flos": 505003175424.0, + "grad_norm": 0.05831544334966422, + "language_loss": 0.78814328, + "learning_rate": 0.00023197087327663107, + "loss": 0.79892683, + "num_input_tokens_seen": 297261904, + "router_z_loss_mlp": 0.26208496, + "step": 3585, + "time_per_iteration": 2.6880340576171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083733, + "balance_loss_mlp": 1.05843663, + "epoch": 0.6898807233551366, + "flos": 763910797824.0, + "grad_norm": 0.6312762348239643, + "language_loss": 0.81380439, + "learning_rate": 0.00023170792722614243, + "loss": 0.8246417, + "num_input_tokens_seen": 297338352, + "router_z_loss_mlp": 0.25317383, + "step": 3586, + "time_per_iteration": 3.0318641662597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079757, + "balance_loss_mlp": 1.05460346, + "epoch": 0.6900731050404002, + "flos": 583337977344.0, + "grad_norm": 0.05006567848129158, + "language_loss": 0.83709162, + "learning_rate": 0.00023144508533587377, + "loss": 0.84788913, + "num_input_tokens_seen": 297416688, + "router_z_loss_mlp": 0.25170898, + "step": 3587, + "time_per_iteration": 2.8474464416503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090758, + "balance_loss_mlp": 1.06386399, + "epoch": 0.6902654867256637, + "flos": 711865262592.0, + "grad_norm": 0.06762785817059219, + "language_loss": 0.79246032, + "learning_rate": 0.0002311823477078698, + "loss": 0.80336785, + "num_input_tokens_seen": 297499968, + "router_z_loss_mlp": 0.26928711, + "step": 3588, + "time_per_iteration": 2.9889235496520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097034, + "balance_loss_mlp": 1.0714879, + "epoch": 0.6904578684109273, + "flos": 597112902144.0, + "grad_norm": 0.09415937130110832, + "language_loss": 0.85614562, + "learning_rate": 0.00023091971444413428, + "loss": 0.86711591, + "num_input_tokens_seen": 297574480, + "router_z_loss_mlp": 0.2557373, + "step": 3589, + "time_per_iteration": 2.809373378753662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101509, + "balance_loss_mlp": 1.07424605, + "epoch": 0.6906502500961909, + "flos": 585040527360.0, + "grad_norm": 0.05794959755729282, + "language_loss": 0.82868153, + "learning_rate": 0.00023065718564663012, + "loss": 0.83969659, + "num_input_tokens_seen": 297645360, + "router_z_loss_mlp": 0.27307129, + "step": 3590, + "time_per_iteration": 2.7731661796569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074688, + "balance_loss_mlp": 1.06705844, + "epoch": 0.6908426317814544, + "flos": 1587827017728.0, + "grad_norm": 0.02655452112357536, + "language_loss": 0.73911589, + "learning_rate": 0.00023039476141728011, + "loss": 0.74986279, + "num_input_tokens_seen": 297879472, + "router_z_loss_mlp": 0.07617188, + "step": 3591, + "time_per_iteration": 4.988200664520264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110259, + "balance_loss_mlp": 1.07704329, + "epoch": 0.6910350134667179, + "flos": 500780579328.0, + "grad_norm": 0.05972599436202674, + "language_loss": 0.81043237, + "learning_rate": 0.0002301324418579666, + "loss": 0.82145822, + "num_input_tokens_seen": 297950672, + "router_z_loss_mlp": 0.2557373, + "step": 3592, + "time_per_iteration": 2.6833419799804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010908, + "balance_loss_mlp": 1.0828371, + "epoch": 0.6912273951519815, + "flos": 1409194257408.0, + "grad_norm": 0.028191154698104088, + "language_loss": 0.78688473, + "learning_rate": 0.00022987022707053107, + "loss": 0.79779273, + "num_input_tokens_seen": 298171728, + "router_z_loss_mlp": 0.07958984, + "step": 3593, + "time_per_iteration": 4.760195732116699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108367, + "balance_loss_mlp": 1.08173525, + "epoch": 0.6914197768372451, + "flos": 635279625216.0, + "grad_norm": 0.065561015733832, + "language_loss": 0.809973, + "learning_rate": 0.00022960811715677415, + "loss": 0.82105672, + "num_input_tokens_seen": 298250304, + "router_z_loss_mlp": 0.26660156, + "step": 3594, + "time_per_iteration": 2.897792339324951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117, + "balance_loss_mlp": 1.08976054, + "epoch": 0.6916121585225087, + "flos": 558044246016.0, + "grad_norm": 0.0669935961338165, + "language_loss": 0.81794119, + "learning_rate": 0.00022934611221845608, + "loss": 0.82911116, + "num_input_tokens_seen": 298328000, + "router_z_loss_mlp": 0.27258301, + "step": 3595, + "time_per_iteration": 2.8457274436950684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112139, + "balance_loss_mlp": 1.08326638, + "epoch": 0.6918045402077723, + "flos": 529167748608.0, + "grad_norm": 0.05592882614281094, + "language_loss": 0.78289419, + "learning_rate": 0.00022908421235729609, + "loss": 0.79401559, + "num_input_tokens_seen": 298406832, + "router_z_loss_mlp": 0.28881836, + "step": 3596, + "time_per_iteration": 2.7383065223693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108605, + "balance_loss_mlp": 1.08028126, + "epoch": 0.6919969218930357, + "flos": 570351559680.0, + "grad_norm": 0.10609288335258749, + "language_loss": 0.85567772, + "learning_rate": 0.0002288224176749728, + "loss": 0.86676377, + "num_input_tokens_seen": 298477584, + "router_z_loss_mlp": 0.28320312, + "step": 3597, + "time_per_iteration": 2.716928720474243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102474, + "balance_loss_mlp": 1.07592607, + "epoch": 0.6921893035782993, + "flos": 683305196544.0, + "grad_norm": 0.07082611334178894, + "language_loss": 0.78666878, + "learning_rate": 0.00022856072827312385, + "loss": 0.79769349, + "num_input_tokens_seen": 298551872, + "router_z_loss_mlp": 0.26525879, + "step": 3598, + "time_per_iteration": 2.9266068935394287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102937, + "balance_loss_mlp": 1.07671118, + "epoch": 0.6923816852635629, + "flos": 546745324032.0, + "grad_norm": 0.06087087584265889, + "language_loss": 0.77196717, + "learning_rate": 0.00022829914425334598, + "loss": 0.78299654, + "num_input_tokens_seen": 298619680, + "router_z_loss_mlp": 0.26269531, + "step": 3599, + "time_per_iteration": 2.654209852218628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099143, + "balance_loss_mlp": 1.07294059, + "epoch": 0.6925740669488265, + "flos": 510036300288.0, + "grad_norm": 0.0619495663998332, + "language_loss": 0.80632389, + "learning_rate": 0.0002280376657171956, + "loss": 0.81731534, + "num_input_tokens_seen": 298690080, + "router_z_loss_mlp": 0.26245117, + "step": 3600, + "time_per_iteration": 2.699690818786621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110091, + "balance_loss_mlp": 1.07408822, + "epoch": 0.69276644863409, + "flos": 869424689664.0, + "grad_norm": 0.061197826149154644, + "language_loss": 0.76475906, + "learning_rate": 0.00022777629276618706, + "loss": 0.77576816, + "num_input_tokens_seen": 298777712, + "router_z_loss_mlp": 0.26855469, + "step": 3601, + "time_per_iteration": 3.2044432163238525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105999, + "balance_loss_mlp": 1.07805634, + "epoch": 0.6929588303193536, + "flos": 625772086272.0, + "grad_norm": 0.05964780177227117, + "language_loss": 0.77385223, + "learning_rate": 0.0002275150255017947, + "loss": 0.78491223, + "num_input_tokens_seen": 298854368, + "router_z_loss_mlp": 0.2800293, + "step": 3602, + "time_per_iteration": 2.7982289791107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073503, + "balance_loss_mlp": 1.06525421, + "epoch": 0.6931512120046172, + "flos": 1545382996992.0, + "grad_norm": 0.02252774148051873, + "language_loss": 0.75732672, + "learning_rate": 0.0002272538640254511, + "loss": 0.76806176, + "num_input_tokens_seen": 299091664, + "router_z_loss_mlp": 0.08251953, + "step": 3603, + "time_per_iteration": 5.054601192474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072901, + "balance_loss_mlp": 1.06465173, + "epoch": 0.6933435936898807, + "flos": 1448230606848.0, + "grad_norm": 0.023702106563631756, + "language_loss": 0.75127101, + "learning_rate": 0.0002269928084385487, + "loss": 0.76200008, + "num_input_tokens_seen": 299312656, + "router_z_loss_mlp": 0.08251953, + "step": 3604, + "time_per_iteration": 4.7977614402771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116592, + "balance_loss_mlp": 1.08946013, + "epoch": 0.6935359753751443, + "flos": 540896901120.0, + "grad_norm": 0.06388542496956687, + "language_loss": 0.85052603, + "learning_rate": 0.0002267318588424379, + "loss": 0.86169201, + "num_input_tokens_seen": 299381136, + "router_z_loss_mlp": 0.2713623, + "step": 3605, + "time_per_iteration": 2.654792308807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110494, + "balance_loss_mlp": 1.08425605, + "epoch": 0.6937283570604078, + "flos": 719396411904.0, + "grad_norm": 0.06333584007687255, + "language_loss": 0.87824345, + "learning_rate": 0.00022647101533842845, + "loss": 0.88934839, + "num_input_tokens_seen": 299455216, + "router_z_loss_mlp": 0.26257324, + "step": 3606, + "time_per_iteration": 2.8975396156311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109552, + "balance_loss_mlp": 1.08295608, + "epoch": 0.6939207387456714, + "flos": 522165574656.0, + "grad_norm": 0.1091990827020025, + "language_loss": 0.76831424, + "learning_rate": 0.00022621027802778872, + "loss": 0.77940977, + "num_input_tokens_seen": 299524352, + "router_z_loss_mlp": 0.26623535, + "step": 3607, + "time_per_iteration": 2.63248348236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108534, + "balance_loss_mlp": 1.08149719, + "epoch": 0.694113120430935, + "flos": 535359767040.0, + "grad_norm": 0.059104440190076296, + "language_loss": 0.78716248, + "learning_rate": 0.00022594964701174586, + "loss": 0.79824781, + "num_input_tokens_seen": 299594960, + "router_z_loss_mlp": 0.27075195, + "step": 3608, + "time_per_iteration": 2.681976079940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111794, + "balance_loss_mlp": 1.08559155, + "epoch": 0.6943055021161986, + "flos": 523358972928.0, + "grad_norm": 0.07590462116844392, + "language_loss": 0.84867048, + "learning_rate": 0.00022568912239148586, + "loss": 0.85978842, + "num_input_tokens_seen": 299662560, + "router_z_loss_mlp": 0.26245117, + "step": 3609, + "time_per_iteration": 2.6417384147644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101276, + "balance_loss_mlp": 1.07528806, + "epoch": 0.694497883801462, + "flos": 484902982656.0, + "grad_norm": 0.058005826874071686, + "language_loss": 0.81464773, + "learning_rate": 0.00022542870426815344, + "loss": 0.82566053, + "num_input_tokens_seen": 299734896, + "router_z_loss_mlp": 0.26000977, + "step": 3610, + "time_per_iteration": 2.7006101608276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109994, + "balance_loss_mlp": 1.08157444, + "epoch": 0.6946902654867256, + "flos": 461474786304.0, + "grad_norm": 0.056828094701861585, + "language_loss": 0.86496603, + "learning_rate": 0.00022516839274285173, + "loss": 0.87606597, + "num_input_tokens_seen": 299799424, + "router_z_loss_mlp": 0.28442383, + "step": 3611, + "time_per_iteration": 2.5740535259246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094552, + "balance_loss_mlp": 1.06758666, + "epoch": 0.6948826471719892, + "flos": 512855525376.0, + "grad_norm": 0.08027595543675893, + "language_loss": 0.74892008, + "learning_rate": 0.00022490818791664265, + "loss": 0.75986564, + "num_input_tokens_seen": 299868272, + "router_z_loss_mlp": 0.26977539, + "step": 3612, + "time_per_iteration": 2.608222007751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098839, + "balance_loss_mlp": 1.072685, + "epoch": 0.6950750288572528, + "flos": 557184531456.0, + "grad_norm": 0.059039400605863955, + "language_loss": 0.85845947, + "learning_rate": 0.00022464808989054676, + "loss": 0.86944789, + "num_input_tokens_seen": 299939136, + "router_z_loss_mlp": 0.26171875, + "step": 3613, + "time_per_iteration": 2.676614999771118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108263, + "balance_loss_mlp": 1.08036768, + "epoch": 0.6952674105425164, + "flos": 542475740160.0, + "grad_norm": 0.062091067173502004, + "language_loss": 0.76033241, + "learning_rate": 0.00022438809876554284, + "loss": 0.77141511, + "num_input_tokens_seen": 300009472, + "router_z_loss_mlp": 0.27905273, + "step": 3614, + "time_per_iteration": 2.6178860664367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104393, + "balance_loss_mlp": 1.07635498, + "epoch": 0.6954597922277799, + "flos": 546742752768.0, + "grad_norm": 0.07239671718654846, + "language_loss": 0.80618018, + "learning_rate": 0.00022412821464256873, + "loss": 0.81722414, + "num_input_tokens_seen": 300081008, + "router_z_loss_mlp": 0.28051758, + "step": 3615, + "time_per_iteration": 2.690284252166748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094366, + "balance_loss_mlp": 1.06802058, + "epoch": 0.6956521739130435, + "flos": 519511905792.0, + "grad_norm": 0.05319621307951733, + "language_loss": 0.82896477, + "learning_rate": 0.00022386843762252023, + "loss": 0.83990836, + "num_input_tokens_seen": 300149856, + "router_z_loss_mlp": 0.2635498, + "step": 3616, + "time_per_iteration": 2.600942611694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102453, + "balance_loss_mlp": 1.07486832, + "epoch": 0.695844555598307, + "flos": 466275543552.0, + "grad_norm": 0.06580678033513349, + "language_loss": 0.79979908, + "learning_rate": 0.00022360876780625193, + "loss": 0.81082356, + "num_input_tokens_seen": 300217344, + "router_z_loss_mlp": 0.27587891, + "step": 3617, + "time_per_iteration": 2.645925998687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095024, + "balance_loss_mlp": 1.06762934, + "epoch": 0.6960369372835706, + "flos": 600663361536.0, + "grad_norm": 0.0499393728898112, + "language_loss": 0.8003695, + "learning_rate": 0.00022334920529457604, + "loss": 0.81131971, + "num_input_tokens_seen": 300305584, + "router_z_loss_mlp": 0.27441406, + "step": 3618, + "time_per_iteration": 2.899454116821289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089532, + "balance_loss_mlp": 1.06254315, + "epoch": 0.6962293189688342, + "flos": 644233969152.0, + "grad_norm": 0.05309035379190974, + "language_loss": 0.87337005, + "learning_rate": 0.00022308975018826423, + "loss": 0.88426542, + "num_input_tokens_seen": 300386480, + "router_z_loss_mlp": 0.27026367, + "step": 3619, + "time_per_iteration": 2.912917375564575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095664, + "balance_loss_mlp": 1.06719649, + "epoch": 0.6964217006540977, + "flos": 638810634240.0, + "grad_norm": 0.06796578820751965, + "language_loss": 0.84640574, + "learning_rate": 0.00022283040258804564, + "loss": 0.85736233, + "num_input_tokens_seen": 300461840, + "router_z_loss_mlp": 0.28466797, + "step": 3620, + "time_per_iteration": 2.8118083477020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094233, + "balance_loss_mlp": 1.06671929, + "epoch": 0.6966140823393613, + "flos": 652167811584.0, + "grad_norm": 0.05989374057808202, + "language_loss": 0.8382861, + "learning_rate": 0.00022257116259460802, + "loss": 0.84922838, + "num_input_tokens_seen": 300540400, + "router_z_loss_mlp": 0.27539062, + "step": 3621, + "time_per_iteration": 2.8895604610443115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087446, + "balance_loss_mlp": 1.06112456, + "epoch": 0.6968064640246249, + "flos": 704492328960.0, + "grad_norm": 0.08406713908768157, + "language_loss": 0.81423789, + "learning_rate": 0.00022231203030859725, + "loss": 0.82511234, + "num_input_tokens_seen": 300624240, + "router_z_loss_mlp": 0.26367188, + "step": 3622, + "time_per_iteration": 2.971266269683838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094655, + "balance_loss_mlp": 1.06714153, + "epoch": 0.6969988457098885, + "flos": 492555271680.0, + "grad_norm": 0.06551084245575202, + "language_loss": 0.83678401, + "learning_rate": 0.00022205300583061737, + "loss": 0.84773052, + "num_input_tokens_seen": 300689728, + "router_z_loss_mlp": 0.27539062, + "step": 3623, + "time_per_iteration": 2.585472822189331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108403, + "balance_loss_mlp": 1.07649624, + "epoch": 0.6971912273951519, + "flos": 1352592442368.0, + "grad_norm": 0.033083333186048725, + "language_loss": 0.82838202, + "learning_rate": 0.00022179408926123063, + "loss": 0.83922231, + "num_input_tokens_seen": 300913152, + "router_z_loss_mlp": 0.07519531, + "step": 3624, + "time_per_iteration": 4.895202159881592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090164, + "balance_loss_mlp": 1.06300831, + "epoch": 0.6973836090804155, + "flos": 602459887104.0, + "grad_norm": 0.0660307641542608, + "language_loss": 0.77727789, + "learning_rate": 0.00022153528070095735, + "loss": 0.78817952, + "num_input_tokens_seen": 300985824, + "router_z_loss_mlp": 0.27197266, + "step": 3625, + "time_per_iteration": 2.701016902923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085494, + "balance_loss_mlp": 1.05917203, + "epoch": 0.6975759907656791, + "flos": 524065614336.0, + "grad_norm": 0.07343943993793525, + "language_loss": 0.88176632, + "learning_rate": 0.00022127658025027568, + "loss": 0.89262128, + "num_input_tokens_seen": 301058048, + "router_z_loss_mlp": 0.26330566, + "step": 3626, + "time_per_iteration": 2.66186261177063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087139, + "balance_loss_mlp": 1.0592438, + "epoch": 0.6977683724509427, + "flos": 480912754176.0, + "grad_norm": 0.05867128849985362, + "language_loss": 0.85380179, + "learning_rate": 0.00022101798800962258, + "loss": 0.86467314, + "num_input_tokens_seen": 301127472, + "router_z_loss_mlp": 0.27905273, + "step": 3627, + "time_per_iteration": 2.61289119720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088464, + "balance_loss_mlp": 1.06195211, + "epoch": 0.6979607541362063, + "flos": 522625167360.0, + "grad_norm": 0.06919874176804652, + "language_loss": 0.78915298, + "learning_rate": 0.00022075950407939227, + "loss": 0.80003762, + "num_input_tokens_seen": 301193920, + "router_z_loss_mlp": 0.26550293, + "step": 3628, + "time_per_iteration": 2.6066434383392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082826, + "balance_loss_mlp": 1.05665994, + "epoch": 0.6981531358214698, + "flos": 548077114368.0, + "grad_norm": 0.06455342757001964, + "language_loss": 0.83102697, + "learning_rate": 0.0002205011285599367, + "loss": 0.84185529, + "num_input_tokens_seen": 301264256, + "router_z_loss_mlp": 0.26208496, + "step": 3629, + "time_per_iteration": 2.627265691757202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084671, + "balance_loss_mlp": 1.05813527, + "epoch": 0.6983455175067333, + "flos": 700052419584.0, + "grad_norm": 0.05600849207785957, + "language_loss": 0.80451405, + "learning_rate": 0.00022024286155156658, + "loss": 0.81536078, + "num_input_tokens_seen": 301337696, + "router_z_loss_mlp": 0.26586914, + "step": 3630, + "time_per_iteration": 2.8945116996765137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080924, + "balance_loss_mlp": 1.05462611, + "epoch": 0.6985378991919969, + "flos": 485078450688.0, + "grad_norm": 0.05471727557268105, + "language_loss": 0.86118478, + "learning_rate": 0.00021998470315454994, + "loss": 0.87199402, + "num_input_tokens_seen": 301407776, + "router_z_loss_mlp": 0.26306152, + "step": 3631, + "time_per_iteration": 2.711768627166748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108459, + "balance_loss_mlp": 1.05755305, + "epoch": 0.6987302808772605, + "flos": 558780622848.0, + "grad_norm": 0.05720000052164256, + "language_loss": 0.8692646, + "learning_rate": 0.00021972665346911275, + "loss": 0.8801105, + "num_input_tokens_seen": 301475120, + "router_z_loss_mlp": 0.27050781, + "step": 3632, + "time_per_iteration": 2.7766430377960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086616, + "balance_loss_mlp": 1.0609858, + "epoch": 0.698922662562524, + "flos": 483593587200.0, + "grad_norm": 0.0722224306004379, + "language_loss": 0.79897952, + "learning_rate": 0.00021946871259543877, + "loss": 0.80984569, + "num_input_tokens_seen": 301542416, + "router_z_loss_mlp": 0.2565918, + "step": 3633, + "time_per_iteration": 2.600034713745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079015, + "balance_loss_mlp": 1.05383754, + "epoch": 0.6991150442477876, + "flos": 718909655040.0, + "grad_norm": 0.0639524243068684, + "language_loss": 0.83284152, + "learning_rate": 0.00021921088063366957, + "loss": 0.84363163, + "num_input_tokens_seen": 301620672, + "router_z_loss_mlp": 0.25183105, + "step": 3634, + "time_per_iteration": 2.956197738647461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085501, + "balance_loss_mlp": 1.0596205, + "epoch": 0.6993074259330512, + "flos": 489128150016.0, + "grad_norm": 0.058476095641480985, + "language_loss": 0.81960422, + "learning_rate": 0.00021895315768390435, + "loss": 0.83045918, + "num_input_tokens_seen": 301688016, + "router_z_loss_mlp": 0.2590332, + "step": 3635, + "time_per_iteration": 2.5913336277008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083019, + "balance_loss_mlp": 1.05759156, + "epoch": 0.6994998076183148, + "flos": 718089214464.0, + "grad_norm": 0.04531341451753373, + "language_loss": 0.87785435, + "learning_rate": 0.00021869554384619999, + "loss": 0.88868463, + "num_input_tokens_seen": 301771184, + "router_z_loss_mlp": 0.25415039, + "step": 3636, + "time_per_iteration": 2.9603588581085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089766, + "balance_loss_mlp": 1.06315875, + "epoch": 0.6996921893035783, + "flos": 579016636416.0, + "grad_norm": 0.21159082474566934, + "language_loss": 0.80919135, + "learning_rate": 0.00021843803922057115, + "loss": 0.82008898, + "num_input_tokens_seen": 301844528, + "router_z_loss_mlp": 0.26660156, + "step": 3637, + "time_per_iteration": 2.708937406539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087312, + "balance_loss_mlp": 1.0621587, + "epoch": 0.6998845709888418, + "flos": 518629796352.0, + "grad_norm": 0.060159968094543256, + "language_loss": 0.82011575, + "learning_rate": 0.00021818064390698977, + "loss": 0.83098888, + "num_input_tokens_seen": 301914960, + "router_z_loss_mlp": 0.25170898, + "step": 3638, + "time_per_iteration": 2.605764389038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086471, + "balance_loss_mlp": 1.06113935, + "epoch": 0.7000769526741054, + "flos": 620951505408.0, + "grad_norm": 0.06371626432210087, + "language_loss": 0.87017298, + "learning_rate": 0.0002179233580053861, + "loss": 0.88103765, + "num_input_tokens_seen": 301986352, + "router_z_loss_mlp": 0.25354004, + "step": 3639, + "time_per_iteration": 2.7112109661102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083368, + "balance_loss_mlp": 1.0573926, + "epoch": 0.700269334359369, + "flos": 559946856960.0, + "grad_norm": 0.058687026763644914, + "language_loss": 0.86069989, + "learning_rate": 0.00021766618161564688, + "loss": 0.87153351, + "num_input_tokens_seen": 302060544, + "router_z_loss_mlp": 0.26013184, + "step": 3640, + "time_per_iteration": 2.6974241733551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082795, + "balance_loss_mlp": 1.05666459, + "epoch": 0.7004617160446326, + "flos": 483343967232.0, + "grad_norm": 0.05259786469009478, + "language_loss": 0.87277496, + "learning_rate": 0.00021740911483761677, + "loss": 0.88360298, + "num_input_tokens_seen": 302127232, + "router_z_loss_mlp": 0.26123047, + "step": 3641, + "time_per_iteration": 2.5836639404296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089745, + "balance_loss_mlp": 1.06368566, + "epoch": 0.7006540977298961, + "flos": 696981003264.0, + "grad_norm": 0.04971665087061583, + "language_loss": 0.9236384, + "learning_rate": 0.00021715215777109837, + "loss": 0.93453586, + "num_input_tokens_seen": 302207056, + "router_z_loss_mlp": 0.26074219, + "step": 3642, + "time_per_iteration": 2.974407911300659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085504, + "balance_loss_mlp": 1.06024349, + "epoch": 0.7008464794151597, + "flos": 504775950336.0, + "grad_norm": 0.05973771415141703, + "language_loss": 0.84664541, + "learning_rate": 0.00021689531051585103, + "loss": 0.85750043, + "num_input_tokens_seen": 302275632, + "router_z_loss_mlp": 0.25280762, + "step": 3643, + "time_per_iteration": 2.577305316925049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089117, + "balance_loss_mlp": 1.06186557, + "epoch": 0.7010388611004232, + "flos": 537242554368.0, + "grad_norm": 0.062367103447564735, + "language_loss": 0.80804634, + "learning_rate": 0.00021663857317159196, + "loss": 0.81893754, + "num_input_tokens_seen": 302343600, + "router_z_loss_mlp": 0.27294922, + "step": 3644, + "time_per_iteration": 2.640782356262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085412, + "balance_loss_mlp": 1.05996037, + "epoch": 0.7012312427856868, + "flos": 547259245056.0, + "grad_norm": 0.10933947779444686, + "language_loss": 0.82007676, + "learning_rate": 0.00021638194583799487, + "loss": 0.83093089, + "num_input_tokens_seen": 302414656, + "router_z_loss_mlp": 0.25476074, + "step": 3645, + "time_per_iteration": 2.660571813583374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080249, + "balance_loss_mlp": 1.05334401, + "epoch": 0.7014236244709504, + "flos": 941409630720.0, + "grad_norm": 0.0653990594073395, + "language_loss": 0.82918119, + "learning_rate": 0.00021612542861469176, + "loss": 0.83998358, + "num_input_tokens_seen": 302495120, + "router_z_loss_mlp": 0.26916504, + "step": 3646, + "time_per_iteration": 3.1750996112823486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082853, + "balance_loss_mlp": 1.05595946, + "epoch": 0.7016160061562139, + "flos": 525167608320.0, + "grad_norm": 0.060469177257194674, + "language_loss": 0.82402915, + "learning_rate": 0.00021586902160127135, + "loss": 0.8348577, + "num_input_tokens_seen": 302563024, + "router_z_loss_mlp": 0.26928711, + "step": 3647, + "time_per_iteration": 2.60231614112854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083469, + "balance_loss_mlp": 1.05743361, + "epoch": 0.7018083878414775, + "flos": 373385023488.0, + "grad_norm": 0.10102975915851765, + "language_loss": 0.74238408, + "learning_rate": 0.00021561272489727974, + "loss": 0.75321877, + "num_input_tokens_seen": 302624544, + "router_z_loss_mlp": 0.26062012, + "step": 3648, + "time_per_iteration": 2.455183744430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083725, + "balance_loss_mlp": 1.0581665, + "epoch": 0.7020007695267411, + "flos": 527784201216.0, + "grad_norm": 0.05896874636911686, + "language_loss": 0.80454385, + "learning_rate": 0.0002153565386022199, + "loss": 0.81538105, + "num_input_tokens_seen": 302697856, + "router_z_loss_mlp": 0.25585938, + "step": 3649, + "time_per_iteration": 2.6365654468536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090223, + "balance_loss_mlp": 1.0643425, + "epoch": 0.7021931512120047, + "flos": 690154297344.0, + "grad_norm": 0.0684708856776036, + "language_loss": 0.82569027, + "learning_rate": 0.00021510046281555262, + "loss": 0.83659256, + "num_input_tokens_seen": 302771984, + "router_z_loss_mlp": 0.25915527, + "step": 3650, + "time_per_iteration": 2.8082711696624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088794, + "balance_loss_mlp": 1.06316423, + "epoch": 0.7023855328972681, + "flos": 639784147968.0, + "grad_norm": 0.06759336316034399, + "language_loss": 0.81458813, + "learning_rate": 0.0002148444976366949, + "loss": 0.82547605, + "num_input_tokens_seen": 302838832, + "router_z_loss_mlp": 0.2565918, + "step": 3651, + "time_per_iteration": 2.753706455230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086485, + "balance_loss_mlp": 1.06129622, + "epoch": 0.7025779145825317, + "flos": 560940194304.0, + "grad_norm": 0.05344717871766575, + "language_loss": 0.82698804, + "learning_rate": 0.00021458864316502136, + "loss": 0.8378529, + "num_input_tokens_seen": 302909952, + "router_z_loss_mlp": 0.25183105, + "step": 3652, + "time_per_iteration": 2.737903594970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086912, + "balance_loss_mlp": 1.06264138, + "epoch": 0.7027702962677953, + "flos": 447445472256.0, + "grad_norm": 0.05962835254673255, + "language_loss": 0.87223494, + "learning_rate": 0.0002143328994998634, + "loss": 0.88310409, + "num_input_tokens_seen": 302973056, + "router_z_loss_mlp": 0.24267578, + "step": 3653, + "time_per_iteration": 2.504406213760376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089816, + "balance_loss_mlp": 1.06336296, + "epoch": 0.7029626779530589, + "flos": 622500609024.0, + "grad_norm": 0.060478723540627326, + "language_loss": 0.78619695, + "learning_rate": 0.00021407726674050982, + "loss": 0.79709506, + "num_input_tokens_seen": 303054656, + "router_z_loss_mlp": 0.26477051, + "step": 3654, + "time_per_iteration": 2.8486123085021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094096, + "balance_loss_mlp": 1.06856155, + "epoch": 0.7031550596383225, + "flos": 629591989248.0, + "grad_norm": 0.050916885962277426, + "language_loss": 0.87187326, + "learning_rate": 0.0002138217449862061, + "loss": 0.88281423, + "num_input_tokens_seen": 303124256, + "router_z_loss_mlp": 0.25549316, + "step": 3655, + "time_per_iteration": 2.7588388919830322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108901, + "balance_loss_mlp": 1.06326032, + "epoch": 0.703347441323586, + "flos": 530843134464.0, + "grad_norm": 0.05276360585412431, + "language_loss": 0.78396368, + "learning_rate": 0.00021356633433615403, + "loss": 0.79485381, + "num_input_tokens_seen": 303192720, + "router_z_loss_mlp": 0.25744629, + "step": 3656, + "time_per_iteration": 2.6218318939208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079447, + "balance_loss_mlp": 1.05436552, + "epoch": 0.7035398230088495, + "flos": 693593528832.0, + "grad_norm": 0.048722851637787626, + "language_loss": 0.83386952, + "learning_rate": 0.0002133110348895133, + "loss": 0.84466398, + "num_input_tokens_seen": 303275968, + "router_z_loss_mlp": 0.25061035, + "step": 3657, + "time_per_iteration": 2.9466397762298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086119, + "balance_loss_mlp": 1.06054902, + "epoch": 0.7037322046941131, + "flos": 968035152384.0, + "grad_norm": 0.10765454833188913, + "language_loss": 0.85102618, + "learning_rate": 0.0002130558467453999, + "loss": 0.86188745, + "num_input_tokens_seen": 303367296, + "router_z_loss_mlp": 0.25585938, + "step": 3658, + "time_per_iteration": 3.3578195571899414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085624, + "balance_loss_mlp": 1.05956531, + "epoch": 0.7039245863793767, + "flos": 502863427584.0, + "grad_norm": 0.06250625204972131, + "language_loss": 0.84476495, + "learning_rate": 0.0002128007700028865, + "loss": 0.85562122, + "num_input_tokens_seen": 303442768, + "router_z_loss_mlp": 0.26086426, + "step": 3659, + "time_per_iteration": 2.716048002243042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079667, + "balance_loss_mlp": 1.05420375, + "epoch": 0.7041169680646402, + "flos": 465954342912.0, + "grad_norm": 0.07665519307089459, + "language_loss": 0.845348, + "learning_rate": 0.00021254580476100276, + "loss": 0.85614467, + "num_input_tokens_seen": 303508304, + "router_z_loss_mlp": 0.25476074, + "step": 3660, + "time_per_iteration": 2.5458219051361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082749, + "balance_loss_mlp": 1.05685711, + "epoch": 0.7043093497499038, + "flos": 632181417984.0, + "grad_norm": 0.058748946938806695, + "language_loss": 0.7943238, + "learning_rate": 0.00021229095111873497, + "loss": 0.80515134, + "num_input_tokens_seen": 303579312, + "router_z_loss_mlp": 0.25927734, + "step": 3661, + "time_per_iteration": 2.775683641433716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078944, + "balance_loss_mlp": 1.05252695, + "epoch": 0.7045017314351674, + "flos": 542930190336.0, + "grad_norm": 0.051479556836423725, + "language_loss": 0.86013281, + "learning_rate": 0.0002120362091750261, + "loss": 0.87092221, + "num_input_tokens_seen": 303658384, + "router_z_loss_mlp": 0.26452637, + "step": 3662, + "time_per_iteration": 2.835092782974243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076634, + "balance_loss_mlp": 1.04978824, + "epoch": 0.704694113120431, + "flos": 428237300736.0, + "grad_norm": 0.060876931500520017, + "language_loss": 0.86844277, + "learning_rate": 0.00021178157902877566, + "loss": 0.87920904, + "num_input_tokens_seen": 303721136, + "router_z_loss_mlp": 0.26879883, + "step": 3663, + "time_per_iteration": 2.440558910369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081373, + "balance_loss_mlp": 1.0555284, + "epoch": 0.7048864948056945, + "flos": 650544556032.0, + "grad_norm": 0.061135120384029226, + "language_loss": 0.87179941, + "learning_rate": 0.0002115270607788397, + "loss": 0.88261312, + "num_input_tokens_seen": 303792368, + "router_z_loss_mlp": 0.25866699, + "step": 3664, + "time_per_iteration": 2.7565457820892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107708, + "balance_loss_mlp": 1.05143833, + "epoch": 0.705078876490958, + "flos": 412562336256.0, + "grad_norm": 0.0582225162514945, + "language_loss": 0.85968196, + "learning_rate": 0.00021127265452403133, + "loss": 0.87045276, + "num_input_tokens_seen": 303856336, + "router_z_loss_mlp": 0.25671387, + "step": 3665, + "time_per_iteration": 2.545664072036743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032205, + "balance_loss_mlp": 1.02552938, + "epoch": 0.7052712581762216, + "flos": 1420040927232.0, + "grad_norm": 0.013425187729100906, + "language_loss": 0.84091628, + "learning_rate": 0.0002110183603631199, + "loss": 0.85123837, + "num_input_tokens_seen": 304089856, + "router_z_loss_mlp": 0.06689453, + "step": 3666, + "time_per_iteration": 4.894615888595581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076922, + "balance_loss_mlp": 1.04990888, + "epoch": 0.7054636398614852, + "flos": 493049369088.0, + "grad_norm": 0.05971260757424555, + "language_loss": 0.82980728, + "learning_rate": 0.00021076417839483065, + "loss": 0.84057647, + "num_input_tokens_seen": 304164752, + "router_z_loss_mlp": 0.27026367, + "step": 3667, + "time_per_iteration": 2.776766300201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107347, + "balance_loss_mlp": 1.04667187, + "epoch": 0.7056560215467488, + "flos": 450457417728.0, + "grad_norm": 0.06375812283048922, + "language_loss": 0.8522588, + "learning_rate": 0.00021051010871784589, + "loss": 0.86299354, + "num_input_tokens_seen": 304229568, + "router_z_loss_mlp": 0.26855469, + "step": 3668, + "time_per_iteration": 2.5415139198303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069942, + "balance_loss_mlp": 1.04501557, + "epoch": 0.7058484032320124, + "flos": 565703875584.0, + "grad_norm": 0.055214127492262476, + "language_loss": 0.79052877, + "learning_rate": 0.0002102561514308045, + "loss": 0.80122823, + "num_input_tokens_seen": 304299408, + "router_z_loss_mlp": 0.24926758, + "step": 3669, + "time_per_iteration": 2.716742753982544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072356, + "balance_loss_mlp": 1.04667854, + "epoch": 0.7060407849172758, + "flos": 567008501760.0, + "grad_norm": 0.07306534316954115, + "language_loss": 0.82677996, + "learning_rate": 0.00021000230663230135, + "loss": 0.83750349, + "num_input_tokens_seen": 304367936, + "router_z_loss_mlp": 0.25708008, + "step": 3670, + "time_per_iteration": 2.6818981170654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074835, + "balance_loss_mlp": 1.04937172, + "epoch": 0.7062331666025394, + "flos": 468746403840.0, + "grad_norm": 0.06539460490463701, + "language_loss": 0.83441806, + "learning_rate": 0.00020974857442088762, + "loss": 0.84516644, + "num_input_tokens_seen": 304438368, + "router_z_loss_mlp": 0.25476074, + "step": 3671, + "time_per_iteration": 2.608067512512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075045, + "balance_loss_mlp": 1.04928422, + "epoch": 0.706425548287803, + "flos": 595316749824.0, + "grad_norm": 0.05848649704443167, + "language_loss": 0.88856924, + "learning_rate": 0.00020949495489507104, + "loss": 0.89931971, + "num_input_tokens_seen": 304508720, + "router_z_loss_mlp": 0.25769043, + "step": 3672, + "time_per_iteration": 2.6813790798187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076712, + "balance_loss_mlp": 1.050367, + "epoch": 0.7066179299730666, + "flos": 475815389184.0, + "grad_norm": 0.06054837689365347, + "language_loss": 0.84767634, + "learning_rate": 0.00020924144815331525, + "loss": 0.8584435, + "num_input_tokens_seen": 304576128, + "router_z_loss_mlp": 0.26367188, + "step": 3673, + "time_per_iteration": 2.542840003967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076685, + "balance_loss_mlp": 1.05078053, + "epoch": 0.7068103116583301, + "flos": 506409117696.0, + "grad_norm": 0.05390499311408587, + "language_loss": 0.83514738, + "learning_rate": 0.00020898805429404044, + "loss": 0.84591424, + "num_input_tokens_seen": 304642416, + "router_z_loss_mlp": 0.25927734, + "step": 3674, + "time_per_iteration": 2.6225385665893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079471, + "balance_loss_mlp": 1.05372167, + "epoch": 0.7070026933435937, + "flos": 679336989696.0, + "grad_norm": 0.06276037819785552, + "language_loss": 0.78933322, + "learning_rate": 0.0002087347734156228, + "loss": 0.80012792, + "num_input_tokens_seen": 304719312, + "router_z_loss_mlp": 0.2578125, + "step": 3675, + "time_per_iteration": 2.855715751647949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078668, + "balance_loss_mlp": 1.05318117, + "epoch": 0.7071950750288573, + "flos": 472217942016.0, + "grad_norm": 0.06320503796682253, + "language_loss": 0.79648715, + "learning_rate": 0.00020848160561639452, + "loss": 0.80727386, + "num_input_tokens_seen": 304789296, + "router_z_loss_mlp": 0.25512695, + "step": 3676, + "time_per_iteration": 2.647651433944702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079385, + "balance_loss_mlp": 1.05396986, + "epoch": 0.7073874567141208, + "flos": 473742452736.0, + "grad_norm": 0.05839132735303564, + "language_loss": 0.86102867, + "learning_rate": 0.0002082285509946445, + "loss": 0.8718226, + "num_input_tokens_seen": 304854320, + "router_z_loss_mlp": 0.25415039, + "step": 3677, + "time_per_iteration": 2.5633320808410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081369, + "balance_loss_mlp": 1.05606055, + "epoch": 0.7075798383993844, + "flos": 545877895680.0, + "grad_norm": 0.05152517094969974, + "language_loss": 0.8344785, + "learning_rate": 0.00020797560964861683, + "loss": 0.84529221, + "num_input_tokens_seen": 304932784, + "router_z_loss_mlp": 0.25341797, + "step": 3678, + "time_per_iteration": 2.7661099433898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074676, + "balance_loss_mlp": 1.05028617, + "epoch": 0.7077722200846479, + "flos": 662090526720.0, + "grad_norm": 0.06274913334452144, + "language_loss": 0.80699748, + "learning_rate": 0.0002077227816765122, + "loss": 0.81774426, + "num_input_tokens_seen": 305018080, + "router_z_loss_mlp": 0.24401855, + "step": 3679, + "time_per_iteration": 3.065239191055298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024318, + "balance_loss_mlp": 1.01730835, + "epoch": 0.7079646017699115, + "flos": 1529960223744.0, + "grad_norm": 0.014391592464441782, + "language_loss": 0.76447725, + "learning_rate": 0.0002074700671764869, + "loss": 0.77472043, + "num_input_tokens_seen": 305241216, + "router_z_loss_mlp": 0.0703125, + "step": 3680, + "time_per_iteration": 4.8172595500946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073441, + "balance_loss_mlp": 1.04958761, + "epoch": 0.7081569834551751, + "flos": 621502502400.0, + "grad_norm": 0.05034113841233223, + "language_loss": 0.79209405, + "learning_rate": 0.00020721746624665383, + "loss": 0.80282843, + "num_input_tokens_seen": 305311376, + "router_z_loss_mlp": 0.23852539, + "step": 3681, + "time_per_iteration": 2.7298145294189453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083466, + "balance_loss_mlp": 1.05822945, + "epoch": 0.7083493651404387, + "flos": 794630435328.0, + "grad_norm": 0.059799820942850454, + "language_loss": 0.80445623, + "learning_rate": 0.00020696497898508114, + "loss": 0.81529093, + "num_input_tokens_seen": 305392736, + "router_z_loss_mlp": 0.25268555, + "step": 3682, + "time_per_iteration": 2.9937915802001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075641, + "balance_loss_mlp": 1.05092919, + "epoch": 0.7085417468257021, + "flos": 813747202560.0, + "grad_norm": 0.06191150286406427, + "language_loss": 0.77959311, + "learning_rate": 0.00020671260548979316, + "loss": 0.79034948, + "num_input_tokens_seen": 305470896, + "router_z_loss_mlp": 0.24719238, + "step": 3683, + "time_per_iteration": 3.0161404609680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081595, + "balance_loss_mlp": 1.05558372, + "epoch": 0.7087341285109657, + "flos": 700566340608.0, + "grad_norm": 0.05521829943560005, + "language_loss": 0.85212427, + "learning_rate": 0.00020646034585876982, + "loss": 0.86294019, + "num_input_tokens_seen": 305547072, + "router_z_loss_mlp": 0.26037598, + "step": 3684, + "time_per_iteration": 2.8698270320892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073657, + "balance_loss_mlp": 1.04850388, + "epoch": 0.7089265101962293, + "flos": 596514917376.0, + "grad_norm": 0.04944753850163826, + "language_loss": 0.84324521, + "learning_rate": 0.00020620820018994718, + "loss": 0.85398173, + "num_input_tokens_seen": 305624512, + "router_z_loss_mlp": 0.25170898, + "step": 3685, + "time_per_iteration": 2.801947832107544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079387, + "balance_loss_mlp": 1.0536145, + "epoch": 0.7091188918814929, + "flos": 487106970624.0, + "grad_norm": 0.07519073749771547, + "language_loss": 0.83086288, + "learning_rate": 0.00020595616858121675, + "loss": 0.84165674, + "num_input_tokens_seen": 305695088, + "router_z_loss_mlp": 0.2578125, + "step": 3686, + "time_per_iteration": 2.7280051708221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070479, + "balance_loss_mlp": 1.04551697, + "epoch": 0.7093112735667565, + "flos": 600117507072.0, + "grad_norm": 0.05447903108557543, + "language_loss": 0.80602473, + "learning_rate": 0.00020570425113042586, + "loss": 0.81672955, + "num_input_tokens_seen": 305763680, + "router_z_loss_mlp": 0.24963379, + "step": 3687, + "time_per_iteration": 2.8146443367004395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080641, + "balance_loss_mlp": 1.05596519, + "epoch": 0.70950365525202, + "flos": 505830956544.0, + "grad_norm": 0.06579545138102952, + "language_loss": 0.85866553, + "learning_rate": 0.0002054524479353776, + "loss": 0.86947191, + "num_input_tokens_seen": 305835008, + "router_z_loss_mlp": 0.24682617, + "step": 3688, + "time_per_iteration": 2.6602835655212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073952, + "balance_loss_mlp": 1.04767823, + "epoch": 0.7096960369372836, + "flos": 732160747008.0, + "grad_norm": 0.07679676176766496, + "language_loss": 0.81976587, + "learning_rate": 0.00020520075909383063, + "loss": 0.83050537, + "num_input_tokens_seen": 305909072, + "router_z_loss_mlp": 0.26306152, + "step": 3689, + "time_per_iteration": 2.866727590560913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074803, + "balance_loss_mlp": 1.04981625, + "epoch": 0.7098884186225471, + "flos": 972077511168.0, + "grad_norm": 0.05660248987472117, + "language_loss": 0.81022668, + "learning_rate": 0.00020494918470349916, + "loss": 0.82097471, + "num_input_tokens_seen": 305994752, + "router_z_loss_mlp": 0.25, + "step": 3690, + "time_per_iteration": 3.272037982940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107649, + "balance_loss_mlp": 1.04971516, + "epoch": 0.7100808003078107, + "flos": 504252117504.0, + "grad_norm": 0.08247583019648676, + "language_loss": 0.85683942, + "learning_rate": 0.00020469772486205297, + "loss": 0.86760426, + "num_input_tokens_seen": 306062960, + "router_z_loss_mlp": 0.26794434, + "step": 3691, + "time_per_iteration": 2.677762269973755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079524, + "balance_loss_mlp": 1.05322635, + "epoch": 0.7102731819930742, + "flos": 540335992320.0, + "grad_norm": 0.06411942158990899, + "language_loss": 0.81443423, + "learning_rate": 0.0002044463796671177, + "loss": 0.82522947, + "num_input_tokens_seen": 306134224, + "router_z_loss_mlp": 0.26330566, + "step": 3692, + "time_per_iteration": 2.6739578247070312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077922, + "balance_loss_mlp": 1.0519464, + "epoch": 0.7104655636783378, + "flos": 620378113536.0, + "grad_norm": 0.06149610751956677, + "language_loss": 0.80325758, + "learning_rate": 0.00020419514921627408, + "loss": 0.81403679, + "num_input_tokens_seen": 306214512, + "router_z_loss_mlp": 0.2598877, + "step": 3693, + "time_per_iteration": 2.8510119915008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076383, + "balance_loss_mlp": 1.05039525, + "epoch": 0.7106579453636014, + "flos": 557322923520.0, + "grad_norm": 0.05808850805852677, + "language_loss": 0.77474564, + "learning_rate": 0.00020394403360705855, + "loss": 0.78550947, + "num_input_tokens_seen": 306283232, + "router_z_loss_mlp": 0.26025391, + "step": 3694, + "time_per_iteration": 2.719911813735962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086948, + "balance_loss_mlp": 1.06034029, + "epoch": 0.710850327048865, + "flos": 513048245760.0, + "grad_norm": 0.059410233197540796, + "language_loss": 0.87816525, + "learning_rate": 0.00020369303293696228, + "loss": 0.88903475, + "num_input_tokens_seen": 306351536, + "router_z_loss_mlp": 0.26635742, + "step": 3695, + "time_per_iteration": 2.657715082168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079685, + "balance_loss_mlp": 1.05571198, + "epoch": 0.7110427087341286, + "flos": 423619352064.0, + "grad_norm": 0.06517545508220793, + "language_loss": 0.7842719, + "learning_rate": 0.00020344214730343304, + "loss": 0.79506874, + "num_input_tokens_seen": 306419040, + "router_z_loss_mlp": 0.23962402, + "step": 3696, + "time_per_iteration": 2.6142332553863525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078178, + "balance_loss_mlp": 1.05308461, + "epoch": 0.711235090419392, + "flos": 577415402496.0, + "grad_norm": 0.05470571931894002, + "language_loss": 0.79182768, + "learning_rate": 0.00020319137680387296, + "loss": 0.80260944, + "num_input_tokens_seen": 306503248, + "router_z_loss_mlp": 0.25109863, + "step": 3697, + "time_per_iteration": 2.915419578552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107666, + "balance_loss_mlp": 1.05055368, + "epoch": 0.7114274721046556, + "flos": 448060709376.0, + "grad_norm": 0.06661588329403122, + "language_loss": 0.80553949, + "learning_rate": 0.0002029407215356398, + "loss": 0.81630599, + "num_input_tokens_seen": 306566288, + "router_z_loss_mlp": 0.26123047, + "step": 3698, + "time_per_iteration": 2.5700740814208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108156, + "balance_loss_mlp": 1.05670524, + "epoch": 0.7116198537899192, + "flos": 621962095104.0, + "grad_norm": 0.06665507382105876, + "language_loss": 0.83601737, + "learning_rate": 0.00020269018159604663, + "loss": 0.84683299, + "num_input_tokens_seen": 306633344, + "router_z_loss_mlp": 0.24841309, + "step": 3699, + "time_per_iteration": 2.7208173274993896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077091, + "balance_loss_mlp": 1.05197358, + "epoch": 0.7118122354751828, + "flos": 498724895232.0, + "grad_norm": 0.05024967484992462, + "language_loss": 0.82184601, + "learning_rate": 0.00020243975708236162, + "loss": 0.83261693, + "num_input_tokens_seen": 306701328, + "router_z_loss_mlp": 0.25146484, + "step": 3700, + "time_per_iteration": 2.6433067321777344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108194, + "balance_loss_mlp": 1.05664349, + "epoch": 0.7120046171604463, + "flos": 572718532608.0, + "grad_norm": 0.07883365908247705, + "language_loss": 0.86320221, + "learning_rate": 0.00020218944809180818, + "loss": 0.87402165, + "num_input_tokens_seen": 306773168, + "router_z_loss_mlp": 0.25305176, + "step": 3701, + "time_per_iteration": 2.705932855606079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080401, + "balance_loss_mlp": 1.05541444, + "epoch": 0.7121969988457099, + "flos": 572664204288.0, + "grad_norm": 0.048190263761871716, + "language_loss": 0.84987295, + "learning_rate": 0.00020193925472156493, + "loss": 0.86067688, + "num_input_tokens_seen": 306845312, + "router_z_loss_mlp": 0.25, + "step": 3702, + "time_per_iteration": 2.6893904209136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040363, + "balance_loss_mlp": 1.03368771, + "epoch": 0.7123893805309734, + "flos": 1523429752320.0, + "grad_norm": 0.023975764530948636, + "language_loss": 0.74289167, + "learning_rate": 0.00020168917706876537, + "loss": 0.7532953, + "num_input_tokens_seen": 307079216, + "router_z_loss_mlp": 0.06689453, + "step": 3703, + "time_per_iteration": 4.881204843521118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078432, + "balance_loss_mlp": 1.05408931, + "epoch": 0.712581762216237, + "flos": 615105280512.0, + "grad_norm": 0.04896517905072385, + "language_loss": 0.83809257, + "learning_rate": 0.00020143921523049863, + "loss": 0.84887689, + "num_input_tokens_seen": 307163568, + "router_z_loss_mlp": 0.24316406, + "step": 3704, + "time_per_iteration": 2.9580681324005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075128, + "balance_loss_mlp": 1.04962897, + "epoch": 0.7127741439015006, + "flos": 597777698304.0, + "grad_norm": 0.05872916530123236, + "language_loss": 0.84084362, + "learning_rate": 0.00020118936930380837, + "loss": 0.85159492, + "num_input_tokens_seen": 307232800, + "router_z_loss_mlp": 0.25512695, + "step": 3705, + "time_per_iteration": 2.76068377494812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078368, + "balance_loss_mlp": 1.05290496, + "epoch": 0.7129665255867641, + "flos": 537398198784.0, + "grad_norm": 0.05789936228630773, + "language_loss": 0.81465518, + "learning_rate": 0.0002009396393856932, + "loss": 0.82543886, + "num_input_tokens_seen": 307307216, + "router_z_loss_mlp": 0.25463867, + "step": 3706, + "time_per_iteration": 2.664915084838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074744, + "balance_loss_mlp": 1.04923296, + "epoch": 0.7131589072720277, + "flos": 526442499072.0, + "grad_norm": 0.06297371189153962, + "language_loss": 0.8270002, + "learning_rate": 0.00020069002557310673, + "loss": 0.83774769, + "num_input_tokens_seen": 307377472, + "router_z_loss_mlp": 0.25512695, + "step": 3707, + "time_per_iteration": 2.658581495285034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075214, + "balance_loss_mlp": 1.04976273, + "epoch": 0.7133512889572913, + "flos": 530919484416.0, + "grad_norm": 0.06876092007107866, + "language_loss": 0.77463377, + "learning_rate": 0.00020044052796295807, + "loss": 0.78538585, + "num_input_tokens_seen": 307456880, + "router_z_loss_mlp": 0.25476074, + "step": 3708, + "time_per_iteration": 2.7701447010040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073382, + "balance_loss_mlp": 1.04729891, + "epoch": 0.7135436706425549, + "flos": 503535564288.0, + "grad_norm": 0.058576923733569305, + "language_loss": 0.82293993, + "learning_rate": 0.00020019114665211063, + "loss": 0.83367372, + "num_input_tokens_seen": 307524784, + "router_z_loss_mlp": 0.2611084, + "step": 3709, + "time_per_iteration": 2.584200143814087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071543, + "balance_loss_mlp": 1.04671192, + "epoch": 0.7137360523278183, + "flos": 515968786944.0, + "grad_norm": 0.05922999044905372, + "language_loss": 0.81765306, + "learning_rate": 0.00019994188173738276, + "loss": 0.82836854, + "num_input_tokens_seen": 307591408, + "router_z_loss_mlp": 0.24829102, + "step": 3710, + "time_per_iteration": 2.551407814025879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072134, + "balance_loss_mlp": 1.04628921, + "epoch": 0.7139284340130819, + "flos": 510389434368.0, + "grad_norm": 0.06343816758833129, + "language_loss": 0.80772817, + "learning_rate": 0.0001996927333155477, + "loss": 0.8184495, + "num_input_tokens_seen": 307662912, + "router_z_loss_mlp": 0.25878906, + "step": 3711, + "time_per_iteration": 2.748868227005005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075658, + "balance_loss_mlp": 1.04955149, + "epoch": 0.7141208156983455, + "flos": 890275940352.0, + "grad_norm": 0.06552359252627656, + "language_loss": 0.8595196, + "learning_rate": 0.00019944370148333346, + "loss": 0.87027609, + "num_input_tokens_seen": 307752256, + "router_z_loss_mlp": 0.26123047, + "step": 3712, + "time_per_iteration": 3.166109800338745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072351, + "balance_loss_mlp": 1.04660141, + "epoch": 0.7143131973836091, + "flos": 535779712512.0, + "grad_norm": 0.05387618778038521, + "language_loss": 0.80135339, + "learning_rate": 0.00019919478633742278, + "loss": 0.81207693, + "num_input_tokens_seen": 307821504, + "router_z_loss_mlp": 0.2578125, + "step": 3713, + "time_per_iteration": 2.683401107788086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075332, + "balance_loss_mlp": 1.04877234, + "epoch": 0.7145055790688727, + "flos": 473668300800.0, + "grad_norm": 0.058133564140499, + "language_loss": 0.85435075, + "learning_rate": 0.00019894598797445302, + "loss": 0.86510408, + "num_input_tokens_seen": 307886464, + "router_z_loss_mlp": 0.265625, + "step": 3714, + "time_per_iteration": 2.570040225982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074334, + "balance_loss_mlp": 1.04846525, + "epoch": 0.7146979607541362, + "flos": 570521885184.0, + "grad_norm": 0.050277092127782926, + "language_loss": 0.81853724, + "learning_rate": 0.00019869730649101615, + "loss": 0.82928061, + "num_input_tokens_seen": 307962736, + "router_z_loss_mlp": 0.25878906, + "step": 3715, + "time_per_iteration": 2.811513662338257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071824, + "balance_loss_mlp": 1.04564583, + "epoch": 0.7148903424393998, + "flos": 839666082816.0, + "grad_norm": 0.06869941272731987, + "language_loss": 0.72641587, + "learning_rate": 0.00019844874198365943, + "loss": 0.73713416, + "num_input_tokens_seen": 308046592, + "router_z_loss_mlp": 0.26220703, + "step": 3716, + "time_per_iteration": 3.1328516006469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068844, + "balance_loss_mlp": 1.04348803, + "epoch": 0.7150827241246633, + "flos": 541823427072.0, + "grad_norm": 0.061640340400288096, + "language_loss": 0.84182858, + "learning_rate": 0.00019820029454888362, + "loss": 0.85251707, + "num_input_tokens_seen": 308119920, + "router_z_loss_mlp": 0.25378418, + "step": 3717, + "time_per_iteration": 2.7154488563537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014076, + "balance_loss_mlp": 1.00725687, + "epoch": 0.7152751058099269, + "flos": 1583678200320.0, + "grad_norm": 0.019699659470436708, + "language_loss": 0.74521267, + "learning_rate": 0.00019795196428314455, + "loss": 0.75535345, + "num_input_tokens_seen": 308361024, + "router_z_loss_mlp": 0.06835938, + "step": 3718, + "time_per_iteration": 5.046099424362183 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072793, + "balance_loss_mlp": 1.04632878, + "epoch": 0.7154674874951905, + "flos": 517419145728.0, + "grad_norm": 0.06720182925313008, + "language_loss": 0.80157018, + "learning_rate": 0.0001977037512828529, + "loss": 0.81229812, + "num_input_tokens_seen": 308429808, + "router_z_loss_mlp": 0.26489258, + "step": 3719, + "time_per_iteration": 2.5823724269866943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067552, + "balance_loss_mlp": 1.04183865, + "epoch": 0.715659869180454, + "flos": 602524127232.0, + "grad_norm": 0.06101106638891309, + "language_loss": 0.86410248, + "learning_rate": 0.0001974556556443734, + "loss": 0.87477803, + "num_input_tokens_seen": 308501888, + "router_z_loss_mlp": 0.25708008, + "step": 3720, + "time_per_iteration": 2.6981611251831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069959, + "balance_loss_mlp": 1.04529428, + "epoch": 0.7158522508657176, + "flos": 531675684864.0, + "grad_norm": 0.05855660874159423, + "language_loss": 0.88533628, + "learning_rate": 0.00019720767746402547, + "loss": 0.89603585, + "num_input_tokens_seen": 308576368, + "router_z_loss_mlp": 0.24658203, + "step": 3721, + "time_per_iteration": 2.7615206241607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071162, + "balance_loss_mlp": 1.04597294, + "epoch": 0.7160446325509812, + "flos": 557569972224.0, + "grad_norm": 0.062366353751096386, + "language_loss": 0.8018384, + "learning_rate": 0.00019695981683808222, + "loss": 0.81254995, + "num_input_tokens_seen": 308651936, + "router_z_loss_mlp": 0.2520752, + "step": 3722, + "time_per_iteration": 2.7723004817962646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079066, + "balance_loss_mlp": 1.05452061, + "epoch": 0.7162370142362448, + "flos": 690986847744.0, + "grad_norm": 0.061040751408566865, + "language_loss": 0.85407031, + "learning_rate": 0.00019671207386277225, + "loss": 0.86486095, + "num_input_tokens_seen": 308737264, + "router_z_loss_mlp": 0.24536133, + "step": 3723, + "time_per_iteration": 2.929828643798828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074994, + "balance_loss_mlp": 1.0494113, + "epoch": 0.7164293959215082, + "flos": 794109173760.0, + "grad_norm": 0.060904147533300125, + "language_loss": 0.78436089, + "learning_rate": 0.0001964644486342777, + "loss": 0.79511088, + "num_input_tokens_seen": 308811776, + "router_z_loss_mlp": 0.25610352, + "step": 3724, + "time_per_iteration": 2.945258617401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072735, + "balance_loss_mlp": 1.04702103, + "epoch": 0.7166217776067718, + "flos": 494178527232.0, + "grad_norm": 0.06414027483355057, + "language_loss": 0.87113518, + "learning_rate": 0.00019621694124873524, + "loss": 0.88186252, + "num_input_tokens_seen": 308886704, + "router_z_loss_mlp": 0.25732422, + "step": 3725, + "time_per_iteration": 2.6636407375335693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010246, + "balance_loss_mlp": 1.00323606, + "epoch": 0.7168141592920354, + "flos": 1401060354048.0, + "grad_norm": 0.005035081365633862, + "language_loss": 0.76540077, + "learning_rate": 0.00019596955180223557, + "loss": 0.77550328, + "num_input_tokens_seen": 309113456, + "router_z_loss_mlp": 0.0703125, + "step": 3726, + "time_per_iteration": 4.901204347610474 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074333, + "balance_loss_mlp": 1.04913247, + "epoch": 0.717006540977299, + "flos": 793150341120.0, + "grad_norm": 0.05913508438980992, + "language_loss": 0.77430266, + "learning_rate": 0.00019572228039082428, + "loss": 0.78504598, + "num_input_tokens_seen": 309198768, + "router_z_loss_mlp": 0.2520752, + "step": 3727, + "time_per_iteration": 3.088613986968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078544, + "balance_loss_mlp": 1.05268764, + "epoch": 0.7171989226625626, + "flos": 554812416000.0, + "grad_norm": 0.05372970057922247, + "language_loss": 0.83879149, + "learning_rate": 0.0001954751271105002, + "loss": 0.84957701, + "num_input_tokens_seen": 309279680, + "router_z_loss_mlp": 0.25866699, + "step": 3728, + "time_per_iteration": 2.8328897953033447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079816, + "balance_loss_mlp": 1.05423403, + "epoch": 0.717391304347826, + "flos": 555914409984.0, + "grad_norm": 0.054514017613719934, + "language_loss": 0.80957007, + "learning_rate": 0.00019522809205721687, + "loss": 0.82036829, + "num_input_tokens_seen": 309359152, + "router_z_loss_mlp": 0.25598145, + "step": 3729, + "time_per_iteration": 2.763596534729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076432, + "balance_loss_mlp": 1.05167198, + "epoch": 0.7175836860330896, + "flos": 538855898112.0, + "grad_norm": 0.06077062039876485, + "language_loss": 0.82796627, + "learning_rate": 0.0001949811753268816, + "loss": 0.83873057, + "num_input_tokens_seen": 309432800, + "router_z_loss_mlp": 0.24768066, + "step": 3730, + "time_per_iteration": 2.6999707221984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107353, + "balance_loss_mlp": 1.04911554, + "epoch": 0.7177760677183532, + "flos": 515637674496.0, + "grad_norm": 0.06199825755801458, + "language_loss": 0.82858533, + "learning_rate": 0.00019473437701535634, + "loss": 0.83932066, + "num_input_tokens_seen": 309499456, + "router_z_loss_mlp": 0.2442627, + "step": 3731, + "time_per_iteration": 2.6672961711883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073343, + "balance_loss_mlp": 1.04839206, + "epoch": 0.7179684494036168, + "flos": 674719041024.0, + "grad_norm": 0.05911673909192475, + "language_loss": 0.89378715, + "learning_rate": 0.00019448769721845677, + "loss": 0.90452051, + "num_input_tokens_seen": 309571056, + "router_z_loss_mlp": 0.24975586, + "step": 3732, + "time_per_iteration": 2.8097128868103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077958, + "balance_loss_mlp": 1.0521369, + "epoch": 0.7181608310888803, + "flos": 469912637952.0, + "grad_norm": 0.0968125790866447, + "language_loss": 0.85677779, + "learning_rate": 0.00019424113603195203, + "loss": 0.86755735, + "num_input_tokens_seen": 309635040, + "router_z_loss_mlp": 0.25854492, + "step": 3733, + "time_per_iteration": 2.5098788738250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076731, + "balance_loss_mlp": 1.05124426, + "epoch": 0.7183532127741439, + "flos": 593952652800.0, + "grad_norm": 0.06289800168130656, + "language_loss": 0.80150187, + "learning_rate": 0.0001939946935515657, + "loss": 0.81226921, + "num_input_tokens_seen": 309713696, + "router_z_loss_mlp": 0.25512695, + "step": 3734, + "time_per_iteration": 2.8232650756835938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075815, + "balance_loss_mlp": 1.05049455, + "epoch": 0.7185455944594075, + "flos": 498917615616.0, + "grad_norm": 0.06894576786718996, + "language_loss": 0.80948031, + "learning_rate": 0.0001937483698729755, + "loss": 0.82023847, + "num_input_tokens_seen": 309782864, + "router_z_loss_mlp": 0.25341797, + "step": 3735, + "time_per_iteration": 2.583744525909424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086547, + "balance_loss_mlp": 1.06058323, + "epoch": 0.718737976144671, + "flos": 814933260288.0, + "grad_norm": 0.05171464240859849, + "language_loss": 0.82055521, + "learning_rate": 0.0001935021650918128, + "loss": 0.83142066, + "num_input_tokens_seen": 309867056, + "router_z_loss_mlp": 0.25976562, + "step": 3736, + "time_per_iteration": 3.018035411834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075054, + "balance_loss_mlp": 1.05029404, + "epoch": 0.7189303578299346, + "flos": 438328143360.0, + "grad_norm": 0.06470560317481229, + "language_loss": 0.87265974, + "learning_rate": 0.0001932560793036625, + "loss": 0.88341027, + "num_input_tokens_seen": 309929744, + "router_z_loss_mlp": 0.24755859, + "step": 3737, + "time_per_iteration": 2.5036935806274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080158, + "balance_loss_mlp": 1.05338335, + "epoch": 0.7191227395151981, + "flos": 549398992896.0, + "grad_norm": 0.06672658192386556, + "language_loss": 0.8673166, + "learning_rate": 0.00019301011260406382, + "loss": 0.87811816, + "num_input_tokens_seen": 309998128, + "router_z_loss_mlp": 0.26794434, + "step": 3738, + "time_per_iteration": 2.651357412338257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075221, + "balance_loss_mlp": 1.050843, + "epoch": 0.7193151212004617, + "flos": 626938320384.0, + "grad_norm": 0.054290518405139924, + "language_loss": 0.80049711, + "learning_rate": 0.00019276426508850936, + "loss": 0.81124938, + "num_input_tokens_seen": 310065472, + "router_z_loss_mlp": 0.24377441, + "step": 3739, + "time_per_iteration": 2.7231712341308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070443, + "balance_loss_mlp": 1.04517078, + "epoch": 0.7195075028857253, + "flos": 741062960640.0, + "grad_norm": 0.061140917990422254, + "language_loss": 0.80563027, + "learning_rate": 0.00019251853685244564, + "loss": 0.81633466, + "num_input_tokens_seen": 310152960, + "router_z_loss_mlp": 0.25292969, + "step": 3740, + "time_per_iteration": 3.0039608478546143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071133, + "balance_loss_mlp": 1.0455265, + "epoch": 0.7196998845709889, + "flos": 802875566592.0, + "grad_norm": 0.05993968121683736, + "language_loss": 0.80916333, + "learning_rate": 0.00019227292799127283, + "loss": 0.81987464, + "num_input_tokens_seen": 310234080, + "router_z_loss_mlp": 0.25622559, + "step": 3741, + "time_per_iteration": 3.011082172393799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073929, + "balance_loss_mlp": 1.04817998, + "epoch": 0.7198922662562524, + "flos": 925183669248.0, + "grad_norm": 0.062033255796259436, + "language_loss": 0.79226792, + "learning_rate": 0.00019202743860034454, + "loss": 0.80300719, + "num_input_tokens_seen": 310330208, + "router_z_loss_mlp": 0.25744629, + "step": 3742, + "time_per_iteration": 3.2250611782073975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071976, + "balance_loss_mlp": 1.04692984, + "epoch": 0.7200846479415159, + "flos": 580111289856.0, + "grad_norm": 0.06270566779319728, + "language_loss": 0.83965755, + "learning_rate": 0.00019178206877496873, + "loss": 0.85037732, + "num_input_tokens_seen": 310402960, + "router_z_loss_mlp": 0.25061035, + "step": 3743, + "time_per_iteration": 2.702446222305298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068486, + "balance_loss_mlp": 1.04463267, + "epoch": 0.7202770296267795, + "flos": 557695881216.0, + "grad_norm": 0.05142738510326197, + "language_loss": 0.85388875, + "learning_rate": 0.0001915368186104059, + "loss": 0.8645736, + "num_input_tokens_seen": 310479776, + "router_z_loss_mlp": 0.23840332, + "step": 3744, + "time_per_iteration": 2.737600326538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072746, + "balance_loss_mlp": 1.04818881, + "epoch": 0.7204694113120431, + "flos": 672552129024.0, + "grad_norm": 0.07812429294813375, + "language_loss": 0.80877572, + "learning_rate": 0.0001912916882018706, + "loss": 0.81950319, + "num_input_tokens_seen": 310555952, + "router_z_loss_mlp": 0.2454834, + "step": 3745, + "time_per_iteration": 2.7886669635772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072955, + "balance_loss_mlp": 1.04774189, + "epoch": 0.7206617929973067, + "flos": 799194055680.0, + "grad_norm": 0.10461054453296469, + "language_loss": 0.79336673, + "learning_rate": 0.00019104667764453125, + "loss": 0.80409628, + "num_input_tokens_seen": 310634784, + "router_z_loss_mlp": 0.2520752, + "step": 3746, + "time_per_iteration": 3.01520037651062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068478, + "balance_loss_mlp": 1.04382503, + "epoch": 0.7208541746825702, + "flos": 531898140672.0, + "grad_norm": 0.05271540811251211, + "language_loss": 0.80517203, + "learning_rate": 0.00019080178703350926, + "loss": 0.81585681, + "num_input_tokens_seen": 310703216, + "router_z_loss_mlp": 0.24658203, + "step": 3747, + "time_per_iteration": 2.6013572216033936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067775, + "balance_loss_mlp": 1.04224086, + "epoch": 0.7210465563678338, + "flos": 535139882496.0, + "grad_norm": 0.06037415597287081, + "language_loss": 0.83132112, + "learning_rate": 0.00019055701646387952, + "loss": 0.84199888, + "num_input_tokens_seen": 310776816, + "router_z_loss_mlp": 0.25549316, + "step": 3748, + "time_per_iteration": 2.641214609146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012229, + "balance_loss_mlp": 1.00474262, + "epoch": 0.7212389380530974, + "flos": 1533908606976.0, + "grad_norm": 0.010630398353693617, + "language_loss": 0.80472684, + "learning_rate": 0.00019031236603067042, + "loss": 0.81484914, + "num_input_tokens_seen": 310987056, + "router_z_loss_mlp": 0.07470703, + "step": 3749, + "time_per_iteration": 4.815402507781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066327, + "balance_loss_mlp": 1.04136407, + "epoch": 0.7214313197383609, + "flos": 461511862272.0, + "grad_norm": 0.06404376467324384, + "language_loss": 0.86850023, + "learning_rate": 0.00019006783582886368, + "loss": 0.8791635, + "num_input_tokens_seen": 311051648, + "router_z_loss_mlp": 0.24975586, + "step": 3750, + "time_per_iteration": 2.5772666931152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068921, + "balance_loss_mlp": 1.04362464, + "epoch": 0.7216237014236244, + "flos": 1037134056960.0, + "grad_norm": 0.05743356486239607, + "language_loss": 0.83082181, + "learning_rate": 0.00018982342595339437, + "loss": 0.84151101, + "num_input_tokens_seen": 311146272, + "router_z_loss_mlp": 0.25292969, + "step": 3751, + "time_per_iteration": 3.5001184940338135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074387, + "balance_loss_mlp": 1.04874492, + "epoch": 0.721816083108888, + "flos": 895951466496.0, + "grad_norm": 0.12990726200021083, + "language_loss": 0.82180882, + "learning_rate": 0.00018957913649915076, + "loss": 0.83255273, + "num_input_tokens_seen": 311223760, + "router_z_loss_mlp": 0.25646973, + "step": 3752, + "time_per_iteration": 3.160003900527954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069675, + "balance_loss_mlp": 1.0439254, + "epoch": 0.7220084647941516, + "flos": 523314556416.0, + "grad_norm": 0.06468827882865268, + "language_loss": 0.80619174, + "learning_rate": 0.00018933496756097428, + "loss": 0.81688845, + "num_input_tokens_seen": 311290336, + "router_z_loss_mlp": 0.2578125, + "step": 3753, + "time_per_iteration": 2.5997426509857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066394, + "balance_loss_mlp": 1.04083598, + "epoch": 0.7222008464794152, + "flos": 816099494400.0, + "grad_norm": 0.06037343169471402, + "language_loss": 0.81664622, + "learning_rate": 0.0001890909192336603, + "loss": 0.8273102, + "num_input_tokens_seen": 311366240, + "router_z_loss_mlp": 0.2557373, + "step": 3754, + "time_per_iteration": 3.018083095550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069023, + "balance_loss_mlp": 1.04364371, + "epoch": 0.7223932281646788, + "flos": 749053702656.0, + "grad_norm": 0.056170219084609056, + "language_loss": 0.70541704, + "learning_rate": 0.00018884699161195623, + "loss": 0.71610725, + "num_input_tokens_seen": 311445184, + "router_z_loss_mlp": 0.25390625, + "step": 3755, + "time_per_iteration": 2.947492837905884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068503, + "balance_loss_mlp": 1.04259872, + "epoch": 0.7225856098499422, + "flos": 745502870016.0, + "grad_norm": 0.08930664907788496, + "language_loss": 0.77445567, + "learning_rate": 0.00018860318479056327, + "loss": 0.78514069, + "num_input_tokens_seen": 311527280, + "router_z_loss_mlp": 0.25939941, + "step": 3756, + "time_per_iteration": 3.133481740951538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075638, + "balance_loss_mlp": 1.05084264, + "epoch": 0.7227779915352058, + "flos": 547330825728.0, + "grad_norm": 0.05236327296273719, + "language_loss": 0.83486211, + "learning_rate": 0.00018835949886413555, + "loss": 0.84561849, + "num_input_tokens_seen": 311601552, + "router_z_loss_mlp": 0.24804688, + "step": 3757, + "time_per_iteration": 2.7377569675445557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071399, + "balance_loss_mlp": 1.04592407, + "epoch": 0.7229703732204694, + "flos": 530484857856.0, + "grad_norm": 0.06766060207164688, + "language_loss": 0.79256356, + "learning_rate": 0.0001881159339272806, + "loss": 0.80327755, + "num_input_tokens_seen": 311670736, + "router_z_loss_mlp": 0.25476074, + "step": 3758, + "time_per_iteration": 2.6691012382507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106811, + "balance_loss_mlp": 1.04336238, + "epoch": 0.723162754905733, + "flos": 528355021824.0, + "grad_norm": 0.06062364795368716, + "language_loss": 0.78869492, + "learning_rate": 0.00018787249007455858, + "loss": 0.79937607, + "num_input_tokens_seen": 311736800, + "router_z_loss_mlp": 0.24731445, + "step": 3759, + "time_per_iteration": 2.628452777862549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072784, + "balance_loss_mlp": 1.04866767, + "epoch": 0.7233551365909965, + "flos": 654868468224.0, + "grad_norm": 0.05921726316721053, + "language_loss": 0.71849477, + "learning_rate": 0.00018762916740048302, + "loss": 0.7292226, + "num_input_tokens_seen": 311806064, + "router_z_loss_mlp": 0.24108887, + "step": 3760, + "time_per_iteration": 4.164097547531128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074503, + "balance_loss_mlp": 1.04969609, + "epoch": 0.7235475182762601, + "flos": 522365635584.0, + "grad_norm": 0.05859039427854228, + "language_loss": 0.85892487, + "learning_rate": 0.0001873859659995195, + "loss": 0.86966991, + "num_input_tokens_seen": 311881280, + "router_z_loss_mlp": 0.24816895, + "step": 3761, + "time_per_iteration": 2.7077507972717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076187, + "balance_loss_mlp": 1.05047345, + "epoch": 0.7237398999615237, + "flos": 609170595840.0, + "grad_norm": 0.05612829292688987, + "language_loss": 0.8333689, + "learning_rate": 0.0001871428859660878, + "loss": 0.84413075, + "num_input_tokens_seen": 311953696, + "router_z_loss_mlp": 0.25744629, + "step": 3762, + "time_per_iteration": 2.7349491119384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070635, + "balance_loss_mlp": 1.04679286, + "epoch": 0.7239322816467872, + "flos": 658987176960.0, + "grad_norm": 0.05320593884566549, + "language_loss": 0.82095098, + "learning_rate": 0.00018689992739455975, + "loss": 0.83165729, + "num_input_tokens_seen": 312032752, + "router_z_loss_mlp": 0.23828125, + "step": 3763, + "time_per_iteration": 2.9456627368927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073777, + "balance_loss_mlp": 1.04832602, + "epoch": 0.7241246633320508, + "flos": 969282878976.0, + "grad_norm": 0.05110197345931534, + "language_loss": 0.86203957, + "learning_rate": 0.00018665709037926027, + "loss": 0.87277734, + "num_input_tokens_seen": 312120800, + "router_z_loss_mlp": 0.25476074, + "step": 3764, + "time_per_iteration": 3.318403959274292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068875, + "balance_loss_mlp": 1.04516387, + "epoch": 0.7243170450173143, + "flos": 514995273216.0, + "grad_norm": 0.06311256302273614, + "language_loss": 0.85269356, + "learning_rate": 0.00018641437501446694, + "loss": 0.86338234, + "num_input_tokens_seen": 312188416, + "router_z_loss_mlp": 0.23693848, + "step": 3765, + "time_per_iteration": 2.6275501251220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077113, + "balance_loss_mlp": 1.05141139, + "epoch": 0.7245094267025779, + "flos": 559746796032.0, + "grad_norm": 0.06293710681021243, + "language_loss": 0.82769656, + "learning_rate": 0.0001861717813944104, + "loss": 0.83846772, + "num_input_tokens_seen": 312257792, + "router_z_loss_mlp": 0.25744629, + "step": 3766, + "time_per_iteration": 2.6608469486236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074023, + "balance_loss_mlp": 1.04876232, + "epoch": 0.7247018083878415, + "flos": 612642134016.0, + "grad_norm": 0.06015775700699107, + "language_loss": 0.79908741, + "learning_rate": 0.00018592930961327365, + "loss": 0.80982769, + "num_input_tokens_seen": 312328544, + "router_z_loss_mlp": 0.25280762, + "step": 3767, + "time_per_iteration": 2.7321486473083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107651, + "balance_loss_mlp": 1.05160677, + "epoch": 0.7248941900731051, + "flos": 634676871168.0, + "grad_norm": 0.056564551709211236, + "language_loss": 0.88070989, + "learning_rate": 0.00018568695976519273, + "loss": 0.89147508, + "num_input_tokens_seen": 312405888, + "router_z_loss_mlp": 0.24890137, + "step": 3768, + "time_per_iteration": 2.7732081413269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073175, + "balance_loss_mlp": 1.04744947, + "epoch": 0.7250865717583687, + "flos": 424941230592.0, + "grad_norm": 0.06484399949200302, + "language_loss": 0.80721432, + "learning_rate": 0.00018544473194425593, + "loss": 0.81794608, + "num_input_tokens_seen": 312469552, + "router_z_loss_mlp": 0.25744629, + "step": 3769, + "time_per_iteration": 2.489635467529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069994, + "balance_loss_mlp": 1.04453063, + "epoch": 0.7252789534436321, + "flos": 635114068992.0, + "grad_norm": 0.06360093923079267, + "language_loss": 0.78936434, + "learning_rate": 0.00018520262624450485, + "loss": 0.80006427, + "num_input_tokens_seen": 312548848, + "router_z_loss_mlp": 0.25488281, + "step": 3770, + "time_per_iteration": 2.874816417694092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070988, + "balance_loss_mlp": 1.04658556, + "epoch": 0.7254713351288957, + "flos": 617185930752.0, + "grad_norm": 0.05111495515347452, + "language_loss": 0.87226415, + "learning_rate": 0.00018496064275993324, + "loss": 0.88297403, + "num_input_tokens_seen": 312622016, + "router_z_loss_mlp": 0.24377441, + "step": 3771, + "time_per_iteration": 2.7426414489746094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070976, + "balance_loss_mlp": 1.04448795, + "epoch": 0.7256637168141593, + "flos": 766986983424.0, + "grad_norm": 0.06635315591168078, + "language_loss": 0.82333881, + "learning_rate": 0.00018471878158448686, + "loss": 0.83404857, + "num_input_tokens_seen": 312696960, + "router_z_loss_mlp": 0.26538086, + "step": 3772, + "time_per_iteration": 2.927983283996582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073436, + "balance_loss_mlp": 1.04748392, + "epoch": 0.7258560984994229, + "flos": 495559503360.0, + "grad_norm": 0.0478676363130983, + "language_loss": 0.84174544, + "learning_rate": 0.00018447704281206512, + "loss": 0.85247982, + "num_input_tokens_seen": 312774352, + "router_z_loss_mlp": 0.25964355, + "step": 3773, + "time_per_iteration": 2.863914966583252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068748, + "balance_loss_mlp": 1.04243803, + "epoch": 0.7260484801846864, + "flos": 530069681664.0, + "grad_norm": 0.056210264368279125, + "language_loss": 0.83150065, + "learning_rate": 0.0001842354265365191, + "loss": 0.84218812, + "num_input_tokens_seen": 312849600, + "router_z_loss_mlp": 0.26330566, + "step": 3774, + "time_per_iteration": 2.6950740814208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107373, + "balance_loss_mlp": 1.04815984, + "epoch": 0.72624086186995, + "flos": 624964128768.0, + "grad_norm": 0.08533819626854355, + "language_loss": 0.81115055, + "learning_rate": 0.0001839939328516526, + "loss": 0.82188785, + "num_input_tokens_seen": 312922688, + "router_z_loss_mlp": 0.25598145, + "step": 3775, + "time_per_iteration": 2.7223706245422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075372, + "balance_loss_mlp": 1.04981351, + "epoch": 0.7264332435552135, + "flos": 716522858496.0, + "grad_norm": 0.08605287501334834, + "language_loss": 0.81360769, + "learning_rate": 0.0001837525618512218, + "loss": 0.82436144, + "num_input_tokens_seen": 312997728, + "router_z_loss_mlp": 0.2557373, + "step": 3776, + "time_per_iteration": 2.874652624130249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067677, + "balance_loss_mlp": 1.04284596, + "epoch": 0.7266256252404771, + "flos": 681036968448.0, + "grad_norm": 0.060733174286640615, + "language_loss": 0.83042395, + "learning_rate": 0.00018351131362893519, + "loss": 0.84110069, + "num_input_tokens_seen": 313067168, + "router_z_loss_mlp": 0.24841309, + "step": 3777, + "time_per_iteration": 2.801011323928833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070149, + "balance_loss_mlp": 1.04434013, + "epoch": 0.7268180069257407, + "flos": 518906580480.0, + "grad_norm": 0.06246763883136397, + "language_loss": 0.80644751, + "learning_rate": 0.00018327018827845364, + "loss": 0.81714904, + "num_input_tokens_seen": 313134688, + "router_z_loss_mlp": 0.25842285, + "step": 3778, + "time_per_iteration": 2.5989460945129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107324, + "balance_loss_mlp": 1.04869461, + "epoch": 0.7270103886110042, + "flos": 512662804992.0, + "grad_norm": 0.05228982381822259, + "language_loss": 0.87562966, + "learning_rate": 0.00018302918589339036, + "loss": 0.88636208, + "num_input_tokens_seen": 313204816, + "router_z_loss_mlp": 0.2454834, + "step": 3779, + "time_per_iteration": 2.6237618923187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073848, + "balance_loss_mlp": 1.04871857, + "epoch": 0.7272027702962678, + "flos": 546653919744.0, + "grad_norm": 0.06453049409533262, + "language_loss": 0.90400481, + "learning_rate": 0.00018278830656731054, + "loss": 0.9147433, + "num_input_tokens_seen": 313274288, + "router_z_loss_mlp": 0.25158691, + "step": 3780, + "time_per_iteration": 2.6516594886779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069473, + "balance_loss_mlp": 1.04534531, + "epoch": 0.7273951519815314, + "flos": 593048521728.0, + "grad_norm": 0.050403453356215815, + "language_loss": 0.86580253, + "learning_rate": 0.00018254755039373222, + "loss": 0.87649727, + "num_input_tokens_seen": 313344800, + "router_z_loss_mlp": 0.24121094, + "step": 3781, + "time_per_iteration": 2.7791805267333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078472, + "balance_loss_mlp": 1.05350983, + "epoch": 0.727587533666795, + "flos": 606012917760.0, + "grad_norm": 0.06136859684084447, + "language_loss": 0.83780336, + "learning_rate": 0.0001823069174661252, + "loss": 0.84858811, + "num_input_tokens_seen": 313417840, + "router_z_loss_mlp": 0.24963379, + "step": 3782, + "time_per_iteration": 2.8298797607421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069254, + "balance_loss_mlp": 1.0445894, + "epoch": 0.7277799153520584, + "flos": 513021081600.0, + "grad_norm": 0.05448040343195996, + "language_loss": 0.78343076, + "learning_rate": 0.00018206640787791112, + "loss": 0.79412329, + "num_input_tokens_seen": 313485936, + "router_z_loss_mlp": 0.2467041, + "step": 3783, + "time_per_iteration": 2.609013795852661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071668, + "balance_loss_mlp": 1.0477066, + "epoch": 0.727972297037322, + "flos": 537756475392.0, + "grad_norm": 0.057564515393037245, + "language_loss": 0.85957235, + "learning_rate": 0.00018182602172246416, + "loss": 0.87028909, + "num_input_tokens_seen": 313553136, + "router_z_loss_mlp": 0.23974609, + "step": 3784, + "time_per_iteration": 2.6400623321533203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072013, + "balance_loss_mlp": 1.04888618, + "epoch": 0.7281646787225856, + "flos": 535038566400.0, + "grad_norm": 0.060673398412002894, + "language_loss": 0.76418436, + "learning_rate": 0.00018158575909311075, + "loss": 0.77490449, + "num_input_tokens_seen": 313620128, + "router_z_loss_mlp": 0.23132324, + "step": 3785, + "time_per_iteration": 2.64180850982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079039, + "balance_loss_mlp": 1.05404127, + "epoch": 0.7283570604078492, + "flos": 625055533056.0, + "grad_norm": 0.06019097733888483, + "language_loss": 0.8038618, + "learning_rate": 0.000181345620083129, + "loss": 0.8146522, + "num_input_tokens_seen": 313696432, + "router_z_loss_mlp": 0.24987793, + "step": 3786, + "time_per_iteration": 2.8254077434539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075124, + "balance_loss_mlp": 1.05057859, + "epoch": 0.7285494420931128, + "flos": 534173709312.0, + "grad_norm": 0.056512794901340806, + "language_loss": 0.86981964, + "learning_rate": 0.00018110560478574927, + "loss": 0.88057089, + "num_input_tokens_seen": 313768416, + "router_z_loss_mlp": 0.2454834, + "step": 3787, + "time_per_iteration": 2.6989898681640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107191, + "balance_loss_mlp": 1.04607677, + "epoch": 0.7287418237783763, + "flos": 666548061696.0, + "grad_norm": 0.0653462875447768, + "language_loss": 0.80641389, + "learning_rate": 0.0001808657132941533, + "loss": 0.81713301, + "num_input_tokens_seen": 313839888, + "router_z_loss_mlp": 0.25830078, + "step": 3788, + "time_per_iteration": 2.7848241329193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076986, + "balance_loss_mlp": 1.05143917, + "epoch": 0.7289342054636399, + "flos": 550602302976.0, + "grad_norm": 0.06505823149164586, + "language_loss": 0.8307749, + "learning_rate": 0.00018062594570147572, + "loss": 0.84154475, + "num_input_tokens_seen": 313908832, + "router_z_loss_mlp": 0.25549316, + "step": 3789, + "time_per_iteration": 2.633287191390991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070828, + "balance_loss_mlp": 1.046152, + "epoch": 0.7291265871489034, + "flos": 687923145216.0, + "grad_norm": 0.05002031972924792, + "language_loss": 0.85413891, + "learning_rate": 0.00018038630210080243, + "loss": 0.86484718, + "num_input_tokens_seen": 313982672, + "router_z_loss_mlp": 0.24658203, + "step": 3790, + "time_per_iteration": 2.866363286972046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072918, + "balance_loss_mlp": 1.04853952, + "epoch": 0.729318968834167, + "flos": 572664204288.0, + "grad_norm": 0.05805310793954541, + "language_loss": 0.85253292, + "learning_rate": 0.0001801467825851712, + "loss": 0.86326218, + "num_input_tokens_seen": 314057184, + "router_z_loss_mlp": 0.24401855, + "step": 3791, + "time_per_iteration": 2.728860378265381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071802, + "balance_loss_mlp": 1.04638696, + "epoch": 0.7295113505194305, + "flos": 586061028864.0, + "grad_norm": 0.14519807994310208, + "language_loss": 0.78306311, + "learning_rate": 0.00017990738724757172, + "loss": 0.79378116, + "num_input_tokens_seen": 314137344, + "router_z_loss_mlp": 0.25427246, + "step": 3792, + "time_per_iteration": 2.8468916416168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068198, + "balance_loss_mlp": 1.04319978, + "epoch": 0.7297037322046941, + "flos": 707185645056.0, + "grad_norm": 0.05959978185176886, + "language_loss": 0.8250258, + "learning_rate": 0.00017966811618094598, + "loss": 0.83570778, + "num_input_tokens_seen": 314214464, + "router_z_loss_mlp": 0.24987793, + "step": 3793, + "time_per_iteration": 2.909195899963379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077382, + "balance_loss_mlp": 1.05095315, + "epoch": 0.7298961138899577, + "flos": 487292350464.0, + "grad_norm": 0.06013443658312294, + "language_loss": 0.8499018, + "learning_rate": 0.00017942896947818664, + "loss": 0.86067569, + "num_input_tokens_seen": 314280432, + "router_z_loss_mlp": 0.26452637, + "step": 3794, + "time_per_iteration": 2.555316209793091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008896, + "balance_loss_mlp": 1.00121939, + "epoch": 0.7300884955752213, + "flos": 1365804260352.0, + "grad_norm": 0.014415453052224393, + "language_loss": 0.74825054, + "learning_rate": 0.000179189947232139, + "loss": 0.75833952, + "num_input_tokens_seen": 314497152, + "router_z_loss_mlp": 0.07666016, + "step": 3795, + "time_per_iteration": 4.844003200531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067336, + "balance_loss_mlp": 1.04246938, + "epoch": 0.7302808772604849, + "flos": 531806736384.0, + "grad_norm": 0.07521259733742676, + "language_loss": 0.8533113, + "learning_rate": 0.00017895104953559947, + "loss": 0.86398464, + "num_input_tokens_seen": 314565488, + "router_z_loss_mlp": 0.24865723, + "step": 3796, + "time_per_iteration": 2.5970304012298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074283, + "balance_loss_mlp": 1.04881954, + "epoch": 0.7304732589457483, + "flos": 436171143168.0, + "grad_norm": 0.082255252193866, + "language_loss": 0.8954308, + "learning_rate": 0.00017871227648131672, + "loss": 0.90617365, + "num_input_tokens_seen": 314627392, + "router_z_loss_mlp": 0.25476074, + "step": 3797, + "time_per_iteration": 2.5412604808807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066925, + "balance_loss_mlp": 1.0418191, + "epoch": 0.7306656406310119, + "flos": 451621080576.0, + "grad_norm": 0.050248722250274616, + "language_loss": 0.8297137, + "learning_rate": 0.0001784736281619907, + "loss": 0.84038293, + "num_input_tokens_seen": 314695440, + "router_z_loss_mlp": 0.25134277, + "step": 3798, + "time_per_iteration": 2.5844838619232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068914, + "balance_loss_mlp": 1.04355788, + "epoch": 0.7308580223162755, + "flos": 512010491904.0, + "grad_norm": 0.07691325106249959, + "language_loss": 0.7466501, + "learning_rate": 0.00017823510467027232, + "loss": 0.75733924, + "num_input_tokens_seen": 314772592, + "router_z_loss_mlp": 0.25341797, + "step": 3799, + "time_per_iteration": 2.777209520339966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071842, + "balance_loss_mlp": 1.04620039, + "epoch": 0.7310504040015391, + "flos": 375423455232.0, + "grad_norm": 0.08066489228669042, + "language_loss": 0.7834214, + "learning_rate": 0.00017799670609876516, + "loss": 0.79413986, + "num_input_tokens_seen": 314836192, + "router_z_loss_mlp": 0.25671387, + "step": 3800, + "time_per_iteration": 2.5069777965545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107102, + "balance_loss_mlp": 1.04590285, + "epoch": 0.7312427856868026, + "flos": 549334752768.0, + "grad_norm": 0.05293495483873373, + "language_loss": 0.88974595, + "learning_rate": 0.00017775843254002366, + "loss": 0.90045619, + "num_input_tokens_seen": 314908400, + "router_z_loss_mlp": 0.2512207, + "step": 3801, + "time_per_iteration": 2.725081443786621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077058, + "balance_loss_mlp": 1.0519762, + "epoch": 0.7314351673720662, + "flos": 767238801408.0, + "grad_norm": 0.05473119278948026, + "language_loss": 0.84161508, + "learning_rate": 0.00017752028408655367, + "loss": 0.85238564, + "num_input_tokens_seen": 314995280, + "router_z_loss_mlp": 0.25097656, + "step": 3802, + "time_per_iteration": 3.025043249130249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075279, + "balance_loss_mlp": 1.04994678, + "epoch": 0.7316275490573297, + "flos": 486734012928.0, + "grad_norm": 0.05406841313546952, + "language_loss": 0.85023701, + "learning_rate": 0.00017728226083081272, + "loss": 0.86098975, + "num_input_tokens_seen": 315063056, + "router_z_loss_mlp": 0.25354004, + "step": 3803, + "time_per_iteration": 2.556396245956421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078247, + "balance_loss_mlp": 1.05355895, + "epoch": 0.7318199307425933, + "flos": 473428592640.0, + "grad_norm": 0.06231590720725376, + "language_loss": 0.81697959, + "learning_rate": 0.00017704436286520965, + "loss": 0.82776201, + "num_input_tokens_seen": 315128896, + "router_z_loss_mlp": 0.24682617, + "step": 3804, + "time_per_iteration": 2.5290911197662354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078799, + "balance_loss_mlp": 1.05242968, + "epoch": 0.7320123124278569, + "flos": 549463233024.0, + "grad_norm": 0.06772838198197546, + "language_loss": 0.84615296, + "learning_rate": 0.0001768065902821046, + "loss": 0.85694098, + "num_input_tokens_seen": 315198464, + "router_z_loss_mlp": 0.26379395, + "step": 3805, + "time_per_iteration": 2.6657214164733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072852, + "balance_loss_mlp": 1.04691195, + "epoch": 0.7322046941131204, + "flos": 570781416960.0, + "grad_norm": 0.06439046141851584, + "language_loss": 0.82463551, + "learning_rate": 0.00017656894317380907, + "loss": 0.83536404, + "num_input_tokens_seen": 315270240, + "router_z_loss_mlp": 0.25976562, + "step": 3806, + "time_per_iteration": 2.7381749153137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008677, + "balance_loss_mlp": 1.00085652, + "epoch": 0.732397075798384, + "flos": 1469165548032.0, + "grad_norm": 0.011036115367728498, + "language_loss": 0.76031268, + "learning_rate": 0.00017633142163258565, + "loss": 0.77039945, + "num_input_tokens_seen": 315502448, + "router_z_loss_mlp": 0.078125, + "step": 3807, + "time_per_iteration": 5.021719217300415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074035, + "balance_loss_mlp": 1.04846501, + "epoch": 0.7325894574836476, + "flos": 464862260736.0, + "grad_norm": 0.059101144317775495, + "language_loss": 0.84063375, + "learning_rate": 0.00017609402575064875, + "loss": 0.85137415, + "num_input_tokens_seen": 315569472, + "router_z_loss_mlp": 0.25585938, + "step": 3808, + "time_per_iteration": 2.601905345916748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070564, + "balance_loss_mlp": 1.04569697, + "epoch": 0.7327818391689112, + "flos": 495493065216.0, + "grad_norm": 0.06287307202427123, + "language_loss": 0.81348085, + "learning_rate": 0.00017585675562016367, + "loss": 0.8241865, + "num_input_tokens_seen": 315637632, + "router_z_loss_mlp": 0.2487793, + "step": 3809, + "time_per_iteration": 2.5671656131744385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101117, + "balance_loss_mlp": 1.0035404, + "epoch": 0.7329742208541746, + "flos": 1433489508864.0, + "grad_norm": 0.009961164092808575, + "language_loss": 0.77212846, + "learning_rate": 0.0001756196113332465, + "loss": 0.78224015, + "num_input_tokens_seen": 315863648, + "router_z_loss_mlp": 0.07617188, + "step": 3810, + "time_per_iteration": 4.85601019859314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067566, + "balance_loss_mlp": 1.04141164, + "epoch": 0.7331666025394382, + "flos": 496889095680.0, + "grad_norm": 0.0717218178349286, + "language_loss": 0.85344338, + "learning_rate": 0.00017538259298196474, + "loss": 0.86411905, + "num_input_tokens_seen": 315930752, + "router_z_loss_mlp": 0.26171875, + "step": 3811, + "time_per_iteration": 2.5674660205841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066538, + "balance_loss_mlp": 1.0418613, + "epoch": 0.7333589842247018, + "flos": 538524785664.0, + "grad_norm": 0.06191722538005279, + "language_loss": 0.8221786, + "learning_rate": 0.00017514570065833745, + "loss": 0.83284396, + "num_input_tokens_seen": 316006400, + "router_z_loss_mlp": 0.24658203, + "step": 3812, + "time_per_iteration": 2.7502520084381104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065481, + "balance_loss_mlp": 1.04084063, + "epoch": 0.7335513659099654, + "flos": 491067836928.0, + "grad_norm": 0.09654235990380512, + "language_loss": 0.80427462, + "learning_rate": 0.00017490893445433426, + "loss": 0.81492949, + "num_input_tokens_seen": 316075824, + "router_z_loss_mlp": 0.24633789, + "step": 3813, + "time_per_iteration": 2.644380569458008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067064, + "balance_loss_mlp": 1.04200649, + "epoch": 0.733743747595229, + "flos": 562150844928.0, + "grad_norm": 0.05501039024116298, + "language_loss": 0.81422758, + "learning_rate": 0.00017467229446187587, + "loss": 0.82489812, + "num_input_tokens_seen": 316148336, + "router_z_loss_mlp": 0.25061035, + "step": 3814, + "time_per_iteration": 2.6799376010894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072542, + "balance_loss_mlp": 1.04665017, + "epoch": 0.7339361292804925, + "flos": 538581685248.0, + "grad_norm": 0.054283563918009155, + "language_loss": 0.81726635, + "learning_rate": 0.00017443578077283424, + "loss": 0.82799172, + "num_input_tokens_seen": 316220960, + "router_z_loss_mlp": 0.2590332, + "step": 3815, + "time_per_iteration": 2.6411497592926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072187, + "balance_loss_mlp": 1.04731965, + "epoch": 0.734128510965756, + "flos": 548469895680.0, + "grad_norm": 0.06697852947124575, + "language_loss": 0.85358864, + "learning_rate": 0.0001741993934790319, + "loss": 0.8643105, + "num_input_tokens_seen": 316295824, + "router_z_loss_mlp": 0.24853516, + "step": 3816, + "time_per_iteration": 2.813728094100952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106954, + "balance_loss_mlp": 1.04457784, + "epoch": 0.7343208926510196, + "flos": 540066548736.0, + "grad_norm": 0.07301575323096621, + "language_loss": 0.83966112, + "learning_rate": 0.00017396313267224273, + "loss": 0.85035658, + "num_input_tokens_seen": 316368064, + "router_z_loss_mlp": 0.24963379, + "step": 3817, + "time_per_iteration": 2.7044739723205566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074705, + "balance_loss_mlp": 1.04939699, + "epoch": 0.7345132743362832, + "flos": 571095277056.0, + "grad_norm": 0.05834260982052782, + "language_loss": 0.88725907, + "learning_rate": 0.0001737269984441912, + "loss": 0.89800614, + "num_input_tokens_seen": 316437440, + "router_z_loss_mlp": 0.2532959, + "step": 3818, + "time_per_iteration": 2.6479249000549316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068069, + "balance_loss_mlp": 1.04333293, + "epoch": 0.7347056560215467, + "flos": 545403621888.0, + "grad_norm": 0.04867070462384417, + "language_loss": 0.85300821, + "learning_rate": 0.00017349099088655263, + "loss": 0.86368895, + "num_input_tokens_seen": 316511936, + "router_z_loss_mlp": 0.24743652, + "step": 3819, + "time_per_iteration": 2.687084197998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068808, + "balance_loss_mlp": 1.04391694, + "epoch": 0.7348980377068103, + "flos": 595949239296.0, + "grad_norm": 0.05808726713133537, + "language_loss": 0.81269497, + "learning_rate": 0.00017325511009095375, + "loss": 0.82338297, + "num_input_tokens_seen": 316584304, + "router_z_loss_mlp": 0.24902344, + "step": 3820, + "time_per_iteration": 2.7207248210906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068942, + "balance_loss_mlp": 1.04350281, + "epoch": 0.7350904193920739, + "flos": 538554521088.0, + "grad_norm": 0.05934097649534438, + "language_loss": 0.83911049, + "learning_rate": 0.00017301935614897113, + "loss": 0.84979987, + "num_input_tokens_seen": 316659024, + "router_z_loss_mlp": 0.2545166, + "step": 3821, + "time_per_iteration": 2.6836743354797363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073907, + "balance_loss_mlp": 1.04855156, + "epoch": 0.7352828010773375, + "flos": 512981434368.0, + "grad_norm": 0.0494453398159371, + "language_loss": 0.81605434, + "learning_rate": 0.00017278372915213274, + "loss": 0.82679343, + "num_input_tokens_seen": 316732544, + "router_z_loss_mlp": 0.25378418, + "step": 3822, + "time_per_iteration": 2.651975393295288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008179, + "balance_loss_mlp": 1.00121737, + "epoch": 0.735475182762601, + "flos": 1553820848640.0, + "grad_norm": 0.007266533432635982, + "language_loss": 0.79893845, + "learning_rate": 0.00017254822919191693, + "loss": 0.80902022, + "num_input_tokens_seen": 316967104, + "router_z_loss_mlp": 0.06982422, + "step": 3823, + "time_per_iteration": 4.976882457733154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075947, + "balance_loss_mlp": 1.05096054, + "epoch": 0.7356675644478645, + "flos": 681308610048.0, + "grad_norm": 0.0894957625662193, + "language_loss": 0.80647838, + "learning_rate": 0.00017231285635975314, + "loss": 0.81723785, + "num_input_tokens_seen": 317048304, + "router_z_loss_mlp": 0.25, + "step": 3824, + "time_per_iteration": 2.8835809230804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074173, + "balance_loss_mlp": 1.04871035, + "epoch": 0.7358599461331281, + "flos": 515215157760.0, + "grad_norm": 0.0659132638478438, + "language_loss": 0.83565962, + "learning_rate": 0.00017207761074702115, + "loss": 0.84640133, + "num_input_tokens_seen": 317115968, + "router_z_loss_mlp": 0.25488281, + "step": 3825, + "time_per_iteration": 2.5829551219940186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107598, + "balance_loss_mlp": 1.05089879, + "epoch": 0.7360523278183917, + "flos": 443973934080.0, + "grad_norm": 0.05423674228427361, + "language_loss": 0.83801639, + "learning_rate": 0.0001718424924450514, + "loss": 0.84877622, + "num_input_tokens_seen": 317185680, + "router_z_loss_mlp": 0.25085449, + "step": 3826, + "time_per_iteration": 2.6215810775756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072026, + "balance_loss_mlp": 1.0464201, + "epoch": 0.7362447095036553, + "flos": 603423489024.0, + "grad_norm": 0.047662784770319516, + "language_loss": 0.86247635, + "learning_rate": 0.00017160750154512482, + "loss": 0.8731966, + "num_input_tokens_seen": 317258800, + "router_z_loss_mlp": 0.25610352, + "step": 3827, + "time_per_iteration": 2.7316274642944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072121, + "balance_loss_mlp": 1.04726601, + "epoch": 0.7364370911889189, + "flos": 553095184896.0, + "grad_norm": 0.05425230647323069, + "language_loss": 0.83439684, + "learning_rate": 0.0001713726381384731, + "loss": 0.84511811, + "num_input_tokens_seen": 317334608, + "router_z_loss_mlp": 0.24841309, + "step": 3828, + "time_per_iteration": 2.7767257690429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107353, + "balance_loss_mlp": 1.04826927, + "epoch": 0.7366294728741823, + "flos": 449061387264.0, + "grad_norm": 0.06782192310346803, + "language_loss": 0.81600618, + "learning_rate": 0.00017113790231627812, + "loss": 0.8267414, + "num_input_tokens_seen": 317397504, + "router_z_loss_mlp": 0.25280762, + "step": 3829, + "time_per_iteration": 2.4791929721832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100591, + "balance_loss_mlp": 0.99885303, + "epoch": 0.7368218545594459, + "flos": 1535502500352.0, + "grad_norm": 0.00950707875200575, + "language_loss": 0.79258227, + "learning_rate": 0.0001709032941696726, + "loss": 0.80264139, + "num_input_tokens_seen": 317611472, + "router_z_loss_mlp": 0.07080078, + "step": 3830, + "time_per_iteration": 6.233624696731567 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075447, + "balance_loss_mlp": 1.05000758, + "epoch": 0.7370142362447095, + "flos": 515425130496.0, + "grad_norm": 0.0605697653719091, + "language_loss": 0.82126367, + "learning_rate": 0.00017066881378973936, + "loss": 0.83201814, + "num_input_tokens_seen": 317681328, + "router_z_loss_mlp": 0.25463867, + "step": 3831, + "time_per_iteration": 2.6804988384246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107347, + "balance_loss_mlp": 1.0483644, + "epoch": 0.7372066179299731, + "flos": 500805172224.0, + "grad_norm": 0.051765900336182155, + "language_loss": 0.83060026, + "learning_rate": 0.00017043446126751189, + "loss": 0.84133494, + "num_input_tokens_seen": 317752336, + "router_z_loss_mlp": 0.2512207, + "step": 3832, + "time_per_iteration": 2.677116870880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078507, + "balance_loss_mlp": 1.05299592, + "epoch": 0.7373989996152366, + "flos": 558083893248.0, + "grad_norm": 0.06293756083772555, + "language_loss": 0.76538479, + "learning_rate": 0.00017020023669397376, + "loss": 0.7761699, + "num_input_tokens_seen": 317824112, + "router_z_loss_mlp": 0.25524902, + "step": 3833, + "time_per_iteration": 2.6688897609710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107645, + "balance_loss_mlp": 1.04953265, + "epoch": 0.7375913813005002, + "flos": 506777306112.0, + "grad_norm": 0.06089571273560201, + "language_loss": 0.81964701, + "learning_rate": 0.0001699661401600589, + "loss": 0.83041155, + "num_input_tokens_seen": 317889120, + "router_z_loss_mlp": 0.26953125, + "step": 3834, + "time_per_iteration": 2.6013684272766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072143, + "balance_loss_mlp": 1.04688239, + "epoch": 0.7377837629857638, + "flos": 486183015936.0, + "grad_norm": 0.05707021957695399, + "language_loss": 0.78780484, + "learning_rate": 0.00016973217175665205, + "loss": 0.79852629, + "num_input_tokens_seen": 317953792, + "router_z_loss_mlp": 0.25268555, + "step": 3835, + "time_per_iteration": 2.5545742511749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004747, + "balance_loss_mlp": 0.99759406, + "epoch": 0.7379761446710273, + "flos": 1414693942272.0, + "grad_norm": 0.011573205656029463, + "language_loss": 0.8116616, + "learning_rate": 0.00016949833157458755, + "loss": 0.82170916, + "num_input_tokens_seen": 318184848, + "router_z_loss_mlp": 0.07128906, + "step": 3836, + "time_per_iteration": 4.935137748718262 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073573, + "balance_loss_mlp": 1.04770422, + "epoch": 0.7381685263562909, + "flos": 629737721856.0, + "grad_norm": 0.05911051824592706, + "language_loss": 0.8443321, + "learning_rate": 0.00016926461970465047, + "loss": 0.85506785, + "num_input_tokens_seen": 318259296, + "router_z_loss_mlp": 0.25878906, + "step": 3837, + "time_per_iteration": 2.753530979156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070473, + "balance_loss_mlp": 1.04638028, + "epoch": 0.7383609080415544, + "flos": 739224589824.0, + "grad_norm": 0.055427222427827466, + "language_loss": 0.84596455, + "learning_rate": 0.00016903103623757516, + "loss": 0.85666919, + "num_input_tokens_seen": 318344704, + "router_z_loss_mlp": 0.2409668, + "step": 3838, + "time_per_iteration": 3.0433106422424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070361, + "balance_loss_mlp": 1.04518437, + "epoch": 0.738553289726818, + "flos": 550206950400.0, + "grad_norm": 0.06096616849216926, + "language_loss": 0.80038297, + "learning_rate": 0.00016879758126404738, + "loss": 0.81108665, + "num_input_tokens_seen": 318416128, + "router_z_loss_mlp": 0.25183105, + "step": 3839, + "time_per_iteration": 2.726783037185669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071404, + "balance_loss_mlp": 1.04598832, + "epoch": 0.7387456714120816, + "flos": 910294640640.0, + "grad_norm": 0.0748668456042948, + "language_loss": 0.80022889, + "learning_rate": 0.00016856425487470216, + "loss": 0.81094301, + "num_input_tokens_seen": 318498128, + "router_z_loss_mlp": 0.25439453, + "step": 3840, + "time_per_iteration": 3.0780324935913086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067415, + "balance_loss_mlp": 1.04228592, + "epoch": 0.7389380530973452, + "flos": 852684807168.0, + "grad_norm": 0.06187629511856373, + "language_loss": 0.79238671, + "learning_rate": 0.00016833105716012486, + "loss": 0.80306083, + "num_input_tokens_seen": 318578048, + "router_z_loss_mlp": 0.25146484, + "step": 3841, + "time_per_iteration": 3.1636850833892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070126, + "balance_loss_mlp": 1.04452026, + "epoch": 0.7391304347826086, + "flos": 817026020352.0, + "grad_norm": 0.05887802150454755, + "language_loss": 0.85242188, + "learning_rate": 0.00016809798821085088, + "loss": 0.86312318, + "num_input_tokens_seen": 318654784, + "router_z_loss_mlp": 0.25622559, + "step": 3842, + "time_per_iteration": 2.990478515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069406, + "balance_loss_mlp": 1.04390705, + "epoch": 0.7393228164678722, + "flos": 572819848704.0, + "grad_norm": 0.051928079352218694, + "language_loss": 0.8929773, + "learning_rate": 0.00016786504811736565, + "loss": 0.90367138, + "num_input_tokens_seen": 318727680, + "router_z_loss_mlp": 0.25524902, + "step": 3843, + "time_per_iteration": 2.6872341632843018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066886, + "balance_loss_mlp": 1.04195881, + "epoch": 0.7395151981531358, + "flos": 685237169664.0, + "grad_norm": 0.06625408386492132, + "language_loss": 0.82992953, + "learning_rate": 0.00016763223697010442, + "loss": 0.84059834, + "num_input_tokens_seen": 318807568, + "router_z_loss_mlp": 0.24938965, + "step": 3844, + "time_per_iteration": 2.9391865730285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068457, + "balance_loss_mlp": 1.04412675, + "epoch": 0.7397075798383994, + "flos": 556366662144.0, + "grad_norm": 0.05828893088019289, + "language_loss": 0.84686291, + "learning_rate": 0.00016739955485945256, + "loss": 0.85754752, + "num_input_tokens_seen": 318881792, + "router_z_loss_mlp": 0.24304199, + "step": 3845, + "time_per_iteration": 2.7142622470855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072077, + "balance_loss_mlp": 1.04656637, + "epoch": 0.739899961523663, + "flos": 546782400000.0, + "grad_norm": 0.07100000886785215, + "language_loss": 0.85870165, + "learning_rate": 0.00016716700187574513, + "loss": 0.86942244, + "num_input_tokens_seen": 318951552, + "router_z_loss_mlp": 0.25537109, + "step": 3846, + "time_per_iteration": 2.6977670192718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067392, + "balance_loss_mlp": 1.04300213, + "epoch": 0.7400923432089265, + "flos": 609190419456.0, + "grad_norm": 0.054057188356913304, + "language_loss": 0.84146428, + "learning_rate": 0.0001669345781092675, + "loss": 0.85213816, + "num_input_tokens_seen": 319022304, + "router_z_loss_mlp": 0.24377441, + "step": 3847, + "time_per_iteration": 2.7265117168426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074624, + "balance_loss_mlp": 1.05013824, + "epoch": 0.7402847248941901, + "flos": 591007518720.0, + "grad_norm": 0.06355688718688712, + "language_loss": 0.87326193, + "learning_rate": 0.0001667022836502546, + "loss": 0.88400817, + "num_input_tokens_seen": 319093200, + "router_z_loss_mlp": 0.24499512, + "step": 3848, + "time_per_iteration": 2.7551324367523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073869, + "balance_loss_mlp": 1.04852557, + "epoch": 0.7404771065794536, + "flos": 477369635328.0, + "grad_norm": 0.08017271540920272, + "language_loss": 0.828776, + "learning_rate": 0.00016647011858889077, + "loss": 0.83951473, + "num_input_tokens_seen": 319159712, + "router_z_loss_mlp": 0.25378418, + "step": 3849, + "time_per_iteration": 2.5299232006073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073654, + "balance_loss_mlp": 1.04783297, + "epoch": 0.7406694882647172, + "flos": 496446755328.0, + "grad_norm": 0.06268234066304752, + "language_loss": 0.85992008, + "learning_rate": 0.00016623808301531056, + "loss": 0.87065661, + "num_input_tokens_seen": 319230544, + "router_z_loss_mlp": 0.25842285, + "step": 3850, + "time_per_iteration": 2.6404004096984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077251, + "balance_loss_mlp": 1.05166864, + "epoch": 0.7408618699499807, + "flos": 562205173248.0, + "grad_norm": 0.07684631062218569, + "language_loss": 0.79265726, + "learning_rate": 0.00016600617701959842, + "loss": 0.80342978, + "num_input_tokens_seen": 319305440, + "router_z_loss_mlp": 0.25610352, + "step": 3851, + "time_per_iteration": 2.719182014465332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007219, + "balance_loss_mlp": 1.00025725, + "epoch": 0.7410542516352443, + "flos": 1388228834304.0, + "grad_norm": 0.009023170879128087, + "language_loss": 0.78843814, + "learning_rate": 0.00016577440069178811, + "loss": 0.79851031, + "num_input_tokens_seen": 319534384, + "router_z_loss_mlp": 0.06982422, + "step": 3852, + "time_per_iteration": 4.949675798416138 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073682, + "balance_loss_mlp": 1.04883838, + "epoch": 0.7412466333205079, + "flos": 669999776256.0, + "grad_norm": 0.05701919948873552, + "language_loss": 0.81264549, + "learning_rate": 0.00016554275412186315, + "loss": 0.82338226, + "num_input_tokens_seen": 319610960, + "router_z_loss_mlp": 0.24853516, + "step": 3853, + "time_per_iteration": 2.843740701675415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074744, + "balance_loss_mlp": 1.04884005, + "epoch": 0.7414390150057715, + "flos": 489293706240.0, + "grad_norm": 0.06536701062861092, + "language_loss": 0.80980605, + "learning_rate": 0.0001653112373997568, + "loss": 0.82055348, + "num_input_tokens_seen": 319683872, + "router_z_loss_mlp": 0.25927734, + "step": 3854, + "time_per_iteration": 2.65200138092041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073741, + "balance_loss_mlp": 1.04929078, + "epoch": 0.7416313966910351, + "flos": 599393613312.0, + "grad_norm": 0.06830067718858168, + "language_loss": 0.74823475, + "learning_rate": 0.0001650798506153517, + "loss": 0.75897211, + "num_input_tokens_seen": 319750032, + "router_z_loss_mlp": 0.24450684, + "step": 3855, + "time_per_iteration": 2.687006950378418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070882, + "balance_loss_mlp": 1.04607463, + "epoch": 0.7418237783762985, + "flos": 542539980288.0, + "grad_norm": 0.07905469083436836, + "language_loss": 0.84182036, + "learning_rate": 0.00016484859385848023, + "loss": 0.85252917, + "num_input_tokens_seen": 319818864, + "router_z_loss_mlp": 0.24816895, + "step": 3856, + "time_per_iteration": 2.6188693046569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072107, + "balance_loss_mlp": 1.0480032, + "epoch": 0.7420161600615621, + "flos": 544136071680.0, + "grad_norm": 0.061726371250172385, + "language_loss": 0.77338076, + "learning_rate": 0.0001646174672189243, + "loss": 0.7841019, + "num_input_tokens_seen": 319888816, + "router_z_loss_mlp": 0.24108887, + "step": 3857, + "time_per_iteration": 2.649557590484619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067204, + "balance_loss_mlp": 1.0426352, + "epoch": 0.7422085417468257, + "flos": 527178875904.0, + "grad_norm": 0.0578395137702567, + "language_loss": 0.80607724, + "learning_rate": 0.00016438647078641488, + "loss": 0.81674922, + "num_input_tokens_seen": 319956176, + "router_z_loss_mlp": 0.24572754, + "step": 3858, + "time_per_iteration": 2.619621515274048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072571, + "balance_loss_mlp": 1.04788327, + "epoch": 0.7424009234320893, + "flos": 508674774528.0, + "grad_norm": 0.06183948781118283, + "language_loss": 0.83172727, + "learning_rate": 0.00016415560465063344, + "loss": 0.842453, + "num_input_tokens_seen": 320028560, + "router_z_loss_mlp": 0.24694824, + "step": 3859, + "time_per_iteration": 2.7068328857421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067453, + "balance_loss_mlp": 1.04234803, + "epoch": 0.7425933051173528, + "flos": 512598564864.0, + "grad_norm": 0.07149126280637065, + "language_loss": 0.79273307, + "learning_rate": 0.0001639248689012095, + "loss": 0.80340761, + "num_input_tokens_seen": 320096112, + "router_z_loss_mlp": 0.2512207, + "step": 3860, + "time_per_iteration": 2.559715986251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069308, + "balance_loss_mlp": 1.04384458, + "epoch": 0.7427856868026164, + "flos": 458302053888.0, + "grad_norm": 0.06474025834236737, + "language_loss": 0.87225401, + "learning_rate": 0.00016369426362772271, + "loss": 0.88294709, + "num_input_tokens_seen": 320168992, + "router_z_loss_mlp": 0.25463867, + "step": 3861, + "time_per_iteration": 2.768488883972168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071358, + "balance_loss_mlp": 1.0464673, + "epoch": 0.74297806848788, + "flos": 605019580416.0, + "grad_norm": 0.05729012917412524, + "language_loss": 0.80612242, + "learning_rate": 0.00016346378891970233, + "loss": 0.816836, + "num_input_tokens_seen": 320247264, + "router_z_loss_mlp": 0.24890137, + "step": 3862, + "time_per_iteration": 2.805666923522949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107111, + "balance_loss_mlp": 1.04621959, + "epoch": 0.7431704501731435, + "flos": 891390044160.0, + "grad_norm": 0.054983080042834975, + "language_loss": 0.81883794, + "learning_rate": 0.00016323344486662633, + "loss": 0.82954907, + "num_input_tokens_seen": 320338992, + "router_z_loss_mlp": 0.24902344, + "step": 3863, + "time_per_iteration": 3.301302671432495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072159, + "balance_loss_mlp": 1.04673147, + "epoch": 0.7433628318584071, + "flos": 592163841024.0, + "grad_norm": 0.05456395021125743, + "language_loss": 0.78892124, + "learning_rate": 0.00016300323155792247, + "loss": 0.7996428, + "num_input_tokens_seen": 320422096, + "router_z_loss_mlp": 0.2545166, + "step": 3864, + "time_per_iteration": 2.8931703567504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066338, + "balance_loss_mlp": 1.0422101, + "epoch": 0.7435552135436706, + "flos": 477154520064.0, + "grad_norm": 0.05569760658066131, + "language_loss": 0.88605452, + "learning_rate": 0.00016277314908296687, + "loss": 0.89671785, + "num_input_tokens_seen": 320492640, + "router_z_loss_mlp": 0.24121094, + "step": 3865, + "time_per_iteration": 2.684453248977661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071833, + "balance_loss_mlp": 1.04628646, + "epoch": 0.7437475952289342, + "flos": 673184618496.0, + "grad_norm": 0.09698057624651829, + "language_loss": 0.75883031, + "learning_rate": 0.00016254319753108604, + "loss": 0.76954859, + "num_input_tokens_seen": 320565264, + "router_z_loss_mlp": 0.25561523, + "step": 3866, + "time_per_iteration": 2.8249847888946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072201, + "balance_loss_mlp": 1.04626155, + "epoch": 0.7439399769141978, + "flos": 770428786176.0, + "grad_norm": 0.06903603879321982, + "language_loss": 0.76659936, + "learning_rate": 0.00016231337699155492, + "loss": 0.7773214, + "num_input_tokens_seen": 320647584, + "router_z_loss_mlp": 0.25964355, + "step": 3867, + "time_per_iteration": 2.954054594039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074419, + "balance_loss_mlp": 1.04905081, + "epoch": 0.7441323585994614, + "flos": 647777088000.0, + "grad_norm": 0.052812057289516566, + "language_loss": 0.78941596, + "learning_rate": 0.0001620836875535977, + "loss": 0.80016011, + "num_input_tokens_seen": 320722752, + "router_z_loss_mlp": 0.25378418, + "step": 3868, + "time_per_iteration": 2.868677854537964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065856, + "balance_loss_mlp": 1.04120398, + "epoch": 0.7443247402847248, + "flos": 565372763136.0, + "grad_norm": 0.06361911402361287, + "language_loss": 0.806584, + "learning_rate": 0.00016185412930638766, + "loss": 0.81724262, + "num_input_tokens_seen": 320802496, + "router_z_loss_mlp": 0.24658203, + "step": 3869, + "time_per_iteration": 2.8323211669921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071749, + "balance_loss_mlp": 1.04708433, + "epoch": 0.7445171219699884, + "flos": 578529879552.0, + "grad_norm": 0.05653769935152868, + "language_loss": 0.82733011, + "learning_rate": 0.00016162470233904765, + "loss": 0.83804756, + "num_input_tokens_seen": 320872496, + "router_z_loss_mlp": 0.24658203, + "step": 3870, + "time_per_iteration": 2.7211382389068604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073935, + "balance_loss_mlp": 1.04862642, + "epoch": 0.744709503655252, + "flos": 618875997696.0, + "grad_norm": 0.06316774486708195, + "language_loss": 0.82555729, + "learning_rate": 0.00016139540674064856, + "loss": 0.83629668, + "num_input_tokens_seen": 320944992, + "router_z_loss_mlp": 0.25280762, + "step": 3871, + "time_per_iteration": 2.739121675491333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070986, + "balance_loss_mlp": 1.04655969, + "epoch": 0.7449018853405156, + "flos": 528619322880.0, + "grad_norm": 0.05640449284487911, + "language_loss": 0.78114176, + "learning_rate": 0.00016116624260021113, + "loss": 0.79185158, + "num_input_tokens_seen": 321020208, + "router_z_loss_mlp": 0.24414062, + "step": 3872, + "time_per_iteration": 2.7855870723724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071272, + "balance_loss_mlp": 1.04650021, + "epoch": 0.7450942670257792, + "flos": 433314842112.0, + "grad_norm": 0.05661952400288272, + "language_loss": 0.84321451, + "learning_rate": 0.0001609372100067046, + "loss": 0.85392725, + "num_input_tokens_seen": 321085984, + "router_z_loss_mlp": 0.24768066, + "step": 3873, + "time_per_iteration": 2.5051002502441406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076949, + "balance_loss_mlp": 1.05139041, + "epoch": 0.7452866487110427, + "flos": 696882258432.0, + "grad_norm": 0.07051271048779074, + "language_loss": 0.85103834, + "learning_rate": 0.0001607083090490475, + "loss": 0.86180782, + "num_input_tokens_seen": 321163200, + "router_z_loss_mlp": 0.25585938, + "step": 3874, + "time_per_iteration": 2.865432024002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073052, + "balance_loss_mlp": 1.04762459, + "epoch": 0.7454790303963063, + "flos": 512210552832.0, + "grad_norm": 0.0748811600341369, + "language_loss": 0.80497265, + "learning_rate": 0.00016047953981610714, + "loss": 0.81570315, + "num_input_tokens_seen": 321237328, + "router_z_loss_mlp": 0.25439453, + "step": 3875, + "time_per_iteration": 2.7216734886169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007311, + "balance_loss_mlp": 1.00044441, + "epoch": 0.7456714120815698, + "flos": 1325949668352.0, + "grad_norm": 0.007625795803468779, + "language_loss": 0.7972964, + "learning_rate": 0.00016025090239669916, + "loss": 0.80736953, + "num_input_tokens_seen": 321456192, + "router_z_loss_mlp": 0.06884766, + "step": 3876, + "time_per_iteration": 5.382456064224243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075501, + "balance_loss_mlp": 1.05013371, + "epoch": 0.7458637937668334, + "flos": 721711627776.0, + "grad_norm": 0.05488514319290027, + "language_loss": 0.81060588, + "learning_rate": 0.0001600223968795889, + "loss": 0.82136083, + "num_input_tokens_seen": 321530560, + "router_z_loss_mlp": 0.25378418, + "step": 3877, + "time_per_iteration": 2.9120445251464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100724, + "balance_loss_mlp": 1.0003736, + "epoch": 0.746056175452097, + "flos": 1501580395008.0, + "grad_norm": 0.007629360710496433, + "language_loss": 0.75696075, + "learning_rate": 0.00015979402335349004, + "loss": 0.7670331, + "num_input_tokens_seen": 321760928, + "router_z_loss_mlp": 0.06884766, + "step": 3878, + "time_per_iteration": 4.901887893676758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072632, + "balance_loss_mlp": 1.04855156, + "epoch": 0.7462485571373605, + "flos": 520245711360.0, + "grad_norm": 0.07646083771091663, + "language_loss": 0.82140052, + "learning_rate": 0.00015956578190706483, + "loss": 0.83212686, + "num_input_tokens_seen": 321833248, + "router_z_loss_mlp": 0.24072266, + "step": 3879, + "time_per_iteration": 2.665292978286743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106672, + "balance_loss_mlp": 1.04198372, + "epoch": 0.7464409388226241, + "flos": 481206790656.0, + "grad_norm": 0.05773895513703621, + "language_loss": 0.75869894, + "learning_rate": 0.00015933767262892468, + "loss": 0.76936615, + "num_input_tokens_seen": 321905904, + "router_z_loss_mlp": 0.24743652, + "step": 3880, + "time_per_iteration": 2.7083511352539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068645, + "balance_loss_mlp": 1.04439831, + "epoch": 0.7466333205078877, + "flos": 486761177088.0, + "grad_norm": 0.07814319262934219, + "language_loss": 0.82429087, + "learning_rate": 0.00015910969560762927, + "loss": 0.83497727, + "num_input_tokens_seen": 321971920, + "router_z_loss_mlp": 0.2421875, + "step": 3881, + "time_per_iteration": 2.556643009185791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072001, + "balance_loss_mlp": 1.04824293, + "epoch": 0.7468257021931513, + "flos": 611293091328.0, + "grad_norm": 0.05526796797761112, + "language_loss": 0.8303771, + "learning_rate": 0.00015888185093168727, + "loss": 0.84109712, + "num_input_tokens_seen": 322041904, + "router_z_loss_mlp": 0.2376709, + "step": 3882, + "time_per_iteration": 2.7359204292297363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074755, + "balance_loss_mlp": 1.0493511, + "epoch": 0.7470180838784147, + "flos": 533459727360.0, + "grad_norm": 0.05340233113956033, + "language_loss": 0.81238657, + "learning_rate": 0.00015865413868955581, + "loss": 0.82313412, + "num_input_tokens_seen": 322110816, + "router_z_loss_mlp": 0.25439453, + "step": 3883, + "time_per_iteration": 2.658531665802002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066435, + "balance_loss_mlp": 1.04343939, + "epoch": 0.7472104655636783, + "flos": 739338388992.0, + "grad_norm": 0.051714571371053245, + "language_loss": 0.82935232, + "learning_rate": 0.00015842655896964054, + "loss": 0.8400166, + "num_input_tokens_seen": 322192704, + "router_z_loss_mlp": 0.22973633, + "step": 3884, + "time_per_iteration": 3.018538475036621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077883, + "balance_loss_mlp": 1.05318248, + "epoch": 0.7474028472489419, + "flos": 640305409536.0, + "grad_norm": 0.06900594182420934, + "language_loss": 0.74108642, + "learning_rate": 0.00015819911186029567, + "loss": 0.75186527, + "num_input_tokens_seen": 322263888, + "router_z_loss_mlp": 0.24719238, + "step": 3885, + "time_per_iteration": 2.767460823059082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074459, + "balance_loss_mlp": 1.04935396, + "epoch": 0.7475952289342055, + "flos": 590249120256.0, + "grad_norm": 0.05191869003121536, + "language_loss": 0.8641215, + "learning_rate": 0.00015797179744982443, + "loss": 0.87486613, + "num_input_tokens_seen": 322331936, + "router_z_loss_mlp": 0.25073242, + "step": 3886, + "time_per_iteration": 2.722130060195923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074122, + "balance_loss_mlp": 1.04973185, + "epoch": 0.7477876106194691, + "flos": 488191712256.0, + "grad_norm": 0.0600854170312897, + "language_loss": 0.79131281, + "learning_rate": 0.00015774461582647765, + "loss": 0.80205405, + "num_input_tokens_seen": 322402032, + "router_z_loss_mlp": 0.24389648, + "step": 3887, + "time_per_iteration": 2.6940510272979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072368, + "balance_loss_mlp": 1.04781055, + "epoch": 0.7479799923047326, + "flos": 554733494784.0, + "grad_norm": 0.07732553341953252, + "language_loss": 0.8101362, + "learning_rate": 0.00015751756707845505, + "loss": 0.82085991, + "num_input_tokens_seen": 322472512, + "router_z_loss_mlp": 0.24560547, + "step": 3888, + "time_per_iteration": 2.6013286113739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072242, + "balance_loss_mlp": 1.04748178, + "epoch": 0.7481723739899961, + "flos": 767387105280.0, + "grad_norm": 0.05831839301711609, + "language_loss": 0.88772756, + "learning_rate": 0.00015729065129390502, + "loss": 0.89844996, + "num_input_tokens_seen": 322555104, + "router_z_loss_mlp": 0.24768066, + "step": 3889, + "time_per_iteration": 3.000511884689331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107686, + "balance_loss_mlp": 1.05071712, + "epoch": 0.7483647556752597, + "flos": 496172542464.0, + "grad_norm": 0.09513844178064898, + "language_loss": 0.82148743, + "learning_rate": 0.0001570638685609241, + "loss": 0.83225602, + "num_input_tokens_seen": 322621904, + "router_z_loss_mlp": 0.26159668, + "step": 3890, + "time_per_iteration": 2.5567352771759033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072051, + "balance_loss_mlp": 1.04621816, + "epoch": 0.7485571373605233, + "flos": 472850431488.0, + "grad_norm": 0.06496446825599186, + "language_loss": 0.80583847, + "learning_rate": 0.00015683721896755693, + "loss": 0.81655896, + "num_input_tokens_seen": 322688928, + "router_z_loss_mlp": 0.25866699, + "step": 3891, + "time_per_iteration": 2.5300092697143555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017758, + "balance_loss_mlp": 1.01103473, + "epoch": 0.7487495190457868, + "flos": 1554473161728.0, + "grad_norm": 0.007988812932881569, + "language_loss": 0.82210493, + "learning_rate": 0.00015661070260179682, + "loss": 0.83228242, + "num_input_tokens_seen": 322928464, + "router_z_loss_mlp": 0.06738281, + "step": 3892, + "time_per_iteration": 4.90599799156189 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072454, + "balance_loss_mlp": 1.04705048, + "epoch": 0.7489419007310504, + "flos": 581845773312.0, + "grad_norm": 0.06234068524332242, + "language_loss": 0.85285282, + "learning_rate": 0.00015638431955158528, + "loss": 0.86357737, + "num_input_tokens_seen": 323002672, + "router_z_loss_mlp": 0.25402832, + "step": 3893, + "time_per_iteration": 2.674448251724243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077206, + "balance_loss_mlp": 1.05186236, + "epoch": 0.749134282416314, + "flos": 567576751104.0, + "grad_norm": 0.051873425900431515, + "language_loss": 0.8129698, + "learning_rate": 0.00015615806990481186, + "loss": 0.82374185, + "num_input_tokens_seen": 323076480, + "router_z_loss_mlp": 0.25366211, + "step": 3894, + "time_per_iteration": 2.749011754989624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075566, + "balance_loss_mlp": 1.05075812, + "epoch": 0.7493266641015776, + "flos": 533061803520.0, + "grad_norm": 0.04941596722004592, + "language_loss": 0.84629339, + "learning_rate": 0.00015593195374931452, + "loss": 0.85704899, + "num_input_tokens_seen": 323151840, + "router_z_loss_mlp": 0.24804688, + "step": 3895, + "time_per_iteration": 2.7212753295898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077894, + "balance_loss_mlp": 1.05278873, + "epoch": 0.7495190457868411, + "flos": 523613362176.0, + "grad_norm": 0.06116342211722219, + "language_loss": 0.80278218, + "learning_rate": 0.00015570597117287922, + "loss": 0.8135612, + "num_input_tokens_seen": 323223376, + "router_z_loss_mlp": 0.25109863, + "step": 3896, + "time_per_iteration": 2.7101802825927734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070615, + "balance_loss_mlp": 1.04633236, + "epoch": 0.7497114274721046, + "flos": 514187315712.0, + "grad_norm": 0.069447374717696, + "language_loss": 0.77728438, + "learning_rate": 0.0001554801222632406, + "loss": 0.78799057, + "num_input_tokens_seen": 323290288, + "router_z_loss_mlp": 0.24291992, + "step": 3897, + "time_per_iteration": 2.6742734909057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107305, + "balance_loss_mlp": 1.04788542, + "epoch": 0.7499038091573682, + "flos": 495006308352.0, + "grad_norm": 0.06164453931329584, + "language_loss": 0.85245335, + "learning_rate": 0.00015525440710808052, + "loss": 0.86318392, + "num_input_tokens_seen": 323359568, + "router_z_loss_mlp": 0.25170898, + "step": 3898, + "time_per_iteration": 2.653172016143799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107387, + "balance_loss_mlp": 1.04789472, + "epoch": 0.7500961908426318, + "flos": 737658233856.0, + "grad_norm": 0.06163823743918883, + "language_loss": 0.77877641, + "learning_rate": 0.00015502882579502953, + "loss": 0.78951514, + "num_input_tokens_seen": 323436688, + "router_z_loss_mlp": 0.2598877, + "step": 3899, + "time_per_iteration": 2.949995517730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074982, + "balance_loss_mlp": 1.04994845, + "epoch": 0.7502885725278954, + "flos": 533400256512.0, + "grad_norm": 0.062464860099035104, + "language_loss": 0.85077929, + "learning_rate": 0.00015480337841166592, + "loss": 0.86152911, + "num_input_tokens_seen": 323510032, + "router_z_loss_mlp": 0.25012207, + "step": 3900, + "time_per_iteration": 2.7779133319854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078827, + "balance_loss_mlp": 1.05378067, + "epoch": 0.7504809542131589, + "flos": 589324792320.0, + "grad_norm": 0.06586633865886998, + "language_loss": 0.82996714, + "learning_rate": 0.00015457806504551647, + "loss": 0.8407554, + "num_input_tokens_seen": 323588896, + "router_z_loss_mlp": 0.25061035, + "step": 3901, + "time_per_iteration": 2.8566529750823975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074374, + "balance_loss_mlp": 1.04922056, + "epoch": 0.7506733358984224, + "flos": 511550899200.0, + "grad_norm": 0.053967524095388235, + "language_loss": 0.78072977, + "learning_rate": 0.0001543528857840554, + "loss": 0.79147345, + "num_input_tokens_seen": 323661280, + "router_z_loss_mlp": 0.25158691, + "step": 3902, + "time_per_iteration": 2.6760079860687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107594, + "balance_loss_mlp": 1.05152607, + "epoch": 0.750865717583686, + "flos": 539268503040.0, + "grad_norm": 0.0598852080475998, + "language_loss": 0.80620217, + "learning_rate": 0.000154127840714705, + "loss": 0.81696159, + "num_input_tokens_seen": 323739200, + "router_z_loss_mlp": 0.24401855, + "step": 3903, + "time_per_iteration": 2.788379430770874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072506, + "balance_loss_mlp": 1.04635119, + "epoch": 0.7510580992689496, + "flos": 476578930176.0, + "grad_norm": 0.0690597284577383, + "language_loss": 0.82208622, + "learning_rate": 0.00015390292992483557, + "loss": 0.8328113, + "num_input_tokens_seen": 323802816, + "router_z_loss_mlp": 0.26184082, + "step": 3904, + "time_per_iteration": 2.507995128631592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010781, + "balance_loss_mlp": 1.05372167, + "epoch": 0.7512504809542132, + "flos": 579043800576.0, + "grad_norm": 0.057063892472999186, + "language_loss": 0.8453331, + "learning_rate": 0.00015367815350176523, + "loss": 0.85611403, + "num_input_tokens_seen": 323879488, + "router_z_loss_mlp": 0.24389648, + "step": 3905, + "time_per_iteration": 2.733604907989502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077145, + "balance_loss_mlp": 1.05211139, + "epoch": 0.7514428626394767, + "flos": 418660379136.0, + "grad_norm": 0.056222194479754704, + "language_loss": 0.82852668, + "learning_rate": 0.00015345351153275987, + "loss": 0.83929813, + "num_input_tokens_seen": 323944512, + "router_z_loss_mlp": 0.25048828, + "step": 3906, + "time_per_iteration": 2.5045523643493652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068307, + "balance_loss_mlp": 1.04364252, + "epoch": 0.7516352443247403, + "flos": 641039215104.0, + "grad_norm": 0.0670025336867701, + "language_loss": 0.80755925, + "learning_rate": 0.00015322900410503332, + "loss": 0.81824237, + "num_input_tokens_seen": 324020688, + "router_z_loss_mlp": 0.24645996, + "step": 3907, + "time_per_iteration": 2.7994320392608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072066, + "balance_loss_mlp": 1.04688847, + "epoch": 0.7518276260100039, + "flos": 580998168576.0, + "grad_norm": 0.05833722566179846, + "language_loss": 0.77270997, + "learning_rate": 0.00015300463130574703, + "loss": 0.78343064, + "num_input_tokens_seen": 324098080, + "router_z_loss_mlp": 0.2520752, + "step": 3908, + "time_per_iteration": 2.8524723052978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074857, + "balance_loss_mlp": 1.05007386, + "epoch": 0.7520200076952674, + "flos": 687342412800.0, + "grad_norm": 0.06750112030828431, + "language_loss": 0.8202616, + "learning_rate": 0.00015278039322201033, + "loss": 0.83101016, + "num_input_tokens_seen": 324183968, + "router_z_loss_mlp": 0.24780273, + "step": 3909, + "time_per_iteration": 2.9736523628234863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107108, + "balance_loss_mlp": 1.04605806, + "epoch": 0.7522123893805309, + "flos": 486439976448.0, + "grad_norm": 0.06973049488885559, + "language_loss": 0.79777265, + "learning_rate": 0.00015255628994088004, + "loss": 0.80848348, + "num_input_tokens_seen": 324249568, + "router_z_loss_mlp": 0.25012207, + "step": 3910, + "time_per_iteration": 2.5302295684814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071428, + "balance_loss_mlp": 1.04685879, + "epoch": 0.7524047710657945, + "flos": 818982586368.0, + "grad_norm": 0.06491426565594356, + "language_loss": 0.75382125, + "learning_rate": 0.00015233232154936082, + "loss": 0.76453555, + "num_input_tokens_seen": 324345312, + "router_z_loss_mlp": 0.24572754, + "step": 3911, + "time_per_iteration": 3.251619815826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078269, + "balance_loss_mlp": 1.05206633, + "epoch": 0.7525971527510581, + "flos": 699508763136.0, + "grad_norm": 0.0623404961346465, + "language_loss": 0.76721239, + "learning_rate": 0.0001521084881344048, + "loss": 0.77799511, + "num_input_tokens_seen": 324419056, + "router_z_loss_mlp": 0.26220703, + "step": 3912, + "time_per_iteration": 2.850635051727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075285, + "balance_loss_mlp": 1.05071616, + "epoch": 0.7527895344363217, + "flos": 633787421184.0, + "grad_norm": 0.05187339069994817, + "language_loss": 0.86498892, + "learning_rate": 0.00015188478978291208, + "loss": 0.87574184, + "num_input_tokens_seen": 324490848, + "router_z_loss_mlp": 0.24572754, + "step": 3913, + "time_per_iteration": 2.765442371368408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072293, + "balance_loss_mlp": 1.04759288, + "epoch": 0.7529819161215853, + "flos": 562830322176.0, + "grad_norm": 0.06241775193338078, + "language_loss": 0.86580771, + "learning_rate": 0.00015166122658173014, + "loss": 0.87653065, + "num_input_tokens_seen": 324565648, + "router_z_loss_mlp": 0.24682617, + "step": 3914, + "time_per_iteration": 2.7562687397003174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076326, + "balance_loss_mlp": 1.05007637, + "epoch": 0.7531742978068487, + "flos": 690665647104.0, + "grad_norm": 0.05387803011271429, + "language_loss": 0.88860059, + "learning_rate": 0.00015143779861765332, + "loss": 0.89936382, + "num_input_tokens_seen": 324642832, + "router_z_loss_mlp": 0.26257324, + "step": 3915, + "time_per_iteration": 2.932776927947998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107374, + "balance_loss_mlp": 1.04894459, + "epoch": 0.7533666794921123, + "flos": 681101208576.0, + "grad_norm": 0.057566889010823, + "language_loss": 0.81424505, + "learning_rate": 0.00015121450597742458, + "loss": 0.82498246, + "num_input_tokens_seen": 324718336, + "router_z_loss_mlp": 0.2479248, + "step": 3916, + "time_per_iteration": 2.854919672012329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078465, + "balance_loss_mlp": 1.05316877, + "epoch": 0.7535590611773759, + "flos": 623669414400.0, + "grad_norm": 0.07096809285669192, + "language_loss": 0.7879523, + "learning_rate": 0.00015099134874773369, + "loss": 0.79873693, + "num_input_tokens_seen": 324787744, + "router_z_loss_mlp": 0.25317383, + "step": 3917, + "time_per_iteration": 2.717822313308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072702, + "balance_loss_mlp": 1.04738212, + "epoch": 0.7537514428626395, + "flos": 519427842048.0, + "grad_norm": 0.05614014037376785, + "language_loss": 0.80481035, + "learning_rate": 0.00015076832701521793, + "loss": 0.81553745, + "num_input_tokens_seen": 324863280, + "router_z_loss_mlp": 0.25341797, + "step": 3918, + "time_per_iteration": 2.7440896034240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077078, + "balance_loss_mlp": 1.05145979, + "epoch": 0.753943824547903, + "flos": 723653512704.0, + "grad_norm": 0.07007735924828153, + "language_loss": 0.81983852, + "learning_rate": 0.000150545440866462, + "loss": 0.83060932, + "num_input_tokens_seen": 324949600, + "router_z_loss_mlp": 0.25646973, + "step": 3919, + "time_per_iteration": 3.0307159423828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080089, + "balance_loss_mlp": 1.05534124, + "epoch": 0.7541362062331666, + "flos": 437547350016.0, + "grad_norm": 0.06360208867996311, + "language_loss": 0.78682411, + "learning_rate": 0.000150322690387998, + "loss": 0.79762495, + "num_input_tokens_seen": 325013808, + "router_z_loss_mlp": 0.24755859, + "step": 3920, + "time_per_iteration": 2.4933719635009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075825, + "balance_loss_mlp": 1.05169666, + "epoch": 0.7543285879184302, + "flos": 565274018304.0, + "grad_norm": 0.07326690987324283, + "language_loss": 0.75561839, + "learning_rate": 0.00015010007566630535, + "loss": 0.76637661, + "num_input_tokens_seen": 325084832, + "router_z_loss_mlp": 0.24121094, + "step": 3921, + "time_per_iteration": 2.7614030838012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107956, + "balance_loss_mlp": 1.05487168, + "epoch": 0.7545209696036937, + "flos": 521036416512.0, + "grad_norm": 0.09124669942400691, + "language_loss": 0.81765956, + "learning_rate": 0.00014987759678781077, + "loss": 0.82845515, + "num_input_tokens_seen": 325155120, + "router_z_loss_mlp": 0.24707031, + "step": 3922, + "time_per_iteration": 2.6194660663604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107743, + "balance_loss_mlp": 1.0523603, + "epoch": 0.7547133512889573, + "flos": 616066684416.0, + "grad_norm": 0.07360679346061566, + "language_loss": 0.82340884, + "learning_rate": 0.00014965525383888795, + "loss": 0.83418316, + "num_input_tokens_seen": 325235632, + "router_z_loss_mlp": 0.25085449, + "step": 3923, + "time_per_iteration": 2.8085968494415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073002, + "balance_loss_mlp": 1.04881442, + "epoch": 0.7549057329742208, + "flos": 750845085696.0, + "grad_norm": 0.05494147017339954, + "language_loss": 0.72481954, + "learning_rate": 0.00014943304690585851, + "loss": 0.73554957, + "num_input_tokens_seen": 325309696, + "router_z_loss_mlp": 0.24182129, + "step": 3924, + "time_per_iteration": 2.9154560565948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079076, + "balance_loss_mlp": 1.0540781, + "epoch": 0.7550981146594844, + "flos": 514444276224.0, + "grad_norm": 0.07583618548945481, + "language_loss": 0.79405016, + "learning_rate": 0.0001492109760749908, + "loss": 0.80484092, + "num_input_tokens_seen": 325375744, + "router_z_loss_mlp": 0.25012207, + "step": 3925, + "time_per_iteration": 2.5836076736450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076807, + "balance_loss_mlp": 1.0515945, + "epoch": 0.755290496344748, + "flos": 522009930240.0, + "grad_norm": 0.05355436965428176, + "language_loss": 0.80110025, + "learning_rate": 0.00014898904143250002, + "loss": 0.81186831, + "num_input_tokens_seen": 325448384, + "router_z_loss_mlp": 0.25231934, + "step": 3926, + "time_per_iteration": 2.6505353450775146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024147, + "balance_loss_mlp": 1.01732779, + "epoch": 0.7554828780300116, + "flos": 1414615021056.0, + "grad_norm": 0.0157174231445921, + "language_loss": 0.75755203, + "learning_rate": 0.00014876724306454886, + "loss": 0.76779342, + "num_input_tokens_seen": 325678672, + "router_z_loss_mlp": 0.06835938, + "step": 3927, + "time_per_iteration": 4.953717470169067 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077692, + "balance_loss_mlp": 1.05381441, + "epoch": 0.7556752597152752, + "flos": 556937482752.0, + "grad_norm": 0.05489454471207153, + "language_loss": 0.80429578, + "learning_rate": 0.0001485455810572474, + "loss": 0.81507266, + "num_input_tokens_seen": 325746656, + "router_z_loss_mlp": 0.23864746, + "step": 3928, + "time_per_iteration": 2.637946844100952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077738, + "balance_loss_mlp": 1.05282295, + "epoch": 0.7558676414005386, + "flos": 563638279680.0, + "grad_norm": 0.058181996359435495, + "language_loss": 0.84069693, + "learning_rate": 0.00014832405549665236, + "loss": 0.85147429, + "num_input_tokens_seen": 325820304, + "router_z_loss_mlp": 0.24902344, + "step": 3929, + "time_per_iteration": 2.6932008266448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074105, + "balance_loss_mlp": 1.05033493, + "epoch": 0.7560600230858022, + "flos": 561377392128.0, + "grad_norm": 0.06320192227376603, + "language_loss": 0.78577268, + "learning_rate": 0.00014810266646876746, + "loss": 0.79651374, + "num_input_tokens_seen": 325895584, + "router_z_loss_mlp": 0.2376709, + "step": 3930, + "time_per_iteration": 2.7697536945343018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071439, + "balance_loss_mlp": 1.04683375, + "epoch": 0.7562524047710658, + "flos": 719576649216.0, + "grad_norm": 0.06814480820805115, + "language_loss": 0.77612817, + "learning_rate": 0.00014788141405954364, + "loss": 0.78684253, + "num_input_tokens_seen": 325976752, + "router_z_loss_mlp": 0.24633789, + "step": 3931, + "time_per_iteration": 2.979769468307495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073336, + "balance_loss_mlp": 1.04886281, + "epoch": 0.7564447864563294, + "flos": 543347937792.0, + "grad_norm": 0.059820147392813335, + "language_loss": 0.84867471, + "learning_rate": 0.00014766029835487865, + "loss": 0.85940808, + "num_input_tokens_seen": 326047152, + "router_z_loss_mlp": 0.24475098, + "step": 3932, + "time_per_iteration": 2.7333834171295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076892, + "balance_loss_mlp": 1.05246568, + "epoch": 0.7566371681415929, + "flos": 725805743616.0, + "grad_norm": 0.06432503649028948, + "language_loss": 0.79687858, + "learning_rate": 0.0001474393194406173, + "loss": 0.80764747, + "num_input_tokens_seen": 326119056, + "router_z_loss_mlp": 0.24438477, + "step": 3933, + "time_per_iteration": 2.896916627883911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072761, + "balance_loss_mlp": 1.04866862, + "epoch": 0.7568295498268565, + "flos": 576580280832.0, + "grad_norm": 0.05519381243728572, + "language_loss": 0.79627228, + "learning_rate": 0.00014721847740255112, + "loss": 0.80699992, + "num_input_tokens_seen": 326196736, + "router_z_loss_mlp": 0.24084473, + "step": 3934, + "time_per_iteration": 2.8888845443725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011104, + "balance_loss_mlp": 1.00419021, + "epoch": 0.75702193151212, + "flos": 1520059903488.0, + "grad_norm": 0.009067101269619127, + "language_loss": 0.73911923, + "learning_rate": 0.00014699777232641853, + "loss": 0.74923027, + "num_input_tokens_seen": 326404752, + "router_z_loss_mlp": 0.06933594, + "step": 3935, + "time_per_iteration": 4.645391941070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106868, + "balance_loss_mlp": 1.0436697, + "epoch": 0.7572143131973836, + "flos": 525471556608.0, + "grad_norm": 0.07237572770548766, + "language_loss": 0.78754729, + "learning_rate": 0.00014677720429790526, + "loss": 0.79823411, + "num_input_tokens_seen": 326472832, + "router_z_loss_mlp": 0.25, + "step": 3936, + "time_per_iteration": 2.588223457336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063945, + "balance_loss_mlp": 1.03916097, + "epoch": 0.7574066948826472, + "flos": 550738123776.0, + "grad_norm": 0.047485127857512396, + "language_loss": 0.84842449, + "learning_rate": 0.0001465567734026429, + "loss": 0.85906392, + "num_input_tokens_seen": 326546976, + "router_z_loss_mlp": 0.24804688, + "step": 3937, + "time_per_iteration": 2.733915090560913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066983, + "balance_loss_mlp": 1.04105449, + "epoch": 0.7575990765679107, + "flos": 395899176960.0, + "grad_norm": 0.08009981712231565, + "language_loss": 0.82548285, + "learning_rate": 0.00014633647972621034, + "loss": 0.83615267, + "num_input_tokens_seen": 326609296, + "router_z_loss_mlp": 0.25964355, + "step": 3938, + "time_per_iteration": 2.4831509590148926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066047, + "balance_loss_mlp": 1.04131114, + "epoch": 0.7577914582531743, + "flos": 585030615552.0, + "grad_norm": 0.049323859420558516, + "language_loss": 0.8679713, + "learning_rate": 0.00014611632335413354, + "loss": 0.87863177, + "num_input_tokens_seen": 326687168, + "router_z_loss_mlp": 0.24743652, + "step": 3939, + "time_per_iteration": 2.817972421646118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066166, + "balance_loss_mlp": 1.04209805, + "epoch": 0.7579838399384379, + "flos": 820979172864.0, + "grad_norm": 0.05378533672074644, + "language_loss": 0.82628143, + "learning_rate": 0.00014589630437188456, + "loss": 0.83694315, + "num_input_tokens_seen": 326777760, + "router_z_loss_mlp": 0.24047852, + "step": 3940, + "time_per_iteration": 3.1869349479675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069178, + "balance_loss_mlp": 1.04451323, + "epoch": 0.7581762216237015, + "flos": 443892441600.0, + "grad_norm": 0.0625352255464929, + "language_loss": 0.78564709, + "learning_rate": 0.00014567642286488253, + "loss": 0.79633886, + "num_input_tokens_seen": 326843952, + "router_z_loss_mlp": 0.2467041, + "step": 3941, + "time_per_iteration": 2.5982542037963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067881, + "balance_loss_mlp": 1.04189372, + "epoch": 0.7583686033089649, + "flos": 540886989312.0, + "grad_norm": 0.07448102100024, + "language_loss": 0.79396963, + "learning_rate": 0.00014545667891849258, + "loss": 0.8046484, + "num_input_tokens_seen": 326911296, + "router_z_loss_mlp": 0.26013184, + "step": 3942, + "time_per_iteration": 2.6813278198242188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066053, + "balance_loss_mlp": 1.04107857, + "epoch": 0.7585609849942285, + "flos": 522588091392.0, + "grad_norm": 0.05620268870521042, + "language_loss": 0.82649952, + "learning_rate": 0.00014523707261802733, + "loss": 0.83716011, + "num_input_tokens_seen": 326977776, + "router_z_loss_mlp": 0.24987793, + "step": 3943, + "time_per_iteration": 2.6405162811279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068268, + "balance_loss_mlp": 1.04263783, + "epoch": 0.7587533666794921, + "flos": 541860503040.0, + "grad_norm": 0.05791403818328359, + "language_loss": 0.8163532, + "learning_rate": 0.00014501760404874527, + "loss": 0.8270359, + "num_input_tokens_seen": 327050240, + "router_z_loss_mlp": 0.25634766, + "step": 3944, + "time_per_iteration": 2.722963809967041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106607, + "balance_loss_mlp": 1.04108405, + "epoch": 0.7589457483647557, + "flos": 606408270336.0, + "grad_norm": 0.06238439989518053, + "language_loss": 0.86068374, + "learning_rate": 0.00014479827329585176, + "loss": 0.87134445, + "num_input_tokens_seen": 327119952, + "router_z_loss_mlp": 0.24963379, + "step": 3945, + "time_per_iteration": 2.7014224529266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068448, + "balance_loss_mlp": 1.04362893, + "epoch": 0.7591381300500193, + "flos": 555106452480.0, + "grad_norm": 0.04867252918796388, + "language_loss": 0.8493138, + "learning_rate": 0.00014457908044449846, + "loss": 0.85999829, + "num_input_tokens_seen": 327192640, + "router_z_loss_mlp": 0.24816895, + "step": 3946, + "time_per_iteration": 2.7054529190063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106665, + "balance_loss_mlp": 1.04214025, + "epoch": 0.7593305117352828, + "flos": 529681669632.0, + "grad_norm": 0.06710425705469547, + "language_loss": 0.83130479, + "learning_rate": 0.00014436002557978371, + "loss": 0.84197128, + "num_input_tokens_seen": 327271008, + "router_z_loss_mlp": 0.24511719, + "step": 3947, + "time_per_iteration": 2.788025379180908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004209, + "balance_loss_mlp": 0.99724722, + "epoch": 0.7595228934205464, + "flos": 1502798759424.0, + "grad_norm": 0.014479305235322698, + "language_loss": 0.76643145, + "learning_rate": 0.00014414110878675201, + "loss": 0.77647352, + "num_input_tokens_seen": 327505392, + "router_z_loss_mlp": 0.06982422, + "step": 3948, + "time_per_iteration": 4.901083946228027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067183, + "balance_loss_mlp": 1.0420419, + "epoch": 0.7597152751058099, + "flos": 455525047296.0, + "grad_norm": 0.05243549460506514, + "language_loss": 0.79874659, + "learning_rate": 0.0001439223301503945, + "loss": 0.80941838, + "num_input_tokens_seen": 327569392, + "router_z_loss_mlp": 0.25146484, + "step": 3949, + "time_per_iteration": 2.538907527923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072288, + "balance_loss_mlp": 1.04793382, + "epoch": 0.7599076567910735, + "flos": 685466966016.0, + "grad_norm": 0.10938231230046584, + "language_loss": 0.76564628, + "learning_rate": 0.00014370368975564834, + "loss": 0.77636915, + "num_input_tokens_seen": 327648304, + "router_z_loss_mlp": 0.24353027, + "step": 3950, + "time_per_iteration": 2.9170916080474854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072617, + "balance_loss_mlp": 1.04752314, + "epoch": 0.760100038476337, + "flos": 532372414464.0, + "grad_norm": 0.06203753703543282, + "language_loss": 0.83670735, + "learning_rate": 0.00014348518768739766, + "loss": 0.84743357, + "num_input_tokens_seen": 327725600, + "router_z_loss_mlp": 0.25109863, + "step": 3951, + "time_per_iteration": 2.730717897415161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01002273, + "balance_loss_mlp": 0.99526346, + "epoch": 0.7602924201616006, + "flos": 1471742866944.0, + "grad_norm": 0.013476999765998546, + "language_loss": 0.7672804, + "learning_rate": 0.00014326682403047243, + "loss": 0.7773031, + "num_input_tokens_seen": 327954048, + "router_z_loss_mlp": 0.0703125, + "step": 3952, + "time_per_iteration": 4.813526391983032 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067232, + "balance_loss_mlp": 1.04273486, + "epoch": 0.7604848018468642, + "flos": 774631558656.0, + "grad_norm": 0.09216142296418942, + "language_loss": 0.86414772, + "learning_rate": 0.00014304859886964867, + "loss": 0.87482005, + "num_input_tokens_seen": 328034656, + "router_z_loss_mlp": 0.24487305, + "step": 3953, + "time_per_iteration": 2.9926557540893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068844, + "balance_loss_mlp": 1.0442636, + "epoch": 0.7606771835321278, + "flos": 558185209344.0, + "grad_norm": 0.05133749222292773, + "language_loss": 0.83801866, + "learning_rate": 0.00014283051228964878, + "loss": 0.84870708, + "num_input_tokens_seen": 328107264, + "router_z_loss_mlp": 0.24572754, + "step": 3954, + "time_per_iteration": 2.68623423576355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068293, + "balance_loss_mlp": 1.0438199, + "epoch": 0.7608695652173914, + "flos": 525397404672.0, + "grad_norm": 0.07227710596047977, + "language_loss": 0.82760596, + "learning_rate": 0.00014261256437514197, + "loss": 0.8382889, + "num_input_tokens_seen": 328177168, + "router_z_loss_mlp": 0.24462891, + "step": 3955, + "time_per_iteration": 2.664646625518799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067041, + "balance_loss_mlp": 1.04321122, + "epoch": 0.7610619469026548, + "flos": 615038842368.0, + "grad_norm": 0.06297577079801352, + "language_loss": 0.82699019, + "learning_rate": 0.0001423947552107428, + "loss": 0.83766061, + "num_input_tokens_seen": 328245360, + "router_z_loss_mlp": 0.23815918, + "step": 3956, + "time_per_iteration": 2.705461263656616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106862, + "balance_loss_mlp": 1.04420578, + "epoch": 0.7612543285879184, + "flos": 863356382208.0, + "grad_norm": 0.06155196103872169, + "language_loss": 0.77009457, + "learning_rate": 0.00014217708488101243, + "loss": 0.78078079, + "num_input_tokens_seen": 328326560, + "router_z_loss_mlp": 0.24389648, + "step": 3957, + "time_per_iteration": 3.0419583320617676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069471, + "balance_loss_mlp": 1.04394794, + "epoch": 0.761446710273182, + "flos": 553658664960.0, + "grad_norm": 0.08526954862375616, + "language_loss": 0.77756625, + "learning_rate": 0.0001419595534704579, + "loss": 0.78826094, + "num_input_tokens_seen": 328395760, + "router_z_loss_mlp": 0.25537109, + "step": 3958, + "time_per_iteration": 2.6491963863372803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068304, + "balance_loss_mlp": 1.04386628, + "epoch": 0.7616390919584456, + "flos": 467350373376.0, + "grad_norm": 0.05412065824000071, + "language_loss": 0.81526375, + "learning_rate": 0.00014174216106353237, + "loss": 0.82594681, + "num_input_tokens_seen": 328464560, + "router_z_loss_mlp": 0.24438477, + "step": 3959, + "time_per_iteration": 2.6313867568969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067389, + "balance_loss_mlp": 1.04217625, + "epoch": 0.7618314736437091, + "flos": 498430858752.0, + "grad_norm": 0.06465573226743382, + "language_loss": 0.76379991, + "learning_rate": 0.00014152490774463512, + "loss": 0.77447385, + "num_input_tokens_seen": 328532640, + "router_z_loss_mlp": 0.25231934, + "step": 3960, + "time_per_iteration": 2.5999507904052734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067184, + "balance_loss_mlp": 1.04220998, + "epoch": 0.7620238553289727, + "flos": 434545316352.0, + "grad_norm": 0.07999326850245969, + "language_loss": 0.87070781, + "learning_rate": 0.00014130779359811135, + "loss": 0.8813796, + "num_input_tokens_seen": 328595392, + "router_z_loss_mlp": 0.24963379, + "step": 3961, + "time_per_iteration": 2.470813512802124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067267, + "balance_loss_mlp": 1.04288888, + "epoch": 0.7622162370142362, + "flos": 664277262336.0, + "grad_norm": 0.05710380569291167, + "language_loss": 0.8618769, + "learning_rate": 0.0001410908187082521, + "loss": 0.87254959, + "num_input_tokens_seen": 328676368, + "router_z_loss_mlp": 0.24365234, + "step": 3962, + "time_per_iteration": 2.8664379119873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068922, + "balance_loss_mlp": 1.04251671, + "epoch": 0.7624086186994998, + "flos": 557965324800.0, + "grad_norm": 0.06132823926479849, + "language_loss": 0.83309317, + "learning_rate": 0.0001408739831592949, + "loss": 0.84378237, + "num_input_tokens_seen": 328745136, + "router_z_loss_mlp": 0.26416016, + "step": 3963, + "time_per_iteration": 2.6825127601623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066723, + "balance_loss_mlp": 1.04126, + "epoch": 0.7626010003847634, + "flos": 629132396544.0, + "grad_norm": 0.061542532553496905, + "language_loss": 0.7740978, + "learning_rate": 0.0001406572870354224, + "loss": 0.78476501, + "num_input_tokens_seen": 328820384, + "router_z_loss_mlp": 0.25488281, + "step": 3964, + "time_per_iteration": 2.8066251277923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068698, + "balance_loss_mlp": 1.0439024, + "epoch": 0.7627933820700269, + "flos": 437942702592.0, + "grad_norm": 0.0743228593837952, + "language_loss": 0.8726244, + "learning_rate": 0.00014044073042076337, + "loss": 0.88331133, + "num_input_tokens_seen": 328884976, + "router_z_loss_mlp": 0.24804688, + "step": 3965, + "time_per_iteration": 2.56150484085083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063933, + "balance_loss_mlp": 1.04018617, + "epoch": 0.7629857637552905, + "flos": 532723350528.0, + "grad_norm": 0.05421398369924913, + "language_loss": 0.89094937, + "learning_rate": 0.00014022431339939302, + "loss": 0.90158874, + "num_input_tokens_seen": 328957792, + "router_z_loss_mlp": 0.23730469, + "step": 3966, + "time_per_iteration": 2.655383586883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062991, + "balance_loss_mlp": 1.03745639, + "epoch": 0.7631781454405541, + "flos": 680036290560.0, + "grad_norm": 0.06770239365131947, + "language_loss": 0.78292239, + "learning_rate": 0.00014000803605533163, + "loss": 0.79355228, + "num_input_tokens_seen": 329034960, + "router_z_loss_mlp": 0.25537109, + "step": 3967, + "time_per_iteration": 2.7987117767333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064596, + "balance_loss_mlp": 1.04112363, + "epoch": 0.7633705271258177, + "flos": 507493859328.0, + "grad_norm": 0.07669794307495341, + "language_loss": 0.83868659, + "learning_rate": 0.00013979189847254553, + "loss": 0.84933251, + "num_input_tokens_seen": 329100848, + "router_z_loss_mlp": 0.23474121, + "step": 3968, + "time_per_iteration": 2.5726282596588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068744, + "balance_loss_mlp": 1.04398394, + "epoch": 0.7635629088110811, + "flos": 618866085888.0, + "grad_norm": 0.06422002731628916, + "language_loss": 0.80682731, + "learning_rate": 0.00013957590073494674, + "loss": 0.81751466, + "num_input_tokens_seen": 329181120, + "router_z_loss_mlp": 0.24780273, + "step": 3969, + "time_per_iteration": 2.7832956314086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063668, + "balance_loss_mlp": 1.03938496, + "epoch": 0.7637552904963447, + "flos": 638425193472.0, + "grad_norm": 0.0668838144354532, + "language_loss": 0.79043007, + "learning_rate": 0.0001393600429263931, + "loss": 0.80106676, + "num_input_tokens_seen": 329249888, + "router_z_loss_mlp": 0.24267578, + "step": 3970, + "time_per_iteration": 2.7605960369110107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01025346, + "balance_loss_mlp": 1.01881361, + "epoch": 0.7639476721816083, + "flos": 1563222302208.0, + "grad_norm": 0.01539224673333176, + "language_loss": 0.74744886, + "learning_rate": 0.00013914432513068792, + "loss": 0.75770235, + "num_input_tokens_seen": 329483824, + "router_z_loss_mlp": 0.06542969, + "step": 3971, + "time_per_iteration": 4.931908369064331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062427, + "balance_loss_mlp": 1.03795385, + "epoch": 0.7641400538668719, + "flos": 495987162624.0, + "grad_norm": 0.05820803040195091, + "language_loss": 0.81583369, + "learning_rate": 0.0001389287474315804, + "loss": 0.82645798, + "num_input_tokens_seen": 329553536, + "router_z_loss_mlp": 0.24450684, + "step": 3972, + "time_per_iteration": 2.6463029384613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106303, + "balance_loss_mlp": 1.03905725, + "epoch": 0.7643324355521355, + "flos": 578441046528.0, + "grad_norm": 0.06630733211536807, + "language_loss": 0.8009305, + "learning_rate": 0.00013871330991276505, + "loss": 0.81156087, + "num_input_tokens_seen": 329621856, + "router_z_loss_mlp": 0.23950195, + "step": 3973, + "time_per_iteration": 2.706376791000366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066044, + "balance_loss_mlp": 1.04109335, + "epoch": 0.764524817237399, + "flos": 784823717376.0, + "grad_norm": 0.09714932843218141, + "language_loss": 0.80939794, + "learning_rate": 0.00013849801265788247, + "loss": 0.82005835, + "num_input_tokens_seen": 329708192, + "router_z_loss_mlp": 0.24938965, + "step": 3974, + "time_per_iteration": 2.9903647899627686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068526, + "balance_loss_mlp": 1.04320633, + "epoch": 0.7647171989226625, + "flos": 526279514112.0, + "grad_norm": 0.0619121183931823, + "language_loss": 0.83072186, + "learning_rate": 0.00013828285575051818, + "loss": 0.84140712, + "num_input_tokens_seen": 329774704, + "router_z_loss_mlp": 0.25354004, + "step": 3975, + "time_per_iteration": 2.6031386852264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067956, + "balance_loss_mlp": 1.04345858, + "epoch": 0.7649095806079261, + "flos": 554876656128.0, + "grad_norm": 0.057910960280581285, + "language_loss": 0.84483737, + "learning_rate": 0.0001380678392742035, + "loss": 0.85551691, + "num_input_tokens_seen": 329846432, + "router_z_loss_mlp": 0.24499512, + "step": 3976, + "time_per_iteration": 2.7115118503570557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064756, + "balance_loss_mlp": 1.03948343, + "epoch": 0.7651019622931897, + "flos": 649145954304.0, + "grad_norm": 0.05528375504555246, + "language_loss": 0.8460198, + "learning_rate": 0.00013785296331241526, + "loss": 0.85666734, + "num_input_tokens_seen": 329926336, + "router_z_loss_mlp": 0.25292969, + "step": 3977, + "time_per_iteration": 2.907803535461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065245, + "balance_loss_mlp": 1.04070044, + "epoch": 0.7652943439784533, + "flos": 1046449248768.0, + "grad_norm": 0.06111847190326819, + "language_loss": 0.87681317, + "learning_rate": 0.00013763822794857583, + "loss": 0.8874656, + "num_input_tokens_seen": 330009536, + "router_z_loss_mlp": 0.24560547, + "step": 3978, + "time_per_iteration": 3.321633815765381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068675, + "balance_loss_mlp": 1.04451132, + "epoch": 0.7654867256637168, + "flos": 504350862336.0, + "grad_norm": 0.06052146133272544, + "language_loss": 0.89868426, + "learning_rate": 0.00013742363326605278, + "loss": 0.90937102, + "num_input_tokens_seen": 330083264, + "router_z_loss_mlp": 0.24145508, + "step": 3979, + "time_per_iteration": 2.6986289024353027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069543, + "balance_loss_mlp": 1.04461646, + "epoch": 0.7656791073489804, + "flos": 574709976576.0, + "grad_norm": 0.05317029267567304, + "language_loss": 0.78481579, + "learning_rate": 0.00013720917934815935, + "loss": 0.79551125, + "num_input_tokens_seen": 330157120, + "router_z_loss_mlp": 0.24938965, + "step": 3980, + "time_per_iteration": 2.7501296997070312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067407, + "balance_loss_mlp": 1.0423131, + "epoch": 0.765871489034244, + "flos": 492812232192.0, + "grad_norm": 0.07789477911070539, + "language_loss": 0.83389938, + "learning_rate": 0.00013699486627815344, + "loss": 0.84457338, + "num_input_tokens_seen": 330224560, + "router_z_loss_mlp": 0.25109863, + "step": 3981, + "time_per_iteration": 2.6420035362243652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071024, + "balance_loss_mlp": 1.04684854, + "epoch": 0.7660638707195075, + "flos": 486024800256.0, + "grad_norm": 0.05908161503025523, + "language_loss": 0.82459986, + "learning_rate": 0.00013678069413923928, + "loss": 0.8353101, + "num_input_tokens_seen": 330292000, + "router_z_loss_mlp": 0.24169922, + "step": 3982, + "time_per_iteration": 2.6923530101776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069033, + "balance_loss_mlp": 1.04438031, + "epoch": 0.766256252404771, + "flos": 444295134720.0, + "grad_norm": 0.05955764603352247, + "language_loss": 0.82175618, + "learning_rate": 0.00013656666301456555, + "loss": 0.83244646, + "num_input_tokens_seen": 330357472, + "router_z_loss_mlp": 0.24658203, + "step": 3983, + "time_per_iteration": 2.507713794708252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070412, + "balance_loss_mlp": 1.04577184, + "epoch": 0.7664486340900346, + "flos": 485179766784.0, + "grad_norm": 0.05383529418711491, + "language_loss": 0.84338224, + "learning_rate": 0.0001363527729872267, + "loss": 0.8540864, + "num_input_tokens_seen": 330427792, + "router_z_loss_mlp": 0.24633789, + "step": 3984, + "time_per_iteration": 2.6385269165039062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074302, + "balance_loss_mlp": 1.04956603, + "epoch": 0.7666410157752982, + "flos": 646200820224.0, + "grad_norm": 0.06494738707995135, + "language_loss": 0.76830447, + "learning_rate": 0.00013613902414026207, + "loss": 0.77904749, + "num_input_tokens_seen": 330500320, + "router_z_loss_mlp": 0.24755859, + "step": 3985, + "time_per_iteration": 2.782332420349121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071114, + "balance_loss_mlp": 1.04673553, + "epoch": 0.7668333974605618, + "flos": 774303017472.0, + "grad_norm": 0.062182975481338824, + "language_loss": 0.82354724, + "learning_rate": 0.00013592541655665642, + "loss": 0.83425832, + "num_input_tokens_seen": 330581696, + "router_z_loss_mlp": 0.24389648, + "step": 3986, + "time_per_iteration": 2.9702792167663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072457, + "balance_loss_mlp": 1.04731619, + "epoch": 0.7670257791458254, + "flos": 613462574592.0, + "grad_norm": 0.06854938145673521, + "language_loss": 0.85176432, + "learning_rate": 0.00013571195031933947, + "loss": 0.86248893, + "num_input_tokens_seen": 330648000, + "router_z_loss_mlp": 0.25134277, + "step": 3987, + "time_per_iteration": 2.7414095401763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017121, + "balance_loss_mlp": 1.01035035, + "epoch": 0.7672181608310888, + "flos": 1485357378048.0, + "grad_norm": 0.007346318890350203, + "language_loss": 0.80481339, + "learning_rate": 0.00013549862551118626, + "loss": 0.81498468, + "num_input_tokens_seen": 330873872, + "router_z_loss_mlp": 0.06787109, + "step": 3988, + "time_per_iteration": 4.668884515762329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070415, + "balance_loss_mlp": 1.04586959, + "epoch": 0.7674105425163524, + "flos": 610732182528.0, + "grad_norm": 0.05768495800947346, + "language_loss": 0.85781401, + "learning_rate": 0.00013528544221501655, + "loss": 0.86851817, + "num_input_tokens_seen": 330945760, + "router_z_loss_mlp": 0.2454834, + "step": 3989, + "time_per_iteration": 2.7101733684539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082587, + "balance_loss_mlp": 1.05741, + "epoch": 0.767602924201616, + "flos": 845205788160.0, + "grad_norm": 0.055358014362887016, + "language_loss": 0.81702012, + "learning_rate": 0.00013507240051359586, + "loss": 0.82784599, + "num_input_tokens_seen": 331025584, + "router_z_loss_mlp": 0.25195312, + "step": 3990, + "time_per_iteration": 3.045398235321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076149, + "balance_loss_mlp": 1.05154467, + "epoch": 0.7677953058868796, + "flos": 527114635776.0, + "grad_norm": 0.06163508204720818, + "language_loss": 0.86476028, + "learning_rate": 0.00013485950048963425, + "loss": 0.87552178, + "num_input_tokens_seen": 331093008, + "router_z_loss_mlp": 0.24597168, + "step": 3991, + "time_per_iteration": 2.6160435676574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070646, + "balance_loss_mlp": 1.04651809, + "epoch": 0.7679876875721431, + "flos": 923550501888.0, + "grad_norm": 0.06405846615426129, + "language_loss": 0.83226049, + "learning_rate": 0.00013464674222578643, + "loss": 0.84296697, + "num_input_tokens_seen": 331177120, + "router_z_loss_mlp": 0.24133301, + "step": 3992, + "time_per_iteration": 3.241417407989502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075856, + "balance_loss_mlp": 1.05133498, + "epoch": 0.7681800692574067, + "flos": 458087311872.0, + "grad_norm": 0.0861576715386882, + "language_loss": 0.83366966, + "learning_rate": 0.00013443412580465292, + "loss": 0.84442818, + "num_input_tokens_seen": 331245424, + "router_z_loss_mlp": 0.24523926, + "step": 3993, + "time_per_iteration": 4.034927606582642 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077486, + "balance_loss_mlp": 1.05234468, + "epoch": 0.7683724509426703, + "flos": 658436179968.0, + "grad_norm": 0.06539433729122356, + "language_loss": 0.84409565, + "learning_rate": 0.00013422165130877857, + "loss": 0.8548705, + "num_input_tokens_seen": 331327504, + "router_z_loss_mlp": 0.25146484, + "step": 3994, + "time_per_iteration": 2.899876356124878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076399, + "balance_loss_mlp": 1.05162692, + "epoch": 0.7685648326279338, + "flos": 555284491776.0, + "grad_norm": 0.06486497816335288, + "language_loss": 0.80478024, + "learning_rate": 0.00013400931882065327, + "loss": 0.81554425, + "num_input_tokens_seen": 331398464, + "router_z_loss_mlp": 0.24755859, + "step": 3995, + "time_per_iteration": 2.637848138809204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075008, + "balance_loss_mlp": 1.04965222, + "epoch": 0.7687572143131974, + "flos": 687404081664.0, + "grad_norm": 0.05862406149444633, + "language_loss": 0.80990946, + "learning_rate": 0.0001337971284227118, + "loss": 0.82065952, + "num_input_tokens_seen": 331484592, + "router_z_loss_mlp": 0.25378418, + "step": 3996, + "time_per_iteration": 2.996047258377075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014574, + "balance_loss_mlp": 1.00766027, + "epoch": 0.7689495959984609, + "flos": 1489453691904.0, + "grad_norm": 0.0056434488295698, + "language_loss": 0.76118422, + "learning_rate": 0.00013358508019733388, + "loss": 0.77132994, + "num_input_tokens_seen": 331721360, + "router_z_loss_mlp": 0.06933594, + "step": 3997, + "time_per_iteration": 4.898136854171753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079533, + "balance_loss_mlp": 1.05454707, + "epoch": 0.7691419776837245, + "flos": 570405888000.0, + "grad_norm": 0.05325095394191187, + "language_loss": 0.8044312, + "learning_rate": 0.0001333731742268438, + "loss": 0.81522655, + "num_input_tokens_seen": 331794240, + "router_z_loss_mlp": 0.24963379, + "step": 3998, + "time_per_iteration": 2.682598352432251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074014, + "balance_loss_mlp": 1.0484314, + "epoch": 0.7693343593689881, + "flos": 520087495680.0, + "grad_norm": 0.05950995381117831, + "language_loss": 0.85969436, + "learning_rate": 0.0001331614105935109, + "loss": 0.87043446, + "num_input_tokens_seen": 331866496, + "router_z_loss_mlp": 0.25598145, + "step": 3999, + "time_per_iteration": 2.6892640590667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074883, + "balance_loss_mlp": 1.04993236, + "epoch": 0.7695267410542517, + "flos": 660378438144.0, + "grad_norm": 0.05504712261606648, + "language_loss": 0.84570682, + "learning_rate": 0.00013294978937954883, + "loss": 0.85645556, + "num_input_tokens_seen": 331936592, + "router_z_loss_mlp": 0.24963379, + "step": 4000, + "time_per_iteration": 2.7923429012298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069985, + "balance_loss_mlp": 1.04480815, + "epoch": 0.7697191227395151, + "flos": 546809564160.0, + "grad_norm": 0.06369425747465635, + "language_loss": 0.85450065, + "learning_rate": 0.00013273831066711655, + "loss": 0.86520052, + "num_input_tokens_seen": 332003536, + "router_z_loss_mlp": 0.25195312, + "step": 4001, + "time_per_iteration": 2.604282855987549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078533, + "balance_loss_mlp": 1.05357099, + "epoch": 0.7699115044247787, + "flos": 540610205184.0, + "grad_norm": 0.05227266936279928, + "language_loss": 0.80363703, + "learning_rate": 0.00013252697453831747, + "loss": 0.81442237, + "num_input_tokens_seen": 332075248, + "router_z_loss_mlp": 0.24975586, + "step": 4002, + "time_per_iteration": 2.7329213619232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075879, + "balance_loss_mlp": 1.04984355, + "epoch": 0.7701038861100423, + "flos": 562936407552.0, + "grad_norm": 0.05410166650315036, + "language_loss": 0.82619524, + "learning_rate": 0.00013231578107519916, + "loss": 0.836954, + "num_input_tokens_seen": 332158944, + "router_z_loss_mlp": 0.26037598, + "step": 4003, + "time_per_iteration": 2.8749208450317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070306, + "balance_loss_mlp": 1.04640484, + "epoch": 0.7702962677953059, + "flos": 481737964032.0, + "grad_norm": 0.06212967389396702, + "language_loss": 0.82997644, + "learning_rate": 0.00013210473035975422, + "loss": 0.84067953, + "num_input_tokens_seen": 332226368, + "router_z_loss_mlp": 0.23901367, + "step": 4004, + "time_per_iteration": 2.632235288619995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073797, + "balance_loss_mlp": 1.04878688, + "epoch": 0.7704886494805695, + "flos": 770389138944.0, + "grad_norm": 0.08127476835150631, + "language_loss": 0.85770059, + "learning_rate": 0.0001318938224739201, + "loss": 0.8684386, + "num_input_tokens_seen": 332314784, + "router_z_loss_mlp": 0.25012207, + "step": 4005, + "time_per_iteration": 3.1193981170654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069411, + "balance_loss_mlp": 1.04423416, + "epoch": 0.770681031165833, + "flos": 601192336896.0, + "grad_norm": 0.06450494228742032, + "language_loss": 0.84133303, + "learning_rate": 0.00013168305749957843, + "loss": 0.85202718, + "num_input_tokens_seen": 332387952, + "router_z_loss_mlp": 0.25183105, + "step": 4006, + "time_per_iteration": 2.7679009437561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066176, + "balance_loss_mlp": 1.04140389, + "epoch": 0.7708734128510966, + "flos": 496108302336.0, + "grad_norm": 0.05098549472423512, + "language_loss": 0.82851386, + "learning_rate": 0.00013147243551855532, + "loss": 0.83917558, + "num_input_tokens_seen": 332456352, + "router_z_loss_mlp": 0.24768066, + "step": 4007, + "time_per_iteration": 2.56661057472229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106967, + "balance_loss_mlp": 1.04553032, + "epoch": 0.7710657945363601, + "flos": 567299966976.0, + "grad_norm": 0.048065096858605744, + "language_loss": 0.80720365, + "learning_rate": 0.00013126195661262148, + "loss": 0.81790042, + "num_input_tokens_seen": 332534288, + "router_z_loss_mlp": 0.24133301, + "step": 4008, + "time_per_iteration": 2.748946189880371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073116, + "balance_loss_mlp": 1.04871428, + "epoch": 0.7712581762216237, + "flos": 604550075904.0, + "grad_norm": 0.05225893259169981, + "language_loss": 0.86849338, + "learning_rate": 0.00013105162086349216, + "loss": 0.87922454, + "num_input_tokens_seen": 332615440, + "router_z_loss_mlp": 0.24389648, + "step": 4009, + "time_per_iteration": 2.8441760540008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075731, + "balance_loss_mlp": 1.05036318, + "epoch": 0.7714505579068872, + "flos": 530894891520.0, + "grad_norm": 0.05592364071179798, + "language_loss": 0.86026949, + "learning_rate": 0.00013084142835282687, + "loss": 0.87102675, + "num_input_tokens_seen": 332687360, + "router_z_loss_mlp": 0.25390625, + "step": 4010, + "time_per_iteration": 2.6637930870056152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018937, + "balance_loss_mlp": 1.01207089, + "epoch": 0.7716429395921508, + "flos": 1422205267968.0, + "grad_norm": 0.009139726311940191, + "language_loss": 0.79884362, + "learning_rate": 0.00013063137916222956, + "loss": 0.80903304, + "num_input_tokens_seen": 332919936, + "router_z_loss_mlp": 0.06884766, + "step": 4011, + "time_per_iteration": 4.862056255340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071968, + "balance_loss_mlp": 1.04698229, + "epoch": 0.7718353212774144, + "flos": 578428563456.0, + "grad_norm": 0.059120136224188345, + "language_loss": 0.89225733, + "learning_rate": 0.0001304214733732485, + "loss": 0.90297705, + "num_input_tokens_seen": 332990096, + "router_z_loss_mlp": 0.24987793, + "step": 4012, + "time_per_iteration": 2.759030818939209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073689, + "balance_loss_mlp": 1.04848814, + "epoch": 0.772027702962678, + "flos": 510742941696.0, + "grad_norm": 0.06471041730678638, + "language_loss": 0.82730675, + "learning_rate": 0.00013021171106737672, + "loss": 0.83804369, + "num_input_tokens_seen": 333063616, + "router_z_loss_mlp": 0.25219727, + "step": 4013, + "time_per_iteration": 2.6606547832489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076087, + "balance_loss_mlp": 1.05184019, + "epoch": 0.7722200846479416, + "flos": 525661705728.0, + "grad_norm": 0.04900066226128831, + "language_loss": 0.79908812, + "learning_rate": 0.00013000209232605071, + "loss": 0.80984896, + "num_input_tokens_seen": 333136368, + "router_z_loss_mlp": 0.24230957, + "step": 4014, + "time_per_iteration": 2.665905237197876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078315, + "balance_loss_mlp": 1.05282855, + "epoch": 0.772412466333205, + "flos": 479598216192.0, + "grad_norm": 0.06112285511526787, + "language_loss": 0.79969144, + "learning_rate": 0.0001297926172306519, + "loss": 0.81047451, + "num_input_tokens_seen": 333207136, + "router_z_loss_mlp": 0.25500488, + "step": 4015, + "time_per_iteration": 2.657850503921509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070931, + "balance_loss_mlp": 1.04582596, + "epoch": 0.7726048480184686, + "flos": 905688801792.0, + "grad_norm": 0.05237005996318212, + "language_loss": 0.79076529, + "learning_rate": 0.0001295832858625055, + "loss": 0.80147457, + "num_input_tokens_seen": 333291920, + "router_z_loss_mlp": 0.25097656, + "step": 4016, + "time_per_iteration": 3.2558414936065674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073374, + "balance_loss_mlp": 1.04854274, + "epoch": 0.7727972297037322, + "flos": 631380801024.0, + "grad_norm": 0.05315298112926871, + "language_loss": 0.70125198, + "learning_rate": 0.00012937409830288154, + "loss": 0.71198577, + "num_input_tokens_seen": 333369824, + "router_z_loss_mlp": 0.24841309, + "step": 4017, + "time_per_iteration": 2.8071141242980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075792, + "balance_loss_mlp": 1.05123496, + "epoch": 0.7729896113889958, + "flos": 414786147840.0, + "grad_norm": 0.08174629048590813, + "language_loss": 0.85300404, + "learning_rate": 0.00012916505463299362, + "loss": 0.86376196, + "num_input_tokens_seen": 333434192, + "router_z_loss_mlp": 0.24572754, + "step": 4018, + "time_per_iteration": 2.521777868270874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107567, + "balance_loss_mlp": 1.05085087, + "epoch": 0.7731819930742593, + "flos": 668907694080.0, + "grad_norm": 0.06122283434294953, + "language_loss": 0.78065503, + "learning_rate": 0.00012895615493399972, + "loss": 0.7914117, + "num_input_tokens_seen": 333509696, + "router_z_loss_mlp": 0.24816895, + "step": 4019, + "time_per_iteration": 2.843815326690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077372, + "balance_loss_mlp": 1.05187333, + "epoch": 0.7733743747595229, + "flos": 489854615040.0, + "grad_norm": 0.06855885620407597, + "language_loss": 0.82437384, + "learning_rate": 0.00012874739928700192, + "loss": 0.83514762, + "num_input_tokens_seen": 333575184, + "router_z_loss_mlp": 0.25512695, + "step": 4020, + "time_per_iteration": 2.625136613845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074703, + "balance_loss_mlp": 1.049371, + "epoch": 0.7735667564447865, + "flos": 659612325888.0, + "grad_norm": 0.06838381213825387, + "language_loss": 0.79939824, + "learning_rate": 0.00012853878777304624, + "loss": 0.81014526, + "num_input_tokens_seen": 333651568, + "router_z_loss_mlp": 0.25341797, + "step": 4021, + "time_per_iteration": 2.870577335357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078589, + "balance_loss_mlp": 1.05390024, + "epoch": 0.77375913813005, + "flos": 533383004160.0, + "grad_norm": 0.05110075490537154, + "language_loss": 0.84490621, + "learning_rate": 0.000128330320473123, + "loss": 0.85569209, + "num_input_tokens_seen": 333726400, + "router_z_loss_mlp": 0.24694824, + "step": 4022, + "time_per_iteration": 2.6740787029266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016568, + "balance_loss_mlp": 1.0095588, + "epoch": 0.7739515198153136, + "flos": 1520081925120.0, + "grad_norm": 0.007922822581460296, + "language_loss": 0.783319, + "learning_rate": 0.00012812199746816628, + "loss": 0.79348469, + "num_input_tokens_seen": 333960224, + "router_z_loss_mlp": 0.0703125, + "step": 4023, + "time_per_iteration": 4.92633318901062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079759, + "balance_loss_mlp": 1.05466557, + "epoch": 0.7741439015005771, + "flos": 640105348608.0, + "grad_norm": 0.0679194814270071, + "language_loss": 0.81913227, + "learning_rate": 0.0001279138188390543, + "loss": 0.82992983, + "num_input_tokens_seen": 334033904, + "router_z_loss_mlp": 0.25109863, + "step": 4024, + "time_per_iteration": 2.822410821914673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074667, + "balance_loss_mlp": 1.05036056, + "epoch": 0.7743362831858407, + "flos": 665841420288.0, + "grad_norm": 0.05859090584356498, + "language_loss": 0.86860305, + "learning_rate": 0.00012770578466660915, + "loss": 0.87934977, + "num_input_tokens_seen": 334107904, + "router_z_loss_mlp": 0.24291992, + "step": 4025, + "time_per_iteration": 2.9427406787872314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081782, + "balance_loss_mlp": 1.05593777, + "epoch": 0.7745286648711043, + "flos": 562760939520.0, + "grad_norm": 0.062055121147232294, + "language_loss": 0.82006752, + "learning_rate": 0.0001274978950315968, + "loss": 0.83088535, + "num_input_tokens_seen": 334184048, + "router_z_loss_mlp": 0.25878906, + "step": 4026, + "time_per_iteration": 2.795128583908081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075185, + "balance_loss_mlp": 1.05006814, + "epoch": 0.7747210465563679, + "flos": 516912565248.0, + "grad_norm": 0.08117867948419291, + "language_loss": 0.83182287, + "learning_rate": 0.00012729015001472716, + "loss": 0.84257472, + "num_input_tokens_seen": 334257152, + "router_z_loss_mlp": 0.2512207, + "step": 4027, + "time_per_iteration": 2.6325039863586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074916, + "balance_loss_mlp": 1.04923844, + "epoch": 0.7749134282416313, + "flos": 634209937920.0, + "grad_norm": 0.05921270053212527, + "language_loss": 0.82036096, + "learning_rate": 0.00012708254969665418, + "loss": 0.83111012, + "num_input_tokens_seen": 334331312, + "router_z_loss_mlp": 0.25695801, + "step": 4028, + "time_per_iteration": 2.7775604724884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079508, + "balance_loss_mlp": 1.05439043, + "epoch": 0.7751058099268949, + "flos": 495364584960.0, + "grad_norm": 0.06748938029736128, + "language_loss": 0.83602798, + "learning_rate": 0.00012687509415797526, + "loss": 0.8468231, + "num_input_tokens_seen": 334397344, + "router_z_loss_mlp": 0.2512207, + "step": 4029, + "time_per_iteration": 2.550536632537842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077, + "balance_loss_mlp": 1.05216861, + "epoch": 0.7752981916121585, + "flos": 510310513152.0, + "grad_norm": 0.05440055390110852, + "language_loss": 0.81075221, + "learning_rate": 0.00012666778347923208, + "loss": 0.82152218, + "num_input_tokens_seen": 334467872, + "router_z_loss_mlp": 0.24829102, + "step": 4030, + "time_per_iteration": 2.627509593963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071803, + "balance_loss_mlp": 1.04774618, + "epoch": 0.7754905732974221, + "flos": 497548749312.0, + "grad_norm": 0.05010388479437805, + "language_loss": 0.84007275, + "learning_rate": 0.0001264606177409092, + "loss": 0.85079074, + "num_input_tokens_seen": 334539088, + "router_z_loss_mlp": 0.24047852, + "step": 4031, + "time_per_iteration": 2.6272945404052734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066927, + "balance_loss_mlp": 1.04220271, + "epoch": 0.7756829549826857, + "flos": 480744626688.0, + "grad_norm": 0.05768180331763808, + "language_loss": 0.86024618, + "learning_rate": 0.00012625359702343609, + "loss": 0.87091547, + "num_input_tokens_seen": 334612576, + "router_z_loss_mlp": 0.24707031, + "step": 4032, + "time_per_iteration": 2.7090940475463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066743, + "balance_loss_mlp": 1.04268646, + "epoch": 0.7758753366679492, + "flos": 552630822912.0, + "grad_norm": 0.05938615660994814, + "language_loss": 0.84870774, + "learning_rate": 0.00012604672140718504, + "loss": 0.85937512, + "num_input_tokens_seen": 334677824, + "router_z_loss_mlp": 0.24047852, + "step": 4033, + "time_per_iteration": 2.6182591915130615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069686, + "balance_loss_mlp": 1.04444957, + "epoch": 0.7760677183532128, + "flos": 703835246592.0, + "grad_norm": 0.06535534521633303, + "language_loss": 0.77696943, + "learning_rate": 0.00012583999097247233, + "loss": 0.78766632, + "num_input_tokens_seen": 334751456, + "router_z_loss_mlp": 0.25256348, + "step": 4034, + "time_per_iteration": 2.8029134273529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064158, + "balance_loss_mlp": 1.03980374, + "epoch": 0.7762601000384763, + "flos": 523470200832.0, + "grad_norm": 0.06345201912035602, + "language_loss": 0.79936808, + "learning_rate": 0.0001256334057995578, + "loss": 0.81000972, + "num_input_tokens_seen": 334823008, + "router_z_loss_mlp": 0.24365234, + "step": 4035, + "time_per_iteration": 2.7016162872314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063824, + "balance_loss_mlp": 1.03927922, + "epoch": 0.7764524817237399, + "flos": 557532896256.0, + "grad_norm": 0.056783072410337934, + "language_loss": 0.85302633, + "learning_rate": 0.000125426965968645, + "loss": 0.86366457, + "num_input_tokens_seen": 334896048, + "router_z_loss_mlp": 0.24536133, + "step": 4036, + "time_per_iteration": 2.699063301086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064554, + "balance_loss_mlp": 1.04023552, + "epoch": 0.7766448634090035, + "flos": 579725849088.0, + "grad_norm": 0.07280358929704372, + "language_loss": 0.82468921, + "learning_rate": 0.00012522067155988092, + "loss": 0.83533478, + "num_input_tokens_seen": 334964416, + "router_z_loss_mlp": 0.24304199, + "step": 4037, + "time_per_iteration": 2.6608784198760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065667, + "balance_loss_mlp": 1.04110956, + "epoch": 0.776837245094267, + "flos": 635603397120.0, + "grad_norm": 0.05908438933946337, + "language_loss": 0.75255591, + "learning_rate": 0.00012501452265335617, + "loss": 0.76321256, + "num_input_tokens_seen": 335043360, + "router_z_loss_mlp": 0.24560547, + "step": 4038, + "time_per_iteration": 2.8470029830932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068825, + "balance_loss_mlp": 1.04455364, + "epoch": 0.7770296267795306, + "flos": 614680565760.0, + "grad_norm": 0.059889567588302765, + "language_loss": 0.83042991, + "learning_rate": 0.0001248085193291047, + "loss": 0.84111816, + "num_input_tokens_seen": 335113216, + "router_z_loss_mlp": 0.24255371, + "step": 4039, + "time_per_iteration": 2.730807304382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068961, + "balance_loss_mlp": 1.04434443, + "epoch": 0.7772220084647942, + "flos": 878808890880.0, + "grad_norm": 0.05468138841735705, + "language_loss": 0.82561696, + "learning_rate": 0.00012460266166710443, + "loss": 0.83630657, + "num_input_tokens_seen": 335195824, + "router_z_loss_mlp": 0.24609375, + "step": 4040, + "time_per_iteration": 3.2041711807250977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064376, + "balance_loss_mlp": 1.04067707, + "epoch": 0.7774143901500578, + "flos": 839641489920.0, + "grad_norm": 0.06748219458401046, + "language_loss": 0.77782911, + "learning_rate": 0.00012439694974727633, + "loss": 0.78847289, + "num_input_tokens_seen": 335269712, + "router_z_loss_mlp": 0.23706055, + "step": 4041, + "time_per_iteration": 3.0317485332489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066638, + "balance_loss_mlp": 1.04206872, + "epoch": 0.7776067718353212, + "flos": 568147571712.0, + "grad_norm": 0.05414460584794901, + "language_loss": 0.80244517, + "learning_rate": 0.00012419138364948458, + "loss": 0.81311154, + "num_input_tokens_seen": 335343408, + "router_z_loss_mlp": 0.24560547, + "step": 4042, + "time_per_iteration": 2.7031445503234863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063112, + "balance_loss_mlp": 1.03903186, + "epoch": 0.7777991535205848, + "flos": 745943012352.0, + "grad_norm": 0.05641629712994933, + "language_loss": 0.8246541, + "learning_rate": 0.00012398596345353702, + "loss": 0.83528519, + "num_input_tokens_seen": 335415360, + "router_z_loss_mlp": 0.24072266, + "step": 4043, + "time_per_iteration": 2.9440853595733643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068598, + "balance_loss_mlp": 1.04396939, + "epoch": 0.7779915352058484, + "flos": 538075104768.0, + "grad_norm": 0.06391086688710987, + "language_loss": 0.83438706, + "learning_rate": 0.0001237806892391851, + "loss": 0.84507304, + "num_input_tokens_seen": 335491568, + "router_z_loss_mlp": 0.24621582, + "step": 4044, + "time_per_iteration": 2.699157476425171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070628, + "balance_loss_mlp": 1.04549861, + "epoch": 0.778183916891112, + "flos": 634788099072.0, + "grad_norm": 0.06994716374064003, + "language_loss": 0.81070113, + "learning_rate": 0.0001235755610861233, + "loss": 0.82140744, + "num_input_tokens_seen": 335567200, + "router_z_loss_mlp": 0.25134277, + "step": 4045, + "time_per_iteration": 2.772141933441162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063646, + "balance_loss_mlp": 1.03941059, + "epoch": 0.7783762985763756, + "flos": 588677621760.0, + "grad_norm": 0.05993243352080289, + "language_loss": 0.8561902, + "learning_rate": 0.0001233705790739893, + "loss": 0.86682665, + "num_input_tokens_seen": 335640512, + "router_z_loss_mlp": 0.2421875, + "step": 4046, + "time_per_iteration": 2.715252637863159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063047, + "balance_loss_mlp": 1.03909791, + "epoch": 0.7785686802616391, + "flos": 930656563200.0, + "grad_norm": 0.05972072663503075, + "language_loss": 0.74957597, + "learning_rate": 0.0001231657432823643, + "loss": 0.7602064, + "num_input_tokens_seen": 335726016, + "router_z_loss_mlp": 0.23937988, + "step": 4047, + "time_per_iteration": 3.2537522315979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068015, + "balance_loss_mlp": 1.04351759, + "epoch": 0.7787610619469026, + "flos": 497934190080.0, + "grad_norm": 0.08653070667167902, + "language_loss": 0.79029131, + "learning_rate": 0.0001229610537907725, + "loss": 0.80097145, + "num_input_tokens_seen": 335794864, + "router_z_loss_mlp": 0.24511719, + "step": 4048, + "time_per_iteration": 2.6116526126861572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068205, + "balance_loss_mlp": 1.04393375, + "epoch": 0.7789534436321662, + "flos": 515637674496.0, + "grad_norm": 0.07563630490624115, + "language_loss": 0.9076525, + "learning_rate": 0.00012275651067868143, + "loss": 0.91833448, + "num_input_tokens_seen": 335860928, + "router_z_loss_mlp": 0.24255371, + "step": 4049, + "time_per_iteration": 2.6179893016815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067923, + "balance_loss_mlp": 1.04335427, + "epoch": 0.7791458253174298, + "flos": 988476369408.0, + "grad_norm": 0.05280851265506485, + "language_loss": 0.80811793, + "learning_rate": 0.00012255211402550182, + "loss": 0.81879717, + "num_input_tokens_seen": 335945728, + "router_z_loss_mlp": 0.24572754, + "step": 4050, + "time_per_iteration": 3.24564266204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106679, + "balance_loss_mlp": 1.0426023, + "epoch": 0.7793382070026933, + "flos": 629040992256.0, + "grad_norm": 0.06858136893192632, + "language_loss": 0.76661634, + "learning_rate": 0.00012234786391058727, + "loss": 0.77728426, + "num_input_tokens_seen": 336014848, + "router_z_loss_mlp": 0.24194336, + "step": 4051, + "time_per_iteration": 2.7757604122161865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072241, + "balance_loss_mlp": 1.04733872, + "epoch": 0.7795305886879569, + "flos": 531752408064.0, + "grad_norm": 0.06727738365211818, + "language_loss": 0.85258687, + "learning_rate": 0.0001221437604132352, + "loss": 0.86330926, + "num_input_tokens_seen": 336080096, + "router_z_loss_mlp": 0.24914551, + "step": 4052, + "time_per_iteration": 2.6063547134399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068764, + "balance_loss_mlp": 1.04383731, + "epoch": 0.7797229703732205, + "flos": 611979909120.0, + "grad_norm": 0.07458964549292091, + "language_loss": 0.81427658, + "learning_rate": 0.0001219398036126852, + "loss": 0.82496417, + "num_input_tokens_seen": 336154640, + "router_z_loss_mlp": 0.24926758, + "step": 4053, + "time_per_iteration": 2.7283682823181152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069627, + "balance_loss_mlp": 1.04429483, + "epoch": 0.7799153520584841, + "flos": 872164620288.0, + "grad_norm": 0.05694657807222903, + "language_loss": 0.78109223, + "learning_rate": 0.00012173599358812027, + "loss": 0.79178852, + "num_input_tokens_seen": 336244160, + "router_z_loss_mlp": 0.25341797, + "step": 4054, + "time_per_iteration": 3.234590768814087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071917, + "balance_loss_mlp": 1.04676414, + "epoch": 0.7801077337437476, + "flos": 583627244544.0, + "grad_norm": 0.07419180869397772, + "language_loss": 0.82674354, + "learning_rate": 0.0001215323304186668, + "loss": 0.83746266, + "num_input_tokens_seen": 336317936, + "router_z_loss_mlp": 0.25170898, + "step": 4055, + "time_per_iteration": 2.747619152069092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068312, + "balance_loss_mlp": 1.04483986, + "epoch": 0.7803001154290111, + "flos": 601165172736.0, + "grad_norm": 0.05875387608266639, + "language_loss": 0.88048428, + "learning_rate": 0.00012132881418339364, + "loss": 0.8911674, + "num_input_tokens_seen": 336389504, + "router_z_loss_mlp": 0.23449707, + "step": 4056, + "time_per_iteration": 2.711988687515259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020033, + "balance_loss_mlp": 1.01268935, + "epoch": 0.7804924971142747, + "flos": 1479577591296.0, + "grad_norm": 0.018856968303163506, + "language_loss": 0.77517563, + "learning_rate": 0.00012112544496131306, + "loss": 0.78537595, + "num_input_tokens_seen": 336615536, + "router_z_loss_mlp": 0.07324219, + "step": 4057, + "time_per_iteration": 4.845250129699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068664, + "balance_loss_mlp": 1.04403543, + "epoch": 0.7806848787995383, + "flos": 630362870784.0, + "grad_norm": 0.07123577938940119, + "language_loss": 0.76748145, + "learning_rate": 0.00012092222283137944, + "loss": 0.77816808, + "num_input_tokens_seen": 336686400, + "router_z_loss_mlp": 0.24633789, + "step": 4058, + "time_per_iteration": 2.7359137535095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015158, + "balance_loss_mlp": 1.00781476, + "epoch": 0.7808772604848019, + "flos": 1417587319296.0, + "grad_norm": 0.015685369403867444, + "language_loss": 0.7890631, + "learning_rate": 0.00012071914787249111, + "loss": 0.79921466, + "num_input_tokens_seen": 336912704, + "router_z_loss_mlp": 0.07324219, + "step": 4059, + "time_per_iteration": 4.798109292984009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068881, + "balance_loss_mlp": 1.0447526, + "epoch": 0.7810696421700654, + "flos": 731696011776.0, + "grad_norm": 0.05299878085313822, + "language_loss": 0.83917785, + "learning_rate": 0.00012051622016348856, + "loss": 0.84986663, + "num_input_tokens_seen": 336997040, + "router_z_loss_mlp": 0.24121094, + "step": 4060, + "time_per_iteration": 3.049039125442505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068121, + "balance_loss_mlp": 1.04395747, + "epoch": 0.781262023855329, + "flos": 424941230592.0, + "grad_norm": 0.06624192950559629, + "language_loss": 0.84473646, + "learning_rate": 0.00012031343978315539, + "loss": 0.85541761, + "num_input_tokens_seen": 337059760, + "router_z_loss_mlp": 0.24169922, + "step": 4061, + "time_per_iteration": 2.5507752895355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065012, + "balance_loss_mlp": 1.03988302, + "epoch": 0.7814544055405925, + "flos": 501027628032.0, + "grad_norm": 0.06423319540710895, + "language_loss": 0.82729137, + "learning_rate": 0.00012011080681021774, + "loss": 0.83794153, + "num_input_tokens_seen": 337128528, + "router_z_loss_mlp": 0.2512207, + "step": 4062, + "time_per_iteration": 2.611497640609741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067455, + "balance_loss_mlp": 1.04255247, + "epoch": 0.7816467872258561, + "flos": 462448300032.0, + "grad_norm": 0.05897582780878463, + "language_loss": 0.86540973, + "learning_rate": 0.00011990832132334512, + "loss": 0.87608433, + "num_input_tokens_seen": 337194112, + "router_z_loss_mlp": 0.24902344, + "step": 4063, + "time_per_iteration": 2.538100481033325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069255, + "balance_loss_mlp": 1.04350615, + "epoch": 0.7818391689111197, + "flos": 740818483200.0, + "grad_norm": 0.06886794650581697, + "language_loss": 0.82671422, + "learning_rate": 0.00011970598340114897, + "loss": 0.83740675, + "num_input_tokens_seen": 337270416, + "router_z_loss_mlp": 0.25769043, + "step": 4064, + "time_per_iteration": 2.9499306678771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067844, + "balance_loss_mlp": 1.04288173, + "epoch": 0.7820315505963832, + "flos": 547669278720.0, + "grad_norm": 0.06050124300613184, + "language_loss": 0.83932686, + "learning_rate": 0.00011950379312218396, + "loss": 0.85000533, + "num_input_tokens_seen": 337343024, + "router_z_loss_mlp": 0.24975586, + "step": 4065, + "time_per_iteration": 2.6981561183929443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066746, + "balance_loss_mlp": 1.04218912, + "epoch": 0.7822239322816468, + "flos": 728983245312.0, + "grad_norm": 0.0632552129471463, + "language_loss": 0.86015439, + "learning_rate": 0.00011930175056494719, + "loss": 0.87082189, + "num_input_tokens_seen": 337417232, + "router_z_loss_mlp": 0.24560547, + "step": 4066, + "time_per_iteration": 2.8711161613464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072113, + "balance_loss_mlp": 1.04645956, + "epoch": 0.7824163139669104, + "flos": 452016433152.0, + "grad_norm": 0.04894683237800459, + "language_loss": 0.76267391, + "learning_rate": 0.00011909985580787885, + "loss": 0.77339506, + "num_input_tokens_seen": 337488224, + "router_z_loss_mlp": 0.25683594, + "step": 4067, + "time_per_iteration": 2.623110771179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067479, + "balance_loss_mlp": 1.04259992, + "epoch": 0.782608695652174, + "flos": 540489065472.0, + "grad_norm": 0.051509930807311505, + "language_loss": 0.81230474, + "learning_rate": 0.00011889810892936137, + "loss": 0.82297957, + "num_input_tokens_seen": 337564928, + "router_z_loss_mlp": 0.24865723, + "step": 4068, + "time_per_iteration": 2.7267825603485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072363, + "balance_loss_mlp": 1.04626799, + "epoch": 0.7828010773374374, + "flos": 500308503552.0, + "grad_norm": 0.06989547292093733, + "language_loss": 0.7721833, + "learning_rate": 0.00011869651000771959, + "loss": 0.78290695, + "num_input_tokens_seen": 337641632, + "router_z_loss_mlp": 0.26086426, + "step": 4069, + "time_per_iteration": 2.8386263847351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066766, + "balance_loss_mlp": 1.04239988, + "epoch": 0.782993459022701, + "flos": 600816807936.0, + "grad_norm": 0.05590330027486899, + "language_loss": 0.82998097, + "learning_rate": 0.00011849505912122117, + "loss": 0.84064865, + "num_input_tokens_seen": 337711968, + "router_z_loss_mlp": 0.24353027, + "step": 4070, + "time_per_iteration": 2.7070353031158447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071318, + "balance_loss_mlp": 1.04591501, + "epoch": 0.7831858407079646, + "flos": 810055779840.0, + "grad_norm": 0.07526003762503049, + "language_loss": 0.77768147, + "learning_rate": 0.00011829375634807654, + "loss": 0.78839469, + "num_input_tokens_seen": 337795792, + "router_z_loss_mlp": 0.25415039, + "step": 4071, + "time_per_iteration": 3.020780324935913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070393, + "balance_loss_mlp": 1.04478693, + "epoch": 0.7833782223932282, + "flos": 806594153472.0, + "grad_norm": 0.07034512653521276, + "language_loss": 0.81144375, + "learning_rate": 0.00011809260176643821, + "loss": 0.82214773, + "num_input_tokens_seen": 337875584, + "router_z_loss_mlp": 0.25634766, + "step": 4072, + "time_per_iteration": 3.048811674118042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070982, + "balance_loss_mlp": 1.0463053, + "epoch": 0.7835706040784918, + "flos": 520870860288.0, + "grad_norm": 0.059023675544883115, + "language_loss": 0.83701313, + "learning_rate": 0.00011789159545440131, + "loss": 0.84772301, + "num_input_tokens_seen": 337942304, + "router_z_loss_mlp": 0.2467041, + "step": 4073, + "time_per_iteration": 2.576185703277588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107004, + "balance_loss_mlp": 1.04587591, + "epoch": 0.7837629857637552, + "flos": 505605929472.0, + "grad_norm": 0.05002266794770535, + "language_loss": 0.82456756, + "learning_rate": 0.00011769073749000348, + "loss": 0.83526802, + "num_input_tokens_seen": 338020864, + "router_z_loss_mlp": 0.24169922, + "step": 4074, + "time_per_iteration": 2.8168578147888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079683, + "balance_loss_mlp": 1.05485141, + "epoch": 0.7839553674490188, + "flos": 516124431360.0, + "grad_norm": 0.06200789049799382, + "language_loss": 0.76390898, + "learning_rate": 0.0001174900279512246, + "loss": 0.77470577, + "num_input_tokens_seen": 338089584, + "router_z_loss_mlp": 0.24829102, + "step": 4075, + "time_per_iteration": 2.5813939571380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073387, + "balance_loss_mlp": 1.04935431, + "epoch": 0.7841477491342824, + "flos": 506648825856.0, + "grad_norm": 0.0619484344784815, + "language_loss": 0.81889015, + "learning_rate": 0.00011728946691598707, + "loss": 0.82962406, + "num_input_tokens_seen": 338159568, + "router_z_loss_mlp": 0.2401123, + "step": 4076, + "time_per_iteration": 2.604560136795044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072391, + "balance_loss_mlp": 1.04755974, + "epoch": 0.784340130819546, + "flos": 719636120064.0, + "grad_norm": 0.06799226151214174, + "language_loss": 0.76469254, + "learning_rate": 0.00011708905446215561, + "loss": 0.77541649, + "num_input_tokens_seen": 338233952, + "router_z_loss_mlp": 0.24841309, + "step": 4077, + "time_per_iteration": 2.8604230880737305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076266, + "balance_loss_mlp": 1.05137515, + "epoch": 0.7845325125048095, + "flos": 514441704960.0, + "grad_norm": 0.05961897466734042, + "language_loss": 0.80370939, + "learning_rate": 0.00011688879066753711, + "loss": 0.81447208, + "num_input_tokens_seen": 338309568, + "router_z_loss_mlp": 0.24914551, + "step": 4078, + "time_per_iteration": 2.676522970199585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076486, + "balance_loss_mlp": 1.05258489, + "epoch": 0.7847248941900731, + "flos": 466102646784.0, + "grad_norm": 0.06622191956248384, + "language_loss": 0.87440282, + "learning_rate": 0.00011668867560988122, + "loss": 0.88516766, + "num_input_tokens_seen": 338375920, + "router_z_loss_mlp": 0.23876953, + "step": 4079, + "time_per_iteration": 2.5650012493133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073753, + "balance_loss_mlp": 1.04890943, + "epoch": 0.7849172758753367, + "flos": 503028983808.0, + "grad_norm": 0.05649669086947294, + "language_loss": 0.84200358, + "learning_rate": 0.00011648870936687916, + "loss": 0.85274112, + "num_input_tokens_seen": 338452208, + "router_z_loss_mlp": 0.24853516, + "step": 4080, + "time_per_iteration": 2.746581554412842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077673, + "balance_loss_mlp": 1.05274642, + "epoch": 0.7851096575606002, + "flos": 531999456768.0, + "grad_norm": 0.06352176738045938, + "language_loss": 0.78612041, + "learning_rate": 0.00011628889201616461, + "loss": 0.79689717, + "num_input_tokens_seen": 338522864, + "router_z_loss_mlp": 0.24926758, + "step": 4081, + "time_per_iteration": 2.6527559757232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107817, + "balance_loss_mlp": 1.05413723, + "epoch": 0.7853020392458638, + "flos": 569956207104.0, + "grad_norm": 0.059333709179443264, + "language_loss": 0.81988198, + "learning_rate": 0.00011608922363531393, + "loss": 0.83066362, + "num_input_tokens_seen": 338591024, + "router_z_loss_mlp": 0.24023438, + "step": 4082, + "time_per_iteration": 2.6462624073028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107947, + "balance_loss_mlp": 1.05538988, + "epoch": 0.7854944209311273, + "flos": 832579845120.0, + "grad_norm": 0.07948062508408354, + "language_loss": 0.83596992, + "learning_rate": 0.00011588970430184504, + "loss": 0.84676462, + "num_input_tokens_seen": 338669616, + "router_z_loss_mlp": 0.24072266, + "step": 4083, + "time_per_iteration": 3.0286946296691895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076264, + "balance_loss_mlp": 1.05245781, + "epoch": 0.7856868026163909, + "flos": 559929604608.0, + "grad_norm": 0.051566265858254114, + "language_loss": 0.81763256, + "learning_rate": 0.00011569033409321822, + "loss": 0.82839513, + "num_input_tokens_seen": 338740416, + "router_z_loss_mlp": 0.23803711, + "step": 4084, + "time_per_iteration": 2.677654981613159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074536, + "balance_loss_mlp": 1.051386, + "epoch": 0.7858791843016545, + "flos": 545230725120.0, + "grad_norm": 0.08091390991612231, + "language_loss": 0.73483807, + "learning_rate": 0.00011549111308683591, + "loss": 0.74558342, + "num_input_tokens_seen": 338807664, + "router_z_loss_mlp": 0.23144531, + "step": 4085, + "time_per_iteration": 2.658348560333252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107372, + "balance_loss_mlp": 1.04996181, + "epoch": 0.7860715659869181, + "flos": 380997665280.0, + "grad_norm": 0.08162659995684916, + "language_loss": 0.80996692, + "learning_rate": 0.00011529204136004251, + "loss": 0.8207041, + "num_input_tokens_seen": 338869472, + "router_z_loss_mlp": 0.23754883, + "step": 4086, + "time_per_iteration": 2.4145894050598145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075099, + "balance_loss_mlp": 1.05042303, + "epoch": 0.7862639476721817, + "flos": 567440930304.0, + "grad_norm": 0.05377876358731873, + "language_loss": 0.84456086, + "learning_rate": 0.00011509311899012459, + "loss": 0.85531187, + "num_input_tokens_seen": 338941312, + "router_z_loss_mlp": 0.2467041, + "step": 4087, + "time_per_iteration": 2.6459927558898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107412, + "balance_loss_mlp": 1.05020642, + "epoch": 0.7864563293574451, + "flos": 545238065664.0, + "grad_norm": 0.06320948891726123, + "language_loss": 0.78042746, + "learning_rate": 0.00011489434605431053, + "loss": 0.79116869, + "num_input_tokens_seen": 339010208, + "router_z_loss_mlp": 0.23901367, + "step": 4088, + "time_per_iteration": 2.6297101974487305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071348, + "balance_loss_mlp": 1.04667187, + "epoch": 0.7866487110427087, + "flos": 563536963584.0, + "grad_norm": 0.0677420392553831, + "language_loss": 0.81525648, + "learning_rate": 0.0001146957226297708, + "loss": 0.82596999, + "num_input_tokens_seen": 339081232, + "router_z_loss_mlp": 0.24682617, + "step": 4089, + "time_per_iteration": 2.686326026916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078205, + "balance_loss_mlp": 1.05306411, + "epoch": 0.7868410927279723, + "flos": 728189968896.0, + "grad_norm": 0.0625994839156693, + "language_loss": 0.76622844, + "learning_rate": 0.00011449724879361827, + "loss": 0.77701044, + "num_input_tokens_seen": 339161040, + "router_z_loss_mlp": 0.25146484, + "step": 4090, + "time_per_iteration": 2.944439649581909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072693, + "balance_loss_mlp": 1.04931605, + "epoch": 0.7870334744132359, + "flos": 521355045888.0, + "grad_norm": 0.07495100545355479, + "language_loss": 0.73970717, + "learning_rate": 0.00011429892462290687, + "loss": 0.7504341, + "num_input_tokens_seen": 339233984, + "router_z_loss_mlp": 0.23376465, + "step": 4091, + "time_per_iteration": 2.666902542114258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076674, + "balance_loss_mlp": 1.05143714, + "epoch": 0.7872258560984994, + "flos": 451411107840.0, + "grad_norm": 0.08113379583083238, + "language_loss": 0.83441567, + "learning_rate": 0.00011410075019463295, + "loss": 0.84518242, + "num_input_tokens_seen": 339303168, + "router_z_loss_mlp": 0.25256348, + "step": 4092, + "time_per_iteration": 2.6106560230255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077622, + "balance_loss_mlp": 1.05387568, + "epoch": 0.787418237783763, + "flos": 515195334144.0, + "grad_norm": 0.06484532402040227, + "language_loss": 0.80351543, + "learning_rate": 0.00011390272558573461, + "loss": 0.81429172, + "num_input_tokens_seen": 339374512, + "router_z_loss_mlp": 0.23730469, + "step": 4093, + "time_per_iteration": 2.6585676670074463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068179, + "balance_loss_mlp": 1.04527855, + "epoch": 0.7876106194690266, + "flos": 485081021952.0, + "grad_norm": 0.06429182980195784, + "language_loss": 0.79664457, + "learning_rate": 0.00011370485087309202, + "loss": 0.80732632, + "num_input_tokens_seen": 339442720, + "router_z_loss_mlp": 0.22888184, + "step": 4094, + "time_per_iteration": 2.6336958408355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077617, + "balance_loss_mlp": 1.05357265, + "epoch": 0.7878030011542901, + "flos": 542841357312.0, + "grad_norm": 0.05995704636978539, + "language_loss": 0.79134881, + "learning_rate": 0.00011350712613352688, + "loss": 0.80212498, + "num_input_tokens_seen": 339508800, + "router_z_loss_mlp": 0.24023438, + "step": 4095, + "time_per_iteration": 2.7010250091552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075966, + "balance_loss_mlp": 1.05183816, + "epoch": 0.7879953828395537, + "flos": 516739668480.0, + "grad_norm": 0.08641736652738899, + "language_loss": 0.79282218, + "learning_rate": 0.00011330955144380283, + "loss": 0.80358183, + "num_input_tokens_seen": 339578048, + "router_z_loss_mlp": 0.24121094, + "step": 4096, + "time_per_iteration": 2.608861207962036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075916, + "balance_loss_mlp": 1.0511322, + "epoch": 0.7881877645248172, + "flos": 582278201856.0, + "grad_norm": 0.05952811942716876, + "language_loss": 0.8624779, + "learning_rate": 0.00011311212688062483, + "loss": 0.87323707, + "num_input_tokens_seen": 339650176, + "router_z_loss_mlp": 0.24780273, + "step": 4097, + "time_per_iteration": 2.779981851577759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071321, + "balance_loss_mlp": 1.04694271, + "epoch": 0.7883801462100808, + "flos": 589171719168.0, + "grad_norm": 0.07195937030471744, + "language_loss": 0.77942967, + "learning_rate": 0.0001129148525206402, + "loss": 0.79014289, + "num_input_tokens_seen": 339727312, + "router_z_loss_mlp": 0.24365234, + "step": 4098, + "time_per_iteration": 2.821665048599243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073378, + "balance_loss_mlp": 1.04944086, + "epoch": 0.7885725278953444, + "flos": 481728052224.0, + "grad_norm": 0.06162490932245696, + "language_loss": 0.86554086, + "learning_rate": 0.00011271772844043759, + "loss": 0.87627459, + "num_input_tokens_seen": 339801344, + "router_z_loss_mlp": 0.23913574, + "step": 4099, + "time_per_iteration": 2.6586296558380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071216, + "balance_loss_mlp": 1.04692149, + "epoch": 0.788764909580608, + "flos": 756794824704.0, + "grad_norm": 0.0681580072696929, + "language_loss": 0.75896525, + "learning_rate": 0.00011252075471654727, + "loss": 0.7696774, + "num_input_tokens_seen": 339877840, + "router_z_loss_mlp": 0.24279785, + "step": 4100, + "time_per_iteration": 2.9564926624298096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078491, + "balance_loss_mlp": 1.05411243, + "epoch": 0.7889572912658714, + "flos": 702555213312.0, + "grad_norm": 0.05782591826916926, + "language_loss": 0.78057027, + "learning_rate": 0.00011232393142544133, + "loss": 0.79135513, + "num_input_tokens_seen": 339959568, + "router_z_loss_mlp": 0.24365234, + "step": 4101, + "time_per_iteration": 2.9211971759796143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071535, + "balance_loss_mlp": 1.04766965, + "epoch": 0.789149672951135, + "flos": 736405364736.0, + "grad_norm": 0.05727153385577965, + "language_loss": 0.83000249, + "learning_rate": 0.00011212725864353323, + "loss": 0.84071785, + "num_input_tokens_seen": 340043600, + "router_z_loss_mlp": 0.23864746, + "step": 4102, + "time_per_iteration": 3.0621325969696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010163, + "balance_loss_mlp": 1.00324917, + "epoch": 0.7893420546363986, + "flos": 1481396511744.0, + "grad_norm": 0.011162541851203939, + "language_loss": 0.76335925, + "learning_rate": 0.00011193073644717822, + "loss": 0.77346092, + "num_input_tokens_seen": 340270608, + "router_z_loss_mlp": 0.06933594, + "step": 4103, + "time_per_iteration": 4.843589782714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079314, + "balance_loss_mlp": 1.05412483, + "epoch": 0.7895344363216622, + "flos": 509072698368.0, + "grad_norm": 0.06819602416212077, + "language_loss": 0.7587899, + "learning_rate": 0.00011173436491267291, + "loss": 0.76958299, + "num_input_tokens_seen": 340338784, + "router_z_loss_mlp": 0.25195312, + "step": 4104, + "time_per_iteration": 2.5900800228118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071242, + "balance_loss_mlp": 1.04709017, + "epoch": 0.7897268180069258, + "flos": 541988983296.0, + "grad_norm": 0.058920781794481306, + "language_loss": 0.82051444, + "learning_rate": 0.0001115381441162554, + "loss": 0.83122683, + "num_input_tokens_seen": 340407744, + "router_z_loss_mlp": 0.24145508, + "step": 4105, + "time_per_iteration": 2.615739345550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010138, + "balance_loss_mlp": 1.00322342, + "epoch": 0.7899191996921893, + "flos": 1412687817216.0, + "grad_norm": 0.008527676451490414, + "language_loss": 0.73583722, + "learning_rate": 0.00011134207413410557, + "loss": 0.74593866, + "num_input_tokens_seen": 340635824, + "router_z_loss_mlp": 0.06933594, + "step": 4106, + "time_per_iteration": 4.8757970333099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073097, + "balance_loss_mlp": 1.04834938, + "epoch": 0.7901115813774529, + "flos": 622841633280.0, + "grad_norm": 0.06585069884795663, + "language_loss": 0.85103577, + "learning_rate": 0.00011114615504234465, + "loss": 0.8617667, + "num_input_tokens_seen": 340710928, + "router_z_loss_mlp": 0.24743652, + "step": 4107, + "time_per_iteration": 2.7732784748077393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069208, + "balance_loss_mlp": 1.04528236, + "epoch": 0.7903039630627164, + "flos": 645545935872.0, + "grad_norm": 0.058284414227118955, + "language_loss": 0.80976272, + "learning_rate": 0.00011095038691703468, + "loss": 0.82045484, + "num_input_tokens_seen": 340786128, + "router_z_loss_mlp": 0.23913574, + "step": 4108, + "time_per_iteration": 2.8352532386779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068205, + "balance_loss_mlp": 1.04486418, + "epoch": 0.79049634474798, + "flos": 594365257728.0, + "grad_norm": 0.05943698489540686, + "language_loss": 0.82732534, + "learning_rate": 0.00011075476983417998, + "loss": 0.83800745, + "num_input_tokens_seen": 340861616, + "router_z_loss_mlp": 0.2331543, + "step": 4109, + "time_per_iteration": 2.8430795669555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074219, + "balance_loss_mlp": 1.04974484, + "epoch": 0.7906887264332435, + "flos": 716093001216.0, + "grad_norm": 0.060761374813596516, + "language_loss": 0.77827907, + "learning_rate": 0.00011055930386972579, + "loss": 0.78902125, + "num_input_tokens_seen": 340934480, + "router_z_loss_mlp": 0.24487305, + "step": 4110, + "time_per_iteration": 2.8313000202178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075057, + "balance_loss_mlp": 1.04976118, + "epoch": 0.7908811081185071, + "flos": 789893918208.0, + "grad_norm": 0.06263812917256945, + "language_loss": 0.78423452, + "learning_rate": 0.00011036398909955863, + "loss": 0.79498512, + "num_input_tokens_seen": 341014912, + "router_z_loss_mlp": 0.2532959, + "step": 4111, + "time_per_iteration": 2.952772378921509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070116, + "balance_loss_mlp": 1.04626179, + "epoch": 0.7910734898037707, + "flos": 641904072192.0, + "grad_norm": 0.053341385482647614, + "language_loss": 0.8184247, + "learning_rate": 0.00011016882559950648, + "loss": 0.82912588, + "num_input_tokens_seen": 341090608, + "router_z_loss_mlp": 0.23852539, + "step": 4112, + "time_per_iteration": 2.8120028972625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071944, + "balance_loss_mlp": 1.0478282, + "epoch": 0.7912658714890343, + "flos": 669357374976.0, + "grad_norm": 0.05974230031891692, + "language_loss": 0.81143081, + "learning_rate": 0.00010997381344533853, + "loss": 0.82215035, + "num_input_tokens_seen": 341160992, + "router_z_loss_mlp": 0.2409668, + "step": 4113, + "time_per_iteration": 2.7792022228240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072154, + "balance_loss_mlp": 1.04729867, + "epoch": 0.7914582531742979, + "flos": 557779944960.0, + "grad_norm": 0.1069423386655018, + "language_loss": 0.80604601, + "learning_rate": 0.00010977895271276517, + "loss": 0.81676757, + "num_input_tokens_seen": 341232032, + "router_z_loss_mlp": 0.24853516, + "step": 4114, + "time_per_iteration": 2.663350820541382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076688, + "balance_loss_mlp": 1.05166614, + "epoch": 0.7916506348595613, + "flos": 570064863744.0, + "grad_norm": 0.054772292847761084, + "language_loss": 0.80259216, + "learning_rate": 0.00010958424347743807, + "loss": 0.81335902, + "num_input_tokens_seen": 341303888, + "router_z_loss_mlp": 0.25036621, + "step": 4115, + "time_per_iteration": 2.7451772689819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068946, + "balance_loss_mlp": 1.04523504, + "epoch": 0.7918430165448249, + "flos": 718301758464.0, + "grad_norm": 0.06136237031197372, + "language_loss": 0.80247879, + "learning_rate": 0.00010938968581494991, + "loss": 0.81316829, + "num_input_tokens_seen": 341385616, + "router_z_loss_mlp": 0.23718262, + "step": 4116, + "time_per_iteration": 2.953747034072876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076632, + "balance_loss_mlp": 1.05221832, + "epoch": 0.7920353982300885, + "flos": 553648753152.0, + "grad_norm": 0.35095947289915363, + "language_loss": 0.79212964, + "learning_rate": 0.000109195279800835, + "loss": 0.80289596, + "num_input_tokens_seen": 341460976, + "router_z_loss_mlp": 0.24414062, + "step": 4117, + "time_per_iteration": 2.715400218963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107219, + "balance_loss_mlp": 1.04746628, + "epoch": 0.7922277799153521, + "flos": 810120019968.0, + "grad_norm": 0.07732725032472638, + "language_loss": 0.76730645, + "learning_rate": 0.00010900102551056834, + "loss": 0.77802831, + "num_input_tokens_seen": 341537328, + "router_z_loss_mlp": 0.24719238, + "step": 4118, + "time_per_iteration": 3.0449063777923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074277, + "balance_loss_mlp": 1.05074465, + "epoch": 0.7924201616006156, + "flos": 421351123968.0, + "grad_norm": 0.056769240767155345, + "language_loss": 0.84711397, + "learning_rate": 0.00010880692301956601, + "loss": 0.85785675, + "num_input_tokens_seen": 341600272, + "router_z_loss_mlp": 0.23535156, + "step": 4119, + "time_per_iteration": 2.4458513259887695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072784, + "balance_loss_mlp": 1.0488348, + "epoch": 0.7926125432858792, + "flos": 617852924928.0, + "grad_norm": 0.050731815554774906, + "language_loss": 0.86393607, + "learning_rate": 0.00010861297240318518, + "loss": 0.87466389, + "num_input_tokens_seen": 341682096, + "router_z_loss_mlp": 0.23937988, + "step": 4120, + "time_per_iteration": 2.8899548053741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071017, + "balance_loss_mlp": 1.04761648, + "epoch": 0.7928049249711427, + "flos": 602487051264.0, + "grad_norm": 0.05101998246826083, + "language_loss": 0.8699168, + "learning_rate": 0.00010841917373672444, + "loss": 0.88062692, + "num_input_tokens_seen": 341754912, + "router_z_loss_mlp": 0.23388672, + "step": 4121, + "time_per_iteration": 2.735093593597412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072991, + "balance_loss_mlp": 1.04925656, + "epoch": 0.7929973066564063, + "flos": 656024790528.0, + "grad_norm": 0.06441744003390583, + "language_loss": 0.78287446, + "learning_rate": 0.00010822552709542293, + "loss": 0.79360437, + "num_input_tokens_seen": 341831152, + "router_z_loss_mlp": 0.23730469, + "step": 4122, + "time_per_iteration": 2.807694911956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107165, + "balance_loss_mlp": 1.04821384, + "epoch": 0.7931896883416699, + "flos": 536397520896.0, + "grad_norm": 0.05111544046549841, + "language_loss": 0.86263895, + "learning_rate": 0.0001080320325544612, + "loss": 0.87335551, + "num_input_tokens_seen": 341903552, + "router_z_loss_mlp": 0.23425293, + "step": 4123, + "time_per_iteration": 2.7574799060821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073909, + "balance_loss_mlp": 1.04917264, + "epoch": 0.7933820700269334, + "flos": 498082493952.0, + "grad_norm": 0.061782912255848775, + "language_loss": 0.82952595, + "learning_rate": 0.00010783869018895997, + "loss": 0.84026504, + "num_input_tokens_seen": 341972256, + "router_z_loss_mlp": 0.24731445, + "step": 4124, + "time_per_iteration": 2.5689857006073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070849, + "balance_loss_mlp": 1.04611349, + "epoch": 0.793574451712197, + "flos": 537472350720.0, + "grad_norm": 0.05779110209300624, + "language_loss": 0.84302074, + "learning_rate": 0.00010764550007398189, + "loss": 0.85372925, + "num_input_tokens_seen": 342040496, + "router_z_loss_mlp": 0.24755859, + "step": 4125, + "time_per_iteration": 2.624270439147949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072775, + "balance_loss_mlp": 1.04818177, + "epoch": 0.7937668333974606, + "flos": 488285687808.0, + "grad_norm": 0.06779162634736255, + "language_loss": 0.81683314, + "learning_rate": 0.00010745246228452982, + "loss": 0.8275609, + "num_input_tokens_seen": 342108512, + "router_z_loss_mlp": 0.24597168, + "step": 4126, + "time_per_iteration": 2.6135480403900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073416, + "balance_loss_mlp": 1.04958653, + "epoch": 0.7939592150827242, + "flos": 527425924608.0, + "grad_norm": 0.08048379704700387, + "language_loss": 0.81824982, + "learning_rate": 0.00010725957689554771, + "loss": 0.82898396, + "num_input_tokens_seen": 342183568, + "router_z_loss_mlp": 0.23791504, + "step": 4127, + "time_per_iteration": 2.7506372928619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072246, + "balance_loss_mlp": 1.0476892, + "epoch": 0.7941515967679876, + "flos": 541702287360.0, + "grad_norm": 0.05152467214863494, + "language_loss": 0.84890383, + "learning_rate": 0.00010706684398192013, + "loss": 0.85962629, + "num_input_tokens_seen": 342259920, + "router_z_loss_mlp": 0.2454834, + "step": 4128, + "time_per_iteration": 2.707517385482788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070232, + "balance_loss_mlp": 1.04580581, + "epoch": 0.7943439784532512, + "flos": 518387516928.0, + "grad_norm": 0.06480144593438951, + "language_loss": 0.82284296, + "learning_rate": 0.00010687426361847313, + "loss": 0.83354527, + "num_input_tokens_seen": 342330192, + "router_z_loss_mlp": 0.24414062, + "step": 4129, + "time_per_iteration": 2.7257397174835205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076314, + "balance_loss_mlp": 1.05154264, + "epoch": 0.7945363601385148, + "flos": 509025710592.0, + "grad_norm": 0.058525612459034176, + "language_loss": 0.85989046, + "learning_rate": 0.00010668183587997254, + "loss": 0.87065363, + "num_input_tokens_seen": 342398944, + "router_z_loss_mlp": 0.24780273, + "step": 4130, + "time_per_iteration": 2.6166372299194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107138, + "balance_loss_mlp": 1.04734755, + "epoch": 0.7947287418237784, + "flos": 651214121472.0, + "grad_norm": 0.05131686661427292, + "language_loss": 0.77755392, + "learning_rate": 0.0001064895608411256, + "loss": 0.78826773, + "num_input_tokens_seen": 342474000, + "router_z_loss_mlp": 0.24047852, + "step": 4131, + "time_per_iteration": 2.806696653366089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074078, + "balance_loss_mlp": 1.04922318, + "epoch": 0.794921123509042, + "flos": 696054477312.0, + "grad_norm": 0.06416883380663327, + "language_loss": 0.80383301, + "learning_rate": 0.00010629743857657998, + "loss": 0.81457376, + "num_input_tokens_seen": 342549184, + "router_z_loss_mlp": 0.24853516, + "step": 4132, + "time_per_iteration": 2.9960997104644775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033011, + "balance_loss_mlp": 1.02619219, + "epoch": 0.7951135051943055, + "flos": 1402942768128.0, + "grad_norm": 0.021894290253507743, + "language_loss": 0.70598668, + "learning_rate": 0.0001061054691609244, + "loss": 0.71631676, + "num_input_tokens_seen": 342767376, + "router_z_loss_mlp": 0.06835938, + "step": 4133, + "time_per_iteration": 4.623692512512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076148, + "balance_loss_mlp": 1.05289078, + "epoch": 0.795305886879569, + "flos": 810085515264.0, + "grad_norm": 0.06117584542977267, + "language_loss": 0.82268703, + "learning_rate": 0.00010591365266868802, + "loss": 0.83344853, + "num_input_tokens_seen": 342845024, + "router_z_loss_mlp": 0.23254395, + "step": 4134, + "time_per_iteration": 2.9868671894073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01030988, + "balance_loss_mlp": 1.02421665, + "epoch": 0.7954982685648326, + "flos": 1426005347328.0, + "grad_norm": 0.02172018156045361, + "language_loss": 0.75511783, + "learning_rate": 0.00010572198917434018, + "loss": 0.76542771, + "num_input_tokens_seen": 343072496, + "router_z_loss_mlp": 0.06787109, + "step": 4135, + "time_per_iteration": 4.976134300231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071076, + "balance_loss_mlp": 1.04654241, + "epoch": 0.7956906502500962, + "flos": 389885197824.0, + "grad_norm": 0.06221307729096171, + "language_loss": 0.79485571, + "learning_rate": 0.00010553047875229166, + "loss": 0.80556649, + "num_input_tokens_seen": 343136928, + "router_z_loss_mlp": 0.24536133, + "step": 4136, + "time_per_iteration": 2.5057613849639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077488, + "balance_loss_mlp": 1.05284762, + "epoch": 0.7958830319353598, + "flos": 515573434368.0, + "grad_norm": 0.058263200481796444, + "language_loss": 0.83615577, + "learning_rate": 0.00010533912147689328, + "loss": 0.84693062, + "num_input_tokens_seen": 343207440, + "router_z_loss_mlp": 0.24645996, + "step": 4137, + "time_per_iteration": 2.601483106613159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073106, + "balance_loss_mlp": 1.04894197, + "epoch": 0.7960754136206233, + "flos": 493941390336.0, + "grad_norm": 0.05046290164389679, + "language_loss": 0.82613522, + "learning_rate": 0.00010514791742243656, + "loss": 0.83686626, + "num_input_tokens_seen": 343273744, + "router_z_loss_mlp": 0.24157715, + "step": 4138, + "time_per_iteration": 2.5787172317504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069273, + "balance_loss_mlp": 1.04491878, + "epoch": 0.7962677953058869, + "flos": 655728182784.0, + "grad_norm": 0.06370293603956936, + "language_loss": 0.82636452, + "learning_rate": 0.00010495686666315341, + "loss": 0.83705723, + "num_input_tokens_seen": 343357648, + "router_z_loss_mlp": 0.2434082, + "step": 4139, + "time_per_iteration": 2.9137964248657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074776, + "balance_loss_mlp": 1.05144691, + "epoch": 0.7964601769911505, + "flos": 542384335872.0, + "grad_norm": 0.06334360078787715, + "language_loss": 0.77300745, + "learning_rate": 0.00010476596927321635, + "loss": 0.78375518, + "num_input_tokens_seen": 343425344, + "router_z_loss_mlp": 0.23327637, + "step": 4140, + "time_per_iteration": 2.635559558868408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071011, + "balance_loss_mlp": 1.04807556, + "epoch": 0.796652558676414, + "flos": 537650016768.0, + "grad_norm": 0.04689989949815107, + "language_loss": 0.80298364, + "learning_rate": 0.00010457522532673835, + "loss": 0.81369376, + "num_input_tokens_seen": 343504960, + "router_z_loss_mlp": 0.22924805, + "step": 4141, + "time_per_iteration": 2.804485321044922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074886, + "balance_loss_mlp": 1.05092525, + "epoch": 0.7968449403616775, + "flos": 475091495424.0, + "grad_norm": 0.0598189384756839, + "language_loss": 0.83671814, + "learning_rate": 0.00010438463489777272, + "loss": 0.84746695, + "num_input_tokens_seen": 343570832, + "router_z_loss_mlp": 0.23937988, + "step": 4142, + "time_per_iteration": 2.593764305114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071767, + "balance_loss_mlp": 1.047472, + "epoch": 0.7970373220469411, + "flos": 567613827072.0, + "grad_norm": 0.06370121218595463, + "language_loss": 0.77954531, + "learning_rate": 0.00010419419806031316, + "loss": 0.79026294, + "num_input_tokens_seen": 343639808, + "router_z_loss_mlp": 0.24291992, + "step": 4143, + "time_per_iteration": 2.6709976196289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073716, + "balance_loss_mlp": 1.05002928, + "epoch": 0.7972297037322047, + "flos": 556208446464.0, + "grad_norm": 0.05491242983846696, + "language_loss": 0.84102464, + "learning_rate": 0.00010400391488829403, + "loss": 0.85176182, + "num_input_tokens_seen": 343715232, + "router_z_loss_mlp": 0.23681641, + "step": 4144, + "time_per_iteration": 2.767470121383667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074277, + "balance_loss_mlp": 1.04962492, + "epoch": 0.7974220854174683, + "flos": 576180158976.0, + "grad_norm": 0.0554387027549828, + "language_loss": 0.86743295, + "learning_rate": 0.00010381378545558984, + "loss": 0.87817574, + "num_input_tokens_seen": 343787168, + "router_z_loss_mlp": 0.24658203, + "step": 4145, + "time_per_iteration": 2.7449891567230225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067536, + "balance_loss_mlp": 1.04415917, + "epoch": 0.7976144671027319, + "flos": 483069754368.0, + "grad_norm": 0.05491436381940487, + "language_loss": 0.8494069, + "learning_rate": 0.00010362380983601505, + "loss": 0.86008221, + "num_input_tokens_seen": 343853600, + "router_z_loss_mlp": 0.23352051, + "step": 4146, + "time_per_iteration": 2.540022134780884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071068, + "balance_loss_mlp": 1.04748869, + "epoch": 0.7978068487879953, + "flos": 1077865615872.0, + "grad_norm": 0.05412435738245621, + "language_loss": 0.79004198, + "learning_rate": 0.00010343398810332477, + "loss": 0.80075264, + "num_input_tokens_seen": 343942816, + "router_z_loss_mlp": 0.2355957, + "step": 4147, + "time_per_iteration": 3.457382917404175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064363, + "balance_loss_mlp": 1.04003215, + "epoch": 0.7979992304732589, + "flos": 733739586048.0, + "grad_norm": 0.057902148991934105, + "language_loss": 0.84552854, + "learning_rate": 0.00010324432033121467, + "loss": 0.8561722, + "num_input_tokens_seen": 344021232, + "router_z_loss_mlp": 0.2434082, + "step": 4148, + "time_per_iteration": 2.923807382583618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068154, + "balance_loss_mlp": 1.04443097, + "epoch": 0.7981916121585225, + "flos": 415774342656.0, + "grad_norm": 0.05631253152540408, + "language_loss": 0.83762592, + "learning_rate": 0.00010305480659332005, + "loss": 0.84830743, + "num_input_tokens_seen": 344089616, + "router_z_loss_mlp": 0.23718262, + "step": 4149, + "time_per_iteration": 2.58620285987854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069961, + "balance_loss_mlp": 1.04592896, + "epoch": 0.7983839938437861, + "flos": 465257613312.0, + "grad_norm": 0.0573271525329392, + "language_loss": 0.83780408, + "learning_rate": 0.00010286544696321682, + "loss": 0.84850371, + "num_input_tokens_seen": 344154992, + "router_z_loss_mlp": 0.24023438, + "step": 4150, + "time_per_iteration": 2.5883052349090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066534, + "balance_loss_mlp": 1.04267991, + "epoch": 0.7985763755290496, + "flos": 510567473664.0, + "grad_norm": 0.06896512478110718, + "language_loss": 0.79666734, + "learning_rate": 0.00010267624151442073, + "loss": 0.80733263, + "num_input_tokens_seen": 344225232, + "router_z_loss_mlp": 0.23864746, + "step": 4151, + "time_per_iteration": 2.660498857498169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069408, + "balance_loss_mlp": 1.04581642, + "epoch": 0.7987687572143132, + "flos": 1010649498624.0, + "grad_norm": 0.08734927072671815, + "language_loss": 0.81357807, + "learning_rate": 0.000102487190320388, + "loss": 0.82427216, + "num_input_tokens_seen": 344309120, + "router_z_loss_mlp": 0.23596191, + "step": 4152, + "time_per_iteration": 3.378927230834961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066042, + "balance_loss_mlp": 1.04099584, + "epoch": 0.7989611388995768, + "flos": 1021078794240.0, + "grad_norm": 0.06453133942638287, + "language_loss": 0.79819167, + "learning_rate": 0.00010229829345451475, + "loss": 0.80885208, + "num_input_tokens_seen": 344394112, + "router_z_loss_mlp": 0.25061035, + "step": 4153, + "time_per_iteration": 3.319824457168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068485, + "balance_loss_mlp": 1.04403472, + "epoch": 0.7991535205848403, + "flos": 1101338601984.0, + "grad_norm": 0.06331967067065977, + "language_loss": 0.79648089, + "learning_rate": 0.00010210955099013724, + "loss": 0.8071658, + "num_input_tokens_seen": 344476512, + "router_z_loss_mlp": 0.24438477, + "step": 4154, + "time_per_iteration": 3.4123902320861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066775, + "balance_loss_mlp": 1.04323101, + "epoch": 0.7993459022701039, + "flos": 834818337792.0, + "grad_norm": 0.07636085006530072, + "language_loss": 0.76853955, + "learning_rate": 0.00010192096300053167, + "loss": 0.77920735, + "num_input_tokens_seen": 344561088, + "router_z_loss_mlp": 0.23547363, + "step": 4155, + "time_per_iteration": 3.0885937213897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062142, + "balance_loss_mlp": 1.03831244, + "epoch": 0.7995382839553674, + "flos": 522686836224.0, + "grad_norm": 0.054659422757200274, + "language_loss": 0.85324514, + "learning_rate": 0.00010173252955891477, + "loss": 0.86386657, + "num_input_tokens_seen": 344639424, + "router_z_loss_mlp": 0.23828125, + "step": 4156, + "time_per_iteration": 2.7677974700927734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106614, + "balance_loss_mlp": 1.04228568, + "epoch": 0.799730665640631, + "flos": 537820715520.0, + "grad_norm": 0.07109273842515242, + "language_loss": 0.7358321, + "learning_rate": 0.00010154425073844253, + "loss": 0.74649352, + "num_input_tokens_seen": 344710048, + "router_z_loss_mlp": 0.23840332, + "step": 4157, + "time_per_iteration": 2.6927075386047363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068264, + "balance_loss_mlp": 1.0444932, + "epoch": 0.7999230473258946, + "flos": 505060075008.0, + "grad_norm": 0.045745633219446115, + "language_loss": 0.82380056, + "learning_rate": 0.00010135612661221138, + "loss": 0.83448321, + "num_input_tokens_seen": 344776832, + "router_z_loss_mlp": 0.23742676, + "step": 4158, + "time_per_iteration": 2.5854756832122803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069104, + "balance_loss_mlp": 1.04447556, + "epoch": 0.8001154290111582, + "flos": 1027342393344.0, + "grad_norm": 0.11567647192687848, + "language_loss": 0.82128304, + "learning_rate": 0.00010116815725325751, + "loss": 0.83197409, + "num_input_tokens_seen": 344864928, + "router_z_loss_mlp": 0.24633789, + "step": 4159, + "time_per_iteration": 3.2739415168762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069034, + "balance_loss_mlp": 1.04451275, + "epoch": 0.8003078106964217, + "flos": 750906754560.0, + "grad_norm": 0.051597263691987395, + "language_loss": 0.8072108, + "learning_rate": 0.00010098034273455725, + "loss": 0.81790113, + "num_input_tokens_seen": 344944048, + "router_z_loss_mlp": 0.24523926, + "step": 4160, + "time_per_iteration": 2.9544758796691895 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 344944048, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9388404519337984.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/training_args.bin b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3b28f0633932ff84d8e0fde7beb2f9c59f0d04be --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b54b92ce31f27a60f5f91da41c22febbdc5fe6a9ac82c4d361c2b9dbc9096639 +size 7992 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/zero_to_fp32.py b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-4160/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/added_tokens.json b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/config.json b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/config.json new file mode 100644 index 0000000000000000000000000000000000000000..4477091c8e5e4d06ea14a8a918edb0ae2310c298 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_sigmoidgating", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/generation_config.json b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/global_step5198/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/global_step5198/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8d2bb32fff09e9e4b035db0cf58711e57d0ee877 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/global_step5198/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb26047eb08608f8d46a6e538cc2a956c3c557cecb86d2824e729703cd446e7f +size 396582032 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/global_step5198/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/global_step5198/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0b09c42b319772636e68b395c5f98b4c4f626240 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/global_step5198/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2385912b21c24637daba2f3924b90e9612c5763986127050d1af66deff7142a +size 396582032 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/global_step5198/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/global_step5198/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..91c737d5501be0441fef9fc8b43ce4a304324440 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/global_step5198/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9be69f7f0c3c9c8fd08ae7d612648d02171d87bebe3980c26e17ccf832073c0 +size 396582032 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/global_step5198/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/global_step5198/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..748e0de876973f8d057e382eab0cbbba539e3be5 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/global_step5198/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2d4aab79aba01762cd9f4ab89d34ef0a304d31c5b04ecb6953ce2422b92df16 +size 396582032 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/global_step5198/zero_pp_rank_0_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/global_step5198/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3e55be15caaddb2fa7fb750d76425c31f2a5e06f --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/global_step5198/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8222c73a1966a2eb35bd90e3a8ef67b54655c4094ee34e2cfc4ccbfb35673978 +size 2117321544 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/global_step5198/zero_pp_rank_1_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/global_step5198/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..23c4ddc06e595b888c06b7c30495f207162bda5e --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/global_step5198/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a7332293b25fceacb0371f8b2e72f9a55dc3b14b6380f32c50df556d72e55b9 +size 2117321544 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/global_step5198/zero_pp_rank_2_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/global_step5198/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..81e18ac99fc872b965b7f3c5bdb34e5094147300 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/global_step5198/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ce413f44ad2a09870264a081c9d1fc8acaff5500aa2a9ea16a834fe32fe38bf +size 2117321544 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/global_step5198/zero_pp_rank_3_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/global_step5198/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e88fd802bcf6d8b92102397196cffab34aa43284 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/global_step5198/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d3f57092a0706674edaba38153bbe4d24a8986b3e700584737db34c29ace333 +size 2117321544 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/latest b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/latest new file mode 100644 index 0000000000000000000000000000000000000000..c0e63763d1d13a0ca7a3b62ff8f5cd1d69cc4978 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/latest @@ -0,0 +1 @@ +global_step5198 \ No newline at end of file diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/model-00001-of-00002.safetensors b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/model-00002-of-00002.safetensors b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a4d239cd18f38895f9c0d72f4844238ac8a2c4ab --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f279a4fbf39064aa08c5b72e2bdde9242074e1b223240741dbb926caad995eed +size 3759025152 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/model.safetensors.index.json b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..2b3448fcaafe26e098595b9e2e5bd9e68d63ee24 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/model.safetensors.index.json @@ -0,0 +1,672 @@ +{ + "metadata": { + "total_size": 8731424736 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/rng_state_0.pth b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef4849062bcdc8ffd2246c07673ba196a8d61a6d --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fae2114fffe9b1eea30e28bbdb4ce59046b0079ea5b8dc4682079f609d49d787 +size 14960 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/rng_state_1.pth b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fcb2b640bc236c26aa841680d34a91240247970 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ff5f3a53530ac868291e2667c8f824bfa1f4fa1ce880df8223a7165ef38e11 +size 14960 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/rng_state_2.pth b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..00c3f989de00e6d58ca7345ae6f65fee0afcbdcd --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f80a7779b0034e70106ba6cb0e3e686052334c20ce54453ee3977cc0219d15 +size 14960 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/rng_state_3.pth b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f289913854ee3fa52a86e282421da07d85b8a4c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece3bc0d0e16c43ef245cc787cbd0d63d08d460f489c4cd52adf6501b9281a18 +size 14960 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/special_tokens_map.json b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/tokenizer.model b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/tokenizer_config.json b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/trainer_state.json b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..00810e9189727c618ff0394deca47874041e6737 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/trainer_state.json @@ -0,0 +1,78003 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 5198, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03935784, + "balance_loss_mlp": 2.84935808, + "epoch": 0.00019238168526356292, + "flos": 470575609344.0, + "grad_norm": 13.498251331228948, + "language_loss": 2.81572914, + "learning_rate": 0.0, + "loss": 1.90346789, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 10.859375, + "step": 1, + "time_per_iteration": 24.30480647087097 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0351246, + "balance_loss_mlp": 2.65644169, + "epoch": 0.00038476337052712584, + "flos": 504556065792.0, + "grad_norm": 27.482987886380492, + "language_loss": 8.76816368, + "learning_rate": 0.00013726078121135892, + "loss": 8.80328846, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 8.578125, + "step": 2, + "time_per_iteration": 2.6929261684417725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03513305, + "balance_loss_mlp": 2.65728736, + "epoch": 0.0005771450557906887, + "flos": 599161245696.0, + "grad_norm": 28.576563245741852, + "language_loss": 9.00053596, + "learning_rate": 0.00021755319103969496, + "loss": 9.03566933, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 8.578125, + "step": 3, + "time_per_iteration": 2.7945075035095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03667009, + "balance_loss_mlp": 2.78657675, + "epoch": 0.0007695267410542517, + "flos": 580405326336.0, + "grad_norm": 15.694146018083416, + "language_loss": 2.74122858, + "learning_rate": 0.00027452156242271784, + "loss": 2.77789879, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 8.828125, + "step": 4, + "time_per_iteration": 2.7773804664611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03933422, + "balance_loss_mlp": 3.01102829, + "epoch": 0.0009619084263178145, + "flos": 486116204544.0, + "grad_norm": 3.505338851882968, + "language_loss": 1.83478093, + "learning_rate": 0.0003187096642208417, + "loss": 1.87411511, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 9.2109375, + "step": 5, + "time_per_iteration": 2.651094675064087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04005588, + "balance_loss_mlp": 3.05420256, + "epoch": 0.0011542901115813775, + "flos": 560028349440.0, + "grad_norm": 3.050600048840319, + "language_loss": 1.61776543, + "learning_rate": 0.0003548139722510539, + "loss": 1.65782118, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 9.4921875, + "step": 6, + "time_per_iteration": 2.697614908218384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03708502, + "balance_loss_mlp": 2.7708497, + "epoch": 0.0013466717968449403, + "flos": 533966307840.0, + "grad_norm": 0.7974788691124679, + "language_loss": 1.32417345, + "learning_rate": 0.00038533972973918044, + "loss": 1.36125851, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 9.3515625, + "step": 7, + "time_per_iteration": 2.6407949924468994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0332405, + "balance_loss_mlp": 2.38868618, + "epoch": 0.0015390534821085034, + "flos": 492295739904.0, + "grad_norm": 0.7144720842381633, + "language_loss": 1.25956392, + "learning_rate": 0.0004117823436340768, + "loss": 1.29280448, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 9.3359375, + "step": 8, + "time_per_iteration": 2.6287930011749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02785454, + "balance_loss_mlp": 1.8508532, + "epoch": 0.0017314351673720662, + "flos": 564676033536.0, + "grad_norm": 0.3140255221758466, + "language_loss": 1.29993415, + "learning_rate": 0.00043510638207938993, + "loss": 1.32778871, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 9.3203125, + "step": 9, + "time_per_iteration": 2.8048858642578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0244685, + "balance_loss_mlp": 1.50004196, + "epoch": 0.001923816852635629, + "flos": 593406798336.0, + "grad_norm": 0.19799802642524775, + "language_loss": 1.19032216, + "learning_rate": 0.00045597044543220066, + "loss": 1.2147907, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 9.4453125, + "step": 10, + "time_per_iteration": 2.7669434547424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02310187, + "balance_loss_mlp": 1.35117221, + "epoch": 0.002116198537899192, + "flos": 609625046016.0, + "grad_norm": 0.14485632700798082, + "language_loss": 1.18421102, + "learning_rate": 0.00047484428652143135, + "loss": 1.20731282, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 9.5703125, + "step": 11, + "time_per_iteration": 2.9067423343658447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02309394, + "balance_loss_mlp": 1.33740926, + "epoch": 0.002308580223162755, + "flos": 545129409024.0, + "grad_norm": 0.1366980934684776, + "language_loss": 1.24379897, + "learning_rate": 0.0004920747534624128, + "loss": 1.26689291, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 9.703125, + "step": 12, + "time_per_iteration": 2.612813949584961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.022984, + "balance_loss_mlp": 1.32565212, + "epoch": 0.002500961908426318, + "flos": 644750461440.0, + "grad_norm": 0.11957957623458634, + "language_loss": 1.26615512, + "learning_rate": 0.0005079252465375872, + "loss": 1.28913903, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 9.7109375, + "step": 13, + "time_per_iteration": 2.879688262939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02311662, + "balance_loss_mlp": 1.34730673, + "epoch": 0.0026933435936898806, + "flos": 487853259264.0, + "grad_norm": 0.10749127497061137, + "language_loss": 1.14448667, + "learning_rate": 0.0005226005109505393, + "loss": 1.16760325, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 9.625, + "step": 14, + "time_per_iteration": 2.568699836730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02285502, + "balance_loss_mlp": 1.3615818, + "epoch": 0.0028857252789534437, + "flos": 434599644672.0, + "grad_norm": 0.11405493545380829, + "language_loss": 1.20514369, + "learning_rate": 0.0005362628552605367, + "loss": 1.22799873, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 9.21875, + "step": 15, + "time_per_iteration": 2.6814210414886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02243131, + "balance_loss_mlp": 1.36117291, + "epoch": 0.0030781069642170067, + "flos": 596739944448.0, + "grad_norm": 0.10465613456634369, + "language_loss": 1.24307358, + "learning_rate": 0.0005490431248454357, + "loss": 1.26550484, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 8.84375, + "step": 16, + "time_per_iteration": 2.681443929672241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02323403, + "balance_loss_mlp": 1.52994621, + "epoch": 0.0032704886494805694, + "flos": 1538188102656.0, + "grad_norm": 0.2929644268686402, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.78028512, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 7.90625, + "step": 17, + "time_per_iteration": 6.376815319061279 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02154669, + "balance_loss_mlp": 1.37418151, + "epoch": 0.0034628703347441324, + "flos": 473969677824.0, + "grad_norm": 0.15081794947454089, + "language_loss": 1.11159086, + "learning_rate": 0.0005723671632907488, + "loss": 1.13313746, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 7.80078125, + "step": 18, + "time_per_iteration": 2.721731424331665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02067628, + "balance_loss_mlp": 1.35466075, + "epoch": 0.0036552520200076955, + "flos": 448537554432.0, + "grad_norm": 0.11430094844987627, + "language_loss": 1.15730095, + "learning_rate": 0.0005830738490244919, + "loss": 1.1779772, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 7.12890625, + "step": 19, + "time_per_iteration": 2.691012382507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01966178, + "balance_loss_mlp": 1.31958628, + "epoch": 0.003847633705271258, + "flos": 636174217728.0, + "grad_norm": 0.10166759343386816, + "language_loss": 1.17760253, + "learning_rate": 0.0005932312266435596, + "loss": 1.19726431, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 6.46484375, + "step": 20, + "time_per_iteration": 2.779218912124634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01836812, + "balance_loss_mlp": 1.26727819, + "epoch": 0.004040015390534821, + "flos": 589495491072.0, + "grad_norm": 0.12846528828878043, + "language_loss": 1.12106359, + "learning_rate": 0.0006028929207788754, + "loss": 1.13943172, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 5.70703125, + "step": 21, + "time_per_iteration": 2.716970443725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01720951, + "balance_loss_mlp": 1.21970022, + "epoch": 0.004232397075798384, + "flos": 756574940160.0, + "grad_norm": 0.09445288880840001, + "language_loss": 1.16516471, + "learning_rate": 0.0006121050677327902, + "loss": 1.18237424, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 5.0078125, + "step": 22, + "time_per_iteration": 2.92696475982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01630624, + "balance_loss_mlp": 1.19193399, + "epoch": 0.004424778761061947, + "flos": 526692119040.0, + "grad_norm": 0.11621712848760359, + "language_loss": 1.06380248, + "learning_rate": 0.0006209076479463684, + "loss": 1.08010876, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 4.3984375, + "step": 23, + "time_per_iteration": 2.666133165359497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01572853, + "balance_loss_mlp": 1.18394423, + "epoch": 0.00461716044632551, + "flos": 548168518656.0, + "grad_norm": 0.10970997088624258, + "language_loss": 1.16519284, + "learning_rate": 0.0006293355346737718, + "loss": 1.18092132, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 3.88476562, + "step": 24, + "time_per_iteration": 2.727487802505493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0152954, + "balance_loss_mlp": 1.18755198, + "epoch": 0.004809542131589073, + "flos": 567584091648.0, + "grad_norm": 0.09735665571869598, + "language_loss": 1.12784922, + "learning_rate": 0.0006374193284416834, + "loss": 1.14314473, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 3.42382812, + "step": 25, + "time_per_iteration": 2.7919249534606934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0148827, + "balance_loss_mlp": 1.19282198, + "epoch": 0.005001923816852636, + "flos": 470629191168.0, + "grad_norm": 0.09233879954989622, + "language_loss": 1.11062908, + "learning_rate": 0.0006451860277489461, + "loss": 1.12551177, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 2.953125, + "step": 26, + "time_per_iteration": 2.581066131591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462989, + "balance_loss_mlp": 1.20988345, + "epoch": 0.005194305502116198, + "flos": 415502701056.0, + "grad_norm": 0.12330238493557526, + "language_loss": 1.19441557, + "learning_rate": 0.0006526595731190848, + "loss": 1.20904553, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 2.52929688, + "step": 27, + "time_per_iteration": 2.49725604057312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01423898, + "balance_loss_mlp": 1.20874906, + "epoch": 0.005386687187379761, + "flos": 628771548672.0, + "grad_norm": 0.09841719698503415, + "language_loss": 1.12322927, + "learning_rate": 0.0006598612921618983, + "loss": 1.13746822, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 2.15625, + "step": 28, + "time_per_iteration": 2.822068929672241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01399446, + "balance_loss_mlp": 1.21443295, + "epoch": 0.005579068872643324, + "flos": 886483201536.0, + "grad_norm": 0.2589331093265968, + "language_loss": 1.06232262, + "learning_rate": 0.0006668102665011454, + "loss": 1.07631707, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 1.84765625, + "step": 29, + "time_per_iteration": 3.2402820587158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01444994, + "balance_loss_mlp": 1.28353739, + "epoch": 0.005771450557906887, + "flos": 547560622080.0, + "grad_norm": 0.1317361033328709, + "language_loss": 1.14859319, + "learning_rate": 0.0006735236364718957, + "loss": 1.16304302, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 1.61425781, + "step": 30, + "time_per_iteration": 2.6861231327056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01333301, + "balance_loss_mlp": 1.20445967, + "epoch": 0.00596383224317045, + "flos": 532026620928.0, + "grad_norm": 0.07039345614882069, + "language_loss": 1.13512135, + "learning_rate": 0.0006800168558381346, + "loss": 1.14845431, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 1.28808594, + "step": 31, + "time_per_iteration": 2.6444640159606934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01254242, + "balance_loss_mlp": 1.153772, + "epoch": 0.0061562139284340135, + "flos": 589082886144.0, + "grad_norm": 0.07602265872136475, + "language_loss": 1.1720531, + "learning_rate": 0.0006863039060567947, + "loss": 1.18459558, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 1.00439453, + "step": 32, + "time_per_iteration": 2.7225399017333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117915, + "balance_loss_mlp": 1.10071015, + "epoch": 0.006348595613697576, + "flos": 618231025152.0, + "grad_norm": 0.062098451262649575, + "language_loss": 1.09530759, + "learning_rate": 0.0006923974775611263, + "loss": 1.10709918, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 0.78417969, + "step": 33, + "time_per_iteration": 2.795565366744995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155392, + "balance_loss_mlp": 1.09416604, + "epoch": 0.006540977298961139, + "flos": 777910376448.0, + "grad_norm": 0.0750568617782567, + "language_loss": 1.06307364, + "learning_rate": 0.0006983091239737814, + "loss": 1.0746274, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 0.61132812, + "step": 34, + "time_per_iteration": 3.0703423023223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138238, + "balance_loss_mlp": 1.0903163, + "epoch": 0.006733358984224702, + "flos": 667143475200.0, + "grad_norm": 0.057198892540160154, + "language_loss": 1.05094206, + "learning_rate": 0.0007040493939600222, + "loss": 1.06232452, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 0.47949219, + "step": 35, + "time_per_iteration": 2.8476996421813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136821, + "balance_loss_mlp": 1.09926963, + "epoch": 0.006925740669488265, + "flos": 564372085248.0, + "grad_norm": 0.07105443011946577, + "language_loss": 1.05056715, + "learning_rate": 0.0007096279445021078, + "loss": 1.06193542, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 0.37548828, + "step": 36, + "time_per_iteration": 2.8306472301483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115252, + "balance_loss_mlp": 1.12274194, + "epoch": 0.007118122354751828, + "flos": 549887947776.0, + "grad_norm": 0.09366404592926651, + "language_loss": 1.11846077, + "learning_rate": 0.0007150536386503726, + "loss": 1.12998605, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 0.29736328, + "step": 37, + "time_per_iteration": 2.875190258026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150569, + "balance_loss_mlp": 1.12677491, + "epoch": 0.007310504040015391, + "flos": 702490973184.0, + "grad_norm": 0.0928332145488954, + "language_loss": 1.04548562, + "learning_rate": 0.0007203346302358509, + "loss": 1.05699134, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 0.23791504, + "step": 38, + "time_per_iteration": 3.0075292587280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128748, + "balance_loss_mlp": 1.10757613, + "epoch": 0.007502885725278953, + "flos": 599316890112.0, + "grad_norm": 0.056043607360260886, + "language_loss": 1.09224963, + "learning_rate": 0.000725478437577282, + "loss": 1.10353708, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 0.21179199, + "step": 39, + "time_per_iteration": 2.78564715385437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111769, + "balance_loss_mlp": 1.09953475, + "epoch": 0.007695267410542516, + "flos": 560285309952.0, + "grad_norm": 0.2122838817863008, + "language_loss": 1.04638147, + "learning_rate": 0.0007304920078549186, + "loss": 1.0575583, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 0.18151855, + "step": 40, + "time_per_iteration": 2.745100975036621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133734, + "balance_loss_mlp": 1.11621058, + "epoch": 0.007887649095806078, + "flos": 508170765312.0, + "grad_norm": 0.14528393981530327, + "language_loss": 1.06509054, + "learning_rate": 0.0007353817735343603, + "loss": 1.07642794, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 0.17529297, + "step": 41, + "time_per_iteration": 2.7425575256347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119894, + "balance_loss_mlp": 1.10357416, + "epoch": 0.008080030781069641, + "flos": 503893840896.0, + "grad_norm": 0.06769616325508275, + "language_loss": 1.0188365, + "learning_rate": 0.0007401537019902344, + "loss": 1.03003538, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 0.16308594, + "step": 42, + "time_per_iteration": 2.6797902584075928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118919, + "balance_loss_mlp": 1.10271883, + "epoch": 0.008272412466333205, + "flos": 518031811584.0, + "grad_norm": 0.14916902722339276, + "language_loss": 1.05194306, + "learning_rate": 0.0007448133392900729, + "loss": 1.06313229, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 0.1619873, + "step": 43, + "time_per_iteration": 2.779276132583618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116153, + "balance_loss_mlp": 1.09945166, + "epoch": 0.008464794151596768, + "flos": 607974626304.0, + "grad_norm": 0.052417895665492535, + "language_loss": 1.00651026, + "learning_rate": 0.0007493658489441491, + "loss": 1.0176717, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 0.16711426, + "step": 44, + "time_per_iteration": 2.965435028076172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108906, + "balance_loss_mlp": 1.09195447, + "epoch": 0.00865717583686033, + "flos": 537929372160.0, + "grad_norm": 0.04248825884697869, + "language_loss": 1.04600978, + "learning_rate": 0.0007538160463002316, + "loss": 1.05709875, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 0.16967773, + "step": 45, + "time_per_iteration": 2.7024173736572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105316, + "balance_loss_mlp": 1.08735132, + "epoch": 0.008849557522123894, + "flos": 508007780352.0, + "grad_norm": 0.08538228051147774, + "language_loss": 1.08093452, + "learning_rate": 0.0007581684291577274, + "loss": 1.09198785, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 0.17980957, + "step": 46, + "time_per_iteration": 2.6020169258117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105368, + "balance_loss_mlp": 1.08635402, + "epoch": 0.009041939207387457, + "flos": 625339657728.0, + "grad_norm": 0.04723509056908367, + "language_loss": 1.10695386, + "learning_rate": 0.0007624272050891776, + "loss": 1.11800754, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 0.19006348, + "step": 47, + "time_per_iteration": 2.8620407581329346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103189, + "balance_loss_mlp": 1.08244705, + "epoch": 0.00923432089265102, + "flos": 549421014528.0, + "grad_norm": 0.07235265954126073, + "language_loss": 1.00601125, + "learning_rate": 0.0007665963158851307, + "loss": 1.01704311, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 0.20751953, + "step": 48, + "time_per_iteration": 2.8312995433807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114938, + "balance_loss_mlp": 1.09308696, + "epoch": 0.009426702577914583, + "flos": 562496638464.0, + "grad_norm": 0.10505304652404167, + "language_loss": 1.09914839, + "learning_rate": 0.0007706794594783609, + "loss": 1.1102978, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 0.21850586, + "step": 49, + "time_per_iteration": 2.779561758041382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110011, + "balance_loss_mlp": 1.0874207, + "epoch": 0.009619084263178146, + "flos": 616773325824.0, + "grad_norm": 0.04709564792407722, + "language_loss": 1.08694363, + "learning_rate": 0.0007746801096530423, + "loss": 1.09804368, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 0.22583008, + "step": 50, + "time_per_iteration": 2.785332441329956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111628, + "balance_loss_mlp": 1.09285581, + "epoch": 0.009811465948441709, + "flos": 541437986304.0, + "grad_norm": 0.09574874491356838, + "language_loss": 1.13402438, + "learning_rate": 0.0007786015338021173, + "loss": 1.14518726, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 0.23425293, + "step": 51, + "time_per_iteration": 2.676326274871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118129, + "balance_loss_mlp": 1.09500206, + "epoch": 0.010003847633705272, + "flos": 535881028608.0, + "grad_norm": 0.12325193255180054, + "language_loss": 1.06019998, + "learning_rate": 0.0007824468089603051, + "loss": 1.07138121, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 0.23144531, + "step": 52, + "time_per_iteration": 2.688828945159912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113509, + "balance_loss_mlp": 1.11038983, + "epoch": 0.010196229318968833, + "flos": 909254315520.0, + "grad_norm": 0.07208467676878935, + "language_loss": 1.05329835, + "learning_rate": 0.0007862188363098669, + "loss": 1.06464922, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 0.24707031, + "step": 53, + "time_per_iteration": 3.3342933654785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126914, + "balance_loss_mlp": 1.10158229, + "epoch": 0.010388611004232396, + "flos": 585868308480.0, + "grad_norm": 0.09794855088059086, + "language_loss": 1.06043434, + "learning_rate": 0.0007899203543304438, + "loss": 1.07170355, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 0.25354004, + "step": 54, + "time_per_iteration": 2.933236837387085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145083, + "balance_loss_mlp": 1.12053776, + "epoch": 0.01058099268949596, + "flos": 502480558080.0, + "grad_norm": 0.1404118977896248, + "language_loss": 1.20000231, + "learning_rate": 0.0007935539507422731, + "loss": 1.2114532, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 0.24536133, + "step": 55, + "time_per_iteration": 2.8257975578308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153246, + "balance_loss_mlp": 1.12969017, + "epoch": 0.010773374374759523, + "flos": 544447360512.0, + "grad_norm": 0.05382700946372506, + "language_loss": 1.10560298, + "learning_rate": 0.0007971220733732573, + "loss": 1.11713552, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 0.2355957, + "step": 56, + "time_per_iteration": 2.749382495880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154989, + "balance_loss_mlp": 1.13151693, + "epoch": 0.010965756060023086, + "flos": 526155803136.0, + "grad_norm": 0.17392462927294325, + "language_loss": 1.05995011, + "learning_rate": 0.0008006270400641869, + "loss": 1.07150006, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 0.23474121, + "step": 57, + "time_per_iteration": 2.743929147720337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125904, + "balance_loss_mlp": 1.10234821, + "epoch": 0.011158137745286649, + "flos": 576941128704.0, + "grad_norm": 0.10169017538987117, + "language_loss": 1.06833839, + "learning_rate": 0.0008040710477125043, + "loss": 1.07959747, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 0.23547363, + "step": 58, + "time_per_iteration": 2.7300469875335693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111797, + "balance_loss_mlp": 1.08861065, + "epoch": 0.011350519430550212, + "flos": 529281547776.0, + "grad_norm": 0.059941584643697095, + "language_loss": 1.07409072, + "learning_rate": 0.0008074561805429771, + "loss": 1.08520865, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 0.23181152, + "step": 59, + "time_per_iteration": 2.6550745964050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123772, + "balance_loss_mlp": 1.09970331, + "epoch": 0.011542901115813775, + "flos": 555879905280.0, + "grad_norm": 0.06438674129900752, + "language_loss": 1.04891515, + "learning_rate": 0.0008107844176832545, + "loss": 1.06015277, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 0.24072266, + "step": 60, + "time_per_iteration": 2.7009053230285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139619, + "balance_loss_mlp": 1.11569333, + "epoch": 0.011735282801077338, + "flos": 572095954944.0, + "grad_norm": 0.09833112160800331, + "language_loss": 1.0671711, + "learning_rate": 0.0008140576401132568, + "loss": 1.07856739, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 0.23913574, + "step": 61, + "time_per_iteration": 2.678501844406128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114169, + "balance_loss_mlp": 1.11887348, + "epoch": 0.0119276644863409, + "flos": 615589839360.0, + "grad_norm": 0.11014501355567002, + "language_loss": 1.07748628, + "learning_rate": 0.0008172776370494935, + "loss": 1.08890319, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 0.22814941, + "step": 62, + "time_per_iteration": 2.7718141078948975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116479, + "balance_loss_mlp": 1.09356666, + "epoch": 0.012120046171604464, + "flos": 501084527616.0, + "grad_norm": 0.06441650429015075, + "language_loss": 1.15269816, + "learning_rate": 0.0008204461118185703, + "loss": 1.16386294, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 0.22912598, + "step": 63, + "time_per_iteration": 2.5839178562164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117138, + "balance_loss_mlp": 1.09543014, + "epoch": 0.012312427856868027, + "flos": 473347100160.0, + "grad_norm": 0.06608006175674933, + "language_loss": 1.04523873, + "learning_rate": 0.0008235646872681536, + "loss": 1.05641007, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 0.21728516, + "step": 64, + "time_per_iteration": 2.5611703395843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127906, + "balance_loss_mlp": 1.10659182, + "epoch": 0.012504809542131588, + "flos": 538369141248.0, + "grad_norm": 0.07834673611922068, + "language_loss": 1.04319417, + "learning_rate": 0.0008266349107584288, + "loss": 1.05447328, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 0.21313477, + "step": 65, + "time_per_iteration": 2.727666139602661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141841, + "balance_loss_mlp": 1.1207881, + "epoch": 0.012697191227395151, + "flos": 608730826752.0, + "grad_norm": 0.06003338375813584, + "language_loss": 1.07126927, + "learning_rate": 0.0008296582587724851, + "loss": 1.08268762, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 0.21057129, + "step": 66, + "time_per_iteration": 2.716701030731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127113, + "balance_loss_mlp": 1.10609627, + "epoch": 0.012889572912658714, + "flos": 768079065600.0, + "grad_norm": 0.04807876202194694, + "language_loss": 1.04662776, + "learning_rate": 0.0008326361411800136, + "loss": 1.05789876, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 0.21008301, + "step": 67, + "time_per_iteration": 2.9571592807769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114337, + "balance_loss_mlp": 1.09446514, + "epoch": 0.013081954597922277, + "flos": 533887013376.0, + "grad_norm": 0.05551510449528945, + "language_loss": 1.05008268, + "learning_rate": 0.0008355699051851403, + "loss": 1.06122601, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 0.1986084, + "step": 68, + "time_per_iteration": 2.725504159927368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143621, + "balance_loss_mlp": 1.1242373, + "epoch": 0.01327433628318584, + "flos": 573096632832.0, + "grad_norm": 0.0697970629442659, + "language_loss": 1.12296045, + "learning_rate": 0.0008384608389860635, + "loss": 1.13439655, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 0.19372559, + "step": 69, + "time_per_iteration": 2.685215711593628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141098, + "balance_loss_mlp": 1.122311, + "epoch": 0.013466717968449404, + "flos": 497274536448.0, + "grad_norm": 0.08511613263061502, + "language_loss": 1.02745342, + "learning_rate": 0.000841310175171381, + "loss": 1.03886437, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 0.18774414, + "step": 70, + "time_per_iteration": 2.649937868118286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142561, + "balance_loss_mlp": 1.12464356, + "epoch": 0.013659099653712967, + "flos": 565511155200.0, + "grad_norm": 0.055787325190813475, + "language_loss": 1.0065217, + "learning_rate": 0.000844119093875517, + "loss": 1.0179472, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 0.17944336, + "step": 71, + "time_per_iteration": 2.753220319747925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152267, + "balance_loss_mlp": 1.13508892, + "epoch": 0.01385148133897653, + "flos": 573820526592.0, + "grad_norm": 0.08668312915327946, + "language_loss": 1.05463254, + "learning_rate": 0.0008468887257134666, + "loss": 1.0661552, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 0.17199707, + "step": 72, + "time_per_iteration": 2.7056305408477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117134, + "balance_loss_mlp": 1.15478206, + "epoch": 0.014043863024240093, + "flos": 576822560256.0, + "grad_norm": 0.07356095482564125, + "language_loss": 1.08388793, + "learning_rate": 0.0008496201545131264, + "loss": 1.09560132, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 0.16564941, + "step": 73, + "time_per_iteration": 2.7202537059783936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152437, + "balance_loss_mlp": 1.13545001, + "epoch": 0.014236244709503656, + "flos": 938681809920.0, + "grad_norm": 0.06787935984484554, + "language_loss": 1.06090975, + "learning_rate": 0.0008523144198617317, + "loss": 1.07243395, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 0.16992188, + "step": 74, + "time_per_iteration": 3.2090003490448 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139286, + "balance_loss_mlp": 1.1223346, + "epoch": 0.014428626394767219, + "flos": 528483502080.0, + "grad_norm": 0.04825332815792917, + "language_loss": 1.053195, + "learning_rate": 0.0008549725194813783, + "loss": 1.06458783, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 0.16967773, + "step": 75, + "time_per_iteration": 2.654343605041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112208, + "balance_loss_mlp": 1.10599899, + "epoch": 0.014621008080030782, + "flos": 803752533504.0, + "grad_norm": 0.03887402020767282, + "language_loss": 1.04797029, + "learning_rate": 0.0008575954114472099, + "loss": 1.05919111, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 0.1607666, + "step": 76, + "time_per_iteration": 3.119884967803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134479, + "balance_loss_mlp": 1.1187191, + "epoch": 0.014813389765294343, + "flos": 696941356032.0, + "grad_norm": 0.056937643991546806, + "language_loss": 1.02038705, + "learning_rate": 0.0008601840162606118, + "loss": 1.03173184, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 0.1574707, + "step": 77, + "time_per_iteration": 3.025688886642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146725, + "balance_loss_mlp": 1.13034582, + "epoch": 0.015005771450557906, + "flos": 596994333696.0, + "grad_norm": 0.04989291514363055, + "language_loss": 1.08127129, + "learning_rate": 0.000862739218788641, + "loss": 1.09273863, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 0.16381836, + "step": 78, + "time_per_iteration": 2.7922520637512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149681, + "balance_loss_mlp": 1.13339734, + "epoch": 0.01519815313582147, + "flos": 549416245248.0, + "grad_norm": 0.06709094188277621, + "language_loss": 1.06189477, + "learning_rate": 0.0008652618700799138, + "loss": 1.07339156, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 0.1628418, + "step": 79, + "time_per_iteration": 2.6902618408203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153255, + "balance_loss_mlp": 1.1367681, + "epoch": 0.015390534821085032, + "flos": 430532692992.0, + "grad_norm": 0.062162504049989416, + "language_loss": 1.05161238, + "learning_rate": 0.0008677527890662774, + "loss": 1.06314492, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 0.16491699, + "step": 80, + "time_per_iteration": 2.475771188735962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147193, + "balance_loss_mlp": 1.13076603, + "epoch": 0.015582916506348595, + "flos": 524119942656.0, + "grad_norm": 0.04934081686369646, + "language_loss": 1.06529951, + "learning_rate": 0.0008702127641587799, + "loss": 1.0767715, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 0.16430664, + "step": 81, + "time_per_iteration": 2.634038209915161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142434, + "balance_loss_mlp": 1.12558985, + "epoch": 0.015775298191612157, + "flos": 575443782144.0, + "grad_norm": 0.08879987127008451, + "language_loss": 1.0221808, + "learning_rate": 0.0008726425547457192, + "loss": 1.0336051, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 0.16845703, + "step": 82, + "time_per_iteration": 2.74308705329895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147568, + "balance_loss_mlp": 1.13108134, + "epoch": 0.01596767987687572, + "flos": 610319577600.0, + "grad_norm": 0.06313420095488197, + "language_loss": 1.01906681, + "learning_rate": 0.0008750428925998964, + "loss": 1.03054249, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 0.16491699, + "step": 83, + "time_per_iteration": 2.777132511138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146759, + "balance_loss_mlp": 1.13009322, + "epoch": 0.016160061562139283, + "flos": 567136982016.0, + "grad_norm": 0.11663644047392754, + "language_loss": 1.07169831, + "learning_rate": 0.0008774144832015932, + "loss": 1.08316588, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 0.16674805, + "step": 84, + "time_per_iteration": 2.733287811279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01524523, + "balance_loss_mlp": 1.51412809, + "epoch": 0.016352443247402846, + "flos": 1411343543808.0, + "grad_norm": 0.22860236459315994, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.76298833, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 0.10400391, + "step": 85, + "time_per_iteration": 4.57580041885376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166169, + "balance_loss_mlp": 1.1501826, + "epoch": 0.01654482493266641, + "flos": 730497844224.0, + "grad_norm": 0.05249425037579876, + "language_loss": 1.01959693, + "learning_rate": 0.0008820741205014318, + "loss": 1.03125858, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 0.15979004, + "step": 86, + "time_per_iteration": 2.8773865699768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223619, + "balance_loss_mlp": 1.20703709, + "epoch": 0.016737206617929972, + "flos": 536293633536.0, + "grad_norm": 0.10761462625124436, + "language_loss": 1.03955913, + "learning_rate": 0.0008843634575408404, + "loss": 1.05179524, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 0.16577148, + "step": 87, + "time_per_iteration": 2.6694159507751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228231, + "balance_loss_mlp": 1.21267366, + "epoch": 0.016929588303193535, + "flos": 536990363136.0, + "grad_norm": 0.10737104518045529, + "language_loss": 1.05078888, + "learning_rate": 0.0008866266301555082, + "loss": 1.06307125, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 0.15551758, + "step": 88, + "time_per_iteration": 2.7686069011688232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212138, + "balance_loss_mlp": 1.19609249, + "epoch": 0.017121969988457098, + "flos": 526756359168.0, + "grad_norm": 0.1616084590878673, + "language_loss": 1.0609467, + "learning_rate": 0.0008888642296509615, + "loss": 1.07306814, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 0.16040039, + "step": 89, + "time_per_iteration": 2.625988721847534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199649, + "balance_loss_mlp": 1.18316197, + "epoch": 0.01731435167372066, + "flos": 625596618240.0, + "grad_norm": 0.07585409016808545, + "language_loss": 1.1065979, + "learning_rate": 0.0008910768275115906, + "loss": 1.11859453, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 0.16491699, + "step": 90, + "time_per_iteration": 2.793017864227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173524, + "balance_loss_mlp": 1.15697813, + "epoch": 0.017506733358984224, + "flos": 496402338816.0, + "grad_norm": 0.07277460951060387, + "language_loss": 1.06493175, + "learning_rate": 0.0008932649762767675, + "loss": 1.07666695, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 0.16552734, + "step": 91, + "time_per_iteration": 2.5919723510742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169355, + "balance_loss_mlp": 1.15323818, + "epoch": 0.017699115044247787, + "flos": 745933100544.0, + "grad_norm": 0.10172519854243242, + "language_loss": 1.09112859, + "learning_rate": 0.0008954292103690864, + "loss": 1.10282218, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 0.16113281, + "step": 92, + "time_per_iteration": 2.9366836547851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174542, + "balance_loss_mlp": 1.15828145, + "epoch": 0.01789149672951135, + "flos": 515509194240.0, + "grad_norm": 0.07803491111319032, + "language_loss": 1.10981905, + "learning_rate": 0.0008975700468778296, + "loss": 1.12156439, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 0.16259766, + "step": 93, + "time_per_iteration": 2.592458963394165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156862, + "balance_loss_mlp": 1.14067388, + "epoch": 0.018083878414774913, + "flos": 586125268992.0, + "grad_norm": 0.09102852745954727, + "language_loss": 1.04703569, + "learning_rate": 0.0008996879863005366, + "loss": 1.05860424, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 0.16186523, + "step": 94, + "time_per_iteration": 2.71566104888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148536, + "balance_loss_mlp": 1.13235974, + "epoch": 0.018276260100038477, + "flos": 497356028928.0, + "grad_norm": 0.03859462796979438, + "language_loss": 1.04768109, + "learning_rate": 0.0009017835132453337, + "loss": 1.05916631, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 0.16174316, + "step": 95, + "time_per_iteration": 2.664511203765869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137887, + "balance_loss_mlp": 1.121889, + "epoch": 0.01846864178530204, + "flos": 640058360832.0, + "grad_norm": 0.060963703759419355, + "language_loss": 1.04675508, + "learning_rate": 0.0009038570970964896, + "loss": 1.05813384, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 0.15991211, + "step": 96, + "time_per_iteration": 2.7669789791107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112506, + "balance_loss_mlp": 1.10899043, + "epoch": 0.018661023470565603, + "flos": 511662127104.0, + "grad_norm": 0.0943042692373462, + "language_loss": 1.02071011, + "learning_rate": 0.0009059091926454854, + "loss": 1.03196073, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 0.16064453, + "step": 97, + "time_per_iteration": 2.6028668880462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126665, + "balance_loss_mlp": 1.11052442, + "epoch": 0.018853405155829166, + "flos": 931106244096.0, + "grad_norm": 0.06745462513624549, + "language_loss": 1.0144124, + "learning_rate": 0.0009079402406897198, + "loss": 1.02567911, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 0.16137695, + "step": 98, + "time_per_iteration": 3.2679431438446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127975, + "balance_loss_mlp": 1.11166739, + "epoch": 0.01904578684109273, + "flos": 576484107264.0, + "grad_norm": 0.10523687850003575, + "language_loss": 1.03251696, + "learning_rate": 0.0009099506686008212, + "loss": 1.04379678, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 0.16308594, + "step": 99, + "time_per_iteration": 2.8251914978027344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116643, + "balance_loss_mlp": 1.10100293, + "epoch": 0.019238168526356292, + "flos": 558442169856.0, + "grad_norm": 0.08495157768411668, + "language_loss": 1.0609076, + "learning_rate": 0.0009119408908644013, + "loss": 1.07207406, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 0.15625, + "step": 100, + "time_per_iteration": 2.6573309898376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113727, + "balance_loss_mlp": 1.12211871, + "epoch": 0.019430550211619855, + "flos": 723851375616.0, + "grad_norm": 0.09022378013673595, + "language_loss": 1.11755276, + "learning_rate": 0.0009139113095929519, + "loss": 1.12892556, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 0.15124512, + "step": 101, + "time_per_iteration": 2.844698429107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159661, + "balance_loss_mlp": 1.14373517, + "epoch": 0.019622931896883418, + "flos": 499478524416.0, + "grad_norm": 0.0892612752622512, + "language_loss": 1.05698013, + "learning_rate": 0.0009158623150134762, + "loss": 1.06857681, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 0.15917969, + "step": 102, + "time_per_iteration": 2.589857339859009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137313, + "balance_loss_mlp": 1.12158906, + "epoch": 0.01981531358214698, + "flos": 509188695552.0, + "grad_norm": 0.06508497546963277, + "language_loss": 1.05496848, + "learning_rate": 0.000917794285931332, + "loss": 1.06634164, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 0.15710449, + "step": 103, + "time_per_iteration": 2.6433918476104736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117696, + "balance_loss_mlp": 1.1019367, + "epoch": 0.020007695267410544, + "flos": 521347705344.0, + "grad_norm": 0.07675487095909958, + "language_loss": 0.97610366, + "learning_rate": 0.0009197075901716639, + "loss": 0.98728061, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 0.1574707, + "step": 104, + "time_per_iteration": 2.709157943725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137693, + "balance_loss_mlp": 1.12159956, + "epoch": 0.020200076952674107, + "flos": 533298940416.0, + "grad_norm": 0.05257934075389246, + "language_loss": 1.0758431, + "learning_rate": 0.0009216025849997171, + "loss": 1.08722019, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 0.16088867, + "step": 105, + "time_per_iteration": 2.7638583183288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111903, + "balance_loss_mlp": 1.09596467, + "epoch": 0.020392458637937667, + "flos": 684760324608.0, + "grad_norm": 0.07457888312135433, + "language_loss": 1.02261579, + "learning_rate": 0.0009234796175212258, + "loss": 1.03373492, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 0.15930176, + "step": 106, + "time_per_iteration": 2.9391980171203613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117989, + "balance_loss_mlp": 1.10228872, + "epoch": 0.02058484032320123, + "flos": 702115444224.0, + "grad_norm": 0.06024423434996524, + "language_loss": 1.05948544, + "learning_rate": 0.000925339025064007, + "loss": 1.07066536, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 0.15686035, + "step": 107, + "time_per_iteration": 2.975294828414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118819, + "balance_loss_mlp": 1.10334611, + "epoch": 0.020777222008464793, + "flos": 639082275840.0, + "grad_norm": 0.07105297051955457, + "language_loss": 0.99294066, + "learning_rate": 0.0009271811355418027, + "loss": 1.00412893, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 0.15454102, + "step": 108, + "time_per_iteration": 2.8750014305114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125269, + "balance_loss_mlp": 1.10940242, + "epoch": 0.020969603693728356, + "flos": 682091974656.0, + "grad_norm": 0.09212378946406244, + "language_loss": 1.05636311, + "learning_rate": 0.0009290062678013548, + "loss": 1.06761575, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 0.15856934, + "step": 109, + "time_per_iteration": 2.8552017211914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119898, + "balance_loss_mlp": 1.10393572, + "epoch": 0.02116198537899192, + "flos": 533395487232.0, + "grad_norm": 0.059465971869905314, + "language_loss": 1.04477715, + "learning_rate": 0.0009308147319536321, + "loss": 1.05597615, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 0.1595459, + "step": 110, + "time_per_iteration": 2.6493232250213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129754, + "balance_loss_mlp": 1.11385095, + "epoch": 0.021354367064255482, + "flos": 717479119872.0, + "grad_norm": 0.08324280754141193, + "language_loss": 1.10257316, + "learning_rate": 0.0009326068296900676, + "loss": 1.11387074, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 0.15893555, + "step": 111, + "time_per_iteration": 2.8384125232696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112769, + "balance_loss_mlp": 1.11171615, + "epoch": 0.021546748749519045, + "flos": 519556322304.0, + "grad_norm": 0.06941460102767082, + "language_loss": 1.01355243, + "learning_rate": 0.0009343828545846161, + "loss": 1.02482939, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 0.15966797, + "step": 112, + "time_per_iteration": 2.7743477821350098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114893, + "balance_loss_mlp": 1.13326573, + "epoch": 0.021739130434782608, + "flos": 505161391104.0, + "grad_norm": 0.047977415311889204, + "language_loss": 1.05199587, + "learning_rate": 0.0009361430923823841, + "loss": 1.06348515, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 0.15649414, + "step": 113, + "time_per_iteration": 2.6022982597351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118143, + "balance_loss_mlp": 1.10308659, + "epoch": 0.02193151212004617, + "flos": 463486053888.0, + "grad_norm": 0.080001842017843, + "language_loss": 1.09258401, + "learning_rate": 0.0009378878212755459, + "loss": 1.10376549, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 0.15039062, + "step": 114, + "time_per_iteration": 2.491594076156616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115419, + "balance_loss_mlp": 1.09967113, + "epoch": 0.022123893805309734, + "flos": 552272546304.0, + "grad_norm": 0.05036418666557463, + "language_loss": 0.9906168, + "learning_rate": 0.0009396173121672103, + "loss": 1.00177097, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 0.15734863, + "step": 115, + "time_per_iteration": 2.668848991394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112479, + "balance_loss_mlp": 1.10945916, + "epoch": 0.022316275490573297, + "flos": 636211293696.0, + "grad_norm": 0.05918191636932359, + "language_loss": 1.04414749, + "learning_rate": 0.0009413318289238633, + "loss": 1.05539548, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 0.15307617, + "step": 116, + "time_per_iteration": 2.7496132850646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106823, + "balance_loss_mlp": 1.09139705, + "epoch": 0.02250865717583686, + "flos": 798890107392.0, + "grad_norm": 0.1124204963758038, + "language_loss": 0.96924931, + "learning_rate": 0.0009430316286169771, + "loss": 0.98031747, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 0.15405273, + "step": 117, + "time_per_iteration": 3.026118278503418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135302, + "balance_loss_mlp": 1.11998308, + "epoch": 0.022701038861100423, + "flos": 456093296640.0, + "grad_norm": 0.03693994945601898, + "language_loss": 1.02417183, + "learning_rate": 0.0009447169617543361, + "loss": 1.03552485, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 0.15307617, + "step": 118, + "time_per_iteration": 2.575666666030884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156925, + "balance_loss_mlp": 1.14185703, + "epoch": 0.022893420546363986, + "flos": 583086159360.0, + "grad_norm": 0.10959367855453626, + "language_loss": 1.09001684, + "learning_rate": 0.0009463880725016029, + "loss": 1.1015861, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 0.15039062, + "step": 119, + "time_per_iteration": 2.6811347007751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115422, + "balance_loss_mlp": 1.10052109, + "epoch": 0.02308580223162755, + "flos": 561303240192.0, + "grad_norm": 0.05068852434870314, + "language_loss": 1.03909945, + "learning_rate": 0.0009480451988946134, + "loss": 1.05025363, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 0.14880371, + "step": 120, + "time_per_iteration": 2.801814079284668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106982, + "balance_loss_mlp": 1.09179425, + "epoch": 0.023278183916891113, + "flos": 771300983808.0, + "grad_norm": 0.05688398470992871, + "language_loss": 1.05377555, + "learning_rate": 0.0009496885730428627, + "loss": 1.06484532, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 0.1517334, + "step": 121, + "time_per_iteration": 3.04720139503479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121469, + "balance_loss_mlp": 1.10574555, + "epoch": 0.023470565602154676, + "flos": 553374540288.0, + "grad_norm": 0.08369646841136469, + "language_loss": 1.03908122, + "learning_rate": 0.0009513184213246156, + "loss": 1.05029583, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 0.15710449, + "step": 122, + "time_per_iteration": 2.61790132522583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129626, + "balance_loss_mlp": 1.11406958, + "epoch": 0.02366294728741824, + "flos": 560028349440.0, + "grad_norm": 0.05522871343558165, + "language_loss": 1.07008672, + "learning_rate": 0.0009529349645740552, + "loss": 1.08138299, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 0.15539551, + "step": 123, + "time_per_iteration": 2.69759464263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129797, + "balance_loss_mlp": 1.11481285, + "epoch": 0.0238553289726818, + "flos": 468553683456.0, + "grad_norm": 0.053769267634074955, + "language_loss": 1.05687594, + "learning_rate": 0.0009545384182608524, + "loss": 1.06817389, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 0.1496582, + "step": 124, + "time_per_iteration": 2.550584316253662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126, + "balance_loss_mlp": 1.11114669, + "epoch": 0.024047710657945365, + "flos": 560030920704.0, + "grad_norm": 0.08700167249890467, + "language_loss": 1.02945745, + "learning_rate": 0.0009561289926625252, + "loss": 1.04071736, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 0.14831543, + "step": 125, + "time_per_iteration": 2.6619794368743896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123522, + "balance_loss_mlp": 1.10831082, + "epoch": 0.024240092343208928, + "flos": 504775950336.0, + "grad_norm": 0.07114777459455598, + "language_loss": 1.07932711, + "learning_rate": 0.0009577068930299292, + "loss": 1.09056234, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 0.15209961, + "step": 126, + "time_per_iteration": 2.553642749786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125226, + "balance_loss_mlp": 1.11038458, + "epoch": 0.02443247402847249, + "flos": 435763307520.0, + "grad_norm": 0.08279894264625885, + "language_loss": 1.03556633, + "learning_rate": 0.0009592723197462087, + "loss": 1.04681861, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 0.14819336, + "step": 127, + "time_per_iteration": 2.7255966663360596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124545, + "balance_loss_mlp": 1.10936916, + "epoch": 0.024624855713736054, + "flos": 683769558528.0, + "grad_norm": 0.07600858050716931, + "language_loss": 0.99905002, + "learning_rate": 0.0009608254684795125, + "loss": 1.01029539, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 0.15148926, + "step": 128, + "time_per_iteration": 2.9839587211608887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113247, + "balance_loss_mlp": 1.11718702, + "epoch": 0.024817237398999614, + "flos": 524999480832.0, + "grad_norm": 0.08573045125619827, + "language_loss": 1.02976727, + "learning_rate": 0.0009623665303297678, + "loss": 1.04109192, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 0.15258789, + "step": 129, + "time_per_iteration": 2.7344865798950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119398, + "balance_loss_mlp": 1.10497391, + "epoch": 0.025009619084263177, + "flos": 655656602112.0, + "grad_norm": 0.07510500588649292, + "language_loss": 1.07057762, + "learning_rate": 0.0009638956919697878, + "loss": 1.08177161, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 0.14416504, + "step": 130, + "time_per_iteration": 2.864952802658081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104135, + "balance_loss_mlp": 1.08930528, + "epoch": 0.02520200076952674, + "flos": 454423053312.0, + "grad_norm": 0.0567118244953117, + "language_loss": 0.99135083, + "learning_rate": 0.0009654131357809714, + "loss": 1.00239229, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 0.14819336, + "step": 131, + "time_per_iteration": 2.6095099449157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123713, + "balance_loss_mlp": 1.1081202, + "epoch": 0.025394382454790303, + "flos": 839794563072.0, + "grad_norm": 0.05892082702998288, + "language_loss": 1.08188879, + "learning_rate": 0.0009669190399838441, + "loss": 1.09312594, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 0.15576172, + "step": 132, + "time_per_iteration": 3.096733331680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100803, + "balance_loss_mlp": 1.08531809, + "epoch": 0.025586764140053866, + "flos": 581025332736.0, + "grad_norm": 0.09564892115109941, + "language_loss": 1.01233923, + "learning_rate": 0.0009684135787636724, + "loss": 1.02334726, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 0.15478516, + "step": 133, + "time_per_iteration": 2.8120856285095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111325, + "balance_loss_mlp": 1.09529161, + "epoch": 0.02577914582531743, + "flos": 790249623552.0, + "grad_norm": 0.04870542745948935, + "language_loss": 1.05797207, + "learning_rate": 0.0009698969223913726, + "loss": 1.06908536, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 0.16027832, + "step": 134, + "time_per_iteration": 3.0269176959991455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123378, + "balance_loss_mlp": 1.10735679, + "epoch": 0.025971527510580992, + "flos": 594958473216.0, + "grad_norm": 0.04083122637660085, + "language_loss": 1.08225274, + "learning_rate": 0.0009713692373399265, + "loss": 1.09348655, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 0.16015625, + "step": 135, + "time_per_iteration": 2.690932273864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01798361, + "balance_loss_mlp": 1.75773478, + "epoch": 0.026163909195844555, + "flos": 1577629716480.0, + "grad_norm": 0.2058674005568875, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.8125459, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 0.40625, + "step": 136, + "time_per_iteration": 5.460411548614502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01507549, + "balance_loss_mlp": 1.47512448, + "epoch": 0.026356290881108118, + "flos": 1502074865664.0, + "grad_norm": 0.12866590611947104, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.79318589, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 0.32421875, + "step": 137, + "time_per_iteration": 4.989046335220337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146765, + "balance_loss_mlp": 1.13081443, + "epoch": 0.02654867256637168, + "flos": 597140066304.0, + "grad_norm": 0.04917093034878699, + "language_loss": 1.00934815, + "learning_rate": 0.0009757216201974225, + "loss": 1.02081585, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 0.1595459, + "step": 138, + "time_per_iteration": 2.9566736221313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162305, + "balance_loss_mlp": 1.1448524, + "epoch": 0.026741054251635244, + "flos": 545035433472.0, + "grad_norm": 0.06281235859244827, + "language_loss": 1.0596863, + "learning_rate": 0.0009771514130396581, + "loss": 1.07130933, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 0.17468262, + "step": 139, + "time_per_iteration": 2.683931350708008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150087, + "balance_loss_mlp": 1.1330874, + "epoch": 0.026933435936898807, + "flos": 506841546240.0, + "grad_norm": 0.09254080332591261, + "language_loss": 1.06202602, + "learning_rate": 0.00097857095638274, + "loss": 1.07352686, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 0.17016602, + "step": 140, + "time_per_iteration": 2.558708906173706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149417, + "balance_loss_mlp": 1.13241768, + "epoch": 0.02712581762216237, + "flos": 740860328448.0, + "grad_norm": 0.03864103733020509, + "language_loss": 0.97399604, + "learning_rate": 0.0009799803961288726, + "loss": 0.9854902, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 0.17016602, + "step": 141, + "time_per_iteration": 2.992034673690796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112347, + "balance_loss_mlp": 1.10685217, + "epoch": 0.027318199307425933, + "flos": 848373378048.0, + "grad_norm": 0.06378420241673269, + "language_loss": 1.03629804, + "learning_rate": 0.000981379875086876, + "loss": 1.0475328, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 0.16625977, + "step": 142, + "time_per_iteration": 3.063534736633301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121821, + "balance_loss_mlp": 1.10560894, + "epoch": 0.027510580992689496, + "flos": 575557581312.0, + "grad_norm": 0.046520134554953796, + "language_loss": 0.98784387, + "learning_rate": 0.0009827695330590185, + "loss": 0.99906206, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 0.1619873, + "step": 143, + "time_per_iteration": 2.6495330333709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124398, + "balance_loss_mlp": 1.1078757, + "epoch": 0.02770296267795306, + "flos": 772420230144.0, + "grad_norm": 0.05485832849515215, + "language_loss": 0.98036379, + "learning_rate": 0.0009841495069248256, + "loss": 0.99160779, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 0.1652832, + "step": 144, + "time_per_iteration": 2.9577834606170654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145392, + "balance_loss_mlp": 1.12901306, + "epoch": 0.027895344363216622, + "flos": 569387957760.0, + "grad_norm": 0.09798795242100523, + "language_loss": 0.97478735, + "learning_rate": 0.0009855199307219871, + "loss": 0.98624128, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 0.16381836, + "step": 145, + "time_per_iteration": 2.6759142875671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148365, + "balance_loss_mlp": 1.13168764, + "epoch": 0.028087726048480186, + "flos": 547360561152.0, + "grad_norm": 0.1254453322996171, + "language_loss": 0.99733889, + "learning_rate": 0.0009868809357244854, + "loss": 1.00882256, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 0.16687012, + "step": 146, + "time_per_iteration": 2.66375994682312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113683, + "balance_loss_mlp": 1.11978364, + "epoch": 0.02828010773374375, + "flos": 524789508096.0, + "grad_norm": 0.08248071954181796, + "language_loss": 1.03600287, + "learning_rate": 0.0009882326505180556, + "loss": 1.04737115, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 0.1706543, + "step": 147, + "time_per_iteration": 2.719353437423706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151222, + "balance_loss_mlp": 1.13280392, + "epoch": 0.02847248941900731, + "flos": 772440053760.0, + "grad_norm": 0.12761243433758393, + "language_loss": 1.02101135, + "learning_rate": 0.0009895752010730906, + "loss": 1.03252351, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 0.1842041, + "step": 148, + "time_per_iteration": 2.9704201221466064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141454, + "balance_loss_mlp": 1.12377512, + "epoch": 0.028664871104270875, + "flos": 534413417472.0, + "grad_norm": 0.07962775403881484, + "language_loss": 1.0825479, + "learning_rate": 0.0009909087108150867, + "loss": 1.09396255, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 0.17687988, + "step": 149, + "time_per_iteration": 2.7516071796417236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151554, + "balance_loss_mlp": 1.13330352, + "epoch": 0.028857252789534438, + "flos": 367766396928.0, + "grad_norm": 0.10196194967952074, + "language_loss": 1.09083438, + "learning_rate": 0.0009922333006927371, + "loss": 1.10235, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 0.18249512, + "step": 150, + "time_per_iteration": 2.4685099124908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170515, + "balance_loss_mlp": 1.15218103, + "epoch": 0.029049634474798, + "flos": 515482030080.0, + "grad_norm": 0.13259475383105176, + "language_loss": 1.020684, + "learning_rate": 0.0009935490892437632, + "loss": 1.03238916, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 0.18322754, + "step": 151, + "time_per_iteration": 2.5665087699890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166904, + "balance_loss_mlp": 1.14880824, + "epoch": 0.029242016160061564, + "flos": 588141305856.0, + "grad_norm": 0.10481585745820837, + "language_loss": 1.00390673, + "learning_rate": 0.0009948561926585687, + "loss": 1.01557577, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 0.18103027, + "step": 152, + "time_per_iteration": 2.7641003131866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139325, + "balance_loss_mlp": 1.122576, + "epoch": 0.029434397845325123, + "flos": 552079825920.0, + "grad_norm": 0.09697971136145118, + "language_loss": 1.05073512, + "learning_rate": 0.0009961547248418122, + "loss": 1.06212831, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 0.16760254, + "step": 153, + "time_per_iteration": 2.631476402282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123418, + "balance_loss_mlp": 1.10662186, + "epoch": 0.029626779530588686, + "flos": 603497640960.0, + "grad_norm": 0.05437877185758658, + "language_loss": 1.01441622, + "learning_rate": 0.0009974447974719707, + "loss": 1.0256505, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 0.16809082, + "step": 154, + "time_per_iteration": 2.709644317626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129323, + "balance_loss_mlp": 1.11151338, + "epoch": 0.02981916121585225, + "flos": 621089897472.0, + "grad_norm": 0.09703401576709127, + "language_loss": 1.03478801, + "learning_rate": 0.0009987265200589763, + "loss": 1.0460813, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 0.17810059, + "step": 155, + "time_per_iteration": 2.77809739112854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140894, + "balance_loss_mlp": 1.12376344, + "epoch": 0.030011542901115813, + "flos": 661633505280.0, + "grad_norm": 0.08300490544518559, + "language_loss": 1.02959824, + "learning_rate": 0.001, + "loss": 1.04100728, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 0.17150879, + "step": 156, + "time_per_iteration": 2.845790386199951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144802, + "balance_loss_mlp": 1.12720668, + "epoch": 0.030203924586379376, + "flos": 651569826816.0, + "grad_norm": 0.07590676388764007, + "language_loss": 1.00599122, + "learning_rate": 0.0009999999029413921, + "loss": 1.01743913, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 0.17614746, + "step": 157, + "time_per_iteration": 2.833735227584839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142594, + "balance_loss_mlp": 1.12554669, + "epoch": 0.03039630627164294, + "flos": 531354484224.0, + "grad_norm": 0.06607639809804342, + "language_loss": 1.01453137, + "learning_rate": 0.0009999996117656068, + "loss": 1.02595735, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 0.1706543, + "step": 158, + "time_per_iteration": 2.803636074066162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011301, + "balance_loss_mlp": 1.11345792, + "epoch": 0.030588687956906502, + "flos": 586189509120.0, + "grad_norm": 0.08769352458743468, + "language_loss": 0.94982773, + "learning_rate": 0.0009999991264727564, + "loss": 0.96112871, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 0.16638184, + "step": 159, + "time_per_iteration": 2.7776851654052734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135859, + "balance_loss_mlp": 1.11870432, + "epoch": 0.030781069642170065, + "flos": 513278042112.0, + "grad_norm": 0.05788098803643346, + "language_loss": 1.06247735, + "learning_rate": 0.0009999984470630296, + "loss": 1.07383585, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 0.17163086, + "step": 160, + "time_per_iteration": 2.6311371326446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125321, + "balance_loss_mlp": 1.10836911, + "epoch": 0.030973451327433628, + "flos": 718123719168.0, + "grad_norm": 0.05159431076001957, + "language_loss": 0.94850963, + "learning_rate": 0.0009999975735366902, + "loss": 0.95976287, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 0.16955566, + "step": 161, + "time_per_iteration": 3.0904829502105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148114, + "balance_loss_mlp": 1.13099504, + "epoch": 0.03116583301269719, + "flos": 1109771311104.0, + "grad_norm": 0.0692270455282635, + "language_loss": 0.96706492, + "learning_rate": 0.0009999965058940775, + "loss": 0.97854608, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 0.17138672, + "step": 162, + "time_per_iteration": 3.490063428878784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150632, + "balance_loss_mlp": 1.13323975, + "epoch": 0.031358214697960754, + "flos": 450907098624.0, + "grad_norm": 0.08572766411177644, + "language_loss": 1.03267431, + "learning_rate": 0.0009999952441356057, + "loss": 1.04418063, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 0.17382812, + "step": 163, + "time_per_iteration": 2.497690439224243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130614, + "balance_loss_mlp": 1.11405563, + "epoch": 0.031550596383224314, + "flos": 1255176870912.0, + "grad_norm": 0.05784293330097489, + "language_loss": 1.03805065, + "learning_rate": 0.000999993788261765, + "loss": 1.0493567, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 0.16564941, + "step": 164, + "time_per_iteration": 3.6041390895843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132475, + "balance_loss_mlp": 1.1152972, + "epoch": 0.03174297806848788, + "flos": 668136812544.0, + "grad_norm": 0.05766532368121917, + "language_loss": 1.05311596, + "learning_rate": 0.00099999213827312, + "loss": 1.06444073, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 0.171875, + "step": 165, + "time_per_iteration": 2.806014060974121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142431, + "balance_loss_mlp": 1.12589669, + "epoch": 0.03193535975375144, + "flos": 551299032576.0, + "grad_norm": 0.05992608893057494, + "language_loss": 1.00112009, + "learning_rate": 0.000999990294170312, + "loss": 1.01254439, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 0.16540527, + "step": 166, + "time_per_iteration": 2.6405951976776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113035, + "balance_loss_mlp": 1.11351717, + "epoch": 0.032127741439015006, + "flos": 543649314816.0, + "grad_norm": 0.05363857392651908, + "language_loss": 1.03767109, + "learning_rate": 0.0009999882559540566, + "loss": 1.04897451, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 0.16845703, + "step": 167, + "time_per_iteration": 2.69801664352417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127606, + "balance_loss_mlp": 1.11079764, + "epoch": 0.032320123124278566, + "flos": 548385831936.0, + "grad_norm": 0.03971308084427602, + "language_loss": 1.00767386, + "learning_rate": 0.000999986023625145, + "loss": 1.01894999, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 0.16821289, + "step": 168, + "time_per_iteration": 2.710706949234009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04227602, + "balance_loss_mlp": 3.93005633, + "epoch": 0.03251250480954213, + "flos": 1305886551552.0, + "grad_norm": 0.49669676383753814, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.8315202, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 2.96875, + "step": 169, + "time_per_iteration": 4.921034574508667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178384, + "balance_loss_mlp": 1.15987098, + "epoch": 0.03270488649480569, + "flos": 561132914688.0, + "grad_norm": 0.11256254520903143, + "language_loss": 1.01289928, + "learning_rate": 0.0009999809766328958, + "loss": 1.02468312, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 0.18518066, + "step": 170, + "time_per_iteration": 2.6784250736236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236994, + "balance_loss_mlp": 1.21676469, + "epoch": 0.03289726818006926, + "flos": 482363112960.0, + "grad_norm": 0.13219145589868983, + "language_loss": 1.0357101, + "learning_rate": 0.0009999781619715177, + "loss": 1.04807997, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 0.20227051, + "step": 171, + "time_per_iteration": 2.5412755012512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234758, + "balance_loss_mlp": 1.21518433, + "epoch": 0.03308964986533282, + "flos": 674647460352.0, + "grad_norm": 0.05193788120122226, + "language_loss": 1.03408492, + "learning_rate": 0.000999975153201402, + "loss": 1.0464325, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 0.19567871, + "step": 172, + "time_per_iteration": 2.864586353302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236351, + "balance_loss_mlp": 1.21688426, + "epoch": 0.033282031550596385, + "flos": 609217583616.0, + "grad_norm": 0.0814546252210238, + "language_loss": 1.01345742, + "learning_rate": 0.0009999719503237174, + "loss": 1.02582097, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 0.19470215, + "step": 173, + "time_per_iteration": 2.765923261642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266228, + "balance_loss_mlp": 1.24583161, + "epoch": 0.033474413235859944, + "flos": 468039762432.0, + "grad_norm": 0.11494520888694326, + "language_loss": 1.10141742, + "learning_rate": 0.0009999685533397073, + "loss": 1.11407971, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 0.20410156, + "step": 174, + "time_per_iteration": 2.5439114570617676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264706, + "balance_loss_mlp": 1.24525094, + "epoch": 0.03366679492112351, + "flos": 579634444800.0, + "grad_norm": 0.12313705571337571, + "language_loss": 1.01947784, + "learning_rate": 0.00099996496225069, + "loss": 1.03212488, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 0.19445801, + "step": 175, + "time_per_iteration": 2.6815552711486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01257561, + "balance_loss_mlp": 1.23677111, + "epoch": 0.03385917660638707, + "flos": 637678904832.0, + "grad_norm": 0.07888015485072913, + "language_loss": 1.04929149, + "learning_rate": 0.0009999611770580604, + "loss": 1.06186724, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 0.20788574, + "step": 176, + "time_per_iteration": 2.841484785079956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258013, + "balance_loss_mlp": 1.23668683, + "epoch": 0.03405155829165064, + "flos": 441816933888.0, + "grad_norm": 0.1202186920466195, + "language_loss": 1.03394961, + "learning_rate": 0.0009999571977632876, + "loss": 1.04652977, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 0.21350098, + "step": 177, + "time_per_iteration": 2.567788600921631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01271496, + "balance_loss_mlp": 1.25026441, + "epoch": 0.034243939976914196, + "flos": 466332443136.0, + "grad_norm": 0.09201820914192435, + "language_loss": 1.05765235, + "learning_rate": 0.0009999530243679166, + "loss": 1.07036722, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 0.21240234, + "step": 178, + "time_per_iteration": 2.5753743648529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258548, + "balance_loss_mlp": 1.23935485, + "epoch": 0.03443632166217776, + "flos": 779276671488.0, + "grad_norm": 0.06529189645852858, + "language_loss": 1.00495052, + "learning_rate": 0.0009999486568735675, + "loss": 1.01753592, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 0.19177246, + "step": 179, + "time_per_iteration": 3.0607473850250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251876, + "balance_loss_mlp": 1.23275518, + "epoch": 0.03462870334744132, + "flos": 1263777707520.0, + "grad_norm": 0.07628849485304477, + "language_loss": 1.00889277, + "learning_rate": 0.0009999440952819362, + "loss": 1.02141166, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 0.19116211, + "step": 180, + "time_per_iteration": 3.6515376567840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01248658, + "balance_loss_mlp": 1.22853494, + "epoch": 0.03482108503270489, + "flos": 607179151872.0, + "grad_norm": 0.05983966318213213, + "language_loss": 1.0115366, + "learning_rate": 0.0009999393395947935, + "loss": 1.02402306, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 0.2010498, + "step": 181, + "time_per_iteration": 2.799633502960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01253433, + "balance_loss_mlp": 1.23378766, + "epoch": 0.03501346671796845, + "flos": 538270396416.0, + "grad_norm": 0.0770350968764605, + "language_loss": 1.04747987, + "learning_rate": 0.0009999343898139858, + "loss": 1.06001413, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 0.19641113, + "step": 182, + "time_per_iteration": 2.627434253692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258891, + "balance_loss_mlp": 1.23675334, + "epoch": 0.035205848403232015, + "flos": 518484063744.0, + "grad_norm": 0.06485795323962908, + "language_loss": 1.03381288, + "learning_rate": 0.0009999292459414348, + "loss": 1.04640174, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 0.22131348, + "step": 183, + "time_per_iteration": 2.5552356243133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227697, + "balance_loss_mlp": 1.20765769, + "epoch": 0.035398230088495575, + "flos": 472373586432.0, + "grad_norm": 0.06837915158031915, + "language_loss": 1.07873201, + "learning_rate": 0.0009999239079791374, + "loss": 1.0910089, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 0.20031738, + "step": 184, + "time_per_iteration": 2.5553643703460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225953, + "balance_loss_mlp": 1.20453107, + "epoch": 0.03559061177375914, + "flos": 512074732032.0, + "grad_norm": 0.05538225102727573, + "language_loss": 1.00595856, + "learning_rate": 0.0009999183759291659, + "loss": 1.01821804, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 0.21435547, + "step": 185, + "time_per_iteration": 2.6955769062042236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199054, + "balance_loss_mlp": 1.17938447, + "epoch": 0.0357829934590227, + "flos": 477386887680.0, + "grad_norm": 0.052094207769016576, + "language_loss": 1.02581143, + "learning_rate": 0.0009999126497936682, + "loss": 1.03780198, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 0.1965332, + "step": 186, + "time_per_iteration": 2.5304598808288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198293, + "balance_loss_mlp": 1.1770494, + "epoch": 0.03597537514428627, + "flos": 644656485888.0, + "grad_norm": 0.057723222775786294, + "language_loss": 1.05774581, + "learning_rate": 0.0009999067295748676, + "loss": 1.06972873, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 0.21252441, + "step": 187, + "time_per_iteration": 2.797293186187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225876, + "balance_loss_mlp": 1.20496714, + "epoch": 0.03616775682954983, + "flos": 581186119680.0, + "grad_norm": 0.0756096280824464, + "language_loss": 1.03738201, + "learning_rate": 0.000999900615275062, + "loss": 1.04964077, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 0.20922852, + "step": 188, + "time_per_iteration": 2.677471399307251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211466, + "balance_loss_mlp": 1.18979406, + "epoch": 0.03636013851481339, + "flos": 382420859904.0, + "grad_norm": 0.0898221855427691, + "language_loss": 1.09605587, + "learning_rate": 0.0009998943068966256, + "loss": 1.10817051, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 0.21679688, + "step": 189, + "time_per_iteration": 2.4233202934265137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217638, + "balance_loss_mlp": 1.19651425, + "epoch": 0.03655252020007695, + "flos": 583224551424.0, + "grad_norm": 0.10338446511893212, + "language_loss": 1.03747463, + "learning_rate": 0.0009998878044420072, + "loss": 1.04965115, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 0.21130371, + "step": 190, + "time_per_iteration": 2.6978025436401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177731, + "balance_loss_mlp": 1.15573716, + "epoch": 0.03674490188534051, + "flos": 471619957248.0, + "grad_norm": 0.06881722524262912, + "language_loss": 0.99768066, + "learning_rate": 0.0009998811079137318, + "loss": 1.00945807, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 0.22009277, + "step": 191, + "time_per_iteration": 2.5934321880340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114348, + "balance_loss_mlp": 1.12218916, + "epoch": 0.03693728357060408, + "flos": 528372274176.0, + "grad_norm": 0.0852793637050772, + "language_loss": 1.0086391, + "learning_rate": 0.0009998742173143987, + "loss": 1.02007401, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 0.2130127, + "step": 192, + "time_per_iteration": 2.6706249713897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139307, + "balance_loss_mlp": 1.1180048, + "epoch": 0.03712966525586764, + "flos": 798993994752.0, + "grad_norm": 0.07456835679934387, + "language_loss": 1.01398337, + "learning_rate": 0.0009998671326466833, + "loss": 1.02537644, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 0.21313477, + "step": 193, + "time_per_iteration": 2.992595672607422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126772, + "balance_loss_mlp": 1.10519516, + "epoch": 0.037322046941131205, + "flos": 829973164032.0, + "grad_norm": 0.08171257283174432, + "language_loss": 1.02813613, + "learning_rate": 0.0009998598539133362, + "loss": 1.03940392, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 0.21594238, + "step": 194, + "time_per_iteration": 3.0081543922424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113199, + "balance_loss_mlp": 1.11179638, + "epoch": 0.037514428626394765, + "flos": 437685742080.0, + "grad_norm": 0.05573112518601677, + "language_loss": 1.02892375, + "learning_rate": 0.0009998523811171828, + "loss": 1.04024363, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 0.2019043, + "step": 195, + "time_per_iteration": 2.507708787918091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149122, + "balance_loss_mlp": 1.12843966, + "epoch": 0.03770681031165833, + "flos": 511625051136.0, + "grad_norm": 0.0935188115694547, + "language_loss": 1.0387187, + "learning_rate": 0.0009998447142611248, + "loss": 1.05020976, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 0.20690918, + "step": 196, + "time_per_iteration": 2.6388566493988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160139, + "balance_loss_mlp": 1.13986123, + "epoch": 0.03789919199692189, + "flos": 807449098752.0, + "grad_norm": 0.047444937864230444, + "language_loss": 0.96302813, + "learning_rate": 0.0009998368533481387, + "loss": 0.97462952, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 0.20275879, + "step": 197, + "time_per_iteration": 3.033572196960449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132809, + "balance_loss_mlp": 1.11254394, + "epoch": 0.03809157368218546, + "flos": 690576814080.0, + "grad_norm": 0.08710369828361038, + "language_loss": 0.9995833, + "learning_rate": 0.0009998287983812762, + "loss": 1.01091146, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 0.20263672, + "step": 198, + "time_per_iteration": 2.8421950340270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155397, + "balance_loss_mlp": 1.13373709, + "epoch": 0.03828395536744902, + "flos": 517940407296.0, + "grad_norm": 0.10277508525357126, + "language_loss": 1.05776644, + "learning_rate": 0.0009998205493636646, + "loss": 1.06932044, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 0.2166748, + "step": 199, + "time_per_iteration": 2.6924569606781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141939, + "balance_loss_mlp": 1.12035084, + "epoch": 0.038476337052712584, + "flos": 581662964736.0, + "grad_norm": 0.09429923895154278, + "language_loss": 0.98451054, + "learning_rate": 0.0009998121062985063, + "loss": 0.99592984, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 0.21594238, + "step": 200, + "time_per_iteration": 2.6926732063293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171328, + "balance_loss_mlp": 1.15014482, + "epoch": 0.03866871873797614, + "flos": 577086861312.0, + "grad_norm": 0.08332681767957313, + "language_loss": 1.00419915, + "learning_rate": 0.0009998034691890794, + "loss": 1.01591253, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 0.21203613, + "step": 201, + "time_per_iteration": 2.7643332481384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165409, + "balance_loss_mlp": 1.14516699, + "epoch": 0.03886110042323971, + "flos": 540731344896.0, + "grad_norm": 0.11326578301102472, + "language_loss": 1.05536067, + "learning_rate": 0.0009997946380387369, + "loss": 1.06701469, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 0.20251465, + "step": 202, + "time_per_iteration": 2.630284070968628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157571, + "balance_loss_mlp": 1.13723421, + "epoch": 0.03905348210850327, + "flos": 718002952704.0, + "grad_norm": 0.09790094078320352, + "language_loss": 1.07388449, + "learning_rate": 0.0009997856128509076, + "loss": 1.08546019, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 0.20336914, + "step": 203, + "time_per_iteration": 2.8435540199279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144349, + "balance_loss_mlp": 1.12458408, + "epoch": 0.039245863793766836, + "flos": 427493583360.0, + "grad_norm": 0.1356659453961297, + "language_loss": 1.02559984, + "learning_rate": 0.0009997763936290952, + "loss": 1.03704333, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 0.19750977, + "step": 204, + "time_per_iteration": 2.503309965133667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138207, + "balance_loss_mlp": 1.11642766, + "epoch": 0.039438245479030395, + "flos": 663096347136.0, + "grad_norm": 0.053010676996176516, + "language_loss": 1.07603145, + "learning_rate": 0.0009997669803768789, + "loss": 1.08741355, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 0.21789551, + "step": 205, + "time_per_iteration": 2.7773749828338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011146, + "balance_loss_mlp": 1.09366679, + "epoch": 0.03963062716429396, + "flos": 635349007872.0, + "grad_norm": 0.07785432610828748, + "language_loss": 1.0289582, + "learning_rate": 0.0009997573730979134, + "loss": 1.04010415, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 0.20947266, + "step": 206, + "time_per_iteration": 2.7241222858428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04720912, + "balance_loss_mlp": 3.71993518, + "epoch": 0.03982300884955752, + "flos": 1418565975552.0, + "grad_norm": 0.31672297251450016, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.83914113, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 10.0, + "step": 207, + "time_per_iteration": 4.65311074256897 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160001, + "balance_loss_mlp": 1.13651657, + "epoch": 0.04001539053482109, + "flos": 689118741504.0, + "grad_norm": 0.09244016287770654, + "language_loss": 1.01599813, + "learning_rate": 0.0009997375764747294, + "loss": 1.02759814, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 0.23449707, + "step": 208, + "time_per_iteration": 2.999249219894409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144547, + "balance_loss_mlp": 1.12159967, + "epoch": 0.04020777222008465, + "flos": 533639964672.0, + "grad_norm": 0.10768555369795524, + "language_loss": 0.98886019, + "learning_rate": 0.0009997273871381967, + "loss": 1.00030565, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 0.22949219, + "step": 209, + "time_per_iteration": 2.740895986557007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154635, + "balance_loss_mlp": 1.13075733, + "epoch": 0.040400153905348214, + "flos": 567927687168.0, + "grad_norm": 0.0670178022721504, + "language_loss": 1.03911638, + "learning_rate": 0.0009997170037902862, + "loss": 1.05066276, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 0.23876953, + "step": 210, + "time_per_iteration": 2.7199809551239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161677, + "balance_loss_mlp": 1.13826418, + "epoch": 0.040592535590611774, + "flos": 713439332352.0, + "grad_norm": 0.062356382061819024, + "language_loss": 1.06535935, + "learning_rate": 0.0009997064264350292, + "loss": 1.07697606, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 0.23413086, + "step": 211, + "time_per_iteration": 2.85477614402771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164794, + "balance_loss_mlp": 1.14111865, + "epoch": 0.04078491727587533, + "flos": 578100022272.0, + "grad_norm": 0.11782714892356931, + "language_loss": 1.00570273, + "learning_rate": 0.0009996956550765317, + "loss": 1.01735067, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 0.23657227, + "step": 212, + "time_per_iteration": 2.683258295059204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178964, + "balance_loss_mlp": 1.15452623, + "epoch": 0.0409772989611389, + "flos": 552299710464.0, + "grad_norm": 0.07352585681220185, + "language_loss": 0.95357072, + "learning_rate": 0.0009996846897189762, + "loss": 0.9653604, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 0.24438477, + "step": 213, + "time_per_iteration": 2.64486026763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171157, + "balance_loss_mlp": 1.14665973, + "epoch": 0.04116968064640246, + "flos": 555630285312.0, + "grad_norm": 0.06101080420793073, + "language_loss": 1.01569629, + "learning_rate": 0.0009996735303666193, + "loss": 1.02740788, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 0.24499512, + "step": 214, + "time_per_iteration": 2.719754934310913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189275, + "balance_loss_mlp": 1.16434813, + "epoch": 0.041362062331666026, + "flos": 578492803584.0, + "grad_norm": 0.09805160088916984, + "language_loss": 1.03784573, + "learning_rate": 0.0009996621770237937, + "loss": 1.04973853, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 0.24938965, + "step": 215, + "time_per_iteration": 2.7283520698547363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202725, + "balance_loss_mlp": 1.17728579, + "epoch": 0.041554444016929586, + "flos": 611443593216.0, + "grad_norm": 0.05858333324383458, + "language_loss": 0.99328029, + "learning_rate": 0.0009996506296949073, + "loss": 1.00530756, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 0.25463867, + "step": 216, + "time_per_iteration": 2.8774044513702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175124, + "balance_loss_mlp": 1.14957714, + "epoch": 0.04174682570219315, + "flos": 528115313664.0, + "grad_norm": 0.09898600739692984, + "language_loss": 0.99386859, + "learning_rate": 0.0009996388883844428, + "loss": 1.00561976, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 0.25561523, + "step": 217, + "time_per_iteration": 2.5985324382781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155134, + "balance_loss_mlp": 1.13007665, + "epoch": 0.04193920738745671, + "flos": 511506482688.0, + "grad_norm": 0.06208913439552352, + "language_loss": 1.03500867, + "learning_rate": 0.0009996269530969588, + "loss": 1.04656017, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 0.25048828, + "step": 218, + "time_per_iteration": 2.591993808746338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152332, + "balance_loss_mlp": 1.12778735, + "epoch": 0.04213158907272028, + "flos": 571490629632.0, + "grad_norm": 0.08789931910276294, + "language_loss": 1.02762055, + "learning_rate": 0.0009996148238370888, + "loss": 1.0391438, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 0.24536133, + "step": 219, + "time_per_iteration": 2.7247660160064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146753, + "balance_loss_mlp": 1.12125421, + "epoch": 0.04232397075798384, + "flos": 964222589952.0, + "grad_norm": 0.059765696203788965, + "language_loss": 0.98427057, + "learning_rate": 0.0009996025006095421, + "loss": 0.99573809, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 0.25524902, + "step": 220, + "time_per_iteration": 3.314250946044922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04012538, + "balance_loss_mlp": 3.61886096, + "epoch": 0.042516352443247404, + "flos": 1469595778560.0, + "grad_norm": 0.18322335632445477, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.81795681, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 3.921875, + "step": 221, + "time_per_iteration": 5.397853851318359 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142792, + "balance_loss_mlp": 1.11779404, + "epoch": 0.042708734128510964, + "flos": 654712823808.0, + "grad_norm": 0.10045289138425088, + "language_loss": 0.98726314, + "learning_rate": 0.0009995772722706307, + "loss": 0.99869102, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 0.25, + "step": 222, + "time_per_iteration": 2.8346786499023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168149, + "balance_loss_mlp": 1.14130318, + "epoch": 0.04290111581377453, + "flos": 431827407360.0, + "grad_norm": 0.07395583213906755, + "language_loss": 1.12709904, + "learning_rate": 0.0009995643671690604, + "loss": 1.13878047, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 0.26879883, + "step": 223, + "time_per_iteration": 2.4760169982910156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157966, + "balance_loss_mlp": 1.1317513, + "epoch": 0.04309349749903809, + "flos": 644676309504.0, + "grad_norm": 0.08239055528326475, + "language_loss": 1.00208497, + "learning_rate": 0.0009995512681194023, + "loss": 1.01366448, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 0.26257324, + "step": 224, + "time_per_iteration": 2.833751916885376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151672, + "balance_loss_mlp": 1.12492132, + "epoch": 0.04328587918430166, + "flos": 831267505152.0, + "grad_norm": 0.058356102807926864, + "language_loss": 0.97854793, + "learning_rate": 0.0009995379751267417, + "loss": 0.99006462, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 0.2677002, + "step": 225, + "time_per_iteration": 3.295761823654175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182559, + "balance_loss_mlp": 1.1551652, + "epoch": 0.043478260869565216, + "flos": 525066292224.0, + "grad_norm": 0.09032086206875983, + "language_loss": 0.99067688, + "learning_rate": 0.0009995244881962398, + "loss": 1.00250244, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 0.27416992, + "step": 226, + "time_per_iteration": 2.6147754192352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162924, + "balance_loss_mlp": 1.1352675, + "epoch": 0.04367064255482878, + "flos": 439484465664.0, + "grad_norm": 0.05273235380658081, + "language_loss": 1.00220668, + "learning_rate": 0.0009995108073331323, + "loss": 1.01383591, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 0.27661133, + "step": 227, + "time_per_iteration": 2.575477361679077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165107, + "balance_loss_mlp": 1.13835633, + "epoch": 0.04386302424009234, + "flos": 507380060160.0, + "grad_norm": 0.07222661628022838, + "language_loss": 1.03328192, + "learning_rate": 0.0009994969325427309, + "loss": 1.04493296, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 0.26733398, + "step": 228, + "time_per_iteration": 2.7351901531219482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159475, + "balance_loss_mlp": 1.13215184, + "epoch": 0.04405540592535591, + "flos": 540694268928.0, + "grad_norm": 0.05690950477809338, + "language_loss": 0.99788582, + "learning_rate": 0.0009994828638304218, + "loss": 1.0094806, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 0.2734375, + "step": 229, + "time_per_iteration": 2.6617660522460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160216, + "balance_loss_mlp": 1.13327467, + "epoch": 0.04424778761061947, + "flos": 446370642432.0, + "grad_norm": 0.0671245201901001, + "language_loss": 1.05080867, + "learning_rate": 0.0009994686012016675, + "loss": 1.06241083, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 0.26953125, + "step": 230, + "time_per_iteration": 2.5507686138153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200075, + "balance_loss_mlp": 1.17368245, + "epoch": 0.044440169295883035, + "flos": 700702161408.0, + "grad_norm": 0.08083200993131012, + "language_loss": 1.04836714, + "learning_rate": 0.000999454144662005, + "loss": 1.06036782, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 0.26416016, + "step": 231, + "time_per_iteration": 2.872386932373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177085, + "balance_loss_mlp": 1.15090632, + "epoch": 0.044632550981146595, + "flos": 588329256960.0, + "grad_norm": 0.06521500069668446, + "language_loss": 0.98697901, + "learning_rate": 0.0009994394942170468, + "loss": 0.99874985, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 0.26208496, + "step": 232, + "time_per_iteration": 2.6734542846679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160129, + "balance_loss_mlp": 1.13452244, + "epoch": 0.04482493266641016, + "flos": 554797734912.0, + "grad_norm": 0.06848368332912834, + "language_loss": 0.96340638, + "learning_rate": 0.0009994246498724808, + "loss": 0.97500765, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 0.25598145, + "step": 233, + "time_per_iteration": 2.735145330429077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169344, + "balance_loss_mlp": 1.14341569, + "epoch": 0.04501731435167372, + "flos": 722813621760.0, + "grad_norm": 0.09664881582101635, + "language_loss": 0.99309772, + "learning_rate": 0.00099940961163407, + "loss": 1.00479114, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 0.25964355, + "step": 234, + "time_per_iteration": 2.8988683223724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142226, + "balance_loss_mlp": 1.11722803, + "epoch": 0.04520969603693728, + "flos": 511790607360.0, + "grad_norm": 0.06003753756121682, + "language_loss": 1.01686716, + "learning_rate": 0.0009993943795076528, + "loss": 1.02828944, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 0.25012207, + "step": 235, + "time_per_iteration": 2.6333067417144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132836, + "balance_loss_mlp": 1.10618043, + "epoch": 0.04540207772220085, + "flos": 365058399744.0, + "grad_norm": 0.08170413586498586, + "language_loss": 1.0374043, + "learning_rate": 0.0009993789534991427, + "loss": 1.04873264, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 0.26708984, + "step": 236, + "time_per_iteration": 2.4350106716156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112254, + "balance_loss_mlp": 1.0960753, + "epoch": 0.045594459407464406, + "flos": 522669583872.0, + "grad_norm": 0.0440176634981383, + "language_loss": 0.99063611, + "learning_rate": 0.0009993633336145287, + "loss": 1.00186157, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 0.26513672, + "step": 237, + "time_per_iteration": 2.6414294242858887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134799, + "balance_loss_mlp": 1.10904956, + "epoch": 0.04578684109272797, + "flos": 671776104960.0, + "grad_norm": 0.04213473561248219, + "language_loss": 1.02718055, + "learning_rate": 0.0009993475198598752, + "loss": 1.03852856, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 0.25756836, + "step": 238, + "time_per_iteration": 2.9781904220581055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152995, + "balance_loss_mlp": 1.12614954, + "epoch": 0.04597922277799153, + "flos": 541633277952.0, + "grad_norm": 0.08613106589232603, + "language_loss": 1.00055635, + "learning_rate": 0.0009993315122413212, + "loss": 1.01208627, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 0.26879883, + "step": 239, + "time_per_iteration": 2.6395275592803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162384, + "balance_loss_mlp": 1.13594294, + "epoch": 0.0461716044632551, + "flos": 458984102400.0, + "grad_norm": 0.06839694959482054, + "language_loss": 0.99973977, + "learning_rate": 0.0009993153107650818, + "loss": 1.01136363, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 0.2644043, + "step": 240, + "time_per_iteration": 2.563133716583252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160824, + "balance_loss_mlp": 1.13391829, + "epoch": 0.04636398614851866, + "flos": 455240922624.0, + "grad_norm": 0.06471449859153773, + "language_loss": 0.98970807, + "learning_rate": 0.0009992989154374468, + "loss": 1.00131631, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 0.26928711, + "step": 241, + "time_per_iteration": 2.5339503288269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145463, + "balance_loss_mlp": 1.11914206, + "epoch": 0.046556367833782225, + "flos": 556826254848.0, + "grad_norm": 0.06957696695924716, + "language_loss": 1.05868769, + "learning_rate": 0.0009992823262647817, + "loss": 1.07014227, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 0.26342773, + "step": 242, + "time_per_iteration": 2.6841883659362793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111302, + "balance_loss_mlp": 1.08692503, + "epoch": 0.046748749519045785, + "flos": 592917470208.0, + "grad_norm": 0.0649477492764712, + "language_loss": 0.99848783, + "learning_rate": 0.0009992655432535264, + "loss": 1.00961804, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 0.2611084, + "step": 243, + "time_per_iteration": 2.7613234519958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107198, + "balance_loss_mlp": 1.08162785, + "epoch": 0.04694113120430935, + "flos": 569864802816.0, + "grad_norm": 0.05612685480258275, + "language_loss": 1.00329947, + "learning_rate": 0.0009992485664101973, + "loss": 1.01437151, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 0.25598145, + "step": 244, + "time_per_iteration": 2.717280387878418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122363, + "balance_loss_mlp": 1.09556472, + "epoch": 0.04713351288957291, + "flos": 863768987136.0, + "grad_norm": 0.10316769075352135, + "language_loss": 1.02662849, + "learning_rate": 0.000999231395741385, + "loss": 1.03785205, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 0.26831055, + "step": 245, + "time_per_iteration": 3.095249891281128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144012, + "balance_loss_mlp": 1.11837006, + "epoch": 0.04732589457483648, + "flos": 537215390208.0, + "grad_norm": 0.09647975042234339, + "language_loss": 1.01015186, + "learning_rate": 0.0009992140312537557, + "loss": 1.02159202, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 0.25671387, + "step": 246, + "time_per_iteration": 2.633258819580078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123665, + "balance_loss_mlp": 1.09845233, + "epoch": 0.04751827626010004, + "flos": 761906870784.0, + "grad_norm": 0.09798218580430706, + "language_loss": 0.95550418, + "learning_rate": 0.000999196472954051, + "loss": 0.96674085, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 0.25231934, + "step": 247, + "time_per_iteration": 3.024939775466919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02466762, + "balance_loss_mlp": 2.43700695, + "epoch": 0.0477106579453636, + "flos": 1579791859200.0, + "grad_norm": 0.2831653982047738, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.81891614, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 0.296875, + "step": 248, + "time_per_iteration": 5.486468076705933 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162703, + "balance_loss_mlp": 1.13626289, + "epoch": 0.04790303963062716, + "flos": 457766111232.0, + "grad_norm": 0.12969478117477343, + "language_loss": 1.03178453, + "learning_rate": 0.0009991607749457578, + "loss": 1.04341149, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 0.26464844, + "step": 249, + "time_per_iteration": 2.5253713130950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119774, + "balance_loss_mlp": 1.16941571, + "epoch": 0.04809542131589073, + "flos": 782419668480.0, + "grad_norm": 0.09425507858465235, + "language_loss": 1.01008546, + "learning_rate": 0.0009991426352510286, + "loss": 1.0220629, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 0.28295898, + "step": 250, + "time_per_iteration": 3.0042202472686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204128, + "balance_loss_mlp": 1.174016, + "epoch": 0.04828780300115429, + "flos": 559260039168.0, + "grad_norm": 0.07677732337183582, + "language_loss": 1.0282234, + "learning_rate": 0.0009991243017719422, + "loss": 1.04026473, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 0.30126953, + "step": 251, + "time_per_iteration": 2.709934711456299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206766, + "balance_loss_mlp": 1.17522311, + "epoch": 0.048480184686417856, + "flos": 501929561088.0, + "grad_norm": 0.1103729500964747, + "language_loss": 0.97436613, + "learning_rate": 0.0009991057745156165, + "loss": 0.9864338, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 0.31518555, + "step": 252, + "time_per_iteration": 2.5961716175079346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03348202, + "balance_loss_mlp": 3.30471396, + "epoch": 0.048672566371681415, + "flos": 1536360016896.0, + "grad_norm": 0.3811060337507454, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.85259187, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 0.43554688, + "step": 253, + "time_per_iteration": 5.0377867221832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195312, + "balance_loss_mlp": 1.1623621, + "epoch": 0.04886494805694498, + "flos": 537922031616.0, + "grad_norm": 0.07473951959737497, + "language_loss": 1.05491519, + "learning_rate": 0.0009990681387000943, + "loss": 1.06686831, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 0.3293457, + "step": 254, + "time_per_iteration": 2.7937283515930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121698, + "balance_loss_mlp": 1.18345821, + "epoch": 0.04905732974220854, + "flos": 680169540096.0, + "grad_norm": 0.06898181212790383, + "language_loss": 1.01063621, + "learning_rate": 0.0009990490301555093, + "loss": 1.02280605, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 0.33544922, + "step": 255, + "time_per_iteration": 2.9615726470947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.05252755, + "balance_loss_mlp": 5.12458086, + "epoch": 0.04924971142747211, + "flos": 1421179997184.0, + "grad_norm": 0.5609302024280507, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.84467912, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 1.28125, + "step": 256, + "time_per_iteration": 4.8413920402526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03162439, + "balance_loss_mlp": 3.09758925, + "epoch": 0.04944209311273567, + "flos": 1558006742016.0, + "grad_norm": 0.1723793408951341, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.8240518, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 0.6484375, + "step": 257, + "time_per_iteration": 4.985513687133789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03630928, + "balance_loss_mlp": 3.55844903, + "epoch": 0.04963447479799923, + "flos": 1570820262912.0, + "grad_norm": 0.4079591987734508, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.73606813, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 0.7265625, + "step": 258, + "time_per_iteration": 4.858096361160278 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01403117, + "balance_loss_mlp": 1.35569584, + "epoch": 0.049826856483262794, + "flos": 625349569536.0, + "grad_norm": 0.11330256318865821, + "language_loss": 0.95339322, + "learning_rate": 0.0009989706585723202, + "loss": 0.96742439, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 0.47436523, + "step": 259, + "time_per_iteration": 2.794419765472412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01437412, + "balance_loss_mlp": 1.38651013, + "epoch": 0.05001923816852635, + "flos": 504160713216.0, + "grad_norm": 0.10381773722922016, + "language_loss": 1.0219605, + "learning_rate": 0.0009989505813633442, + "loss": 1.03633475, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 0.50927734, + "step": 260, + "time_per_iteration": 2.6660099029541016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145174, + "balance_loss_mlp": 1.39776254, + "epoch": 0.05021161985378992, + "flos": 587345831424.0, + "grad_norm": 0.12909552841436595, + "language_loss": 1.02080631, + "learning_rate": 0.000998930310444573, + "loss": 1.03532374, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 0.5402832, + "step": 261, + "time_per_iteration": 2.7547266483306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01429363, + "balance_loss_mlp": 1.37698281, + "epoch": 0.05040400153905348, + "flos": 633303235584.0, + "grad_norm": 0.08616818959721087, + "language_loss": 0.99936116, + "learning_rate": 0.0009989098458238765, + "loss": 1.01365471, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 0.52441406, + "step": 262, + "time_per_iteration": 2.804656982421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01431577, + "balance_loss_mlp": 1.38310647, + "epoch": 0.050596383224317046, + "flos": 553636270080.0, + "grad_norm": 0.10103635045761167, + "language_loss": 0.99213421, + "learning_rate": 0.0009988891875091998, + "loss": 1.00644994, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 0.48486328, + "step": 263, + "time_per_iteration": 2.780696392059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01359367, + "balance_loss_mlp": 1.31771505, + "epoch": 0.050788764909580605, + "flos": 549663293952.0, + "grad_norm": 0.09437475228894394, + "language_loss": 0.93793595, + "learning_rate": 0.0009988683355085636, + "loss": 0.95152962, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 0.41625977, + "step": 264, + "time_per_iteration": 2.758275032043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01314446, + "balance_loss_mlp": 1.27684712, + "epoch": 0.05098114659484417, + "flos": 605118325248.0, + "grad_norm": 0.09784246378207673, + "language_loss": 1.02612829, + "learning_rate": 0.000998847289830063, + "loss": 1.03927279, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 0.37524414, + "step": 265, + "time_per_iteration": 2.8752288818359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01289086, + "balance_loss_mlp": 1.25468266, + "epoch": 0.05117352828010773, + "flos": 438548027904.0, + "grad_norm": 0.06973466471853282, + "language_loss": 0.95293748, + "learning_rate": 0.0009988260504818682, + "loss": 0.9658283, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 0.34423828, + "step": 266, + "time_per_iteration": 2.5960564613342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01290407, + "balance_loss_mlp": 1.2563374, + "epoch": 0.0513659099653713, + "flos": 505032910848.0, + "grad_norm": 0.0971565340820806, + "language_loss": 1.02148294, + "learning_rate": 0.000998804617472226, + "loss": 1.03438699, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 0.34082031, + "step": 267, + "time_per_iteration": 2.658709764480591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275377, + "balance_loss_mlp": 1.24085402, + "epoch": 0.05155829165063486, + "flos": 695488799232.0, + "grad_norm": 0.10761719469623075, + "language_loss": 0.96939588, + "learning_rate": 0.0009987829908094568, + "loss": 0.98214972, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 0.34545898, + "step": 268, + "time_per_iteration": 2.8270740509033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01271333, + "balance_loss_mlp": 1.23785877, + "epoch": 0.051750673335898424, + "flos": 1348260111360.0, + "grad_norm": 0.1226169977774822, + "language_loss": 1.04002702, + "learning_rate": 0.0009987611705019569, + "loss": 1.05274034, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 0.33496094, + "step": 269, + "time_per_iteration": 4.483954429626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01277218, + "balance_loss_mlp": 1.24267149, + "epoch": 0.051943055021161984, + "flos": 489607566336.0, + "grad_norm": 0.07374197309260985, + "language_loss": 1.02401245, + "learning_rate": 0.0009987391565581978, + "loss": 1.03678453, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 0.34594727, + "step": 270, + "time_per_iteration": 2.627356767654419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01304636, + "balance_loss_mlp": 1.26977956, + "epoch": 0.05213543670642555, + "flos": 545779150848.0, + "grad_norm": 0.06923057034816653, + "language_loss": 0.94496262, + "learning_rate": 0.000998716948986726, + "loss": 0.95800889, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 0.34887695, + "step": 271, + "time_per_iteration": 2.804185628890991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01322736, + "balance_loss_mlp": 1.28718746, + "epoch": 0.05232781839168911, + "flos": 603561881088.0, + "grad_norm": 0.1173780328671846, + "language_loss": 0.97372609, + "learning_rate": 0.0009986945477961633, + "loss": 0.9869535, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 0.35571289, + "step": 272, + "time_per_iteration": 2.739595890045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01297409, + "balance_loss_mlp": 1.2620039, + "epoch": 0.052520200076952676, + "flos": 538504962048.0, + "grad_norm": 0.07261359465506025, + "language_loss": 1.02136993, + "learning_rate": 0.0009986719529952066, + "loss": 1.03434396, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 0.35424805, + "step": 273, + "time_per_iteration": 2.8717877864837646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239184, + "balance_loss_mlp": 1.20389819, + "epoch": 0.052712581762216236, + "flos": 463384737792.0, + "grad_norm": 0.13624684616705834, + "language_loss": 1.01736569, + "learning_rate": 0.000998649164592628, + "loss": 1.0297575, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 0.35327148, + "step": 274, + "time_per_iteration": 2.590993642807007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206885, + "balance_loss_mlp": 1.16945291, + "epoch": 0.0529049634474798, + "flos": 548020214784.0, + "grad_norm": 0.061304815826305474, + "language_loss": 0.99439085, + "learning_rate": 0.0009986261825972748, + "loss": 1.00645971, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 0.37426758, + "step": 275, + "time_per_iteration": 2.702202081680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183431, + "balance_loss_mlp": 1.14466429, + "epoch": 0.05309734513274336, + "flos": 618021052416.0, + "grad_norm": 0.10486338408500256, + "language_loss": 1.01433325, + "learning_rate": 0.000998603007018069, + "loss": 1.02616751, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 0.38745117, + "step": 276, + "time_per_iteration": 2.876267671585083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190326, + "balance_loss_mlp": 1.15055728, + "epoch": 0.05328972681800693, + "flos": 605498996736.0, + "grad_norm": 0.08719890934761923, + "language_loss": 0.99445826, + "learning_rate": 0.0009985796378640089, + "loss": 1.00636148, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 0.39746094, + "step": 277, + "time_per_iteration": 2.74886155128479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165278, + "balance_loss_mlp": 1.12720275, + "epoch": 0.05348210850327049, + "flos": 604503088128.0, + "grad_norm": 0.06292174667602014, + "language_loss": 0.99806106, + "learning_rate": 0.0009985560751441665, + "loss": 1.00971389, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 0.38061523, + "step": 278, + "time_per_iteration": 2.8894753456115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175743, + "balance_loss_mlp": 1.13790607, + "epoch": 0.053674490188534055, + "flos": 630782816256.0, + "grad_norm": 0.06329003141341145, + "language_loss": 1.01538157, + "learning_rate": 0.00099853231886769, + "loss": 1.02713895, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 0.37792969, + "step": 279, + "time_per_iteration": 2.783085823059082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183406, + "balance_loss_mlp": 1.14633179, + "epoch": 0.053866871873797614, + "flos": 479185611264.0, + "grad_norm": 0.06545769746199957, + "language_loss": 1.01316965, + "learning_rate": 0.0009985083690438024, + "loss": 1.02500367, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 0.37084961, + "step": 280, + "time_per_iteration": 2.707329511642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147788, + "balance_loss_mlp": 1.11245418, + "epoch": 0.054059253559061174, + "flos": 788035723776.0, + "grad_norm": 0.05305898567294309, + "language_loss": 0.9175781, + "learning_rate": 0.0009984842256818016, + "loss": 0.92905599, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 0.35400391, + "step": 281, + "time_per_iteration": 3.1014201641082764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165943, + "balance_loss_mlp": 1.13106215, + "epoch": 0.05425163524432474, + "flos": 628361515008.0, + "grad_norm": 0.05782684737590577, + "language_loss": 1.02446878, + "learning_rate": 0.0009984598887910613, + "loss": 1.03612816, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 0.34912109, + "step": 282, + "time_per_iteration": 2.75343656539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180579, + "balance_loss_mlp": 1.14555514, + "epoch": 0.0544440169295883, + "flos": 615760164864.0, + "grad_norm": 0.0631633618899466, + "language_loss": 0.98333299, + "learning_rate": 0.0009984353583810297, + "loss": 0.99513876, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 0.3503418, + "step": 283, + "time_per_iteration": 2.8092565536499023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186173, + "balance_loss_mlp": 1.15350997, + "epoch": 0.05463639861485187, + "flos": 647762406912.0, + "grad_norm": 0.0821933313576245, + "language_loss": 1.00416183, + "learning_rate": 0.0009984106344612302, + "loss": 1.01602352, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 0.32666016, + "step": 284, + "time_per_iteration": 2.7632908821105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163515, + "balance_loss_mlp": 1.1310904, + "epoch": 0.054828780300115426, + "flos": 797192699904.0, + "grad_norm": 0.06349155766627652, + "language_loss": 0.95740765, + "learning_rate": 0.0009983857170412615, + "loss": 0.96904278, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 0.32421875, + "step": 285, + "time_per_iteration": 2.9946134090423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130334, + "balance_loss_mlp": 1.09912539, + "epoch": 0.05502116198537899, + "flos": 549690458112.0, + "grad_norm": 0.0487694941790178, + "language_loss": 0.95326382, + "learning_rate": 0.000998360606130798, + "loss": 0.96456718, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 0.31176758, + "step": 286, + "time_per_iteration": 2.8205370903015137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.09512836, + "balance_loss_mlp": 7.26674223, + "epoch": 0.05521354367064255, + "flos": 1407753437184.0, + "grad_norm": 0.42812971022266805, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.78585953, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 22.5, + "step": 287, + "time_per_iteration": 4.986966848373413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173658, + "balance_loss_mlp": 1.14278328, + "epoch": 0.05540592535590612, + "flos": 645420026880.0, + "grad_norm": 0.08917023960137904, + "language_loss": 1.01027536, + "learning_rate": 0.0009983098038774552, + "loss": 1.02201188, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 0.30834961, + "step": 288, + "time_per_iteration": 2.8100168704986572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.06110836, + "balance_loss_mlp": 5.25634384, + "epoch": 0.05559830704116968, + "flos": 1511095647744.0, + "grad_norm": 0.4031517895181362, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.84281063, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 8.5625, + "step": 289, + "time_per_iteration": 4.790200233459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126699, + "balance_loss_mlp": 1.23435044, + "epoch": 0.055790688726433245, + "flos": 508328980992.0, + "grad_norm": 0.18275347501036113, + "language_loss": 0.9955281, + "learning_rate": 0.0009982582277800948, + "loss": 1.00819802, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 0.32641602, + "step": 290, + "time_per_iteration": 2.5976333618164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281728, + "balance_loss_mlp": 1.24694288, + "epoch": 0.055983070411696804, + "flos": 657870501888.0, + "grad_norm": 0.14603269886404707, + "language_loss": 1.06751418, + "learning_rate": 0.0009982321495648908, + "loss": 1.08033144, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 0.34838867, + "step": 291, + "time_per_iteration": 2.8513312339782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250537, + "balance_loss_mlp": 1.21348643, + "epoch": 0.05617545209696037, + "flos": 587335919616.0, + "grad_norm": 0.09283742859778188, + "language_loss": 0.97403693, + "learning_rate": 0.0009982058779188115, + "loss": 0.98654234, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 0.37011719, + "step": 292, + "time_per_iteration": 2.728203773498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230786, + "balance_loss_mlp": 1.19170928, + "epoch": 0.05636783378222393, + "flos": 611621632512.0, + "grad_norm": 0.08826519450204054, + "language_loss": 1.05705655, + "learning_rate": 0.0009981794128520567, + "loss": 1.06936455, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 0.39038086, + "step": 293, + "time_per_iteration": 2.79616379737854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01253904, + "balance_loss_mlp": 1.21258569, + "epoch": 0.0565602154674875, + "flos": 668161405440.0, + "grad_norm": 0.08065602932127632, + "language_loss": 1.01724029, + "learning_rate": 0.000998152754374901, + "loss": 1.02977943, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 0.41333008, + "step": 294, + "time_per_iteration": 2.9352946281433105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232141, + "balance_loss_mlp": 1.19132411, + "epoch": 0.05675259715275106, + "flos": 617242830336.0, + "grad_norm": 0.07309017642696977, + "language_loss": 0.9826439, + "learning_rate": 0.0009981259024976943, + "loss": 0.99496531, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 0.40820312, + "step": 295, + "time_per_iteration": 2.7376105785369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244019, + "balance_loss_mlp": 1.20112753, + "epoch": 0.05694497883801462, + "flos": 751769040384.0, + "grad_norm": 0.07769478500482971, + "language_loss": 0.96765345, + "learning_rate": 0.0009980988572308612, + "loss": 0.9800936, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 0.42871094, + "step": 296, + "time_per_iteration": 3.001779556274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226011, + "balance_loss_mlp": 1.18197489, + "epoch": 0.05713736052327818, + "flos": 712010995200.0, + "grad_norm": 0.0588150430335769, + "language_loss": 0.99343681, + "learning_rate": 0.0009980716185849015, + "loss": 1.00569689, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 0.44067383, + "step": 297, + "time_per_iteration": 2.9817121028900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223805, + "balance_loss_mlp": 1.18153381, + "epoch": 0.05732974220854175, + "flos": 468976200192.0, + "grad_norm": 0.06400414638033543, + "language_loss": 0.95616293, + "learning_rate": 0.0009980441865703904, + "loss": 0.96840101, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 0.4230957, + "step": 298, + "time_per_iteration": 2.615875244140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122669, + "balance_loss_mlp": 1.18513405, + "epoch": 0.05752212389380531, + "flos": 601422133248.0, + "grad_norm": 0.09089975305964836, + "language_loss": 1.03662193, + "learning_rate": 0.000998016561197978, + "loss": 1.04888892, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 0.41577148, + "step": 299, + "time_per_iteration": 2.765833854675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219698, + "balance_loss_mlp": 1.17835617, + "epoch": 0.057714505579068875, + "flos": 678664852992.0, + "grad_norm": 0.05662219614280908, + "language_loss": 0.94978034, + "learning_rate": 0.0009979887424783895, + "loss": 0.96197736, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 0.41357422, + "step": 300, + "time_per_iteration": 2.8931760787963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122099, + "balance_loss_mlp": 1.17850339, + "epoch": 0.057906887264332435, + "flos": 595884999168.0, + "grad_norm": 0.05388706690809858, + "language_loss": 0.94851983, + "learning_rate": 0.0009979607304224248, + "loss": 0.96072972, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 0.42504883, + "step": 301, + "time_per_iteration": 2.719282388687134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213648, + "balance_loss_mlp": 1.16951644, + "epoch": 0.058099268949596, + "flos": 552116901888.0, + "grad_norm": 0.0564182452216587, + "language_loss": 1.02312028, + "learning_rate": 0.000997932525040959, + "loss": 1.03525686, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 0.44140625, + "step": 302, + "time_per_iteration": 2.7084572315216064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185834, + "balance_loss_mlp": 1.14165473, + "epoch": 0.05829165063485956, + "flos": 508170765312.0, + "grad_norm": 0.07525794393376325, + "language_loss": 1.04335976, + "learning_rate": 0.000997904126344943, + "loss": 1.05521822, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 0.44165039, + "step": 303, + "time_per_iteration": 2.6271631717681885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121438, + "balance_loss_mlp": 1.17055893, + "epoch": 0.05848403232012313, + "flos": 615231562752.0, + "grad_norm": 0.0664075129682053, + "language_loss": 1.00263453, + "learning_rate": 0.0009978755343454018, + "loss": 1.01477838, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 0.43823242, + "step": 304, + "time_per_iteration": 2.791146993637085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182664, + "balance_loss_mlp": 1.13869941, + "epoch": 0.05867641400538669, + "flos": 500083849728.0, + "grad_norm": 0.07350056034493838, + "language_loss": 1.01461756, + "learning_rate": 0.0009978467490534355, + "loss": 1.0264442, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 0.43969727, + "step": 305, + "time_per_iteration": 2.614455461502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186922, + "balance_loss_mlp": 1.14424467, + "epoch": 0.05886879569065025, + "flos": 531290244096.0, + "grad_norm": 0.056638515612222363, + "language_loss": 0.97774673, + "learning_rate": 0.00099781777048022, + "loss": 0.98961592, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 0.42700195, + "step": 306, + "time_per_iteration": 2.717700481414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011718, + "balance_loss_mlp": 1.12855101, + "epoch": 0.05906117737591381, + "flos": 489056569344.0, + "grad_norm": 0.056560878082468485, + "language_loss": 0.99827361, + "learning_rate": 0.0009977885986370057, + "loss": 1.00999165, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 0.43310547, + "step": 307, + "time_per_iteration": 2.557203531265259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164879, + "balance_loss_mlp": 1.12263095, + "epoch": 0.05925355906117737, + "flos": 591511527936.0, + "grad_norm": 0.05991229640473007, + "language_loss": 0.9525907, + "learning_rate": 0.000997759233535118, + "loss": 0.9642396, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 0.42285156, + "step": 308, + "time_per_iteration": 2.8033511638641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174986, + "balance_loss_mlp": 1.1345737, + "epoch": 0.05944594074644094, + "flos": 563655532032.0, + "grad_norm": 0.06710738832596337, + "language_loss": 1.01122141, + "learning_rate": 0.0009977296751859576, + "loss": 1.02297115, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 0.40405273, + "step": 309, + "time_per_iteration": 2.8259334564208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164837, + "balance_loss_mlp": 1.12487829, + "epoch": 0.0596383224317045, + "flos": 538747241472.0, + "grad_norm": 0.05223481097130428, + "language_loss": 1.03482628, + "learning_rate": 0.0009976999236009998, + "loss": 1.0464747, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 0.39941406, + "step": 310, + "time_per_iteration": 2.769092321395874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164403, + "balance_loss_mlp": 1.1263994, + "epoch": 0.059830704116968066, + "flos": 560957446656.0, + "grad_norm": 0.05685909644716586, + "language_loss": 1.04877043, + "learning_rate": 0.0009976699787917955, + "loss": 1.06041443, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 0.37963867, + "step": 311, + "time_per_iteration": 2.6526851654052734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08775091, + "balance_loss_mlp": 7.79852915, + "epoch": 0.060023085802231625, + "flos": 1570615059456.0, + "grad_norm": 0.2725707199289832, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.82218087, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 9.75, + "step": 312, + "time_per_iteration": 5.006884813308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161181, + "balance_loss_mlp": 1.12172294, + "epoch": 0.06021546748749519, + "flos": 482657149440.0, + "grad_norm": 0.06726838636277511, + "language_loss": 0.96427834, + "learning_rate": 0.0009976095095472243, + "loss": 0.97589004, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 0.39428711, + "step": 313, + "time_per_iteration": 2.5785915851593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166252, + "balance_loss_mlp": 1.12738967, + "epoch": 0.06040784917275875, + "flos": 620195304960.0, + "grad_norm": 0.0761643630364548, + "language_loss": 0.97957367, + "learning_rate": 0.0009975789851353334, + "loss": 0.99123621, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 0.38818359, + "step": 314, + "time_per_iteration": 2.814901828765869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173499, + "balance_loss_mlp": 1.13191843, + "epoch": 0.06060023085802232, + "flos": 483553939968.0, + "grad_norm": 0.07475166161853689, + "language_loss": 1.00319684, + "learning_rate": 0.0009975482675461487, + "loss": 1.0149318, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 0.41601562, + "step": 315, + "time_per_iteration": 2.65468692779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159286, + "balance_loss_mlp": 1.11591756, + "epoch": 0.06079261254328588, + "flos": 581892761088.0, + "grad_norm": 0.08252555003670439, + "language_loss": 0.98425788, + "learning_rate": 0.0009975173567915952, + "loss": 0.99585068, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 0.43383789, + "step": 316, + "time_per_iteration": 2.6916940212249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173408, + "balance_loss_mlp": 1.12767935, + "epoch": 0.060984994228549444, + "flos": 687794664960.0, + "grad_norm": 0.0640207679679256, + "language_loss": 0.91960573, + "learning_rate": 0.000997486252883674, + "loss": 0.93133986, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 0.45727539, + "step": 317, + "time_per_iteration": 2.8535635471343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188261, + "balance_loss_mlp": 1.13979006, + "epoch": 0.061177375913813004, + "flos": 1314775577088.0, + "grad_norm": 0.0671416603225842, + "language_loss": 0.97457695, + "learning_rate": 0.0009974549558344602, + "loss": 0.98645949, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 0.484375, + "step": 318, + "time_per_iteration": 3.6911113262176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189393, + "balance_loss_mlp": 1.14037383, + "epoch": 0.06136975759907657, + "flos": 574337018880.0, + "grad_norm": 0.09268216800999254, + "language_loss": 1.06808639, + "learning_rate": 0.000997423465656105, + "loss": 1.07998025, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 0.49023438, + "step": 319, + "time_per_iteration": 2.727130651473999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147465, + "balance_loss_mlp": 1.096205, + "epoch": 0.06156213928434013, + "flos": 527537152512.0, + "grad_norm": 0.06029287427116143, + "language_loss": 1.04509127, + "learning_rate": 0.0009973917823608335, + "loss": 1.05656588, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 0.51318359, + "step": 320, + "time_per_iteration": 2.654794454574585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148103, + "balance_loss_mlp": 1.09605646, + "epoch": 0.061754520969603696, + "flos": 495507746304.0, + "grad_norm": 0.03213952729051003, + "language_loss": 0.98612553, + "learning_rate": 0.0009973599059609462, + "loss": 0.99760658, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 0.52075195, + "step": 321, + "time_per_iteration": 2.7024786472320557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142475, + "balance_loss_mlp": 1.09133446, + "epoch": 0.061946902654867256, + "flos": 440079879168.0, + "grad_norm": 0.04984356389382333, + "language_loss": 0.97161096, + "learning_rate": 0.000997327836468819, + "loss": 0.9830358, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 0.51147461, + "step": 322, + "time_per_iteration": 2.6242218017578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142913, + "balance_loss_mlp": 1.0917964, + "epoch": 0.06213928434013082, + "flos": 598800397824.0, + "grad_norm": 0.06671524152363617, + "language_loss": 0.99795449, + "learning_rate": 0.000997295573896902, + "loss": 1.00938356, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 0.51171875, + "step": 323, + "time_per_iteration": 2.834237813949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03299168, + "balance_loss_mlp": 3.12445545, + "epoch": 0.06233166602539438, + "flos": 1450135789056.0, + "grad_norm": 0.43556355854402456, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.84495211, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 1.75, + "step": 324, + "time_per_iteration": 4.770992040634155 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02151431, + "balance_loss_mlp": 1.9545927, + "epoch": 0.06252404771065795, + "flos": 1463327036928.0, + "grad_norm": 0.14082611715048204, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.80723369, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 1.9609375, + "step": 325, + "time_per_iteration": 4.8816118240356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196226, + "balance_loss_mlp": 1.14768362, + "epoch": 0.06271642939592151, + "flos": 464294011392.0, + "grad_norm": 0.08367806581965369, + "language_loss": 0.93651855, + "learning_rate": 0.000997197627828043, + "loss": 0.94848073, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 0.4855957, + "step": 326, + "time_per_iteration": 2.5508148670196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215208, + "balance_loss_mlp": 1.16862106, + "epoch": 0.06290881108118507, + "flos": 532374985728.0, + "grad_norm": 0.06635735350324974, + "language_loss": 0.89348811, + "learning_rate": 0.0009971645930629716, + "loss": 0.90564024, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 0.46533203, + "step": 327, + "time_per_iteration": 2.711386203765869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125047, + "balance_loss_mlp": 1.20192814, + "epoch": 0.06310119276644863, + "flos": 673562718720.0, + "grad_norm": 0.08863859510008423, + "language_loss": 1.03147936, + "learning_rate": 0.0009971313652814872, + "loss": 1.04398406, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 0.48486328, + "step": 328, + "time_per_iteration": 2.8484854698181152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225004, + "balance_loss_mlp": 1.17553234, + "epoch": 0.0632935744517122, + "flos": 770732734464.0, + "grad_norm": 0.08503417282278386, + "language_loss": 1.0059731, + "learning_rate": 0.0009970979444964903, + "loss": 1.01822317, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 0.49487305, + "step": 329, + "time_per_iteration": 2.957482099533081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197604, + "balance_loss_mlp": 1.14846587, + "epoch": 0.06348595613697576, + "flos": 561913708032.0, + "grad_norm": 0.06790724972181753, + "language_loss": 1.01849604, + "learning_rate": 0.0009970643307209556, + "loss": 1.03047216, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 0.49121094, + "step": 330, + "time_per_iteration": 2.8220374584198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170349, + "balance_loss_mlp": 1.1215446, + "epoch": 0.06367833782223932, + "flos": 676189223424.0, + "grad_norm": 0.06721894230078661, + "language_loss": 0.98097444, + "learning_rate": 0.0009970305239679334, + "loss": 0.99267793, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 0.48803711, + "step": 331, + "time_per_iteration": 2.8813369274139404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176679, + "balance_loss_mlp": 1.12754059, + "epoch": 0.06387071950750288, + "flos": 495297773568.0, + "grad_norm": 0.056286161373139375, + "language_loss": 1.03013992, + "learning_rate": 0.0009969965242505483, + "loss": 1.04190671, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 0.4909668, + "step": 332, + "time_per_iteration": 2.6662604808807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168774, + "balance_loss_mlp": 1.11932611, + "epoch": 0.06406310119276645, + "flos": 533447244288.0, + "grad_norm": 0.06031850484613652, + "language_loss": 0.99096131, + "learning_rate": 0.0009969623315820007, + "loss": 1.00264907, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 0.49487305, + "step": 333, + "time_per_iteration": 2.6671581268310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153261, + "balance_loss_mlp": 1.10619712, + "epoch": 0.06425548287803001, + "flos": 456184700928.0, + "grad_norm": 0.06229524640691676, + "language_loss": 0.99215055, + "learning_rate": 0.000996927945975565, + "loss": 1.00368309, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 0.47070312, + "step": 334, + "time_per_iteration": 2.568838357925415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115937, + "balance_loss_mlp": 1.1125921, + "epoch": 0.06444786456329357, + "flos": 560077908480.0, + "grad_norm": 0.05620099657237302, + "language_loss": 0.95852566, + "learning_rate": 0.0009968933674445906, + "loss": 0.97011936, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 0.46728516, + "step": 335, + "time_per_iteration": 2.6725666522979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160514, + "balance_loss_mlp": 1.1122818, + "epoch": 0.06464024624855713, + "flos": 666085897728.0, + "grad_norm": 0.05589062806096766, + "language_loss": 0.97974062, + "learning_rate": 0.0009968585960025028, + "loss": 0.99134576, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 0.48217773, + "step": 336, + "time_per_iteration": 2.945194959640503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0396516, + "balance_loss_mlp": 3.85834861, + "epoch": 0.0648326279338207, + "flos": 1521371870208.0, + "grad_norm": 0.42886267506062575, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.81618351, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 1.0703125, + "step": 337, + "time_per_iteration": 4.802944183349609 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215082, + "balance_loss_mlp": 1.16968668, + "epoch": 0.06502500961908426, + "flos": 1143339909120.0, + "grad_norm": 0.09324534870618859, + "language_loss": 0.96021777, + "learning_rate": 0.0009967884744390583, + "loss": 0.9723686, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 0.45361328, + "step": 338, + "time_per_iteration": 3.5247950553894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251582, + "balance_loss_mlp": 1.2060678, + "epoch": 0.06521739130434782, + "flos": 582609314304.0, + "grad_norm": 0.09123718626917265, + "language_loss": 0.97373873, + "learning_rate": 0.0009967531243449256, + "loss": 0.98625457, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 0.45507812, + "step": 339, + "time_per_iteration": 2.681973695755005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211309, + "balance_loss_mlp": 1.163077, + "epoch": 0.06540977298961138, + "flos": 497650065408.0, + "grad_norm": 0.06030156589334856, + "language_loss": 1.04525125, + "learning_rate": 0.000996717581394126, + "loss": 1.05736434, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 0.48242188, + "step": 340, + "time_per_iteration": 2.6031126976013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205107, + "balance_loss_mlp": 1.15630233, + "epoch": 0.06560215467487496, + "flos": 542871092736.0, + "grad_norm": 0.06934362388274598, + "language_loss": 1.05133414, + "learning_rate": 0.000996681845600459, + "loss": 1.06338525, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 0.48803711, + "step": 341, + "time_per_iteration": 2.6689491271972656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190009, + "balance_loss_mlp": 1.1402986, + "epoch": 0.06579453636013852, + "flos": 413454357504.0, + "grad_norm": 0.07929020766121274, + "language_loss": 0.97276402, + "learning_rate": 0.0009966459169777982, + "loss": 0.98466408, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 0.49731445, + "step": 342, + "time_per_iteration": 2.5235347747802734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183772, + "balance_loss_mlp": 1.13444376, + "epoch": 0.06598691804540208, + "flos": 560618993664.0, + "grad_norm": 0.06503113555429127, + "language_loss": 1.05431008, + "learning_rate": 0.0009966097955400924, + "loss": 1.0661478, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 0.4934082, + "step": 343, + "time_per_iteration": 2.6987814903259277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195626, + "balance_loss_mlp": 1.14772749, + "epoch": 0.06617929973066564, + "flos": 572090812416.0, + "grad_norm": 0.05810753199069879, + "language_loss": 0.99792945, + "learning_rate": 0.0009965734813013652, + "loss": 1.00988579, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 0.47924805, + "step": 344, + "time_per_iteration": 2.8092823028564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211149, + "balance_loss_mlp": 1.16191518, + "epoch": 0.06637168141592921, + "flos": 490479763968.0, + "grad_norm": 0.08606224500635251, + "language_loss": 1.02011895, + "learning_rate": 0.0009965369742757151, + "loss": 1.03223062, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 0.49243164, + "step": 345, + "time_per_iteration": 2.5981764793395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193116, + "balance_loss_mlp": 1.14435959, + "epoch": 0.06656406310119277, + "flos": 1079194834944.0, + "grad_norm": 0.0619511290056959, + "language_loss": 0.98293203, + "learning_rate": 0.0009965002744773152, + "loss": 0.99486327, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 0.48730469, + "step": 346, + "time_per_iteration": 3.4968950748443604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178364, + "balance_loss_mlp": 1.13115668, + "epoch": 0.06675644478645633, + "flos": 513680735232.0, + "grad_norm": 0.04856723246232052, + "language_loss": 0.95658922, + "learning_rate": 0.0009964633819204139, + "loss": 0.96837282, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 0.47167969, + "step": 347, + "time_per_iteration": 2.6705336570739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04576048, + "balance_loss_mlp": 4.3029151, + "epoch": 0.06694882647171989, + "flos": 1447192479744.0, + "grad_norm": 0.32603271390487504, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.86377156, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 2.734375, + "step": 348, + "time_per_iteration": 4.961863994598389 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03789769, + "balance_loss_mlp": 3.60590124, + "epoch": 0.06714120815698346, + "flos": 1552061772288.0, + "grad_norm": 0.16497869204612428, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.78943658, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 1.8359375, + "step": 349, + "time_per_iteration": 4.876751184463501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181375, + "balance_loss_mlp": 1.13578987, + "epoch": 0.06733358984224702, + "flos": 880073869824.0, + "grad_norm": 0.07770510755269132, + "language_loss": 0.96067584, + "learning_rate": 0.000996351547842304, + "loss": 0.9724896, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 0.45581055, + "step": 350, + "time_per_iteration": 3.166680097579956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217287, + "balance_loss_mlp": 1.16969919, + "epoch": 0.06752597152751058, + "flos": 518906580480.0, + "grad_norm": 0.06167835917893234, + "language_loss": 0.94333142, + "learning_rate": 0.0009963138843953744, + "loss": 0.9555043, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 0.47558594, + "step": 351, + "time_per_iteration": 2.5784904956817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122803, + "balance_loss_mlp": 1.18005991, + "epoch": 0.06771835321277414, + "flos": 539668624896.0, + "grad_norm": 0.06188972934791396, + "language_loss": 0.98543227, + "learning_rate": 0.000996276028262306, + "loss": 0.99771261, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 0.47924805, + "step": 352, + "time_per_iteration": 2.7985076904296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216963, + "balance_loss_mlp": 1.16760993, + "epoch": 0.0679107348980377, + "flos": 460666828800.0, + "grad_norm": 0.0659402302829914, + "language_loss": 1.04801619, + "learning_rate": 0.0009962379794577964, + "loss": 1.06018579, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 0.49365234, + "step": 353, + "time_per_iteration": 2.608032703399658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123128, + "balance_loss_mlp": 1.18266606, + "epoch": 0.06810311658330127, + "flos": 635922026496.0, + "grad_norm": 0.051231802586423875, + "language_loss": 0.94352609, + "learning_rate": 0.000996199737996617, + "loss": 0.95583886, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 0.48657227, + "step": 354, + "time_per_iteration": 2.903005599975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227436, + "balance_loss_mlp": 1.17770219, + "epoch": 0.06829549826856483, + "flos": 464679452160.0, + "grad_norm": 0.05676190931504088, + "language_loss": 1.03759205, + "learning_rate": 0.0009961613038936149, + "loss": 1.04986644, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 0.49755859, + "step": 355, + "time_per_iteration": 2.617859125137329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216553, + "balance_loss_mlp": 1.16572189, + "epoch": 0.06848787995382839, + "flos": 634647135744.0, + "grad_norm": 0.04878484453506707, + "language_loss": 0.95482612, + "learning_rate": 0.000996122677163711, + "loss": 0.96699166, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 0.50878906, + "step": 356, + "time_per_iteration": 2.8171308040618896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230039, + "balance_loss_mlp": 1.18037653, + "epoch": 0.06868026163909195, + "flos": 806374268928.0, + "grad_norm": 0.06504242786199886, + "language_loss": 1.01527905, + "learning_rate": 0.000996083857821902, + "loss": 1.02757955, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 0.49682617, + "step": 357, + "time_per_iteration": 3.0562636852264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221322, + "balance_loss_mlp": 1.17237508, + "epoch": 0.06887264332435553, + "flos": 439227505152.0, + "grad_norm": 0.043415107047687695, + "language_loss": 0.99947309, + "learning_rate": 0.0009960448458832588, + "loss": 1.01168633, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 0.48925781, + "step": 358, + "time_per_iteration": 2.6778266429901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224961, + "balance_loss_mlp": 1.17675292, + "epoch": 0.06906502500961909, + "flos": 484767161856.0, + "grad_norm": 0.061398357107108094, + "language_loss": 0.99686754, + "learning_rate": 0.000996005641362927, + "loss": 1.00911713, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 0.48193359, + "step": 359, + "time_per_iteration": 2.5839953422546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218039, + "balance_loss_mlp": 1.16792321, + "epoch": 0.06925740669488265, + "flos": 733611105792.0, + "grad_norm": 0.045504813624839685, + "language_loss": 1.02907789, + "learning_rate": 0.0009959662442761274, + "loss": 1.04125834, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 0.5012207, + "step": 360, + "time_per_iteration": 2.9012227058410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225991, + "balance_loss_mlp": 1.17504108, + "epoch": 0.0694497883801462, + "flos": 552415707648.0, + "grad_norm": 0.05242893208235044, + "language_loss": 0.96392268, + "learning_rate": 0.000995926654638155, + "loss": 0.97618258, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 0.50976562, + "step": 361, + "time_per_iteration": 2.7972850799560547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120421, + "balance_loss_mlp": 1.15323579, + "epoch": 0.06964217006540978, + "flos": 678015111168.0, + "grad_norm": 0.0452718414118582, + "language_loss": 0.98678619, + "learning_rate": 0.00099588687246438, + "loss": 0.99882829, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 0.51025391, + "step": 362, + "time_per_iteration": 2.845742702484131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011953, + "balance_loss_mlp": 1.14241886, + "epoch": 0.06983455175067334, + "flos": 524241082368.0, + "grad_norm": 0.06654716127982052, + "language_loss": 1.06146324, + "learning_rate": 0.0009958468977702471, + "loss": 1.07341623, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 0.52978516, + "step": 363, + "time_per_iteration": 2.5876591205596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.05386722, + "balance_loss_mlp": 5.09527922, + "epoch": 0.0700269334359369, + "flos": 1576787254272.0, + "grad_norm": 0.35536528906135745, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.85121429, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 2.921875, + "step": 364, + "time_per_iteration": 4.7958595752716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183221, + "balance_loss_mlp": 1.12800324, + "epoch": 0.07021931512120046, + "flos": 1013248839168.0, + "grad_norm": 0.06493728064972926, + "language_loss": 0.94085538, + "learning_rate": 0.0009957663708830612, + "loss": 0.95268762, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 0.55273438, + "step": 365, + "time_per_iteration": 3.238919258117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188034, + "balance_loss_mlp": 1.13048029, + "epoch": 0.07041169680646403, + "flos": 822983099904.0, + "grad_norm": 0.06418297657416602, + "language_loss": 0.98210049, + "learning_rate": 0.0009957258187212714, + "loss": 0.99398077, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 0.57470703, + "step": 366, + "time_per_iteration": 3.0337131023406982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0292345, + "balance_loss_mlp": 2.78612089, + "epoch": 0.07060407849172759, + "flos": 1414392938496.0, + "grad_norm": 0.09868001986151984, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.82118309, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 1.375, + "step": 367, + "time_per_iteration": 4.825684070587158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118916, + "balance_loss_mlp": 1.12988925, + "epoch": 0.07079646017699115, + "flos": 512909853696.0, + "grad_norm": 0.06345017711900697, + "language_loss": 0.94456601, + "learning_rate": 0.0009956441370400167, + "loss": 0.95645761, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 0.59179688, + "step": 368, + "time_per_iteration": 2.6685595512390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203671, + "balance_loss_mlp": 1.14411354, + "epoch": 0.07098884186225471, + "flos": 540501548544.0, + "grad_norm": 0.07550644934377632, + "language_loss": 1.00098681, + "learning_rate": 0.0009956030075522636, + "loss": 1.0130235, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 0.59472656, + "step": 369, + "time_per_iteration": 2.7824065685272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185298, + "balance_loss_mlp": 1.12555027, + "epoch": 0.07118122354751828, + "flos": 548682439680.0, + "grad_norm": 0.0634963537383221, + "language_loss": 1.00245738, + "learning_rate": 0.0009955616856543587, + "loss": 1.01431036, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 0.59667969, + "step": 370, + "time_per_iteration": 2.6869115829467773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117739, + "balance_loss_mlp": 1.11649847, + "epoch": 0.07137360523278184, + "flos": 620904517632.0, + "grad_norm": 0.04749901473855408, + "language_loss": 0.92605507, + "learning_rate": 0.0009955201713623448, + "loss": 0.93782902, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 0.60791016, + "step": 371, + "time_per_iteration": 2.7894065380096436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03553003, + "balance_loss_mlp": 3.34700894, + "epoch": 0.0715659869180454, + "flos": 1502672477184.0, + "grad_norm": 0.1539254818196356, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.80225718, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 2.0625, + "step": 372, + "time_per_iteration": 5.025646924972534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188763, + "balance_loss_mlp": 1.12739396, + "epoch": 0.07175836860330896, + "flos": 495493065216.0, + "grad_norm": 0.05697389015463885, + "language_loss": 1.05361807, + "learning_rate": 0.0009954365656605333, + "loss": 1.06550562, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 0.61328125, + "step": 373, + "time_per_iteration": 2.5767741203308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203971, + "balance_loss_mlp": 1.13878703, + "epoch": 0.07195075028857253, + "flos": 785725650432.0, + "grad_norm": 0.0561234241567743, + "language_loss": 0.98981488, + "learning_rate": 0.0009953944742831947, + "loss": 1.00185454, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 0.65185547, + "step": 374, + "time_per_iteration": 3.0126912593841553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209318, + "balance_loss_mlp": 1.14351439, + "epoch": 0.0721431319738361, + "flos": 593107619328.0, + "grad_norm": 0.05197007853134015, + "language_loss": 1.02623391, + "learning_rate": 0.0009953521905766642, + "loss": 1.0383271, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 0.65820312, + "step": 375, + "time_per_iteration": 2.9678027629852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207965, + "balance_loss_mlp": 1.14464104, + "epoch": 0.07233551365909965, + "flos": 548250011136.0, + "grad_norm": 0.05250799377029981, + "language_loss": 1.01212132, + "learning_rate": 0.0009953097145573577, + "loss": 1.02420104, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 0.6328125, + "step": 376, + "time_per_iteration": 2.7048561573028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121329, + "balance_loss_mlp": 1.1502521, + "epoch": 0.07252789534436321, + "flos": 957568780800.0, + "grad_norm": 0.050651846587156886, + "language_loss": 0.98499894, + "learning_rate": 0.000995267046241766, + "loss": 0.99713182, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 0.62988281, + "step": 377, + "time_per_iteration": 3.287705421447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225924, + "balance_loss_mlp": 1.16341114, + "epoch": 0.07272027702962677, + "flos": 507649503744.0, + "grad_norm": 0.05776369312695448, + "language_loss": 0.98701203, + "learning_rate": 0.0009952241856464547, + "loss": 0.99927127, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 0.62451172, + "step": 378, + "time_per_iteration": 2.5897629261016846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220563, + "balance_loss_mlp": 1.16010034, + "epoch": 0.07291265871489035, + "flos": 612412337664.0, + "grad_norm": 0.05450855675542614, + "language_loss": 1.05642247, + "learning_rate": 0.0009951811327880632, + "loss": 1.06862807, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 0.60351562, + "step": 379, + "time_per_iteration": 2.7320594787597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220943, + "balance_loss_mlp": 1.15924072, + "epoch": 0.0731050404001539, + "flos": 495750025728.0, + "grad_norm": 0.04947645913164449, + "language_loss": 0.99005401, + "learning_rate": 0.0009951378876833063, + "loss": 1.00226343, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 0.61669922, + "step": 380, + "time_per_iteration": 2.595810651779175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196634, + "balance_loss_mlp": 1.13798296, + "epoch": 0.07329742208541747, + "flos": 639966956544.0, + "grad_norm": 0.058807068798268386, + "language_loss": 1.05567527, + "learning_rate": 0.0009950944503489736, + "loss": 1.06764162, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 0.5859375, + "step": 381, + "time_per_iteration": 2.733560562133789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197608, + "balance_loss_mlp": 1.13914812, + "epoch": 0.07348980377068103, + "flos": 816346543104.0, + "grad_norm": 0.06747680453051412, + "language_loss": 0.99337935, + "learning_rate": 0.0009950508208019285, + "loss": 1.00535548, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 0.58398438, + "step": 382, + "time_per_iteration": 2.9895970821380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176507, + "balance_loss_mlp": 1.12062192, + "epoch": 0.0736821854559446, + "flos": 508640269824.0, + "grad_norm": 0.05827239016363537, + "language_loss": 1.03707182, + "learning_rate": 0.0009950069990591096, + "loss": 1.04883695, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 0.55908203, + "step": 383, + "time_per_iteration": 2.6856980323791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.05393736, + "balance_loss_mlp": 5.19079447, + "epoch": 0.07387456714120816, + "flos": 1554648629760.0, + "grad_norm": 0.38241300139143997, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.81795102, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 2.03125, + "step": 384, + "time_per_iteration": 4.860661268234253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128436, + "balance_loss_mlp": 1.07369518, + "epoch": 0.07406694882647172, + "flos": 525503490048.0, + "grad_norm": 0.06005395599718801, + "language_loss": 0.96679938, + "learning_rate": 0.0009949187790542777, + "loss": 0.97808379, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 0.54760742, + "step": 385, + "time_per_iteration": 2.7245922088623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146737, + "balance_loss_mlp": 1.09042215, + "epoch": 0.07425933051173528, + "flos": 497738898432.0, + "grad_norm": 0.06780842756482337, + "language_loss": 0.9270733, + "learning_rate": 0.0009948743808265148, + "loss": 0.93854064, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 0.56298828, + "step": 386, + "time_per_iteration": 2.6745331287384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187036, + "balance_loss_mlp": 1.13334417, + "epoch": 0.07445171219699885, + "flos": 505003175424.0, + "grad_norm": 0.04295711334598506, + "language_loss": 1.02854586, + "learning_rate": 0.0009948297904714782, + "loss": 1.04041624, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 0.53759766, + "step": 387, + "time_per_iteration": 2.681718111038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202671, + "balance_loss_mlp": 1.15167296, + "epoch": 0.07464409388226241, + "flos": 553977294336.0, + "grad_norm": 0.05564614333293379, + "language_loss": 0.94366896, + "learning_rate": 0.0009947850080064796, + "loss": 0.95569569, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 0.51049805, + "step": 388, + "time_per_iteration": 2.788663148880005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216483, + "balance_loss_mlp": 1.16817975, + "epoch": 0.07483647556752597, + "flos": 776862710784.0, + "grad_norm": 0.07112384111458, + "language_loss": 0.99713415, + "learning_rate": 0.0009947400334489047, + "loss": 1.00929892, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 0.48291016, + "step": 389, + "time_per_iteration": 2.9905049800872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227498, + "balance_loss_mlp": 1.17926562, + "epoch": 0.07502885725278953, + "flos": 612540817920.0, + "grad_norm": 0.06900212518032732, + "language_loss": 0.91264081, + "learning_rate": 0.0009946948668162145, + "loss": 0.92491579, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 0.48168945, + "step": 390, + "time_per_iteration": 2.767531394958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012247, + "balance_loss_mlp": 1.17277205, + "epoch": 0.0752212389380531, + "flos": 688629786624.0, + "grad_norm": 0.052104168644034804, + "language_loss": 0.95126128, + "learning_rate": 0.0009946495081259441, + "loss": 0.96350825, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 0.52001953, + "step": 391, + "time_per_iteration": 2.816908597946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192311, + "balance_loss_mlp": 1.14057434, + "epoch": 0.07541362062331666, + "flos": 765699609600.0, + "grad_norm": 0.051504782312047234, + "language_loss": 0.99421549, + "learning_rate": 0.0009946039573957035, + "loss": 1.00613856, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 0.51782227, + "step": 392, + "time_per_iteration": 2.9265222549438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116666, + "balance_loss_mlp": 1.11478019, + "epoch": 0.07560600230858022, + "flos": 588749202432.0, + "grad_norm": 0.055053573084277836, + "language_loss": 0.95799196, + "learning_rate": 0.000994558214643177, + "loss": 0.96965855, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 0.51928711, + "step": 393, + "time_per_iteration": 2.766477584838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165121, + "balance_loss_mlp": 1.11352682, + "epoch": 0.07579838399384378, + "flos": 749834496000.0, + "grad_norm": 0.05925711706254076, + "language_loss": 0.97585773, + "learning_rate": 0.000994512279886123, + "loss": 0.98750889, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 0.51660156, + "step": 394, + "time_per_iteration": 3.0709142684936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143606, + "balance_loss_mlp": 1.09191656, + "epoch": 0.07599076567910736, + "flos": 523457717760.0, + "grad_norm": 0.04191079383555719, + "language_loss": 0.97239089, + "learning_rate": 0.0009944661531423758, + "loss": 0.98382699, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 0.51757812, + "step": 395, + "time_per_iteration": 2.7044599056243896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134219, + "balance_loss_mlp": 1.08338809, + "epoch": 0.07618314736437092, + "flos": 551086488576.0, + "grad_norm": 0.05545815376917658, + "language_loss": 0.96390671, + "learning_rate": 0.000994419834429843, + "loss": 0.97524893, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 0.50854492, + "step": 396, + "time_per_iteration": 2.6767609119415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135922, + "balance_loss_mlp": 1.08525789, + "epoch": 0.07637552904963447, + "flos": 698206708224.0, + "grad_norm": 0.05307630449121137, + "language_loss": 1.01208472, + "learning_rate": 0.0009943733237665069, + "loss": 1.02344394, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 0.50683594, + "step": 397, + "time_per_iteration": 2.819148302078247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124426, + "balance_loss_mlp": 1.07502615, + "epoch": 0.07656791073489803, + "flos": 579379682304.0, + "grad_norm": 0.049844903289807924, + "language_loss": 0.99488425, + "learning_rate": 0.0009943266211704248, + "loss": 1.00612843, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 0.49389648, + "step": 398, + "time_per_iteration": 2.9555482864379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125466, + "balance_loss_mlp": 1.07675719, + "epoch": 0.0767602924201616, + "flos": 417145780224.0, + "grad_norm": 0.05620775813161816, + "language_loss": 1.01430082, + "learning_rate": 0.000994279726659728, + "loss": 1.02555549, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 0.48706055, + "step": 399, + "time_per_iteration": 2.5138003826141357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127137, + "balance_loss_mlp": 1.07761765, + "epoch": 0.07695267410542517, + "flos": 482914109952.0, + "grad_norm": 0.05674792404596756, + "language_loss": 0.99883693, + "learning_rate": 0.0009942326402526231, + "loss": 1.01010823, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 0.49511719, + "step": 400, + "time_per_iteration": 2.5245604515075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127721, + "balance_loss_mlp": 1.07793891, + "epoch": 0.07714505579068873, + "flos": 530998778880.0, + "grad_norm": 0.036646942736225624, + "language_loss": 0.9767518, + "learning_rate": 0.0009941853619673902, + "loss": 0.98802906, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 0.49804688, + "step": 401, + "time_per_iteration": 2.644771099090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123434, + "balance_loss_mlp": 1.07451057, + "epoch": 0.07733743747595229, + "flos": 804995490816.0, + "grad_norm": 0.057554732491620374, + "language_loss": 1.01884329, + "learning_rate": 0.0009941378918223844, + "loss": 1.0300777, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 0.48876953, + "step": 402, + "time_per_iteration": 3.051617383956909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112416, + "balance_loss_mlp": 1.07618988, + "epoch": 0.07752981916121585, + "flos": 622476016128.0, + "grad_norm": 0.04510164642433069, + "language_loss": 0.94372368, + "learning_rate": 0.0009940902298360354, + "loss": 0.95496523, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 0.47924805, + "step": 403, + "time_per_iteration": 2.7302582263946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118279, + "balance_loss_mlp": 1.0687592, + "epoch": 0.07772220084647942, + "flos": 728276603904.0, + "grad_norm": 0.062376946911402976, + "language_loss": 1.04687834, + "learning_rate": 0.0009940423760268473, + "loss": 1.05806112, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 0.49536133, + "step": 404, + "time_per_iteration": 2.856938600540161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118682, + "balance_loss_mlp": 1.07009196, + "epoch": 0.07791458253174298, + "flos": 555412972032.0, + "grad_norm": 0.046838991637930295, + "language_loss": 0.97888398, + "learning_rate": 0.0009939943304133982, + "loss": 0.99007082, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 0.48608398, + "step": 405, + "time_per_iteration": 2.6161091327667236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115161, + "balance_loss_mlp": 1.06881261, + "epoch": 0.07810696421700654, + "flos": 553181819904.0, + "grad_norm": 0.04496148345425058, + "language_loss": 1.04081011, + "learning_rate": 0.0009939460930143416, + "loss": 1.0519619, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 0.46337891, + "step": 406, + "time_per_iteration": 2.6310677528381348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119218, + "balance_loss_mlp": 1.07332289, + "epoch": 0.0782993459022701, + "flos": 650633389056.0, + "grad_norm": 0.037201804651944344, + "language_loss": 0.98071587, + "learning_rate": 0.0009938976638484043, + "loss": 0.99190807, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 0.45874023, + "step": 407, + "time_per_iteration": 2.8977036476135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112887, + "balance_loss_mlp": 1.06844616, + "epoch": 0.07849172758753367, + "flos": 496172542464.0, + "grad_norm": 0.04629061554837057, + "language_loss": 0.97991359, + "learning_rate": 0.0009938490429343887, + "loss": 0.99104249, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 0.44458008, + "step": 408, + "time_per_iteration": 2.562168836593628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111855, + "balance_loss_mlp": 1.07315516, + "epoch": 0.07868410927279723, + "flos": 577971542016.0, + "grad_norm": 0.04004461216150975, + "language_loss": 0.97974342, + "learning_rate": 0.0009938002302911709, + "loss": 0.99092889, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 0.4543457, + "step": 409, + "time_per_iteration": 2.738518238067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123547, + "balance_loss_mlp": 1.07915401, + "epoch": 0.07887649095806079, + "flos": 522970960896.0, + "grad_norm": 0.07048914756312923, + "language_loss": 1.00401747, + "learning_rate": 0.0009937512259377015, + "loss": 1.01525307, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 0.44384766, + "step": 410, + "time_per_iteration": 2.670149564743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110678, + "balance_loss_mlp": 1.0668565, + "epoch": 0.07906887264332435, + "flos": 557253540864.0, + "grad_norm": 0.049646402233970426, + "language_loss": 0.99659574, + "learning_rate": 0.000993702029893006, + "loss": 1.00770259, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 0.4387207, + "step": 411, + "time_per_iteration": 2.7853777408599854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118473, + "balance_loss_mlp": 1.07200527, + "epoch": 0.07926125432858792, + "flos": 821984993280.0, + "grad_norm": 0.04880092350488667, + "language_loss": 0.98862529, + "learning_rate": 0.0009936526421761838, + "loss": 0.99981004, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 0.46435547, + "step": 412, + "time_per_iteration": 3.030674457550049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114951, + "balance_loss_mlp": 1.07043815, + "epoch": 0.07945363601385148, + "flos": 562336224768.0, + "grad_norm": 0.04383720282943398, + "language_loss": 1.01490402, + "learning_rate": 0.000993603062806409, + "loss": 1.02605367, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 0.4453125, + "step": 413, + "time_per_iteration": 2.7101500034332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109682, + "balance_loss_mlp": 1.0637151, + "epoch": 0.07964601769911504, + "flos": 517868826624.0, + "grad_norm": 0.046157231925668944, + "language_loss": 1.04664707, + "learning_rate": 0.0009935532918029298, + "loss": 1.05774391, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 0.45947266, + "step": 414, + "time_per_iteration": 2.593390941619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118947, + "balance_loss_mlp": 1.07278943, + "epoch": 0.0798383993843786, + "flos": 539224086528.0, + "grad_norm": 0.058468816323775735, + "language_loss": 0.97956645, + "learning_rate": 0.0009935033291850694, + "loss": 0.99075592, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 0.46166992, + "step": 415, + "time_per_iteration": 2.6693851947784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111466, + "balance_loss_mlp": 1.0654031, + "epoch": 0.08003078106964218, + "flos": 485145262080.0, + "grad_norm": 0.061030352209764355, + "language_loss": 1.00225627, + "learning_rate": 0.0009934531749722247, + "loss": 1.01337099, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 0.46044922, + "step": 416, + "time_per_iteration": 2.578746795654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119366, + "balance_loss_mlp": 1.07337523, + "epoch": 0.08022316275490574, + "flos": 518254267392.0, + "grad_norm": 0.05071064772829009, + "language_loss": 0.98778659, + "learning_rate": 0.0009934028291838672, + "loss": 0.99898028, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 0.45996094, + "step": 417, + "time_per_iteration": 2.7096333503723145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106202, + "balance_loss_mlp": 1.06166553, + "epoch": 0.0804155444401693, + "flos": 494012971008.0, + "grad_norm": 0.045680808340910005, + "language_loss": 0.94326293, + "learning_rate": 0.0009933522918395433, + "loss": 0.95432496, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 0.44555664, + "step": 418, + "time_per_iteration": 2.644414186477661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04959176, + "balance_loss_mlp": 4.71808767, + "epoch": 0.08060792612543285, + "flos": 1581422455296.0, + "grad_norm": 0.3214703434406663, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.83210278, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 2.40625, + "step": 419, + "time_per_iteration": 4.868964195251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115626, + "balance_loss_mlp": 1.07108891, + "epoch": 0.08080030781069643, + "flos": 525343076352.0, + "grad_norm": 0.08060687528614664, + "language_loss": 1.13036489, + "learning_rate": 0.000993250642561551, + "loss": 1.14152122, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 0.4453125, + "step": 420, + "time_per_iteration": 2.632162094116211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121548, + "balance_loss_mlp": 1.07538986, + "epoch": 0.08099268949595999, + "flos": 546718159872.0, + "grad_norm": 0.08633853635548816, + "language_loss": 0.9784801, + "learning_rate": 0.0009931995306673466, + "loss": 0.98969555, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 0.46118164, + "step": 421, + "time_per_iteration": 2.7046737670898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134412, + "balance_loss_mlp": 1.08815861, + "epoch": 0.08118507118122355, + "flos": 510367412736.0, + "grad_norm": 0.038770411105538145, + "language_loss": 1.03907061, + "learning_rate": 0.000993148227296103, + "loss": 1.05041468, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 0.4621582, + "step": 422, + "time_per_iteration": 2.669496536254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133253, + "balance_loss_mlp": 1.08707166, + "epoch": 0.08137745286648711, + "flos": 720671302656.0, + "grad_norm": 0.053095831055692516, + "language_loss": 0.9112367, + "learning_rate": 0.000993096732467738, + "loss": 0.92256927, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 0.46166992, + "step": 423, + "time_per_iteration": 2.961660861968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150855, + "balance_loss_mlp": 1.10498345, + "epoch": 0.08156983455175067, + "flos": 679613773824.0, + "grad_norm": 0.08137036582560589, + "language_loss": 0.99760056, + "learning_rate": 0.0009930450462022435, + "loss": 1.00910902, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 0.45874023, + "step": 424, + "time_per_iteration": 2.7952311038970947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03600409, + "balance_loss_mlp": 3.48901963, + "epoch": 0.08176221623701424, + "flos": 1453377157632.0, + "grad_norm": 0.18349806711668631, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.82790214, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 1.1171875, + "step": 425, + "time_per_iteration": 4.8854875564575195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159177, + "balance_loss_mlp": 1.11344862, + "epoch": 0.0819545979222778, + "flos": 1556602292736.0, + "grad_norm": 0.06491953183218531, + "language_loss": 0.9776966, + "learning_rate": 0.0009929410994402065, + "loss": 0.98928833, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 0.45703125, + "step": 426, + "time_per_iteration": 4.275091886520386 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169515, + "balance_loss_mlp": 1.12223697, + "epoch": 0.08214697960754136, + "flos": 512724473856.0, + "grad_norm": 0.07437504582125473, + "language_loss": 1.02033544, + "learning_rate": 0.0009928888389840196, + "loss": 1.03203058, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 0.47241211, + "step": 427, + "time_per_iteration": 2.7036454677581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145234, + "balance_loss_mlp": 1.09941018, + "epoch": 0.08233936129280492, + "flos": 595124029440.0, + "grad_norm": 0.05964472172349544, + "language_loss": 1.03706717, + "learning_rate": 0.0009928363871714147, + "loss": 1.04851961, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 0.45849609, + "step": 428, + "time_per_iteration": 2.6669116020202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115004, + "balance_loss_mlp": 1.10254741, + "epoch": 0.08253174297806849, + "flos": 572039055360.0, + "grad_norm": 0.07530468467255677, + "language_loss": 0.97491598, + "learning_rate": 0.0009927837440227556, + "loss": 0.98641634, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 0.47485352, + "step": 429, + "time_per_iteration": 2.8463807106018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120703, + "balance_loss_mlp": 1.07588065, + "epoch": 0.08272412466333205, + "flos": 623380147200.0, + "grad_norm": 0.04140843961960757, + "language_loss": 0.92054397, + "learning_rate": 0.0009927309095584798, + "loss": 0.93175101, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 0.44824219, + "step": 430, + "time_per_iteration": 2.9767606258392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116415, + "balance_loss_mlp": 1.07278419, + "epoch": 0.08291650634859561, + "flos": 513994595328.0, + "grad_norm": 0.04726827868993605, + "language_loss": 1.04780793, + "learning_rate": 0.0009926778837991, + "loss": 1.05897212, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 0.43652344, + "step": 431, + "time_per_iteration": 2.5883395671844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112082, + "balance_loss_mlp": 1.06749809, + "epoch": 0.08310888803385917, + "flos": 667365931008.0, + "grad_norm": 0.049074519776006666, + "language_loss": 1.0243988, + "learning_rate": 0.000992624666765202, + "loss": 1.0355196, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 0.44604492, + "step": 432, + "time_per_iteration": 2.7943906784057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115829, + "balance_loss_mlp": 1.07200766, + "epoch": 0.08330126971912274, + "flos": 583293560832.0, + "grad_norm": 0.04417562175093811, + "language_loss": 1.00109053, + "learning_rate": 0.000992571258477447, + "loss": 1.01224887, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 0.43823242, + "step": 433, + "time_per_iteration": 2.836127758026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116804, + "balance_loss_mlp": 1.07260084, + "epoch": 0.0834936514043863, + "flos": 561350227968.0, + "grad_norm": 0.04319706549365549, + "language_loss": 0.93695247, + "learning_rate": 0.0009925176589565695, + "loss": 0.94812053, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 0.44213867, + "step": 434, + "time_per_iteration": 2.8157734870910645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131219, + "balance_loss_mlp": 1.08756483, + "epoch": 0.08368603308964986, + "flos": 494519551488.0, + "grad_norm": 0.04172416189060796, + "language_loss": 1.04242814, + "learning_rate": 0.0009924638682233791, + "loss": 1.05374026, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 0.43652344, + "step": 435, + "time_per_iteration": 2.5577316284179688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02503783, + "balance_loss_mlp": 2.3527205, + "epoch": 0.08387841477491342, + "flos": 1389017714688.0, + "grad_norm": 0.06968128915635463, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.82068378, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 1.5078125, + "step": 436, + "time_per_iteration": 4.594938516616821 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129118, + "balance_loss_mlp": 1.08348453, + "epoch": 0.084070796460177, + "flos": 798984082944.0, + "grad_norm": 0.0610737753852808, + "language_loss": 0.94037408, + "learning_rate": 0.0009923557132036668, + "loss": 0.95166528, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 0.45629883, + "step": 437, + "time_per_iteration": 3.0716845989227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118795, + "balance_loss_mlp": 1.07430601, + "epoch": 0.08426317814544056, + "flos": 558963431424.0, + "grad_norm": 0.04662895628051273, + "language_loss": 0.97730738, + "learning_rate": 0.0009923013489591345, + "loss": 0.98849535, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 0.4453125, + "step": 438, + "time_per_iteration": 2.726792812347412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110576, + "balance_loss_mlp": 1.06685066, + "epoch": 0.08445555983070412, + "flos": 810421396992.0, + "grad_norm": 0.04626496214247174, + "language_loss": 0.96079296, + "learning_rate": 0.0009922467935862681, + "loss": 0.97189873, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 0.4375, + "step": 439, + "time_per_iteration": 3.0908052921295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119416, + "balance_loss_mlp": 1.07273376, + "epoch": 0.08464794151596768, + "flos": 510184604160.0, + "grad_norm": 0.048922855388473234, + "language_loss": 0.99432743, + "learning_rate": 0.0009921920471062478, + "loss": 1.00552154, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 0.46655273, + "step": 440, + "time_per_iteration": 2.622451066970825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117818, + "balance_loss_mlp": 1.07342434, + "epoch": 0.08484032320123125, + "flos": 556413649920.0, + "grad_norm": 0.07502031783190574, + "language_loss": 0.9797709, + "learning_rate": 0.0009921371095403281, + "loss": 0.99094903, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 0.44433594, + "step": 441, + "time_per_iteration": 2.705152750015259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011116, + "balance_loss_mlp": 1.06863689, + "epoch": 0.08503270488649481, + "flos": 527354343936.0, + "grad_norm": 0.04941418140969711, + "language_loss": 1.00754833, + "learning_rate": 0.0009920819809098379, + "loss": 1.01866436, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 0.42993164, + "step": 442, + "time_per_iteration": 2.5887317657470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119689, + "balance_loss_mlp": 1.07715499, + "epoch": 0.08522508657175837, + "flos": 614267960832.0, + "grad_norm": 0.06964486535702215, + "language_loss": 0.96275294, + "learning_rate": 0.0009920266612361798, + "loss": 0.97394979, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 0.42578125, + "step": 443, + "time_per_iteration": 2.745222330093384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107696, + "balance_loss_mlp": 1.06587708, + "epoch": 0.08541746825702193, + "flos": 619792611840.0, + "grad_norm": 0.05163049083883061, + "language_loss": 0.96866751, + "learning_rate": 0.0009919711505408308, + "loss": 0.97974443, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 0.41821289, + "step": 444, + "time_per_iteration": 2.780095100402832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106314, + "balance_loss_mlp": 1.0654248, + "epoch": 0.08560984994228549, + "flos": 482914109952.0, + "grad_norm": 0.054748359311131624, + "language_loss": 0.94535226, + "learning_rate": 0.000991915448845342, + "loss": 0.95641541, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 0.40893555, + "step": 445, + "time_per_iteration": 2.5229337215423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110283, + "balance_loss_mlp": 1.06279922, + "epoch": 0.08580223162754906, + "flos": 517152273408.0, + "grad_norm": 0.0575820537988498, + "language_loss": 1.03181779, + "learning_rate": 0.000991859556171339, + "loss": 1.04284596, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 0.40039062, + "step": 446, + "time_per_iteration": 2.5957653522491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105649, + "balance_loss_mlp": 1.06497526, + "epoch": 0.08599461331281262, + "flos": 531475623936.0, + "grad_norm": 0.04289742759235468, + "language_loss": 1.05262291, + "learning_rate": 0.000991803472540521, + "loss": 1.06367946, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 0.40673828, + "step": 447, + "time_per_iteration": 2.6220486164093018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105752, + "balance_loss_mlp": 1.06550729, + "epoch": 0.08618699499807618, + "flos": 790299182592.0, + "grad_norm": 0.04330621576945977, + "language_loss": 1.00096428, + "learning_rate": 0.0009917471979746615, + "loss": 1.01202178, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 0.40234375, + "step": 448, + "time_per_iteration": 2.9767467975616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114799, + "balance_loss_mlp": 1.07379115, + "epoch": 0.08637937668333974, + "flos": 565988000256.0, + "grad_norm": 0.03609686036920932, + "language_loss": 0.98485255, + "learning_rate": 0.0009916907324956086, + "loss": 0.99600053, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 0.41015625, + "step": 449, + "time_per_iteration": 2.701143980026245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117003, + "balance_loss_mlp": 1.07480288, + "epoch": 0.08657175836860331, + "flos": 445167332352.0, + "grad_norm": 0.04834207301210501, + "language_loss": 0.95441091, + "learning_rate": 0.0009916340761252837, + "loss": 0.965581, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 0.42211914, + "step": 450, + "time_per_iteration": 2.6036393642425537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129901, + "balance_loss_mlp": 1.08910751, + "epoch": 0.08676414005386687, + "flos": 844148210688.0, + "grad_norm": 0.07269963588094165, + "language_loss": 0.9243114, + "learning_rate": 0.0009915772288856832, + "loss": 0.93561041, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 0.40820312, + "step": 451, + "time_per_iteration": 3.05719256401062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125865, + "balance_loss_mlp": 1.08359361, + "epoch": 0.08695652173913043, + "flos": 603292437504.0, + "grad_norm": 0.05954656443346509, + "language_loss": 0.93746579, + "learning_rate": 0.000991520190798877, + "loss": 0.94872439, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 0.42285156, + "step": 452, + "time_per_iteration": 2.804128885269165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120027, + "balance_loss_mlp": 1.07723105, + "epoch": 0.08714890342439399, + "flos": 730737552384.0, + "grad_norm": 0.05604676795867647, + "language_loss": 1.04000187, + "learning_rate": 0.0009914629618870089, + "loss": 1.05120206, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 0.42797852, + "step": 453, + "time_per_iteration": 2.8959083557128906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02032313, + "balance_loss_mlp": 1.86675501, + "epoch": 0.08734128510965757, + "flos": 1482303214080.0, + "grad_norm": 0.06678910630402063, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.80708182, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 1.65625, + "step": 454, + "time_per_iteration": 4.753306865692139 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01974747, + "balance_loss_mlp": 1.80537415, + "epoch": 0.08753366679492113, + "flos": 1523022289920.0, + "grad_norm": 0.06350102966569023, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.83402705, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 1.6953125, + "step": 455, + "time_per_iteration": 4.909627914428711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100081, + "balance_loss_mlp": 1.05778539, + "epoch": 0.08772604848018468, + "flos": 721252035072.0, + "grad_norm": 0.07384563339861851, + "language_loss": 0.95938599, + "learning_rate": 0.0009912901304235883, + "loss": 0.97038674, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 0.42333984, + "step": 456, + "time_per_iteration": 3.0303096771240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093114, + "balance_loss_mlp": 1.05112898, + "epoch": 0.08791843016544824, + "flos": 708233310720.0, + "grad_norm": 0.061767025741825826, + "language_loss": 0.93898749, + "learning_rate": 0.000991232138434397, + "loss": 0.94991863, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 0.41992188, + "step": 457, + "time_per_iteration": 2.834221601486206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089137, + "balance_loss_mlp": 1.04824805, + "epoch": 0.08811081185071182, + "flos": 473043151872.0, + "grad_norm": 0.05183647995223567, + "language_loss": 1.00765896, + "learning_rate": 0.000991173955731976, + "loss": 1.0185504, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 0.40869141, + "step": 458, + "time_per_iteration": 2.628783702850342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098492, + "balance_loss_mlp": 1.05569601, + "epoch": 0.08830319353597538, + "flos": 684980209152.0, + "grad_norm": 0.052575936673692925, + "language_loss": 1.04489028, + "learning_rate": 0.0009911155823389137, + "loss": 1.0558753, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 0.42797852, + "step": 459, + "time_per_iteration": 2.964416742324829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106983, + "balance_loss_mlp": 1.06523609, + "epoch": 0.08849557522123894, + "flos": 573509237760.0, + "grad_norm": 0.05270293395412616, + "language_loss": 1.00385904, + "learning_rate": 0.000991057018277873, + "loss": 1.01492882, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 0.41748047, + "step": 460, + "time_per_iteration": 2.6944808959960938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104651, + "balance_loss_mlp": 1.06245136, + "epoch": 0.0886879569065025, + "flos": 564567376896.0, + "grad_norm": 0.04953210926048159, + "language_loss": 1.01399374, + "learning_rate": 0.0009909982635715898, + "loss": 1.02504039, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 0.42236328, + "step": 461, + "time_per_iteration": 2.6137924194335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096015, + "balance_loss_mlp": 1.05374336, + "epoch": 0.08888033859176607, + "flos": 563877987840.0, + "grad_norm": 0.050729417377465176, + "language_loss": 1.00123549, + "learning_rate": 0.0009909393182428751, + "loss": 1.01219559, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 0.42285156, + "step": 462, + "time_per_iteration": 2.6657960414886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109539, + "balance_loss_mlp": 1.06891286, + "epoch": 0.08907272027702963, + "flos": 465761622528.0, + "grad_norm": 0.043715633324142876, + "language_loss": 0.94138575, + "learning_rate": 0.000990880182314614, + "loss": 0.95248115, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 0.40625, + "step": 463, + "time_per_iteration": 2.733408212661743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101315, + "balance_loss_mlp": 1.06121325, + "epoch": 0.08926510196229319, + "flos": 681528494592.0, + "grad_norm": 0.051961844945365605, + "language_loss": 0.94176865, + "learning_rate": 0.0009908208558097643, + "loss": 0.9527818, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 0.40087891, + "step": 464, + "time_per_iteration": 2.9006474018096924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105621, + "balance_loss_mlp": 1.06508923, + "epoch": 0.08945748364755675, + "flos": 596692956672.0, + "grad_norm": 0.04470923680131565, + "language_loss": 0.9716863, + "learning_rate": 0.000990761338751359, + "loss": 0.98274255, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 0.40527344, + "step": 465, + "time_per_iteration": 2.775830030441284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01410893, + "balance_loss_mlp": 1.25296497, + "epoch": 0.08964986533282032, + "flos": 1585931747328.0, + "grad_norm": 0.0425617539044403, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.75070524, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 1.578125, + "step": 466, + "time_per_iteration": 5.023500919342041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100083, + "balance_loss_mlp": 1.05869305, + "epoch": 0.08984224701808388, + "flos": 533523967488.0, + "grad_norm": 0.04007163966797277, + "language_loss": 0.9983623, + "learning_rate": 0.0009906417330663815, + "loss": 1.00936306, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 0.41381836, + "step": 467, + "time_per_iteration": 2.6194305419921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099405, + "balance_loss_mlp": 1.05889773, + "epoch": 0.09003462870334744, + "flos": 478931222016.0, + "grad_norm": 0.03985353179312445, + "language_loss": 0.96447593, + "learning_rate": 0.0009905816444862442, + "loss": 0.97546995, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 0.4050293, + "step": 468, + "time_per_iteration": 2.623267889022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107145, + "balance_loss_mlp": 1.06568456, + "epoch": 0.090227010388611, + "flos": 653625510912.0, + "grad_norm": 0.038840192804800056, + "language_loss": 0.93513083, + "learning_rate": 0.0009905213654454216, + "loss": 0.94620228, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 0.41455078, + "step": 469, + "time_per_iteration": 2.9024641513824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105337, + "balance_loss_mlp": 1.06466317, + "epoch": 0.09041939207387456, + "flos": 618186608640.0, + "grad_norm": 0.04985478927164425, + "language_loss": 1.01848495, + "learning_rate": 0.0009904608959673158, + "loss": 1.02953827, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 0.40649414, + "step": 470, + "time_per_iteration": 2.7711682319641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097659, + "balance_loss_mlp": 1.0588448, + "epoch": 0.09061177375913813, + "flos": 454368724992.0, + "grad_norm": 0.04989175862356038, + "language_loss": 1.02851224, + "learning_rate": 0.000990400236075403, + "loss": 1.03948903, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 0.38793945, + "step": 471, + "time_per_iteration": 2.536189317703247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109095, + "balance_loss_mlp": 1.05113411, + "epoch": 0.0908041554444017, + "flos": 544247299584.0, + "grad_norm": 0.03738902964718639, + "language_loss": 0.98994756, + "learning_rate": 0.0009903393857932338, + "loss": 1.000857, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 0.39794922, + "step": 472, + "time_per_iteration": 2.6588857173919678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097802, + "balance_loss_mlp": 1.05908275, + "epoch": 0.09099653712966525, + "flos": 564335009280.0, + "grad_norm": 0.045733529486957185, + "language_loss": 0.97091877, + "learning_rate": 0.0009902783451444317, + "loss": 0.98189688, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 0.38720703, + "step": 473, + "time_per_iteration": 2.6981122493743896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091239, + "balance_loss_mlp": 1.05406976, + "epoch": 0.09118891881492881, + "flos": 474540498432.0, + "grad_norm": 0.04942472768420212, + "language_loss": 1.00819659, + "learning_rate": 0.0009902171141526956, + "loss": 1.01910901, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 0.37158203, + "step": 474, + "time_per_iteration": 2.527256727218628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099497, + "balance_loss_mlp": 1.06225586, + "epoch": 0.09138130050019239, + "flos": 545860643328.0, + "grad_norm": 0.04275448033987936, + "language_loss": 0.88210893, + "learning_rate": 0.000990155692841797, + "loss": 0.8931039, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 0.37231445, + "step": 475, + "time_per_iteration": 2.989063262939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097088, + "balance_loss_mlp": 1.06084871, + "epoch": 0.09157368218545595, + "flos": 732711744000.0, + "grad_norm": 0.04412440376655801, + "language_loss": 1.00229144, + "learning_rate": 0.0009900940812355818, + "loss": 1.01326227, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 0.36254883, + "step": 476, + "time_per_iteration": 2.8778445720672607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105736, + "balance_loss_mlp": 1.07011676, + "epoch": 0.0917660638707195, + "flos": 610981802496.0, + "grad_norm": 0.06417087981964828, + "language_loss": 0.97168529, + "learning_rate": 0.00099003227935797, + "loss": 0.98274267, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 0.35620117, + "step": 477, + "time_per_iteration": 2.708608627319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101416, + "balance_loss_mlp": 1.06369829, + "epoch": 0.09195844555598306, + "flos": 655851893760.0, + "grad_norm": 0.06707216335576115, + "language_loss": 1.01291215, + "learning_rate": 0.000989970287232955, + "loss": 1.02392626, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 0.37695312, + "step": 478, + "time_per_iteration": 2.783325672149658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090795, + "balance_loss_mlp": 1.05431736, + "epoch": 0.09215082724124664, + "flos": 476578930176.0, + "grad_norm": 0.05564878549890474, + "language_loss": 0.9726451, + "learning_rate": 0.0009899081048846043, + "loss": 0.98355305, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 0.36474609, + "step": 479, + "time_per_iteration": 2.6017916202545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097049, + "balance_loss_mlp": 1.05964088, + "epoch": 0.0923432089265102, + "flos": 524305322496.0, + "grad_norm": 0.06044394784495309, + "language_loss": 1.03484094, + "learning_rate": 0.0009898457323370593, + "loss": 1.04581141, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 0.37402344, + "step": 480, + "time_per_iteration": 2.575676918029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091173, + "balance_loss_mlp": 1.0533123, + "epoch": 0.09253559061177376, + "flos": 545569178112.0, + "grad_norm": 0.05778783373137127, + "language_loss": 0.99753714, + "learning_rate": 0.000989783169614535, + "loss": 1.00844884, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 0.37817383, + "step": 481, + "time_per_iteration": 2.646942615509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283887, + "balance_loss_mlp": 1.15876544, + "epoch": 0.09272797229703732, + "flos": 1538042370048.0, + "grad_norm": 0.01956789957612316, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.80036646, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 1.25, + "step": 482, + "time_per_iteration": 4.860741376876831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098828, + "balance_loss_mlp": 1.06158745, + "epoch": 0.09292035398230089, + "flos": 689813273088.0, + "grad_norm": 0.06801501049369231, + "language_loss": 0.97102278, + "learning_rate": 0.000989657473741779, + "loss": 0.98201108, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 0.37231445, + "step": 483, + "time_per_iteration": 2.819138526916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095911, + "balance_loss_mlp": 1.05979109, + "epoch": 0.09311273566756445, + "flos": 509749604352.0, + "grad_norm": 0.038333848574242754, + "language_loss": 0.98462784, + "learning_rate": 0.0009895943406403465, + "loss": 0.99558693, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 0.36132812, + "step": 484, + "time_per_iteration": 2.7088170051574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103881, + "balance_loss_mlp": 1.06854701, + "epoch": 0.09330511735282801, + "flos": 659404924416.0, + "grad_norm": 0.05828015098596693, + "language_loss": 0.92231822, + "learning_rate": 0.0009895310174615338, + "loss": 0.933357, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 0.35351562, + "step": 485, + "time_per_iteration": 2.760511636734009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265799, + "balance_loss_mlp": 1.14983261, + "epoch": 0.09349749903809157, + "flos": 1452845984256.0, + "grad_norm": 0.018538812380254305, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.76984316, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 1.15625, + "step": 486, + "time_per_iteration": 4.656491994857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105382, + "balance_loss_mlp": 1.0699296, + "epoch": 0.09368988072335514, + "flos": 520870860288.0, + "grad_norm": 0.04721263549483299, + "language_loss": 0.95839012, + "learning_rate": 0.0009894038009701782, + "loss": 0.96944392, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 0.35498047, + "step": 487, + "time_per_iteration": 2.6169629096984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103683, + "balance_loss_mlp": 1.06868315, + "epoch": 0.0938822624086187, + "flos": 497751381504.0, + "grad_norm": 0.05102581257360949, + "language_loss": 0.98848963, + "learning_rate": 0.0009893399077070253, + "loss": 0.99952644, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 0.3503418, + "step": 488, + "time_per_iteration": 2.5845744609832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107076, + "balance_loss_mlp": 1.07193291, + "epoch": 0.09407464409388226, + "flos": 533202766848.0, + "grad_norm": 0.05918319403016569, + "language_loss": 0.92944884, + "learning_rate": 0.0009892758244652718, + "loss": 0.94051951, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 0.35180664, + "step": 489, + "time_per_iteration": 2.660200357437134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091731, + "balance_loss_mlp": 1.05801892, + "epoch": 0.09426702577914582, + "flos": 586006700544.0, + "grad_norm": 0.041386989889926534, + "language_loss": 1.00010514, + "learning_rate": 0.0009892115512697968, + "loss": 1.01102245, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 0.33740234, + "step": 490, + "time_per_iteration": 2.6571907997131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108998, + "balance_loss_mlp": 1.05631554, + "epoch": 0.0944594074644094, + "flos": 503357524992.0, + "grad_norm": 0.04182034264497562, + "language_loss": 1.00108159, + "learning_rate": 0.0009891470881455537, + "loss": 1.01198137, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 0.33666992, + "step": 491, + "time_per_iteration": 2.746169328689575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108736, + "balance_loss_mlp": 1.05319476, + "epoch": 0.09465178914967295, + "flos": 571021125120.0, + "grad_norm": 0.0458284589248403, + "language_loss": 0.98654628, + "learning_rate": 0.0009890824351175692, + "loss": 0.99741989, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 0.34204102, + "step": 492, + "time_per_iteration": 2.665170431137085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090612, + "balance_loss_mlp": 1.05654192, + "epoch": 0.09484417083493651, + "flos": 549361916928.0, + "grad_norm": 0.041327442652051224, + "language_loss": 1.0219661, + "learning_rate": 0.0009890175922109435, + "loss": 1.0328722, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 0.34082031, + "step": 493, + "time_per_iteration": 2.6482973098754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010971, + "balance_loss_mlp": 1.06086028, + "epoch": 0.09503655252020007, + "flos": 823894944768.0, + "grad_norm": 0.06926989533772566, + "language_loss": 1.01090789, + "learning_rate": 0.0009889525594508513, + "loss": 1.02187896, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 0.36254883, + "step": 494, + "time_per_iteration": 3.0095505714416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092013, + "balance_loss_mlp": 1.05596447, + "epoch": 0.09522893420546363, + "flos": 404621153280.0, + "grad_norm": 0.04986765426945594, + "language_loss": 0.94310975, + "learning_rate": 0.0009888873368625404, + "loss": 0.95402986, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 0.3605957, + "step": 495, + "time_per_iteration": 2.5451042652130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089381, + "balance_loss_mlp": 1.05426204, + "epoch": 0.0954213158907272, + "flos": 691016583168.0, + "grad_norm": 0.05650320770937666, + "language_loss": 0.98877072, + "learning_rate": 0.0009888219244713326, + "loss": 0.99966443, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 0.3515625, + "step": 496, + "time_per_iteration": 2.8157310485839844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086342, + "balance_loss_mlp": 1.05100799, + "epoch": 0.09561369757599077, + "flos": 519005325312.0, + "grad_norm": 0.05039739829653265, + "language_loss": 0.99588835, + "learning_rate": 0.0009887563223026229, + "loss": 1.00675178, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 0.35375977, + "step": 497, + "time_per_iteration": 2.6563401222229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244906, + "balance_loss_mlp": 1.14648652, + "epoch": 0.09580607926125433, + "flos": 1385614812672.0, + "grad_norm": 0.01649790273231252, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.80313075, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 0.98046875, + "step": 498, + "time_per_iteration": 4.8689799308776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098776, + "balance_loss_mlp": 1.0630604, + "epoch": 0.09599846094651789, + "flos": 717436901376.0, + "grad_norm": 0.06260101269903841, + "language_loss": 0.97272921, + "learning_rate": 0.0009886245487346482, + "loss": 0.98371696, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 0.35742188, + "step": 499, + "time_per_iteration": 3.0292818546295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117577, + "balance_loss_mlp": 1.08159947, + "epoch": 0.09619084263178146, + "flos": 386038130688.0, + "grad_norm": 0.055723050712230264, + "language_loss": 1.00704551, + "learning_rate": 0.0009885583773865422, + "loss": 1.01822114, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 0.35986328, + "step": 500, + "time_per_iteration": 2.395846366882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117756, + "balance_loss_mlp": 1.08137345, + "epoch": 0.09638322431704502, + "flos": 534129292800.0, + "grad_norm": 0.06268683986847115, + "language_loss": 0.9714855, + "learning_rate": 0.0009884920163632524, + "loss": 0.98266304, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 0.36352539, + "step": 501, + "time_per_iteration": 2.666341781616211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111747, + "balance_loss_mlp": 1.07638931, + "epoch": 0.09657560600230858, + "flos": 500671922688.0, + "grad_norm": 0.04553274405873497, + "language_loss": 1.01245189, + "learning_rate": 0.000988425465690543, + "loss": 1.02356935, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 0.35375977, + "step": 502, + "time_per_iteration": 2.55082106590271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103127, + "balance_loss_mlp": 1.06867552, + "epoch": 0.09676798768757214, + "flos": 529261724160.0, + "grad_norm": 0.04373339165225573, + "language_loss": 0.99427342, + "learning_rate": 0.0009883587253942505, + "loss": 1.00530469, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 0.34472656, + "step": 503, + "time_per_iteration": 2.7674455642700195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108284, + "balance_loss_mlp": 1.07378531, + "epoch": 0.09696036937283571, + "flos": 463614534144.0, + "grad_norm": 0.051161986083573203, + "language_loss": 1.04393589, + "learning_rate": 0.0009882917955002862, + "loss": 1.05501866, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 0.3449707, + "step": 504, + "time_per_iteration": 2.549203872680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105617, + "balance_loss_mlp": 1.07116556, + "epoch": 0.09715275105809927, + "flos": 534974326272.0, + "grad_norm": 0.04840022534917253, + "language_loss": 0.95342839, + "learning_rate": 0.0009882246760346343, + "loss": 0.96448457, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 0.3449707, + "step": 505, + "time_per_iteration": 2.653627872467041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115925, + "balance_loss_mlp": 1.08128262, + "epoch": 0.09734513274336283, + "flos": 454946886144.0, + "grad_norm": 0.08271599518488834, + "language_loss": 1.02799106, + "learning_rate": 0.0009881573670233533, + "loss": 1.03915036, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 0.34692383, + "step": 506, + "time_per_iteration": 2.5279319286346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104761, + "balance_loss_mlp": 1.07061946, + "epoch": 0.09753751442862639, + "flos": 508805826048.0, + "grad_norm": 0.05291653517072512, + "language_loss": 0.96169406, + "learning_rate": 0.0009880898684925747, + "loss": 0.97274166, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 0.34179688, + "step": 507, + "time_per_iteration": 2.648574113845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095827, + "balance_loss_mlp": 1.06039834, + "epoch": 0.09772989611388996, + "flos": 484273064448.0, + "grad_norm": 0.053809005456099755, + "language_loss": 0.94680405, + "learning_rate": 0.0009880221804685037, + "loss": 0.95776224, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 0.35424805, + "step": 508, + "time_per_iteration": 2.529299736022949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245061, + "balance_loss_mlp": 1.15503371, + "epoch": 0.09792227779915352, + "flos": 1566106140672.0, + "grad_norm": 0.024665830319341657, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.80589479, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 0.8984375, + "step": 509, + "time_per_iteration": 4.705655574798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094606, + "balance_loss_mlp": 1.05932045, + "epoch": 0.09811465948441708, + "flos": 587805424128.0, + "grad_norm": 0.06644626598388864, + "language_loss": 1.02131915, + "learning_rate": 0.0009878862360456733, + "loss": 1.03226519, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 0.35327148, + "step": 510, + "time_per_iteration": 2.682035446166992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097961, + "balance_loss_mlp": 1.06336641, + "epoch": 0.09830704116968064, + "flos": 613000410624.0, + "grad_norm": 0.06543943311749917, + "language_loss": 0.9266718, + "learning_rate": 0.0009878179796996922, + "loss": 0.9376514, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 0.34619141, + "step": 511, + "time_per_iteration": 2.6972057819366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105256, + "balance_loss_mlp": 1.07030368, + "epoch": 0.09849942285494422, + "flos": 538808910336.0, + "grad_norm": 0.054213046356477584, + "language_loss": 0.96428764, + "learning_rate": 0.0009877495339659754, + "loss": 0.97534013, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 0.34985352, + "step": 512, + "time_per_iteration": 2.746337413787842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105714, + "balance_loss_mlp": 1.07190621, + "epoch": 0.09869180454020778, + "flos": 620474660352.0, + "grad_norm": 0.0573170093193853, + "language_loss": 0.91841626, + "learning_rate": 0.000987680898871096, + "loss": 0.9294734, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 0.33837891, + "step": 513, + "time_per_iteration": 2.7060482501983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110186, + "balance_loss_mlp": 1.07675993, + "epoch": 0.09888418622547133, + "flos": 811711342080.0, + "grad_norm": 0.0786420176645203, + "language_loss": 0.95400196, + "learning_rate": 0.0009876120744417, + "loss": 0.96510386, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 0.33447266, + "step": 514, + "time_per_iteration": 2.9473536014556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105767, + "balance_loss_mlp": 1.07071972, + "epoch": 0.0990765679107349, + "flos": 535809447936.0, + "grad_norm": 0.04861145683213968, + "language_loss": 1.01586378, + "learning_rate": 0.0009875430607045078, + "loss": 1.02692139, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 0.35058594, + "step": 515, + "time_per_iteration": 2.6745734214782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095325, + "balance_loss_mlp": 1.06044412, + "epoch": 0.09926894959599845, + "flos": 587879576064.0, + "grad_norm": 0.061184004848699555, + "language_loss": 0.96467805, + "learning_rate": 0.000987473857686313, + "loss": 0.97563124, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 0.34887695, + "step": 516, + "time_per_iteration": 2.70771861076355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103454, + "balance_loss_mlp": 1.06909752, + "epoch": 0.09946133128126203, + "flos": 641234506752.0, + "grad_norm": 0.06268031252544905, + "language_loss": 1.01795554, + "learning_rate": 0.0009874044654139824, + "loss": 1.02899015, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 0.34399414, + "step": 517, + "time_per_iteration": 2.7501027584075928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104488, + "balance_loss_mlp": 1.07020378, + "epoch": 0.09965371296652559, + "flos": 465781446144.0, + "grad_norm": 0.05802057466070587, + "language_loss": 1.01047516, + "learning_rate": 0.0009873348839144563, + "loss": 1.02152014, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 0.34301758, + "step": 518, + "time_per_iteration": 2.5247762203216553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125408, + "balance_loss_mlp": 1.09100425, + "epoch": 0.09984609465178915, + "flos": 483603499008.0, + "grad_norm": 0.057276560313135924, + "language_loss": 1.0153054, + "learning_rate": 0.000987265113214749, + "loss": 1.02655947, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 0.34448242, + "step": 519, + "time_per_iteration": 2.569776773452759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151248, + "balance_loss_mlp": 1.11705852, + "epoch": 0.1000384763370527, + "flos": 569029681152.0, + "grad_norm": 0.06886779278024428, + "language_loss": 1.05486548, + "learning_rate": 0.0009871951533419476, + "loss": 1.066378, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 0.34204102, + "step": 520, + "time_per_iteration": 2.646489381790161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155904, + "balance_loss_mlp": 1.12085652, + "epoch": 0.10023085802231628, + "flos": 545796403200.0, + "grad_norm": 0.06947260655531057, + "language_loss": 0.93715644, + "learning_rate": 0.0009871250043232132, + "loss": 0.94871557, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 0.35058594, + "step": 521, + "time_per_iteration": 2.729825258255005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145676, + "balance_loss_mlp": 1.11196363, + "epoch": 0.10042323970757984, + "flos": 503454071808.0, + "grad_norm": 0.05700460680955029, + "language_loss": 0.94319808, + "learning_rate": 0.0009870546661857797, + "loss": 0.95465487, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 0.33740234, + "step": 522, + "time_per_iteration": 2.589205026626587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113939, + "balance_loss_mlp": 1.10572577, + "epoch": 0.1006156213928434, + "flos": 770411533824.0, + "grad_norm": 0.0627280587118585, + "language_loss": 1.04607201, + "learning_rate": 0.0009869841389569553, + "loss": 1.05746591, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 0.33666992, + "step": 523, + "time_per_iteration": 3.007927656173706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112546, + "balance_loss_mlp": 1.07816648, + "epoch": 0.10080800307810696, + "flos": 490030083072.0, + "grad_norm": 0.07025860249961899, + "language_loss": 0.94709289, + "learning_rate": 0.0009869134226641206, + "loss": 0.95821834, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 0.34399414, + "step": 524, + "time_per_iteration": 2.5647661685943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096367, + "balance_loss_mlp": 1.06134343, + "epoch": 0.10100038476337053, + "flos": 454724430336.0, + "grad_norm": 0.0754869647085307, + "language_loss": 0.96719551, + "learning_rate": 0.0009868425173347303, + "loss": 0.97815919, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 0.3503418, + "step": 525, + "time_per_iteration": 2.675762414932251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081637, + "balance_loss_mlp": 1.04816294, + "epoch": 0.10119276644863409, + "flos": 556438242816.0, + "grad_norm": 0.04461045481777941, + "language_loss": 1.01427031, + "learning_rate": 0.0009867714229963125, + "loss": 1.02508664, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 0.3347168, + "step": 526, + "time_per_iteration": 2.7551424503326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101413, + "balance_loss_mlp": 1.06672287, + "epoch": 0.10138514813389765, + "flos": 516235659264.0, + "grad_norm": 0.06519670287778681, + "language_loss": 0.99495387, + "learning_rate": 0.000986700139676468, + "loss": 1.00596797, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 0.34716797, + "step": 527, + "time_per_iteration": 2.5689845085144043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111806, + "balance_loss_mlp": 1.08317983, + "epoch": 0.10157752981916121, + "flos": 500570606592.0, + "grad_norm": 0.055001529425537175, + "language_loss": 0.97175169, + "learning_rate": 0.0009866286674028717, + "loss": 0.98293233, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 0.34936523, + "step": 528, + "time_per_iteration": 2.6308236122131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118149, + "balance_loss_mlp": 1.08307743, + "epoch": 0.10176991150442478, + "flos": 656773277184.0, + "grad_norm": 0.06791274268555884, + "language_loss": 0.93964088, + "learning_rate": 0.0009865570062032717, + "loss": 0.95082229, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 0.35083008, + "step": 529, + "time_per_iteration": 2.931939125061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117806, + "balance_loss_mlp": 1.08104193, + "epoch": 0.10196229318968834, + "flos": 573259617792.0, + "grad_norm": 0.05469252484924326, + "language_loss": 0.97321147, + "learning_rate": 0.0009864851561054893, + "loss": 0.98438954, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 0.36743164, + "step": 530, + "time_per_iteration": 2.75875186920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091703, + "balance_loss_mlp": 1.0567745, + "epoch": 0.1021546748749519, + "flos": 518207279616.0, + "grad_norm": 0.053032092698093954, + "language_loss": 0.97237867, + "learning_rate": 0.0009864131171374191, + "loss": 0.9832958, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 0.34936523, + "step": 531, + "time_per_iteration": 2.671963930130005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109157, + "balance_loss_mlp": 1.05704737, + "epoch": 0.10234705656021546, + "flos": 609766009344.0, + "grad_norm": 0.037042660663456926, + "language_loss": 0.97530323, + "learning_rate": 0.0009863408893270292, + "loss": 0.98621887, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 0.34521484, + "step": 532, + "time_per_iteration": 2.8692965507507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080567, + "balance_loss_mlp": 1.0459249, + "epoch": 0.10253943824547904, + "flos": 601760586240.0, + "grad_norm": 0.045189468397627275, + "language_loss": 0.93818736, + "learning_rate": 0.0009862684727023605, + "loss": 0.94899297, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 0.34692383, + "step": 533, + "time_per_iteration": 2.768873691558838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084569, + "balance_loss_mlp": 1.04978406, + "epoch": 0.1027318199307426, + "flos": 662948043264.0, + "grad_norm": 0.041807858593286534, + "language_loss": 0.94846106, + "learning_rate": 0.0009861958672915283, + "loss": 0.95930672, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 0.34814453, + "step": 534, + "time_per_iteration": 2.7894833087921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088029, + "balance_loss_mlp": 1.05348206, + "epoch": 0.10292420161600616, + "flos": 683275461120.0, + "grad_norm": 0.04113334704287127, + "language_loss": 0.93477535, + "learning_rate": 0.0009861230731227201, + "loss": 0.94565558, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 0.34570312, + "step": 535, + "time_per_iteration": 2.8369100093841553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101084, + "balance_loss_mlp": 1.06589389, + "epoch": 0.10311658330126972, + "flos": 490287043584.0, + "grad_norm": 0.06472741174466715, + "language_loss": 0.9716177, + "learning_rate": 0.0009860500902241973, + "loss": 0.98262858, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 0.35205078, + "step": 536, + "time_per_iteration": 2.6308608055114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100072, + "balance_loss_mlp": 1.06559658, + "epoch": 0.10330896498653329, + "flos": 431743343616.0, + "grad_norm": 0.06015330648509861, + "language_loss": 1.02488375, + "learning_rate": 0.0009859769186242942, + "loss": 1.0358845, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 0.34521484, + "step": 537, + "time_per_iteration": 2.4846572875976562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094076, + "balance_loss_mlp": 1.06188989, + "epoch": 0.10350134667179685, + "flos": 549591713280.0, + "grad_norm": 0.04182272700248836, + "language_loss": 0.96166039, + "learning_rate": 0.0009859035583514187, + "loss": 0.97260106, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 0.32177734, + "step": 538, + "time_per_iteration": 2.665483236312866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107546, + "balance_loss_mlp": 1.07497787, + "epoch": 0.10369372835706041, + "flos": 640626610176.0, + "grad_norm": 0.03728554890083732, + "language_loss": 0.9932602, + "learning_rate": 0.0009858300094340517, + "loss": 1.00433564, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 0.32568359, + "step": 539, + "time_per_iteration": 2.772207021713257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110889, + "balance_loss_mlp": 1.07908368, + "epoch": 0.10388611004232397, + "flos": 521752969728.0, + "grad_norm": 0.05284254114338104, + "language_loss": 0.91679931, + "learning_rate": 0.0009857562719007473, + "loss": 0.92790818, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 0.31787109, + "step": 540, + "time_per_iteration": 2.633002519607544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110116, + "balance_loss_mlp": 1.06964111, + "epoch": 0.10407849172758753, + "flos": 702436644864.0, + "grad_norm": 0.07454941449424961, + "language_loss": 0.93962657, + "learning_rate": 0.0009856823457801331, + "loss": 0.95063812, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 0.31494141, + "step": 541, + "time_per_iteration": 2.888354539871216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098965, + "balance_loss_mlp": 1.06682634, + "epoch": 0.1042708734128511, + "flos": 502910415360.0, + "grad_norm": 0.06016078646373104, + "language_loss": 1.01014686, + "learning_rate": 0.00098560823110091, + "loss": 1.02113652, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 0.32128906, + "step": 542, + "time_per_iteration": 2.612365484237671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088807, + "balance_loss_mlp": 1.05664408, + "epoch": 0.10446325509811466, + "flos": 485592371712.0, + "grad_norm": 0.07331709746631812, + "language_loss": 0.99634022, + "learning_rate": 0.000985533927891851, + "loss": 1.00722837, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 0.3215332, + "step": 543, + "time_per_iteration": 2.6642584800720215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087349, + "balance_loss_mlp": 1.05406535, + "epoch": 0.10465563678337822, + "flos": 568634328576.0, + "grad_norm": 0.07406485241554656, + "language_loss": 0.99318308, + "learning_rate": 0.0009854594361818044, + "loss": 1.00405657, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 0.33300781, + "step": 544, + "time_per_iteration": 2.650541067123413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087044, + "balance_loss_mlp": 1.05357027, + "epoch": 0.10484801846864178, + "flos": 626093286912.0, + "grad_norm": 0.05515562757052397, + "language_loss": 0.98072803, + "learning_rate": 0.0009853847559996897, + "loss": 0.99159849, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 0.3347168, + "step": 545, + "time_per_iteration": 2.7268693447113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098973, + "balance_loss_mlp": 1.0640682, + "epoch": 0.10504040015390535, + "flos": 743412681216.0, + "grad_norm": 0.05014767442192859, + "language_loss": 0.9781934, + "learning_rate": 0.0009853098873745, + "loss": 0.98918307, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 0.34936523, + "step": 546, + "time_per_iteration": 3.001844644546509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094885, + "balance_loss_mlp": 1.06010008, + "epoch": 0.10523278183916891, + "flos": 586673694720.0, + "grad_norm": 0.06665960072991474, + "language_loss": 0.96499509, + "learning_rate": 0.0009852348303353027, + "loss": 0.97594392, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 0.34814453, + "step": 547, + "time_per_iteration": 2.7768120765686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109085, + "balance_loss_mlp": 1.05692363, + "epoch": 0.10542516352443247, + "flos": 869644574208.0, + "grad_norm": 0.04477171592325676, + "language_loss": 0.89746928, + "learning_rate": 0.000985159584911237, + "loss": 0.90837783, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 0.33959961, + "step": 548, + "time_per_iteration": 3.1397063732147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109064, + "balance_loss_mlp": 1.0567131, + "epoch": 0.10561754520969603, + "flos": 505428263424.0, + "grad_norm": 0.057455808878804256, + "language_loss": 0.97617745, + "learning_rate": 0.0009850841511315162, + "loss": 0.98708391, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 0.33959961, + "step": 549, + "time_per_iteration": 2.6143858432769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090343, + "balance_loss_mlp": 1.05660701, + "epoch": 0.1058099268949596, + "flos": 559981361664.0, + "grad_norm": 0.04134640300819554, + "language_loss": 0.97230792, + "learning_rate": 0.0009850085290254256, + "loss": 0.98321134, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 0.33740234, + "step": 550, + "time_per_iteration": 2.784057855606079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108804, + "balance_loss_mlp": 1.05478084, + "epoch": 0.10600230858022316, + "flos": 562049528832.0, + "grad_norm": 0.041486348142279396, + "language_loss": 0.9340632, + "learning_rate": 0.0009849327186223246, + "loss": 0.94494367, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 0.33276367, + "step": 551, + "time_per_iteration": 2.822755813598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086641, + "balance_loss_mlp": 1.0536921, + "epoch": 0.10619469026548672, + "flos": 494326831104.0, + "grad_norm": 0.044652358506572586, + "language_loss": 1.00453854, + "learning_rate": 0.000984856719951646, + "loss": 1.01540482, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 0.32958984, + "step": 552, + "time_per_iteration": 2.561384439468384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088507, + "balance_loss_mlp": 1.05577254, + "epoch": 0.10638707195075028, + "flos": 676166828544.0, + "grad_norm": 0.05595352831954139, + "language_loss": 0.98322356, + "learning_rate": 0.0009847805330428943, + "loss": 0.99410868, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 0.32739258, + "step": 553, + "time_per_iteration": 2.8988356590270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082477, + "balance_loss_mlp": 1.04940784, + "epoch": 0.10657945363601386, + "flos": 488055891456.0, + "grad_norm": 0.05618387686115577, + "language_loss": 1.02895415, + "learning_rate": 0.0009847041579256481, + "loss": 1.03977895, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 0.33081055, + "step": 554, + "time_per_iteration": 2.567885398864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088152, + "balance_loss_mlp": 1.05548859, + "epoch": 0.10677183532127742, + "flos": 482958526464.0, + "grad_norm": 0.04459262579832553, + "language_loss": 0.99802542, + "learning_rate": 0.0009846275946295592, + "loss": 1.00890684, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 0.32641602, + "step": 555, + "time_per_iteration": 2.6283833980560303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108533, + "balance_loss_mlp": 1.05347764, + "epoch": 0.10696421700654098, + "flos": 656249444352.0, + "grad_norm": 0.04108965909817336, + "language_loss": 0.92502242, + "learning_rate": 0.0009845508431843518, + "loss": 0.93587577, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 0.31835938, + "step": 556, + "time_per_iteration": 3.0189473628997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087957, + "balance_loss_mlp": 1.05612838, + "epoch": 0.10715659869180454, + "flos": 567744878592.0, + "grad_norm": 0.05029379164990677, + "language_loss": 0.95060432, + "learning_rate": 0.0009844739036198233, + "loss": 0.96148396, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 0.31811523, + "step": 557, + "time_per_iteration": 2.6461007595062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096142, + "balance_loss_mlp": 1.06340766, + "epoch": 0.10734898037706811, + "flos": 540694268928.0, + "grad_norm": 0.047100661757994676, + "language_loss": 1.0152961, + "learning_rate": 0.0009843967759658448, + "loss": 1.02625763, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 0.32739258, + "step": 558, + "time_per_iteration": 2.6677682399749756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264894, + "balance_loss_mlp": 1.19775486, + "epoch": 0.10754136206233167, + "flos": 1476640171008.0, + "grad_norm": 0.03689581784010691, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.74032652, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 0.671875, + "step": 559, + "time_per_iteration": 4.873044013977051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105652, + "balance_loss_mlp": 1.07234466, + "epoch": 0.10773374374759523, + "flos": 512405844480.0, + "grad_norm": 0.06480790167761245, + "language_loss": 1.01098323, + "learning_rate": 0.000984241956509384, + "loss": 1.02203977, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 0.33325195, + "step": 560, + "time_per_iteration": 2.655430555343628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095265, + "balance_loss_mlp": 1.0617907, + "epoch": 0.10792612543285879, + "flos": 496503654912.0, + "grad_norm": 0.05361377514900226, + "language_loss": 1.00074768, + "learning_rate": 0.0009841642647670078, + "loss": 1.01170027, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 0.33496094, + "step": 561, + "time_per_iteration": 2.5627329349517822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089424, + "balance_loss_mlp": 1.05633116, + "epoch": 0.10811850711812235, + "flos": 735471498240.0, + "grad_norm": 0.04993888185520414, + "language_loss": 0.93071151, + "learning_rate": 0.0009840863850553944, + "loss": 0.94160575, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 0.33105469, + "step": 562, + "time_per_iteration": 3.0020592212677 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108807, + "balance_loss_mlp": 1.05686092, + "epoch": 0.10831088880338592, + "flos": 611540140032.0, + "grad_norm": 0.046287089248472475, + "language_loss": 0.97956204, + "learning_rate": 0.0009840083174047782, + "loss": 0.99044275, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 0.31176758, + "step": 563, + "time_per_iteration": 2.7123258113861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093988, + "balance_loss_mlp": 1.06275535, + "epoch": 0.10850327048864948, + "flos": 556317103104.0, + "grad_norm": 0.036863902598139514, + "language_loss": 0.91394317, + "learning_rate": 0.0009839300618454685, + "loss": 0.92488301, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 0.31176758, + "step": 564, + "time_per_iteration": 2.855482578277588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086433, + "balance_loss_mlp": 1.05386496, + "epoch": 0.10869565217391304, + "flos": 603208373760.0, + "grad_norm": 0.0447892393855046, + "language_loss": 0.97269231, + "learning_rate": 0.0009838516184078466, + "loss": 0.98355657, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 0.32568359, + "step": 565, + "time_per_iteration": 2.8027093410491943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090742, + "balance_loss_mlp": 1.05881739, + "epoch": 0.1088880338591766, + "flos": 526178198016.0, + "grad_norm": 0.039430635834492286, + "language_loss": 0.95326865, + "learning_rate": 0.0009837729871223669, + "loss": 0.964176, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 0.3190918, + "step": 566, + "time_per_iteration": 2.621044158935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097443, + "balance_loss_mlp": 1.06473231, + "epoch": 0.10908041554444017, + "flos": 620272028160.0, + "grad_norm": 0.03524126234366562, + "language_loss": 0.96988255, + "learning_rate": 0.0009836941680195568, + "loss": 0.98085701, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 0.32714844, + "step": 567, + "time_per_iteration": 2.8241846561431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095182, + "balance_loss_mlp": 1.06359148, + "epoch": 0.10927279722970373, + "flos": 898125719040.0, + "grad_norm": 0.05940738915226433, + "language_loss": 0.94011569, + "learning_rate": 0.0009836151611300166, + "loss": 0.95106757, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 0.31567383, + "step": 568, + "time_per_iteration": 3.2259325981140137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093362, + "balance_loss_mlp": 1.06327355, + "epoch": 0.10946517891496729, + "flos": 528666310656.0, + "grad_norm": 0.04952949609465528, + "language_loss": 1.01886261, + "learning_rate": 0.0009835359664844194, + "loss": 1.02979624, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 0.30029297, + "step": 569, + "time_per_iteration": 2.61936616897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235986, + "balance_loss_mlp": 1.17113578, + "epoch": 0.10965756060023085, + "flos": 1560751815168.0, + "grad_norm": 0.02580255803672051, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.82272792, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 0.6484375, + "step": 570, + "time_per_iteration": 4.946800470352173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102328, + "balance_loss_mlp": 1.06947398, + "epoch": 0.10984994228549443, + "flos": 513075409920.0, + "grad_norm": 0.04088785760268294, + "language_loss": 0.98121774, + "learning_rate": 0.0009833770140481118, + "loss": 0.99224108, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 0.32861328, + "step": 571, + "time_per_iteration": 2.6676580905914307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103589, + "balance_loss_mlp": 1.07113993, + "epoch": 0.11004232397075799, + "flos": 954705139200.0, + "grad_norm": 0.04146527084622454, + "language_loss": 0.88084227, + "learning_rate": 0.000983297256319112, + "loss": 0.89187813, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 0.32446289, + "step": 572, + "time_per_iteration": 3.1977450847625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098726, + "balance_loss_mlp": 1.06503749, + "epoch": 0.11023470565602154, + "flos": 488181800448.0, + "grad_norm": 0.11112801331440751, + "language_loss": 0.93675387, + "learning_rate": 0.000983217310957477, + "loss": 0.94774115, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 0.33691406, + "step": 573, + "time_per_iteration": 2.771477222442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118998, + "balance_loss_mlp": 1.08530974, + "epoch": 0.1104270873412851, + "flos": 655814817792.0, + "grad_norm": 0.046936313049011164, + "language_loss": 0.98079342, + "learning_rate": 0.000983137177994244, + "loss": 0.99198341, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 0.3371582, + "step": 574, + "time_per_iteration": 2.842641830444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127826, + "balance_loss_mlp": 1.0945909, + "epoch": 0.11061946902654868, + "flos": 723426287616.0, + "grad_norm": 0.047970587572460185, + "language_loss": 0.91368234, + "learning_rate": 0.0009830568574605235, + "loss": 0.92496061, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 0.33227539, + "step": 575, + "time_per_iteration": 2.9841148853302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136053, + "balance_loss_mlp": 1.10260296, + "epoch": 0.11081185071181224, + "flos": 835463310336.0, + "grad_norm": 0.06212944390612344, + "language_loss": 0.95608473, + "learning_rate": 0.0009829763493874992, + "loss": 0.96744525, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 0.3347168, + "step": 576, + "time_per_iteration": 3.094599485397339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122949, + "balance_loss_mlp": 1.08918953, + "epoch": 0.1110042323970758, + "flos": 609076620288.0, + "grad_norm": 0.040009357062280086, + "language_loss": 1.0022918, + "learning_rate": 0.0009828956538064264, + "loss": 1.01352131, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 0.33764648, + "step": 577, + "time_per_iteration": 2.7913765907287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128001, + "balance_loss_mlp": 1.09428823, + "epoch": 0.11119661408233936, + "flos": 595922075136.0, + "grad_norm": 0.07834189266391174, + "language_loss": 0.97103804, + "learning_rate": 0.0009828147707486344, + "loss": 0.98231804, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 0.33740234, + "step": 578, + "time_per_iteration": 2.6967506408691406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099492, + "balance_loss_mlp": 1.0659467, + "epoch": 0.11138899576760293, + "flos": 555835488768.0, + "grad_norm": 0.066476002167881, + "language_loss": 0.94244707, + "learning_rate": 0.0009827337002455245, + "loss": 0.95344198, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 0.33544922, + "step": 579, + "time_per_iteration": 2.6212143898010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010909, + "balance_loss_mlp": 1.05940461, + "epoch": 0.11158137745286649, + "flos": 689746461696.0, + "grad_norm": 0.0598380025645264, + "language_loss": 0.93403691, + "learning_rate": 0.0009826524423285712, + "loss": 0.94494587, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 0.31469727, + "step": 580, + "time_per_iteration": 2.916363000869751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086732, + "balance_loss_mlp": 1.05466461, + "epoch": 0.11177375913813005, + "flos": 763011436032.0, + "grad_norm": 0.051352596452175936, + "language_loss": 0.95457065, + "learning_rate": 0.0009825709970293218, + "loss": 0.96543789, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 0.32055664, + "step": 581, + "time_per_iteration": 2.975459575653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094975, + "balance_loss_mlp": 1.06414759, + "epoch": 0.11196614082339361, + "flos": 806574329856.0, + "grad_norm": 0.06330579048660655, + "language_loss": 1.01360774, + "learning_rate": 0.0009824893643793956, + "loss": 1.02455735, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 0.30810547, + "step": 582, + "time_per_iteration": 3.0850436687469482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109989, + "balance_loss_mlp": 1.06772757, + "epoch": 0.11215852250865718, + "flos": 558624978432.0, + "grad_norm": 0.05517621871728721, + "language_loss": 0.96568394, + "learning_rate": 0.0009824075444104857, + "loss": 0.9766829, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 0.3215332, + "step": 583, + "time_per_iteration": 2.7017738819122314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104353, + "balance_loss_mlp": 1.07214284, + "epoch": 0.11235090419392074, + "flos": 513572078592.0, + "grad_norm": 0.05273776870459213, + "language_loss": 1.00669086, + "learning_rate": 0.000982325537154357, + "loss": 1.01773441, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 0.32202148, + "step": 584, + "time_per_iteration": 2.566066265106201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109419, + "balance_loss_mlp": 1.07768583, + "epoch": 0.1125432858791843, + "flos": 491453277696.0, + "grad_norm": 0.05755454669423396, + "language_loss": 1.01869726, + "learning_rate": 0.0009822433426428484, + "loss": 1.02979159, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 0.31713867, + "step": 585, + "time_per_iteration": 2.611968994140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122157, + "balance_loss_mlp": 1.08987498, + "epoch": 0.11273566756444786, + "flos": 510725689344.0, + "grad_norm": 0.06034275506000564, + "language_loss": 0.93750811, + "learning_rate": 0.0009821609609078697, + "loss": 0.94872963, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 0.32275391, + "step": 586, + "time_per_iteration": 2.584847927093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104971, + "balance_loss_mlp": 1.0726887, + "epoch": 0.11292804924971142, + "flos": 622446280704.0, + "grad_norm": 0.06416707827025614, + "language_loss": 0.95279968, + "learning_rate": 0.0009820783919814045, + "loss": 0.96384937, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 0.32275391, + "step": 587, + "time_per_iteration": 2.7885184288024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096142, + "balance_loss_mlp": 1.06359744, + "epoch": 0.113120430934975, + "flos": 478056453120.0, + "grad_norm": 0.049104346633589514, + "language_loss": 0.92135406, + "learning_rate": 0.0009819956358955095, + "loss": 0.93231547, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 0.32543945, + "step": 588, + "time_per_iteration": 2.560117483139038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086794, + "balance_loss_mlp": 1.05427432, + "epoch": 0.11331281262023855, + "flos": 467039084544.0, + "grad_norm": 0.05114307144868452, + "language_loss": 0.93675017, + "learning_rate": 0.0009819126926823127, + "loss": 0.94761813, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 0.32519531, + "step": 589, + "time_per_iteration": 2.517035722732544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093229, + "balance_loss_mlp": 1.05966008, + "epoch": 0.11350519430550211, + "flos": 650453151744.0, + "grad_norm": 0.04613241529975588, + "language_loss": 0.94437975, + "learning_rate": 0.000981829562374016, + "loss": 0.95531201, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 0.33569336, + "step": 590, + "time_per_iteration": 2.8174262046813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093153, + "balance_loss_mlp": 1.05913091, + "epoch": 0.11369757599076567, + "flos": 557809680384.0, + "grad_norm": 0.05348492004263644, + "language_loss": 1.04949331, + "learning_rate": 0.0009817462450028933, + "loss": 1.0604248, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 0.34057617, + "step": 591, + "time_per_iteration": 2.6302859783172607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101242, + "balance_loss_mlp": 1.0668143, + "epoch": 0.11388995767602925, + "flos": 571080222720.0, + "grad_norm": 0.2030818500746725, + "language_loss": 0.92329478, + "learning_rate": 0.0009816627406012916, + "loss": 0.93430716, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 0.34472656, + "step": 592, + "time_per_iteration": 2.8384313583374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135985, + "balance_loss_mlp": 1.09943521, + "epoch": 0.1140823393612928, + "flos": 740403307008.0, + "grad_norm": 0.0774704650100976, + "language_loss": 0.91851664, + "learning_rate": 0.0009815790492016295, + "loss": 0.92987645, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 0.36523438, + "step": 593, + "time_per_iteration": 2.9409682750701904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136834, + "balance_loss_mlp": 1.10192943, + "epoch": 0.11427472104655637, + "flos": 699004753920.0, + "grad_norm": 0.09332707993556091, + "language_loss": 0.94690275, + "learning_rate": 0.0009814951708363993, + "loss": 0.95827115, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 0.34912109, + "step": 594, + "time_per_iteration": 2.8599631786346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221657, + "balance_loss_mlp": 1.16023993, + "epoch": 0.11446710273181993, + "flos": 1477178684928.0, + "grad_norm": 0.030934197408724044, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79212642, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 0.61328125, + "step": 595, + "time_per_iteration": 4.801583766937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137315, + "balance_loss_mlp": 1.10138512, + "epoch": 0.1146594844170835, + "flos": 494895080448.0, + "grad_norm": 0.0746127254366864, + "language_loss": 0.94972038, + "learning_rate": 0.0009813268533395648, + "loss": 0.96109354, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 0.359375, + "step": 596, + "time_per_iteration": 2.6236753463745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123077, + "balance_loss_mlp": 1.0882678, + "epoch": 0.11485186610234706, + "flos": 474834534912.0, + "grad_norm": 0.061536990211155544, + "language_loss": 0.95371294, + "learning_rate": 0.0009812424142733073, + "loss": 0.96494377, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 0.34765625, + "step": 597, + "time_per_iteration": 2.5663998126983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108592, + "balance_loss_mlp": 1.07387781, + "epoch": 0.11504424778761062, + "flos": 731209254912.0, + "grad_norm": 0.04795398370622496, + "language_loss": 0.91199464, + "learning_rate": 0.000981157788372175, + "loss": 0.92308056, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 0.34716797, + "step": 598, + "time_per_iteration": 3.004436492919922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110274, + "balance_loss_mlp": 1.06864619, + "epoch": 0.11523662947287418, + "flos": 545823567360.0, + "grad_norm": 0.04762632796488997, + "language_loss": 0.94997883, + "learning_rate": 0.0009810729756690223, + "loss": 0.96100628, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 0.34106445, + "step": 599, + "time_per_iteration": 2.704676628112793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104593, + "balance_loss_mlp": 1.06947374, + "epoch": 0.11542901115813775, + "flos": 775066558464.0, + "grad_norm": 0.06699944809564747, + "language_loss": 0.98224139, + "learning_rate": 0.0009809879761967766, + "loss": 0.99328732, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 0.35107422, + "step": 600, + "time_per_iteration": 2.953348159790039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113963, + "balance_loss_mlp": 1.07922578, + "epoch": 0.11562139284340131, + "flos": 730910449152.0, + "grad_norm": 0.06801646297960097, + "language_loss": 0.96874714, + "learning_rate": 0.0009809027899884378, + "loss": 0.97988677, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 0.34765625, + "step": 601, + "time_per_iteration": 2.896559953689575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104267, + "balance_loss_mlp": 1.07014918, + "epoch": 0.11581377452866487, + "flos": 535878457344.0, + "grad_norm": 0.062436318450634756, + "language_loss": 0.9484992, + "learning_rate": 0.0009808174170770779, + "loss": 0.95954192, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 0.34130859, + "step": 602, + "time_per_iteration": 2.814558982849121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220455, + "balance_loss_mlp": 1.16704941, + "epoch": 0.11600615621392843, + "flos": 1555814863872.0, + "grad_norm": 0.025680107820064087, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.86118698, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 0.53515625, + "step": 603, + "time_per_iteration": 4.897503614425659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118739, + "balance_loss_mlp": 1.08566999, + "epoch": 0.116198537899192, + "flos": 537435274752.0, + "grad_norm": 0.05533944227900463, + "language_loss": 1.0028702, + "learning_rate": 0.0009806461112779462, + "loss": 1.01405764, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 0.33081055, + "step": 604, + "time_per_iteration": 2.6172194480895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115094, + "balance_loss_mlp": 1.08281231, + "epoch": 0.11639091958445556, + "flos": 454203168768.0, + "grad_norm": 0.07231087595972972, + "language_loss": 0.97971618, + "learning_rate": 0.0009805601784566814, + "loss": 0.99086702, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 0.32250977, + "step": 605, + "time_per_iteration": 2.4791650772094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125941, + "balance_loss_mlp": 1.09208584, + "epoch": 0.11658330126971912, + "flos": 555081859584.0, + "grad_norm": 0.06015253149930396, + "language_loss": 1.02430916, + "learning_rate": 0.0009804740590654089, + "loss": 1.03556848, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 0.33862305, + "step": 606, + "time_per_iteration": 2.614476442337036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124787, + "balance_loss_mlp": 1.09229016, + "epoch": 0.11677568295498268, + "flos": 716340049920.0, + "grad_norm": 0.08034134565527169, + "language_loss": 0.97153747, + "learning_rate": 0.0009803877531375635, + "loss": 0.9827854, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 0.32495117, + "step": 607, + "time_per_iteration": 2.851011276245117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128193, + "balance_loss_mlp": 1.09228706, + "epoch": 0.11696806464024626, + "flos": 609758668800.0, + "grad_norm": 0.05400582488055185, + "language_loss": 0.97512484, + "learning_rate": 0.0009803012607066523, + "loss": 0.9864068, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 0.35913086, + "step": 608, + "time_per_iteration": 2.700596570968628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128537, + "balance_loss_mlp": 1.09294093, + "epoch": 0.11716044632550981, + "flos": 520384103424.0, + "grad_norm": 0.15792902837654846, + "language_loss": 0.95375645, + "learning_rate": 0.0009802145818062543, + "loss": 0.96504182, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 0.35620117, + "step": 609, + "time_per_iteration": 2.693417549133301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123637, + "balance_loss_mlp": 1.08742094, + "epoch": 0.11735282801077337, + "flos": 507493859328.0, + "grad_norm": 0.06851059455565046, + "language_loss": 0.99132365, + "learning_rate": 0.0009801277164700212, + "loss": 1.00256002, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 0.36254883, + "step": 610, + "time_per_iteration": 2.5825185775756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131797, + "balance_loss_mlp": 1.09541452, + "epoch": 0.11754520969603693, + "flos": 686638342656.0, + "grad_norm": 0.1113382534985323, + "language_loss": 0.96033651, + "learning_rate": 0.0009800406647316776, + "loss": 0.97165447, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 0.36376953, + "step": 611, + "time_per_iteration": 2.8625166416168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231096, + "balance_loss_mlp": 1.18112373, + "epoch": 0.1177375913813005, + "flos": 1542487421952.0, + "grad_norm": 0.03346184177846584, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.78145558, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 0.49804688, + "step": 612, + "time_per_iteration": 4.748431444168091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137214, + "balance_loss_mlp": 1.09880471, + "epoch": 0.11792997306656407, + "flos": 520522495488.0, + "grad_norm": 0.07612220197102978, + "language_loss": 0.95326376, + "learning_rate": 0.000979866002183916, + "loss": 0.96463591, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 0.38378906, + "step": 613, + "time_per_iteration": 2.6311473846435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155666, + "balance_loss_mlp": 1.11482501, + "epoch": 0.11812235475182763, + "flos": 666281189376.0, + "grad_norm": 0.0832714106614858, + "language_loss": 0.96221644, + "learning_rate": 0.0009797783914423082, + "loss": 0.97377312, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 0.40844727, + "step": 614, + "time_per_iteration": 2.8568782806396484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126933, + "balance_loss_mlp": 1.08721232, + "epoch": 0.11831473643709119, + "flos": 621317122560.0, + "grad_norm": 0.08355321383380138, + "language_loss": 0.91733479, + "learning_rate": 0.0009796905944342094, + "loss": 0.92860413, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 0.3972168, + "step": 615, + "time_per_iteration": 2.8348331451416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113873, + "balance_loss_mlp": 1.07517743, + "epoch": 0.11850711812235475, + "flos": 456688710144.0, + "grad_norm": 0.05175964705030883, + "language_loss": 0.94486296, + "learning_rate": 0.0009796026111937057, + "loss": 0.9560017, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 0.38671875, + "step": 616, + "time_per_iteration": 2.609276056289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111065, + "balance_loss_mlp": 1.07393384, + "epoch": 0.11869949980761832, + "flos": 513863543808.0, + "grad_norm": 0.1779679576065946, + "language_loss": 0.94108498, + "learning_rate": 0.0009795144417549552, + "loss": 0.95219147, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 0.3671875, + "step": 617, + "time_per_iteration": 2.7469558715820312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114581, + "balance_loss_mlp": 1.07760203, + "epoch": 0.11889188149288188, + "flos": 535016171520.0, + "grad_norm": 0.0639893702788804, + "language_loss": 0.95137906, + "learning_rate": 0.0009794260861521883, + "loss": 0.96252483, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 0.36987305, + "step": 618, + "time_per_iteration": 2.779780387878418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125632, + "balance_loss_mlp": 1.08908224, + "epoch": 0.11908426317814544, + "flos": 498603755520.0, + "grad_norm": 0.062080445707157726, + "language_loss": 0.94238096, + "learning_rate": 0.0009793375444197075, + "loss": 0.95363724, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 0.3659668, + "step": 619, + "time_per_iteration": 2.6269500255584717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159762, + "balance_loss_mlp": 1.12132859, + "epoch": 0.119276644863409, + "flos": 659891681280.0, + "grad_norm": 0.05728911446624217, + "language_loss": 0.93181753, + "learning_rate": 0.000979248816591888, + "loss": 0.94341516, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 0.38452148, + "step": 620, + "time_per_iteration": 2.7879464626312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155629, + "balance_loss_mlp": 1.11600351, + "epoch": 0.11946902654867257, + "flos": 758746621440.0, + "grad_norm": 0.05539388103354017, + "language_loss": 0.93241715, + "learning_rate": 0.0009791599027031766, + "loss": 0.94397342, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 0.39624023, + "step": 621, + "time_per_iteration": 3.058497667312622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152711, + "balance_loss_mlp": 1.11439681, + "epoch": 0.11966140823393613, + "flos": 680999892480.0, + "grad_norm": 0.05959109763307043, + "language_loss": 0.93889141, + "learning_rate": 0.0009790708027880932, + "loss": 0.95041847, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 0.38330078, + "step": 622, + "time_per_iteration": 2.857905864715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217773, + "balance_loss_mlp": 1.17447615, + "epoch": 0.11985378991919969, + "flos": 1451071853568.0, + "grad_norm": 0.033264976771994935, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.78645062, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 0.43359375, + "step": 623, + "time_per_iteration": 4.817517518997192 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130228, + "balance_loss_mlp": 1.09372652, + "epoch": 0.12004617160446325, + "flos": 527848441344.0, + "grad_norm": 0.07130736684785184, + "language_loss": 0.99442542, + "learning_rate": 0.0009788920450172487, + "loss": 1.00572777, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 0.36499023, + "step": 624, + "time_per_iteration": 2.6089231967926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134217, + "balance_loss_mlp": 1.0987401, + "epoch": 0.12023855328972682, + "flos": 474219297792.0, + "grad_norm": 0.053387747347518576, + "language_loss": 0.97139525, + "learning_rate": 0.0009788023872308875, + "loss": 0.98273742, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 0.35522461, + "step": 625, + "time_per_iteration": 2.5482659339904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171918, + "balance_loss_mlp": 1.12614214, + "epoch": 0.12043093497499038, + "flos": 1531771430400.0, + "grad_norm": 0.016755812295179123, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.76600921, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 0.45703125, + "step": 626, + "time_per_iteration": 4.767898797988892 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142548, + "balance_loss_mlp": 1.10609388, + "epoch": 0.12062331666025394, + "flos": 539839323648.0, + "grad_norm": 0.053046953839951706, + "language_loss": 0.99526918, + "learning_rate": 0.0009786225140303285, + "loss": 1.00669467, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 0.36425781, + "step": 627, + "time_per_iteration": 2.666975975036621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145866, + "balance_loss_mlp": 1.10974586, + "epoch": 0.1208156983455175, + "flos": 511906604544.0, + "grad_norm": 0.06539343990980159, + "language_loss": 0.97403502, + "learning_rate": 0.0009785322986859634, + "loss": 0.98549366, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 0.36132812, + "step": 628, + "time_per_iteration": 2.6613006591796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116443, + "balance_loss_mlp": 1.12830925, + "epoch": 0.12100808003078108, + "flos": 596473072128.0, + "grad_norm": 0.05337423256033143, + "language_loss": 0.99038112, + "learning_rate": 0.0009784418975588838, + "loss": 1.00202537, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 0.36108398, + "step": 629, + "time_per_iteration": 2.7266693115234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148011, + "balance_loss_mlp": 1.11248696, + "epoch": 0.12120046171604464, + "flos": 522970960896.0, + "grad_norm": 0.06598420413892771, + "language_loss": 0.97636682, + "learning_rate": 0.0009783513106841862, + "loss": 0.98784697, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 0.35522461, + "step": 630, + "time_per_iteration": 2.7734336853027344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122661, + "balance_loss_mlp": 1.17663717, + "epoch": 0.1213928434013082, + "flos": 1554463249920.0, + "grad_norm": 0.0364602282496576, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.77959311, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 0.5, + "step": 631, + "time_per_iteration": 4.955650091171265 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118689, + "balance_loss_mlp": 1.08283055, + "epoch": 0.12158522508657175, + "flos": 495391749120.0, + "grad_norm": 0.061523486228641615, + "language_loss": 0.94419873, + "learning_rate": 0.0009781695798326854, + "loss": 0.95538557, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 0.35888672, + "step": 632, + "time_per_iteration": 2.6072514057159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111815, + "balance_loss_mlp": 1.08319819, + "epoch": 0.12177760677183531, + "flos": 475585592832.0, + "grad_norm": 0.05761126083629287, + "language_loss": 0.93996418, + "learning_rate": 0.0009780784359264365, + "loss": 0.95114571, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 0.34985352, + "step": 633, + "time_per_iteration": 2.6186299324035645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201074, + "balance_loss_mlp": 1.15548825, + "epoch": 0.12196998845709889, + "flos": 1468458906624.0, + "grad_norm": 0.024414945484573326, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.75389773, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 0.45507812, + "step": 634, + "time_per_iteration": 4.757866144180298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090802, + "balance_loss_mlp": 1.05732846, + "epoch": 0.12216237014236245, + "flos": 586572378624.0, + "grad_norm": 0.05071444395915749, + "language_loss": 0.91919303, + "learning_rate": 0.000977895591329867, + "loss": 0.93010104, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 0.3347168, + "step": 635, + "time_per_iteration": 2.7802233695983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094425, + "balance_loss_mlp": 1.06006885, + "epoch": 0.12235475182762601, + "flos": 597997582848.0, + "grad_norm": 0.05652682698430024, + "language_loss": 0.93613631, + "learning_rate": 0.000977803890710533, + "loss": 0.94708061, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 0.34399414, + "step": 636, + "time_per_iteration": 2.719989538192749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109182, + "balance_loss_mlp": 1.0546267, + "epoch": 0.12254713351288957, + "flos": 497741469696.0, + "grad_norm": 0.05019916823038997, + "language_loss": 0.97873759, + "learning_rate": 0.0009777120045912774, + "loss": 0.98965579, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 0.37231445, + "step": 637, + "time_per_iteration": 2.5960683822631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099212, + "balance_loss_mlp": 1.06139851, + "epoch": 0.12273951519815314, + "flos": 605847361536.0, + "grad_norm": 0.05186361253186237, + "language_loss": 0.97095829, + "learning_rate": 0.0009776199330077736, + "loss": 0.9819504, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 0.37841797, + "step": 638, + "time_per_iteration": 2.7152581214904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088192, + "balance_loss_mlp": 1.05121303, + "epoch": 0.1229318968834167, + "flos": 597859190784.0, + "grad_norm": 0.05467339203371928, + "language_loss": 0.99154645, + "learning_rate": 0.0009775276759957667, + "loss": 1.00242841, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 0.37011719, + "step": 639, + "time_per_iteration": 2.6985981464385986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090176, + "balance_loss_mlp": 1.05465198, + "epoch": 0.12312427856868026, + "flos": 678383299584.0, + "grad_norm": 0.06600893718108056, + "language_loss": 0.97933781, + "learning_rate": 0.0009774352335910745, + "loss": 0.99023956, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 0.35546875, + "step": 640, + "time_per_iteration": 2.813744306564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086602, + "balance_loss_mlp": 1.05298471, + "epoch": 0.12331666025394382, + "flos": 608933458944.0, + "grad_norm": 0.05927901471916764, + "language_loss": 0.99468219, + "learning_rate": 0.000977342605829586, + "loss": 1.00554824, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 0.33642578, + "step": 641, + "time_per_iteration": 2.73280668258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110577, + "balance_loss_mlp": 1.07240582, + "epoch": 0.12350904193920739, + "flos": 762504855552.0, + "grad_norm": 0.07046674646118828, + "language_loss": 0.92099506, + "learning_rate": 0.0009772497927472623, + "loss": 0.93210077, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 0.38183594, + "step": 642, + "time_per_iteration": 3.1258397102355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134817, + "balance_loss_mlp": 1.09514427, + "epoch": 0.12370142362447095, + "flos": 540968481792.0, + "grad_norm": 0.07438352262018386, + "language_loss": 0.93366879, + "learning_rate": 0.0009771567943801368, + "loss": 0.94501698, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 0.3972168, + "step": 643, + "time_per_iteration": 2.6720776557922363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149366, + "balance_loss_mlp": 1.10912085, + "epoch": 0.12389380530973451, + "flos": 548128871424.0, + "grad_norm": 0.055730629552303436, + "language_loss": 0.96261084, + "learning_rate": 0.0009770636107643152, + "loss": 0.97410446, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 0.40234375, + "step": 644, + "time_per_iteration": 2.7093722820281982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144915, + "balance_loss_mlp": 1.10734022, + "epoch": 0.12408618699499807, + "flos": 540308828160.0, + "grad_norm": 0.05250459899213186, + "language_loss": 0.92937833, + "learning_rate": 0.0009769702419359738, + "loss": 0.94082749, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 0.37597656, + "step": 645, + "time_per_iteration": 2.661512613296509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173428, + "balance_loss_mlp": 1.13146591, + "epoch": 0.12427856868026164, + "flos": 745792137216.0, + "grad_norm": 0.052890865129340166, + "language_loss": 0.94770992, + "learning_rate": 0.000976876687931362, + "loss": 0.95944417, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 0.41943359, + "step": 646, + "time_per_iteration": 2.972522258758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164317, + "balance_loss_mlp": 1.12555003, + "epoch": 0.1244709503655252, + "flos": 533716687872.0, + "grad_norm": 0.07033761546633982, + "language_loss": 0.91270661, + "learning_rate": 0.0009767829487868005, + "loss": 0.92434984, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 0.38769531, + "step": 647, + "time_per_iteration": 2.6150805950164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164183, + "balance_loss_mlp": 1.12281775, + "epoch": 0.12466333205078876, + "flos": 508099184640.0, + "grad_norm": 0.07269814667774141, + "language_loss": 0.95938772, + "learning_rate": 0.000976689024538682, + "loss": 0.97102952, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 0.41381836, + "step": 648, + "time_per_iteration": 2.6567764282226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154104, + "balance_loss_mlp": 1.11497951, + "epoch": 0.12485571373605232, + "flos": 681345686016.0, + "grad_norm": 0.06659282576896536, + "language_loss": 0.94783676, + "learning_rate": 0.0009765949152234716, + "loss": 0.95937783, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 0.39135742, + "step": 649, + "time_per_iteration": 2.9032628536224365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118823, + "balance_loss_mlp": 1.15084565, + "epoch": 0.1250480954213159, + "flos": 1330159781376.0, + "grad_norm": 0.027365485913225348, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.79874313, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 0.37304688, + "step": 650, + "time_per_iteration": 4.6781816482543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145487, + "balance_loss_mlp": 1.10395491, + "epoch": 0.12524047710657946, + "flos": 938550758400.0, + "grad_norm": 0.07758701561639549, + "language_loss": 0.88880539, + "learning_rate": 0.0009764061415379919, + "loss": 0.90026021, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 0.41552734, + "step": 651, + "time_per_iteration": 3.2588987350463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137454, + "balance_loss_mlp": 1.09766221, + "epoch": 0.12543285879184302, + "flos": 513893279232.0, + "grad_norm": 0.08409279007421946, + "language_loss": 0.94380724, + "learning_rate": 0.0009763114772410109, + "loss": 0.95518184, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 0.39794922, + "step": 652, + "time_per_iteration": 2.5698702335357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112103, + "balance_loss_mlp": 1.08359814, + "epoch": 0.12562524047710658, + "flos": 718328922624.0, + "grad_norm": 0.056536251661147445, + "language_loss": 0.92061114, + "learning_rate": 0.0009762166280235146, + "loss": 0.93182147, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 0.37451172, + "step": 653, + "time_per_iteration": 2.938668966293335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117513, + "balance_loss_mlp": 1.08191729, + "epoch": 0.12581762216237014, + "flos": 563712431616.0, + "grad_norm": 0.0771848817407848, + "language_loss": 0.94092464, + "learning_rate": 0.0009761215939223267, + "loss": 0.95209974, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 0.35644531, + "step": 654, + "time_per_iteration": 2.7028610706329346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102912, + "balance_loss_mlp": 1.06834149, + "epoch": 0.1260100038476337, + "flos": 481893608448.0, + "grad_norm": 0.07424845664771389, + "language_loss": 0.9475044, + "learning_rate": 0.0009760263749743428, + "loss": 0.95853353, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 0.34570312, + "step": 655, + "time_per_iteration": 2.5710902214050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101838, + "balance_loss_mlp": 1.06771994, + "epoch": 0.12620238553289725, + "flos": 575555010048.0, + "grad_norm": 0.053259035011575195, + "language_loss": 0.94285154, + "learning_rate": 0.0009759309712165299, + "loss": 0.95386994, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 0.34130859, + "step": 656, + "time_per_iteration": 2.70626163482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101868, + "balance_loss_mlp": 1.06858444, + "epoch": 0.12639476721816084, + "flos": 531164335104.0, + "grad_norm": 0.0693418830287988, + "language_loss": 0.9812479, + "learning_rate": 0.0009758353826859272, + "loss": 0.99226654, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 0.33300781, + "step": 657, + "time_per_iteration": 2.566787004470825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110312, + "balance_loss_mlp": 1.0663563, + "epoch": 0.1265871489034244, + "flos": 689968917504.0, + "grad_norm": 0.06782991509763603, + "language_loss": 0.96008623, + "learning_rate": 0.0009757396094196456, + "loss": 0.97111744, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 0.36791992, + "step": 658, + "time_per_iteration": 2.8277065753936768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115911, + "balance_loss_mlp": 1.07926583, + "epoch": 0.12677953058868796, + "flos": 537138667008.0, + "grad_norm": 0.053606842709613675, + "language_loss": 0.89398581, + "learning_rate": 0.0009756436514548673, + "loss": 0.90514493, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 0.36645508, + "step": 659, + "time_per_iteration": 2.796175718307495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120986, + "balance_loss_mlp": 1.0811224, + "epoch": 0.12697191227395152, + "flos": 519022577664.0, + "grad_norm": 0.060525818769901533, + "language_loss": 0.92384607, + "learning_rate": 0.0009755475088288466, + "loss": 0.93505597, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 0.39916992, + "step": 660, + "time_per_iteration": 2.678682804107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133341, + "balance_loss_mlp": 1.09271395, + "epoch": 0.12716429395921508, + "flos": 566605808640.0, + "grad_norm": 0.08191197530717065, + "language_loss": 0.958794, + "learning_rate": 0.0009754511815789095, + "loss": 0.97012746, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 0.40600586, + "step": 661, + "time_per_iteration": 2.7371177673339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130898, + "balance_loss_mlp": 1.09093928, + "epoch": 0.12735667564447864, + "flos": 514103251968.0, + "grad_norm": 0.08687138171908054, + "language_loss": 0.92166948, + "learning_rate": 0.0009753546697424533, + "loss": 0.93297845, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 0.39941406, + "step": 662, + "time_per_iteration": 2.704432249069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125889, + "balance_loss_mlp": 1.08700323, + "epoch": 0.1275490573297422, + "flos": 541282341888.0, + "grad_norm": 0.06194581367760624, + "language_loss": 0.95628935, + "learning_rate": 0.0009752579733569475, + "loss": 0.96754825, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 0.38891602, + "step": 663, + "time_per_iteration": 2.682892084121704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165087, + "balance_loss_mlp": 1.1326623, + "epoch": 0.12774143901500576, + "flos": 1558700900352.0, + "grad_norm": 0.0245621431528993, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.76046479, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 0.32421875, + "step": 664, + "time_per_iteration": 4.981603622436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146598, + "balance_loss_mlp": 1.1060189, + "epoch": 0.12793382070026935, + "flos": 613744128000.0, + "grad_norm": 0.07818489478946229, + "language_loss": 0.96962506, + "learning_rate": 0.0009750640270890217, + "loss": 0.98109102, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 0.40576172, + "step": 665, + "time_per_iteration": 2.7139556407928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139516, + "balance_loss_mlp": 1.10115409, + "epoch": 0.1281262023855329, + "flos": 707731499520.0, + "grad_norm": 0.10418725554084544, + "language_loss": 1.02824736, + "learning_rate": 0.0009749667772818983, + "loss": 1.03964257, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 0.38354492, + "step": 666, + "time_per_iteration": 3.000227689743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148195, + "balance_loss_mlp": 1.11481678, + "epoch": 0.12831858407079647, + "flos": 1425034404864.0, + "grad_norm": 0.027847994605201966, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.78084135, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 0.33398438, + "step": 667, + "time_per_iteration": 4.858838319778442 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161858, + "balance_loss_mlp": 1.1255703, + "epoch": 0.12851096575606002, + "flos": 449098463232.0, + "grad_norm": 0.0747922247275706, + "language_loss": 1.00932169, + "learning_rate": 0.0009747717245101093, + "loss": 1.0209403, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 0.36303711, + "step": 668, + "time_per_iteration": 2.4917514324188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172854, + "balance_loss_mlp": 1.13518405, + "epoch": 0.12870334744132358, + "flos": 479939240448.0, + "grad_norm": 0.0795363237311063, + "language_loss": 0.91087645, + "learning_rate": 0.00097467392162117, + "loss": 0.92260504, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 0.37719727, + "step": 669, + "time_per_iteration": 2.601151466369629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196603, + "balance_loss_mlp": 1.15540457, + "epoch": 0.12889572912658714, + "flos": 638936543232.0, + "grad_norm": 0.0744221392925499, + "language_loss": 0.95630497, + "learning_rate": 0.0009745759344474708, + "loss": 0.96827102, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 0.41162109, + "step": 670, + "time_per_iteration": 2.878068447113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200159, + "balance_loss_mlp": 1.16012812, + "epoch": 0.1290881108118507, + "flos": 509944896000.0, + "grad_norm": 0.07162427386273244, + "language_loss": 0.95158428, + "learning_rate": 0.0009744777630270536, + "loss": 0.96358585, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 0.40063477, + "step": 671, + "time_per_iteration": 2.5778517723083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220294, + "balance_loss_mlp": 1.17752171, + "epoch": 0.12928049249711426, + "flos": 671054782464.0, + "grad_norm": 0.07459259564874297, + "language_loss": 0.99775112, + "learning_rate": 0.000974379407398032, + "loss": 1.00995398, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 0.42797852, + "step": 672, + "time_per_iteration": 2.862168073654175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191716, + "balance_loss_mlp": 1.15175724, + "epoch": 0.12947287418237785, + "flos": 793525870080.0, + "grad_norm": 0.05795101219152752, + "language_loss": 0.86696863, + "learning_rate": 0.0009742808675985913, + "loss": 0.87888587, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 0.3996582, + "step": 673, + "time_per_iteration": 3.0987160205841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011832, + "balance_loss_mlp": 1.14142871, + "epoch": 0.1296652558676414, + "flos": 485466462720.0, + "grad_norm": 0.06292984682523013, + "language_loss": 0.96893597, + "learning_rate": 0.0009741821436669876, + "loss": 0.98076797, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 0.41772461, + "step": 674, + "time_per_iteration": 2.565317153930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160814, + "balance_loss_mlp": 1.12123656, + "epoch": 0.12985763755290497, + "flos": 453459451392.0, + "grad_norm": 0.07127578315040689, + "language_loss": 0.99621803, + "learning_rate": 0.0009740832356415492, + "loss": 1.00782621, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 0.39550781, + "step": 675, + "time_per_iteration": 2.4777724742889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144275, + "balance_loss_mlp": 1.10538852, + "epoch": 0.13005001923816853, + "flos": 825061178880.0, + "grad_norm": 0.07563598794059366, + "language_loss": 0.94837546, + "learning_rate": 0.0009739841435606756, + "loss": 0.95981824, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 0.38867188, + "step": 676, + "time_per_iteration": 2.9838767051696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131112, + "balance_loss_mlp": 1.09186864, + "epoch": 0.1302424009234321, + "flos": 531381648384.0, + "grad_norm": 0.06693149578557214, + "language_loss": 0.94293654, + "learning_rate": 0.0009738848674628377, + "loss": 0.95424765, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 0.39233398, + "step": 677, + "time_per_iteration": 2.7052054405212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130656, + "balance_loss_mlp": 1.0923903, + "epoch": 0.13043478260869565, + "flos": 525884161536.0, + "grad_norm": 0.05501746541124835, + "language_loss": 0.94784498, + "learning_rate": 0.000973785407386578, + "loss": 0.95915151, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 0.38232422, + "step": 678, + "time_per_iteration": 2.7535152435302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137257, + "balance_loss_mlp": 1.09727383, + "epoch": 0.1306271642939592, + "flos": 626172208128.0, + "grad_norm": 0.05430769504454563, + "language_loss": 0.91185606, + "learning_rate": 0.0009736857633705103, + "loss": 0.92322862, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 0.3996582, + "step": 679, + "time_per_iteration": 2.8686013221740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135266, + "balance_loss_mlp": 1.09575987, + "epoch": 0.13081954597922277, + "flos": 550718300160.0, + "grad_norm": 0.06387426976514826, + "language_loss": 0.97335434, + "learning_rate": 0.0009735859354533196, + "loss": 0.984707, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 0.39501953, + "step": 680, + "time_per_iteration": 2.6952273845672607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136459, + "balance_loss_mlp": 1.09626174, + "epoch": 0.13101192766448633, + "flos": 536911441920.0, + "grad_norm": 0.07637025474680663, + "language_loss": 0.97434723, + "learning_rate": 0.0009734859236737628, + "loss": 0.98571181, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 0.40185547, + "step": 681, + "time_per_iteration": 2.607431173324585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136827, + "balance_loss_mlp": 1.09720194, + "epoch": 0.13120430934974991, + "flos": 503508400128.0, + "grad_norm": 0.06515090437153119, + "language_loss": 0.9831785, + "learning_rate": 0.0009733857280706678, + "loss": 0.99454683, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 0.39599609, + "step": 682, + "time_per_iteration": 2.5730957984924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140739, + "balance_loss_mlp": 1.1007328, + "epoch": 0.13139669103501347, + "flos": 614295124992.0, + "grad_norm": 0.08408851923922504, + "language_loss": 0.89817083, + "learning_rate": 0.000973285348682934, + "loss": 0.90957826, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 0.39990234, + "step": 683, + "time_per_iteration": 2.7041609287261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120077, + "balance_loss_mlp": 1.08460057, + "epoch": 0.13158907272027703, + "flos": 1484971564032.0, + "grad_norm": 0.021197399820989362, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.7901845, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 0.35546875, + "step": 684, + "time_per_iteration": 4.7803051471710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145399, + "balance_loss_mlp": 1.10579789, + "epoch": 0.1317814544055406, + "flos": 985461852672.0, + "grad_norm": 0.06796914093678033, + "language_loss": 0.90116858, + "learning_rate": 0.0009730840387095046, + "loss": 0.91262257, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 0.39575195, + "step": 685, + "time_per_iteration": 3.289513111114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154301, + "balance_loss_mlp": 1.11412716, + "epoch": 0.13197383609080415, + "flos": 611456076288.0, + "grad_norm": 0.0690044047280534, + "language_loss": 0.95956922, + "learning_rate": 0.0009729831082019642, + "loss": 0.97111225, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 0.40185547, + "step": 686, + "time_per_iteration": 2.8214356899261475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131343, + "balance_loss_mlp": 1.09383941, + "epoch": 0.1321662177760677, + "flos": 494403181056.0, + "grad_norm": 0.08080780289155233, + "language_loss": 0.93596351, + "learning_rate": 0.0009728819940660958, + "loss": 0.94727689, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 0.375, + "step": 687, + "time_per_iteration": 2.749385118484497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011246, + "balance_loss_mlp": 1.08542764, + "epoch": 0.13235859946133127, + "flos": 495841430016.0, + "grad_norm": 0.08853955851107219, + "language_loss": 0.91695315, + "learning_rate": 0.0009727806963411557, + "loss": 0.92819917, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 0.39135742, + "step": 688, + "time_per_iteration": 2.592099666595459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128329, + "balance_loss_mlp": 1.08777368, + "epoch": 0.13255098114659483, + "flos": 511686720000.0, + "grad_norm": 0.06370494383790047, + "language_loss": 0.92130053, + "learning_rate": 0.000972679215066471, + "loss": 0.93258381, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 0.40551758, + "step": 689, + "time_per_iteration": 2.7344043254852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114145, + "balance_loss_mlp": 1.10246885, + "epoch": 0.13274336283185842, + "flos": 547370472960.0, + "grad_norm": 0.08478699193898473, + "language_loss": 1.04583168, + "learning_rate": 0.0009725775502814401, + "loss": 1.05724621, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 0.3894043, + "step": 690, + "time_per_iteration": 2.5881311893463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155383, + "balance_loss_mlp": 1.1147325, + "epoch": 0.13293574451712198, + "flos": 640772342784.0, + "grad_norm": 0.07994389842197654, + "language_loss": 0.90077579, + "learning_rate": 0.0009724757020255327, + "loss": 0.91232961, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 0.40649414, + "step": 691, + "time_per_iteration": 2.8452539443969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164889, + "balance_loss_mlp": 1.12566948, + "epoch": 0.13312812620238554, + "flos": 491480441856.0, + "grad_norm": 0.09039906445052394, + "language_loss": 0.91914684, + "learning_rate": 0.0009723736703382902, + "loss": 0.93079573, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 0.39208984, + "step": 692, + "time_per_iteration": 2.5472824573516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198661, + "balance_loss_mlp": 1.15557849, + "epoch": 0.1333205078876491, + "flos": 508944218112.0, + "grad_norm": 0.07689546631051256, + "language_loss": 0.86461794, + "learning_rate": 0.0009722714552593244, + "loss": 0.87660456, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 0.4309082, + "step": 693, + "time_per_iteration": 2.6273465156555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199876, + "balance_loss_mlp": 1.15560198, + "epoch": 0.13351288957291266, + "flos": 418697455104.0, + "grad_norm": 0.08142665414192346, + "language_loss": 1.00438499, + "learning_rate": 0.000972169056828319, + "loss": 1.01638389, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 0.44262695, + "step": 694, + "time_per_iteration": 2.477491617202759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221806, + "balance_loss_mlp": 1.17741275, + "epoch": 0.13370527125817622, + "flos": 615901128192.0, + "grad_norm": 0.07001491486919184, + "language_loss": 0.90590984, + "learning_rate": 0.0009720664750850283, + "loss": 0.91812789, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 0.4440918, + "step": 695, + "time_per_iteration": 2.7817704677581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209285, + "balance_loss_mlp": 1.16870594, + "epoch": 0.13389765294343978, + "flos": 626038958592.0, + "grad_norm": 0.07077521288835904, + "language_loss": 0.97240067, + "learning_rate": 0.0009719637100692784, + "loss": 0.98449349, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 0.40625, + "step": 696, + "time_per_iteration": 2.7099833488464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214589, + "balance_loss_mlp": 1.17069626, + "epoch": 0.13409003462870334, + "flos": 609691857408.0, + "grad_norm": 0.06395797985697109, + "language_loss": 0.87399805, + "learning_rate": 0.0009718607618209661, + "loss": 0.88614392, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 0.43896484, + "step": 697, + "time_per_iteration": 2.8280160427093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226846, + "balance_loss_mlp": 1.18445516, + "epoch": 0.13428241631396692, + "flos": 683816546304.0, + "grad_norm": 0.08853583224950028, + "language_loss": 0.91527486, + "learning_rate": 0.0009717576303800595, + "loss": 0.92754334, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 0.42382812, + "step": 698, + "time_per_iteration": 3.0102553367614746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206724, + "balance_loss_mlp": 1.16385674, + "epoch": 0.13447479799923048, + "flos": 508815737856.0, + "grad_norm": 0.07140979809376953, + "language_loss": 0.90443981, + "learning_rate": 0.0009716543157865975, + "loss": 0.91650712, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 0.4284668, + "step": 699, + "time_per_iteration": 2.713191509246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192988, + "balance_loss_mlp": 1.15047789, + "epoch": 0.13466717968449404, + "flos": 897510481920.0, + "grad_norm": 0.0971528894423257, + "language_loss": 0.87731719, + "learning_rate": 0.0009715508180806907, + "loss": 0.88924706, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 0.42504883, + "step": 700, + "time_per_iteration": 3.183608055114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164502, + "balance_loss_mlp": 1.12189686, + "epoch": 0.1348595613697576, + "flos": 989938838016.0, + "grad_norm": 0.07253928509691168, + "language_loss": 0.94940412, + "learning_rate": 0.0009714471373025202, + "loss": 0.96104908, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 0.42578125, + "step": 701, + "time_per_iteration": 3.4071736335754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154491, + "balance_loss_mlp": 1.10978746, + "epoch": 0.13505194305502116, + "flos": 487826095104.0, + "grad_norm": 0.07349692890686976, + "language_loss": 0.93387866, + "learning_rate": 0.0009713432734923386, + "loss": 0.94542348, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 0.44702148, + "step": 702, + "time_per_iteration": 2.61545467376709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149917, + "balance_loss_mlp": 1.10523736, + "epoch": 0.13524432474028472, + "flos": 613385851392.0, + "grad_norm": 0.07475145021416552, + "language_loss": 0.90919894, + "learning_rate": 0.0009712392266904696, + "loss": 0.92069811, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 0.44702148, + "step": 703, + "time_per_iteration": 2.739295482635498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156115, + "balance_loss_mlp": 1.11219811, + "epoch": 0.13543670642554828, + "flos": 904794582528.0, + "grad_norm": 0.09690331363255131, + "language_loss": 0.90325272, + "learning_rate": 0.0009711349969373076, + "loss": 0.91481388, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 0.43945312, + "step": 704, + "time_per_iteration": 3.1653053760528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175158, + "balance_loss_mlp": 1.12780786, + "epoch": 0.13562908811081184, + "flos": 550616984064.0, + "grad_norm": 0.09111648779989767, + "language_loss": 0.84997714, + "learning_rate": 0.0009710305842733178, + "loss": 0.86172873, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 0.47314453, + "step": 705, + "time_per_iteration": 2.7402727603912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117117, + "balance_loss_mlp": 1.12737262, + "epoch": 0.1358214697960754, + "flos": 508044856320.0, + "grad_norm": 0.10189351673448747, + "language_loss": 0.9379847, + "learning_rate": 0.0009709259887390373, + "loss": 0.94969636, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 0.43774414, + "step": 706, + "time_per_iteration": 2.5640039443969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147043, + "balance_loss_mlp": 1.10467625, + "epoch": 0.136013851481339, + "flos": 528896107008.0, + "grad_norm": 0.07946562356881365, + "language_loss": 0.95178437, + "learning_rate": 0.0009708212103750737, + "loss": 0.96325481, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 0.42382812, + "step": 707, + "time_per_iteration": 2.6138036251068115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153312, + "balance_loss_mlp": 1.1095618, + "epoch": 0.13620623316660255, + "flos": 659081152512.0, + "grad_norm": 0.07708082078191984, + "language_loss": 0.91549516, + "learning_rate": 0.0009707162492221051, + "loss": 0.9270283, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 0.43725586, + "step": 708, + "time_per_iteration": 2.879612684249878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143626, + "balance_loss_mlp": 1.10121179, + "epoch": 0.1363986148518661, + "flos": 671882563584.0, + "grad_norm": 0.08764140181907645, + "language_loss": 0.92509496, + "learning_rate": 0.0009706111053208815, + "loss": 0.93653119, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 0.42431641, + "step": 709, + "time_per_iteration": 2.804469347000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156089, + "balance_loss_mlp": 1.10947847, + "epoch": 0.13659099653712967, + "flos": 473062975488.0, + "grad_norm": 0.07097269092186763, + "language_loss": 0.89579999, + "learning_rate": 0.0009705057787122232, + "loss": 0.90736091, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 0.46630859, + "step": 710, + "time_per_iteration": 2.568406105041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174212, + "balance_loss_mlp": 1.12874603, + "epoch": 0.13678337822239323, + "flos": 452715734016.0, + "grad_norm": 0.06463299548184855, + "language_loss": 0.94250202, + "learning_rate": 0.0009704002694370216, + "loss": 0.9542442, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 0.45410156, + "step": 711, + "time_per_iteration": 2.525240659713745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116324, + "balance_loss_mlp": 1.11820245, + "epoch": 0.13697575990765679, + "flos": 519623133696.0, + "grad_norm": 0.06677275778781674, + "language_loss": 0.90675253, + "learning_rate": 0.0009702945775362388, + "loss": 0.91838491, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 0.45043945, + "step": 712, + "time_per_iteration": 2.572566270828247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171995, + "balance_loss_mlp": 1.12478852, + "epoch": 0.13716814159292035, + "flos": 480388921344.0, + "grad_norm": 0.06549167744569931, + "language_loss": 0.91151595, + "learning_rate": 0.0009701887030509086, + "loss": 0.92323589, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 0.47167969, + "step": 713, + "time_per_iteration": 2.645202875137329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156874, + "balance_loss_mlp": 1.11450684, + "epoch": 0.1373605232781839, + "flos": 545650670592.0, + "grad_norm": 0.07696267649297317, + "language_loss": 0.95333648, + "learning_rate": 0.0009700826460221346, + "loss": 0.96490526, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 0.42382812, + "step": 714, + "time_per_iteration": 2.649831771850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187773, + "balance_loss_mlp": 1.13980293, + "epoch": 0.1375529049634475, + "flos": 708791648256.0, + "grad_norm": 0.08597126409557068, + "language_loss": 0.96336859, + "learning_rate": 0.0009699764064910921, + "loss": 0.97524625, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 0.47998047, + "step": 715, + "time_per_iteration": 2.8645238876342773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178216, + "balance_loss_mlp": 1.1317718, + "epoch": 0.13774528664871105, + "flos": 486696936960.0, + "grad_norm": 0.08366808602410432, + "language_loss": 0.90892398, + "learning_rate": 0.0009698699844990268, + "loss": 0.92070615, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 0.46435547, + "step": 716, + "time_per_iteration": 2.635460376739502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171183, + "balance_loss_mlp": 1.12731409, + "epoch": 0.1379376683339746, + "flos": 680199275520.0, + "grad_norm": 0.051528021496160425, + "language_loss": 0.91132116, + "learning_rate": 0.0009697633800872555, + "loss": 0.923033, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 0.4387207, + "step": 717, + "time_per_iteration": 2.887854814529419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189757, + "balance_loss_mlp": 1.1432178, + "epoch": 0.13813005001923817, + "flos": 610946924544.0, + "grad_norm": 0.07388540586481528, + "language_loss": 0.94422555, + "learning_rate": 0.0009696565932971655, + "loss": 0.95612311, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 0.46557617, + "step": 718, + "time_per_iteration": 2.8565313816070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171127, + "balance_loss_mlp": 1.12580407, + "epoch": 0.13832243170450173, + "flos": 588729378816.0, + "grad_norm": 0.06166568969162735, + "language_loss": 0.92794299, + "learning_rate": 0.0009695496241702153, + "loss": 0.93965423, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 0.45361328, + "step": 719, + "time_per_iteration": 2.827193021774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178611, + "balance_loss_mlp": 1.13152349, + "epoch": 0.1385148133897653, + "flos": 700002860544.0, + "grad_norm": 0.07046673128739296, + "language_loss": 0.8903814, + "learning_rate": 0.0009694424727479339, + "loss": 0.9021675, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 0.47094727, + "step": 720, + "time_per_iteration": 2.958855628967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011684, + "balance_loss_mlp": 1.12150323, + "epoch": 0.13870719507502885, + "flos": 598254543360.0, + "grad_norm": 0.07332050167219753, + "language_loss": 0.91946507, + "learning_rate": 0.0009693351390719213, + "loss": 0.93114913, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 0.46899414, + "step": 721, + "time_per_iteration": 2.6910197734832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012083, + "balance_loss_mlp": 1.15742183, + "epoch": 0.1388995767602924, + "flos": 586572378624.0, + "grad_norm": 0.06188248769550966, + "language_loss": 0.93531096, + "learning_rate": 0.000969227623183848, + "loss": 0.94739395, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 0.50830078, + "step": 722, + "time_per_iteration": 2.791097640991211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119483, + "balance_loss_mlp": 1.14776587, + "epoch": 0.139091958445556, + "flos": 651120145920.0, + "grad_norm": 0.06666345220966835, + "language_loss": 0.93550557, + "learning_rate": 0.0009691199251254554, + "loss": 0.94745386, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 0.47045898, + "step": 723, + "time_per_iteration": 2.8282151222229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173107, + "balance_loss_mlp": 1.13059711, + "epoch": 0.13928434013081956, + "flos": 575737818624.0, + "grad_norm": 0.07191970231420823, + "language_loss": 0.88703346, + "learning_rate": 0.0009690120449385555, + "loss": 0.89876461, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 0.42504883, + "step": 724, + "time_per_iteration": 2.775456190109253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158197, + "balance_loss_mlp": 1.11332655, + "epoch": 0.13947672181608312, + "flos": 563225674752.0, + "grad_norm": 0.06680700276551169, + "language_loss": 0.95181078, + "learning_rate": 0.0009689039826650312, + "loss": 0.96339279, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 0.44824219, + "step": 725, + "time_per_iteration": 2.7623417377471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164951, + "balance_loss_mlp": 1.12756717, + "epoch": 0.13966910350134668, + "flos": 1521546964992.0, + "grad_norm": 0.03995326528410751, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.77688015, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 0.37304688, + "step": 726, + "time_per_iteration": 4.914167642593384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146003, + "balance_loss_mlp": 1.09567261, + "epoch": 0.13986148518661023, + "flos": 499854053376.0, + "grad_norm": 0.07822541163530779, + "language_loss": 0.90488958, + "learning_rate": 0.0009686873120259941, + "loss": 0.91634959, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 0.50341797, + "step": 727, + "time_per_iteration": 2.563333749771118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132914, + "balance_loss_mlp": 1.09092879, + "epoch": 0.1400538668718738, + "flos": 598674488832.0, + "grad_norm": 0.0725242002086287, + "language_loss": 0.89161742, + "learning_rate": 0.0009685787037446004, + "loss": 0.90294659, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 0.41992188, + "step": 728, + "time_per_iteration": 2.7803192138671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137226, + "balance_loss_mlp": 1.09192598, + "epoch": 0.14024624855713735, + "flos": 594039287808.0, + "grad_norm": 0.10183800223701604, + "language_loss": 0.9064362, + "learning_rate": 0.0009684699135448201, + "loss": 0.91780847, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 0.453125, + "step": 729, + "time_per_iteration": 2.750023603439331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142476, + "balance_loss_mlp": 1.0995841, + "epoch": 0.1404386302424009, + "flos": 506584585728.0, + "grad_norm": 0.06503689668024501, + "language_loss": 0.94054115, + "learning_rate": 0.0009683609414688895, + "loss": 0.95196593, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 0.42895508, + "step": 730, + "time_per_iteration": 2.708470344543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116576, + "balance_loss_mlp": 1.11652613, + "epoch": 0.14063101192766447, + "flos": 573407921664.0, + "grad_norm": 0.07277464462784268, + "language_loss": 0.89072424, + "learning_rate": 0.0009682517875591154, + "loss": 0.9023819, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 0.49243164, + "step": 731, + "time_per_iteration": 2.734145402908325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173563, + "balance_loss_mlp": 1.12640429, + "epoch": 0.14082339361292806, + "flos": 564619133952.0, + "grad_norm": 0.08810260071203486, + "language_loss": 0.88790858, + "learning_rate": 0.0009681424518578749, + "loss": 0.8996442, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 0.47192383, + "step": 732, + "time_per_iteration": 2.7139203548431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166119, + "balance_loss_mlp": 1.11900759, + "epoch": 0.14101577529819162, + "flos": 463584798720.0, + "grad_norm": 0.07053265121681873, + "language_loss": 0.9010576, + "learning_rate": 0.000968032934407616, + "loss": 0.91271877, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 0.47143555, + "step": 733, + "time_per_iteration": 2.625128746032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161975, + "balance_loss_mlp": 1.11514974, + "epoch": 0.14120815698345518, + "flos": 596085060096.0, + "grad_norm": 0.08143861058365946, + "language_loss": 0.84579933, + "learning_rate": 0.0009679232352508571, + "loss": 0.85741913, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 0.46850586, + "step": 734, + "time_per_iteration": 2.7461798191070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145124, + "balance_loss_mlp": 1.10046864, + "epoch": 0.14140053866871874, + "flos": 535137311232.0, + "grad_norm": 0.0788084271092868, + "language_loss": 0.83272535, + "learning_rate": 0.0009678133544301871, + "loss": 0.84417665, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 0.44677734, + "step": 735, + "time_per_iteration": 2.68129301071167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130971, + "balance_loss_mlp": 1.08731616, + "epoch": 0.1415929203539823, + "flos": 520265534976.0, + "grad_norm": 0.05044431767963513, + "language_loss": 0.93706036, + "learning_rate": 0.0009677032919882658, + "loss": 0.94837004, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 0.43652344, + "step": 736, + "time_per_iteration": 2.663874387741089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141167, + "balance_loss_mlp": 1.0970124, + "epoch": 0.14178530203924586, + "flos": 482335948800.0, + "grad_norm": 0.07155994363363784, + "language_loss": 0.94151366, + "learning_rate": 0.000967593047967823, + "loss": 0.95292532, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 0.44116211, + "step": 737, + "time_per_iteration": 2.512871265411377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150426, + "balance_loss_mlp": 1.10376751, + "epoch": 0.14197768372450942, + "flos": 676638904320.0, + "grad_norm": 0.07145762863961741, + "language_loss": 0.89657855, + "learning_rate": 0.0009674826224116593, + "loss": 0.90808284, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 0.46655273, + "step": 738, + "time_per_iteration": 2.797337293624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145365, + "balance_loss_mlp": 1.09865868, + "epoch": 0.14217006540977298, + "flos": 446039529984.0, + "grad_norm": 0.07589062836694223, + "language_loss": 0.89765012, + "learning_rate": 0.0009673720153626455, + "loss": 0.90910375, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 0.46728516, + "step": 739, + "time_per_iteration": 2.5743062496185303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113843, + "balance_loss_mlp": 1.09274864, + "epoch": 0.14236244709503657, + "flos": 496503654912.0, + "grad_norm": 0.07239717331604524, + "language_loss": 0.89863205, + "learning_rate": 0.0009672612268637235, + "loss": 0.9100163, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 0.45678711, + "step": 740, + "time_per_iteration": 2.6074059009552 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125723, + "balance_loss_mlp": 1.08125818, + "epoch": 0.14255482878030012, + "flos": 648313403904.0, + "grad_norm": 0.08552249660547784, + "language_loss": 0.8725301, + "learning_rate": 0.0009671502569579048, + "loss": 0.88378727, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 0.44458008, + "step": 741, + "time_per_iteration": 2.729733467102051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116563, + "balance_loss_mlp": 1.07338512, + "epoch": 0.14274721046556368, + "flos": 536165153280.0, + "grad_norm": 0.05753110737252733, + "language_loss": 0.92330521, + "learning_rate": 0.0009670391056882719, + "loss": 0.93447083, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 0.43188477, + "step": 742, + "time_per_iteration": 2.69399356842041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115871, + "balance_loss_mlp": 1.07367063, + "epoch": 0.14293959215082724, + "flos": 957057431040.0, + "grad_norm": 0.06711892894426404, + "language_loss": 0.91615599, + "learning_rate": 0.0009669277730979776, + "loss": 0.92731464, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 0.421875, + "step": 743, + "time_per_iteration": 3.1732802391052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123454, + "balance_loss_mlp": 1.079561, + "epoch": 0.1431319738360908, + "flos": 693089519616.0, + "grad_norm": 0.07488288596065623, + "language_loss": 0.88249421, + "learning_rate": 0.0009668162592302449, + "loss": 0.89372879, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 0.43896484, + "step": 744, + "time_per_iteration": 2.88962459564209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114179, + "balance_loss_mlp": 1.09551311, + "epoch": 0.14332435552135436, + "flos": 565439574528.0, + "grad_norm": 0.08170086657731683, + "language_loss": 0.8873378, + "learning_rate": 0.0009667045641283676, + "loss": 0.89875567, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 0.46289062, + "step": 745, + "time_per_iteration": 2.6374380588531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136601, + "balance_loss_mlp": 1.09158731, + "epoch": 0.14351673720661792, + "flos": 738374787072.0, + "grad_norm": 0.07376324969806651, + "language_loss": 0.9752661, + "learning_rate": 0.0009665926878357092, + "loss": 0.98663211, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 0.44995117, + "step": 746, + "time_per_iteration": 2.908377170562744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138589, + "balance_loss_mlp": 1.09283662, + "epoch": 0.14370911889188148, + "flos": 549230865408.0, + "grad_norm": 0.055840413500964095, + "language_loss": 0.93229979, + "learning_rate": 0.0009664806303957043, + "loss": 0.94368571, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 0.45751953, + "step": 747, + "time_per_iteration": 2.6940197944641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116452, + "balance_loss_mlp": 1.11397541, + "epoch": 0.14390150057714507, + "flos": 590295734784.0, + "grad_norm": 0.07422855656653271, + "language_loss": 0.89923358, + "learning_rate": 0.0009663683918518571, + "loss": 0.91087878, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 0.50463867, + "step": 748, + "time_per_iteration": 2.8905599117279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162632, + "balance_loss_mlp": 1.10977423, + "epoch": 0.14409388226240863, + "flos": 591047165952.0, + "grad_norm": 0.06951396400432043, + "language_loss": 0.88074797, + "learning_rate": 0.0009662559722477428, + "loss": 0.89237428, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 0.52880859, + "step": 749, + "time_per_iteration": 2.6684882640838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111749, + "balance_loss_mlp": 1.09059644, + "epoch": 0.1442862639476722, + "flos": 1511263401984.0, + "grad_norm": 0.031134761916572575, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77280462, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 0.26953125, + "step": 750, + "time_per_iteration": 4.978729009628296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141518, + "balance_loss_mlp": 1.09359622, + "epoch": 0.14447864563293575, + "flos": 496765384704.0, + "grad_norm": 0.06451546089111031, + "language_loss": 0.9124738, + "learning_rate": 0.0009660305900333632, + "loss": 0.92388898, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 0.47973633, + "step": 751, + "time_per_iteration": 2.6556403636932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145582, + "balance_loss_mlp": 1.09849465, + "epoch": 0.1446710273181993, + "flos": 589678299648.0, + "grad_norm": 0.08083819383046088, + "language_loss": 0.8480792, + "learning_rate": 0.0009659176275105992, + "loss": 0.85953498, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 0.47070312, + "step": 752, + "time_per_iteration": 2.6868016719818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154886, + "balance_loss_mlp": 1.10667825, + "epoch": 0.14486340900346287, + "flos": 585818749440.0, + "grad_norm": 0.0601727082776222, + "language_loss": 0.87400204, + "learning_rate": 0.0009658044841025701, + "loss": 0.88555086, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 0.48217773, + "step": 753, + "time_per_iteration": 2.7701456546783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189813, + "balance_loss_mlp": 1.136765, + "epoch": 0.14505579068872643, + "flos": 504672062976.0, + "grad_norm": 0.0800468655776831, + "language_loss": 0.83957088, + "learning_rate": 0.0009656911598532021, + "loss": 0.85146904, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 0.53051758, + "step": 754, + "time_per_iteration": 2.630211353302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192149, + "balance_loss_mlp": 1.13943434, + "epoch": 0.14524817237399, + "flos": 486815505408.0, + "grad_norm": 0.0631545589319864, + "language_loss": 0.9278729, + "learning_rate": 0.0009655776548064917, + "loss": 0.93979442, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 0.52758789, + "step": 755, + "time_per_iteration": 2.6447510719299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176758, + "balance_loss_mlp": 1.12506902, + "epoch": 0.14544055405925355, + "flos": 728175287808.0, + "grad_norm": 0.06497808848967317, + "language_loss": 0.90460694, + "learning_rate": 0.0009654639690065054, + "loss": 0.91637456, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 0.51708984, + "step": 756, + "time_per_iteration": 2.910578727722168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116602, + "balance_loss_mlp": 1.11785972, + "epoch": 0.14563293574451713, + "flos": 593643935232.0, + "grad_norm": 0.0580393303136577, + "language_loss": 0.90340179, + "learning_rate": 0.00096535010249738, + "loss": 0.91506201, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 0.48120117, + "step": 757, + "time_per_iteration": 2.7232277393341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149847, + "balance_loss_mlp": 1.10092402, + "epoch": 0.1458253174297807, + "flos": 560478030336.0, + "grad_norm": 0.07370663524734816, + "language_loss": 0.8531146, + "learning_rate": 0.0009652360553233224, + "loss": 0.86461306, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 0.48901367, + "step": 758, + "time_per_iteration": 2.7501397132873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064081, + "balance_loss_mlp": 1.03528047, + "epoch": 0.14601769911504425, + "flos": 1557855866880.0, + "grad_norm": 0.02263224740377231, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.74837828, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 0.28710938, + "step": 759, + "time_per_iteration": 4.953639268875122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150341, + "balance_loss_mlp": 1.1019187, + "epoch": 0.1462100808003078, + "flos": 866301516288.0, + "grad_norm": 0.05750780582661247, + "language_loss": 0.83513778, + "learning_rate": 0.0009650074191575883, + "loss": 0.84664118, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 0.48388672, + "step": 760, + "time_per_iteration": 3.202252149581909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152626, + "balance_loss_mlp": 1.10179496, + "epoch": 0.14640246248557137, + "flos": 522943796736.0, + "grad_norm": 0.05303129095981597, + "language_loss": 0.88240772, + "learning_rate": 0.0009648928302546766, + "loss": 0.89393395, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 0.50878906, + "step": 761, + "time_per_iteration": 2.65380859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147286, + "balance_loss_mlp": 1.09960222, + "epoch": 0.14659484417083493, + "flos": 1030544487936.0, + "grad_norm": 0.06114398209353547, + "language_loss": 0.87573165, + "learning_rate": 0.0009647780608643613, + "loss": 0.88720453, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 0.47705078, + "step": 762, + "time_per_iteration": 3.3394339084625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153999, + "balance_loss_mlp": 1.10831833, + "epoch": 0.1467872258560985, + "flos": 500671922688.0, + "grad_norm": 0.09093438426480749, + "language_loss": 0.90765309, + "learning_rate": 0.0009646631110312001, + "loss": 0.91919315, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 0.45678711, + "step": 763, + "time_per_iteration": 2.622671604156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157806, + "balance_loss_mlp": 1.11200595, + "epoch": 0.14697960754136205, + "flos": 547797758976.0, + "grad_norm": 0.047784585244551814, + "language_loss": 0.90468627, + "learning_rate": 0.0009645479807998203, + "loss": 0.91626436, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 0.45751953, + "step": 764, + "time_per_iteration": 2.7322580814361572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156125, + "balance_loss_mlp": 1.11487842, + "epoch": 0.14717198922662564, + "flos": 517849003008.0, + "grad_norm": 0.06523928090243644, + "language_loss": 0.94106412, + "learning_rate": 0.0009644326702149196, + "loss": 0.95262539, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 0.41235352, + "step": 765, + "time_per_iteration": 2.7013158798217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174535, + "balance_loss_mlp": 1.12761474, + "epoch": 0.1473643709118892, + "flos": 732024552960.0, + "grad_norm": 0.08055574364553787, + "language_loss": 0.86730242, + "learning_rate": 0.0009643171793212653, + "loss": 0.87904775, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 0.46923828, + "step": 766, + "time_per_iteration": 3.083709478378296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162662, + "balance_loss_mlp": 1.11473966, + "epoch": 0.14755675259715276, + "flos": 620538900480.0, + "grad_norm": 0.07722330054572468, + "language_loss": 0.92188174, + "learning_rate": 0.0009642015081636952, + "loss": 0.93350834, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 0.47949219, + "step": 767, + "time_per_iteration": 2.6836585998535156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161372, + "balance_loss_mlp": 1.1132586, + "epoch": 0.14774913428241632, + "flos": 452219065344.0, + "grad_norm": 0.07123168873353844, + "language_loss": 0.90995437, + "learning_rate": 0.0009640856567871166, + "loss": 0.9215681, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 0.48168945, + "step": 768, + "time_per_iteration": 2.543670177459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156907, + "balance_loss_mlp": 1.10626745, + "epoch": 0.14794151596767988, + "flos": 837234869760.0, + "grad_norm": 0.07039727350928661, + "language_loss": 0.9123286, + "learning_rate": 0.0009639696252365072, + "loss": 0.92389768, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 0.50634766, + "step": 769, + "time_per_iteration": 3.027188539505005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146634, + "balance_loss_mlp": 1.10326576, + "epoch": 0.14813389765294344, + "flos": 686092114944.0, + "grad_norm": 0.06094559984807647, + "language_loss": 0.83659029, + "learning_rate": 0.0009638534135569144, + "loss": 0.84805667, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 0.43359375, + "step": 770, + "time_per_iteration": 2.9126267433166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140788, + "balance_loss_mlp": 1.09489226, + "epoch": 0.148326279338207, + "flos": 509887996416.0, + "grad_norm": 0.06702358278695762, + "language_loss": 0.92293191, + "learning_rate": 0.0009637370217934554, + "loss": 0.93433982, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 0.45922852, + "step": 771, + "time_per_iteration": 2.6426541805267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129633, + "balance_loss_mlp": 1.08600211, + "epoch": 0.14851866102347056, + "flos": 588161129472.0, + "grad_norm": 0.04968709901212579, + "language_loss": 0.84857935, + "learning_rate": 0.0009636204499913175, + "loss": 0.85987568, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 0.43603516, + "step": 772, + "time_per_iteration": 2.830029010772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122236, + "balance_loss_mlp": 1.08478057, + "epoch": 0.14871104270873411, + "flos": 691026494976.0, + "grad_norm": 0.06444605868824185, + "language_loss": 0.90028566, + "learning_rate": 0.0009635036981957581, + "loss": 0.91150796, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 0.37451172, + "step": 773, + "time_per_iteration": 2.850893259048462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128047, + "balance_loss_mlp": 1.08546507, + "epoch": 0.1489034243939977, + "flos": 655098264576.0, + "grad_norm": 0.07558916443605426, + "language_loss": 0.92137265, + "learning_rate": 0.0009633867664521043, + "loss": 0.93265319, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 0.42553711, + "step": 774, + "time_per_iteration": 2.8405416011810303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154281, + "balance_loss_mlp": 1.10614467, + "epoch": 0.14909580607926126, + "flos": 475835212800.0, + "grad_norm": 0.07793461844194936, + "language_loss": 0.8938297, + "learning_rate": 0.0009632696548057527, + "loss": 0.9053725, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 0.48168945, + "step": 775, + "time_per_iteration": 2.5543088912963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158921, + "balance_loss_mlp": 1.11419404, + "epoch": 0.14928818776452482, + "flos": 611087887872.0, + "grad_norm": 0.07948352168051111, + "language_loss": 0.86982578, + "learning_rate": 0.0009631523633021704, + "loss": 0.88141501, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 0.44702148, + "step": 776, + "time_per_iteration": 2.8373982906341553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151944, + "balance_loss_mlp": 1.10726452, + "epoch": 0.14948056944978838, + "flos": 561772744704.0, + "grad_norm": 0.07613081492567164, + "language_loss": 0.90593684, + "learning_rate": 0.0009630348919868936, + "loss": 0.91745627, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 0.4465332, + "step": 777, + "time_per_iteration": 2.688340187072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164011, + "balance_loss_mlp": 1.1162796, + "epoch": 0.14967295113505194, + "flos": 449199779328.0, + "grad_norm": 0.07284380806791231, + "language_loss": 0.83743048, + "learning_rate": 0.0009629172409055293, + "loss": 0.84907055, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 0.47753906, + "step": 778, + "time_per_iteration": 2.496121406555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173184, + "balance_loss_mlp": 1.13260555, + "epoch": 0.1498653328203155, + "flos": 571285426176.0, + "grad_norm": 0.0582041699055768, + "language_loss": 0.89173234, + "learning_rate": 0.0009627994101037531, + "loss": 0.9034642, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 0.40576172, + "step": 779, + "time_per_iteration": 2.7287445068359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116917, + "balance_loss_mlp": 1.12670779, + "epoch": 0.15005771450557906, + "flos": 631215244800.0, + "grad_norm": 0.06429714570378213, + "language_loss": 0.91374522, + "learning_rate": 0.0009626813996273114, + "loss": 0.92543697, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 0.42431641, + "step": 780, + "time_per_iteration": 2.8357532024383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174237, + "balance_loss_mlp": 1.13258517, + "epoch": 0.15025009619084262, + "flos": 577939235328.0, + "grad_norm": 0.07735356487079731, + "language_loss": 0.90820873, + "learning_rate": 0.0009625632095220198, + "loss": 0.91995108, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 0.41625977, + "step": 781, + "time_per_iteration": 2.8360986709594727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165333, + "balance_loss_mlp": 1.12408686, + "epoch": 0.1504424778761062, + "flos": 483887623680.0, + "grad_norm": 0.07591811383481707, + "language_loss": 0.88784671, + "learning_rate": 0.0009624448398337637, + "loss": 0.89950007, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 0.41259766, + "step": 782, + "time_per_iteration": 2.550873041152954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138762, + "balance_loss_mlp": 1.09920812, + "epoch": 0.15063485956136977, + "flos": 762512196096.0, + "grad_norm": 0.06500535683801296, + "language_loss": 0.90907973, + "learning_rate": 0.0009623262906084984, + "loss": 0.92046738, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 0.39550781, + "step": 783, + "time_per_iteration": 3.002237319946289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127947, + "balance_loss_mlp": 1.08622408, + "epoch": 0.15082724124663333, + "flos": 497630241792.0, + "grad_norm": 0.06722303964642193, + "language_loss": 0.92323947, + "learning_rate": 0.0009622075618922486, + "loss": 0.93451893, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 0.41699219, + "step": 784, + "time_per_iteration": 2.669541120529175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117088, + "balance_loss_mlp": 1.07636571, + "epoch": 0.15101962293189689, + "flos": 509725011456.0, + "grad_norm": 0.06286377137641418, + "language_loss": 0.88948303, + "learning_rate": 0.0009620886537311091, + "loss": 0.90065384, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 0.40722656, + "step": 785, + "time_per_iteration": 2.6505391597747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132411, + "balance_loss_mlp": 1.08563375, + "epoch": 0.15121200461716044, + "flos": 457756199424.0, + "grad_norm": 0.06858268632652799, + "language_loss": 0.87318397, + "learning_rate": 0.000961969566171244, + "loss": 0.88450807, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 0.46777344, + "step": 786, + "time_per_iteration": 2.5492002964019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143812, + "balance_loss_mlp": 1.10037243, + "epoch": 0.151404386302424, + "flos": 537986271744.0, + "grad_norm": 0.06762455123923776, + "language_loss": 0.9226557, + "learning_rate": 0.0009618502992588873, + "loss": 0.93409383, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 0.43481445, + "step": 787, + "time_per_iteration": 2.6596381664276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153043, + "balance_loss_mlp": 1.10714722, + "epoch": 0.15159676798768756, + "flos": 688209467904.0, + "grad_norm": 0.07210135364095939, + "language_loss": 0.90213263, + "learning_rate": 0.0009617308530403424, + "loss": 0.91366303, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 0.45922852, + "step": 788, + "time_per_iteration": 2.9965012073516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133874, + "balance_loss_mlp": 1.09358144, + "epoch": 0.15178914967295112, + "flos": 545319558144.0, + "grad_norm": 0.0646084728999688, + "language_loss": 0.89177096, + "learning_rate": 0.0009616112275619825, + "loss": 0.90310967, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 0.40283203, + "step": 789, + "time_per_iteration": 2.702927350997925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128515, + "balance_loss_mlp": 1.08760214, + "epoch": 0.1519815313582147, + "flos": 511770783744.0, + "grad_norm": 0.04914514873585108, + "language_loss": 0.85434246, + "learning_rate": 0.0009614914228702503, + "loss": 0.86562753, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 0.40917969, + "step": 790, + "time_per_iteration": 2.734309196472168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120439, + "balance_loss_mlp": 1.08031344, + "epoch": 0.15217391304347827, + "flos": 684088187904.0, + "grad_norm": 0.0510031662309952, + "language_loss": 0.90581405, + "learning_rate": 0.0009613714390116581, + "loss": 0.91701841, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 0.40112305, + "step": 791, + "time_per_iteration": 2.9846036434173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119178, + "balance_loss_mlp": 1.07890868, + "epoch": 0.15236629472874183, + "flos": 644186981376.0, + "grad_norm": 0.06466161117660295, + "language_loss": 0.87842512, + "learning_rate": 0.0009612512760327879, + "loss": 0.88961697, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 0.40283203, + "step": 792, + "time_per_iteration": 2.879507303237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112132, + "balance_loss_mlp": 1.0749234, + "epoch": 0.1525586764140054, + "flos": 412876196352.0, + "grad_norm": 0.06761791569724282, + "language_loss": 0.86834276, + "learning_rate": 0.0009611309339802909, + "loss": 0.87955594, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 0.46435547, + "step": 793, + "time_per_iteration": 2.4628419876098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125819, + "balance_loss_mlp": 1.08180666, + "epoch": 0.15275105809926895, + "flos": 802801414656.0, + "grad_norm": 0.06955338926819006, + "language_loss": 0.85776877, + "learning_rate": 0.0009610104129008881, + "loss": 0.86902696, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 0.43994141, + "step": 794, + "time_per_iteration": 3.1157610416412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112048, + "balance_loss_mlp": 1.07751703, + "epoch": 0.1529434397845325, + "flos": 612422249472.0, + "grad_norm": 0.0812849574801687, + "language_loss": 0.89832217, + "learning_rate": 0.0009608897128413701, + "loss": 0.90952694, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 0.4296875, + "step": 795, + "time_per_iteration": 2.7580387592315674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121281, + "balance_loss_mlp": 1.08070254, + "epoch": 0.15313582146979607, + "flos": 615246243840.0, + "grad_norm": 0.07320179377966478, + "language_loss": 0.87414771, + "learning_rate": 0.0009607688338485965, + "loss": 0.88536048, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 0.40576172, + "step": 796, + "time_per_iteration": 2.8428006172180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112803, + "balance_loss_mlp": 1.08358848, + "epoch": 0.15332820315505963, + "flos": 793602593280.0, + "grad_norm": 0.08676784428227541, + "language_loss": 0.92063487, + "learning_rate": 0.0009606477759694969, + "loss": 0.93191516, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 0.4440918, + "step": 797, + "time_per_iteration": 3.0136139392852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129989, + "balance_loss_mlp": 1.08547592, + "epoch": 0.1535205848403232, + "flos": 550206950400.0, + "grad_norm": 0.07379760567815713, + "language_loss": 0.89430279, + "learning_rate": 0.0009605265392510703, + "loss": 0.90560269, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 0.44555664, + "step": 798, + "time_per_iteration": 2.604297161102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114561, + "balance_loss_mlp": 1.10169339, + "epoch": 0.15371296652558677, + "flos": 535947840000.0, + "grad_norm": 0.06797963908333281, + "language_loss": 0.93481082, + "learning_rate": 0.0009604051237403846, + "loss": 0.94626689, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 0.43896484, + "step": 799, + "time_per_iteration": 2.613255262374878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167168, + "balance_loss_mlp": 1.1217972, + "epoch": 0.15390534821085033, + "flos": 395219699712.0, + "grad_norm": 0.06891264186704958, + "language_loss": 0.88271165, + "learning_rate": 0.0009602835294845776, + "loss": 0.89438331, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 0.45361328, + "step": 800, + "time_per_iteration": 2.4739739894866943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011684, + "balance_loss_mlp": 1.12188447, + "epoch": 0.1540977298961139, + "flos": 535846523904.0, + "grad_norm": 0.06820302888180714, + "language_loss": 0.91848779, + "learning_rate": 0.0009601617565308565, + "loss": 0.93017173, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 0.46557617, + "step": 801, + "time_per_iteration": 2.599102020263672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196072, + "balance_loss_mlp": 1.14941311, + "epoch": 0.15429011158137745, + "flos": 723727664640.0, + "grad_norm": 0.08155438121007776, + "language_loss": 0.88506758, + "learning_rate": 0.0009600398049264977, + "loss": 0.89702827, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 0.46679688, + "step": 802, + "time_per_iteration": 2.9645981788635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193217, + "balance_loss_mlp": 1.14574742, + "epoch": 0.154482493266641, + "flos": 620516505600.0, + "grad_norm": 0.10468166660144326, + "language_loss": 0.93512642, + "learning_rate": 0.0009599176747188469, + "loss": 0.94705856, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 0.47485352, + "step": 803, + "time_per_iteration": 2.7997000217437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160191, + "balance_loss_mlp": 1.11856318, + "epoch": 0.15467487495190457, + "flos": 525624629760.0, + "grad_norm": 0.07174757520021151, + "language_loss": 0.84728193, + "learning_rate": 0.0009597953659553196, + "loss": 0.85888386, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 0.41625977, + "step": 804, + "time_per_iteration": 2.700530529022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133685, + "balance_loss_mlp": 1.09408379, + "epoch": 0.15486725663716813, + "flos": 527729872896.0, + "grad_norm": 0.4143347029392257, + "language_loss": 0.9033978, + "learning_rate": 0.0009596728786833997, + "loss": 0.91473466, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 0.39575195, + "step": 805, + "time_per_iteration": 2.6122889518737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150784, + "balance_loss_mlp": 1.10772574, + "epoch": 0.1550596383224317, + "flos": 1048549349376.0, + "grad_norm": 0.061887733402931855, + "language_loss": 0.91321814, + "learning_rate": 0.0009595502129506415, + "loss": 0.92472601, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 0.43066406, + "step": 806, + "time_per_iteration": 3.336061716079712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180508, + "balance_loss_mlp": 1.13694847, + "epoch": 0.15525202000769528, + "flos": 613716963840.0, + "grad_norm": 0.06807019640067784, + "language_loss": 0.84292483, + "learning_rate": 0.0009594273688046678, + "loss": 0.85472989, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 0.43579102, + "step": 807, + "time_per_iteration": 2.709182024002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210101, + "balance_loss_mlp": 1.15960383, + "epoch": 0.15544440169295884, + "flos": 533064374784.0, + "grad_norm": 0.0856522073787927, + "language_loss": 0.8780278, + "learning_rate": 0.000959304346293171, + "loss": 0.89012885, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 0.50512695, + "step": 808, + "time_per_iteration": 2.6307153701782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236008, + "balance_loss_mlp": 1.18305564, + "epoch": 0.1556367833782224, + "flos": 644723297280.0, + "grad_norm": 0.09531038088821206, + "language_loss": 0.90107393, + "learning_rate": 0.0009591811454639125, + "loss": 0.91343403, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 0.52954102, + "step": 809, + "time_per_iteration": 2.742725372314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197955, + "balance_loss_mlp": 1.15184498, + "epoch": 0.15582916506348596, + "flos": 543822211584.0, + "grad_norm": 0.06212883071305714, + "language_loss": 0.902493, + "learning_rate": 0.0009590577663647234, + "loss": 0.91447246, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 0.4609375, + "step": 810, + "time_per_iteration": 2.711411237716675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187108, + "balance_loss_mlp": 1.13837492, + "epoch": 0.15602154674874952, + "flos": 580034566656.0, + "grad_norm": 0.06321996034865444, + "language_loss": 0.88015836, + "learning_rate": 0.0009589342090435036, + "loss": 0.8920294, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 0.48779297, + "step": 811, + "time_per_iteration": 2.763784170150757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173697, + "balance_loss_mlp": 1.12610841, + "epoch": 0.15621392843401308, + "flos": 535248539136.0, + "grad_norm": 0.07315119709604147, + "language_loss": 0.89953744, + "learning_rate": 0.0009588104735482223, + "loss": 0.91127443, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 0.47631836, + "step": 812, + "time_per_iteration": 2.645106077194214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169234, + "balance_loss_mlp": 1.12019134, + "epoch": 0.15640631011927664, + "flos": 550903680000.0, + "grad_norm": 0.06895714089970095, + "language_loss": 0.86002952, + "learning_rate": 0.0009586865599269177, + "loss": 0.87172186, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 0.49047852, + "step": 813, + "time_per_iteration": 2.6313953399658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144128, + "balance_loss_mlp": 1.09851837, + "epoch": 0.1565986918045402, + "flos": 637478843904.0, + "grad_norm": 0.06467027207336487, + "language_loss": 0.90443802, + "learning_rate": 0.0009585624682276977, + "loss": 0.91587937, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 0.45605469, + "step": 814, + "time_per_iteration": 2.7377047538757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144046, + "balance_loss_mlp": 1.09705353, + "epoch": 0.15679107348980378, + "flos": 490810876416.0, + "grad_norm": 0.06824176290368998, + "language_loss": 0.89156437, + "learning_rate": 0.0009584381984987386, + "loss": 0.90300483, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 0.47021484, + "step": 815, + "time_per_iteration": 2.5524120330810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134862, + "balance_loss_mlp": 1.09225655, + "epoch": 0.15698345517506734, + "flos": 529951113216.0, + "grad_norm": 0.061358262400161866, + "language_loss": 0.92449033, + "learning_rate": 0.0009583137507882864, + "loss": 0.93583906, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 0.42626953, + "step": 816, + "time_per_iteration": 2.699207305908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134768, + "balance_loss_mlp": 1.08698916, + "epoch": 0.1571758368603309, + "flos": 546038682624.0, + "grad_norm": 0.06309616730716378, + "language_loss": 0.82620019, + "learning_rate": 0.000958189125144656, + "loss": 0.8375479, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 0.47851562, + "step": 817, + "time_per_iteration": 2.6626293659210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142354, + "balance_loss_mlp": 1.09493256, + "epoch": 0.15736821854559446, + "flos": 565649547264.0, + "grad_norm": 0.08013787804574789, + "language_loss": 0.90297949, + "learning_rate": 0.0009580643216162313, + "loss": 0.91440302, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 0.47436523, + "step": 818, + "time_per_iteration": 2.6708288192749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143086, + "balance_loss_mlp": 1.09368527, + "epoch": 0.15756060023085802, + "flos": 500956047360.0, + "grad_norm": 0.06582812199168771, + "language_loss": 0.82167578, + "learning_rate": 0.0009579393402514652, + "loss": 0.83310658, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 0.49389648, + "step": 819, + "time_per_iteration": 2.577592611312866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142904, + "balance_loss_mlp": 1.09898734, + "epoch": 0.15775298191612158, + "flos": 519264857088.0, + "grad_norm": 0.07647809261390527, + "language_loss": 0.92362559, + "learning_rate": 0.0009578141810988801, + "loss": 0.93505466, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 0.43920898, + "step": 820, + "time_per_iteration": 2.5464515686035156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152369, + "balance_loss_mlp": 1.10678363, + "epoch": 0.15794536360138514, + "flos": 466129810944.0, + "grad_norm": 0.07136182637629812, + "language_loss": 0.92042351, + "learning_rate": 0.0009576888442070668, + "loss": 0.93194717, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 0.45556641, + "step": 821, + "time_per_iteration": 2.5755786895751953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114609, + "balance_loss_mlp": 1.10288835, + "epoch": 0.1581377452866487, + "flos": 517162185216.0, + "grad_norm": 0.08295395391365894, + "language_loss": 0.94583452, + "learning_rate": 0.0009575633296246854, + "loss": 0.95729542, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 0.43212891, + "step": 822, + "time_per_iteration": 2.5701425075531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162298, + "balance_loss_mlp": 1.11821485, + "epoch": 0.15833012697191226, + "flos": 549784433664.0, + "grad_norm": 0.06548151577025092, + "language_loss": 0.85385978, + "learning_rate": 0.0009574376374004652, + "loss": 0.86548281, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 0.44116211, + "step": 823, + "time_per_iteration": 2.622905731201172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169738, + "balance_loss_mlp": 1.12019491, + "epoch": 0.15852250865717585, + "flos": 487457906688.0, + "grad_norm": 0.1009087476503521, + "language_loss": 0.82624936, + "learning_rate": 0.000957311767583204, + "loss": 0.83794677, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 0.49536133, + "step": 824, + "time_per_iteration": 2.5683999061584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196161, + "balance_loss_mlp": 1.1752758, + "epoch": 0.1587148903424394, + "flos": 1309770694656.0, + "grad_norm": 0.05150472419389455, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.83267754, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 0.20898438, + "step": 825, + "time_per_iteration": 4.722898960113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176776, + "balance_loss_mlp": 1.12170124, + "epoch": 0.15890727202770297, + "flos": 466873528320.0, + "grad_norm": 0.10062471557735768, + "language_loss": 0.94017303, + "learning_rate": 0.0009570594953650961, + "loss": 0.95194077, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 0.55029297, + "step": 826, + "time_per_iteration": 2.5394840240478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173437, + "balance_loss_mlp": 1.12091362, + "epoch": 0.15909965371296653, + "flos": 777107188224.0, + "grad_norm": 0.0719939675894647, + "language_loss": 0.8219676, + "learning_rate": 0.00095693309306219, + "loss": 0.83370197, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 0.52612305, + "step": 827, + "time_per_iteration": 3.0926811695098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178421, + "balance_loss_mlp": 1.12434745, + "epoch": 0.1592920353982301, + "flos": 1078273451520.0, + "grad_norm": 0.06038838021195225, + "language_loss": 0.90083122, + "learning_rate": 0.0009568065133621244, + "loss": 0.91261542, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 0.54077148, + "step": 828, + "time_per_iteration": 3.315122604370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164888, + "balance_loss_mlp": 1.12013662, + "epoch": 0.15948441708349365, + "flos": 725622935040.0, + "grad_norm": 0.07025990147709567, + "language_loss": 0.87178355, + "learning_rate": 0.0009566797563140422, + "loss": 0.88343245, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 0.44775391, + "step": 829, + "time_per_iteration": 2.8680243492126465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116659, + "balance_loss_mlp": 1.11912107, + "epoch": 0.1596767987687572, + "flos": 578771785728.0, + "grad_norm": 0.061296828426512996, + "language_loss": 0.89984798, + "learning_rate": 0.0009565528219671547, + "loss": 0.91151381, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 0.47460938, + "step": 830, + "time_per_iteration": 2.9325318336486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160899, + "balance_loss_mlp": 1.1076839, + "epoch": 0.15986918045402077, + "flos": 528987511296.0, + "grad_norm": 0.07652275644998038, + "language_loss": 0.86699682, + "learning_rate": 0.0009564257103707418, + "loss": 0.87860584, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 0.53198242, + "step": 831, + "time_per_iteration": 2.598191976547241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184474, + "balance_loss_mlp": 1.12973261, + "epoch": 0.16006156213928435, + "flos": 574584067584.0, + "grad_norm": 0.08337472663089728, + "language_loss": 0.92543364, + "learning_rate": 0.0009562984215741533, + "loss": 0.93727839, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 0.54736328, + "step": 832, + "time_per_iteration": 2.676666736602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160796, + "balance_loss_mlp": 1.11177731, + "epoch": 0.1602539438245479, + "flos": 515541127680.0, + "grad_norm": 0.05762908483075192, + "language_loss": 0.8408711, + "learning_rate": 0.0009561709556268065, + "loss": 0.85247904, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 0.49047852, + "step": 833, + "time_per_iteration": 2.7075538635253906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162528, + "balance_loss_mlp": 1.11141133, + "epoch": 0.16044632550981147, + "flos": 621015745536.0, + "grad_norm": 0.06044842900072245, + "language_loss": 0.96042889, + "learning_rate": 0.0009560433125781884, + "loss": 0.97205412, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 0.51171875, + "step": 834, + "time_per_iteration": 2.7619521617889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144978, + "balance_loss_mlp": 1.09130979, + "epoch": 0.16063870719507503, + "flos": 561078586368.0, + "grad_norm": 0.06441579465763399, + "language_loss": 0.94159138, + "learning_rate": 0.0009559154924778544, + "loss": 0.95304114, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 0.53686523, + "step": 835, + "time_per_iteration": 2.7467222213745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112951, + "balance_loss_mlp": 1.08218372, + "epoch": 0.1608310888803386, + "flos": 805133882880.0, + "grad_norm": 0.07312538570388089, + "language_loss": 0.86469144, + "learning_rate": 0.0009557874953754284, + "loss": 0.87598646, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 0.47314453, + "step": 836, + "time_per_iteration": 3.0907793045043945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126281, + "balance_loss_mlp": 1.07618928, + "epoch": 0.16102347056560215, + "flos": 600587011584.0, + "grad_norm": 0.08101808751207061, + "language_loss": 0.85894346, + "learning_rate": 0.0009556593213206038, + "loss": 0.87020624, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 0.5012207, + "step": 837, + "time_per_iteration": 2.7060487270355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122479, + "balance_loss_mlp": 1.07765627, + "epoch": 0.1612158522508657, + "flos": 553510361088.0, + "grad_norm": 0.060960398488271, + "language_loss": 0.89031309, + "learning_rate": 0.0009555309703631414, + "loss": 0.9015379, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 0.44848633, + "step": 838, + "time_per_iteration": 2.6838622093200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131245, + "balance_loss_mlp": 1.07853079, + "epoch": 0.16140823393612927, + "flos": 555963969024.0, + "grad_norm": 0.0637381399971671, + "language_loss": 0.88547724, + "learning_rate": 0.0009554024425528722, + "loss": 0.89678967, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 0.52685547, + "step": 839, + "time_per_iteration": 2.7301504611968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124486, + "balance_loss_mlp": 1.07978272, + "epoch": 0.16160061562139286, + "flos": 543871770624.0, + "grad_norm": 0.0692663948027758, + "language_loss": 0.90811443, + "learning_rate": 0.0009552737379396948, + "loss": 0.91935933, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 0.44726562, + "step": 840, + "time_per_iteration": 2.6181893348693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129368, + "balance_loss_mlp": 1.08208978, + "epoch": 0.16179299730665642, + "flos": 603873169920.0, + "grad_norm": 0.06449676765287365, + "language_loss": 0.89640445, + "learning_rate": 0.0009551448565735767, + "loss": 0.90769809, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 0.47265625, + "step": 841, + "time_per_iteration": 2.731602907180786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135502, + "balance_loss_mlp": 1.08555281, + "epoch": 0.16198537899191998, + "flos": 787166097408.0, + "grad_norm": 0.07291825437583387, + "language_loss": 0.86443651, + "learning_rate": 0.0009550157985045543, + "loss": 0.87579155, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 0.49926758, + "step": 842, + "time_per_iteration": 3.0523600578308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113388, + "balance_loss_mlp": 1.08724499, + "epoch": 0.16217776067718354, + "flos": 519805942272.0, + "grad_norm": 0.06222432903322319, + "language_loss": 0.90556312, + "learning_rate": 0.0009548865637827321, + "loss": 0.91690183, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 0.46630859, + "step": 843, + "time_per_iteration": 2.6370396614074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113271, + "balance_loss_mlp": 1.08757734, + "epoch": 0.1623701423624471, + "flos": 505262707200.0, + "grad_norm": 0.07459586377830821, + "language_loss": 0.91347718, + "learning_rate": 0.0009547571524582838, + "loss": 0.92480427, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 0.45141602, + "step": 844, + "time_per_iteration": 2.5717859268188477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142931, + "balance_loss_mlp": 1.09460354, + "epoch": 0.16256252404771065, + "flos": 497183132160.0, + "grad_norm": 0.08463351541898638, + "language_loss": 0.94371468, + "learning_rate": 0.0009546275645814512, + "loss": 0.95514405, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 0.48339844, + "step": 845, + "time_per_iteration": 2.632861375808716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117157, + "balance_loss_mlp": 1.12107265, + "epoch": 0.16275490573297421, + "flos": 502344737280.0, + "grad_norm": 0.08033911629378378, + "language_loss": 0.92129737, + "learning_rate": 0.0009544978002025446, + "loss": 0.93301302, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 0.50561523, + "step": 846, + "time_per_iteration": 2.7044737339019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193795, + "balance_loss_mlp": 1.14096177, + "epoch": 0.16294728741823777, + "flos": 507231756288.0, + "grad_norm": 0.052695226385161484, + "language_loss": 0.88037688, + "learning_rate": 0.0009543678593719434, + "loss": 0.89231491, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 0.52880859, + "step": 847, + "time_per_iteration": 2.798231601715088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208721, + "balance_loss_mlp": 1.15734136, + "epoch": 0.16313966910350133, + "flos": 509685364224.0, + "grad_norm": 0.056853368929671785, + "language_loss": 0.88963962, + "learning_rate": 0.0009542377421400945, + "loss": 0.90172684, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 0.51391602, + "step": 848, + "time_per_iteration": 2.7955727577209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122402, + "balance_loss_mlp": 1.16584587, + "epoch": 0.16333205078876492, + "flos": 543980427264.0, + "grad_norm": 0.06352967983147602, + "language_loss": 0.85259467, + "learning_rate": 0.0009541074485575145, + "loss": 0.86483485, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 0.58154297, + "step": 849, + "time_per_iteration": 2.703871488571167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225002, + "balance_loss_mlp": 1.17088127, + "epoch": 0.16352443247402848, + "flos": 507723655680.0, + "grad_norm": 0.07774946886845908, + "language_loss": 0.93468195, + "learning_rate": 0.0009539769786747874, + "loss": 0.94693196, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 0.54125977, + "step": 850, + "time_per_iteration": 2.6687557697296143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012154, + "balance_loss_mlp": 1.16130245, + "epoch": 0.16371681415929204, + "flos": 542124804096.0, + "grad_norm": 0.057605035940766894, + "language_loss": 0.82393861, + "learning_rate": 0.0009538463325425665, + "loss": 0.83609259, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 0.54101562, + "step": 851, + "time_per_iteration": 2.751335382461548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199288, + "balance_loss_mlp": 1.1491015, + "epoch": 0.1639091958445556, + "flos": 520752291840.0, + "grad_norm": 0.06621147850271279, + "language_loss": 0.87526274, + "learning_rate": 0.0009537155102115728, + "loss": 0.88725561, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 0.50170898, + "step": 852, + "time_per_iteration": 2.568573474884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168884, + "balance_loss_mlp": 1.12236834, + "epoch": 0.16410157752981916, + "flos": 547414889472.0, + "grad_norm": 0.07419725806034035, + "language_loss": 0.85374665, + "learning_rate": 0.0009535845117325961, + "loss": 0.86543554, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 0.46533203, + "step": 853, + "time_per_iteration": 2.628973960876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137862, + "balance_loss_mlp": 1.09511375, + "epoch": 0.16429395921508272, + "flos": 582853791744.0, + "grad_norm": 0.05551255594321189, + "language_loss": 0.94495642, + "learning_rate": 0.0009534533371564946, + "loss": 0.95633507, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 0.42724609, + "step": 854, + "time_per_iteration": 2.780510902404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133546, + "balance_loss_mlp": 1.09003448, + "epoch": 0.16448634090034628, + "flos": 530934538752.0, + "grad_norm": 0.08632067881035285, + "language_loss": 0.90547508, + "learning_rate": 0.0009533219865341949, + "loss": 0.91681051, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 0.43530273, + "step": 855, + "time_per_iteration": 2.583874464035034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116377, + "balance_loss_mlp": 1.07188785, + "epoch": 0.16467872258560984, + "flos": 491890475520.0, + "grad_norm": 0.06082853882497287, + "language_loss": 0.88071746, + "learning_rate": 0.0009531904599166916, + "loss": 0.89188123, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 0.44482422, + "step": 856, + "time_per_iteration": 2.626354217529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107396, + "balance_loss_mlp": 1.06231081, + "epoch": 0.16487110427087343, + "flos": 506263385088.0, + "grad_norm": 0.0709999882269981, + "language_loss": 0.86807954, + "learning_rate": 0.0009530587573550478, + "loss": 0.87915355, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 0.45068359, + "step": 857, + "time_per_iteration": 2.5761454105377197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142125, + "balance_loss_mlp": 1.11237001, + "epoch": 0.16506348595613698, + "flos": 1432824712704.0, + "grad_norm": 0.04095057850479287, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75461513, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 0.296875, + "step": 858, + "time_per_iteration": 5.055138349533081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101038, + "balance_loss_mlp": 1.06165087, + "epoch": 0.16525586764140054, + "flos": 477129927168.0, + "grad_norm": 0.08838989258306214, + "language_loss": 0.91845137, + "learning_rate": 0.0009527948246039337, + "loss": 0.92946172, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 0.39379883, + "step": 859, + "time_per_iteration": 2.582608461380005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111338, + "balance_loss_mlp": 1.0715934, + "epoch": 0.1654482493266641, + "flos": 881096942592.0, + "grad_norm": 0.06489567580347368, + "language_loss": 0.89263308, + "learning_rate": 0.000952662594516931, + "loss": 0.90374649, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 0.39746094, + "step": 860, + "time_per_iteration": 3.067707061767578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110521, + "balance_loss_mlp": 1.07018054, + "epoch": 0.16564063101192766, + "flos": 626841773568.0, + "grad_norm": 0.055059247831062384, + "language_loss": 0.88479781, + "learning_rate": 0.0009525301886907234, + "loss": 0.89590299, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 0.40307617, + "step": 861, + "time_per_iteration": 2.8873865604400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112184, + "balance_loss_mlp": 1.07758975, + "epoch": 0.16583301269719122, + "flos": 561518355456.0, + "grad_norm": 0.06995538812096423, + "language_loss": 0.89499515, + "learning_rate": 0.0009523976071767155, + "loss": 0.90621358, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 0.44262695, + "step": 862, + "time_per_iteration": 2.6588613986968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124987, + "balance_loss_mlp": 1.08183372, + "epoch": 0.16602539438245478, + "flos": 567803976192.0, + "grad_norm": 0.06313062043432274, + "language_loss": 0.89038265, + "learning_rate": 0.00095226485002638, + "loss": 0.90163255, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 0.43115234, + "step": 863, + "time_per_iteration": 2.797896146774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113265, + "balance_loss_mlp": 1.07232881, + "epoch": 0.16621777606771834, + "flos": 574875532800.0, + "grad_norm": 0.054774526957085325, + "language_loss": 0.90381318, + "learning_rate": 0.0009521319172912576, + "loss": 0.91494584, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 0.40917969, + "step": 864, + "time_per_iteration": 2.7238612174987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126771, + "balance_loss_mlp": 1.08132839, + "epoch": 0.16641015775298193, + "flos": 514552932864.0, + "grad_norm": 0.05854649520245602, + "language_loss": 0.96491337, + "learning_rate": 0.0009519988090229579, + "loss": 0.97618109, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 0.4543457, + "step": 865, + "time_per_iteration": 2.683509111404419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123132, + "balance_loss_mlp": 1.07907248, + "epoch": 0.1666025394382455, + "flos": 621685310976.0, + "grad_norm": 0.05699467986566688, + "language_loss": 0.89545953, + "learning_rate": 0.0009518655252731576, + "loss": 0.90669084, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 0.44067383, + "step": 866, + "time_per_iteration": 2.729865550994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131126, + "balance_loss_mlp": 1.08456326, + "epoch": 0.16679492112350905, + "flos": 548808348672.0, + "grad_norm": 0.06482393342324422, + "language_loss": 0.9171015, + "learning_rate": 0.0009517320660936022, + "loss": 0.9284128, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 0.46557617, + "step": 867, + "time_per_iteration": 2.732815742492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133142, + "balance_loss_mlp": 1.08843839, + "epoch": 0.1669873028087726, + "flos": 665675864064.0, + "grad_norm": 0.06614373571764609, + "language_loss": 0.84472704, + "learning_rate": 0.0009515984315361051, + "loss": 0.85605848, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 0.44702148, + "step": 868, + "time_per_iteration": 2.796868085861206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121806, + "balance_loss_mlp": 1.07657838, + "epoch": 0.16717968449403617, + "flos": 538564432896.0, + "grad_norm": 0.08270078218547869, + "language_loss": 0.88773656, + "learning_rate": 0.000951464621652548, + "loss": 0.89895463, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 0.45239258, + "step": 869, + "time_per_iteration": 2.666438341140747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141117, + "balance_loss_mlp": 1.09751046, + "epoch": 0.16737206617929973, + "flos": 530121438720.0, + "grad_norm": 0.06072661062765564, + "language_loss": 0.80103016, + "learning_rate": 0.0009513306364948804, + "loss": 0.81244129, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 0.43579102, + "step": 870, + "time_per_iteration": 2.799009084701538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148868, + "balance_loss_mlp": 1.10373545, + "epoch": 0.1675644478645633, + "flos": 480774362112.0, + "grad_norm": 0.09261319168225486, + "language_loss": 0.90277344, + "learning_rate": 0.0009511964761151197, + "loss": 0.91426206, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 0.45117188, + "step": 871, + "time_per_iteration": 2.5934712886810303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158743, + "balance_loss_mlp": 1.1145407, + "epoch": 0.16775682954982685, + "flos": 494556627456.0, + "grad_norm": 0.06739805293344515, + "language_loss": 0.91524243, + "learning_rate": 0.0009510621405653521, + "loss": 0.92682987, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 0.44213867, + "step": 872, + "time_per_iteration": 2.5557620525360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156856, + "balance_loss_mlp": 1.11627746, + "epoch": 0.1679492112350904, + "flos": 752035912704.0, + "grad_norm": 0.06267535529199315, + "language_loss": 0.85553813, + "learning_rate": 0.0009509276298977309, + "loss": 0.86710668, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 0.40576172, + "step": 873, + "time_per_iteration": 2.9965007305145264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187981, + "balance_loss_mlp": 1.13760364, + "epoch": 0.168141592920354, + "flos": 1135875571200.0, + "grad_norm": 0.07409010972210926, + "language_loss": 0.82916558, + "learning_rate": 0.0009507929441644778, + "loss": 0.84104538, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 0.50415039, + "step": 874, + "time_per_iteration": 3.5573699474334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118449, + "balance_loss_mlp": 1.14097893, + "epoch": 0.16833397460561755, + "flos": 632401302528.0, + "grad_norm": 0.07388150752212762, + "language_loss": 0.8737148, + "learning_rate": 0.0009506580834178826, + "loss": 0.88555974, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 0.43530273, + "step": 875, + "time_per_iteration": 2.7659120559692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215839, + "balance_loss_mlp": 1.16841793, + "epoch": 0.1685263562908811, + "flos": 541445326848.0, + "grad_norm": 0.06935842584614806, + "language_loss": 0.92793226, + "learning_rate": 0.0009505230477103028, + "loss": 0.94009066, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 0.47436523, + "step": 876, + "time_per_iteration": 2.7306137084960938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226641, + "balance_loss_mlp": 1.18224776, + "epoch": 0.16871873797614467, + "flos": 619325678592.0, + "grad_norm": 0.10053146783154573, + "language_loss": 0.82997662, + "learning_rate": 0.0009503878370941641, + "loss": 0.84224302, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 0.44433594, + "step": 877, + "time_per_iteration": 2.7356183528900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211684, + "balance_loss_mlp": 1.16793382, + "epoch": 0.16891111966140823, + "flos": 606344030208.0, + "grad_norm": 0.10508781605450683, + "language_loss": 0.9020679, + "learning_rate": 0.0009502524516219595, + "loss": 0.91418481, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 0.43798828, + "step": 878, + "time_per_iteration": 2.7525370121002197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185028, + "balance_loss_mlp": 1.14232683, + "epoch": 0.1691035013466718, + "flos": 552326874624.0, + "grad_norm": 0.07887273759437702, + "language_loss": 0.91364408, + "learning_rate": 0.0009501168913462506, + "loss": 0.92549431, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 0.42724609, + "step": 879, + "time_per_iteration": 2.7009639739990234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115086, + "balance_loss_mlp": 1.11919844, + "epoch": 0.16929588303193535, + "flos": 1476294377472.0, + "grad_norm": 0.04902821320434346, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.80272782, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 0.31640625, + "step": 880, + "time_per_iteration": 4.812703609466553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116281, + "balance_loss_mlp": 1.11748707, + "epoch": 0.1694882647171989, + "flos": 926248587264.0, + "grad_norm": 0.06555145426806878, + "language_loss": 0.86756283, + "learning_rate": 0.0009498452465949042, + "loss": 0.87919092, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 0.453125, + "step": 881, + "time_per_iteration": 3.230407476425171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159747, + "balance_loss_mlp": 1.1133033, + "epoch": 0.1696806464024625, + "flos": 546093010944.0, + "grad_norm": 0.0753185527775994, + "language_loss": 0.92756218, + "learning_rate": 0.0009497091622247285, + "loss": 0.93915963, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 0.46459961, + "step": 882, + "time_per_iteration": 2.7412030696868896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141259, + "balance_loss_mlp": 1.09734213, + "epoch": 0.16987302808772606, + "flos": 529234560000.0, + "grad_norm": 0.07197762243887564, + "language_loss": 0.94941783, + "learning_rate": 0.0009495729032619723, + "loss": 0.96083045, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 0.43945312, + "step": 883, + "time_per_iteration": 2.6705245971679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141279, + "balance_loss_mlp": 1.09724283, + "epoch": 0.17006540977298962, + "flos": 755178909696.0, + "grad_norm": 0.07033792867334165, + "language_loss": 0.85310471, + "learning_rate": 0.0009494364697595354, + "loss": 0.86451751, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 0.44018555, + "step": 884, + "time_per_iteration": 2.9024457931518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115677, + "balance_loss_mlp": 1.10977769, + "epoch": 0.17025779145825318, + "flos": 558800446464.0, + "grad_norm": 0.0673266035955572, + "language_loss": 0.90739167, + "learning_rate": 0.0009492998617703867, + "loss": 0.91895938, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 0.47045898, + "step": 885, + "time_per_iteration": 2.6497459411621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151127, + "balance_loss_mlp": 1.10813999, + "epoch": 0.17045017314351674, + "flos": 512213124096.0, + "grad_norm": 0.0863252086663651, + "language_loss": 0.89101255, + "learning_rate": 0.0009491630793475619, + "loss": 0.90252388, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 0.42993164, + "step": 886, + "time_per_iteration": 2.6258063316345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159756, + "balance_loss_mlp": 1.11231089, + "epoch": 0.1706425548287803, + "flos": 508941646848.0, + "grad_norm": 0.0686214928272948, + "language_loss": 0.85993534, + "learning_rate": 0.0009490261225441643, + "loss": 0.87153292, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 0.47412109, + "step": 887, + "time_per_iteration": 2.9036519527435303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168173, + "balance_loss_mlp": 1.12370825, + "epoch": 0.17083493651404386, + "flos": 717355408896.0, + "grad_norm": 0.07914830411429463, + "language_loss": 0.91452426, + "learning_rate": 0.0009488889914133656, + "loss": 0.92620599, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 0.44458008, + "step": 888, + "time_per_iteration": 3.0038132667541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155268, + "balance_loss_mlp": 1.10706019, + "epoch": 0.17102731819930742, + "flos": 559121647104.0, + "grad_norm": 0.07300075385020723, + "language_loss": 0.90558064, + "learning_rate": 0.0009487516860084047, + "loss": 0.91713333, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 0.48193359, + "step": 889, + "time_per_iteration": 2.7158679962158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147984, + "balance_loss_mlp": 1.0996089, + "epoch": 0.17121969988457098, + "flos": 494786423808.0, + "grad_norm": 0.09172908653222724, + "language_loss": 0.90068781, + "learning_rate": 0.0009486142063825884, + "loss": 0.91216767, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 0.48364258, + "step": 890, + "time_per_iteration": 2.5330443382263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084927, + "balance_loss_mlp": 1.06175303, + "epoch": 0.17141208156983456, + "flos": 1548889413120.0, + "grad_norm": 0.031797672969882694, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.73511147, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 0.23144531, + "step": 891, + "time_per_iteration": 4.953175783157349 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167611, + "balance_loss_mlp": 1.11835372, + "epoch": 0.17160446325509812, + "flos": 619565386752.0, + "grad_norm": 0.06989736404119995, + "language_loss": 0.91231126, + "learning_rate": 0.0009483387246819542, + "loss": 0.92398739, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 0.49243164, + "step": 892, + "time_per_iteration": 2.7500009536743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010653, + "balance_loss_mlp": 1.0426023, + "epoch": 0.17179684494036168, + "flos": 1381758206976.0, + "grad_norm": 0.022698270048783192, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.83350885, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 0.2265625, + "step": 893, + "time_per_iteration": 4.662828683853149 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166562, + "balance_loss_mlp": 1.12312233, + "epoch": 0.17198922662562524, + "flos": 492636764160.0, + "grad_norm": 0.06047387129149895, + "language_loss": 0.90360647, + "learning_rate": 0.0009480625467392688, + "loss": 0.91527206, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 0.43481445, + "step": 894, + "time_per_iteration": 2.615447521209717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046079, + "balance_loss_mlp": 1.02433491, + "epoch": 0.1721816083108888, + "flos": 1458318878208.0, + "grad_norm": 0.017910617622931155, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79040754, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 0.21777344, + "step": 895, + "time_per_iteration": 4.802469968795776 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196327, + "balance_loss_mlp": 1.15264833, + "epoch": 0.17237398999615236, + "flos": 528122654208.0, + "grad_norm": 0.0591778940977726, + "language_loss": 0.88960874, + "learning_rate": 0.0009477856729834196, + "loss": 0.90157199, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 0.43652344, + "step": 896, + "time_per_iteration": 2.743036985397339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214543, + "balance_loss_mlp": 1.17217648, + "epoch": 0.17256637168141592, + "flos": 603920157696.0, + "grad_norm": 0.09709817551063968, + "language_loss": 0.91585428, + "learning_rate": 0.0009476469753098809, + "loss": 0.92799973, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 0.42358398, + "step": 897, + "time_per_iteration": 2.688457489013672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206917, + "balance_loss_mlp": 1.16080689, + "epoch": 0.17275875336667948, + "flos": 509687935488.0, + "grad_norm": 0.08785360527314089, + "language_loss": 0.87616539, + "learning_rate": 0.0009475081038443738, + "loss": 0.88823456, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 0.46118164, + "step": 898, + "time_per_iteration": 2.5958664417266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178905, + "balance_loss_mlp": 1.13436794, + "epoch": 0.17295113505194307, + "flos": 665260687872.0, + "grad_norm": 0.08099470404026293, + "language_loss": 0.87109447, + "learning_rate": 0.0009473690586408124, + "loss": 0.88288355, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 0.44482422, + "step": 899, + "time_per_iteration": 2.885279417037964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176953, + "balance_loss_mlp": 1.13184392, + "epoch": 0.17314351673720663, + "flos": 555385807872.0, + "grad_norm": 0.060075693842180825, + "language_loss": 0.87349975, + "learning_rate": 0.0009472298397531792, + "loss": 0.88526928, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 0.45141602, + "step": 900, + "time_per_iteration": 2.6987335681915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117213, + "balance_loss_mlp": 1.12244344, + "epoch": 0.17333589842247019, + "flos": 503609716224.0, + "grad_norm": 0.06597136758704356, + "language_loss": 0.87749296, + "learning_rate": 0.0009470904472355235, + "loss": 0.88921428, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 0.49707031, + "step": 901, + "time_per_iteration": 2.6920526027679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133522, + "balance_loss_mlp": 1.08898544, + "epoch": 0.17352828010773375, + "flos": 556208446464.0, + "grad_norm": 0.06929151708835651, + "language_loss": 0.8084361, + "learning_rate": 0.0009469508811419626, + "loss": 0.81977129, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 0.44555664, + "step": 902, + "time_per_iteration": 2.7087764739990234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037611, + "balance_loss_mlp": 1.01825094, + "epoch": 0.1737206617929973, + "flos": 1554525292032.0, + "grad_norm": 0.018918236495105482, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.7265144, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 0.19335938, + "step": 903, + "time_per_iteration": 4.831868648529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130376, + "balance_loss_mlp": 1.08429003, + "epoch": 0.17391304347826086, + "flos": 516662945280.0, + "grad_norm": 0.06904883588321564, + "language_loss": 0.84871197, + "learning_rate": 0.0009466712284439292, + "loss": 0.86001575, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 0.46118164, + "step": 904, + "time_per_iteration": 2.727154493331909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135856, + "balance_loss_mlp": 1.08867335, + "epoch": 0.17410542516352442, + "flos": 541049974272.0, + "grad_norm": 0.0797697294198037, + "language_loss": 0.90077758, + "learning_rate": 0.0009465311419480276, + "loss": 0.9121362, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 0.47216797, + "step": 905, + "time_per_iteration": 2.659696340560913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130539, + "balance_loss_mlp": 1.0859549, + "epoch": 0.17429780684878798, + "flos": 623849651712.0, + "grad_norm": 0.0780460064240459, + "language_loss": 0.89685637, + "learning_rate": 0.0009463908820933622, + "loss": 0.90816176, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 0.44604492, + "step": 906, + "time_per_iteration": 2.845508337020874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153159, + "balance_loss_mlp": 1.10657179, + "epoch": 0.17449018853405157, + "flos": 575663666688.0, + "grad_norm": 0.06621529993663824, + "language_loss": 0.83420271, + "learning_rate": 0.0009462504489343868, + "loss": 0.84573436, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 0.46582031, + "step": 907, + "time_per_iteration": 2.7415342330932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152101, + "balance_loss_mlp": 1.10246193, + "epoch": 0.17468257021931513, + "flos": 533753763840.0, + "grad_norm": 0.0823987818854668, + "language_loss": 0.9018122, + "learning_rate": 0.0009461098425256222, + "loss": 0.91333324, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 0.49633789, + "step": 908, + "time_per_iteration": 2.5904529094696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160396, + "balance_loss_mlp": 1.11457169, + "epoch": 0.1748749519045787, + "flos": 540758509056.0, + "grad_norm": 0.0762262609163865, + "language_loss": 0.87090451, + "learning_rate": 0.0009459690629216567, + "loss": 0.88250846, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 0.45874023, + "step": 909, + "time_per_iteration": 2.61710524559021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155202, + "balance_loss_mlp": 1.10921121, + "epoch": 0.17506733358984225, + "flos": 498623579136.0, + "grad_norm": 0.06657664395828655, + "language_loss": 0.88943893, + "learning_rate": 0.0009458281101771457, + "loss": 0.90099096, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 0.46020508, + "step": 910, + "time_per_iteration": 2.6421282291412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176316, + "balance_loss_mlp": 1.12810779, + "epoch": 0.1752597152751058, + "flos": 622923125760.0, + "grad_norm": 0.08799417436837091, + "language_loss": 0.8354404, + "learning_rate": 0.0009456869843468122, + "loss": 0.84720349, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 0.48217773, + "step": 911, + "time_per_iteration": 2.8633837699890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178078, + "balance_loss_mlp": 1.12688971, + "epoch": 0.17545209696036937, + "flos": 520972176384.0, + "grad_norm": 0.08410877580390771, + "language_loss": 0.79552639, + "learning_rate": 0.0009455456854854459, + "loss": 0.80730712, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 0.51220703, + "step": 912, + "time_per_iteration": 2.661038875579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180916, + "balance_loss_mlp": 1.13564038, + "epoch": 0.17564447864563293, + "flos": 461988707328.0, + "grad_norm": 0.17307911593328887, + "language_loss": 0.85480136, + "learning_rate": 0.0009454042136479039, + "loss": 0.86661053, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 0.45263672, + "step": 913, + "time_per_iteration": 2.561790943145752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198041, + "balance_loss_mlp": 1.15183568, + "epoch": 0.1758368603308965, + "flos": 480655793664.0, + "grad_norm": 0.06959724621682493, + "language_loss": 0.8438077, + "learning_rate": 0.0009452625688891103, + "loss": 0.85578811, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 0.4621582, + "step": 914, + "time_per_iteration": 2.5396227836608887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092507, + "balance_loss_mlp": 1.07600832, + "epoch": 0.17602924201616005, + "flos": 1478942903808.0, + "grad_norm": 0.034614734916794516, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79827243, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 0.16503906, + "step": 915, + "time_per_iteration": 4.550157308578491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264875, + "balance_loss_mlp": 1.21347213, + "epoch": 0.17622162370142364, + "flos": 602301671424.0, + "grad_norm": 0.08235911171958209, + "language_loss": 0.94223297, + "learning_rate": 0.0009449787608278015, + "loss": 0.95488179, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 0.51489258, + "step": 916, + "time_per_iteration": 2.8292665481567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243722, + "balance_loss_mlp": 1.19525158, + "epoch": 0.1764140053866872, + "flos": 442699043328.0, + "grad_norm": 0.08361954447634375, + "language_loss": 0.9338274, + "learning_rate": 0.0009448365976354704, + "loss": 0.94626462, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 0.48461914, + "step": 917, + "time_per_iteration": 2.543883800506592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216482, + "balance_loss_mlp": 1.16622329, + "epoch": 0.17660638707195075, + "flos": 500607682560.0, + "grad_norm": 0.08482517786251102, + "language_loss": 0.91736883, + "learning_rate": 0.0009446942617422558, + "loss": 0.9295336, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 0.50317383, + "step": 918, + "time_per_iteration": 2.6130669116973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118235, + "balance_loss_mlp": 1.13740778, + "epoch": 0.17679876875721431, + "flos": 538892974080.0, + "grad_norm": 0.07957198864097685, + "language_loss": 0.8648746, + "learning_rate": 0.0009445517532034176, + "loss": 0.87669808, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 0.44970703, + "step": 919, + "time_per_iteration": 2.7341010570526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116033, + "balance_loss_mlp": 1.11002386, + "epoch": 0.17699115044247787, + "flos": 497724217344.0, + "grad_norm": 0.08371374964142012, + "language_loss": 0.9020586, + "learning_rate": 0.0009444090720742824, + "loss": 0.9136619, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 0.50341797, + "step": 920, + "time_per_iteration": 2.628169298171997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158892, + "balance_loss_mlp": 1.1083951, + "epoch": 0.17718353212774143, + "flos": 662738070528.0, + "grad_norm": 0.07483188289837522, + "language_loss": 0.89025688, + "learning_rate": 0.0009442662184102439, + "loss": 0.90184581, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 0.50512695, + "step": 921, + "time_per_iteration": 2.7538435459136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154828, + "balance_loss_mlp": 1.11210358, + "epoch": 0.177375913813005, + "flos": 582641247744.0, + "grad_norm": 0.05276545299780942, + "language_loss": 0.88537991, + "learning_rate": 0.000944123192266763, + "loss": 0.89692819, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 0.42724609, + "step": 922, + "time_per_iteration": 2.788759469985962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190709, + "balance_loss_mlp": 1.13887644, + "epoch": 0.17756829549826855, + "flos": 552564011520.0, + "grad_norm": 0.07681776188261369, + "language_loss": 0.84657156, + "learning_rate": 0.0009439799936993671, + "loss": 0.85847867, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 0.51904297, + "step": 923, + "time_per_iteration": 2.7123734951019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196866, + "balance_loss_mlp": 1.14787149, + "epoch": 0.17776067718353214, + "flos": 556322245632.0, + "grad_norm": 0.09732559260361714, + "language_loss": 0.89131558, + "learning_rate": 0.0009438366227636511, + "loss": 0.90328419, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 0.49047852, + "step": 924, + "time_per_iteration": 2.6907341480255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171193, + "balance_loss_mlp": 1.12396216, + "epoch": 0.1779530588687957, + "flos": 658458574848.0, + "grad_norm": 0.07379366042998667, + "language_loss": 0.86971134, + "learning_rate": 0.0009436930795152763, + "loss": 0.88142323, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 0.47241211, + "step": 925, + "time_per_iteration": 2.865673065185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168215, + "balance_loss_mlp": 1.12174773, + "epoch": 0.17814544055405926, + "flos": 644483589120.0, + "grad_norm": 0.07469970420174622, + "language_loss": 0.8767308, + "learning_rate": 0.0009435493640099713, + "loss": 0.88841295, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 0.46411133, + "step": 926, + "time_per_iteration": 2.779188394546509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154616, + "balance_loss_mlp": 1.10388088, + "epoch": 0.17833782223932282, + "flos": 460913877504.0, + "grad_norm": 0.06972760602295516, + "language_loss": 0.85458124, + "learning_rate": 0.0009434054763035314, + "loss": 0.86612737, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 0.50756836, + "step": 927, + "time_per_iteration": 2.5972957611083984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147995, + "balance_loss_mlp": 1.09983397, + "epoch": 0.17853020392458638, + "flos": 759539897856.0, + "grad_norm": 0.05666425765353489, + "language_loss": 0.86302543, + "learning_rate": 0.0009432614164518185, + "loss": 0.8745054, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 0.48168945, + "step": 928, + "time_per_iteration": 3.0064406394958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150828, + "balance_loss_mlp": 1.09780383, + "epoch": 0.17872258560984994, + "flos": 782666717184.0, + "grad_norm": 0.07484249942420804, + "language_loss": 0.85464913, + "learning_rate": 0.000943117184510762, + "loss": 0.86615741, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 0.53027344, + "step": 929, + "time_per_iteration": 2.9855945110321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124448, + "balance_loss_mlp": 1.10556555, + "epoch": 0.1789149672951135, + "flos": 1459880464896.0, + "grad_norm": 0.03465095249088487, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.79914415, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 0.18847656, + "step": 930, + "time_per_iteration": 5.016055583953857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148447, + "balance_loss_mlp": 1.09997642, + "epoch": 0.17910734898037706, + "flos": 503864105472.0, + "grad_norm": 0.07304481613225793, + "language_loss": 0.89790976, + "learning_rate": 0.0009428282045846674, + "loss": 0.90939426, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 0.48461914, + "step": 931, + "time_per_iteration": 2.787473678588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134009, + "balance_loss_mlp": 1.08797026, + "epoch": 0.17929973066564064, + "flos": 746249158656.0, + "grad_norm": 0.05043968313129053, + "language_loss": 0.90432143, + "learning_rate": 0.0009426834567118214, + "loss": 0.91566151, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 0.46044922, + "step": 932, + "time_per_iteration": 3.1106340885162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149699, + "balance_loss_mlp": 1.10091829, + "epoch": 0.1794921123509042, + "flos": 713214305280.0, + "grad_norm": 0.0884624873286247, + "language_loss": 0.81563932, + "learning_rate": 0.0009425385369740155, + "loss": 0.82713628, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 0.48779297, + "step": 933, + "time_per_iteration": 3.056328296661377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164469, + "balance_loss_mlp": 1.1138767, + "epoch": 0.17968449403616776, + "flos": 633142448640.0, + "grad_norm": 0.0672899912264689, + "language_loss": 0.88411558, + "learning_rate": 0.0009423934454275125, + "loss": 0.8957603, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 0.50561523, + "step": 934, + "time_per_iteration": 2.827507495880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162381, + "balance_loss_mlp": 1.11333871, + "epoch": 0.17987687572143132, + "flos": 536323368960.0, + "grad_norm": 0.07880287247644589, + "language_loss": 0.92845738, + "learning_rate": 0.0009422481821286418, + "loss": 0.94008112, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 0.49072266, + "step": 935, + "time_per_iteration": 2.7188265323638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164456, + "balance_loss_mlp": 1.11918044, + "epoch": 0.18006925740669488, + "flos": 538077676032.0, + "grad_norm": 0.07978340192275198, + "language_loss": 0.88968349, + "learning_rate": 0.0009421027471337998, + "loss": 0.90132797, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 0.45239258, + "step": 936, + "time_per_iteration": 2.6140947341918945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176728, + "balance_loss_mlp": 1.1271131, + "epoch": 0.18026163909195844, + "flos": 539510782464.0, + "grad_norm": 0.07049523693926517, + "language_loss": 0.83782339, + "learning_rate": 0.0009419571404994493, + "loss": 0.84959066, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 0.49584961, + "step": 937, + "time_per_iteration": 2.641847610473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162302, + "balance_loss_mlp": 1.11354589, + "epoch": 0.180454020777222, + "flos": 500642187264.0, + "grad_norm": 0.06745021535989586, + "language_loss": 0.91665328, + "learning_rate": 0.00094181136228212, + "loss": 0.92827624, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 0.48803711, + "step": 938, + "time_per_iteration": 2.622314453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146811, + "balance_loss_mlp": 1.10334706, + "epoch": 0.18064640246248556, + "flos": 498952120320.0, + "grad_norm": 0.06209482952821168, + "language_loss": 0.87085009, + "learning_rate": 0.0009416654125384077, + "loss": 0.88231826, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 0.43432617, + "step": 939, + "time_per_iteration": 2.735565423965454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167753, + "balance_loss_mlp": 1.15230346, + "epoch": 0.18083878414774912, + "flos": 1519313988096.0, + "grad_norm": 0.039552666267989665, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.80940127, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 0.15429688, + "step": 940, + "time_per_iteration": 4.9464662075042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147706, + "balance_loss_mlp": 1.10293126, + "epoch": 0.1810311658330127, + "flos": 727337594880.0, + "grad_norm": 0.06405620484007693, + "language_loss": 0.85002685, + "learning_rate": 0.000941372998698552, + "loss": 0.86150396, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 0.44750977, + "step": 941, + "time_per_iteration": 2.9421255588531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152914, + "balance_loss_mlp": 1.10344219, + "epoch": 0.18122354751827627, + "flos": 564923082240.0, + "grad_norm": 0.07883971857950696, + "language_loss": 0.82437575, + "learning_rate": 0.0009412265347159336, + "loss": 0.8359049, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 0.49487305, + "step": 942, + "time_per_iteration": 2.727071762084961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135445, + "balance_loss_mlp": 1.09083664, + "epoch": 0.18141592920353983, + "flos": 519282109440.0, + "grad_norm": 0.10057326993772005, + "language_loss": 0.85614288, + "learning_rate": 0.0009410798994339829, + "loss": 0.86749732, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 0.44604492, + "step": 943, + "time_per_iteration": 2.6305696964263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134993, + "balance_loss_mlp": 1.09248304, + "epoch": 0.1816083108888034, + "flos": 512470084608.0, + "grad_norm": 0.05478952043416941, + "language_loss": 0.88907182, + "learning_rate": 0.000940933092909628, + "loss": 0.90042174, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 0.42529297, + "step": 944, + "time_per_iteration": 2.631101369857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149384, + "balance_loss_mlp": 1.10530019, + "epoch": 0.18180069257406695, + "flos": 492389715456.0, + "grad_norm": 0.06051663433249254, + "language_loss": 0.84961444, + "learning_rate": 0.0009407861151998649, + "loss": 0.8611083, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 0.44067383, + "step": 945, + "time_per_iteration": 2.5717978477478027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116837, + "balance_loss_mlp": 1.12040067, + "epoch": 0.1819930742593305, + "flos": 570158839296.0, + "grad_norm": 0.06666982795430461, + "language_loss": 0.87044382, + "learning_rate": 0.0009406389663617552, + "loss": 0.88212758, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 0.47998047, + "step": 946, + "time_per_iteration": 2.6768407821655273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170796, + "balance_loss_mlp": 1.12757087, + "epoch": 0.18218545594459407, + "flos": 605975841792.0, + "grad_norm": 0.0759743739596538, + "language_loss": 0.87192827, + "learning_rate": 0.000940491646452427, + "loss": 0.88363624, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 0.43212891, + "step": 947, + "time_per_iteration": 2.7174758911132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174804, + "balance_loss_mlp": 1.1271199, + "epoch": 0.18237783762985763, + "flos": 548682439680.0, + "grad_norm": 0.06285362616764655, + "language_loss": 0.91503757, + "learning_rate": 0.000940344155529075, + "loss": 0.92678559, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 0.47680664, + "step": 948, + "time_per_iteration": 2.6130924224853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175811, + "balance_loss_mlp": 1.12643504, + "epoch": 0.1825702193151212, + "flos": 450741542400.0, + "grad_norm": 0.07182633578445446, + "language_loss": 0.88395435, + "learning_rate": 0.0009401964936489605, + "loss": 0.89571244, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 0.4934082, + "step": 949, + "time_per_iteration": 2.518735885620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154077, + "balance_loss_mlp": 1.11173368, + "epoch": 0.18276260100038477, + "flos": 589245871104.0, + "grad_norm": 0.08616214546245322, + "language_loss": 0.86381257, + "learning_rate": 0.0009400486608694108, + "loss": 0.87535334, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 0.42358398, + "step": 950, + "time_per_iteration": 2.7356269359588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147825, + "balance_loss_mlp": 1.10071373, + "epoch": 0.18295498268564833, + "flos": 787331653632.0, + "grad_norm": 0.05684050086710682, + "language_loss": 0.88146299, + "learning_rate": 0.0009399006572478195, + "loss": 0.89294124, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 0.47119141, + "step": 951, + "time_per_iteration": 3.0829784870147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113898, + "balance_loss_mlp": 1.09449124, + "epoch": 0.1831473643709119, + "flos": 578147010048.0, + "grad_norm": 0.06809630737889293, + "language_loss": 0.91594249, + "learning_rate": 0.0009397524828416468, + "loss": 0.92733228, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 0.44482422, + "step": 952, + "time_per_iteration": 2.710500478744507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141316, + "balance_loss_mlp": 1.09339356, + "epoch": 0.18333974605617545, + "flos": 566889933312.0, + "grad_norm": 0.06814185159234107, + "language_loss": 0.97457635, + "learning_rate": 0.0009396041377084192, + "loss": 0.98598951, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 0.47949219, + "step": 953, + "time_per_iteration": 2.6530585289001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011339, + "balance_loss_mlp": 1.08716977, + "epoch": 0.183532127741439, + "flos": 526993496064.0, + "grad_norm": 0.06688505748067412, + "language_loss": 0.88496006, + "learning_rate": 0.0009394556219057295, + "loss": 0.896299, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 0.46704102, + "step": 954, + "time_per_iteration": 2.662543773651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135904, + "balance_loss_mlp": 1.08948374, + "epoch": 0.18372450942670257, + "flos": 594535956480.0, + "grad_norm": 0.08148035498798997, + "language_loss": 0.84775722, + "learning_rate": 0.0009393069354912362, + "loss": 0.85911626, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 0.46386719, + "step": 955, + "time_per_iteration": 2.7262632846832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139168, + "balance_loss_mlp": 1.0954181, + "epoch": 0.18391689111196613, + "flos": 645032014848.0, + "grad_norm": 0.07343823471440349, + "language_loss": 0.83466816, + "learning_rate": 0.0009391580785226649, + "loss": 0.8460598, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 0.43774414, + "step": 956, + "time_per_iteration": 2.8661141395568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066727, + "balance_loss_mlp": 1.04708123, + "epoch": 0.18410927279722972, + "flos": 1457073349632.0, + "grad_norm": 0.029557521366383285, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.80407178, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 0.19628906, + "step": 957, + "time_per_iteration": 4.751030921936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134727, + "balance_loss_mlp": 1.08978534, + "epoch": 0.18430165448249328, + "flos": 658750040064.0, + "grad_norm": 0.06490118531587029, + "language_loss": 0.87677503, + "learning_rate": 0.0009388598531545196, + "loss": 0.88812232, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 0.44946289, + "step": 958, + "time_per_iteration": 2.8378970623016357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143042, + "balance_loss_mlp": 1.09702718, + "epoch": 0.18449403616775684, + "flos": 517933066752.0, + "grad_norm": 0.07391212127287443, + "language_loss": 0.86896807, + "learning_rate": 0.000938710484870727, + "loss": 0.88039851, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 0.46044922, + "step": 959, + "time_per_iteration": 4.31168794631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128823, + "balance_loss_mlp": 1.08416748, + "epoch": 0.1846864178530204, + "flos": 552749391360.0, + "grad_norm": 0.0638837232249089, + "language_loss": 0.86957002, + "learning_rate": 0.0009385609462644189, + "loss": 0.88085824, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 0.44702148, + "step": 960, + "time_per_iteration": 2.6793572902679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118991, + "balance_loss_mlp": 1.07233214, + "epoch": 0.18487879953828396, + "flos": 466166886912.0, + "grad_norm": 0.07248975394705585, + "language_loss": 0.86711299, + "learning_rate": 0.0009384112373936514, + "loss": 0.87830293, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 0.46679688, + "step": 961, + "time_per_iteration": 2.6220860481262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119858, + "balance_loss_mlp": 1.07334304, + "epoch": 0.18507118122354752, + "flos": 648496212480.0, + "grad_norm": 0.06813544125014795, + "language_loss": 0.92053163, + "learning_rate": 0.0009382613583165467, + "loss": 0.93173021, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 0.46533203, + "step": 962, + "time_per_iteration": 2.8032093048095703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108588, + "balance_loss_mlp": 1.06142831, + "epoch": 0.18526356290881107, + "flos": 626772764160.0, + "grad_norm": 0.07296294799157402, + "language_loss": 0.9064188, + "learning_rate": 0.0009381113090912928, + "loss": 0.91750467, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 0.47167969, + "step": 963, + "time_per_iteration": 2.7358789443969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109929, + "balance_loss_mlp": 1.06741881, + "epoch": 0.18545594459407463, + "flos": 432726769152.0, + "grad_norm": 0.07962159601741099, + "language_loss": 0.90353996, + "learning_rate": 0.000937961089776144, + "loss": 0.91463923, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 0.42480469, + "step": 964, + "time_per_iteration": 2.5761237144470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128672, + "balance_loss_mlp": 1.07924736, + "epoch": 0.1856483262793382, + "flos": 749061043200.0, + "grad_norm": 0.09082243760489998, + "language_loss": 0.83673573, + "learning_rate": 0.0009378107004294208, + "loss": 0.84802246, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 0.49438477, + "step": 965, + "time_per_iteration": 2.9681291580200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132442, + "balance_loss_mlp": 1.08542585, + "epoch": 0.18584070796460178, + "flos": 530326642176.0, + "grad_norm": 0.08405098410424734, + "language_loss": 0.92054594, + "learning_rate": 0.0009376601411095096, + "loss": 0.93187034, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 0.4699707, + "step": 966, + "time_per_iteration": 2.696122407913208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138698, + "balance_loss_mlp": 1.09773731, + "epoch": 0.18603308964986534, + "flos": 483106830336.0, + "grad_norm": 0.07104128547690361, + "language_loss": 0.87554526, + "learning_rate": 0.0009375094118748622, + "loss": 0.88693225, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 0.40991211, + "step": 967, + "time_per_iteration": 2.6025850772857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179223, + "balance_loss_mlp": 1.13373268, + "epoch": 0.1862254713351289, + "flos": 801316551168.0, + "grad_norm": 0.0728928893981835, + "language_loss": 0.91626799, + "learning_rate": 0.0009373585127839976, + "loss": 0.92806023, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 0.45507812, + "step": 968, + "time_per_iteration": 2.9854021072387695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212732, + "balance_loss_mlp": 1.16905367, + "epoch": 0.18641785302039246, + "flos": 478323325440.0, + "grad_norm": 0.08777237711590531, + "language_loss": 0.91368866, + "learning_rate": 0.0009372074438954994, + "loss": 0.92581606, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 0.43652344, + "step": 969, + "time_per_iteration": 2.5014536380767822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211792, + "balance_loss_mlp": 1.16539574, + "epoch": 0.18661023470565602, + "flos": 388911684096.0, + "grad_norm": 0.0704882552763471, + "language_loss": 0.92436379, + "learning_rate": 0.0009370562052680181, + "loss": 0.93648171, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 0.46411133, + "step": 970, + "time_per_iteration": 2.453458070755005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120766, + "balance_loss_mlp": 1.16183591, + "epoch": 0.18680261639091958, + "flos": 564676033536.0, + "grad_norm": 0.07372597108689087, + "language_loss": 0.89988613, + "learning_rate": 0.0009369047969602695, + "loss": 0.91196281, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 0.45825195, + "step": 971, + "time_per_iteration": 2.703948497772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192702, + "balance_loss_mlp": 1.14396954, + "epoch": 0.18699499807618314, + "flos": 479259763200.0, + "grad_norm": 0.08557962606734577, + "language_loss": 0.8750906, + "learning_rate": 0.0009367532190310357, + "loss": 0.88701761, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 0.48657227, + "step": 972, + "time_per_iteration": 4.1564977169036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148896, + "balance_loss_mlp": 1.1052649, + "epoch": 0.1871873797614467, + "flos": 553283136000.0, + "grad_norm": 0.06811184838385763, + "language_loss": 0.89467651, + "learning_rate": 0.0009366014715391644, + "loss": 0.90616548, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 0.43603516, + "step": 973, + "time_per_iteration": 2.695730209350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134701, + "balance_loss_mlp": 1.09307301, + "epoch": 0.18737976144671029, + "flos": 552811060224.0, + "grad_norm": 0.054567817192194557, + "language_loss": 0.84347546, + "learning_rate": 0.0009364495545435693, + "loss": 0.85482252, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 0.41625977, + "step": 974, + "time_per_iteration": 2.828831672668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146218, + "balance_loss_mlp": 1.09970224, + "epoch": 0.18757214313197385, + "flos": 502250761728.0, + "grad_norm": 0.08256927623824414, + "language_loss": 0.89333141, + "learning_rate": 0.0009362974681032297, + "loss": 0.90479362, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 0.46484375, + "step": 975, + "time_per_iteration": 2.5982418060302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143654, + "balance_loss_mlp": 1.09909391, + "epoch": 0.1877645248172374, + "flos": 675010506240.0, + "grad_norm": 0.07754570301250979, + "language_loss": 0.89447427, + "learning_rate": 0.0009361452122771907, + "loss": 0.90591079, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 0.44555664, + "step": 976, + "time_per_iteration": 2.881242275238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133689, + "balance_loss_mlp": 1.08834195, + "epoch": 0.18795690650250096, + "flos": 404989341696.0, + "grad_norm": 0.0965092241218366, + "language_loss": 0.84541976, + "learning_rate": 0.0009359927871245635, + "loss": 0.85675669, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 0.45361328, + "step": 977, + "time_per_iteration": 2.4720265865325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113596, + "balance_loss_mlp": 1.09039843, + "epoch": 0.18814928818776452, + "flos": 637891448832.0, + "grad_norm": 0.09227923665031239, + "language_loss": 0.87538362, + "learning_rate": 0.0009358401927045246, + "loss": 0.88674331, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 0.45581055, + "step": 978, + "time_per_iteration": 2.8225297927856445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140746, + "balance_loss_mlp": 1.0945406, + "epoch": 0.18834166987302808, + "flos": 1138282191360.0, + "grad_norm": 0.05953389716062443, + "language_loss": 0.88990903, + "learning_rate": 0.0009356874290763166, + "loss": 0.90131652, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 0.46264648, + "step": 979, + "time_per_iteration": 3.4754927158355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140535, + "balance_loss_mlp": 1.09494936, + "epoch": 0.18853405155829164, + "flos": 504793202688.0, + "grad_norm": 0.06969100284100371, + "language_loss": 0.89955008, + "learning_rate": 0.0009355344962992474, + "loss": 0.91095543, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 0.45581055, + "step": 980, + "time_per_iteration": 2.6008429527282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138568, + "balance_loss_mlp": 1.09291101, + "epoch": 0.1887264332435552, + "flos": 608177258496.0, + "grad_norm": 0.07021551702573088, + "language_loss": 0.88888156, + "learning_rate": 0.0009353813944326908, + "loss": 0.90026724, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 0.45654297, + "step": 981, + "time_per_iteration": 2.9102253913879395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141833, + "balance_loss_mlp": 1.09352899, + "epoch": 0.1889188149288188, + "flos": 552529506816.0, + "grad_norm": 0.0640154196439605, + "language_loss": 0.83560127, + "learning_rate": 0.0009352281235360863, + "loss": 0.84701967, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 0.4831543, + "step": 982, + "time_per_iteration": 2.690695285797119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149234, + "balance_loss_mlp": 1.10627127, + "epoch": 0.18911119661408235, + "flos": 418559063040.0, + "grad_norm": 0.06254433649037737, + "language_loss": 0.85791624, + "learning_rate": 0.0009350746836689389, + "loss": 0.86940861, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 0.4296875, + "step": 983, + "time_per_iteration": 2.524491548538208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104727, + "balance_loss_mlp": 1.02905524, + "epoch": 0.1893035782993459, + "flos": 1481974299648.0, + "grad_norm": 0.024687708549402564, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82486492, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.18261719, + "step": 984, + "time_per_iteration": 5.200335741043091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156154, + "balance_loss_mlp": 1.1069684, + "epoch": 0.18949595998460947, + "flos": 508467373056.0, + "grad_norm": 0.08202626484000469, + "language_loss": 0.84151661, + "learning_rate": 0.0009347672972613634, + "loss": 0.85307819, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.49145508, + "step": 985, + "time_per_iteration": 2.6939473152160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011517, + "balance_loss_mlp": 1.10756862, + "epoch": 0.18968834166987303, + "flos": 531087611904.0, + "grad_norm": 0.061889675774481866, + "language_loss": 0.8651796, + "learning_rate": 0.0009346133508402735, + "loss": 0.87669659, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.44140625, + "step": 986, + "time_per_iteration": 2.695004463195801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146205, + "balance_loss_mlp": 1.1000948, + "epoch": 0.1898807233551366, + "flos": 499762649088.0, + "grad_norm": 0.07730871241699967, + "language_loss": 0.84821075, + "learning_rate": 0.0009344592356873166, + "loss": 0.85967278, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.46118164, + "step": 987, + "time_per_iteration": 2.635143518447876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143677, + "balance_loss_mlp": 1.0975666, + "epoch": 0.19007310504040015, + "flos": 602220178944.0, + "grad_norm": 0.058246004489727894, + "language_loss": 0.79289091, + "learning_rate": 0.0009343049518623255, + "loss": 0.80432773, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.46142578, + "step": 988, + "time_per_iteration": 2.7257165908813477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126709, + "balance_loss_mlp": 1.08503366, + "epoch": 0.1902654867256637, + "flos": 601651929600.0, + "grad_norm": 0.06464318177286693, + "language_loss": 0.83752143, + "learning_rate": 0.0009341504994251985, + "loss": 0.84878862, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.41674805, + "step": 989, + "time_per_iteration": 2.8336057662963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052089, + "balance_loss_mlp": 1.03692603, + "epoch": 0.19045786841092727, + "flos": 1575784005120.0, + "grad_norm": 0.01962059038868396, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74572587, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.15136719, + "step": 990, + "time_per_iteration": 4.980287551879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118455, + "balance_loss_mlp": 1.07682681, + "epoch": 0.19065025009619085, + "flos": 681634579968.0, + "grad_norm": 0.06360467015426281, + "language_loss": 0.82411575, + "learning_rate": 0.0009338410889544574, + "loss": 0.83530033, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 0.41601562, + "step": 991, + "time_per_iteration": 3.0192768573760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123102, + "balance_loss_mlp": 1.0790422, + "epoch": 0.1908426317814544, + "flos": 602264595456.0, + "grad_norm": 0.06107834506241764, + "language_loss": 0.88440853, + "learning_rate": 0.000933686131040967, + "loss": 0.89563954, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 0.44067383, + "step": 992, + "time_per_iteration": 2.795952796936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118187, + "balance_loss_mlp": 1.07479525, + "epoch": 0.19103501346671797, + "flos": 586308077568.0, + "grad_norm": 0.08075044213119366, + "language_loss": 0.91145802, + "learning_rate": 0.0009335310047555883, + "loss": 0.92263985, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 0.43383789, + "step": 993, + "time_per_iteration": 2.6966800689697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144036, + "balance_loss_mlp": 1.10052443, + "epoch": 0.19122739515198153, + "flos": 545761898496.0, + "grad_norm": 0.06789475617385991, + "language_loss": 0.89048505, + "learning_rate": 0.0009333757101585467, + "loss": 0.90192544, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 0.43554688, + "step": 994, + "time_per_iteration": 2.659120559692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159671, + "balance_loss_mlp": 1.11687493, + "epoch": 0.1914197768372451, + "flos": 521446450176.0, + "grad_norm": 0.05475551086737561, + "language_loss": 0.94071913, + "learning_rate": 0.0009332202473101329, + "loss": 0.95231587, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 0.42822266, + "step": 995, + "time_per_iteration": 2.672307014465332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153956, + "balance_loss_mlp": 1.11011088, + "epoch": 0.19161215852250865, + "flos": 611246103552.0, + "grad_norm": 0.060816834986447306, + "language_loss": 0.8370983, + "learning_rate": 0.0009330646162707028, + "loss": 0.84863788, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 0.4387207, + "step": 996, + "time_per_iteration": 2.7483248710632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155376, + "balance_loss_mlp": 1.11274719, + "epoch": 0.1918045402077722, + "flos": 846660916224.0, + "grad_norm": 0.05013127115514869, + "language_loss": 0.85195571, + "learning_rate": 0.0009329088171006779, + "loss": 0.86350954, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.42626953, + "step": 997, + "time_per_iteration": 3.1445202827453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163134, + "balance_loss_mlp": 1.1197654, + "epoch": 0.19199692189303577, + "flos": 465937090560.0, + "grad_norm": 0.07353815647154911, + "language_loss": 0.86074895, + "learning_rate": 0.0009327528498605446, + "loss": 0.87238026, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 0.43383789, + "step": 998, + "time_per_iteration": 2.536146402359009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159094, + "balance_loss_mlp": 1.11844337, + "epoch": 0.19218930357829936, + "flos": 531576940032.0, + "grad_norm": 0.06861677349241169, + "language_loss": 0.9080506, + "learning_rate": 0.0009325967146108548, + "loss": 0.91964149, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 0.40649414, + "step": 999, + "time_per_iteration": 2.634549617767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151834, + "balance_loss_mlp": 1.11049271, + "epoch": 0.19238168526356292, + "flos": 601624765440.0, + "grad_norm": 0.0672850368289366, + "language_loss": 0.88138115, + "learning_rate": 0.0009324404114122258, + "loss": 0.89289951, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 0.41357422, + "step": 1000, + "time_per_iteration": 2.677651882171631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164653, + "balance_loss_mlp": 1.12221444, + "epoch": 0.19257406694882648, + "flos": 571982155776.0, + "grad_norm": 0.06402741154285656, + "language_loss": 0.8710497, + "learning_rate": 0.0009322839403253397, + "loss": 0.88269627, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 0.42431641, + "step": 1001, + "time_per_iteration": 2.7528679370880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169088, + "balance_loss_mlp": 1.12440836, + "epoch": 0.19276644863409004, + "flos": 801813219840.0, + "grad_norm": 0.07104878229054386, + "language_loss": 0.84949791, + "learning_rate": 0.0009321273014109439, + "loss": 0.86118877, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.44702148, + "step": 1002, + "time_per_iteration": 2.9990484714508057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114889, + "balance_loss_mlp": 1.10523582, + "epoch": 0.1929588303193536, + "flos": 563314507776.0, + "grad_norm": 0.0673469195429183, + "language_loss": 0.85240018, + "learning_rate": 0.0009319704947298513, + "loss": 0.8638891, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.43676758, + "step": 1003, + "time_per_iteration": 2.8755459785461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141118, + "balance_loss_mlp": 1.10127831, + "epoch": 0.19315121200461716, + "flos": 626837004288.0, + "grad_norm": 0.0925310675323854, + "language_loss": 0.89122581, + "learning_rate": 0.0009318135203429393, + "loss": 0.902637, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.3984375, + "step": 1004, + "time_per_iteration": 2.771192789077759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127979, + "balance_loss_mlp": 1.0866611, + "epoch": 0.19334359368988072, + "flos": 517451079168.0, + "grad_norm": 0.05779097302789, + "language_loss": 0.88602638, + "learning_rate": 0.0009316563783111511, + "loss": 0.8973062, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.41308594, + "step": 1005, + "time_per_iteration": 2.7739861011505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113092, + "balance_loss_mlp": 1.08638334, + "epoch": 0.19353597537514428, + "flos": 694080285696.0, + "grad_norm": 0.06006842888316194, + "language_loss": 0.83199531, + "learning_rate": 0.0009314990686954943, + "loss": 0.84330451, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.44506836, + "step": 1006, + "time_per_iteration": 2.935081720352173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140843, + "balance_loss_mlp": 1.09561515, + "epoch": 0.19372835706040784, + "flos": 1210170585600.0, + "grad_norm": 0.0666735983489841, + "language_loss": 0.81657201, + "learning_rate": 0.000931341591557042, + "loss": 0.82798046, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.45263672, + "step": 1007, + "time_per_iteration": 3.7212610244750977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155404, + "balance_loss_mlp": 1.1041683, + "epoch": 0.19392073874567142, + "flos": 520631152128.0, + "grad_norm": 0.08115294197805281, + "language_loss": 0.87899536, + "learning_rate": 0.0009311839469569325, + "loss": 0.89054936, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.51171875, + "step": 1008, + "time_per_iteration": 2.6384472846984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150065, + "balance_loss_mlp": 1.10030699, + "epoch": 0.19411312043093498, + "flos": 588816013824.0, + "grad_norm": 0.07776470075981182, + "language_loss": 0.88065994, + "learning_rate": 0.0009310261349563687, + "loss": 0.89216053, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.49804688, + "step": 1009, + "time_per_iteration": 2.703058958053589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157544, + "balance_loss_mlp": 1.11160064, + "epoch": 0.19430550211619854, + "flos": 579382253568.0, + "grad_norm": 0.05519618089274153, + "language_loss": 0.86250293, + "learning_rate": 0.0009308681556166186, + "loss": 0.87407839, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.45922852, + "step": 1010, + "time_per_iteration": 2.8404791355133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177928, + "balance_loss_mlp": 1.12480855, + "epoch": 0.1944978838014621, + "flos": 621126973440.0, + "grad_norm": 0.10323239067467582, + "language_loss": 0.8870275, + "learning_rate": 0.0009307100089990152, + "loss": 0.89880681, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.53100586, + "step": 1011, + "time_per_iteration": 2.7103512287139893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185201, + "balance_loss_mlp": 1.13530004, + "epoch": 0.19469026548672566, + "flos": 598714136064.0, + "grad_norm": 0.08766026563197518, + "language_loss": 0.84582877, + "learning_rate": 0.0009305516951649568, + "loss": 0.8576808, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.49902344, + "step": 1012, + "time_per_iteration": 2.6905276775360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175674, + "balance_loss_mlp": 1.12818122, + "epoch": 0.19488264717198922, + "flos": 552161318400.0, + "grad_norm": 0.07259628373080033, + "language_loss": 0.87723738, + "learning_rate": 0.0009303932141759057, + "loss": 0.8889941, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.47485352, + "step": 1013, + "time_per_iteration": 2.7738490104675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161359, + "balance_loss_mlp": 1.11200666, + "epoch": 0.19507502885725278, + "flos": 666135456768.0, + "grad_norm": 0.07589756885314788, + "language_loss": 0.84698361, + "learning_rate": 0.0009302345660933902, + "loss": 0.85859716, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.49291992, + "step": 1014, + "time_per_iteration": 2.7809414863586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152692, + "balance_loss_mlp": 1.10579538, + "epoch": 0.19526741054251634, + "flos": 671081946624.0, + "grad_norm": 0.06636914889533592, + "language_loss": 0.85938931, + "learning_rate": 0.0009300757509790026, + "loss": 0.87091625, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.46875, + "step": 1015, + "time_per_iteration": 2.886200189590454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151123, + "balance_loss_mlp": 1.10324848, + "epoch": 0.19545979222777993, + "flos": 447215675904.0, + "grad_norm": 0.08384883211824797, + "language_loss": 0.91210115, + "learning_rate": 0.0009299167688944005, + "loss": 0.92361236, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.47827148, + "step": 1016, + "time_per_iteration": 2.5308799743652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135215, + "balance_loss_mlp": 1.09036839, + "epoch": 0.1956521739130435, + "flos": 569084009472.0, + "grad_norm": 0.07612639660839114, + "language_loss": 0.86733758, + "learning_rate": 0.0009297576199013063, + "loss": 0.87868977, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.44873047, + "step": 1017, + "time_per_iteration": 2.699352264404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156475, + "balance_loss_mlp": 1.14159799, + "epoch": 0.19584455559830705, + "flos": 1455749273088.0, + "grad_norm": 0.04987694814110311, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.74158609, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.1484375, + "step": 1018, + "time_per_iteration": 4.927512168884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099249, + "balance_loss_mlp": 1.08494341, + "epoch": 0.1960369372835706, + "flos": 1591150252032.0, + "grad_norm": 0.032347612483235935, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.80525547, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.14257812, + "step": 1019, + "time_per_iteration": 5.494646787643433 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128896, + "balance_loss_mlp": 1.08855522, + "epoch": 0.19622931896883417, + "flos": 616017125376.0, + "grad_norm": 0.06601293097738069, + "language_loss": 0.87223667, + "learning_rate": 0.0009292791720892659, + "loss": 0.88352561, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.40332031, + "step": 1020, + "time_per_iteration": 2.8718464374542236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133682, + "balance_loss_mlp": 1.08823943, + "epoch": 0.19642170065409773, + "flos": 466201391616.0, + "grad_norm": 0.07136038826441608, + "language_loss": 0.89387941, + "learning_rate": 0.0009291193560807218, + "loss": 0.90521628, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.45483398, + "step": 1021, + "time_per_iteration": 2.588604211807251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132851, + "balance_loss_mlp": 1.09141409, + "epoch": 0.19661408233936128, + "flos": 515289309696.0, + "grad_norm": 0.06738480994857221, + "language_loss": 0.87651652, + "learning_rate": 0.0009289593734732688, + "loss": 0.88784504, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.41430664, + "step": 1022, + "time_per_iteration": 2.5915818214416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129633, + "balance_loss_mlp": 1.09036541, + "epoch": 0.19680646402462484, + "flos": 392640182784.0, + "grad_norm": 0.06942729809827348, + "language_loss": 0.94984972, + "learning_rate": 0.0009287992243290175, + "loss": 0.96114612, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.39282227, + "step": 1023, + "time_per_iteration": 2.4477546215057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142342, + "balance_loss_mlp": 1.09880638, + "epoch": 0.19699884570988843, + "flos": 626421828096.0, + "grad_norm": 0.1017247644504036, + "language_loss": 0.91891634, + "learning_rate": 0.0009286389087101435, + "loss": 0.93033981, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.43554688, + "step": 1024, + "time_per_iteration": 2.765334129333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142412, + "balance_loss_mlp": 1.09942544, + "epoch": 0.197191227395152, + "flos": 557982577152.0, + "grad_norm": 0.07195718640229302, + "language_loss": 0.8893857, + "learning_rate": 0.0009284784266788864, + "loss": 0.90080982, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.42993164, + "step": 1025, + "time_per_iteration": 2.7323853969573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141995, + "balance_loss_mlp": 1.10327554, + "epoch": 0.19738360908041555, + "flos": 664993815552.0, + "grad_norm": 0.069193395974369, + "language_loss": 0.93259764, + "learning_rate": 0.0009283177782975512, + "loss": 0.94401753, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.38696289, + "step": 1026, + "time_per_iteration": 2.9729068279266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114551, + "balance_loss_mlp": 1.10142589, + "epoch": 0.1975759907656791, + "flos": 522496687104.0, + "grad_norm": 0.08755988500201482, + "language_loss": 0.88955659, + "learning_rate": 0.000928156963628507, + "loss": 0.90101171, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.44067383, + "step": 1027, + "time_per_iteration": 2.594200849533081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138947, + "balance_loss_mlp": 1.09855926, + "epoch": 0.19776837245094267, + "flos": 462482804736.0, + "grad_norm": 0.07316483198701504, + "language_loss": 0.89277303, + "learning_rate": 0.0009279959827341877, + "loss": 0.90416259, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.40405273, + "step": 1028, + "time_per_iteration": 2.7378368377685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140451, + "balance_loss_mlp": 1.09727335, + "epoch": 0.19796075413620623, + "flos": 503058719232.0, + "grad_norm": 0.059550544329949856, + "language_loss": 0.88526183, + "learning_rate": 0.0009278348356770915, + "loss": 0.89666629, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.43188477, + "step": 1029, + "time_per_iteration": 2.5737922191619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133825, + "balance_loss_mlp": 1.0914098, + "epoch": 0.1981531358214698, + "flos": 507538275840.0, + "grad_norm": 0.06393748023743129, + "language_loss": 0.8587814, + "learning_rate": 0.0009276735225197814, + "loss": 0.87011963, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.42431641, + "step": 1030, + "time_per_iteration": 2.648477077484131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146668, + "balance_loss_mlp": 1.10170269, + "epoch": 0.19834551750673335, + "flos": 531547204608.0, + "grad_norm": 0.06069855374703422, + "language_loss": 0.86812896, + "learning_rate": 0.0009275120433248847, + "loss": 0.87959564, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.44946289, + "step": 1031, + "time_per_iteration": 2.6862802505493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148453, + "balance_loss_mlp": 1.10327268, + "epoch": 0.1985378991919969, + "flos": 775511096832.0, + "grad_norm": 0.06482797348212818, + "language_loss": 0.87033594, + "learning_rate": 0.0009273503981550931, + "loss": 0.8818205, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.45166016, + "step": 1032, + "time_per_iteration": 3.0549416542053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157268, + "balance_loss_mlp": 1.11235023, + "epoch": 0.1987302808772605, + "flos": 434288355840.0, + "grad_norm": 0.07571303407420105, + "language_loss": 0.87661642, + "learning_rate": 0.0009271885870731626, + "loss": 0.88818914, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.44946289, + "step": 1033, + "time_per_iteration": 2.4938008785247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172373, + "balance_loss_mlp": 1.12495148, + "epoch": 0.19892266256252406, + "flos": 553604336640.0, + "grad_norm": 0.07801561202279184, + "language_loss": 0.89466584, + "learning_rate": 0.0009270266101419143, + "loss": 0.90638959, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.47460938, + "step": 1034, + "time_per_iteration": 2.61181378364563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169875, + "balance_loss_mlp": 1.12681675, + "epoch": 0.19911504424778761, + "flos": 549865926144.0, + "grad_norm": 0.07487269237991181, + "language_loss": 0.85762119, + "learning_rate": 0.0009268644674242328, + "loss": 0.86931992, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.43066406, + "step": 1035, + "time_per_iteration": 2.6761085987091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163027, + "balance_loss_mlp": 1.1147716, + "epoch": 0.19930742593305117, + "flos": 518281431552.0, + "grad_norm": 0.06997084642295975, + "language_loss": 0.81697071, + "learning_rate": 0.0009267021589830678, + "loss": 0.828601, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.4831543, + "step": 1036, + "time_per_iteration": 2.6166343688964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162737, + "balance_loss_mlp": 1.14547551, + "epoch": 0.19949980761831473, + "flos": 1509338769408.0, + "grad_norm": 0.04224955266067769, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.78789818, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 0.17285156, + "step": 1037, + "time_per_iteration": 4.932336330413818 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124804, + "balance_loss_mlp": 1.08224678, + "epoch": 0.1996921893035783, + "flos": 698129985024.0, + "grad_norm": 0.07370646472771722, + "language_loss": 0.9354341, + "learning_rate": 0.000926377045182406, + "loss": 0.94668216, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.42553711, + "step": 1038, + "time_per_iteration": 2.89486026763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122228, + "balance_loss_mlp": 1.07704759, + "epoch": 0.19988457098884185, + "flos": 727023734784.0, + "grad_norm": 0.06351485696264159, + "language_loss": 0.88915765, + "learning_rate": 0.0009262142399491296, + "loss": 0.9003799, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.4519043, + "step": 1039, + "time_per_iteration": 3.0843544006347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132553, + "balance_loss_mlp": 1.08784938, + "epoch": 0.2000769526741054, + "flos": 560544841728.0, + "grad_norm": 0.06429886269356283, + "language_loss": 0.89007306, + "learning_rate": 0.0009260512692448105, + "loss": 0.9013986, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.44677734, + "step": 1040, + "time_per_iteration": 2.7221181392669678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143871, + "balance_loss_mlp": 1.10071695, + "epoch": 0.200269334359369, + "flos": 572039055360.0, + "grad_norm": 0.0714265416650486, + "language_loss": 0.85044324, + "learning_rate": 0.000925888133132719, + "loss": 0.86188197, + "num_input_tokens_seen": 86289824, + "router_z_loss_mlp": 0.43164062, + "step": 1041, + "time_per_iteration": 2.7112865447998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113685, + "balance_loss_mlp": 1.09566069, + "epoch": 0.20046171604463256, + "flos": 1486118347776.0, + "grad_norm": 0.0301437897992815, + "language_loss": 0.79610431, + "learning_rate": 0.0009257248316761906, + "loss": 0.8072412, + "num_input_tokens_seen": 86516384, + "router_z_loss_mlp": 0.18066406, + "step": 1042, + "time_per_iteration": 4.913869380950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179762, + "balance_loss_mlp": 1.13338971, + "epoch": 0.20065409772989612, + "flos": 496528247808.0, + "grad_norm": 0.11345429965909062, + "language_loss": 0.82242954, + "learning_rate": 0.0009255613649386244, + "loss": 0.83422714, + "num_input_tokens_seen": 86587296, + "router_z_loss_mlp": 0.46337891, + "step": 1043, + "time_per_iteration": 2.6586339473724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153581, + "balance_loss_mlp": 1.11133325, + "epoch": 0.20084647941515968, + "flos": 579367572480.0, + "grad_norm": 0.07362734504976313, + "language_loss": 0.79954398, + "learning_rate": 0.0009253977329834838, + "loss": 0.81107974, + "num_input_tokens_seen": 86662656, + "router_z_loss_mlp": 0.42236328, + "step": 1044, + "time_per_iteration": 2.7028462886810303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143902, + "balance_loss_mlp": 1.0951457, + "epoch": 0.20103886110042324, + "flos": 642076968960.0, + "grad_norm": 0.07842723007783056, + "language_loss": 0.8753317, + "learning_rate": 0.0009252339358742965, + "loss": 0.88677073, + "num_input_tokens_seen": 86734704, + "router_z_loss_mlp": 0.48779297, + "step": 1045, + "time_per_iteration": 2.8069612979888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139165, + "balance_loss_mlp": 1.0902648, + "epoch": 0.2012312427856868, + "flos": 441970007040.0, + "grad_norm": 0.07197327624603128, + "language_loss": 0.84128577, + "learning_rate": 0.000925069973674654, + "loss": 0.85267735, + "num_input_tokens_seen": 86806512, + "router_z_loss_mlp": 0.48925781, + "step": 1046, + "time_per_iteration": 2.603602409362793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136955, + "balance_loss_mlp": 1.09303868, + "epoch": 0.20142362447095036, + "flos": 554402382336.0, + "grad_norm": 0.06199919012721526, + "language_loss": 0.89849102, + "learning_rate": 0.000924905846448212, + "loss": 0.90986055, + "num_input_tokens_seen": 86883440, + "router_z_loss_mlp": 0.43896484, + "step": 1047, + "time_per_iteration": 2.733009099960327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166193, + "balance_loss_mlp": 1.11726964, + "epoch": 0.20161600615621392, + "flos": 670301153280.0, + "grad_norm": 0.08010189097684783, + "language_loss": 0.86224002, + "learning_rate": 0.0009247415542586906, + "loss": 0.87390196, + "num_input_tokens_seen": 86960208, + "router_z_loss_mlp": 0.48950195, + "step": 1048, + "time_per_iteration": 2.8471555709838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186895, + "balance_loss_mlp": 1.13675559, + "epoch": 0.2018083878414775, + "flos": 573091490304.0, + "grad_norm": 0.050762349186412876, + "language_loss": 0.83535373, + "learning_rate": 0.0009245770971698735, + "loss": 0.84722269, + "num_input_tokens_seen": 87044144, + "router_z_loss_mlp": 0.50170898, + "step": 1049, + "time_per_iteration": 2.889474630355835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183513, + "balance_loss_mlp": 1.13671136, + "epoch": 0.20200076952674106, + "flos": 425857844736.0, + "grad_norm": 0.07506320746734087, + "language_loss": 0.8918047, + "learning_rate": 0.0009244124752456087, + "loss": 0.90363979, + "num_input_tokens_seen": 87109136, + "router_z_loss_mlp": 0.46826172, + "step": 1050, + "time_per_iteration": 2.5762786865234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205722, + "balance_loss_mlp": 1.15453339, + "epoch": 0.20219315121200462, + "flos": 536597581824.0, + "grad_norm": 0.08917577036116058, + "language_loss": 0.86475039, + "learning_rate": 0.0009242476885498081, + "loss": 0.87680757, + "num_input_tokens_seen": 87184320, + "router_z_loss_mlp": 0.51220703, + "step": 1051, + "time_per_iteration": 2.720395565032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193502, + "balance_loss_mlp": 1.14009643, + "epoch": 0.20238553289726818, + "flos": 477873644544.0, + "grad_norm": 0.08090891256677915, + "language_loss": 0.81871718, + "learning_rate": 0.0009240827371464474, + "loss": 0.83065224, + "num_input_tokens_seen": 87248224, + "router_z_loss_mlp": 0.53442383, + "step": 1052, + "time_per_iteration": 2.535388231277466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162702, + "balance_loss_mlp": 1.11833251, + "epoch": 0.20257791458253174, + "flos": 1152057116160.0, + "grad_norm": 0.08177732735855556, + "language_loss": 0.84886205, + "learning_rate": 0.0009239176210995666, + "loss": 0.86048913, + "num_input_tokens_seen": 87333088, + "router_z_loss_mlp": 0.4440918, + "step": 1053, + "time_per_iteration": 3.4955379962921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148392, + "balance_loss_mlp": 1.0973227, + "epoch": 0.2027702962677953, + "flos": 666913678848.0, + "grad_norm": 0.9822109545682867, + "language_loss": 0.94933617, + "learning_rate": 0.0009237523404732695, + "loss": 0.96082008, + "num_input_tokens_seen": 87413840, + "router_z_loss_mlp": 0.51074219, + "step": 1054, + "time_per_iteration": 2.90132737159729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137464, + "balance_loss_mlp": 1.09118664, + "epoch": 0.20296267795305886, + "flos": 641298746880.0, + "grad_norm": 0.09331279688006895, + "language_loss": 0.85504258, + "learning_rate": 0.0009235868953317235, + "loss": 0.86641729, + "num_input_tokens_seen": 87487168, + "router_z_loss_mlp": 0.46264648, + "step": 1055, + "time_per_iteration": 2.813202381134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212355, + "balance_loss_mlp": 1.16388512, + "epoch": 0.20315505963832242, + "flos": 930575070720.0, + "grad_norm": 0.08645469446577787, + "language_loss": 0.86679947, + "learning_rate": 0.0009234212857391602, + "loss": 0.87892294, + "num_input_tokens_seen": 87573184, + "router_z_loss_mlp": 0.48486328, + "step": 1056, + "time_per_iteration": 3.184723377227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01289494, + "balance_loss_mlp": 1.23723245, + "epoch": 0.20334744132358598, + "flos": 562111197696.0, + "grad_norm": 0.11402704661401492, + "language_loss": 0.90548229, + "learning_rate": 0.000923255511759875, + "loss": 0.91837716, + "num_input_tokens_seen": 87651968, + "router_z_loss_mlp": 0.52319336, + "step": 1057, + "time_per_iteration": 2.8404476642608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01374128, + "balance_loss_mlp": 1.3215096, + "epoch": 0.20353982300884957, + "flos": 644206804992.0, + "grad_norm": 0.12448379126392096, + "language_loss": 0.86306804, + "learning_rate": 0.000923089573458227, + "loss": 0.87680936, + "num_input_tokens_seen": 87727792, + "router_z_loss_mlp": 0.52661133, + "step": 1058, + "time_per_iteration": 2.921942949295044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01411943, + "balance_loss_mlp": 1.35701096, + "epoch": 0.20373220469411313, + "flos": 651421522944.0, + "grad_norm": 0.12614323996078466, + "language_loss": 0.84856015, + "learning_rate": 0.0009229234708986392, + "loss": 0.8626796, + "num_input_tokens_seen": 87806048, + "router_z_loss_mlp": 0.54931641, + "step": 1059, + "time_per_iteration": 2.922795057296753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01629047, + "balance_loss_mlp": 1.60253465, + "epoch": 0.2039245863793767, + "flos": 1437628787712.0, + "grad_norm": 0.12493252943786969, + "language_loss": 0.81666899, + "learning_rate": 0.0009227572041455982, + "loss": 0.83295941, + "num_input_tokens_seen": 88018160, + "router_z_loss_mlp": 0.265625, + "step": 1060, + "time_per_iteration": 4.733684062957764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01333622, + "balance_loss_mlp": 1.27976346, + "epoch": 0.20411696806464025, + "flos": 596967169536.0, + "grad_norm": 0.0936460184690869, + "language_loss": 0.86563337, + "learning_rate": 0.0009225907732636548, + "loss": 0.87896961, + "num_input_tokens_seen": 88090864, + "router_z_loss_mlp": 0.53881836, + "step": 1061, + "time_per_iteration": 2.761353015899658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01296883, + "balance_loss_mlp": 1.24183202, + "epoch": 0.2043093497499038, + "flos": 573803274240.0, + "grad_norm": 0.09002543594031559, + "language_loss": 0.87698424, + "learning_rate": 0.0009224241783174227, + "loss": 0.88995302, + "num_input_tokens_seen": 88161360, + "router_z_loss_mlp": 0.55078125, + "step": 1062, + "time_per_iteration": 2.7161052227020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252808, + "balance_loss_mlp": 1.19947362, + "epoch": 0.20450173143516737, + "flos": 630352958976.0, + "grad_norm": 0.08928798499879465, + "language_loss": 0.87254798, + "learning_rate": 0.0009222574193715802, + "loss": 0.88507611, + "num_input_tokens_seen": 88234960, + "router_z_loss_mlp": 0.53369141, + "step": 1063, + "time_per_iteration": 2.779623031616211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122391, + "balance_loss_mlp": 1.16757131, + "epoch": 0.20469411312043093, + "flos": 574003335168.0, + "grad_norm": 0.06606001070927259, + "language_loss": 0.87212694, + "learning_rate": 0.000922090496490869, + "loss": 0.88436604, + "num_input_tokens_seen": 88308176, + "router_z_loss_mlp": 0.56323242, + "step": 1064, + "time_per_iteration": 2.7111196517944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217897, + "balance_loss_mlp": 1.16120076, + "epoch": 0.20488649480569449, + "flos": 637053755904.0, + "grad_norm": 0.3109146854617931, + "language_loss": 0.90918952, + "learning_rate": 0.0009219234097400937, + "loss": 0.92136848, + "num_input_tokens_seen": 88386768, + "router_z_loss_mlp": 0.56665039, + "step": 1065, + "time_per_iteration": 2.804588556289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245438, + "balance_loss_mlp": 1.18359244, + "epoch": 0.20507887649095807, + "flos": 975793526784.0, + "grad_norm": 0.06908392980849179, + "language_loss": 0.84456235, + "learning_rate": 0.0009217561591841237, + "loss": 0.85701674, + "num_input_tokens_seen": 88476576, + "router_z_loss_mlp": 0.61816406, + "step": 1066, + "time_per_iteration": 3.303875207901001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01287048, + "balance_loss_mlp": 1.21867001, + "epoch": 0.20527125817622163, + "flos": 486183015936.0, + "grad_norm": 0.1162597514909173, + "language_loss": 0.82140827, + "learning_rate": 0.0009215887448878913, + "loss": 0.83427876, + "num_input_tokens_seen": 88541968, + "router_z_loss_mlp": 0.68408203, + "step": 1067, + "time_per_iteration": 2.568690776824951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01293452, + "balance_loss_mlp": 1.22288036, + "epoch": 0.2054636398614852, + "flos": 527178875904.0, + "grad_norm": 0.08586469474305494, + "language_loss": 0.85986763, + "learning_rate": 0.0009214211669163922, + "loss": 0.87280214, + "num_input_tokens_seen": 88615296, + "router_z_loss_mlp": 0.70654297, + "step": 1068, + "time_per_iteration": 2.700090169906616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01279646, + "balance_loss_mlp": 1.21408105, + "epoch": 0.20565602154674875, + "flos": 558182638080.0, + "grad_norm": 0.06609725061841937, + "language_loss": 0.94520444, + "learning_rate": 0.0009212534253346862, + "loss": 0.95800096, + "num_input_tokens_seen": 88691584, + "router_z_loss_mlp": 0.65478516, + "step": 1069, + "time_per_iteration": 2.696699857711792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01285979, + "balance_loss_mlp": 1.21912634, + "epoch": 0.2058484032320123, + "flos": 504224953344.0, + "grad_norm": 0.07442061186670905, + "language_loss": 0.85475862, + "learning_rate": 0.0009210855202078964, + "loss": 0.86761844, + "num_input_tokens_seen": 88756592, + "router_z_loss_mlp": 0.66845703, + "step": 1070, + "time_per_iteration": 2.5769481658935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01284239, + "balance_loss_mlp": 1.21771979, + "epoch": 0.20604078491727587, + "flos": 433169109504.0, + "grad_norm": 0.07631989099853977, + "language_loss": 0.88063252, + "learning_rate": 0.0009209174516012091, + "loss": 0.89347488, + "num_input_tokens_seen": 88820928, + "router_z_loss_mlp": 0.66601562, + "step": 1071, + "time_per_iteration": 2.6154239177703857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261362, + "balance_loss_mlp": 1.19317448, + "epoch": 0.20623316660253943, + "flos": 608711003136.0, + "grad_norm": 0.05883273983798781, + "language_loss": 0.90461957, + "learning_rate": 0.0009207492195798747, + "loss": 0.91723317, + "num_input_tokens_seen": 88895440, + "router_z_loss_mlp": 0.68164062, + "step": 1072, + "time_per_iteration": 2.764965534210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261739, + "balance_loss_mlp": 1.18997467, + "epoch": 0.206425548287803, + "flos": 480425997312.0, + "grad_norm": 0.07316980575900926, + "language_loss": 0.86156094, + "learning_rate": 0.0009205808242092061, + "loss": 0.87417829, + "num_input_tokens_seen": 88964400, + "router_z_loss_mlp": 0.71728516, + "step": 1073, + "time_per_iteration": 2.6222856044769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258718, + "balance_loss_mlp": 1.18952858, + "epoch": 0.20661792997306658, + "flos": 949429734912.0, + "grad_norm": 0.06600331144021966, + "language_loss": 0.83598334, + "learning_rate": 0.0009204122655545808, + "loss": 0.84857053, + "num_input_tokens_seen": 89049600, + "router_z_loss_mlp": 0.69189453, + "step": 1074, + "time_per_iteration": 3.313964605331421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252953, + "balance_loss_mlp": 1.18571925, + "epoch": 0.20681031165833014, + "flos": 603487729152.0, + "grad_norm": 0.06834339296378739, + "language_loss": 0.82186073, + "learning_rate": 0.0009202435436814388, + "loss": 0.83439028, + "num_input_tokens_seen": 89119024, + "router_z_loss_mlp": 0.67236328, + "step": 1075, + "time_per_iteration": 2.68725848197937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260409, + "balance_loss_mlp": 1.1926024, + "epoch": 0.2070026933435937, + "flos": 708984368640.0, + "grad_norm": 0.07476886245144747, + "language_loss": 0.91110998, + "learning_rate": 0.0009200746586552836, + "loss": 0.92371404, + "num_input_tokens_seen": 89197344, + "router_z_loss_mlp": 0.67773438, + "step": 1076, + "time_per_iteration": 2.889910936355591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238308, + "balance_loss_mlp": 1.17145491, + "epoch": 0.20719507502885726, + "flos": 829814948352.0, + "grad_norm": 0.06855298516082668, + "language_loss": 0.84957182, + "learning_rate": 0.0009199056105416825, + "loss": 0.86195493, + "num_input_tokens_seen": 89280464, + "router_z_loss_mlp": 0.66894531, + "step": 1077, + "time_per_iteration": 3.0826096534729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242457, + "balance_loss_mlp": 1.17312455, + "epoch": 0.20738745671412082, + "flos": 638294141952.0, + "grad_norm": 0.0732932371665923, + "language_loss": 0.87494361, + "learning_rate": 0.0009197363994062654, + "loss": 0.8873682, + "num_input_tokens_seen": 89353344, + "router_z_loss_mlp": 0.69287109, + "step": 1078, + "time_per_iteration": 2.814481735229492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121373, + "balance_loss_mlp": 1.15455508, + "epoch": 0.20757983839938438, + "flos": 685602786816.0, + "grad_norm": 0.060498447021287705, + "language_loss": 0.85097158, + "learning_rate": 0.0009195670253147262, + "loss": 0.86310887, + "num_input_tokens_seen": 89439328, + "router_z_loss_mlp": 0.59179688, + "step": 1079, + "time_per_iteration": 2.989818572998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216427, + "balance_loss_mlp": 1.15286458, + "epoch": 0.20777222008464794, + "flos": 519282109440.0, + "grad_norm": 0.0563328194871683, + "language_loss": 0.83052152, + "learning_rate": 0.0009193974883328216, + "loss": 0.84268576, + "num_input_tokens_seen": 89510160, + "router_z_loss_mlp": 0.63574219, + "step": 1080, + "time_per_iteration": 2.611929416656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209864, + "balance_loss_mlp": 1.14553857, + "epoch": 0.2079646017699115, + "flos": 511402595328.0, + "grad_norm": 0.06150097183917509, + "language_loss": 0.87932825, + "learning_rate": 0.0009192277885263718, + "loss": 0.89142686, + "num_input_tokens_seen": 89582960, + "router_z_loss_mlp": 0.64306641, + "step": 1081, + "time_per_iteration": 2.65731143951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198356, + "balance_loss_mlp": 1.13264751, + "epoch": 0.20815698345517505, + "flos": 931820226048.0, + "grad_norm": 0.05302154537588453, + "language_loss": 0.86579674, + "learning_rate": 0.0009190579259612602, + "loss": 0.87778032, + "num_input_tokens_seen": 89675488, + "router_z_loss_mlp": 0.65722656, + "step": 1082, + "time_per_iteration": 3.2999303340911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207666, + "balance_loss_mlp": 1.14300656, + "epoch": 0.20834936514043864, + "flos": 632401302528.0, + "grad_norm": 0.07988409962843289, + "language_loss": 0.87673134, + "learning_rate": 0.000918887900703433, + "loss": 0.88880801, + "num_input_tokens_seen": 89747872, + "router_z_loss_mlp": 0.64648438, + "step": 1083, + "time_per_iteration": 2.7956981658935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204411, + "balance_loss_mlp": 1.14361465, + "epoch": 0.2085417468257022, + "flos": 394384578048.0, + "grad_norm": 0.07357181622228276, + "language_loss": 0.91242653, + "learning_rate": 0.0009187177128188999, + "loss": 0.92447066, + "num_input_tokens_seen": 89810176, + "router_z_loss_mlp": 0.60693359, + "step": 1084, + "time_per_iteration": 2.4656450748443604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194774, + "balance_loss_mlp": 1.16902518, + "epoch": 0.20873412851096576, + "flos": 1402147293696.0, + "grad_norm": 0.038082499218869, + "language_loss": 0.77156538, + "learning_rate": 0.0009185473623737339, + "loss": 0.78351313, + "num_input_tokens_seen": 90038432, + "router_z_loss_mlp": 0.2578125, + "step": 1085, + "time_per_iteration": 4.855400323867798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181967, + "balance_loss_mlp": 1.12419796, + "epoch": 0.20892651019622932, + "flos": 447830913024.0, + "grad_norm": 0.07376491342946172, + "language_loss": 0.86747313, + "learning_rate": 0.000918376849434071, + "loss": 0.87929279, + "num_input_tokens_seen": 90101568, + "router_z_loss_mlp": 0.57739258, + "step": 1086, + "time_per_iteration": 2.5493998527526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192292, + "balance_loss_mlp": 1.1305418, + "epoch": 0.20911889188149288, + "flos": 493106268672.0, + "grad_norm": 0.07728027722551846, + "language_loss": 0.9155581, + "learning_rate": 0.0009182061740661098, + "loss": 0.92748106, + "num_input_tokens_seen": 90169344, + "router_z_loss_mlp": 0.61767578, + "step": 1087, + "time_per_iteration": 2.5755503177642822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192286, + "balance_loss_mlp": 1.13144195, + "epoch": 0.20931127356675644, + "flos": 841291909632.0, + "grad_norm": 0.057753656338862314, + "language_loss": 0.85712528, + "learning_rate": 0.0009180353363361127, + "loss": 0.86904812, + "num_input_tokens_seen": 90252416, + "router_z_loss_mlp": 0.60888672, + "step": 1088, + "time_per_iteration": 3.1143646240234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180296, + "balance_loss_mlp": 1.11868906, + "epoch": 0.20950365525202, + "flos": 757140618240.0, + "grad_norm": 0.07221088423930573, + "language_loss": 0.83469599, + "learning_rate": 0.0009178643363104044, + "loss": 0.84649897, + "num_input_tokens_seen": 90337952, + "router_z_loss_mlp": 0.61621094, + "step": 1089, + "time_per_iteration": 3.092656135559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199477, + "balance_loss_mlp": 1.138394, + "epoch": 0.20969603693728356, + "flos": 472539142656.0, + "grad_norm": 0.08745424257973078, + "language_loss": 0.92463166, + "learning_rate": 0.0009176931740553735, + "loss": 0.93662637, + "num_input_tokens_seen": 90401488, + "router_z_loss_mlp": 0.61083984, + "step": 1090, + "time_per_iteration": 2.53558349609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207875, + "balance_loss_mlp": 1.14850855, + "epoch": 0.20988841862254715, + "flos": 976930025472.0, + "grad_norm": 0.07295134358518522, + "language_loss": 0.83623219, + "learning_rate": 0.0009175218496374708, + "loss": 0.84831095, + "num_input_tokens_seen": 90486144, + "router_z_loss_mlp": 0.59277344, + "step": 1091, + "time_per_iteration": 3.3514459133148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226261, + "balance_loss_mlp": 1.16503549, + "epoch": 0.2100808003078107, + "flos": 1093120634880.0, + "grad_norm": 0.0645587086921242, + "language_loss": 0.86590576, + "learning_rate": 0.0009173503631232103, + "loss": 0.87816834, + "num_input_tokens_seen": 90571504, + "router_z_loss_mlp": 0.61181641, + "step": 1092, + "time_per_iteration": 3.3893167972564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122226, + "balance_loss_mlp": 1.16194034, + "epoch": 0.21027318199307427, + "flos": 1012964714496.0, + "grad_norm": 0.12026645314545058, + "language_loss": 0.8245008, + "learning_rate": 0.0009171787145791691, + "loss": 0.83672333, + "num_input_tokens_seen": 90646016, + "router_z_loss_mlp": 0.60351562, + "step": 1093, + "time_per_iteration": 3.251084327697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251584, + "balance_loss_mlp": 1.18854666, + "epoch": 0.21046556367833782, + "flos": 521394693120.0, + "grad_norm": 0.08481501206118727, + "language_loss": 0.8143028, + "learning_rate": 0.000917006904071987, + "loss": 0.82681859, + "num_input_tokens_seen": 90713440, + "router_z_loss_mlp": 0.63037109, + "step": 1094, + "time_per_iteration": 2.613060712814331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272512, + "balance_loss_mlp": 1.20551634, + "epoch": 0.21065794536360138, + "flos": 603717525504.0, + "grad_norm": 0.08143629367900677, + "language_loss": 0.87639427, + "learning_rate": 0.0009168349316683669, + "loss": 0.88911939, + "num_input_tokens_seen": 90788208, + "router_z_loss_mlp": 0.66992188, + "step": 1095, + "time_per_iteration": 2.705172538757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269472, + "balance_loss_mlp": 1.20462179, + "epoch": 0.21085032704886494, + "flos": 603346765824.0, + "grad_norm": 0.05512017255927588, + "language_loss": 0.83512938, + "learning_rate": 0.0009166627974350741, + "loss": 0.8478241, + "num_input_tokens_seen": 90873776, + "router_z_loss_mlp": 0.64746094, + "step": 1096, + "time_per_iteration": 2.8979411125183105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01259233, + "balance_loss_mlp": 1.19390619, + "epoch": 0.2110427087341285, + "flos": 637671564288.0, + "grad_norm": 0.06519728045913388, + "language_loss": 0.90715098, + "learning_rate": 0.0009164905014389373, + "loss": 0.91974336, + "num_input_tokens_seen": 90945872, + "router_z_loss_mlp": 0.65283203, + "step": 1097, + "time_per_iteration": 2.7965359687805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291926, + "balance_loss_mlp": 1.22445381, + "epoch": 0.21123509041939206, + "flos": 522919203840.0, + "grad_norm": 0.07891140172991894, + "language_loss": 0.87571776, + "learning_rate": 0.0009163180437468476, + "loss": 0.88863701, + "num_input_tokens_seen": 91016224, + "router_z_loss_mlp": 0.67480469, + "step": 1098, + "time_per_iteration": 2.678684949874878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012877, + "balance_loss_mlp": 1.22065675, + "epoch": 0.21142747210465565, + "flos": 451188652032.0, + "grad_norm": 0.06282838131309415, + "language_loss": 0.86816525, + "learning_rate": 0.000916145424425759, + "loss": 0.88104224, + "num_input_tokens_seen": 91086752, + "router_z_loss_mlp": 0.67041016, + "step": 1099, + "time_per_iteration": 2.6685678958892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01305165, + "balance_loss_mlp": 1.23554707, + "epoch": 0.2116198537899192, + "flos": 876175045632.0, + "grad_norm": 0.08616648204830919, + "language_loss": 0.916682, + "learning_rate": 0.0009159726435426885, + "loss": 0.92973363, + "num_input_tokens_seen": 91162960, + "router_z_loss_mlp": 0.69628906, + "step": 1100, + "time_per_iteration": 3.0852713584899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01282199, + "balance_loss_mlp": 1.21677744, + "epoch": 0.21181223547518277, + "flos": 523662921216.0, + "grad_norm": 0.07323647205544051, + "language_loss": 0.91053265, + "learning_rate": 0.0009157997011647154, + "loss": 0.92335469, + "num_input_tokens_seen": 91229840, + "router_z_loss_mlp": 0.65380859, + "step": 1101, + "time_per_iteration": 2.6137943267822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01270647, + "balance_loss_mlp": 1.20784807, + "epoch": 0.21200461716044633, + "flos": 572296015872.0, + "grad_norm": 0.05451247925490285, + "language_loss": 0.87014931, + "learning_rate": 0.0009156265973589817, + "loss": 0.88285577, + "num_input_tokens_seen": 91307936, + "router_z_loss_mlp": 0.62792969, + "step": 1102, + "time_per_iteration": 2.7920916080474854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01255362, + "balance_loss_mlp": 1.1928488, + "epoch": 0.2121969988457099, + "flos": 545129409024.0, + "grad_norm": 0.06310879580708054, + "language_loss": 0.90527534, + "learning_rate": 0.0009154533321926926, + "loss": 0.91782892, + "num_input_tokens_seen": 91372848, + "router_z_loss_mlp": 0.62548828, + "step": 1103, + "time_per_iteration": 2.646440029144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234037, + "balance_loss_mlp": 1.17214394, + "epoch": 0.21238938053097345, + "flos": 843861514752.0, + "grad_norm": 0.07831819024350671, + "language_loss": 0.88472342, + "learning_rate": 0.0009152799057331156, + "loss": 0.89706385, + "num_input_tokens_seen": 91452768, + "router_z_loss_mlp": 0.61865234, + "step": 1104, + "time_per_iteration": 3.122450590133667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214804, + "balance_loss_mlp": 1.15462673, + "epoch": 0.212581762216237, + "flos": 446214998016.0, + "grad_norm": 0.06719929320387279, + "language_loss": 0.91964042, + "learning_rate": 0.0009151063180475805, + "loss": 0.9317885, + "num_input_tokens_seen": 91519888, + "router_z_loss_mlp": 0.6015625, + "step": 1105, + "time_per_iteration": 2.5321173667907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181276, + "balance_loss_mlp": 1.12772751, + "epoch": 0.21277414390150057, + "flos": 514380036096.0, + "grad_norm": 0.07726558156265032, + "language_loss": 0.8518455, + "learning_rate": 0.0009149325692034803, + "loss": 0.86365819, + "num_input_tokens_seen": 91585744, + "router_z_loss_mlp": 0.53613281, + "step": 1106, + "time_per_iteration": 2.6019790172576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129115, + "balance_loss_mlp": 1.10660839, + "epoch": 0.21296652558676413, + "flos": 1485532846080.0, + "grad_norm": 0.0458739418309424, + "language_loss": 0.79203427, + "learning_rate": 0.0009147586592682702, + "loss": 0.80332541, + "num_input_tokens_seen": 91805840, + "router_z_loss_mlp": 0.22460938, + "step": 1107, + "time_per_iteration": 4.859830856323242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180766, + "balance_loss_mlp": 1.12478542, + "epoch": 0.21315890727202771, + "flos": 846040909824.0, + "grad_norm": 0.08338906086238376, + "language_loss": 0.88186961, + "learning_rate": 0.0009145845883094678, + "loss": 0.89367729, + "num_input_tokens_seen": 91885936, + "router_z_loss_mlp": 0.56005859, + "step": 1108, + "time_per_iteration": 3.04249906539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153455, + "balance_loss_mlp": 1.10114598, + "epoch": 0.21335128895729127, + "flos": 629379445248.0, + "grad_norm": 0.07708602471843919, + "language_loss": 0.85793281, + "learning_rate": 0.000914410356394654, + "loss": 0.86946738, + "num_input_tokens_seen": 91959888, + "router_z_loss_mlp": 0.5234375, + "step": 1109, + "time_per_iteration": 4.412867307662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163449, + "balance_loss_mlp": 1.10751617, + "epoch": 0.21354367064255483, + "flos": 710975812608.0, + "grad_norm": 0.08187458054057056, + "language_loss": 0.85334879, + "learning_rate": 0.0009142359635914709, + "loss": 0.86498332, + "num_input_tokens_seen": 92043728, + "router_z_loss_mlp": 0.55957031, + "step": 1110, + "time_per_iteration": 3.023928642272949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148781, + "balance_loss_mlp": 1.09570932, + "epoch": 0.2137360523278184, + "flos": 456201953280.0, + "grad_norm": 0.0669404625356857, + "language_loss": 0.85089076, + "learning_rate": 0.0009140614099676245, + "loss": 0.86237848, + "num_input_tokens_seen": 92114096, + "router_z_loss_mlp": 0.53076172, + "step": 1111, + "time_per_iteration": 2.625797748565674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148537, + "balance_loss_mlp": 1.09632301, + "epoch": 0.21392843401308195, + "flos": 666051393024.0, + "grad_norm": 0.06784083874149466, + "language_loss": 0.83744586, + "learning_rate": 0.0009138866955908821, + "loss": 0.84893119, + "num_input_tokens_seen": 92193552, + "router_z_loss_mlp": 0.52246094, + "step": 1112, + "time_per_iteration": 2.9033186435699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152374, + "balance_loss_mlp": 1.10042286, + "epoch": 0.2141208156983455, + "flos": 748996803072.0, + "grad_norm": 0.0756009236441896, + "language_loss": 0.81778276, + "learning_rate": 0.0009137118205290738, + "loss": 0.82930648, + "num_input_tokens_seen": 92279248, + "router_z_loss_mlp": 0.51977539, + "step": 1113, + "time_per_iteration": 3.00955867767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163421, + "balance_loss_mlp": 1.10677314, + "epoch": 0.21431319738360907, + "flos": 419119971840.0, + "grad_norm": 0.07649003777848401, + "language_loss": 0.90946341, + "learning_rate": 0.0009135367848500924, + "loss": 0.92109764, + "num_input_tokens_seen": 92344064, + "router_z_loss_mlp": 0.56591797, + "step": 1114, + "time_per_iteration": 2.50858211517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167845, + "balance_loss_mlp": 1.11472559, + "epoch": 0.21450557906887263, + "flos": 609126179328.0, + "grad_norm": 0.0823134598214501, + "language_loss": 0.87556803, + "learning_rate": 0.0009133615886218927, + "loss": 0.88724649, + "num_input_tokens_seen": 92410544, + "router_z_loss_mlp": 0.53125, + "step": 1115, + "time_per_iteration": 2.717454195022583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178358, + "balance_loss_mlp": 1.11651218, + "epoch": 0.21469796075413622, + "flos": 561913708032.0, + "grad_norm": 0.06887665628973552, + "language_loss": 0.89567351, + "learning_rate": 0.0009131862319124917, + "loss": 0.90745711, + "num_input_tokens_seen": 92480272, + "router_z_loss_mlp": 0.61816406, + "step": 1116, + "time_per_iteration": 2.623767852783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176568, + "balance_loss_mlp": 1.1235671, + "epoch": 0.21489034243939978, + "flos": 594637272576.0, + "grad_norm": 0.08365937432877864, + "language_loss": 0.85244483, + "learning_rate": 0.0009130107147899691, + "loss": 0.86421049, + "num_input_tokens_seen": 92555584, + "router_z_loss_mlp": 0.53051758, + "step": 1117, + "time_per_iteration": 2.795011281967163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178642, + "balance_loss_mlp": 1.12561774, + "epoch": 0.21508272412466334, + "flos": 441898426368.0, + "grad_norm": 0.06665693704910039, + "language_loss": 0.8600654, + "learning_rate": 0.0009128350373224665, + "loss": 0.8718518, + "num_input_tokens_seen": 92623136, + "router_z_loss_mlp": 0.53076172, + "step": 1118, + "time_per_iteration": 2.5644795894622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011837, + "balance_loss_mlp": 1.15928602, + "epoch": 0.2152751058099269, + "flos": 1496162202624.0, + "grad_norm": 0.058896568697900505, + "language_loss": 0.81456429, + "learning_rate": 0.0009126591995781883, + "loss": 0.82640129, + "num_input_tokens_seen": 92842608, + "router_z_loss_mlp": 0.24414062, + "step": 1119, + "time_per_iteration": 4.683669090270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204932, + "balance_loss_mlp": 1.15031052, + "epoch": 0.21546748749519046, + "flos": 494005630464.0, + "grad_norm": 0.07135490421069918, + "language_loss": 0.85804355, + "learning_rate": 0.0009124832016254005, + "loss": 0.87009287, + "num_input_tokens_seen": 92912960, + "router_z_loss_mlp": 0.54663086, + "step": 1120, + "time_per_iteration": 2.6158647537231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206508, + "balance_loss_mlp": 1.14571166, + "epoch": 0.21565986918045402, + "flos": 634531138560.0, + "grad_norm": 0.055578106746994274, + "language_loss": 0.89113355, + "learning_rate": 0.0009123070435324316, + "loss": 0.9031986, + "num_input_tokens_seen": 92982272, + "router_z_loss_mlp": 0.60791016, + "step": 1121, + "time_per_iteration": 2.755823850631714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102102, + "balance_loss_mlp": 1.07988179, + "epoch": 0.21585225086571758, + "flos": 1583359570944.0, + "grad_norm": 0.03051163671975961, + "language_loss": 0.77875781, + "learning_rate": 0.0009121307253676722, + "loss": 0.78977883, + "num_input_tokens_seen": 93218752, + "router_z_loss_mlp": 0.22265625, + "step": 1122, + "time_per_iteration": 4.996071100234985 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211089, + "balance_loss_mlp": 1.15358257, + "epoch": 0.21604463255098114, + "flos": 684103242240.0, + "grad_norm": 0.06035521524280068, + "language_loss": 0.87145722, + "learning_rate": 0.0009119542471995752, + "loss": 0.88356811, + "num_input_tokens_seen": 93293968, + "router_z_loss_mlp": 0.57446289, + "step": 1123, + "time_per_iteration": 2.8323612213134766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204972, + "balance_loss_mlp": 1.14675009, + "epoch": 0.2162370142362447, + "flos": 780989133312.0, + "grad_norm": 0.060035653180353525, + "language_loss": 0.8248235, + "learning_rate": 0.0009117776090966554, + "loss": 0.83687323, + "num_input_tokens_seen": 93367088, + "router_z_loss_mlp": 0.58251953, + "step": 1124, + "time_per_iteration": 2.954216480255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216387, + "balance_loss_mlp": 1.1558764, + "epoch": 0.21642939592150828, + "flos": 1002147406848.0, + "grad_norm": 0.07791040933307145, + "language_loss": 0.876288, + "learning_rate": 0.0009116008111274899, + "loss": 0.88845193, + "num_input_tokens_seen": 93452944, + "router_z_loss_mlp": 0.60498047, + "step": 1125, + "time_per_iteration": 3.2826616764068604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102381, + "balance_loss_mlp": 1.08216333, + "epoch": 0.21662177760677184, + "flos": 1482644238336.0, + "grad_norm": 0.030294405796961115, + "language_loss": 0.79106927, + "learning_rate": 0.0009114238533607176, + "loss": 0.80209303, + "num_input_tokens_seen": 93677328, + "router_z_loss_mlp": 0.20214844, + "step": 1126, + "time_per_iteration": 4.8284173011779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202163, + "balance_loss_mlp": 1.1455152, + "epoch": 0.2168141592920354, + "flos": 887395046400.0, + "grad_norm": 0.10762952047928877, + "language_loss": 0.8553561, + "learning_rate": 0.0009112467358650396, + "loss": 0.86737764, + "num_input_tokens_seen": 93756848, + "router_z_loss_mlp": 0.56640625, + "step": 1127, + "time_per_iteration": 3.1621291637420654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192643, + "balance_loss_mlp": 1.13561273, + "epoch": 0.21700654097729896, + "flos": 545961959424.0, + "grad_norm": 0.06435190440672867, + "language_loss": 0.87181705, + "learning_rate": 0.0009110694587092192, + "loss": 0.88374346, + "num_input_tokens_seen": 93834704, + "router_z_loss_mlp": 0.56982422, + "step": 1128, + "time_per_iteration": 2.7597765922546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194699, + "balance_loss_mlp": 1.13452196, + "epoch": 0.21719892266256252, + "flos": 509522379264.0, + "grad_norm": 0.06894978951163175, + "language_loss": 0.8223331, + "learning_rate": 0.0009108920219620815, + "loss": 0.83428001, + "num_input_tokens_seen": 93904448, + "router_z_loss_mlp": 0.6015625, + "step": 1129, + "time_per_iteration": 2.6658482551574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198898, + "balance_loss_mlp": 1.14072335, + "epoch": 0.21739130434782608, + "flos": 543412177920.0, + "grad_norm": 0.06550313542995663, + "language_loss": 0.90210444, + "learning_rate": 0.0009107144256925133, + "loss": 0.91409343, + "num_input_tokens_seen": 93979312, + "router_z_loss_mlp": 0.58154297, + "step": 1130, + "time_per_iteration": 2.7298777103424072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211101, + "balance_loss_mlp": 1.15464389, + "epoch": 0.21758368603308964, + "flos": 616847477760.0, + "grad_norm": 0.08430456831611369, + "language_loss": 0.82975614, + "learning_rate": 0.0009105366699694638, + "loss": 0.84186715, + "num_input_tokens_seen": 94052032, + "router_z_loss_mlp": 0.56445312, + "step": 1131, + "time_per_iteration": 2.7422807216644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121305, + "balance_loss_mlp": 1.15263498, + "epoch": 0.2177760677183532, + "flos": 635116640256.0, + "grad_norm": 0.05499133039406014, + "language_loss": 0.82219702, + "learning_rate": 0.0009103587548619439, + "loss": 0.83432752, + "num_input_tokens_seen": 94124944, + "router_z_loss_mlp": 0.60400391, + "step": 1132, + "time_per_iteration": 2.8834011554718018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202147, + "balance_loss_mlp": 1.14468873, + "epoch": 0.2179684494036168, + "flos": 532463818752.0, + "grad_norm": 0.12855794167944481, + "language_loss": 0.87174821, + "learning_rate": 0.0009101806804390261, + "loss": 0.88376963, + "num_input_tokens_seen": 94200384, + "router_z_loss_mlp": 0.57421875, + "step": 1133, + "time_per_iteration": 2.8493435382843018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186046, + "balance_loss_mlp": 1.13082814, + "epoch": 0.21816083108888035, + "flos": 475219975680.0, + "grad_norm": 0.07046865468216726, + "language_loss": 0.91345453, + "learning_rate": 0.0009100024467698453, + "loss": 0.92531502, + "num_input_tokens_seen": 94266992, + "router_z_loss_mlp": 0.55175781, + "step": 1134, + "time_per_iteration": 2.6036603450775146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184337, + "balance_loss_mlp": 1.12613893, + "epoch": 0.2183532127741439, + "flos": 577467532800.0, + "grad_norm": 0.07929007457036284, + "language_loss": 0.8353889, + "learning_rate": 0.0009098240539235981, + "loss": 0.84723222, + "num_input_tokens_seen": 94334304, + "router_z_loss_mlp": 0.58227539, + "step": 1135, + "time_per_iteration": 2.6736483573913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176396, + "balance_loss_mlp": 1.12122619, + "epoch": 0.21854559445940747, + "flos": 594120780288.0, + "grad_norm": 0.06661367385494366, + "language_loss": 0.88575935, + "learning_rate": 0.0009096455019695423, + "loss": 0.89752328, + "num_input_tokens_seen": 94413296, + "router_z_loss_mlp": 0.55224609, + "step": 1136, + "time_per_iteration": 2.8438823223114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172318, + "balance_loss_mlp": 1.1156702, + "epoch": 0.21873797614467103, + "flos": 408680764416.0, + "grad_norm": 0.07075177433605506, + "language_loss": 0.90707165, + "learning_rate": 0.000909466790976998, + "loss": 0.91879487, + "num_input_tokens_seen": 94475840, + "router_z_loss_mlp": 0.56616211, + "step": 1137, + "time_per_iteration": 2.4795870780944824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185042, + "balance_loss_mlp": 1.12801182, + "epoch": 0.21893035782993459, + "flos": 894189818880.0, + "grad_norm": 0.07051320604800417, + "language_loss": 0.83409071, + "learning_rate": 0.0009092879210153473, + "loss": 0.84594113, + "num_input_tokens_seen": 94555184, + "router_z_loss_mlp": 0.57080078, + "step": 1138, + "time_per_iteration": 3.1328911781311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186779, + "balance_loss_mlp": 1.13284826, + "epoch": 0.21912273951519814, + "flos": 467627157504.0, + "grad_norm": 0.06458215213012623, + "language_loss": 0.89566886, + "learning_rate": 0.0009091088921540333, + "loss": 0.90753663, + "num_input_tokens_seen": 94622656, + "router_z_loss_mlp": 0.54003906, + "step": 1139, + "time_per_iteration": 2.5608675479888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046887, + "balance_loss_mlp": 1.03115106, + "epoch": 0.2193151212004617, + "flos": 1532043445248.0, + "grad_norm": 0.027642480599540168, + "language_loss": 0.75508678, + "learning_rate": 0.0009089297044625615, + "loss": 0.76555562, + "num_input_tokens_seen": 94856496, + "router_z_loss_mlp": 0.15722656, + "step": 1140, + "time_per_iteration": 4.908522605895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117392, + "balance_loss_mlp": 1.11908412, + "epoch": 0.2195075028857253, + "flos": 591175646208.0, + "grad_norm": 0.0906322081519832, + "language_loss": 0.84775734, + "learning_rate": 0.0009087503580104985, + "loss": 0.85949653, + "num_input_tokens_seen": 94926880, + "router_z_loss_mlp": 0.54882812, + "step": 1141, + "time_per_iteration": 2.696129083633423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181851, + "balance_loss_mlp": 1.12558413, + "epoch": 0.21969988457098885, + "flos": 636329862144.0, + "grad_norm": 0.16226849767110665, + "language_loss": 0.80068243, + "learning_rate": 0.0009085708528674728, + "loss": 0.81250095, + "num_input_tokens_seen": 95000528, + "router_z_loss_mlp": 0.56347656, + "step": 1142, + "time_per_iteration": 2.7995505332946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157793, + "balance_loss_mlp": 1.09985733, + "epoch": 0.2198922662562524, + "flos": 912350324736.0, + "grad_norm": 0.08217329602320493, + "language_loss": 0.874843, + "learning_rate": 0.0009083911891031745, + "loss": 0.88642091, + "num_input_tokens_seen": 95081040, + "router_z_loss_mlp": 0.57958984, + "step": 1143, + "time_per_iteration": 3.1351919174194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115076, + "balance_loss_mlp": 1.09578109, + "epoch": 0.22008464794151597, + "flos": 822980528640.0, + "grad_norm": 0.06169995263224583, + "language_loss": 0.92273706, + "learning_rate": 0.0009082113667873553, + "loss": 0.93424463, + "num_input_tokens_seen": 95167328, + "router_z_loss_mlp": 0.55029297, + "step": 1144, + "time_per_iteration": 3.1171934604644775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153616, + "balance_loss_mlp": 1.10087752, + "epoch": 0.22027702962677953, + "flos": 459656239104.0, + "grad_norm": 0.07183124767141379, + "language_loss": 0.91221762, + "learning_rate": 0.0009080313859898283, + "loss": 0.9237538, + "num_input_tokens_seen": 95230304, + "router_z_loss_mlp": 0.52758789, + "step": 1145, + "time_per_iteration": 2.506591796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153774, + "balance_loss_mlp": 1.09986758, + "epoch": 0.2204694113120431, + "flos": 531255739392.0, + "grad_norm": 0.07077080612529597, + "language_loss": 0.92340779, + "learning_rate": 0.0009078512467804684, + "loss": 0.93494552, + "num_input_tokens_seen": 95299520, + "router_z_loss_mlp": 0.53881836, + "step": 1146, + "time_per_iteration": 2.591327667236328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172392, + "balance_loss_mlp": 1.11800838, + "epoch": 0.22066179299730665, + "flos": 522642419712.0, + "grad_norm": 0.07651793216141736, + "language_loss": 0.91144007, + "learning_rate": 0.0009076709492292119, + "loss": 0.92316401, + "num_input_tokens_seen": 95368912, + "router_z_loss_mlp": 0.54418945, + "step": 1147, + "time_per_iteration": 2.609628438949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169189, + "balance_loss_mlp": 1.11723804, + "epoch": 0.2208541746825702, + "flos": 546451287552.0, + "grad_norm": 0.07920780045429675, + "language_loss": 0.89603102, + "learning_rate": 0.0009074904934060562, + "loss": 0.90772295, + "num_input_tokens_seen": 95440800, + "router_z_loss_mlp": 0.51928711, + "step": 1148, + "time_per_iteration": 2.6755712032318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173959, + "balance_loss_mlp": 1.11697721, + "epoch": 0.22104655636783377, + "flos": 708734748672.0, + "grad_norm": 0.08245317941840166, + "language_loss": 0.8559376, + "learning_rate": 0.0009073098793810607, + "loss": 0.86767721, + "num_input_tokens_seen": 95519904, + "router_z_loss_mlp": 0.57006836, + "step": 1149, + "time_per_iteration": 2.9874348640441895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177675, + "balance_loss_mlp": 1.12293434, + "epoch": 0.22123893805309736, + "flos": 584867630592.0, + "grad_norm": 0.08525751827962168, + "language_loss": 0.88982397, + "learning_rate": 0.000907129107224346, + "loss": 0.90160072, + "num_input_tokens_seen": 95591568, + "router_z_loss_mlp": 0.54785156, + "step": 1150, + "time_per_iteration": 2.739461660385132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180589, + "balance_loss_mlp": 1.12658715, + "epoch": 0.22143131973836092, + "flos": 492251323392.0, + "grad_norm": 0.05205595876874212, + "language_loss": 0.88991034, + "learning_rate": 0.0009069481770060939, + "loss": 0.90171623, + "num_input_tokens_seen": 95664480, + "router_z_loss_mlp": 0.54077148, + "step": 1151, + "time_per_iteration": 2.7024669647216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187248, + "balance_loss_mlp": 1.13212562, + "epoch": 0.22162370142362448, + "flos": 1079674251264.0, + "grad_norm": 0.06739531662392768, + "language_loss": 0.84448045, + "learning_rate": 0.000906767088796548, + "loss": 0.85635293, + "num_input_tokens_seen": 95754400, + "router_z_loss_mlp": 0.55126953, + "step": 1152, + "time_per_iteration": 3.4467508792877197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117836, + "balance_loss_mlp": 1.12571764, + "epoch": 0.22181608310888803, + "flos": 492508283904.0, + "grad_norm": 0.05411857974090042, + "language_loss": 0.8779093, + "learning_rate": 0.0009065858426660127, + "loss": 0.8896929, + "num_input_tokens_seen": 95826944, + "router_z_loss_mlp": 0.52661133, + "step": 1153, + "time_per_iteration": 2.6216752529144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182109, + "balance_loss_mlp": 1.12736845, + "epoch": 0.2220084647941516, + "flos": 724014360576.0, + "grad_norm": 0.07769931213358174, + "language_loss": 0.84979808, + "learning_rate": 0.0009064044386848543, + "loss": 0.86161917, + "num_input_tokens_seen": 95902688, + "router_z_loss_mlp": 0.54833984, + "step": 1154, + "time_per_iteration": 2.91601824760437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172512, + "balance_loss_mlp": 1.11381316, + "epoch": 0.22220084647941515, + "flos": 489239377920.0, + "grad_norm": 0.0711084155390928, + "language_loss": 0.89741302, + "learning_rate": 0.0009062228769234997, + "loss": 0.90913814, + "num_input_tokens_seen": 95969952, + "router_z_loss_mlp": 0.58691406, + "step": 1155, + "time_per_iteration": 2.5972864627838135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116208, + "balance_loss_mlp": 1.10690951, + "epoch": 0.2223932281646787, + "flos": 536278952448.0, + "grad_norm": 0.09100503083112628, + "language_loss": 0.81526613, + "learning_rate": 0.0009060411574524376, + "loss": 0.82688695, + "num_input_tokens_seen": 96037344, + "router_z_loss_mlp": 0.55224609, + "step": 1156, + "time_per_iteration": 2.6763274669647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182591, + "balance_loss_mlp": 1.12684917, + "epoch": 0.22258560984994227, + "flos": 931420104192.0, + "grad_norm": 0.06563385289017937, + "language_loss": 0.88585329, + "learning_rate": 0.0009058592803422178, + "loss": 0.89767921, + "num_input_tokens_seen": 96115616, + "router_z_loss_mlp": 0.55810547, + "step": 1157, + "time_per_iteration": 3.1414153575897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026819, + "balance_loss_mlp": 1.00955701, + "epoch": 0.22277799153520586, + "flos": 1199675930112.0, + "grad_norm": 0.012760142008093896, + "language_loss": 0.78710288, + "learning_rate": 0.0009056772456634512, + "loss": 0.79737109, + "num_input_tokens_seen": 96333600, + "router_z_loss_mlp": 0.17285156, + "step": 1158, + "time_per_iteration": 4.802858352661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171905, + "balance_loss_mlp": 1.12126482, + "epoch": 0.22297037322046942, + "flos": 501304412160.0, + "grad_norm": 0.060083734909452326, + "language_loss": 0.90886426, + "learning_rate": 0.00090549505348681, + "loss": 0.92058331, + "num_input_tokens_seen": 96402544, + "router_z_loss_mlp": 0.50683594, + "step": 1159, + "time_per_iteration": 2.5810928344726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168069, + "balance_loss_mlp": 1.11137354, + "epoch": 0.22316275490573298, + "flos": 752752465920.0, + "grad_norm": 0.07069918091424116, + "language_loss": 0.85149121, + "learning_rate": 0.0009053127038830275, + "loss": 0.86317194, + "num_input_tokens_seen": 96487600, + "router_z_loss_mlp": 0.56689453, + "step": 1160, + "time_per_iteration": 3.009434223175049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162107, + "balance_loss_mlp": 1.1050297, + "epoch": 0.22335513659099654, + "flos": 514802552832.0, + "grad_norm": 0.07200535138488619, + "language_loss": 0.87409687, + "learning_rate": 0.000905130196922898, + "loss": 0.88571799, + "num_input_tokens_seen": 96554912, + "router_z_loss_mlp": 0.57080078, + "step": 1161, + "time_per_iteration": 2.5972068309783936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157457, + "balance_loss_mlp": 1.10223973, + "epoch": 0.2235475182762601, + "flos": 484530024960.0, + "grad_norm": 0.053497533436564174, + "language_loss": 0.8808614, + "learning_rate": 0.0009049475326772769, + "loss": 0.89243597, + "num_input_tokens_seen": 96624192, + "router_z_loss_mlp": 0.55224609, + "step": 1162, + "time_per_iteration": 2.580254316329956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167432, + "balance_loss_mlp": 1.11092722, + "epoch": 0.22373989996152366, + "flos": 469971735552.0, + "grad_norm": 0.105825736895628, + "language_loss": 0.83639884, + "learning_rate": 0.0009047647112170811, + "loss": 0.84807312, + "num_input_tokens_seen": 96701040, + "router_z_loss_mlp": 0.56469727, + "step": 1163, + "time_per_iteration": 2.7509572505950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170262, + "balance_loss_mlp": 1.11041939, + "epoch": 0.22393228164678722, + "flos": 1271012249088.0, + "grad_norm": 0.11729347611284674, + "language_loss": 0.8833853, + "learning_rate": 0.0009045817326132876, + "loss": 0.89508796, + "num_input_tokens_seen": 96791200, + "router_z_loss_mlp": 0.59814453, + "step": 1164, + "time_per_iteration": 3.6648380756378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170775, + "balance_loss_mlp": 1.11226714, + "epoch": 0.22412466333205078, + "flos": 596334680064.0, + "grad_norm": 0.05704665841604838, + "language_loss": 0.83974147, + "learning_rate": 0.0009043985969369357, + "loss": 0.85144925, + "num_input_tokens_seen": 96869360, + "router_z_loss_mlp": 0.58544922, + "step": 1165, + "time_per_iteration": 2.868560314178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176977, + "balance_loss_mlp": 1.11665666, + "epoch": 0.22431704501731436, + "flos": 608434219008.0, + "grad_norm": 0.059940537627208516, + "language_loss": 0.84960037, + "learning_rate": 0.0009042153042591245, + "loss": 0.86137015, + "num_input_tokens_seen": 96945840, + "router_z_loss_mlp": 0.60302734, + "step": 1166, + "time_per_iteration": 2.8023743629455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116839, + "balance_loss_mlp": 1.11271954, + "epoch": 0.22450942670257792, + "flos": 906583394304.0, + "grad_norm": 0.054742371261080745, + "language_loss": 0.85761929, + "learning_rate": 0.0009040318546510146, + "loss": 0.86930317, + "num_input_tokens_seen": 97029296, + "router_z_loss_mlp": 0.55639648, + "step": 1167, + "time_per_iteration": 3.141993999481201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117745, + "balance_loss_mlp": 1.1215651, + "epoch": 0.22470180838784148, + "flos": 565301182464.0, + "grad_norm": 0.07712318573741421, + "language_loss": 0.8582288, + "learning_rate": 0.0009038482481838275, + "loss": 0.87000328, + "num_input_tokens_seen": 97097776, + "router_z_loss_mlp": 0.55957031, + "step": 1168, + "time_per_iteration": 2.675204038619995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116517, + "balance_loss_mlp": 1.1128844, + "epoch": 0.22489419007310504, + "flos": 834469972992.0, + "grad_norm": 0.05640688657343365, + "language_loss": 0.88303328, + "learning_rate": 0.0009036644849288455, + "loss": 0.89468497, + "num_input_tokens_seen": 97181424, + "router_z_loss_mlp": 0.52319336, + "step": 1169, + "time_per_iteration": 3.0777511596679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148681, + "balance_loss_mlp": 1.09441662, + "epoch": 0.2250865717583686, + "flos": 581057639424.0, + "grad_norm": 0.07174166621143864, + "language_loss": 0.86291218, + "learning_rate": 0.0009034805649574118, + "loss": 0.87439895, + "num_input_tokens_seen": 97252128, + "router_z_loss_mlp": 0.54394531, + "step": 1170, + "time_per_iteration": 2.7120091915130615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157496, + "balance_loss_mlp": 1.10513926, + "epoch": 0.22527895344363216, + "flos": 600406401024.0, + "grad_norm": 0.05497638968028837, + "language_loss": 0.85883957, + "learning_rate": 0.0009032964883409308, + "loss": 0.87041461, + "num_input_tokens_seen": 97326640, + "router_z_loss_mlp": 0.52441406, + "step": 1171, + "time_per_iteration": 2.8770556449890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104728, + "balance_loss_mlp": 1.03001809, + "epoch": 0.22547133512889572, + "flos": 1440751587840.0, + "grad_norm": 0.027786176955518046, + "language_loss": 0.73050535, + "learning_rate": 0.000903112255150867, + "loss": 0.74097812, + "num_input_tokens_seen": 97553952, + "router_z_loss_mlp": 0.17285156, + "step": 1172, + "time_per_iteration": 4.997943639755249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150837, + "balance_loss_mlp": 1.0977174, + "epoch": 0.22566371681415928, + "flos": 490618156032.0, + "grad_norm": 0.06380875138992877, + "language_loss": 0.87640917, + "learning_rate": 0.0009029278654587462, + "loss": 0.88791752, + "num_input_tokens_seen": 97623584, + "router_z_loss_mlp": 0.53173828, + "step": 1173, + "time_per_iteration": 2.6070940494537354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148484, + "balance_loss_mlp": 1.09546018, + "epoch": 0.22585609849942284, + "flos": 604616887296.0, + "grad_norm": 0.057211485944593306, + "language_loss": 0.83189976, + "learning_rate": 0.0009027433193361548, + "loss": 0.84338462, + "num_input_tokens_seen": 97695952, + "router_z_loss_mlp": 0.53027344, + "step": 1174, + "time_per_iteration": 2.7072205543518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114932, + "balance_loss_mlp": 1.09708285, + "epoch": 0.22604848018468643, + "flos": 635568892416.0, + "grad_norm": 0.06182212989299174, + "language_loss": 0.86948568, + "learning_rate": 0.00090255861685474, + "loss": 0.88097882, + "num_input_tokens_seen": 97764544, + "router_z_loss_mlp": 0.52246094, + "step": 1175, + "time_per_iteration": 2.7387607097625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146248, + "balance_loss_mlp": 1.09284246, + "epoch": 0.22624086186995, + "flos": 479875000320.0, + "grad_norm": 0.06871471519475823, + "language_loss": 0.92170686, + "learning_rate": 0.0009023737580862095, + "loss": 0.93316931, + "num_input_tokens_seen": 97830976, + "router_z_loss_mlp": 0.53442383, + "step": 1176, + "time_per_iteration": 2.6016342639923096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160546, + "balance_loss_mlp": 1.11035883, + "epoch": 0.22643324355521355, + "flos": 495814265856.0, + "grad_norm": 0.0563237464245993, + "language_loss": 0.83948356, + "learning_rate": 0.0009021887431023321, + "loss": 0.851089, + "num_input_tokens_seen": 97898800, + "router_z_loss_mlp": 0.50219727, + "step": 1177, + "time_per_iteration": 2.5911412239074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161678, + "balance_loss_mlp": 1.11063254, + "epoch": 0.2266256252404771, + "flos": 561552860160.0, + "grad_norm": 0.06510699727290163, + "language_loss": 0.88054293, + "learning_rate": 0.0009020035719749369, + "loss": 0.8921597, + "num_input_tokens_seen": 97974112, + "router_z_loss_mlp": 0.51098633, + "step": 1178, + "time_per_iteration": 2.747715473175049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182179, + "balance_loss_mlp": 1.1255312, + "epoch": 0.22681800692574067, + "flos": 579688399872.0, + "grad_norm": 0.0760827261000747, + "language_loss": 0.78592283, + "learning_rate": 0.0009018182447759136, + "loss": 0.79774463, + "num_input_tokens_seen": 98056640, + "router_z_loss_mlp": 0.56616211, + "step": 1179, + "time_per_iteration": 2.9912376403808594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177946, + "balance_loss_mlp": 1.12287188, + "epoch": 0.22701038861100423, + "flos": 740166170112.0, + "grad_norm": 0.05857060866656224, + "language_loss": 0.80403864, + "learning_rate": 0.0009016327615772126, + "loss": 0.81581813, + "num_input_tokens_seen": 98135952, + "router_z_loss_mlp": 0.55126953, + "step": 1180, + "time_per_iteration": 2.951934337615967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178867, + "balance_loss_mlp": 1.1241498, + "epoch": 0.2272027702962678, + "flos": 577257560064.0, + "grad_norm": 0.07803208794693026, + "language_loss": 0.88654709, + "learning_rate": 0.0009014471224508451, + "loss": 0.8983357, + "num_input_tokens_seen": 98204288, + "router_z_loss_mlp": 0.54711914, + "step": 1181, + "time_per_iteration": 2.6834704875946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175396, + "balance_loss_mlp": 1.12280107, + "epoch": 0.22739515198153135, + "flos": 544267123200.0, + "grad_norm": 0.07891792311297686, + "language_loss": 0.84171915, + "learning_rate": 0.0009012613274688823, + "loss": 0.85347319, + "num_input_tokens_seen": 98269856, + "router_z_loss_mlp": 0.52636719, + "step": 1182, + "time_per_iteration": 2.6773135662078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193932, + "balance_loss_mlp": 1.13711679, + "epoch": 0.22758753366679493, + "flos": 440163942912.0, + "grad_norm": 0.06685387295915801, + "language_loss": 0.88334668, + "learning_rate": 0.0009010753767034565, + "loss": 0.89528602, + "num_input_tokens_seen": 98335632, + "router_z_loss_mlp": 0.56811523, + "step": 1183, + "time_per_iteration": 2.53671932220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192681, + "balance_loss_mlp": 1.13732028, + "epoch": 0.2277799153520585, + "flos": 729447607296.0, + "grad_norm": 0.05676884979808662, + "language_loss": 0.79381895, + "learning_rate": 0.0009008892702267599, + "loss": 0.80574578, + "num_input_tokens_seen": 98420592, + "router_z_loss_mlp": 0.55297852, + "step": 1184, + "time_per_iteration": 2.9609317779541016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218637, + "balance_loss_mlp": 1.16055822, + "epoch": 0.22797229703732205, + "flos": 526894751232.0, + "grad_norm": 0.11080255811352213, + "language_loss": 0.897048, + "learning_rate": 0.0009007030081110457, + "loss": 0.9092344, + "num_input_tokens_seen": 98488096, + "router_z_loss_mlp": 0.58105469, + "step": 1185, + "time_per_iteration": 2.6087658405303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212656, + "balance_loss_mlp": 1.15872598, + "epoch": 0.2281646787225856, + "flos": 535431347712.0, + "grad_norm": 0.06215110995007368, + "language_loss": 0.8510564, + "learning_rate": 0.000900516590428627, + "loss": 0.8631829, + "num_input_tokens_seen": 98561664, + "router_z_loss_mlp": 0.53955078, + "step": 1186, + "time_per_iteration": 2.66469407081604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206955, + "balance_loss_mlp": 1.15416956, + "epoch": 0.22835706040784917, + "flos": 541381086720.0, + "grad_norm": 0.07510292852734143, + "language_loss": 0.90231287, + "learning_rate": 0.0009003300172518778, + "loss": 0.91438246, + "num_input_tokens_seen": 98634336, + "router_z_loss_mlp": 0.52807617, + "step": 1187, + "time_per_iteration": 2.6872987747192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189379, + "balance_loss_mlp": 1.13559163, + "epoch": 0.22854944209311273, + "flos": 790637635584.0, + "grad_norm": 0.06187047573177096, + "language_loss": 0.84854043, + "learning_rate": 0.0009001432886532321, + "loss": 0.86043417, + "num_input_tokens_seen": 98709600, + "router_z_loss_mlp": 0.53808594, + "step": 1188, + "time_per_iteration": 2.961327314376831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185796, + "balance_loss_mlp": 1.13248527, + "epoch": 0.2287418237783763, + "flos": 469280148480.0, + "grad_norm": 0.0670290505569486, + "language_loss": 0.87277937, + "learning_rate": 0.0008999564047051843, + "loss": 0.88463724, + "num_input_tokens_seen": 98775024, + "router_z_loss_mlp": 0.53320312, + "step": 1189, + "time_per_iteration": 2.5120058059692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119823, + "balance_loss_mlp": 1.14773321, + "epoch": 0.22893420546363985, + "flos": 468029850624.0, + "grad_norm": 0.07775817493182749, + "language_loss": 0.85562766, + "learning_rate": 0.0008997693654802894, + "loss": 0.86760998, + "num_input_tokens_seen": 98845248, + "router_z_loss_mlp": 0.50537109, + "step": 1190, + "time_per_iteration": 2.6584115028381348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203195, + "balance_loss_mlp": 1.15343666, + "epoch": 0.22912658714890344, + "flos": 626258843136.0, + "grad_norm": 0.08092173087187808, + "language_loss": 0.87245274, + "learning_rate": 0.0008995821710511625, + "loss": 0.88448465, + "num_input_tokens_seen": 98913584, + "router_z_loss_mlp": 0.49780273, + "step": 1191, + "time_per_iteration": 2.75514817237854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189711, + "balance_loss_mlp": 1.14376771, + "epoch": 0.229318968834167, + "flos": 503031555072.0, + "grad_norm": 0.058050392882622655, + "language_loss": 0.85975361, + "learning_rate": 0.0008993948214904786, + "loss": 0.8716507, + "num_input_tokens_seen": 98978608, + "router_z_loss_mlp": 0.45922852, + "step": 1192, + "time_per_iteration": 2.5808064937591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132885, + "balance_loss_mlp": 1.11629128, + "epoch": 0.22951135051943056, + "flos": 1374827613696.0, + "grad_norm": 0.04438752541684951, + "language_loss": 0.78422213, + "learning_rate": 0.0008992073168709733, + "loss": 0.795551, + "num_input_tokens_seen": 99207424, + "router_z_loss_mlp": 0.16601562, + "step": 1193, + "time_per_iteration": 4.915351629257202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170271, + "balance_loss_mlp": 1.11338401, + "epoch": 0.22970373220469412, + "flos": 644345197056.0, + "grad_norm": 0.06516354982073377, + "language_loss": 0.79226351, + "learning_rate": 0.0008990196572654427, + "loss": 0.80396616, + "num_input_tokens_seen": 99290592, + "router_z_loss_mlp": 0.56933594, + "step": 1194, + "time_per_iteration": 2.914353609085083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159508, + "balance_loss_mlp": 1.10982203, + "epoch": 0.22989611388995768, + "flos": 500209758720.0, + "grad_norm": 0.053033431306196574, + "language_loss": 0.88186455, + "learning_rate": 0.0008988318427467426, + "loss": 0.89345956, + "num_input_tokens_seen": 99366096, + "router_z_loss_mlp": 0.49707031, + "step": 1195, + "time_per_iteration": 2.763303756713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146949, + "balance_loss_mlp": 1.09754825, + "epoch": 0.23008849557522124, + "flos": 1096522790400.0, + "grad_norm": 0.06471781599702997, + "language_loss": 0.87142104, + "learning_rate": 0.0008986438733877887, + "loss": 0.88289052, + "num_input_tokens_seen": 99456768, + "router_z_loss_mlp": 0.49414062, + "step": 1196, + "time_per_iteration": 3.453037738800049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138036, + "balance_loss_mlp": 1.08901691, + "epoch": 0.2302808772604848, + "flos": 683648418816.0, + "grad_norm": 0.05831436273017673, + "language_loss": 0.84795159, + "learning_rate": 0.0008984557492615576, + "loss": 0.85933197, + "num_input_tokens_seen": 99539616, + "router_z_loss_mlp": 0.49023438, + "step": 1197, + "time_per_iteration": 2.9209883213043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147831, + "balance_loss_mlp": 1.09816873, + "epoch": 0.23047325894574835, + "flos": 528923271168.0, + "grad_norm": 0.06183090029168821, + "language_loss": 0.90001792, + "learning_rate": 0.0008982674704410854, + "loss": 0.91149628, + "num_input_tokens_seen": 99612064, + "router_z_loss_mlp": 0.49658203, + "step": 1198, + "time_per_iteration": 2.723980665206909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115357, + "balance_loss_mlp": 1.10364521, + "epoch": 0.23066564063101191, + "flos": 682766309376.0, + "grad_norm": 0.06439147944581719, + "language_loss": 0.78128076, + "learning_rate": 0.0008980790369994682, + "loss": 0.7928164, + "num_input_tokens_seen": 99691040, + "router_z_loss_mlp": 0.49926758, + "step": 1199, + "time_per_iteration": 2.968733787536621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148741, + "balance_loss_mlp": 1.09817219, + "epoch": 0.2308580223162755, + "flos": 558523662336.0, + "grad_norm": 0.060755539801175186, + "language_loss": 0.8790828, + "learning_rate": 0.000897890449009863, + "loss": 0.89057022, + "num_input_tokens_seen": 99762016, + "router_z_loss_mlp": 0.50561523, + "step": 1200, + "time_per_iteration": 2.7373695373535156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159063, + "balance_loss_mlp": 1.11052144, + "epoch": 0.23105040400153906, + "flos": 555669932544.0, + "grad_norm": 0.09508340337221405, + "language_loss": 0.9041636, + "learning_rate": 0.0008977017065454853, + "loss": 0.91575426, + "num_input_tokens_seen": 99835552, + "router_z_loss_mlp": 0.4855957, + "step": 1201, + "time_per_iteration": 2.6561479568481445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172289, + "balance_loss_mlp": 1.12393796, + "epoch": 0.23124278568680262, + "flos": 704788936704.0, + "grad_norm": 0.06896397472633412, + "language_loss": 0.8110497, + "learning_rate": 0.0008975128096796121, + "loss": 0.82277262, + "num_input_tokens_seen": 99910784, + "router_z_loss_mlp": 0.48413086, + "step": 1202, + "time_per_iteration": 2.850882053375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166428, + "balance_loss_mlp": 1.11583591, + "epoch": 0.23143516737206618, + "flos": 612768043008.0, + "grad_norm": 0.07234791297382964, + "language_loss": 0.86751068, + "learning_rate": 0.0008973237584855794, + "loss": 0.87917495, + "num_input_tokens_seen": 99991120, + "router_z_loss_mlp": 0.50610352, + "step": 1203, + "time_per_iteration": 2.898651599884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201199, + "balance_loss_mlp": 1.14912796, + "epoch": 0.23162754905732974, + "flos": 389242796544.0, + "grad_norm": 0.0647782155366788, + "language_loss": 0.82535917, + "learning_rate": 0.0008971345530367832, + "loss": 0.83737111, + "num_input_tokens_seen": 100053888, + "router_z_loss_mlp": 0.52172852, + "step": 1204, + "time_per_iteration": 2.479710102081299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188056, + "balance_loss_mlp": 1.13743997, + "epoch": 0.2318199307425933, + "flos": 667778535936.0, + "grad_norm": 0.07976239468268423, + "language_loss": 0.86050093, + "learning_rate": 0.0008969451934066799, + "loss": 0.87238145, + "num_input_tokens_seen": 100124176, + "router_z_loss_mlp": 0.50561523, + "step": 1205, + "time_per_iteration": 2.7891948223114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190172, + "balance_loss_mlp": 1.13834012, + "epoch": 0.23201231242785686, + "flos": 666399757824.0, + "grad_norm": 0.08603625620414594, + "language_loss": 0.8068459, + "learning_rate": 0.0008967556796687854, + "loss": 0.81874764, + "num_input_tokens_seen": 100205296, + "router_z_loss_mlp": 0.51879883, + "step": 1206, + "time_per_iteration": 2.879742383956909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182453, + "balance_loss_mlp": 1.1313839, + "epoch": 0.23220469411312042, + "flos": 748816565760.0, + "grad_norm": 0.06613018456643845, + "language_loss": 0.8416872, + "learning_rate": 0.0008965660118966752, + "loss": 0.85351169, + "num_input_tokens_seen": 100279440, + "router_z_loss_mlp": 0.51098633, + "step": 1207, + "time_per_iteration": 2.8900513648986816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163905, + "balance_loss_mlp": 1.11610246, + "epoch": 0.232397075798384, + "flos": 667061982720.0, + "grad_norm": 0.06058209183838784, + "language_loss": 0.90754479, + "learning_rate": 0.0008963761901639851, + "loss": 0.91918385, + "num_input_tokens_seen": 100354512, + "router_z_loss_mlp": 0.47802734, + "step": 1208, + "time_per_iteration": 2.805534601211548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176094, + "balance_loss_mlp": 1.12457156, + "epoch": 0.23258945748364757, + "flos": 610218261504.0, + "grad_norm": 0.06993420403149982, + "language_loss": 0.83909518, + "learning_rate": 0.0008961862145444103, + "loss": 0.85085618, + "num_input_tokens_seen": 100426848, + "router_z_loss_mlp": 0.51538086, + "step": 1209, + "time_per_iteration": 2.6882550716400146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197419, + "balance_loss_mlp": 1.14587319, + "epoch": 0.23278183916891113, + "flos": 489651982848.0, + "grad_norm": 0.08646594069324176, + "language_loss": 0.85994279, + "learning_rate": 0.0008959960851117059, + "loss": 0.87191701, + "num_input_tokens_seen": 100496176, + "router_z_loss_mlp": 0.51611328, + "step": 1210, + "time_per_iteration": 2.6176648139953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118703, + "balance_loss_mlp": 1.13340998, + "epoch": 0.23297422085417469, + "flos": 511585403904.0, + "grad_norm": 0.06670419812311852, + "language_loss": 0.84013158, + "learning_rate": 0.0008958058019396868, + "loss": 0.85200191, + "num_input_tokens_seen": 100575072, + "router_z_loss_mlp": 0.53637695, + "step": 1211, + "time_per_iteration": 2.7867624759674072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177443, + "balance_loss_mlp": 1.12754154, + "epoch": 0.23316660253943824, + "flos": 546421552128.0, + "grad_norm": 0.08722593193124767, + "language_loss": 0.87226063, + "learning_rate": 0.0008956153651022274, + "loss": 0.88403505, + "num_input_tokens_seen": 100648304, + "router_z_loss_mlp": 0.49926758, + "step": 1212, + "time_per_iteration": 2.671705961227417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169147, + "balance_loss_mlp": 1.11726665, + "epoch": 0.2333589842247018, + "flos": 510256184832.0, + "grad_norm": 0.06082314874639417, + "language_loss": 0.84296238, + "learning_rate": 0.0008954247746732618, + "loss": 0.85465384, + "num_input_tokens_seen": 100717616, + "router_z_loss_mlp": 0.51904297, + "step": 1213, + "time_per_iteration": 2.58005952835083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163838, + "balance_loss_mlp": 1.1156534, + "epoch": 0.23355136590996536, + "flos": 663148104192.0, + "grad_norm": 0.06006865966510304, + "language_loss": 0.91204965, + "learning_rate": 0.0008952340307267837, + "loss": 0.92368799, + "num_input_tokens_seen": 100797056, + "router_z_loss_mlp": 0.48144531, + "step": 1214, + "time_per_iteration": 2.842824697494507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149903, + "balance_loss_mlp": 1.09983516, + "epoch": 0.23374374759522892, + "flos": 508457461248.0, + "grad_norm": 0.07140080071894721, + "language_loss": 0.84202802, + "learning_rate": 0.0008950431333368468, + "loss": 0.85352707, + "num_input_tokens_seen": 100863632, + "router_z_loss_mlp": 0.50097656, + "step": 1215, + "time_per_iteration": 2.5616672039031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155221, + "balance_loss_mlp": 1.10656011, + "epoch": 0.2339361292804925, + "flos": 1294455499776.0, + "grad_norm": 0.083723319453273, + "language_loss": 0.85366404, + "learning_rate": 0.0008948520825775634, + "loss": 0.86521626, + "num_input_tokens_seen": 100950272, + "router_z_loss_mlp": 0.48657227, + "step": 1216, + "time_per_iteration": 3.652561664581299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114764, + "balance_loss_mlp": 1.09895492, + "epoch": 0.23412851096575607, + "flos": 705928006656.0, + "grad_norm": 0.05781662545039131, + "language_loss": 0.84181142, + "learning_rate": 0.0008946608785231067, + "loss": 0.85328782, + "num_input_tokens_seen": 101031008, + "router_z_loss_mlp": 0.48706055, + "step": 1217, + "time_per_iteration": 2.861449956893921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131497, + "balance_loss_mlp": 1.08352745, + "epoch": 0.23432089265101963, + "flos": 438263903232.0, + "grad_norm": 0.06428977242182035, + "language_loss": 0.85432529, + "learning_rate": 0.0008944695212477084, + "loss": 0.86564028, + "num_input_tokens_seen": 101094688, + "router_z_loss_mlp": 0.47973633, + "step": 1218, + "time_per_iteration": 2.540524959564209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148618, + "balance_loss_mlp": 1.09907508, + "epoch": 0.2345132743362832, + "flos": 480939918336.0, + "grad_norm": 0.060914019840806265, + "language_loss": 0.86493349, + "learning_rate": 0.0008942780108256599, + "loss": 0.87641972, + "num_input_tokens_seen": 101163744, + "router_z_loss_mlp": 0.49560547, + "step": 1219, + "time_per_iteration": 2.613769769668579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142119, + "balance_loss_mlp": 1.09100199, + "epoch": 0.23470565602154675, + "flos": 411453001728.0, + "grad_norm": 0.05108155821019921, + "language_loss": 0.87340164, + "learning_rate": 0.0008940863473313121, + "loss": 0.88482285, + "num_input_tokens_seen": 101226480, + "router_z_loss_mlp": 0.51123047, + "step": 1220, + "time_per_iteration": 2.4549899101257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145999, + "balance_loss_mlp": 1.09702742, + "epoch": 0.2348980377068103, + "flos": 545450609664.0, + "grad_norm": 0.07702998226564757, + "language_loss": 0.8851074, + "learning_rate": 0.0008938945308390756, + "loss": 0.8965674, + "num_input_tokens_seen": 101291824, + "router_z_loss_mlp": 0.48974609, + "step": 1221, + "time_per_iteration": 2.6133854389190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149321, + "balance_loss_mlp": 1.10211444, + "epoch": 0.23509041939207387, + "flos": 575740389888.0, + "grad_norm": 0.057479910137590906, + "language_loss": 0.88199294, + "learning_rate": 0.00089370256142342, + "loss": 0.89348614, + "num_input_tokens_seen": 101367216, + "router_z_loss_mlp": 0.47192383, + "step": 1222, + "time_per_iteration": 2.713489532470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141929, + "balance_loss_mlp": 1.09286284, + "epoch": 0.23528280107733743, + "flos": 588843177984.0, + "grad_norm": 0.05442066188859713, + "language_loss": 0.85879123, + "learning_rate": 0.0008935104391588746, + "loss": 0.87021047, + "num_input_tokens_seen": 101438992, + "router_z_loss_mlp": 0.49121094, + "step": 1223, + "time_per_iteration": 2.7304563522338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145924, + "balance_loss_mlp": 1.09447336, + "epoch": 0.235475182762601, + "flos": 823328893440.0, + "grad_norm": 0.05049406517739995, + "language_loss": 0.8341555, + "learning_rate": 0.0008933181641200276, + "loss": 0.84561473, + "num_input_tokens_seen": 101534464, + "router_z_loss_mlp": 0.51513672, + "step": 1224, + "time_per_iteration": 3.122603416442871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139619, + "balance_loss_mlp": 1.09279394, + "epoch": 0.23566756444786457, + "flos": 680164770816.0, + "grad_norm": 0.0678885239417847, + "language_loss": 0.8627063, + "learning_rate": 0.0008931257363815271, + "loss": 0.87410253, + "num_input_tokens_seen": 101616496, + "router_z_loss_mlp": 0.46826172, + "step": 1225, + "time_per_iteration": 2.86014986038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142208, + "balance_loss_mlp": 1.09490585, + "epoch": 0.23585994613312813, + "flos": 701811495936.0, + "grad_norm": 0.0639396043769501, + "language_loss": 0.90318632, + "learning_rate": 0.0008929331560180798, + "loss": 0.91460842, + "num_input_tokens_seen": 101694496, + "router_z_loss_mlp": 0.47338867, + "step": 1226, + "time_per_iteration": 2.9069020748138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158077, + "balance_loss_mlp": 1.10924876, + "epoch": 0.2360523278183917, + "flos": 524176842240.0, + "grad_norm": 0.05735405278544162, + "language_loss": 0.9124881, + "learning_rate": 0.0008927404231044525, + "loss": 0.92406881, + "num_input_tokens_seen": 101766160, + "router_z_loss_mlp": 0.48828125, + "step": 1227, + "time_per_iteration": 2.745591163635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154284, + "balance_loss_mlp": 1.10571766, + "epoch": 0.23624470950365525, + "flos": 524310091776.0, + "grad_norm": 0.062458312515348655, + "language_loss": 0.8233285, + "learning_rate": 0.0008925475377154703, + "loss": 0.83487129, + "num_input_tokens_seen": 101844160, + "router_z_loss_mlp": 0.48583984, + "step": 1228, + "time_per_iteration": 2.7165796756744385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147881, + "balance_loss_mlp": 1.09664452, + "epoch": 0.2364370911889188, + "flos": 596811525120.0, + "grad_norm": 0.06307879716822463, + "language_loss": 0.82915187, + "learning_rate": 0.0008923544999260183, + "loss": 0.84063065, + "num_input_tokens_seen": 101917968, + "router_z_loss_mlp": 0.51293945, + "step": 1229, + "time_per_iteration": 2.787444829940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156102, + "balance_loss_mlp": 1.10567617, + "epoch": 0.23662947287418237, + "flos": 756849153024.0, + "grad_norm": 0.06236445133400911, + "language_loss": 0.92471206, + "learning_rate": 0.00089216130981104, + "loss": 0.9362731, + "num_input_tokens_seen": 101996880, + "router_z_loss_mlp": 0.50439453, + "step": 1230, + "time_per_iteration": 3.0671463012695312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148349, + "balance_loss_mlp": 1.09816241, + "epoch": 0.23682185455944593, + "flos": 546167162880.0, + "grad_norm": 0.06420697058211047, + "language_loss": 0.82893002, + "learning_rate": 0.000891967967445539, + "loss": 0.84041357, + "num_input_tokens_seen": 102067936, + "router_z_loss_mlp": 0.50195312, + "step": 1231, + "time_per_iteration": 2.692819356918335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147263, + "balance_loss_mlp": 1.09733796, + "epoch": 0.2370142362447095, + "flos": 662285818368.0, + "grad_norm": 0.044472050821074895, + "language_loss": 0.89257467, + "learning_rate": 0.0008917744729045772, + "loss": 0.90404725, + "num_input_tokens_seen": 102147552, + "router_z_loss_mlp": 0.49975586, + "step": 1232, + "time_per_iteration": 2.911123037338257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151473, + "balance_loss_mlp": 1.10190618, + "epoch": 0.23720661792997308, + "flos": 683670813696.0, + "grad_norm": 0.055115174481180494, + "language_loss": 0.84317499, + "learning_rate": 0.0008915808262632757, + "loss": 0.85468972, + "num_input_tokens_seen": 102224480, + "router_z_loss_mlp": 0.49633789, + "step": 1233, + "time_per_iteration": 2.8429055213928223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164245, + "balance_loss_mlp": 1.1117928, + "epoch": 0.23739899961523664, + "flos": 558909103104.0, + "grad_norm": 0.07089823280283834, + "language_loss": 0.93916011, + "learning_rate": 0.0008913870275968148, + "loss": 0.95080256, + "num_input_tokens_seen": 102297392, + "router_z_loss_mlp": 0.52392578, + "step": 1234, + "time_per_iteration": 2.7355082035064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152305, + "balance_loss_mlp": 1.10321498, + "epoch": 0.2375913813005002, + "flos": 889546904064.0, + "grad_norm": 0.06512180670183462, + "language_loss": 0.87916219, + "learning_rate": 0.0008911930769804342, + "loss": 0.8906852, + "num_input_tokens_seen": 102386032, + "router_z_loss_mlp": 0.49145508, + "step": 1235, + "time_per_iteration": 3.320653200149536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115702, + "balance_loss_mlp": 1.10549772, + "epoch": 0.23778376298576376, + "flos": 641120707584.0, + "grad_norm": 0.04926889071384256, + "language_loss": 0.91928077, + "learning_rate": 0.0008909989744894318, + "loss": 0.93085092, + "num_input_tokens_seen": 102463504, + "router_z_loss_mlp": 0.51513672, + "step": 1236, + "time_per_iteration": 2.860095500946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114863, + "balance_loss_mlp": 1.09808517, + "epoch": 0.23797614467102732, + "flos": 616820313600.0, + "grad_norm": 0.06373579401102465, + "language_loss": 0.81724823, + "learning_rate": 0.0008908047201991649, + "loss": 0.82873452, + "num_input_tokens_seen": 102529632, + "router_z_loss_mlp": 0.50512695, + "step": 1237, + "time_per_iteration": 2.7173092365264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146815, + "balance_loss_mlp": 1.10065758, + "epoch": 0.23816852635629088, + "flos": 624245004288.0, + "grad_norm": 0.06973577397583665, + "language_loss": 0.86895192, + "learning_rate": 0.0008906103141850502, + "loss": 0.88042009, + "num_input_tokens_seen": 102610192, + "router_z_loss_mlp": 0.46142578, + "step": 1238, + "time_per_iteration": 2.9070518016815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149112, + "balance_loss_mlp": 1.10068893, + "epoch": 0.23836090804155444, + "flos": 521431769088.0, + "grad_norm": 0.07438040904238923, + "language_loss": 0.88608682, + "learning_rate": 0.0008904157565225621, + "loss": 0.897578, + "num_input_tokens_seen": 102681216, + "router_z_loss_mlp": 0.48461914, + "step": 1239, + "time_per_iteration": 2.598175287246704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114606, + "balance_loss_mlp": 1.09758997, + "epoch": 0.238553289726818, + "flos": 1153991660544.0, + "grad_norm": 0.07265689268382322, + "language_loss": 0.82424903, + "learning_rate": 0.000890221047287235, + "loss": 0.83570957, + "num_input_tokens_seen": 102777184, + "router_z_loss_mlp": 0.48486328, + "step": 1240, + "time_per_iteration": 3.5255463123321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149116, + "balance_loss_mlp": 1.10207629, + "epoch": 0.23874567141208156, + "flos": 499861393920.0, + "grad_norm": 0.07692592831537566, + "language_loss": 0.91524613, + "learning_rate": 0.0008900261865546615, + "loss": 0.92673725, + "num_input_tokens_seen": 102845744, + "router_z_loss_mlp": 0.47021484, + "step": 1241, + "time_per_iteration": 2.626298189163208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150585, + "balance_loss_mlp": 1.10101807, + "epoch": 0.23893805309734514, + "flos": 556934911488.0, + "grad_norm": 0.06193436068824588, + "language_loss": 0.85487348, + "learning_rate": 0.0008898311744004936, + "loss": 0.86637932, + "num_input_tokens_seen": 102918064, + "router_z_loss_mlp": 0.49584961, + "step": 1242, + "time_per_iteration": 2.6845884323120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143159, + "balance_loss_mlp": 1.09638107, + "epoch": 0.2391304347826087, + "flos": 549270512640.0, + "grad_norm": 0.06489370510499948, + "language_loss": 0.87195957, + "learning_rate": 0.0008896360109004414, + "loss": 0.88339114, + "num_input_tokens_seen": 102983920, + "router_z_loss_mlp": 0.46801758, + "step": 1243, + "time_per_iteration": 2.6279244422912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149339, + "balance_loss_mlp": 1.10239482, + "epoch": 0.23932281646787226, + "flos": 516050279424.0, + "grad_norm": 0.05690023470638135, + "language_loss": 0.84913921, + "learning_rate": 0.0008894406961302742, + "loss": 0.8606326, + "num_input_tokens_seen": 103053328, + "router_z_loss_mlp": 0.46948242, + "step": 1244, + "time_per_iteration": 2.5823607444763184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161498, + "balance_loss_mlp": 1.11591244, + "epoch": 0.23951519815313582, + "flos": 743682124800.0, + "grad_norm": 0.06599652790645752, + "language_loss": 0.84225279, + "learning_rate": 0.0008892452301658201, + "loss": 0.85386777, + "num_input_tokens_seen": 103128208, + "router_z_loss_mlp": 0.45581055, + "step": 1245, + "time_per_iteration": 3.0007240772247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153792, + "balance_loss_mlp": 1.1045351, + "epoch": 0.23970757983839938, + "flos": 554118257664.0, + "grad_norm": 0.05569216777143309, + "language_loss": 0.83851659, + "learning_rate": 0.0008890496130829653, + "loss": 0.8500545, + "num_input_tokens_seen": 103197392, + "router_z_loss_mlp": 0.49316406, + "step": 1246, + "time_per_iteration": 2.656524658203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149976, + "balance_loss_mlp": 1.10424757, + "epoch": 0.23989996152366294, + "flos": 480655793664.0, + "grad_norm": 0.0643203237989141, + "language_loss": 0.85808307, + "learning_rate": 0.0008888538449576555, + "loss": 0.86958289, + "num_input_tokens_seen": 103265328, + "router_z_loss_mlp": 0.45751953, + "step": 1247, + "time_per_iteration": 2.5420141220092773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148571, + "balance_loss_mlp": 1.09993315, + "epoch": 0.2400923432089265, + "flos": 485310818304.0, + "grad_norm": 0.07931889136759729, + "language_loss": 0.83083689, + "learning_rate": 0.0008886579258658944, + "loss": 0.84232259, + "num_input_tokens_seen": 103331632, + "router_z_loss_mlp": 0.48632812, + "step": 1248, + "time_per_iteration": 2.574025869369507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136833, + "balance_loss_mlp": 1.08786154, + "epoch": 0.24028472489419006, + "flos": 623555615232.0, + "grad_norm": 0.057547694087262784, + "language_loss": 0.85210383, + "learning_rate": 0.0008884618558837446, + "loss": 0.8634721, + "num_input_tokens_seen": 103405408, + "router_z_loss_mlp": 0.48974609, + "step": 1249, + "time_per_iteration": 2.808790922164917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146334, + "balance_loss_mlp": 1.09407234, + "epoch": 0.24047710657945365, + "flos": 601602370560.0, + "grad_norm": 0.05843363394571656, + "language_loss": 0.87170362, + "learning_rate": 0.0008882656350873273, + "loss": 0.88316691, + "num_input_tokens_seen": 103487216, + "router_z_loss_mlp": 0.52319336, + "step": 1250, + "time_per_iteration": 2.839341163635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139888, + "balance_loss_mlp": 1.08998704, + "epoch": 0.2406694882647172, + "flos": 841558781952.0, + "grad_norm": 0.06920486589868534, + "language_loss": 0.87495792, + "learning_rate": 0.0008880692635528219, + "loss": 0.88635677, + "num_input_tokens_seen": 103568640, + "router_z_loss_mlp": 0.49975586, + "step": 1251, + "time_per_iteration": 3.0422415733337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141134, + "balance_loss_mlp": 1.09404635, + "epoch": 0.24086186994998077, + "flos": 527057736192.0, + "grad_norm": 0.09445201185980338, + "language_loss": 0.89987123, + "learning_rate": 0.0008878727413564669, + "loss": 0.91128266, + "num_input_tokens_seen": 103640784, + "router_z_loss_mlp": 0.47094727, + "step": 1252, + "time_per_iteration": 2.7974343299865723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110917, + "balance_loss_mlp": 1.09066832, + "epoch": 0.24105425163524433, + "flos": 1338261378048.0, + "grad_norm": 0.0270998190046769, + "language_loss": 0.80135596, + "learning_rate": 0.0008876760685745588, + "loss": 0.81244767, + "num_input_tokens_seen": 103865824, + "router_z_loss_mlp": 0.18457031, + "step": 1253, + "time_per_iteration": 4.892668724060059 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150056, + "balance_loss_mlp": 1.09707963, + "epoch": 0.24124663332050789, + "flos": 614102404608.0, + "grad_norm": 0.06472275672686992, + "language_loss": 0.79044139, + "learning_rate": 0.0008874792452834528, + "loss": 0.80194199, + "num_input_tokens_seen": 103939872, + "router_z_loss_mlp": 0.53076172, + "step": 1254, + "time_per_iteration": 2.759533643722534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144784, + "balance_loss_mlp": 1.09397733, + "epoch": 0.24143901500577145, + "flos": 575540328960.0, + "grad_norm": 0.08671647217417044, + "language_loss": 0.87847424, + "learning_rate": 0.0008872822715595626, + "loss": 0.88992208, + "num_input_tokens_seen": 104011120, + "router_z_loss_mlp": 0.50878906, + "step": 1255, + "time_per_iteration": 2.6758921146392822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136115, + "balance_loss_mlp": 1.08731091, + "epoch": 0.241631396691035, + "flos": 495181776384.0, + "grad_norm": 0.07818195128513271, + "language_loss": 0.87750483, + "learning_rate": 0.0008870851474793598, + "loss": 0.88886595, + "num_input_tokens_seen": 104077040, + "router_z_loss_mlp": 0.48803711, + "step": 1256, + "time_per_iteration": 2.5903451442718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140246, + "balance_loss_mlp": 1.09196591, + "epoch": 0.24182377837629856, + "flos": 636191470080.0, + "grad_norm": 0.06462138017812241, + "language_loss": 0.90108514, + "learning_rate": 0.0008868878731193752, + "loss": 0.91248751, + "num_input_tokens_seen": 104150880, + "router_z_loss_mlp": 0.48291016, + "step": 1257, + "time_per_iteration": 2.9156484603881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131411, + "balance_loss_mlp": 1.08611095, + "epoch": 0.24201616006156215, + "flos": 515219927040.0, + "grad_norm": 0.06839520252820154, + "language_loss": 0.89823216, + "learning_rate": 0.0008866904485561973, + "loss": 0.90954626, + "num_input_tokens_seen": 104223696, + "router_z_loss_mlp": 0.45361328, + "step": 1258, + "time_per_iteration": 2.709073066711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128152, + "balance_loss_mlp": 1.07698727, + "epoch": 0.2422085417468257, + "flos": 615144927744.0, + "grad_norm": 0.061516465429869265, + "language_loss": 0.83619797, + "learning_rate": 0.000886492873866473, + "loss": 0.84747952, + "num_input_tokens_seen": 104301728, + "router_z_loss_mlp": 0.51245117, + "step": 1259, + "time_per_iteration": 2.8063783645629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122174, + "balance_loss_mlp": 1.07315516, + "epoch": 0.24240092343208927, + "flos": 585794156544.0, + "grad_norm": 0.07532562043269028, + "language_loss": 0.85057306, + "learning_rate": 0.000886295149126908, + "loss": 0.86179483, + "num_input_tokens_seen": 104374480, + "router_z_loss_mlp": 0.49023438, + "step": 1260, + "time_per_iteration": 2.7702596187591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120309, + "balance_loss_mlp": 1.07291138, + "epoch": 0.24259330511735283, + "flos": 762257806848.0, + "grad_norm": 0.06506459806255929, + "language_loss": 0.86249155, + "learning_rate": 0.0008860972744142655, + "loss": 0.87369466, + "num_input_tokens_seen": 104452384, + "router_z_loss_mlp": 0.47363281, + "step": 1261, + "time_per_iteration": 2.9010353088378906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111356, + "balance_loss_mlp": 1.06575668, + "epoch": 0.2427856868026164, + "flos": 626878849536.0, + "grad_norm": 0.05333874014607912, + "language_loss": 0.82215619, + "learning_rate": 0.0008858992498053671, + "loss": 0.83329183, + "num_input_tokens_seen": 104532576, + "router_z_loss_mlp": 0.47729492, + "step": 1262, + "time_per_iteration": 2.8307647705078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105759, + "balance_loss_mlp": 1.08506405, + "epoch": 0.24297806848787995, + "flos": 1511653985280.0, + "grad_norm": 0.04388178085496151, + "language_loss": 0.7658875, + "learning_rate": 0.0008857010753770934, + "loss": 0.77694511, + "num_input_tokens_seen": 104765216, + "router_z_loss_mlp": 0.20703125, + "step": 1263, + "time_per_iteration": 4.839150428771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113047, + "balance_loss_mlp": 1.06517243, + "epoch": 0.2431704501731435, + "flos": 541949336064.0, + "grad_norm": 0.07576677138650743, + "language_loss": 0.83877796, + "learning_rate": 0.0008855027512063817, + "loss": 0.84990847, + "num_input_tokens_seen": 104836912, + "router_z_loss_mlp": 0.47924805, + "step": 1264, + "time_per_iteration": 2.6955387592315674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116702, + "balance_loss_mlp": 1.06847, + "epoch": 0.24336283185840707, + "flos": 523845729792.0, + "grad_norm": 0.08737911579836782, + "language_loss": 0.86160326, + "learning_rate": 0.0008853042773702292, + "loss": 0.87277025, + "num_input_tokens_seen": 104909280, + "router_z_loss_mlp": 0.48217773, + "step": 1265, + "time_per_iteration": 2.718477725982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123795, + "balance_loss_mlp": 1.07191551, + "epoch": 0.24355521354367063, + "flos": 537111502848.0, + "grad_norm": 0.05410456343654981, + "language_loss": 0.87916005, + "learning_rate": 0.0008851056539456896, + "loss": 0.89039803, + "num_input_tokens_seen": 104982560, + "router_z_loss_mlp": 0.51855469, + "step": 1266, + "time_per_iteration": 2.668398380279541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127352, + "balance_loss_mlp": 1.07792759, + "epoch": 0.24374759522893422, + "flos": 930461271552.0, + "grad_norm": 0.06341671281787149, + "language_loss": 0.82546353, + "learning_rate": 0.0008849068810098755, + "loss": 0.8367371, + "num_input_tokens_seen": 105075056, + "router_z_loss_mlp": 0.49414062, + "step": 1267, + "time_per_iteration": 3.348644971847534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132731, + "balance_loss_mlp": 1.08523834, + "epoch": 0.24393997691419778, + "flos": 427787619840.0, + "grad_norm": 0.08675992555990221, + "language_loss": 0.8333391, + "learning_rate": 0.0008847079586399575, + "loss": 0.84466636, + "num_input_tokens_seen": 105137536, + "router_z_loss_mlp": 0.47509766, + "step": 1268, + "time_per_iteration": 2.549433946609497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126198, + "balance_loss_mlp": 1.07994461, + "epoch": 0.24413235859946134, + "flos": 578853651456.0, + "grad_norm": 0.07249150513377325, + "language_loss": 0.8672694, + "learning_rate": 0.0008845088869131641, + "loss": 0.87853134, + "num_input_tokens_seen": 105204848, + "router_z_loss_mlp": 0.46289062, + "step": 1269, + "time_per_iteration": 2.6586451530456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149975, + "balance_loss_mlp": 1.10145724, + "epoch": 0.2443247402847249, + "flos": 529859708928.0, + "grad_norm": 0.06266770628228314, + "language_loss": 0.89411461, + "learning_rate": 0.0008843096659067818, + "loss": 0.90561438, + "num_input_tokens_seen": 105273456, + "router_z_loss_mlp": 0.48510742, + "step": 1270, + "time_per_iteration": 2.626946210861206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146652, + "balance_loss_mlp": 1.10228229, + "epoch": 0.24451712196998845, + "flos": 696321349632.0, + "grad_norm": 0.056965438466979365, + "language_loss": 0.86992264, + "learning_rate": 0.000884110295698155, + "loss": 0.88138914, + "num_input_tokens_seen": 105355488, + "router_z_loss_mlp": 0.44335938, + "step": 1271, + "time_per_iteration": 2.970078706741333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160922, + "balance_loss_mlp": 1.11080623, + "epoch": 0.24470950365525201, + "flos": 529832544768.0, + "grad_norm": 0.06894839907125858, + "language_loss": 0.86557794, + "learning_rate": 0.0008839107763646861, + "loss": 0.87718713, + "num_input_tokens_seen": 105421568, + "router_z_loss_mlp": 0.5012207, + "step": 1272, + "time_per_iteration": 2.592349052429199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183532, + "balance_loss_mlp": 1.13437057, + "epoch": 0.24490188534051557, + "flos": 491342049792.0, + "grad_norm": 0.06647703149266906, + "language_loss": 0.90856385, + "learning_rate": 0.0008837111079838353, + "loss": 0.92039919, + "num_input_tokens_seen": 105493072, + "router_z_loss_mlp": 0.49194336, + "step": 1273, + "time_per_iteration": 2.7098910808563232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118943, + "balance_loss_mlp": 1.14289117, + "epoch": 0.24509426702577913, + "flos": 474155057664.0, + "grad_norm": 0.05923779703064254, + "language_loss": 0.90316379, + "learning_rate": 0.000883511290633121, + "loss": 0.91505814, + "num_input_tokens_seen": 105559840, + "router_z_loss_mlp": 0.46533203, + "step": 1274, + "time_per_iteration": 2.5714197158813477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186005, + "balance_loss_mlp": 1.13739181, + "epoch": 0.24528664871104272, + "flos": 550592391168.0, + "grad_norm": 0.060927364177961095, + "language_loss": 0.92697686, + "learning_rate": 0.000883311324390119, + "loss": 0.93883693, + "num_input_tokens_seen": 105634448, + "router_z_loss_mlp": 0.48608398, + "step": 1275, + "time_per_iteration": 2.740896224975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189584, + "balance_loss_mlp": 1.13474798, + "epoch": 0.24547903039630628, + "flos": 825903641088.0, + "grad_norm": 0.07775603238406727, + "language_loss": 0.82056022, + "learning_rate": 0.0008831112093324629, + "loss": 0.83245611, + "num_input_tokens_seen": 105711936, + "router_z_loss_mlp": 0.5480957, + "step": 1276, + "time_per_iteration": 3.0821468830108643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190059, + "balance_loss_mlp": 1.13927567, + "epoch": 0.24567141208156984, + "flos": 591598162944.0, + "grad_norm": 0.05600773018776359, + "language_loss": 0.89543378, + "learning_rate": 0.0008829109455378444, + "loss": 0.90733445, + "num_input_tokens_seen": 105780240, + "router_z_loss_mlp": 0.50830078, + "step": 1277, + "time_per_iteration": 2.7299413681030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192458, + "balance_loss_mlp": 1.14241397, + "epoch": 0.2458637937668334, + "flos": 547874482176.0, + "grad_norm": 0.05156937738675093, + "language_loss": 0.87083036, + "learning_rate": 0.000882710533084013, + "loss": 0.88275498, + "num_input_tokens_seen": 105849840, + "router_z_loss_mlp": 0.5, + "step": 1278, + "time_per_iteration": 2.6295228004455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185847, + "balance_loss_mlp": 1.13568354, + "epoch": 0.24605617545209696, + "flos": 515894635008.0, + "grad_norm": 0.05927927368096647, + "language_loss": 0.90088928, + "learning_rate": 0.0008825099720487755, + "loss": 0.91274774, + "num_input_tokens_seen": 105921488, + "router_z_loss_mlp": 0.50195312, + "step": 1279, + "time_per_iteration": 2.630868434906006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149494, + "balance_loss_mlp": 1.13461673, + "epoch": 0.24624855713736052, + "flos": 1511772553728.0, + "grad_norm": 0.04555367127523109, + "language_loss": 0.7526114, + "learning_rate": 0.0008823092625099967, + "loss": 0.76410633, + "num_input_tokens_seen": 106146816, + "router_z_loss_mlp": 0.1484375, + "step": 1280, + "time_per_iteration": 4.843670129776001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118256, + "balance_loss_mlp": 1.10366488, + "epoch": 0.24644093882262408, + "flos": 1527608305152.0, + "grad_norm": 0.038204832859796624, + "language_loss": 0.77944112, + "learning_rate": 0.0008821084045455987, + "loss": 0.79062366, + "num_input_tokens_seen": 106361568, + "router_z_loss_mlp": 0.14550781, + "step": 1281, + "time_per_iteration": 4.784554481506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115452, + "balance_loss_mlp": 1.10547721, + "epoch": 0.24663332050788764, + "flos": 659118228480.0, + "grad_norm": 0.05852441511604794, + "language_loss": 0.89541078, + "learning_rate": 0.0008819073982335619, + "loss": 0.90695602, + "num_input_tokens_seen": 106435296, + "router_z_loss_mlp": 0.49047852, + "step": 1282, + "time_per_iteration": 2.8370161056518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141297, + "balance_loss_mlp": 1.09726083, + "epoch": 0.24682570219315123, + "flos": 541769098752.0, + "grad_norm": 0.07515840278086762, + "language_loss": 0.84908974, + "learning_rate": 0.0008817062436519235, + "loss": 0.86050272, + "num_input_tokens_seen": 106507184, + "router_z_loss_mlp": 0.44042969, + "step": 1283, + "time_per_iteration": 2.6532042026519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114078, + "balance_loss_mlp": 1.09164214, + "epoch": 0.24701808387841478, + "flos": 440695116288.0, + "grad_norm": 0.051214690731677004, + "language_loss": 0.9022612, + "learning_rate": 0.0008815049408787788, + "loss": 0.91366905, + "num_input_tokens_seen": 106571472, + "router_z_loss_mlp": 0.49072266, + "step": 1284, + "time_per_iteration": 2.577040195465088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145806, + "balance_loss_mlp": 1.09857535, + "epoch": 0.24721046556367834, + "flos": 468066926592.0, + "grad_norm": 0.06399849872592922, + "language_loss": 0.86388409, + "learning_rate": 0.0008813034899922805, + "loss": 0.87534213, + "num_input_tokens_seen": 106638368, + "router_z_loss_mlp": 0.47216797, + "step": 1285, + "time_per_iteration": 2.586411476135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153157, + "balance_loss_mlp": 1.10366094, + "epoch": 0.2474028472489419, + "flos": 504427585536.0, + "grad_norm": 0.05962621730359375, + "language_loss": 0.90523338, + "learning_rate": 0.0008811018910706387, + "loss": 0.91676497, + "num_input_tokens_seen": 106705312, + "router_z_loss_mlp": 0.49536133, + "step": 1286, + "time_per_iteration": 2.558340311050415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150564, + "balance_loss_mlp": 1.0996381, + "epoch": 0.24759522893420546, + "flos": 479956492800.0, + "grad_norm": 0.08171747444285254, + "language_loss": 0.82914776, + "learning_rate": 0.0008809001441921211, + "loss": 0.84065336, + "num_input_tokens_seen": 106778624, + "router_z_loss_mlp": 0.50976562, + "step": 1287, + "time_per_iteration": 2.7096829414367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134679, + "balance_loss_mlp": 1.08651865, + "epoch": 0.24778761061946902, + "flos": 533706776064.0, + "grad_norm": 0.061876473909820096, + "language_loss": 0.86037469, + "learning_rate": 0.0008806982494350528, + "loss": 0.87172151, + "num_input_tokens_seen": 106847744, + "router_z_loss_mlp": 0.48144531, + "step": 1288, + "time_per_iteration": 2.6826744079589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138715, + "balance_loss_mlp": 1.0885514, + "epoch": 0.24797999230473258, + "flos": 559798553088.0, + "grad_norm": 0.05818805427718153, + "language_loss": 0.90965348, + "learning_rate": 0.0008804962068778161, + "loss": 0.92104065, + "num_input_tokens_seen": 106927584, + "router_z_loss_mlp": 0.50195312, + "step": 1289, + "time_per_iteration": 2.9314775466918945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137271, + "balance_loss_mlp": 1.08872867, + "epoch": 0.24817237398999614, + "flos": 624225180672.0, + "grad_norm": 0.06661216201088474, + "language_loss": 0.81390089, + "learning_rate": 0.0008802940165988511, + "loss": 0.82527363, + "num_input_tokens_seen": 107006656, + "router_z_loss_mlp": 0.48510742, + "step": 1290, + "time_per_iteration": 2.8629136085510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113117, + "balance_loss_mlp": 1.08389127, + "epoch": 0.2483647556752597, + "flos": 612281286144.0, + "grad_norm": 0.06960392685137955, + "language_loss": 0.89268786, + "learning_rate": 0.000880091678676655, + "loss": 0.90399957, + "num_input_tokens_seen": 107084352, + "router_z_loss_mlp": 0.47265625, + "step": 1291, + "time_per_iteration": 2.8345038890838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136195, + "balance_loss_mlp": 1.08882165, + "epoch": 0.2485571373605233, + "flos": 583553092608.0, + "grad_norm": 0.058047960295431696, + "language_loss": 0.89150697, + "learning_rate": 0.0008798891931897821, + "loss": 0.90286887, + "num_input_tokens_seen": 107158368, + "router_z_loss_mlp": 0.47338867, + "step": 1292, + "time_per_iteration": 2.7299227714538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128008, + "balance_loss_mlp": 1.07949018, + "epoch": 0.24874951904578685, + "flos": 494749347840.0, + "grad_norm": 0.09954343743221296, + "language_loss": 0.84998739, + "learning_rate": 0.0008796865602168447, + "loss": 0.86126745, + "num_input_tokens_seen": 107224256, + "router_z_loss_mlp": 0.48535156, + "step": 1293, + "time_per_iteration": 2.5342278480529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127533, + "balance_loss_mlp": 1.08220935, + "epoch": 0.2489419007310504, + "flos": 456174789120.0, + "grad_norm": 0.05777797953149353, + "language_loss": 0.89527249, + "learning_rate": 0.0008794837798365115, + "loss": 0.90654784, + "num_input_tokens_seen": 107292720, + "router_z_loss_mlp": 0.45361328, + "step": 1294, + "time_per_iteration": 2.6889185905456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136807, + "balance_loss_mlp": 1.08886147, + "epoch": 0.24913428241631397, + "flos": 485471232000.0, + "grad_norm": 0.07754051928079464, + "language_loss": 0.89232659, + "learning_rate": 0.0008792808521275089, + "loss": 0.90369469, + "num_input_tokens_seen": 107368576, + "router_z_loss_mlp": 0.47924805, + "step": 1295, + "time_per_iteration": 2.7635927200317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136837, + "balance_loss_mlp": 1.09027398, + "epoch": 0.24932666410157753, + "flos": 518906580480.0, + "grad_norm": 0.09989296116771008, + "language_loss": 0.87984705, + "learning_rate": 0.0008790777771686206, + "loss": 0.89121538, + "num_input_tokens_seen": 107433856, + "router_z_loss_mlp": 0.46557617, + "step": 1296, + "time_per_iteration": 2.579235076904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124595, + "balance_loss_mlp": 1.07853234, + "epoch": 0.2495190457868411, + "flos": 472603382784.0, + "grad_norm": 0.08251132162328097, + "language_loss": 0.85680348, + "learning_rate": 0.0008788745550386872, + "loss": 0.86804938, + "num_input_tokens_seen": 107500944, + "router_z_loss_mlp": 0.46044922, + "step": 1297, + "time_per_iteration": 2.598031759262085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128617, + "balance_loss_mlp": 1.08152938, + "epoch": 0.24971142747210465, + "flos": 745886112768.0, + "grad_norm": 0.06717402893383145, + "language_loss": 0.80945367, + "learning_rate": 0.0008786711858166063, + "loss": 0.82073987, + "num_input_tokens_seen": 107580000, + "router_z_loss_mlp": 0.47070312, + "step": 1298, + "time_per_iteration": 2.9720141887664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133144, + "balance_loss_mlp": 1.08696246, + "epoch": 0.2499038091573682, + "flos": 749557711872.0, + "grad_norm": 0.058753985131359356, + "language_loss": 0.84356344, + "learning_rate": 0.0008784676695813332, + "loss": 0.85489488, + "num_input_tokens_seen": 107660384, + "router_z_loss_mlp": 0.46166992, + "step": 1299, + "time_per_iteration": 3.003113031387329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154452, + "balance_loss_mlp": 1.10700631, + "epoch": 0.2500961908426318, + "flos": 745060902912.0, + "grad_norm": 0.07081449776085671, + "language_loss": 0.85444576, + "learning_rate": 0.0008782640064118796, + "loss": 0.86599028, + "num_input_tokens_seen": 107736320, + "router_z_loss_mlp": 0.47436523, + "step": 1300, + "time_per_iteration": 2.8769848346710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166343, + "balance_loss_mlp": 1.14946294, + "epoch": 0.2502885725278953, + "flos": 1417424334336.0, + "grad_norm": 0.041859158942630086, + "language_loss": 0.7618475, + "learning_rate": 0.0008780601963873149, + "loss": 0.77351093, + "num_input_tokens_seen": 107972608, + "router_z_loss_mlp": 0.16894531, + "step": 1301, + "time_per_iteration": 4.951652526855469 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191692, + "balance_loss_mlp": 1.14701271, + "epoch": 0.2504809542131589, + "flos": 515215157760.0, + "grad_norm": 0.07273634964220443, + "language_loss": 0.8750245, + "learning_rate": 0.0008778562395867648, + "loss": 0.88694143, + "num_input_tokens_seen": 108043312, + "router_z_loss_mlp": 0.44677734, + "step": 1302, + "time_per_iteration": 2.604402542114258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181408, + "balance_loss_mlp": 1.13629961, + "epoch": 0.25067333589842244, + "flos": 525819921408.0, + "grad_norm": 0.07562070017846675, + "language_loss": 0.84288502, + "learning_rate": 0.0008776521360894127, + "loss": 0.85469913, + "num_input_tokens_seen": 108114144, + "router_z_loss_mlp": 0.45092773, + "step": 1303, + "time_per_iteration": 2.5878565311431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103671, + "balance_loss_mlp": 1.08784008, + "epoch": 0.25086571758368603, + "flos": 1473897295872.0, + "grad_norm": 0.0317480068151838, + "language_loss": 0.78962064, + "learning_rate": 0.0008774478859744984, + "loss": 0.80065739, + "num_input_tokens_seen": 108338720, + "router_z_loss_mlp": 0.15820312, + "step": 1304, + "time_per_iteration": 4.7717835903167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116688, + "balance_loss_mlp": 1.12220049, + "epoch": 0.2510580992689496, + "flos": 528382185984.0, + "grad_norm": 0.05690422496958516, + "language_loss": 0.90951985, + "learning_rate": 0.0008772434893213186, + "loss": 0.92118865, + "num_input_tokens_seen": 108405456, + "router_z_loss_mlp": 0.44702148, + "step": 1305, + "time_per_iteration": 2.604490280151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160948, + "balance_loss_mlp": 1.11405063, + "epoch": 0.25125048095421315, + "flos": 517446309888.0, + "grad_norm": 0.058263181320018995, + "language_loss": 0.85050523, + "learning_rate": 0.0008770389462092276, + "loss": 0.86211473, + "num_input_tokens_seen": 108474368, + "router_z_loss_mlp": 0.46875, + "step": 1306, + "time_per_iteration": 2.6470468044281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011567, + "balance_loss_mlp": 1.1099937, + "epoch": 0.25144286263947674, + "flos": 620462177280.0, + "grad_norm": 0.058464254330546805, + "language_loss": 0.87023067, + "learning_rate": 0.0008768342567176357, + "loss": 0.88179767, + "num_input_tokens_seen": 108548864, + "router_z_loss_mlp": 0.46704102, + "step": 1307, + "time_per_iteration": 2.8168630599975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155047, + "balance_loss_mlp": 1.10753012, + "epoch": 0.25163524432474027, + "flos": 503799865344.0, + "grad_norm": 0.05479935706331158, + "language_loss": 0.90999937, + "learning_rate": 0.0008766294209260107, + "loss": 0.9215498, + "num_input_tokens_seen": 108623072, + "router_z_loss_mlp": 0.4753418, + "step": 1308, + "time_per_iteration": 2.721531629562378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144469, + "balance_loss_mlp": 1.09704781, + "epoch": 0.25182762601000386, + "flos": 509072698368.0, + "grad_norm": 0.06755027454964987, + "language_loss": 0.91936618, + "learning_rate": 0.0008764244389138767, + "loss": 0.93081093, + "num_input_tokens_seen": 108690128, + "router_z_loss_mlp": 0.47436523, + "step": 1309, + "time_per_iteration": 2.574913263320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146846, + "balance_loss_mlp": 1.10061693, + "epoch": 0.2520200076952674, + "flos": 633896077824.0, + "grad_norm": 0.09614568206927013, + "language_loss": 0.82912982, + "learning_rate": 0.000876219310760815, + "loss": 0.84059829, + "num_input_tokens_seen": 108770272, + "router_z_loss_mlp": 0.46240234, + "step": 1310, + "time_per_iteration": 2.8861234188079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140262, + "balance_loss_mlp": 1.09419942, + "epoch": 0.252212389380531, + "flos": 494638119936.0, + "grad_norm": 0.07943381545238665, + "language_loss": 0.82026285, + "learning_rate": 0.0008760140365464631, + "loss": 0.83166546, + "num_input_tokens_seen": 108840592, + "router_z_loss_mlp": 0.46020508, + "step": 1311, + "time_per_iteration": 2.615981340408325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157686, + "balance_loss_mlp": 1.11212397, + "epoch": 0.2524047710657945, + "flos": 490544004096.0, + "grad_norm": 0.0923524312347507, + "language_loss": 0.8768574, + "learning_rate": 0.0008758086163505156, + "loss": 0.88843429, + "num_input_tokens_seen": 108910064, + "router_z_loss_mlp": 0.45532227, + "step": 1312, + "time_per_iteration": 2.6723434925079346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144164, + "balance_loss_mlp": 1.09872115, + "epoch": 0.2525971527510581, + "flos": 647431294464.0, + "grad_norm": 0.06443576206069311, + "language_loss": 0.90026277, + "learning_rate": 0.0008756030502527239, + "loss": 0.91170442, + "num_input_tokens_seen": 108986336, + "router_z_loss_mlp": 0.45458984, + "step": 1313, + "time_per_iteration": 2.841367721557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114771, + "balance_loss_mlp": 1.10291111, + "epoch": 0.2527895344363217, + "flos": 569266818048.0, + "grad_norm": 0.057466156915965357, + "language_loss": 0.90976274, + "learning_rate": 0.0008753973383328954, + "loss": 0.92123979, + "num_input_tokens_seen": 109059712, + "router_z_loss_mlp": 0.44824219, + "step": 1314, + "time_per_iteration": 2.7198092937469482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135642, + "balance_loss_mlp": 1.08912706, + "epoch": 0.2529819161215852, + "flos": 514048923648.0, + "grad_norm": 0.0651730634150067, + "language_loss": 0.84640622, + "learning_rate": 0.0008751914806708952, + "loss": 0.85776269, + "num_input_tokens_seen": 109127504, + "router_z_loss_mlp": 0.46508789, + "step": 1315, + "time_per_iteration": 2.619739532470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138249, + "balance_loss_mlp": 1.0955956, + "epoch": 0.2531742978068488, + "flos": 531253168128.0, + "grad_norm": 0.06535523514746128, + "language_loss": 0.82706141, + "learning_rate": 0.0008749854773466439, + "loss": 0.83844388, + "num_input_tokens_seen": 109198080, + "router_z_loss_mlp": 0.42700195, + "step": 1316, + "time_per_iteration": 2.6750850677490234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126734, + "balance_loss_mlp": 1.08594072, + "epoch": 0.25336667949211233, + "flos": 596638628352.0, + "grad_norm": 0.07438570972797282, + "language_loss": 0.85103095, + "learning_rate": 0.0008747793284401192, + "loss": 0.86229837, + "num_input_tokens_seen": 109268368, + "router_z_loss_mlp": 0.40771484, + "step": 1317, + "time_per_iteration": 2.667684316635132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127851, + "balance_loss_mlp": 1.08231306, + "epoch": 0.2535590611773759, + "flos": 602061963264.0, + "grad_norm": 0.06662830476911753, + "language_loss": 0.8637262, + "learning_rate": 0.0008745730340313551, + "loss": 0.87500465, + "num_input_tokens_seen": 109344112, + "router_z_loss_mlp": 0.45532227, + "step": 1318, + "time_per_iteration": 2.783167839050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126523, + "balance_loss_mlp": 1.08298802, + "epoch": 0.25375144286263945, + "flos": 495327508992.0, + "grad_norm": 0.06014849970215255, + "language_loss": 0.84828806, + "learning_rate": 0.0008743665942004422, + "loss": 0.85955328, + "num_input_tokens_seen": 109414112, + "router_z_loss_mlp": 0.43554688, + "step": 1319, + "time_per_iteration": 2.6454880237579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128022, + "balance_loss_mlp": 1.08334279, + "epoch": 0.25394382454790304, + "flos": 512470084608.0, + "grad_norm": 0.10116204644494126, + "language_loss": 0.93301231, + "learning_rate": 0.0008741600090275277, + "loss": 0.94429255, + "num_input_tokens_seen": 109484336, + "router_z_loss_mlp": 0.44702148, + "step": 1320, + "time_per_iteration": 2.565373182296753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112488, + "balance_loss_mlp": 1.07884121, + "epoch": 0.25413620623316663, + "flos": 959038589952.0, + "grad_norm": 0.06655436432492466, + "language_loss": 0.84446663, + "learning_rate": 0.0008739532785928151, + "loss": 0.85571539, + "num_input_tokens_seen": 109590128, + "router_z_loss_mlp": 0.45996094, + "step": 1321, + "time_per_iteration": 3.479727268218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080328, + "balance_loss_mlp": 1.06325758, + "epoch": 0.25432858791843016, + "flos": 1577283922944.0, + "grad_norm": 0.0281051137535917, + "language_loss": 0.74893582, + "learning_rate": 0.0008737464029765639, + "loss": 0.7597391, + "num_input_tokens_seen": 109816592, + "router_z_loss_mlp": 0.17089844, + "step": 1322, + "time_per_iteration": 4.7930076122283936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136178, + "balance_loss_mlp": 1.08921003, + "epoch": 0.25452096960369375, + "flos": 583802712576.0, + "grad_norm": 0.06285601142266005, + "language_loss": 0.83366752, + "learning_rate": 0.0008735393822590908, + "loss": 0.84502923, + "num_input_tokens_seen": 109890464, + "router_z_loss_mlp": 0.46923828, + "step": 1323, + "time_per_iteration": 2.672137498855591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145864, + "balance_loss_mlp": 1.10192394, + "epoch": 0.2547133512889573, + "flos": 508603193856.0, + "grad_norm": 0.05471127015298985, + "language_loss": 0.8775813, + "learning_rate": 0.0008733322165207681, + "loss": 0.88903993, + "num_input_tokens_seen": 109963408, + "router_z_loss_mlp": 0.43969727, + "step": 1324, + "time_per_iteration": 2.6422736644744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157775, + "balance_loss_mlp": 1.11292815, + "epoch": 0.25490573297422087, + "flos": 782619729408.0, + "grad_norm": 0.058409122955484685, + "language_loss": 0.83687508, + "learning_rate": 0.0008731249058420247, + "loss": 0.84845281, + "num_input_tokens_seen": 110048800, + "router_z_loss_mlp": 0.44824219, + "step": 1325, + "time_per_iteration": 3.02577805519104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165947, + "balance_loss_mlp": 1.11995602, + "epoch": 0.2550981146594844, + "flos": 509878084608.0, + "grad_norm": 0.0843662219595253, + "language_loss": 0.90814316, + "learning_rate": 0.0008729174503033459, + "loss": 0.91980267, + "num_input_tokens_seen": 110118096, + "router_z_loss_mlp": 0.45947266, + "step": 1326, + "time_per_iteration": 2.700956344604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160817, + "balance_loss_mlp": 1.11418188, + "epoch": 0.255290496344748, + "flos": 676673409024.0, + "grad_norm": 0.07395752020353057, + "language_loss": 0.83274329, + "learning_rate": 0.0008727098499852728, + "loss": 0.84435147, + "num_input_tokens_seen": 110190160, + "router_z_loss_mlp": 0.46630859, + "step": 1327, + "time_per_iteration": 2.8289363384246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138805, + "balance_loss_mlp": 1.0946734, + "epoch": 0.2554828780300115, + "flos": 537815572992.0, + "grad_norm": 0.05433597882612883, + "language_loss": 0.90389377, + "learning_rate": 0.0008725021049684034, + "loss": 0.91528177, + "num_input_tokens_seen": 110268000, + "router_z_loss_mlp": 0.44165039, + "step": 1328, + "time_per_iteration": 2.766871452331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134733, + "balance_loss_mlp": 1.09057808, + "epoch": 0.2556752597152751, + "flos": 824186409984.0, + "grad_norm": 0.04999939134312536, + "language_loss": 0.83732843, + "learning_rate": 0.000872294215333391, + "loss": 0.84867573, + "num_input_tokens_seen": 110354816, + "router_z_loss_mlp": 0.44165039, + "step": 1329, + "time_per_iteration": 3.181687116622925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133543, + "balance_loss_mlp": 1.08941174, + "epoch": 0.2558676414005387, + "flos": 570791328768.0, + "grad_norm": 0.053270875218317436, + "language_loss": 0.83338815, + "learning_rate": 0.0008720861811609457, + "loss": 0.84472358, + "num_input_tokens_seen": 110427968, + "router_z_loss_mlp": 0.44140625, + "step": 1330, + "time_per_iteration": 2.753095865249634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139869, + "balance_loss_mlp": 1.09282851, + "epoch": 0.2560600230858022, + "flos": 486684453888.0, + "grad_norm": 0.0744958299593676, + "language_loss": 0.83801699, + "learning_rate": 0.0008718780025318338, + "loss": 0.84941566, + "num_input_tokens_seen": 110501184, + "router_z_loss_mlp": 0.4699707, + "step": 1331, + "time_per_iteration": 2.74076771736145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141571, + "balance_loss_mlp": 1.09913218, + "epoch": 0.2562524047710658, + "flos": 513122397696.0, + "grad_norm": 0.06658506014654758, + "language_loss": 0.84681445, + "learning_rate": 0.0008716696795268771, + "loss": 0.85823017, + "num_input_tokens_seen": 110573008, + "router_z_loss_mlp": 0.42456055, + "step": 1332, + "time_per_iteration": 2.6771953105926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141914, + "balance_loss_mlp": 1.09718704, + "epoch": 0.25644478645632934, + "flos": 634820032512.0, + "grad_norm": 0.06458865940403113, + "language_loss": 0.86108088, + "learning_rate": 0.0008714612122269538, + "loss": 0.87250006, + "num_input_tokens_seen": 110646704, + "router_z_loss_mlp": 0.44750977, + "step": 1333, + "time_per_iteration": 2.872405767440796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145867, + "balance_loss_mlp": 1.09944701, + "epoch": 0.25663716814159293, + "flos": 436591088640.0, + "grad_norm": 0.06078246423813374, + "language_loss": 0.89285004, + "learning_rate": 0.0008712526007129982, + "loss": 0.90430868, + "num_input_tokens_seen": 110712208, + "router_z_loss_mlp": 0.46411133, + "step": 1334, + "time_per_iteration": 2.575467586517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148778, + "balance_loss_mlp": 1.10517156, + "epoch": 0.25682954982685646, + "flos": 498161415168.0, + "grad_norm": 0.06822349657501799, + "language_loss": 0.91275418, + "learning_rate": 0.0008710438450660003, + "loss": 0.92424202, + "num_input_tokens_seen": 110783936, + "router_z_loss_mlp": 0.43603516, + "step": 1335, + "time_per_iteration": 2.6461987495422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149209, + "balance_loss_mlp": 1.10157323, + "epoch": 0.25702193151212005, + "flos": 457701871104.0, + "grad_norm": 0.08158488021096956, + "language_loss": 0.88278055, + "learning_rate": 0.0008708349453670064, + "loss": 0.89427269, + "num_input_tokens_seen": 110848560, + "router_z_loss_mlp": 0.47583008, + "step": 1336, + "time_per_iteration": 2.5001657009124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128588, + "balance_loss_mlp": 1.08297849, + "epoch": 0.2572143131973836, + "flos": 598281707520.0, + "grad_norm": 0.0603403973753485, + "language_loss": 0.91654134, + "learning_rate": 0.0008706259016971185, + "loss": 0.92782724, + "num_input_tokens_seen": 110922672, + "router_z_loss_mlp": 0.45629883, + "step": 1337, + "time_per_iteration": 2.817657947540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127771, + "balance_loss_mlp": 1.07865644, + "epoch": 0.25740669488264717, + "flos": 698308024320.0, + "grad_norm": 0.08421665296665147, + "language_loss": 0.83723027, + "learning_rate": 0.0008704167141374944, + "loss": 0.848508, + "num_input_tokens_seen": 110995456, + "router_z_loss_mlp": 0.49145508, + "step": 1338, + "time_per_iteration": 2.808487892150879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128146, + "balance_loss_mlp": 1.08003271, + "epoch": 0.25759907656791076, + "flos": 502379241984.0, + "grad_norm": 0.05813050369368248, + "language_loss": 0.88781357, + "learning_rate": 0.0008702073827693482, + "loss": 0.89909494, + "num_input_tokens_seen": 111069568, + "router_z_loss_mlp": 0.48144531, + "step": 1339, + "time_per_iteration": 2.687836170196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131918, + "balance_loss_mlp": 1.08711886, + "epoch": 0.2577914582531743, + "flos": 773880500736.0, + "grad_norm": 0.05714278292432699, + "language_loss": 0.89388514, + "learning_rate": 0.0008699979076739494, + "loss": 0.9052043, + "num_input_tokens_seen": 111142608, + "router_z_loss_mlp": 0.44799805, + "step": 1340, + "time_per_iteration": 2.9907524585723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157888, + "balance_loss_mlp": 1.11089551, + "epoch": 0.2579838399384379, + "flos": 459666150912.0, + "grad_norm": 0.06321899043923618, + "language_loss": 0.8949765, + "learning_rate": 0.0008697882889326234, + "loss": 0.90655541, + "num_input_tokens_seen": 111206336, + "router_z_loss_mlp": 0.4699707, + "step": 1341, + "time_per_iteration": 2.5261731147766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182653, + "balance_loss_mlp": 1.13513625, + "epoch": 0.2581762216237014, + "flos": 569185325568.0, + "grad_norm": 0.06545350512623192, + "language_loss": 0.87013066, + "learning_rate": 0.0008695785266267515, + "loss": 0.88195717, + "num_input_tokens_seen": 111276736, + "router_z_loss_mlp": 0.4753418, + "step": 1342, + "time_per_iteration": 2.719949722290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194656, + "balance_loss_mlp": 1.14585173, + "epoch": 0.258368603308965, + "flos": 604201711104.0, + "grad_norm": 0.07227104516109029, + "language_loss": 0.8379634, + "learning_rate": 0.0008693686208377704, + "loss": 0.84991002, + "num_input_tokens_seen": 111353856, + "router_z_loss_mlp": 0.48828125, + "step": 1343, + "time_per_iteration": 2.789046049118042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011909, + "balance_loss_mlp": 1.14572012, + "epoch": 0.2585609849942285, + "flos": 491460618240.0, + "grad_norm": 0.08291144049116697, + "language_loss": 0.89388204, + "learning_rate": 0.0008691585716471733, + "loss": 0.90579104, + "num_input_tokens_seen": 111424960, + "router_z_loss_mlp": 0.45214844, + "step": 1344, + "time_per_iteration": 2.63281512260437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182868, + "balance_loss_mlp": 1.1348505, + "epoch": 0.2587533666794921, + "flos": 640755090432.0, + "grad_norm": 0.05462335243620436, + "language_loss": 0.86349607, + "learning_rate": 0.0008689483791365079, + "loss": 0.87532479, + "num_input_tokens_seen": 111505248, + "router_z_loss_mlp": 0.48022461, + "step": 1345, + "time_per_iteration": 2.8293464183807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165648, + "balance_loss_mlp": 1.11879873, + "epoch": 0.2589457483647557, + "flos": 576849724416.0, + "grad_norm": 0.060641418043912716, + "language_loss": 0.89744675, + "learning_rate": 0.0008687380433873786, + "loss": 0.90910327, + "num_input_tokens_seen": 111581936, + "router_z_loss_mlp": 0.46875, + "step": 1346, + "time_per_iteration": 2.757361650466919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150314, + "balance_loss_mlp": 1.100389, + "epoch": 0.25913813005001923, + "flos": 535424007168.0, + "grad_norm": 0.0738804898683007, + "language_loss": 0.83070856, + "learning_rate": 0.0008685275644814448, + "loss": 0.84221172, + "num_input_tokens_seen": 111651456, + "router_z_loss_mlp": 0.49926758, + "step": 1347, + "time_per_iteration": 2.716006278991699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147842, + "balance_loss_mlp": 1.10087395, + "epoch": 0.2593305117352828, + "flos": 721039491072.0, + "grad_norm": 0.07544817120788133, + "language_loss": 0.85244781, + "learning_rate": 0.0008683169425004216, + "loss": 0.86392623, + "num_input_tokens_seen": 111731712, + "router_z_loss_mlp": 0.46972656, + "step": 1348, + "time_per_iteration": 2.900754451751709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114914, + "balance_loss_mlp": 1.09842825, + "epoch": 0.25952289342054635, + "flos": 710096274432.0, + "grad_norm": 0.08404854247051008, + "language_loss": 0.83688962, + "learning_rate": 0.0008681061775260799, + "loss": 0.84838104, + "num_input_tokens_seen": 111800752, + "router_z_loss_mlp": 0.50708008, + "step": 1349, + "time_per_iteration": 2.8356235027313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140271, + "balance_loss_mlp": 1.09356534, + "epoch": 0.25971527510580994, + "flos": 455920399872.0, + "grad_norm": 0.08196022848482862, + "language_loss": 0.92983842, + "learning_rate": 0.0008678952696402458, + "loss": 0.94124115, + "num_input_tokens_seen": 111866752, + "router_z_loss_mlp": 0.46704102, + "step": 1350, + "time_per_iteration": 2.5051889419555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132188, + "balance_loss_mlp": 1.0865308, + "epoch": 0.25990765679107347, + "flos": 612528334848.0, + "grad_norm": 0.052642437263987304, + "language_loss": 0.86759204, + "learning_rate": 0.000867684218924801, + "loss": 0.87891388, + "num_input_tokens_seen": 111951328, + "router_z_loss_mlp": 0.45629883, + "step": 1351, + "time_per_iteration": 2.8635144233703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089623, + "balance_loss_mlp": 1.0725522, + "epoch": 0.26010003847633706, + "flos": 1537963075584.0, + "grad_norm": 0.04013302579778462, + "language_loss": 0.78947091, + "learning_rate": 0.0008674730254616827, + "loss": 0.80036712, + "num_input_tokens_seen": 112182272, + "router_z_loss_mlp": 0.17089844, + "step": 1352, + "time_per_iteration": 4.89817476272583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121624, + "balance_loss_mlp": 1.07587171, + "epoch": 0.2602924201616006, + "flos": 716265897984.0, + "grad_norm": 0.055845692832442596, + "language_loss": 0.85694808, + "learning_rate": 0.0008672616893328834, + "loss": 0.8681643, + "num_input_tokens_seen": 112261760, + "router_z_loss_mlp": 0.45751953, + "step": 1353, + "time_per_iteration": 2.9335103034973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123767, + "balance_loss_mlp": 1.07877684, + "epoch": 0.2604848018468642, + "flos": 643529899008.0, + "grad_norm": 0.07010977425409264, + "language_loss": 0.9082427, + "learning_rate": 0.0008670502106204512, + "loss": 0.91948032, + "num_input_tokens_seen": 112339136, + "router_z_loss_mlp": 0.44970703, + "step": 1354, + "time_per_iteration": 2.8469178676605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138616, + "balance_loss_mlp": 1.08840501, + "epoch": 0.26067718353212777, + "flos": 517033704960.0, + "grad_norm": 0.056353527093492256, + "language_loss": 0.82360619, + "learning_rate": 0.0008668385894064892, + "loss": 0.83499235, + "num_input_tokens_seen": 112409872, + "router_z_loss_mlp": 0.50195312, + "step": 1355, + "time_per_iteration": 2.672883987426758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149756, + "balance_loss_mlp": 1.10321617, + "epoch": 0.2608695652173913, + "flos": 822733479936.0, + "grad_norm": 0.05383030346289838, + "language_loss": 0.89593899, + "learning_rate": 0.0008666268257731562, + "loss": 0.90743661, + "num_input_tokens_seen": 112495616, + "router_z_loss_mlp": 0.46557617, + "step": 1356, + "time_per_iteration": 3.1050939559936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169178, + "balance_loss_mlp": 1.12127948, + "epoch": 0.2610619469026549, + "flos": 1007850097152.0, + "grad_norm": 0.05849819020383372, + "language_loss": 0.85968256, + "learning_rate": 0.0008664149198026662, + "loss": 0.87137431, + "num_input_tokens_seen": 112575168, + "router_z_loss_mlp": 0.47900391, + "step": 1357, + "time_per_iteration": 3.226966619491577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156465, + "balance_loss_mlp": 1.10932934, + "epoch": 0.2612543285879184, + "flos": 536782961664.0, + "grad_norm": 0.07293583935871151, + "language_loss": 0.89518476, + "learning_rate": 0.0008662028715772883, + "loss": 0.90674949, + "num_input_tokens_seen": 112648480, + "router_z_loss_mlp": 0.47143555, + "step": 1358, + "time_per_iteration": 2.5949370861053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163078, + "balance_loss_mlp": 1.11718237, + "epoch": 0.261446710273182, + "flos": 519420501504.0, + "grad_norm": 0.05890556701012809, + "language_loss": 0.86217821, + "learning_rate": 0.0008659906811793467, + "loss": 0.87380904, + "num_input_tokens_seen": 112719856, + "router_z_loss_mlp": 0.45898438, + "step": 1359, + "time_per_iteration": 2.651193857192993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151481, + "balance_loss_mlp": 1.10699224, + "epoch": 0.26163909195844554, + "flos": 583259056128.0, + "grad_norm": 0.06298146111957026, + "language_loss": 0.90418088, + "learning_rate": 0.0008657783486912215, + "loss": 0.91569573, + "num_input_tokens_seen": 112795088, + "router_z_loss_mlp": 0.44482422, + "step": 1360, + "time_per_iteration": 2.723550319671631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156338, + "balance_loss_mlp": 1.11022782, + "epoch": 0.2618314736437091, + "flos": 958762179072.0, + "grad_norm": 0.055299708084911615, + "language_loss": 0.90110713, + "learning_rate": 0.0008655658741953472, + "loss": 0.91267049, + "num_input_tokens_seen": 112879888, + "router_z_loss_mlp": 0.4609375, + "step": 1361, + "time_per_iteration": 3.216830015182495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139946, + "balance_loss_mlp": 1.09564757, + "epoch": 0.26202385532897265, + "flos": 574803952128.0, + "grad_norm": 0.04868556149108388, + "language_loss": 0.89168048, + "learning_rate": 0.0008653532577742136, + "loss": 0.90307987, + "num_input_tokens_seen": 112952208, + "router_z_loss_mlp": 0.44311523, + "step": 1362, + "time_per_iteration": 2.718886375427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143457, + "balance_loss_mlp": 1.0986346, + "epoch": 0.26221623701423624, + "flos": 445471280640.0, + "grad_norm": 0.058057923999792295, + "language_loss": 0.87558335, + "learning_rate": 0.0008651404995103659, + "loss": 0.88701797, + "num_input_tokens_seen": 113017472, + "router_z_loss_mlp": 0.44824219, + "step": 1363, + "time_per_iteration": 2.594294309616089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137373, + "balance_loss_mlp": 1.09338474, + "epoch": 0.26240861869949983, + "flos": 535718043648.0, + "grad_norm": 0.06330728330165165, + "language_loss": 0.87334514, + "learning_rate": 0.0008649275994864041, + "loss": 0.88471884, + "num_input_tokens_seen": 113090000, + "router_z_loss_mlp": 0.43994141, + "step": 1364, + "time_per_iteration": 2.707449197769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144999, + "balance_loss_mlp": 1.09879303, + "epoch": 0.26260100038476336, + "flos": 565249052160.0, + "grad_norm": 0.05276541609050752, + "language_loss": 0.84391934, + "learning_rate": 0.0008647145577849834, + "loss": 0.85536933, + "num_input_tokens_seen": 113169424, + "router_z_loss_mlp": 0.46191406, + "step": 1365, + "time_per_iteration": 2.8216350078582764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131283, + "balance_loss_mlp": 1.08560157, + "epoch": 0.26279338207002695, + "flos": 613059508224.0, + "grad_norm": 0.05376997595185902, + "language_loss": 0.83317888, + "learning_rate": 0.0008645013744888139, + "loss": 0.84449172, + "num_input_tokens_seen": 113256752, + "router_z_loss_mlp": 0.45678711, + "step": 1366, + "time_per_iteration": 2.866891622543335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149616, + "balance_loss_mlp": 1.10536587, + "epoch": 0.2629857637552905, + "flos": 522832568832.0, + "grad_norm": 0.06316724717597957, + "language_loss": 0.87992281, + "learning_rate": 0.0008642880496806607, + "loss": 0.89141893, + "num_input_tokens_seen": 113330512, + "router_z_loss_mlp": 0.44287109, + "step": 1367, + "time_per_iteration": 2.7763173580169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142909, + "balance_loss_mlp": 1.09772861, + "epoch": 0.26317814544055407, + "flos": 534549238272.0, + "grad_norm": 0.05877759558608074, + "language_loss": 0.84959197, + "learning_rate": 0.0008640745834433437, + "loss": 0.86102104, + "num_input_tokens_seen": 113409088, + "router_z_loss_mlp": 0.4519043, + "step": 1368, + "time_per_iteration": 2.738328218460083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134336, + "balance_loss_mlp": 1.09018087, + "epoch": 0.2633705271258176, + "flos": 555543650304.0, + "grad_norm": 0.05935956886320276, + "language_loss": 0.87054664, + "learning_rate": 0.000863860975859738, + "loss": 0.88189, + "num_input_tokens_seen": 113486624, + "router_z_loss_mlp": 0.44165039, + "step": 1369, + "time_per_iteration": 2.9206831455230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131372, + "balance_loss_mlp": 1.0855242, + "epoch": 0.2635629088110812, + "flos": 552401026560.0, + "grad_norm": 0.06691392922801855, + "language_loss": 0.88684422, + "learning_rate": 0.0008636472270127733, + "loss": 0.89815795, + "num_input_tokens_seen": 113555776, + "router_z_loss_mlp": 0.45825195, + "step": 1370, + "time_per_iteration": 2.6078739166259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116186, + "balance_loss_mlp": 1.07021928, + "epoch": 0.2637552904963448, + "flos": 455984640000.0, + "grad_norm": 0.06515524250359679, + "language_loss": 0.90367895, + "learning_rate": 0.0008634333369854345, + "loss": 0.91484082, + "num_input_tokens_seen": 113624208, + "router_z_loss_mlp": 0.45947266, + "step": 1371, + "time_per_iteration": 2.6001384258270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110327, + "balance_loss_mlp": 1.0667206, + "epoch": 0.2639476721816083, + "flos": 613128890880.0, + "grad_norm": 0.056061894150206536, + "language_loss": 0.87892628, + "learning_rate": 0.0008632193058607608, + "loss": 0.89002955, + "num_input_tokens_seen": 113698544, + "router_z_loss_mlp": 0.43554688, + "step": 1372, + "time_per_iteration": 2.711435317993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113538, + "balance_loss_mlp": 1.06628299, + "epoch": 0.2641400538668719, + "flos": 571920486912.0, + "grad_norm": 0.060513983317086996, + "language_loss": 0.81023312, + "learning_rate": 0.0008630051337218466, + "loss": 0.82136846, + "num_input_tokens_seen": 113769024, + "router_z_loss_mlp": 0.47314453, + "step": 1373, + "time_per_iteration": 2.656416893005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110026, + "balance_loss_mlp": 1.0668484, + "epoch": 0.2643324355521354, + "flos": 582251037696.0, + "grad_norm": 0.0689512550651149, + "language_loss": 0.82808203, + "learning_rate": 0.0008627908206518409, + "loss": 0.83918226, + "num_input_tokens_seen": 113836320, + "router_z_loss_mlp": 0.43188477, + "step": 1374, + "time_per_iteration": 2.673738956451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039854, + "balance_loss_mlp": 1.02716982, + "epoch": 0.264524817237399, + "flos": 1544678926848.0, + "grad_norm": 0.01820003864645097, + "language_loss": 0.75151253, + "learning_rate": 0.0008625763667339472, + "loss": 0.76191109, + "num_input_tokens_seen": 114065040, + "router_z_loss_mlp": 0.12695312, + "step": 1375, + "time_per_iteration": 5.317140817642212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115308, + "balance_loss_mlp": 1.07272696, + "epoch": 0.26471719892266254, + "flos": 518034382848.0, + "grad_norm": 0.062338636090573274, + "language_loss": 0.91769958, + "learning_rate": 0.0008623617720514241, + "loss": 0.92885268, + "num_input_tokens_seen": 114133488, + "router_z_loss_mlp": 0.42578125, + "step": 1376, + "time_per_iteration": 2.666618585586548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117829, + "balance_loss_mlp": 1.07255304, + "epoch": 0.26490958060792613, + "flos": 517189349376.0, + "grad_norm": 0.08321054400070194, + "language_loss": 0.85169828, + "learning_rate": 0.0008621470366875848, + "loss": 0.86287659, + "num_input_tokens_seen": 114200704, + "router_z_loss_mlp": 0.45288086, + "step": 1377, + "time_per_iteration": 2.5939900875091553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011137, + "balance_loss_mlp": 1.0724293, + "epoch": 0.26510196229318966, + "flos": 596574388224.0, + "grad_norm": 0.0756812485553519, + "language_loss": 0.88528687, + "learning_rate": 0.0008619321607257966, + "loss": 0.89642382, + "num_input_tokens_seen": 114272160, + "router_z_loss_mlp": 0.41259766, + "step": 1378, + "time_per_iteration": 2.675719976425171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112322, + "balance_loss_mlp": 1.08109117, + "epoch": 0.26529434397845325, + "flos": 685800649728.0, + "grad_norm": 0.05967522341676015, + "language_loss": 0.8244732, + "learning_rate": 0.000861717144249482, + "loss": 0.8357054, + "num_input_tokens_seen": 114347904, + "router_z_loss_mlp": 0.42138672, + "step": 1379, + "time_per_iteration": 2.8289949893951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132354, + "balance_loss_mlp": 1.09170318, + "epoch": 0.26548672566371684, + "flos": 424353157632.0, + "grad_norm": 0.06486885922060631, + "language_loss": 0.90334523, + "learning_rate": 0.0008615019873421175, + "loss": 0.91466868, + "num_input_tokens_seen": 114409952, + "router_z_loss_mlp": 0.40649414, + "step": 1380, + "time_per_iteration": 2.4665510654449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141805, + "balance_loss_mlp": 1.09798408, + "epoch": 0.26567910734898037, + "flos": 489864526848.0, + "grad_norm": 0.06471812563896691, + "language_loss": 0.86262017, + "learning_rate": 0.0008612866900872349, + "loss": 0.87403822, + "num_input_tokens_seen": 114474832, + "router_z_loss_mlp": 0.43823242, + "step": 1381, + "time_per_iteration": 2.553489923477173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140972, + "balance_loss_mlp": 1.10017824, + "epoch": 0.26587148903424396, + "flos": 534203444736.0, + "grad_norm": 0.07006288293307902, + "language_loss": 0.88817614, + "learning_rate": 0.0008610712525684197, + "loss": 0.89958596, + "num_input_tokens_seen": 114545152, + "router_z_loss_mlp": 0.40771484, + "step": 1382, + "time_per_iteration": 2.623844861984253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156525, + "balance_loss_mlp": 1.11341906, + "epoch": 0.2660638707195075, + "flos": 1017464094720.0, + "grad_norm": 0.06690376769295572, + "language_loss": 0.85084939, + "learning_rate": 0.0008608556748693121, + "loss": 0.8624146, + "num_input_tokens_seen": 114626512, + "router_z_loss_mlp": 0.43115234, + "step": 1383, + "time_per_iteration": 3.248947858810425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149603, + "balance_loss_mlp": 1.10549557, + "epoch": 0.2662562524047711, + "flos": 523981550592.0, + "grad_norm": 0.05893966497122096, + "language_loss": 0.86648834, + "learning_rate": 0.000860639957073607, + "loss": 0.8779844, + "num_input_tokens_seen": 114701008, + "router_z_loss_mlp": 0.44116211, + "step": 1384, + "time_per_iteration": 2.6954376697540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161137, + "balance_loss_mlp": 1.11838901, + "epoch": 0.2664486340900346, + "flos": 552381202944.0, + "grad_norm": 0.05777577847879513, + "language_loss": 0.88325369, + "learning_rate": 0.0008604240992650534, + "loss": 0.8948651, + "num_input_tokens_seen": 114771984, + "router_z_loss_mlp": 0.42749023, + "step": 1385, + "time_per_iteration": 2.6810553073883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116884, + "balance_loss_mlp": 1.12613928, + "epoch": 0.2666410157752982, + "flos": 470157115392.0, + "grad_norm": 0.1266990207417539, + "language_loss": 0.89650941, + "learning_rate": 0.0008602081015274545, + "loss": 0.90819776, + "num_input_tokens_seen": 114844800, + "router_z_loss_mlp": 0.42724609, + "step": 1386, + "time_per_iteration": 2.7079007625579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169207, + "balance_loss_mlp": 1.12602973, + "epoch": 0.2668333974605617, + "flos": 569919131136.0, + "grad_norm": 0.05666517988787923, + "language_loss": 0.83684492, + "learning_rate": 0.0008599919639446684, + "loss": 0.84853697, + "num_input_tokens_seen": 114918544, + "router_z_loss_mlp": 0.43139648, + "step": 1387, + "time_per_iteration": 2.67275333404541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184027, + "balance_loss_mlp": 1.13755894, + "epoch": 0.2670257791458253, + "flos": 398982703104.0, + "grad_norm": 0.06873806966805297, + "language_loss": 0.80686462, + "learning_rate": 0.000859775686600607, + "loss": 0.81870484, + "num_input_tokens_seen": 114984272, + "router_z_loss_mlp": 0.46459961, + "step": 1388, + "time_per_iteration": 2.568384885787964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192065, + "balance_loss_mlp": 1.14676547, + "epoch": 0.2672181608310889, + "flos": 515847647232.0, + "grad_norm": 0.07413400256287127, + "language_loss": 0.85524642, + "learning_rate": 0.0008595592695792367, + "loss": 0.86716712, + "num_input_tokens_seen": 115054800, + "router_z_loss_mlp": 0.453125, + "step": 1389, + "time_per_iteration": 2.6748523712158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182907, + "balance_loss_mlp": 1.13884759, + "epoch": 0.26741054251635243, + "flos": 507521023488.0, + "grad_norm": 0.06676524761439688, + "language_loss": 0.9117986, + "learning_rate": 0.0008593427129645778, + "loss": 0.92362767, + "num_input_tokens_seen": 115120928, + "router_z_loss_mlp": 0.44042969, + "step": 1390, + "time_per_iteration": 2.5506954193115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186114, + "balance_loss_mlp": 1.14205468, + "epoch": 0.267602924201616, + "flos": 576647092224.0, + "grad_norm": 0.056989477345309104, + "language_loss": 0.85532665, + "learning_rate": 0.0008591260168407052, + "loss": 0.86718786, + "num_input_tokens_seen": 115196688, + "router_z_loss_mlp": 0.44067383, + "step": 1391, + "time_per_iteration": 2.759000778198242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181583, + "balance_loss_mlp": 1.13714194, + "epoch": 0.26779530588687955, + "flos": 523984121856.0, + "grad_norm": 0.12230490659722075, + "language_loss": 0.83154678, + "learning_rate": 0.0008589091812917479, + "loss": 0.84336257, + "num_input_tokens_seen": 115264912, + "router_z_loss_mlp": 0.4440918, + "step": 1392, + "time_per_iteration": 2.6213910579681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183464, + "balance_loss_mlp": 1.14030981, + "epoch": 0.26798768757214314, + "flos": 556771926528.0, + "grad_norm": 0.07403824045185783, + "language_loss": 0.8547672, + "learning_rate": 0.0008586922064018887, + "loss": 0.86660182, + "num_input_tokens_seen": 115334672, + "router_z_loss_mlp": 0.43139648, + "step": 1393, + "time_per_iteration": 2.6706490516662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170846, + "balance_loss_mlp": 1.12375855, + "epoch": 0.2681800692574067, + "flos": 930614717952.0, + "grad_norm": 0.06891205333434622, + "language_loss": 0.89827204, + "learning_rate": 0.0008584750922553651, + "loss": 0.90998048, + "num_input_tokens_seen": 115420032, + "router_z_loss_mlp": 0.47021484, + "step": 1394, + "time_per_iteration": 3.1465976238250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164798, + "balance_loss_mlp": 1.1222403, + "epoch": 0.26837245094267026, + "flos": 701080261632.0, + "grad_norm": 0.06253124916771012, + "language_loss": 0.84102368, + "learning_rate": 0.0008582578389364677, + "loss": 0.85267168, + "num_input_tokens_seen": 115492576, + "router_z_loss_mlp": 0.42529297, + "step": 1395, + "time_per_iteration": 2.853278875350952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170721, + "balance_loss_mlp": 1.12573135, + "epoch": 0.26856483262793385, + "flos": 593191683072.0, + "grad_norm": 0.0656545534576685, + "language_loss": 0.92268932, + "learning_rate": 0.0008580404465295422, + "loss": 0.93439656, + "num_input_tokens_seen": 115568368, + "router_z_loss_mlp": 0.44970703, + "step": 1396, + "time_per_iteration": 2.773932695388794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152323, + "balance_loss_mlp": 1.10826349, + "epoch": 0.2687572143131974, + "flos": 714271882752.0, + "grad_norm": 0.07972324646927738, + "language_loss": 0.88789833, + "learning_rate": 0.0008578229151189876, + "loss": 0.89942157, + "num_input_tokens_seen": 115651536, + "router_z_loss_mlp": 0.44067383, + "step": 1397, + "time_per_iteration": 2.934276819229126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144571, + "balance_loss_mlp": 1.10151267, + "epoch": 0.26894959599846097, + "flos": 467718561792.0, + "grad_norm": 0.10010461149900847, + "language_loss": 0.8178823, + "learning_rate": 0.0008576052447892573, + "loss": 0.82932794, + "num_input_tokens_seen": 115715696, + "router_z_loss_mlp": 0.43115234, + "step": 1398, + "time_per_iteration": 2.5337071418762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131122, + "balance_loss_mlp": 1.08768189, + "epoch": 0.2691419776837245, + "flos": 468701987328.0, + "grad_norm": 0.07718983812215899, + "language_loss": 0.86768365, + "learning_rate": 0.000857387435624858, + "loss": 0.87899494, + "num_input_tokens_seen": 115780928, + "router_z_loss_mlp": 0.43457031, + "step": 1399, + "time_per_iteration": 2.5189273357391357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127749, + "balance_loss_mlp": 1.08404672, + "epoch": 0.2693343593689881, + "flos": 937651396608.0, + "grad_norm": 0.0707561541840249, + "language_loss": 0.88852745, + "learning_rate": 0.0008571694877103513, + "loss": 0.89980495, + "num_input_tokens_seen": 115874432, + "router_z_loss_mlp": 0.43701172, + "step": 1400, + "time_per_iteration": 3.287325859069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126684, + "balance_loss_mlp": 1.08372128, + "epoch": 0.2695267410542516, + "flos": 577600782336.0, + "grad_norm": 0.08476375879770352, + "language_loss": 0.88499445, + "learning_rate": 0.0008569514011303515, + "loss": 0.89626133, + "num_input_tokens_seen": 115956608, + "router_z_loss_mlp": 0.4296875, + "step": 1401, + "time_per_iteration": 2.849506378173828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120044, + "balance_loss_mlp": 1.07770109, + "epoch": 0.2697191227395152, + "flos": 556823683584.0, + "grad_norm": 0.12418270059874827, + "language_loss": 0.88531977, + "learning_rate": 0.0008567331759695277, + "loss": 0.89652026, + "num_input_tokens_seen": 116031728, + "router_z_loss_mlp": 0.42358398, + "step": 1402, + "time_per_iteration": 2.7033023834228516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119932, + "balance_loss_mlp": 1.07584798, + "epoch": 0.26991150442477874, + "flos": 529281547776.0, + "grad_norm": 0.09855769315853927, + "language_loss": 0.86756563, + "learning_rate": 0.0008565148123126023, + "loss": 0.87876499, + "num_input_tokens_seen": 116104288, + "router_z_loss_mlp": 0.44091797, + "step": 1403, + "time_per_iteration": 2.645425319671631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119876, + "balance_loss_mlp": 1.07769978, + "epoch": 0.2701038861100423, + "flos": 532006797312.0, + "grad_norm": 0.15226973878739974, + "language_loss": 0.86578166, + "learning_rate": 0.0008562963102443516, + "loss": 0.87698042, + "num_input_tokens_seen": 116177920, + "router_z_loss_mlp": 0.421875, + "step": 1404, + "time_per_iteration": 2.6965179443359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130222, + "balance_loss_mlp": 1.08668637, + "epoch": 0.2702962677953059, + "flos": 735227020800.0, + "grad_norm": 0.09156828725831004, + "language_loss": 0.85926664, + "learning_rate": 0.0008560776698496056, + "loss": 0.87056887, + "num_input_tokens_seen": 116251680, + "router_z_loss_mlp": 0.43530273, + "step": 1405, + "time_per_iteration": 2.868159532546997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141969, + "balance_loss_mlp": 1.09707534, + "epoch": 0.27048864948056944, + "flos": 574761733632.0, + "grad_norm": 0.07226677638641436, + "language_loss": 0.86433703, + "learning_rate": 0.0008558588912132481, + "loss": 0.87575674, + "num_input_tokens_seen": 116327664, + "router_z_loss_mlp": 0.44873047, + "step": 1406, + "time_per_iteration": 2.8309988975524902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066671, + "balance_loss_mlp": 1.05236614, + "epoch": 0.27068103116583303, + "flos": 1423853489664.0, + "grad_norm": 0.03207539465139433, + "language_loss": 0.76458991, + "learning_rate": 0.0008556399744202163, + "loss": 0.77525663, + "num_input_tokens_seen": 116555152, + "router_z_loss_mlp": 0.14257812, + "step": 1407, + "time_per_iteration": 4.926543235778809 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136459, + "balance_loss_mlp": 1.09220862, + "epoch": 0.27087341285109656, + "flos": 531999456768.0, + "grad_norm": 0.06146298960376288, + "language_loss": 0.83448923, + "learning_rate": 0.0008554209195555016, + "loss": 0.84585381, + "num_input_tokens_seen": 116626016, + "router_z_loss_mlp": 0.44287109, + "step": 1408, + "time_per_iteration": 2.6698648929595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136456, + "balance_loss_mlp": 1.08965421, + "epoch": 0.27106579453636015, + "flos": 581378840064.0, + "grad_norm": 0.1627330563817166, + "language_loss": 0.89102834, + "learning_rate": 0.0008552017267041483, + "loss": 0.90239286, + "num_input_tokens_seen": 116699152, + "router_z_loss_mlp": 0.46801758, + "step": 1409, + "time_per_iteration": 2.6957972049713135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127578, + "balance_loss_mlp": 1.08349395, + "epoch": 0.2712581762216237, + "flos": 506801899008.0, + "grad_norm": 0.06560812899143556, + "language_loss": 0.83656335, + "learning_rate": 0.0008549823959512549, + "loss": 0.84783912, + "num_input_tokens_seen": 116770912, + "router_z_loss_mlp": 0.44091797, + "step": 1410, + "time_per_iteration": 2.7068376541137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011101, + "balance_loss_mlp": 1.06708908, + "epoch": 0.27145055790688727, + "flos": 997442823168.0, + "grad_norm": 0.08175260567644033, + "language_loss": 0.87610555, + "learning_rate": 0.0008547629273819728, + "loss": 0.88720655, + "num_input_tokens_seen": 116863088, + "router_z_loss_mlp": 0.43041992, + "step": 1411, + "time_per_iteration": 3.366260290145874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108005, + "balance_loss_mlp": 1.06542349, + "epoch": 0.2716429395921508, + "flos": 546681083904.0, + "grad_norm": 0.10517352924457117, + "language_loss": 0.84009993, + "learning_rate": 0.0008545433210815074, + "loss": 0.85118002, + "num_input_tokens_seen": 116929504, + "router_z_loss_mlp": 0.42578125, + "step": 1412, + "time_per_iteration": 2.630105972290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112252, + "balance_loss_mlp": 1.07931852, + "epoch": 0.2718353212774144, + "flos": 573225113088.0, + "grad_norm": 0.09841738404648297, + "language_loss": 0.87974489, + "learning_rate": 0.0008543235771351176, + "loss": 0.89097011, + "num_input_tokens_seen": 117004064, + "router_z_loss_mlp": 0.43188477, + "step": 1413, + "time_per_iteration": 2.725048065185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129323, + "balance_loss_mlp": 1.08635998, + "epoch": 0.272027702962678, + "flos": 644305549824.0, + "grad_norm": 0.059677420125308425, + "language_loss": 0.84918916, + "learning_rate": 0.0008541036956281154, + "loss": 0.86048239, + "num_input_tokens_seen": 117081328, + "router_z_loss_mlp": 0.42993164, + "step": 1414, + "time_per_iteration": 2.897216796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133545, + "balance_loss_mlp": 1.08898425, + "epoch": 0.2722200846479415, + "flos": 653726827008.0, + "grad_norm": 0.08487151018546404, + "language_loss": 0.82919049, + "learning_rate": 0.0008538836766458665, + "loss": 0.84052598, + "num_input_tokens_seen": 117156544, + "router_z_loss_mlp": 0.44580078, + "step": 1415, + "time_per_iteration": 2.8930981159210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137425, + "balance_loss_mlp": 1.0942955, + "epoch": 0.2724124663332051, + "flos": 579631873536.0, + "grad_norm": 0.09871518143765563, + "language_loss": 0.85738099, + "learning_rate": 0.0008536635202737897, + "loss": 0.86875528, + "num_input_tokens_seen": 117230208, + "router_z_loss_mlp": 0.43164062, + "step": 1416, + "time_per_iteration": 2.7891178131103516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137299, + "balance_loss_mlp": 1.0931915, + "epoch": 0.2726048480184686, + "flos": 537435274752.0, + "grad_norm": 0.10766210404252562, + "language_loss": 0.82790214, + "learning_rate": 0.0008534432265973573, + "loss": 0.83927512, + "num_input_tokens_seen": 117298080, + "router_z_loss_mlp": 0.44091797, + "step": 1417, + "time_per_iteration": 2.6409006118774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141948, + "balance_loss_mlp": 1.09691095, + "epoch": 0.2727972297037322, + "flos": 995797172736.0, + "grad_norm": 0.07824380469589887, + "language_loss": 0.88708508, + "learning_rate": 0.000853222795702095, + "loss": 0.89850456, + "num_input_tokens_seen": 117396256, + "router_z_loss_mlp": 0.45092773, + "step": 1418, + "time_per_iteration": 3.4312241077423096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115343, + "balance_loss_mlp": 1.10767758, + "epoch": 0.27298961138899575, + "flos": 606205638144.0, + "grad_norm": 0.06262628073505326, + "language_loss": 0.84196067, + "learning_rate": 0.0008530022276735813, + "loss": 0.85349494, + "num_input_tokens_seen": 117467936, + "router_z_loss_mlp": 0.45727539, + "step": 1419, + "time_per_iteration": 2.742341995239258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169959, + "balance_loss_mlp": 1.12742519, + "epoch": 0.27318199307425933, + "flos": 529325964288.0, + "grad_norm": 0.07008703106338479, + "language_loss": 0.86301696, + "learning_rate": 0.0008527815225974489, + "loss": 0.87471658, + "num_input_tokens_seen": 117538256, + "router_z_loss_mlp": 0.42529297, + "step": 1420, + "time_per_iteration": 2.643151044845581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172801, + "balance_loss_mlp": 1.12731028, + "epoch": 0.2733743747595229, + "flos": 409029129216.0, + "grad_norm": 0.10800570533054084, + "language_loss": 0.88767672, + "learning_rate": 0.0008525606805593829, + "loss": 0.8994047, + "num_input_tokens_seen": 117599488, + "router_z_loss_mlp": 0.45483398, + "step": 1421, + "time_per_iteration": 2.4374186992645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115892, + "balance_loss_mlp": 1.11283422, + "epoch": 0.27356675644478645, + "flos": 516225747456.0, + "grad_norm": 0.11472023337789067, + "language_loss": 0.83181965, + "learning_rate": 0.0008523397016451213, + "loss": 0.84340894, + "num_input_tokens_seen": 117664240, + "router_z_loss_mlp": 0.46142578, + "step": 1422, + "time_per_iteration": 2.585376739501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152063, + "balance_loss_mlp": 1.10824132, + "epoch": 0.27375913813005004, + "flos": 1052342088192.0, + "grad_norm": 0.08784028487991961, + "language_loss": 0.87910116, + "learning_rate": 0.0008521185859404564, + "loss": 0.89062172, + "num_input_tokens_seen": 117754768, + "router_z_loss_mlp": 0.43847656, + "step": 1423, + "time_per_iteration": 3.399348020553589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150781, + "balance_loss_mlp": 1.10634017, + "epoch": 0.27395151981531357, + "flos": 624805913088.0, + "grad_norm": 0.06323160386311827, + "language_loss": 0.89755672, + "learning_rate": 0.0008518973335312326, + "loss": 0.90906453, + "num_input_tokens_seen": 117832816, + "router_z_loss_mlp": 0.44433594, + "step": 1424, + "time_per_iteration": 2.771397352218628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141797, + "balance_loss_mlp": 1.09628344, + "epoch": 0.27414390150057716, + "flos": 550372506624.0, + "grad_norm": 0.0741893947597381, + "language_loss": 0.83755773, + "learning_rate": 0.0008516759445033477, + "loss": 0.84897572, + "num_input_tokens_seen": 117899168, + "router_z_loss_mlp": 0.45532227, + "step": 1425, + "time_per_iteration": 2.623136520385742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148744, + "balance_loss_mlp": 1.10227656, + "epoch": 0.2743362831858407, + "flos": 539866487808.0, + "grad_norm": 0.08118081060083703, + "language_loss": 0.85448551, + "learning_rate": 0.0008514544189427526, + "loss": 0.865973, + "num_input_tokens_seen": 117972384, + "router_z_loss_mlp": 0.46484375, + "step": 1426, + "time_per_iteration": 2.695749044418335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156426, + "balance_loss_mlp": 1.11208034, + "epoch": 0.2745286648711043, + "flos": 468590759424.0, + "grad_norm": 0.0837156631450272, + "language_loss": 0.86976963, + "learning_rate": 0.0008512327569354511, + "loss": 0.88133389, + "num_input_tokens_seen": 118039584, + "router_z_loss_mlp": 0.44360352, + "step": 1427, + "time_per_iteration": 2.5354061126708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160051, + "balance_loss_mlp": 1.11353528, + "epoch": 0.2747210465563678, + "flos": 472867683840.0, + "grad_norm": 0.09189170382991782, + "language_loss": 0.84034801, + "learning_rate": 0.0008510109585675001, + "loss": 0.8519485, + "num_input_tokens_seen": 118108352, + "router_z_loss_mlp": 0.46508789, + "step": 1428, + "time_per_iteration": 2.5996179580688477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093492, + "balance_loss_mlp": 1.07680273, + "epoch": 0.2749134282416314, + "flos": 1315085372928.0, + "grad_norm": 0.03549776566589832, + "language_loss": 0.81153345, + "learning_rate": 0.0008507890239250093, + "loss": 0.8224684, + "num_input_tokens_seen": 118331120, + "router_z_loss_mlp": 0.16699219, + "step": 1429, + "time_per_iteration": 4.714696407318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172648, + "balance_loss_mlp": 1.1280638, + "epoch": 0.275105809926895, + "flos": 970861718016.0, + "grad_norm": 0.1239425770540774, + "language_loss": 0.81035018, + "learning_rate": 0.0008505669530941415, + "loss": 0.82207668, + "num_input_tokens_seen": 118415872, + "router_z_loss_mlp": 0.44580078, + "step": 1430, + "time_per_iteration": 3.346867322921753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171144, + "balance_loss_mlp": 1.12613082, + "epoch": 0.2752981916121585, + "flos": 527344432128.0, + "grad_norm": 0.0741807723541833, + "language_loss": 0.84519219, + "learning_rate": 0.000850344746161112, + "loss": 0.85690367, + "num_input_tokens_seen": 118483008, + "router_z_loss_mlp": 0.45019531, + "step": 1431, + "time_per_iteration": 2.6365530490875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178527, + "balance_loss_mlp": 1.13418126, + "epoch": 0.2754905732974221, + "flos": 453709071360.0, + "grad_norm": 0.09683250699138053, + "language_loss": 0.88287663, + "learning_rate": 0.0008501224032121894, + "loss": 0.8946619, + "num_input_tokens_seen": 118545840, + "router_z_loss_mlp": 0.44360352, + "step": 1432, + "time_per_iteration": 2.5015640258789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178788, + "balance_loss_mlp": 1.13406062, + "epoch": 0.27568295498268564, + "flos": 497474597376.0, + "grad_norm": 0.06051880699738469, + "language_loss": 0.82098711, + "learning_rate": 0.0008498999243336946, + "loss": 0.832775, + "num_input_tokens_seen": 118615168, + "router_z_loss_mlp": 0.44726562, + "step": 1433, + "time_per_iteration": 2.643663167953491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198526, + "balance_loss_mlp": 1.15129471, + "epoch": 0.2758753366679492, + "flos": 608194510848.0, + "grad_norm": 0.07173936681504893, + "language_loss": 0.87897062, + "learning_rate": 0.0008496773096120021, + "loss": 0.89095587, + "num_input_tokens_seen": 118690384, + "router_z_loss_mlp": 0.47241211, + "step": 1434, + "time_per_iteration": 2.8680803775787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198281, + "balance_loss_mlp": 1.15164685, + "epoch": 0.27606771835321275, + "flos": 740129094144.0, + "grad_norm": 0.07924459326066897, + "language_loss": 0.84949142, + "learning_rate": 0.0008494545591335381, + "loss": 0.86147422, + "num_input_tokens_seen": 118763024, + "router_z_loss_mlp": 0.46630859, + "step": 1435, + "time_per_iteration": 2.9436187744140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197184, + "balance_loss_mlp": 1.15176487, + "epoch": 0.27626010003847634, + "flos": 554572707840.0, + "grad_norm": 0.05338969573395925, + "language_loss": 0.87283278, + "learning_rate": 0.0008492316729847823, + "loss": 0.88480461, + "num_input_tokens_seen": 118845536, + "router_z_loss_mlp": 0.4543457, + "step": 1436, + "time_per_iteration": 2.817201614379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195413, + "balance_loss_mlp": 1.14739525, + "epoch": 0.2764524817237399, + "flos": 542554661376.0, + "grad_norm": 0.08524745340475512, + "language_loss": 0.80082995, + "learning_rate": 0.0008490086512522664, + "loss": 0.81278408, + "num_input_tokens_seen": 118919008, + "router_z_loss_mlp": 0.47998047, + "step": 1437, + "time_per_iteration": 2.7126290798187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196486, + "balance_loss_mlp": 1.14870656, + "epoch": 0.27664486340900346, + "flos": 406246980096.0, + "grad_norm": 0.06867103991167788, + "language_loss": 0.90572739, + "learning_rate": 0.0008487854940225755, + "loss": 0.9176923, + "num_input_tokens_seen": 118981376, + "router_z_loss_mlp": 0.47729492, + "step": 1438, + "time_per_iteration": 2.431755542755127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207177, + "balance_loss_mlp": 1.15858746, + "epoch": 0.27683724509426705, + "flos": 522138410496.0, + "grad_norm": 0.13716227323677116, + "language_loss": 0.90202403, + "learning_rate": 0.0008485622013823466, + "loss": 0.91409582, + "num_input_tokens_seen": 119050560, + "router_z_loss_mlp": 0.48608398, + "step": 1439, + "time_per_iteration": 2.647594451904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198257, + "balance_loss_mlp": 1.15062046, + "epoch": 0.2770296267795306, + "flos": 535349855232.0, + "grad_norm": 0.09985187013126534, + "language_loss": 0.836923, + "learning_rate": 0.00084833877341827, + "loss": 0.84890562, + "num_input_tokens_seen": 119121104, + "router_z_loss_mlp": 0.47680664, + "step": 1440, + "time_per_iteration": 2.652665138244629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215433, + "balance_loss_mlp": 1.16562724, + "epoch": 0.27722200846479417, + "flos": 487991651328.0, + "grad_norm": 0.09777751450797587, + "language_loss": 0.81022394, + "learning_rate": 0.000848115210217088, + "loss": 0.82237822, + "num_input_tokens_seen": 119187712, + "router_z_loss_mlp": 0.49853516, + "step": 1441, + "time_per_iteration": 2.550879955291748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120133, + "balance_loss_mlp": 1.15166724, + "epoch": 0.2774143901500577, + "flos": 618297836544.0, + "grad_norm": 0.06658099231370791, + "language_loss": 0.82249796, + "learning_rate": 0.0008478915118655952, + "loss": 0.83451128, + "num_input_tokens_seen": 119259264, + "router_z_loss_mlp": 0.49658203, + "step": 1442, + "time_per_iteration": 2.7541940212249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209129, + "balance_loss_mlp": 1.16261363, + "epoch": 0.2776067718353213, + "flos": 513819127296.0, + "grad_norm": 0.05385742523937431, + "language_loss": 0.86750221, + "learning_rate": 0.0008476676784506393, + "loss": 0.87959349, + "num_input_tokens_seen": 119328304, + "router_z_loss_mlp": 0.46557617, + "step": 1443, + "time_per_iteration": 2.6595921516418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120895, + "balance_loss_mlp": 1.16083765, + "epoch": 0.2777991535205848, + "flos": 1004395811328.0, + "grad_norm": 0.07541643273231594, + "language_loss": 0.82715142, + "learning_rate": 0.0008474437100591201, + "loss": 0.83924091, + "num_input_tokens_seen": 119412352, + "router_z_loss_mlp": 0.48120117, + "step": 1444, + "time_per_iteration": 3.285985231399536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209577, + "balance_loss_mlp": 1.16258454, + "epoch": 0.2779915352058484, + "flos": 550278531072.0, + "grad_norm": 0.07952238187909891, + "language_loss": 0.8560605, + "learning_rate": 0.0008472196067779898, + "loss": 0.86815625, + "num_input_tokens_seen": 119484464, + "router_z_loss_mlp": 0.47021484, + "step": 1445, + "time_per_iteration": 2.677077293395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204567, + "balance_loss_mlp": 1.15600109, + "epoch": 0.278183916891112, + "flos": 873798160896.0, + "grad_norm": 0.10163023549653756, + "language_loss": 0.86494523, + "learning_rate": 0.0008469953686942531, + "loss": 0.87699091, + "num_input_tokens_seen": 119557280, + "router_z_loss_mlp": 0.48583984, + "step": 1446, + "time_per_iteration": 3.10603928565979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188864, + "balance_loss_mlp": 1.14158559, + "epoch": 0.2783762985763755, + "flos": 624064766976.0, + "grad_norm": 0.0769454608790312, + "language_loss": 0.83537692, + "learning_rate": 0.0008467709958949668, + "loss": 0.84726554, + "num_input_tokens_seen": 119631232, + "router_z_loss_mlp": 0.47265625, + "step": 1447, + "time_per_iteration": 2.7602903842926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116478, + "balance_loss_mlp": 1.11943233, + "epoch": 0.2785686802616391, + "flos": 581838432768.0, + "grad_norm": 0.08244080074007111, + "language_loss": 0.86534739, + "learning_rate": 0.0008465464884672403, + "loss": 0.87699515, + "num_input_tokens_seen": 119700224, + "router_z_loss_mlp": 0.45410156, + "step": 1448, + "time_per_iteration": 2.702974796295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178355, + "balance_loss_mlp": 1.13424778, + "epoch": 0.27876106194690264, + "flos": 587333348352.0, + "grad_norm": 0.061441667483596626, + "language_loss": 0.85982984, + "learning_rate": 0.0008463218464982348, + "loss": 0.87161338, + "num_input_tokens_seen": 119781376, + "router_z_loss_mlp": 0.44091797, + "step": 1449, + "time_per_iteration": 2.832615852355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185601, + "balance_loss_mlp": 1.14058757, + "epoch": 0.27895344363216623, + "flos": 875982325248.0, + "grad_norm": 0.07503412994840371, + "language_loss": 0.88168389, + "learning_rate": 0.0008460970700751645, + "loss": 0.89353991, + "num_input_tokens_seen": 119856672, + "router_z_loss_mlp": 0.45019531, + "step": 1450, + "time_per_iteration": 3.0487136840820312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185626, + "balance_loss_mlp": 1.13977861, + "epoch": 0.27914582531742976, + "flos": 603910245888.0, + "grad_norm": 0.06352945894963989, + "language_loss": 0.88538259, + "learning_rate": 0.000845872159285295, + "loss": 0.89723885, + "num_input_tokens_seen": 119929008, + "router_z_loss_mlp": 0.45849609, + "step": 1451, + "time_per_iteration": 2.715423822402954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067116, + "balance_loss_mlp": 1.04985404, + "epoch": 0.27933820700269335, + "flos": 1497738097152.0, + "grad_norm": 0.02807340123185793, + "language_loss": 0.77766848, + "learning_rate": 0.0008456471142159447, + "loss": 0.78833961, + "num_input_tokens_seen": 120164032, + "router_z_loss_mlp": 0.17285156, + "step": 1452, + "time_per_iteration": 4.906192302703857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197684, + "balance_loss_mlp": 1.15064442, + "epoch": 0.2795305886879569, + "flos": 1031859025920.0, + "grad_norm": 0.06703382456082828, + "language_loss": 0.86617672, + "learning_rate": 0.0008454219349544836, + "loss": 0.87815356, + "num_input_tokens_seen": 120246784, + "router_z_loss_mlp": 0.47045898, + "step": 1453, + "time_per_iteration": 3.3534200191497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198016, + "balance_loss_mlp": 1.15343201, + "epoch": 0.27972297037322047, + "flos": 607058012160.0, + "grad_norm": 0.08552050648295068, + "language_loss": 0.82341981, + "learning_rate": 0.000845196621588334, + "loss": 0.83540004, + "num_input_tokens_seen": 120318208, + "router_z_loss_mlp": 0.44580078, + "step": 1454, + "time_per_iteration": 2.743699073791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204394, + "balance_loss_mlp": 1.1566391, + "epoch": 0.27991535205848406, + "flos": 630380123136.0, + "grad_norm": 0.05325666962256515, + "language_loss": 0.7637955, + "learning_rate": 0.0008449711742049706, + "loss": 0.77583951, + "num_input_tokens_seen": 120393248, + "router_z_loss_mlp": 0.4777832, + "step": 1455, + "time_per_iteration": 2.782561779022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208188, + "balance_loss_mlp": 1.16222095, + "epoch": 0.2801077337437476, + "flos": 549297676800.0, + "grad_norm": 0.09912152167704158, + "language_loss": 0.84447122, + "learning_rate": 0.0008447455928919196, + "loss": 0.85655314, + "num_input_tokens_seen": 120461040, + "router_z_loss_mlp": 0.45996094, + "step": 1456, + "time_per_iteration": 2.597557306289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242882, + "balance_loss_mlp": 1.19460225, + "epoch": 0.2803001154290112, + "flos": 486761177088.0, + "grad_norm": 0.060789109492995964, + "language_loss": 0.87272859, + "learning_rate": 0.0008445198777367595, + "loss": 0.88515741, + "num_input_tokens_seen": 120530400, + "router_z_loss_mlp": 0.48291016, + "step": 1457, + "time_per_iteration": 2.5689990520477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283391, + "balance_loss_mlp": 1.23394287, + "epoch": 0.2804924971142747, + "flos": 522074170368.0, + "grad_norm": 0.0840599244275116, + "language_loss": 0.80820799, + "learning_rate": 0.0008442940288271208, + "loss": 0.82104188, + "num_input_tokens_seen": 120598304, + "router_z_loss_mlp": 0.49365234, + "step": 1458, + "time_per_iteration": 2.674907922744751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01299064, + "balance_loss_mlp": 1.24899602, + "epoch": 0.2806848787995383, + "flos": 527697566208.0, + "grad_norm": 0.06912303271008884, + "language_loss": 0.87410611, + "learning_rate": 0.0008440680462506856, + "loss": 0.88709676, + "num_input_tokens_seen": 120675712, + "router_z_loss_mlp": 0.50073242, + "step": 1459, + "time_per_iteration": 2.73905873298645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01312423, + "balance_loss_mlp": 1.26221192, + "epoch": 0.2808772604848018, + "flos": 485493626880.0, + "grad_norm": 0.11964292138845481, + "language_loss": 0.86650789, + "learning_rate": 0.0008438419300951883, + "loss": 0.87963212, + "num_input_tokens_seen": 120746544, + "router_z_loss_mlp": 0.50219727, + "step": 1460, + "time_per_iteration": 2.6775193214416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01277494, + "balance_loss_mlp": 1.22690177, + "epoch": 0.2810696421700654, + "flos": 618139620864.0, + "grad_norm": 0.08967430845786024, + "language_loss": 0.86711442, + "learning_rate": 0.0008436156804484148, + "loss": 0.87988937, + "num_input_tokens_seen": 120823520, + "router_z_loss_mlp": 0.50610352, + "step": 1461, + "time_per_iteration": 2.8446624279022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225027, + "balance_loss_mlp": 1.17615128, + "epoch": 0.28126202385532895, + "flos": 454754165760.0, + "grad_norm": 0.06778030965882964, + "language_loss": 0.88354933, + "learning_rate": 0.0008433892973982031, + "loss": 0.89579964, + "num_input_tokens_seen": 120889568, + "router_z_loss_mlp": 0.48901367, + "step": 1462, + "time_per_iteration": 2.5101869106292725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212759, + "balance_loss_mlp": 1.16168988, + "epoch": 0.28145440554059253, + "flos": 530704742400.0, + "grad_norm": 0.07940790981700917, + "language_loss": 0.85705763, + "learning_rate": 0.0008431627810324431, + "loss": 0.86918521, + "num_input_tokens_seen": 120958480, + "router_z_loss_mlp": 0.51098633, + "step": 1463, + "time_per_iteration": 2.6701931953430176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208608, + "balance_loss_mlp": 1.15906441, + "epoch": 0.2816467872258561, + "flos": 452228977152.0, + "grad_norm": 0.1112721524597414, + "language_loss": 0.81312853, + "learning_rate": 0.000842936131439076, + "loss": 0.82521462, + "num_input_tokens_seen": 121028032, + "router_z_loss_mlp": 0.49584961, + "step": 1464, + "time_per_iteration": 2.626397132873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182235, + "balance_loss_mlp": 1.13440847, + "epoch": 0.28183916891111965, + "flos": 472712039424.0, + "grad_norm": 0.10805991000078381, + "language_loss": 0.88305855, + "learning_rate": 0.0008427093487060951, + "loss": 0.89488095, + "num_input_tokens_seen": 121099280, + "router_z_loss_mlp": 0.4777832, + "step": 1465, + "time_per_iteration": 2.6287689208984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152025, + "balance_loss_mlp": 1.10815573, + "epoch": 0.28203155059638324, + "flos": 557053479936.0, + "grad_norm": 0.05392746655550109, + "language_loss": 0.85014635, + "learning_rate": 0.000842482432921545, + "loss": 0.86166662, + "num_input_tokens_seen": 121180240, + "router_z_loss_mlp": 0.4387207, + "step": 1466, + "time_per_iteration": 2.843055009841919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140929, + "balance_loss_mlp": 1.09691691, + "epoch": 0.28222393228164677, + "flos": 416980224000.0, + "grad_norm": 0.12216249404138245, + "language_loss": 0.8786549, + "learning_rate": 0.0008422553841735225, + "loss": 0.89006418, + "num_input_tokens_seen": 121242736, + "router_z_loss_mlp": 0.44018555, + "step": 1467, + "time_per_iteration": 2.4870855808258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130953, + "balance_loss_mlp": 1.08686972, + "epoch": 0.28241631396691036, + "flos": 604910923776.0, + "grad_norm": 0.0834179705505054, + "language_loss": 0.85186172, + "learning_rate": 0.0008420282025501757, + "loss": 0.86317128, + "num_input_tokens_seen": 121319248, + "router_z_loss_mlp": 0.44091797, + "step": 1468, + "time_per_iteration": 2.746919631958008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139526, + "balance_loss_mlp": 1.09730196, + "epoch": 0.2826086956521739, + "flos": 572968152576.0, + "grad_norm": 0.07747841896553878, + "language_loss": 0.85862702, + "learning_rate": 0.0008418008881397043, + "loss": 0.8700223, + "num_input_tokens_seen": 121392064, + "router_z_loss_mlp": 0.42236328, + "step": 1469, + "time_per_iteration": 2.7157111167907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011536, + "balance_loss_mlp": 1.11108959, + "epoch": 0.2828010773374375, + "flos": 842756949504.0, + "grad_norm": 0.09196817065592088, + "language_loss": 0.83090472, + "learning_rate": 0.0008415734410303595, + "loss": 0.84244066, + "num_input_tokens_seen": 121475984, + "router_z_loss_mlp": 0.42529297, + "step": 1470, + "time_per_iteration": 3.2546660900115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159701, + "balance_loss_mlp": 1.1166662, + "epoch": 0.28299345902270107, + "flos": 542675801088.0, + "grad_norm": 0.07745609031802311, + "language_loss": 0.91133046, + "learning_rate": 0.0008413458613104444, + "loss": 0.92292744, + "num_input_tokens_seen": 121551024, + "router_z_loss_mlp": 0.43017578, + "step": 1471, + "time_per_iteration": 2.683119773864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124215, + "balance_loss_mlp": 1.08091772, + "epoch": 0.2831858407079646, + "flos": 571606626816.0, + "grad_norm": 0.06716648824100378, + "language_loss": 0.83225214, + "learning_rate": 0.0008411181490683129, + "loss": 0.84349424, + "num_input_tokens_seen": 121624528, + "router_z_loss_mlp": 0.43334961, + "step": 1472, + "time_per_iteration": 2.7247512340545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112102, + "balance_loss_mlp": 1.06692195, + "epoch": 0.2833782223932282, + "flos": 763826734080.0, + "grad_norm": 0.08730853561294576, + "language_loss": 0.83099282, + "learning_rate": 0.0008408903043923707, + "loss": 0.84211385, + "num_input_tokens_seen": 121706736, + "router_z_loss_mlp": 0.45166016, + "step": 1473, + "time_per_iteration": 2.9982750415802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011136, + "balance_loss_mlp": 1.06675041, + "epoch": 0.2835706040784917, + "flos": 539051189760.0, + "grad_norm": 0.09441991509127853, + "language_loss": 0.81456125, + "learning_rate": 0.0008406623273710754, + "loss": 0.82569724, + "num_input_tokens_seen": 121773008, + "router_z_loss_mlp": 0.46826172, + "step": 1474, + "time_per_iteration": 2.6457254886627197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107143, + "balance_loss_mlp": 1.06482363, + "epoch": 0.2837629857637553, + "flos": 530593514496.0, + "grad_norm": 0.08147557265850319, + "language_loss": 0.83874208, + "learning_rate": 0.0008404342180929351, + "loss": 0.84981352, + "num_input_tokens_seen": 121840016, + "router_z_loss_mlp": 0.42358398, + "step": 1475, + "time_per_iteration": 2.6071481704711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110668, + "balance_loss_mlp": 1.06758618, + "epoch": 0.28395536744901884, + "flos": 540032044032.0, + "grad_norm": 0.0682383784230515, + "language_loss": 0.81900609, + "learning_rate": 0.00084020597664651, + "loss": 0.83011281, + "num_input_tokens_seen": 121915008, + "router_z_loss_mlp": 0.43066406, + "step": 1476, + "time_per_iteration": 2.831547260284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118821, + "balance_loss_mlp": 1.07149458, + "epoch": 0.2841477491342824, + "flos": 573635146752.0, + "grad_norm": 0.08199753583087593, + "language_loss": 0.84526181, + "learning_rate": 0.0008399776031204111, + "loss": 0.85645002, + "num_input_tokens_seen": 121987456, + "router_z_loss_mlp": 0.47290039, + "step": 1477, + "time_per_iteration": 2.7336621284484863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112444, + "balance_loss_mlp": 1.07832992, + "epoch": 0.28434013081954596, + "flos": 572068790784.0, + "grad_norm": 0.07183050675580523, + "language_loss": 0.80975109, + "learning_rate": 0.0008397490976033009, + "loss": 0.82099551, + "num_input_tokens_seen": 122058720, + "router_z_loss_mlp": 0.46118164, + "step": 1478, + "time_per_iteration": 2.668337345123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053875, + "balance_loss_mlp": 1.03766239, + "epoch": 0.28453251250480954, + "flos": 1553376310272.0, + "grad_norm": 0.035679392232843235, + "language_loss": 0.77879643, + "learning_rate": 0.000839520460183893, + "loss": 0.78933525, + "num_input_tokens_seen": 122285792, + "router_z_loss_mlp": 0.16210938, + "step": 1479, + "time_per_iteration": 4.813107252120972 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132957, + "balance_loss_mlp": 1.08925462, + "epoch": 0.28472489419007313, + "flos": 749061043200.0, + "grad_norm": 0.06426749014533666, + "language_loss": 0.85708797, + "learning_rate": 0.0008392916909509525, + "loss": 0.86841756, + "num_input_tokens_seen": 122366608, + "router_z_loss_mlp": 0.43725586, + "step": 1480, + "time_per_iteration": 3.105465888977051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145673, + "balance_loss_mlp": 1.10180378, + "epoch": 0.28491727587533666, + "flos": 490158563328.0, + "grad_norm": 0.12099224111333258, + "language_loss": 0.8583495, + "learning_rate": 0.0008390627899932954, + "loss": 0.86980623, + "num_input_tokens_seen": 122435536, + "router_z_loss_mlp": 0.43847656, + "step": 1481, + "time_per_iteration": 2.5961339473724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146403, + "balance_loss_mlp": 1.1041795, + "epoch": 0.28510965756060025, + "flos": 729007838208.0, + "grad_norm": 0.09850404509995118, + "language_loss": 0.88747412, + "learning_rate": 0.000838833757399789, + "loss": 0.89893812, + "num_input_tokens_seen": 122515584, + "router_z_loss_mlp": 0.42211914, + "step": 1482, + "time_per_iteration": 2.9445223808288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160742, + "balance_loss_mlp": 1.11513209, + "epoch": 0.2853020392458638, + "flos": 551573245440.0, + "grad_norm": 0.09258701289693592, + "language_loss": 0.81233478, + "learning_rate": 0.0008386045932593515, + "loss": 0.82394218, + "num_input_tokens_seen": 122585552, + "router_z_loss_mlp": 0.45605469, + "step": 1483, + "time_per_iteration": 2.696171283721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172022, + "balance_loss_mlp": 1.12853456, + "epoch": 0.28549442093112737, + "flos": 754783557120.0, + "grad_norm": 0.07718327666813503, + "language_loss": 0.8687939, + "learning_rate": 0.0008383752976609525, + "loss": 0.88051414, + "num_input_tokens_seen": 122658928, + "router_z_loss_mlp": 0.43481445, + "step": 1484, + "time_per_iteration": 2.948983907699585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159194, + "balance_loss_mlp": 1.11508679, + "epoch": 0.2856868026163909, + "flos": 538589025792.0, + "grad_norm": 0.06564205880415652, + "language_loss": 0.80617285, + "learning_rate": 0.0008381458706936123, + "loss": 0.81776482, + "num_input_tokens_seen": 122729056, + "router_z_loss_mlp": 0.44116211, + "step": 1485, + "time_per_iteration": 2.689715623855591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117177, + "balance_loss_mlp": 1.12740064, + "epoch": 0.2858791843016545, + "flos": 583772977152.0, + "grad_norm": 0.06570872016312425, + "language_loss": 0.87734085, + "learning_rate": 0.0008379163124464025, + "loss": 0.88905853, + "num_input_tokens_seen": 122802832, + "router_z_loss_mlp": 0.44384766, + "step": 1486, + "time_per_iteration": 2.7226197719573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166912, + "balance_loss_mlp": 1.12526059, + "epoch": 0.286071565986918, + "flos": 644812130304.0, + "grad_norm": 0.0915307653224295, + "language_loss": 0.77564812, + "learning_rate": 0.0008376866230084452, + "loss": 0.78731728, + "num_input_tokens_seen": 122881328, + "router_z_loss_mlp": 0.41650391, + "step": 1487, + "time_per_iteration": 2.82708477973938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154293, + "balance_loss_mlp": 1.10901785, + "epoch": 0.2862639476721816, + "flos": 491361873408.0, + "grad_norm": 0.07232162522245564, + "language_loss": 0.86754864, + "learning_rate": 0.000837456802468914, + "loss": 0.87909162, + "num_input_tokens_seen": 122949680, + "router_z_loss_mlp": 0.45239258, + "step": 1488, + "time_per_iteration": 2.6107335090637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115391, + "balance_loss_mlp": 1.1082294, + "epoch": 0.2864563293574452, + "flos": 521639170560.0, + "grad_norm": 0.06580975478488113, + "language_loss": 0.85965604, + "learning_rate": 0.0008372268509170331, + "loss": 0.8711952, + "num_input_tokens_seen": 123024736, + "router_z_loss_mlp": 0.45678711, + "step": 1489, + "time_per_iteration": 2.682190418243408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147981, + "balance_loss_mlp": 1.10554218, + "epoch": 0.2866487110427087, + "flos": 547118281728.0, + "grad_norm": 0.0640942252200205, + "language_loss": 0.85215169, + "learning_rate": 0.0008369967684420779, + "loss": 0.86363149, + "num_input_tokens_seen": 123097344, + "router_z_loss_mlp": 0.42431641, + "step": 1490, + "time_per_iteration": 2.708315372467041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011558, + "balance_loss_mlp": 1.11154985, + "epoch": 0.2868410927279723, + "flos": 482224720896.0, + "grad_norm": 0.07293711729105107, + "language_loss": 0.84566355, + "learning_rate": 0.0008367665551333736, + "loss": 0.85722154, + "num_input_tokens_seen": 123166240, + "router_z_loss_mlp": 0.44262695, + "step": 1491, + "time_per_iteration": 2.605665445327759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159368, + "balance_loss_mlp": 1.11216116, + "epoch": 0.28703347441323585, + "flos": 724889129472.0, + "grad_norm": 0.0802107480821924, + "language_loss": 0.85808468, + "learning_rate": 0.0008365362110802977, + "loss": 0.86967838, + "num_input_tokens_seen": 123238160, + "router_z_loss_mlp": 0.47241211, + "step": 1492, + "time_per_iteration": 2.879655122756958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155437, + "balance_loss_mlp": 1.109303, + "epoch": 0.28722585609849943, + "flos": 634978248192.0, + "grad_norm": 0.06007050516222503, + "language_loss": 0.82957923, + "learning_rate": 0.0008363057363722773, + "loss": 0.84113365, + "num_input_tokens_seen": 123319504, + "router_z_loss_mlp": 0.46142578, + "step": 1493, + "time_per_iteration": 2.8600335121154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154458, + "balance_loss_mlp": 1.11085081, + "epoch": 0.28741823778376296, + "flos": 510229020672.0, + "grad_norm": 0.060904552171674266, + "language_loss": 0.8464222, + "learning_rate": 0.0008360751310987906, + "loss": 0.85796678, + "num_input_tokens_seen": 123387008, + "router_z_loss_mlp": 0.4362793, + "step": 1494, + "time_per_iteration": 2.602029800415039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151781, + "balance_loss_mlp": 1.11160707, + "epoch": 0.28761061946902655, + "flos": 603752030208.0, + "grad_norm": 0.06255193118064963, + "language_loss": 0.86073208, + "learning_rate": 0.0008358443953493666, + "loss": 0.87224984, + "num_input_tokens_seen": 123471056, + "router_z_loss_mlp": 0.40185547, + "step": 1495, + "time_per_iteration": 2.8682689666748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116061, + "balance_loss_mlp": 1.11702669, + "epoch": 0.28780300115429014, + "flos": 407193329664.0, + "grad_norm": 0.06637793594414569, + "language_loss": 0.89093578, + "learning_rate": 0.0008356135292135851, + "loss": 0.90254188, + "num_input_tokens_seen": 123535024, + "router_z_loss_mlp": 0.43579102, + "step": 1496, + "time_per_iteration": 2.519700288772583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162426, + "balance_loss_mlp": 1.11760294, + "epoch": 0.28799538283955367, + "flos": 374929357824.0, + "grad_norm": 0.07926576541007177, + "language_loss": 0.92873323, + "learning_rate": 0.0008353825327810758, + "loss": 0.94035745, + "num_input_tokens_seen": 123596224, + "router_z_loss_mlp": 0.44873047, + "step": 1497, + "time_per_iteration": 2.4195892810821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140417, + "balance_loss_mlp": 1.09852648, + "epoch": 0.28818776452481726, + "flos": 591919363584.0, + "grad_norm": 0.05522330058639147, + "language_loss": 0.81832987, + "learning_rate": 0.00083515140614152, + "loss": 0.82973409, + "num_input_tokens_seen": 123668640, + "router_z_loss_mlp": 0.41894531, + "step": 1498, + "time_per_iteration": 2.6989245414733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151843, + "balance_loss_mlp": 1.10992932, + "epoch": 0.2883801462100808, + "flos": 535075642368.0, + "grad_norm": 0.08112895482541128, + "language_loss": 0.87581354, + "learning_rate": 0.0008349201493846485, + "loss": 0.88733196, + "num_input_tokens_seen": 123740816, + "router_z_loss_mlp": 0.41894531, + "step": 1499, + "time_per_iteration": 2.647165298461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113639, + "balance_loss_mlp": 1.09364128, + "epoch": 0.2885725278953444, + "flos": 480094884864.0, + "grad_norm": 0.06188269799142739, + "language_loss": 0.89485824, + "learning_rate": 0.0008346887626002432, + "loss": 0.90622216, + "num_input_tokens_seen": 123805968, + "router_z_loss_mlp": 0.42724609, + "step": 1500, + "time_per_iteration": 2.546494960784912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138741, + "balance_loss_mlp": 1.09546816, + "epoch": 0.2887649095806079, + "flos": 464044391424.0, + "grad_norm": 0.07756887509348087, + "language_loss": 0.86612689, + "learning_rate": 0.000834457245878137, + "loss": 0.87751424, + "num_input_tokens_seen": 123876576, + "router_z_loss_mlp": 0.43261719, + "step": 1501, + "time_per_iteration": 2.6271145343780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132854, + "balance_loss_mlp": 1.08993816, + "epoch": 0.2889572912658715, + "flos": 931032092160.0, + "grad_norm": 0.07465598629984396, + "language_loss": 0.8176384, + "learning_rate": 0.000834225599308212, + "loss": 0.82896686, + "num_input_tokens_seen": 123967664, + "router_z_loss_mlp": 0.42895508, + "step": 1502, + "time_per_iteration": 3.2550971508026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150677, + "balance_loss_mlp": 1.10580611, + "epoch": 0.28914967295113503, + "flos": 570129103872.0, + "grad_norm": 0.07581203663628927, + "language_loss": 0.85830456, + "learning_rate": 0.0008339938229804016, + "loss": 0.8698113, + "num_input_tokens_seen": 124039680, + "router_z_loss_mlp": 0.44897461, + "step": 1503, + "time_per_iteration": 2.704310417175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132016, + "balance_loss_mlp": 1.11475468, + "epoch": 0.2893420546363986, + "flos": 1486614643200.0, + "grad_norm": 0.04995777902546146, + "language_loss": 0.75434822, + "learning_rate": 0.0008337619169846895, + "loss": 0.76566839, + "num_input_tokens_seen": 124278848, + "router_z_loss_mlp": 0.17285156, + "step": 1504, + "time_per_iteration": 4.959474563598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157244, + "balance_loss_mlp": 1.10965538, + "epoch": 0.2895344363216622, + "flos": 470186850816.0, + "grad_norm": 0.06157445053236475, + "language_loss": 0.84505653, + "learning_rate": 0.0008335298814111094, + "loss": 0.85662901, + "num_input_tokens_seen": 124346736, + "router_z_loss_mlp": 0.47607422, + "step": 1505, + "time_per_iteration": 2.5612986087799072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178976, + "balance_loss_mlp": 1.13374829, + "epoch": 0.28972681800692573, + "flos": 648194835456.0, + "grad_norm": 0.05887296654917154, + "language_loss": 0.88222575, + "learning_rate": 0.0008332977163497455, + "loss": 0.89401549, + "num_input_tokens_seen": 124420816, + "router_z_loss_mlp": 0.4519043, + "step": 1506, + "time_per_iteration": 2.8017849922180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183741, + "balance_loss_mlp": 1.13696313, + "epoch": 0.2899191996921893, + "flos": 572224435200.0, + "grad_norm": 0.07773532252894584, + "language_loss": 0.83964998, + "learning_rate": 0.0008330654218907325, + "loss": 0.8514874, + "num_input_tokens_seen": 124490480, + "router_z_loss_mlp": 0.46801758, + "step": 1507, + "time_per_iteration": 2.6568052768707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167782, + "balance_loss_mlp": 1.12016964, + "epoch": 0.29011158137745285, + "flos": 661356721152.0, + "grad_norm": 0.05364053536005051, + "language_loss": 0.82260346, + "learning_rate": 0.0008328329981242548, + "loss": 0.83428133, + "num_input_tokens_seen": 124564960, + "router_z_loss_mlp": 0.47631836, + "step": 1508, + "time_per_iteration": 2.8732171058654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161954, + "balance_loss_mlp": 1.11479485, + "epoch": 0.29030396306271644, + "flos": 536226822144.0, + "grad_norm": 0.06776855665971031, + "language_loss": 0.88091129, + "learning_rate": 0.0008326004451405475, + "loss": 0.8925308, + "num_input_tokens_seen": 124637424, + "router_z_loss_mlp": 0.47143555, + "step": 1509, + "time_per_iteration": 2.762476921081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156841, + "balance_loss_mlp": 1.11104107, + "epoch": 0.29049634474798, + "flos": 511956163584.0, + "grad_norm": 0.08089915602738365, + "language_loss": 0.82757521, + "learning_rate": 0.0008323677630298957, + "loss": 0.83914363, + "num_input_tokens_seen": 124704832, + "router_z_loss_mlp": 0.45800781, + "step": 1510, + "time_per_iteration": 2.554558753967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152926, + "balance_loss_mlp": 1.1073643, + "epoch": 0.29068872643324356, + "flos": 613758809088.0, + "grad_norm": 0.07106066660777852, + "language_loss": 0.85773015, + "learning_rate": 0.0008321349518826345, + "loss": 0.86925942, + "num_input_tokens_seen": 124779600, + "router_z_loss_mlp": 0.45556641, + "step": 1511, + "time_per_iteration": 2.8341891765594482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144812, + "balance_loss_mlp": 1.09870172, + "epoch": 0.2908811081185071, + "flos": 546424123392.0, + "grad_norm": 0.06994476337169399, + "language_loss": 0.95554525, + "learning_rate": 0.0008319020117891491, + "loss": 0.96699333, + "num_input_tokens_seen": 124844128, + "router_z_loss_mlp": 0.4609375, + "step": 1512, + "time_per_iteration": 2.6152215003967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147304, + "balance_loss_mlp": 1.09902406, + "epoch": 0.2910734898037707, + "flos": 604792355328.0, + "grad_norm": 0.09218377020634298, + "language_loss": 0.87772787, + "learning_rate": 0.0008316689428398751, + "loss": 0.88920093, + "num_input_tokens_seen": 124915376, + "router_z_loss_mlp": 0.4831543, + "step": 1513, + "time_per_iteration": 2.687288522720337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148068, + "balance_loss_mlp": 1.10407972, + "epoch": 0.29126587148903427, + "flos": 574672900608.0, + "grad_norm": 0.05407373665960582, + "language_loss": 0.89050305, + "learning_rate": 0.0008314357451252979, + "loss": 0.90198368, + "num_input_tokens_seen": 124995504, + "router_z_loss_mlp": 0.44018555, + "step": 1514, + "time_per_iteration": 2.7870078086853027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151939, + "balance_loss_mlp": 1.10644853, + "epoch": 0.2914582531742978, + "flos": 571068112896.0, + "grad_norm": 0.11283198751561448, + "language_loss": 0.88657945, + "learning_rate": 0.0008312024187359527, + "loss": 0.89809883, + "num_input_tokens_seen": 125064192, + "router_z_loss_mlp": 0.45483398, + "step": 1515, + "time_per_iteration": 2.6400256156921387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144613, + "balance_loss_mlp": 1.10060108, + "epoch": 0.2916506348595614, + "flos": 730878142464.0, + "grad_norm": 0.08270455580526427, + "language_loss": 0.87534022, + "learning_rate": 0.000830968963762425, + "loss": 0.8867864, + "num_input_tokens_seen": 125150560, + "router_z_loss_mlp": 0.43994141, + "step": 1516, + "time_per_iteration": 3.0442028045654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151597, + "balance_loss_mlp": 1.10617828, + "epoch": 0.2918430165448249, + "flos": 510468728832.0, + "grad_norm": 0.06364079743342543, + "language_loss": 0.84482789, + "learning_rate": 0.0008307353802953497, + "loss": 0.85634387, + "num_input_tokens_seen": 125219264, + "router_z_loss_mlp": 0.45361328, + "step": 1517, + "time_per_iteration": 2.672921895980835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171551, + "balance_loss_mlp": 1.12281811, + "epoch": 0.2920353982300885, + "flos": 630397375488.0, + "grad_norm": 0.060139597091390135, + "language_loss": 0.86612219, + "learning_rate": 0.0008305016684254125, + "loss": 0.87783766, + "num_input_tokens_seen": 125301904, + "router_z_loss_mlp": 0.48803711, + "step": 1518, + "time_per_iteration": 2.7845590114593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174317, + "balance_loss_mlp": 1.12947094, + "epoch": 0.29222777991535204, + "flos": 501662688768.0, + "grad_norm": 0.09151635615922826, + "language_loss": 0.87469971, + "learning_rate": 0.0008302678282433479, + "loss": 0.88644284, + "num_input_tokens_seen": 125367712, + "router_z_loss_mlp": 0.44848633, + "step": 1519, + "time_per_iteration": 2.562605619430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163342, + "balance_loss_mlp": 1.11999798, + "epoch": 0.2924201616006156, + "flos": 486785769984.0, + "grad_norm": 0.07068722957296131, + "language_loss": 0.85016668, + "learning_rate": 0.0008300338598399411, + "loss": 0.86180007, + "num_input_tokens_seen": 125437648, + "router_z_loss_mlp": 0.43359375, + "step": 1520, + "time_per_iteration": 2.61773943901062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155651, + "balance_loss_mlp": 1.11111403, + "epoch": 0.2926125432858792, + "flos": 476450449920.0, + "grad_norm": 0.07704766336953982, + "language_loss": 0.95187533, + "learning_rate": 0.0008297997633060263, + "loss": 0.96343178, + "num_input_tokens_seen": 125502432, + "router_z_loss_mlp": 0.44506836, + "step": 1521, + "time_per_iteration": 2.5206730365753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127672, + "balance_loss_mlp": 1.08468485, + "epoch": 0.29280492497114274, + "flos": 676675980288.0, + "grad_norm": 0.07256926042070597, + "language_loss": 0.85441822, + "learning_rate": 0.0008295655387324883, + "loss": 0.865695, + "num_input_tokens_seen": 125575424, + "router_z_loss_mlp": 0.42993164, + "step": 1522, + "time_per_iteration": 2.8186635971069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126011, + "balance_loss_mlp": 1.08090246, + "epoch": 0.29299730665640633, + "flos": 458408512512.0, + "grad_norm": 0.07210388942873598, + "language_loss": 0.8532753, + "learning_rate": 0.0008293311862102609, + "loss": 0.86453545, + "num_input_tokens_seen": 125639040, + "router_z_loss_mlp": 0.45092773, + "step": 1523, + "time_per_iteration": 2.4982752799987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118339, + "balance_loss_mlp": 1.07334912, + "epoch": 0.29318968834166986, + "flos": 446573274624.0, + "grad_norm": 0.0579845522804068, + "language_loss": 0.89434093, + "learning_rate": 0.0008290967058303275, + "loss": 0.90552431, + "num_input_tokens_seen": 125701712, + "router_z_loss_mlp": 0.44995117, + "step": 1524, + "time_per_iteration": 2.469200611114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116837, + "balance_loss_mlp": 1.07575774, + "epoch": 0.29338207002693345, + "flos": 450319025664.0, + "grad_norm": 0.07735764089304721, + "language_loss": 0.86793721, + "learning_rate": 0.0008288620976837219, + "loss": 0.87910557, + "num_input_tokens_seen": 125765088, + "router_z_loss_mlp": 0.41088867, + "step": 1525, + "time_per_iteration": 2.4877853393554688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112181, + "balance_loss_mlp": 1.06881261, + "epoch": 0.293574451712197, + "flos": 502277925888.0, + "grad_norm": 0.06064034312392981, + "language_loss": 0.83118868, + "learning_rate": 0.000828627361861527, + "loss": 0.84231043, + "num_input_tokens_seen": 125831328, + "router_z_loss_mlp": 0.43383789, + "step": 1526, + "time_per_iteration": 2.567406415939331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109149, + "balance_loss_mlp": 1.06620967, + "epoch": 0.29376683339746057, + "flos": 696462312960.0, + "grad_norm": 0.0729369607745646, + "language_loss": 0.84539104, + "learning_rate": 0.0008283924984548752, + "loss": 0.85648245, + "num_input_tokens_seen": 125903664, + "router_z_loss_mlp": 0.42919922, + "step": 1527, + "time_per_iteration": 2.8396716117858887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117649, + "balance_loss_mlp": 1.07480514, + "epoch": 0.2939592150827241, + "flos": 478590197760.0, + "grad_norm": 0.05516048868040139, + "language_loss": 0.85423326, + "learning_rate": 0.0008281575075549485, + "loss": 0.86540973, + "num_input_tokens_seen": 125971856, + "router_z_loss_mlp": 0.4284668, + "step": 1528, + "time_per_iteration": 2.596402645111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093475, + "balance_loss_mlp": 1.0787884, + "epoch": 0.2941515967679877, + "flos": 1485260831232.0, + "grad_norm": 0.03776357558455706, + "language_loss": 0.77352691, + "learning_rate": 0.000827922389252979, + "loss": 0.78446174, + "num_input_tokens_seen": 126183968, + "router_z_loss_mlp": 0.14648438, + "step": 1529, + "time_per_iteration": 4.641916513442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118614, + "balance_loss_mlp": 1.07436347, + "epoch": 0.2943439784532513, + "flos": 674158132224.0, + "grad_norm": 0.11599739785132454, + "language_loss": 0.90857148, + "learning_rate": 0.0008276871436402469, + "loss": 0.91975754, + "num_input_tokens_seen": 126254448, + "router_z_loss_mlp": 0.44238281, + "step": 1530, + "time_per_iteration": 2.8211593627929688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113901, + "balance_loss_mlp": 1.07239282, + "epoch": 0.2945363601385148, + "flos": 576301298688.0, + "grad_norm": 0.06834093724659761, + "language_loss": 0.87937176, + "learning_rate": 0.000827451770808083, + "loss": 0.8905108, + "num_input_tokens_seen": 126328208, + "router_z_loss_mlp": 0.41503906, + "step": 1531, + "time_per_iteration": 2.7127888202667236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106991, + "balance_loss_mlp": 1.06357539, + "epoch": 0.2947287418237784, + "flos": 480655793664.0, + "grad_norm": 0.06489723039655686, + "language_loss": 0.8385976, + "learning_rate": 0.0008272162708478674, + "loss": 0.84966749, + "num_input_tokens_seen": 126396464, + "router_z_loss_mlp": 0.43457031, + "step": 1532, + "time_per_iteration": 2.580057144165039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119293, + "balance_loss_mlp": 1.07749844, + "epoch": 0.2949211235090419, + "flos": 558185209344.0, + "grad_norm": 0.06938693493012958, + "language_loss": 0.86437017, + "learning_rate": 0.000826980643851029, + "loss": 0.87556309, + "num_input_tokens_seen": 126468960, + "router_z_loss_mlp": 0.41821289, + "step": 1533, + "time_per_iteration": 2.689450740814209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118363, + "balance_loss_mlp": 1.07518554, + "epoch": 0.2951135051943055, + "flos": 483887623680.0, + "grad_norm": 0.057495804655394826, + "language_loss": 0.85101378, + "learning_rate": 0.0008267448899090464, + "loss": 0.8621974, + "num_input_tokens_seen": 126536496, + "router_z_loss_mlp": 0.43188477, + "step": 1534, + "time_per_iteration": 2.5541234016418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139738, + "balance_loss_mlp": 1.09460509, + "epoch": 0.29530588687956905, + "flos": 550295783424.0, + "grad_norm": 0.0763188518859088, + "language_loss": 0.81071836, + "learning_rate": 0.0008265090091134473, + "loss": 0.82211578, + "num_input_tokens_seen": 126614048, + "router_z_loss_mlp": 0.45117188, + "step": 1535, + "time_per_iteration": 2.851494789123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136514, + "balance_loss_mlp": 1.09309804, + "epoch": 0.29549826856483263, + "flos": 673046226432.0, + "grad_norm": 0.06589165398662913, + "language_loss": 0.80565453, + "learning_rate": 0.0008262730015558088, + "loss": 0.8170197, + "num_input_tokens_seen": 126697248, + "router_z_loss_mlp": 0.43432617, + "step": 1536, + "time_per_iteration": 2.8671340942382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113965, + "balance_loss_mlp": 1.09423184, + "epoch": 0.29569065025009617, + "flos": 764666625024.0, + "grad_norm": 0.08099910548300644, + "language_loss": 0.82513618, + "learning_rate": 0.0008260368673277574, + "loss": 0.83653271, + "num_input_tokens_seen": 126782496, + "router_z_loss_mlp": 0.45410156, + "step": 1537, + "time_per_iteration": 3.114685297012329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134888, + "balance_loss_mlp": 1.08973145, + "epoch": 0.29588303193535975, + "flos": 543683819520.0, + "grad_norm": 0.06868209454347093, + "language_loss": 0.84501362, + "learning_rate": 0.0008258006065209682, + "loss": 0.85636258, + "num_input_tokens_seen": 126857328, + "router_z_loss_mlp": 0.45141602, + "step": 1538, + "time_per_iteration": 2.7343428134918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112017, + "balance_loss_mlp": 1.07341647, + "epoch": 0.29607541362062334, + "flos": 596947345920.0, + "grad_norm": 0.07819005704771397, + "language_loss": 0.80795646, + "learning_rate": 0.0008255642192271657, + "loss": 0.8191582, + "num_input_tokens_seen": 126932608, + "router_z_loss_mlp": 0.4675293, + "step": 1539, + "time_per_iteration": 2.7900264263153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123831, + "balance_loss_mlp": 1.0775305, + "epoch": 0.29626779530588687, + "flos": 609877237248.0, + "grad_norm": 0.06984070899888078, + "language_loss": 0.84251219, + "learning_rate": 0.0008253277055381241, + "loss": 0.85375053, + "num_input_tokens_seen": 127008928, + "router_z_loss_mlp": 0.46313477, + "step": 1540, + "time_per_iteration": 2.7936105728149414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126968, + "balance_loss_mlp": 1.08383858, + "epoch": 0.29646017699115046, + "flos": 867430674432.0, + "grad_norm": 0.09213105437911238, + "language_loss": 0.86479163, + "learning_rate": 0.0008250910655456658, + "loss": 0.87606132, + "num_input_tokens_seen": 127097104, + "router_z_loss_mlp": 0.43115234, + "step": 1541, + "time_per_iteration": 3.119706392288208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141818, + "balance_loss_mlp": 1.09723353, + "epoch": 0.296652558676414, + "flos": 495868594176.0, + "grad_norm": 0.06264221574110865, + "language_loss": 0.84348595, + "learning_rate": 0.0008248542993416625, + "loss": 0.85490412, + "num_input_tokens_seen": 127165264, + "router_z_loss_mlp": 0.44628906, + "step": 1542, + "time_per_iteration": 2.6273162364959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136977, + "balance_loss_mlp": 1.09224987, + "epoch": 0.2968449403616776, + "flos": 571544957952.0, + "grad_norm": 0.062187844768518095, + "language_loss": 0.838552, + "learning_rate": 0.0008246174070180352, + "loss": 0.84992176, + "num_input_tokens_seen": 127238992, + "router_z_loss_mlp": 0.44702148, + "step": 1543, + "time_per_iteration": 2.6559441089630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155532, + "balance_loss_mlp": 1.11099529, + "epoch": 0.2970373220469411, + "flos": 794168271360.0, + "grad_norm": 0.09249403217806111, + "language_loss": 0.84424686, + "learning_rate": 0.0008243803886667537, + "loss": 0.85580218, + "num_input_tokens_seen": 127328160, + "router_z_loss_mlp": 0.44506836, + "step": 1544, + "time_per_iteration": 3.161595582962036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155762, + "balance_loss_mlp": 1.11196482, + "epoch": 0.2972297037322047, + "flos": 661038091776.0, + "grad_norm": 0.11473976054569617, + "language_loss": 0.79569989, + "learning_rate": 0.0008241432443798364, + "loss": 0.80725753, + "num_input_tokens_seen": 127407328, + "router_z_loss_mlp": 0.43774414, + "step": 1545, + "time_per_iteration": 2.8056137561798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154045, + "balance_loss_mlp": 1.11160624, + "epoch": 0.29742208541746823, + "flos": 597125385216.0, + "grad_norm": 0.05050947415994233, + "language_loss": 0.86053026, + "learning_rate": 0.0008239059742493512, + "loss": 0.87207067, + "num_input_tokens_seen": 127477136, + "router_z_loss_mlp": 0.42456055, + "step": 1546, + "time_per_iteration": 2.6890687942504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146751, + "balance_loss_mlp": 1.10383546, + "epoch": 0.2976144671027318, + "flos": 769882558464.0, + "grad_norm": 0.060404475813103174, + "language_loss": 0.87675822, + "learning_rate": 0.0008236685783674142, + "loss": 0.88822567, + "num_input_tokens_seen": 127565680, + "router_z_loss_mlp": 0.42944336, + "step": 1547, + "time_per_iteration": 3.0594639778137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176135, + "balance_loss_mlp": 1.15439153, + "epoch": 0.2978068487879954, + "flos": 1484764162560.0, + "grad_norm": 0.05730794129930028, + "language_loss": 0.76221192, + "learning_rate": 0.0008234310568261911, + "loss": 0.77397329, + "num_input_tokens_seen": 127791584, + "router_z_loss_mlp": 0.21777344, + "step": 1548, + "time_per_iteration": 4.907459020614624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115635, + "balance_loss_mlp": 1.11174202, + "epoch": 0.29799923047325894, + "flos": 475328632320.0, + "grad_norm": 0.08902597202075696, + "language_loss": 0.82813615, + "learning_rate": 0.0008231934097178955, + "loss": 0.83969963, + "num_input_tokens_seen": 127860112, + "router_z_loss_mlp": 0.44604492, + "step": 1549, + "time_per_iteration": 2.622082471847534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147675, + "balance_loss_mlp": 1.1013267, + "epoch": 0.2981916121585225, + "flos": 759804198912.0, + "grad_norm": 0.06733871211748228, + "language_loss": 0.85700476, + "learning_rate": 0.0008229556371347903, + "loss": 0.86848152, + "num_input_tokens_seen": 127938752, + "router_z_loss_mlp": 0.46362305, + "step": 1550, + "time_per_iteration": 3.0081942081451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133769, + "balance_loss_mlp": 1.09018564, + "epoch": 0.29838399384378606, + "flos": 875016152064.0, + "grad_norm": 0.09176779567237862, + "language_loss": 0.79384351, + "learning_rate": 0.0008227177391691874, + "loss": 0.80518115, + "num_input_tokens_seen": 128022192, + "router_z_loss_mlp": 0.43554688, + "step": 1551, + "time_per_iteration": 3.1698648929595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126053, + "balance_loss_mlp": 1.08218408, + "epoch": 0.29857637552904964, + "flos": 579661608960.0, + "grad_norm": 0.07033401560901072, + "language_loss": 0.89799201, + "learning_rate": 0.0008224797159134463, + "loss": 0.90925252, + "num_input_tokens_seen": 128097776, + "router_z_loss_mlp": 0.4387207, + "step": 1552, + "time_per_iteration": 2.714494228363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118143, + "balance_loss_mlp": 1.07816052, + "epoch": 0.2987687572143132, + "flos": 836399748096.0, + "grad_norm": 0.05144631995573129, + "language_loss": 0.83942962, + "learning_rate": 0.0008222415674599765, + "loss": 0.85061103, + "num_input_tokens_seen": 128179888, + "router_z_loss_mlp": 0.39990234, + "step": 1553, + "time_per_iteration": 3.0642828941345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130247, + "balance_loss_mlp": 1.08563888, + "epoch": 0.29896113889957676, + "flos": 567072741888.0, + "grad_norm": 0.07574846124683007, + "language_loss": 0.83871847, + "learning_rate": 0.0008220032939012349, + "loss": 0.85002089, + "num_input_tokens_seen": 128251152, + "router_z_loss_mlp": 0.44628906, + "step": 1554, + "time_per_iteration": 2.714172840118408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129637, + "balance_loss_mlp": 1.08810425, + "epoch": 0.29915352058484035, + "flos": 498662853120.0, + "grad_norm": 0.05026836342639273, + "language_loss": 0.8851645, + "learning_rate": 0.0008217648953297277, + "loss": 0.89646089, + "num_input_tokens_seen": 128327600, + "router_z_loss_mlp": 0.41503906, + "step": 1555, + "time_per_iteration": 2.8413305282592773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139651, + "balance_loss_mlp": 1.09692693, + "epoch": 0.2993459022701039, + "flos": 592112083968.0, + "grad_norm": 0.07726233455877282, + "language_loss": 0.78621179, + "learning_rate": 0.0008215263718380095, + "loss": 0.79760832, + "num_input_tokens_seen": 128398432, + "router_z_loss_mlp": 0.42749023, + "step": 1556, + "time_per_iteration": 2.6995439529418945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153013, + "balance_loss_mlp": 1.10766625, + "epoch": 0.29953828395536747, + "flos": 572380079616.0, + "grad_norm": 0.07367356569931041, + "language_loss": 0.8461448, + "learning_rate": 0.0008212877235186833, + "loss": 0.85767496, + "num_input_tokens_seen": 128469696, + "router_z_loss_mlp": 0.45361328, + "step": 1557, + "time_per_iteration": 2.655294895172119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105489, + "balance_loss_mlp": 1.09290004, + "epoch": 0.299730665640631, + "flos": 1504698425856.0, + "grad_norm": 0.039126881386902713, + "language_loss": 0.77737558, + "learning_rate": 0.0008210489504644005, + "loss": 0.78843045, + "num_input_tokens_seen": 128698560, + "router_z_loss_mlp": 0.12597656, + "step": 1558, + "time_per_iteration": 4.953773021697998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148338, + "balance_loss_mlp": 1.10647154, + "epoch": 0.2999230473258946, + "flos": 513791963136.0, + "grad_norm": 0.07045252665170362, + "language_loss": 0.81300378, + "learning_rate": 0.0008208100527678611, + "loss": 0.82448721, + "num_input_tokens_seen": 128765952, + "router_z_loss_mlp": 0.41870117, + "step": 1559, + "time_per_iteration": 2.5706257820129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142691, + "balance_loss_mlp": 1.10223174, + "epoch": 0.3001154290111581, + "flos": 834472544256.0, + "grad_norm": 0.09371754463761041, + "language_loss": 0.79173958, + "learning_rate": 0.0008205710305218135, + "loss": 0.80316657, + "num_input_tokens_seen": 128840048, + "router_z_loss_mlp": 0.40454102, + "step": 1560, + "time_per_iteration": 3.001490354537964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152428, + "balance_loss_mlp": 1.11292171, + "epoch": 0.3003078106964217, + "flos": 556776695808.0, + "grad_norm": 0.06044421333553386, + "language_loss": 0.90459639, + "learning_rate": 0.0008203318838190541, + "loss": 0.91612065, + "num_input_tokens_seen": 128912496, + "router_z_loss_mlp": 0.39501953, + "step": 1561, + "time_per_iteration": 2.753243923187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166566, + "balance_loss_mlp": 1.1229353, + "epoch": 0.30050019238168524, + "flos": 526151033856.0, + "grad_norm": 0.07449479195038491, + "language_loss": 0.85542631, + "learning_rate": 0.0008200926127524281, + "loss": 0.86709195, + "num_input_tokens_seen": 128980624, + "router_z_loss_mlp": 0.43676758, + "step": 1562, + "time_per_iteration": 2.6388282775878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184921, + "balance_loss_mlp": 1.14045644, + "epoch": 0.3006925740669488, + "flos": 577852973568.0, + "grad_norm": 0.07268784417656445, + "language_loss": 0.83160597, + "learning_rate": 0.0008198532174148289, + "loss": 0.8434552, + "num_input_tokens_seen": 129050576, + "router_z_loss_mlp": 0.44482422, + "step": 1563, + "time_per_iteration": 2.71712589263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076623, + "balance_loss_mlp": 1.06308043, + "epoch": 0.3008849557522124, + "flos": 1490246595072.0, + "grad_norm": 0.03416296623034226, + "language_loss": 0.8068617, + "learning_rate": 0.0008196136978991977, + "loss": 0.81762791, + "num_input_tokens_seen": 129278880, + "router_z_loss_mlp": 0.13574219, + "step": 1564, + "time_per_iteration": 4.830719232559204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194058, + "balance_loss_mlp": 1.15185785, + "epoch": 0.30107733743747594, + "flos": 509816415744.0, + "grad_norm": 0.08914748552149089, + "language_loss": 0.88889605, + "learning_rate": 0.0008193740542985244, + "loss": 0.90083665, + "num_input_tokens_seen": 129346560, + "router_z_loss_mlp": 0.421875, + "step": 1565, + "time_per_iteration": 2.6047041416168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199035, + "balance_loss_mlp": 1.15647733, + "epoch": 0.30126971912273953, + "flos": 587704108032.0, + "grad_norm": 0.07863054385005203, + "language_loss": 0.8685202, + "learning_rate": 0.0008191342867058467, + "loss": 0.88051057, + "num_input_tokens_seen": 129420448, + "router_z_loss_mlp": 0.42578125, + "step": 1566, + "time_per_iteration": 2.715708017349243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196816, + "balance_loss_mlp": 1.15280378, + "epoch": 0.30146210080800306, + "flos": 602101610496.0, + "grad_norm": 0.087093537774187, + "language_loss": 0.83839655, + "learning_rate": 0.0008188943952142509, + "loss": 0.85036469, + "num_input_tokens_seen": 129494032, + "router_z_loss_mlp": 0.43994141, + "step": 1567, + "time_per_iteration": 2.831888198852539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118972, + "balance_loss_mlp": 1.14663815, + "epoch": 0.30165448249326665, + "flos": 917796054528.0, + "grad_norm": 0.09637975850341399, + "language_loss": 0.82476509, + "learning_rate": 0.0008186543799168711, + "loss": 0.83666229, + "num_input_tokens_seen": 129569088, + "router_z_loss_mlp": 0.43041992, + "step": 1568, + "time_per_iteration": 3.121755599975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177064, + "balance_loss_mlp": 1.13324285, + "epoch": 0.3018468641785302, + "flos": 777287798784.0, + "grad_norm": 0.08024736909630528, + "language_loss": 0.88665748, + "learning_rate": 0.0008184142409068892, + "loss": 0.89842814, + "num_input_tokens_seen": 129647968, + "router_z_loss_mlp": 0.43847656, + "step": 1569, + "time_per_iteration": 2.990497350692749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163968, + "balance_loss_mlp": 1.12343669, + "epoch": 0.30203924586379377, + "flos": 522358295040.0, + "grad_norm": 0.05684047424393967, + "language_loss": 0.86850333, + "learning_rate": 0.000818173978277536, + "loss": 0.88014305, + "num_input_tokens_seen": 129718928, + "router_z_loss_mlp": 0.40551758, + "step": 1570, + "time_per_iteration": 2.636310338973999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171599, + "balance_loss_mlp": 1.12956595, + "epoch": 0.3022316275490573, + "flos": 524559711744.0, + "grad_norm": 0.07636807389642969, + "language_loss": 0.84349716, + "learning_rate": 0.000817933592122089, + "loss": 0.85521317, + "num_input_tokens_seen": 129790128, + "router_z_loss_mlp": 0.4206543, + "step": 1571, + "time_per_iteration": 2.699178695678711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163998, + "balance_loss_mlp": 1.11984301, + "epoch": 0.3024240092343209, + "flos": 479912076288.0, + "grad_norm": 0.07546742874281152, + "language_loss": 0.83585215, + "learning_rate": 0.0008176930825338749, + "loss": 0.8474921, + "num_input_tokens_seen": 129857536, + "router_z_loss_mlp": 0.44189453, + "step": 1572, + "time_per_iteration": 2.550837516784668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166441, + "balance_loss_mlp": 1.12385964, + "epoch": 0.3026163909195845, + "flos": 687206592000.0, + "grad_norm": 0.07092433148156627, + "language_loss": 0.89086282, + "learning_rate": 0.0008174524496062679, + "loss": 0.90252721, + "num_input_tokens_seen": 129931440, + "router_z_loss_mlp": 0.42578125, + "step": 1573, + "time_per_iteration": 2.883683919906616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116421, + "balance_loss_mlp": 1.11907697, + "epoch": 0.302808772604848, + "flos": 542940102144.0, + "grad_norm": 0.061103918995996154, + "language_loss": 0.8587321, + "learning_rate": 0.0008172116934326894, + "loss": 0.8703742, + "num_input_tokens_seen": 130005200, + "router_z_loss_mlp": 0.45092773, + "step": 1574, + "time_per_iteration": 2.7379467487335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162954, + "balance_loss_mlp": 1.12132585, + "epoch": 0.3030011542901116, + "flos": 475091495424.0, + "grad_norm": 0.07023429776023385, + "language_loss": 0.87709713, + "learning_rate": 0.0008169708141066097, + "loss": 0.88872665, + "num_input_tokens_seen": 130069136, + "router_z_loss_mlp": 0.41625977, + "step": 1575, + "time_per_iteration": 2.571963310241699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154168, + "balance_loss_mlp": 1.11435199, + "epoch": 0.30319353597537513, + "flos": 481481003520.0, + "grad_norm": 0.11601472076904104, + "language_loss": 0.90864658, + "learning_rate": 0.0008167298117215465, + "loss": 0.92018831, + "num_input_tokens_seen": 130135456, + "router_z_loss_mlp": 0.39819336, + "step": 1576, + "time_per_iteration": 2.562636375427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153517, + "balance_loss_mlp": 1.11141217, + "epoch": 0.3033859176606387, + "flos": 704786365440.0, + "grad_norm": 0.08960201833145559, + "language_loss": 0.88355744, + "learning_rate": 0.0008164886863710649, + "loss": 0.89509267, + "num_input_tokens_seen": 130213712, + "router_z_loss_mlp": 0.42138672, + "step": 1577, + "time_per_iteration": 2.921163320541382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151824, + "balance_loss_mlp": 1.11212754, + "epoch": 0.30357829934590225, + "flos": 764696360448.0, + "grad_norm": 0.07034131144929774, + "language_loss": 0.86199445, + "learning_rate": 0.0008162474381487783, + "loss": 0.87351274, + "num_input_tokens_seen": 130290928, + "router_z_loss_mlp": 0.39697266, + "step": 1578, + "time_per_iteration": 3.029076337814331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152316, + "balance_loss_mlp": 1.11016417, + "epoch": 0.30377068103116583, + "flos": 532355162112.0, + "grad_norm": 0.07584256466560314, + "language_loss": 0.85196549, + "learning_rate": 0.0008160060671483475, + "loss": 0.86348867, + "num_input_tokens_seen": 130362672, + "router_z_loss_mlp": 0.42163086, + "step": 1579, + "time_per_iteration": 2.7073986530303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142614, + "balance_loss_mlp": 1.10289371, + "epoch": 0.3039630627164294, + "flos": 510191944704.0, + "grad_norm": 0.08686038732079729, + "language_loss": 0.83729678, + "learning_rate": 0.0008157645734634809, + "loss": 0.84872293, + "num_input_tokens_seen": 130428848, + "router_z_loss_mlp": 0.3972168, + "step": 1580, + "time_per_iteration": 2.6613049507141113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090857, + "balance_loss_mlp": 1.07302368, + "epoch": 0.30415544440169295, + "flos": 1506000854016.0, + "grad_norm": 0.0332286598930082, + "language_loss": 0.76896489, + "learning_rate": 0.000815522957187935, + "loss": 0.77987349, + "num_input_tokens_seen": 130665440, + "router_z_loss_mlp": 0.17871094, + "step": 1581, + "time_per_iteration": 4.915473699569702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074598, + "balance_loss_mlp": 1.05705047, + "epoch": 0.30434782608695654, + "flos": 1458736625664.0, + "grad_norm": 0.028649014265593315, + "language_loss": 0.73214495, + "learning_rate": 0.0008152812184155132, + "loss": 0.74289095, + "num_input_tokens_seen": 130895248, + "router_z_loss_mlp": 0.17578125, + "step": 1582, + "time_per_iteration": 4.889309883117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129405, + "balance_loss_mlp": 1.08827806, + "epoch": 0.3045402077722201, + "flos": 482555833344.0, + "grad_norm": 0.06812522797045092, + "language_loss": 0.84052569, + "learning_rate": 0.000815039357240067, + "loss": 0.85181975, + "num_input_tokens_seen": 130964544, + "router_z_loss_mlp": 0.41113281, + "step": 1583, + "time_per_iteration": 2.6366286277770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138467, + "balance_loss_mlp": 1.09672034, + "epoch": 0.30473258945748366, + "flos": 543501010944.0, + "grad_norm": 0.06492424308297744, + "language_loss": 0.85869169, + "learning_rate": 0.0008147973737554952, + "loss": 0.87007636, + "num_input_tokens_seen": 131041744, + "router_z_loss_mlp": 0.41748047, + "step": 1584, + "time_per_iteration": 2.7854599952697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136804, + "balance_loss_mlp": 1.095963, + "epoch": 0.3049249711427472, + "flos": 567055489536.0, + "grad_norm": 0.08202571879527615, + "language_loss": 0.86834013, + "learning_rate": 0.000814555268055744, + "loss": 0.87970817, + "num_input_tokens_seen": 131108864, + "router_z_loss_mlp": 0.40844727, + "step": 1585, + "time_per_iteration": 2.6199045181274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132861, + "balance_loss_mlp": 1.09130502, + "epoch": 0.3051173528280108, + "flos": 528233882112.0, + "grad_norm": 0.07393752668393892, + "language_loss": 0.87929702, + "learning_rate": 0.0008143130402348073, + "loss": 0.89062566, + "num_input_tokens_seen": 131181104, + "router_z_loss_mlp": 0.41625977, + "step": 1586, + "time_per_iteration": 2.638741970062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129239, + "balance_loss_mlp": 1.08868384, + "epoch": 0.3053097345132743, + "flos": 586396910592.0, + "grad_norm": 0.06849121050203105, + "language_loss": 0.7939502, + "learning_rate": 0.0008140706903867265, + "loss": 0.80524254, + "num_input_tokens_seen": 131258704, + "router_z_loss_mlp": 0.4050293, + "step": 1587, + "time_per_iteration": 2.810335874557495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134042, + "balance_loss_mlp": 1.0908649, + "epoch": 0.3055021161985379, + "flos": 607087747584.0, + "grad_norm": 0.07851663365650921, + "language_loss": 0.91122121, + "learning_rate": 0.0008138282186055897, + "loss": 0.92256165, + "num_input_tokens_seen": 131325712, + "router_z_loss_mlp": 0.43188477, + "step": 1588, + "time_per_iteration": 2.7237448692321777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137411, + "balance_loss_mlp": 1.09661722, + "epoch": 0.3056944978838015, + "flos": 573867514368.0, + "grad_norm": 0.06832590097240848, + "language_loss": 0.8307212, + "learning_rate": 0.0008135856249855331, + "loss": 0.84209532, + "num_input_tokens_seen": 131397568, + "router_z_loss_mlp": 0.40771484, + "step": 1589, + "time_per_iteration": 2.7399301528930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153972, + "balance_loss_mlp": 1.11241579, + "epoch": 0.305886879569065, + "flos": 633925813248.0, + "grad_norm": 0.09162978556143483, + "language_loss": 0.89933717, + "learning_rate": 0.0008133429096207398, + "loss": 0.91087687, + "num_input_tokens_seen": 131467632, + "router_z_loss_mlp": 0.41577148, + "step": 1590, + "time_per_iteration": 2.8074302673339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01029225, + "balance_loss_mlp": 1.0156827, + "epoch": 0.3060792612543286, + "flos": 1369005981696.0, + "grad_norm": 0.025543227678258826, + "language_loss": 0.75312257, + "learning_rate": 0.0008131000726054403, + "loss": 0.76341486, + "num_input_tokens_seen": 131702224, + "router_z_loss_mlp": 0.13574219, + "step": 1591, + "time_per_iteration": 4.961095094680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153411, + "balance_loss_mlp": 1.11330891, + "epoch": 0.30627164293959214, + "flos": 518555644416.0, + "grad_norm": 0.05628096053427355, + "language_loss": 0.87358719, + "learning_rate": 0.0008128571140339123, + "loss": 0.88512129, + "num_input_tokens_seen": 131774608, + "router_z_loss_mlp": 0.40087891, + "step": 1592, + "time_per_iteration": 2.6484899520874023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137482, + "balance_loss_mlp": 1.09497237, + "epoch": 0.3064640246248557, + "flos": 455589287424.0, + "grad_norm": 0.058132540851188214, + "language_loss": 0.87688839, + "learning_rate": 0.0008126140340004805, + "loss": 0.88826323, + "num_input_tokens_seen": 131841216, + "router_z_loss_mlp": 0.42529297, + "step": 1593, + "time_per_iteration": 2.509239912033081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144438, + "balance_loss_mlp": 1.10316801, + "epoch": 0.30665640631011926, + "flos": 850095378432.0, + "grad_norm": 0.06371804566889869, + "language_loss": 0.82466245, + "learning_rate": 0.0008123708325995172, + "loss": 0.83610678, + "num_input_tokens_seen": 131937584, + "router_z_loss_mlp": 0.4128418, + "step": 1594, + "time_per_iteration": 3.1773130893707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133345, + "balance_loss_mlp": 1.09240818, + "epoch": 0.30684878799538284, + "flos": 758319335424.0, + "grad_norm": 0.06060698504548286, + "language_loss": 0.79972136, + "learning_rate": 0.0008121275099254414, + "loss": 0.81105477, + "num_input_tokens_seen": 132012656, + "router_z_loss_mlp": 0.40942383, + "step": 1595, + "time_per_iteration": 2.9426517486572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142693, + "balance_loss_mlp": 1.10244751, + "epoch": 0.3070411696806464, + "flos": 517574790144.0, + "grad_norm": 0.06149446857353131, + "language_loss": 0.88748306, + "learning_rate": 0.0008118840660727194, + "loss": 0.89890993, + "num_input_tokens_seen": 132083728, + "router_z_loss_mlp": 0.40283203, + "step": 1596, + "time_per_iteration": 2.665166139602661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135904, + "balance_loss_mlp": 1.09553957, + "epoch": 0.30723355136590996, + "flos": 844264207872.0, + "grad_norm": 0.15751252363629464, + "language_loss": 0.88104224, + "learning_rate": 0.0008116405011358644, + "loss": 0.89240128, + "num_input_tokens_seen": 132170896, + "router_z_loss_mlp": 0.40380859, + "step": 1597, + "time_per_iteration": 3.1415486335754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145189, + "balance_loss_mlp": 1.10291696, + "epoch": 0.30742593305117355, + "flos": 466139722752.0, + "grad_norm": 0.06428245482632208, + "language_loss": 0.80117774, + "learning_rate": 0.0008113968152094369, + "loss": 0.81262958, + "num_input_tokens_seen": 132234592, + "router_z_loss_mlp": 0.42285156, + "step": 1598, + "time_per_iteration": 2.50484037399292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140725, + "balance_loss_mlp": 1.09781003, + "epoch": 0.3076183147364371, + "flos": 686591354880.0, + "grad_norm": 0.069373282908973, + "language_loss": 0.82692802, + "learning_rate": 0.0008111530083880438, + "loss": 0.83833528, + "num_input_tokens_seen": 132314720, + "router_z_loss_mlp": 0.42895508, + "step": 1599, + "time_per_iteration": 2.9072136878967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155699, + "balance_loss_mlp": 1.11211586, + "epoch": 0.30781069642170067, + "flos": 614018340864.0, + "grad_norm": 0.09326308305844169, + "language_loss": 0.86715603, + "learning_rate": 0.0008109090807663399, + "loss": 0.87871301, + "num_input_tokens_seen": 132388768, + "router_z_loss_mlp": 0.43554688, + "step": 1600, + "time_per_iteration": 2.8556277751922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154517, + "balance_loss_mlp": 1.1142, + "epoch": 0.3080030781069642, + "flos": 590318129664.0, + "grad_norm": 0.07163974647376076, + "language_loss": 0.89029115, + "learning_rate": 0.0008106650324390257, + "loss": 0.90183634, + "num_input_tokens_seen": 132472544, + "router_z_loss_mlp": 0.40307617, + "step": 1601, + "time_per_iteration": 2.8016483783721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115055, + "balance_loss_mlp": 1.10768259, + "epoch": 0.3081954597922278, + "flos": 562620349440.0, + "grad_norm": 0.06437682840273379, + "language_loss": 0.81480461, + "learning_rate": 0.0008104208635008493, + "loss": 0.82631016, + "num_input_tokens_seen": 132541968, + "router_z_loss_mlp": 0.42871094, + "step": 1602, + "time_per_iteration": 2.6587209701538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150496, + "balance_loss_mlp": 1.10631728, + "epoch": 0.3083878414774913, + "flos": 447830913024.0, + "grad_norm": 0.13502170342564263, + "language_loss": 0.8243258, + "learning_rate": 0.0008101765740466058, + "loss": 0.83583081, + "num_input_tokens_seen": 132606976, + "router_z_loss_mlp": 0.44165039, + "step": 1603, + "time_per_iteration": 2.506427049636841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144916, + "balance_loss_mlp": 1.10135674, + "epoch": 0.3085802231627549, + "flos": 493546037760.0, + "grad_norm": 0.0649160929519563, + "language_loss": 0.84340334, + "learning_rate": 0.0008099321641711364, + "loss": 0.85485256, + "num_input_tokens_seen": 132677984, + "router_z_loss_mlp": 0.43579102, + "step": 1604, + "time_per_iteration": 2.6318166255950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151756, + "balance_loss_mlp": 1.10938883, + "epoch": 0.3087726048480185, + "flos": 487687703040.0, + "grad_norm": 0.0523010874933109, + "language_loss": 0.83940029, + "learning_rate": 0.0008096876339693295, + "loss": 0.85091782, + "num_input_tokens_seen": 132749136, + "router_z_loss_mlp": 0.42407227, + "step": 1605, + "time_per_iteration": 2.620199680328369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150228, + "balance_loss_mlp": 1.1086241, + "epoch": 0.308964986533282, + "flos": 730589248512.0, + "grad_norm": 0.07539888612246932, + "language_loss": 0.8184768, + "learning_rate": 0.0008094429835361206, + "loss": 0.82997912, + "num_input_tokens_seen": 132823824, + "router_z_loss_mlp": 0.41625977, + "step": 1606, + "time_per_iteration": 2.9251575469970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147276, + "balance_loss_mlp": 1.10679281, + "epoch": 0.3091573682185456, + "flos": 605407592448.0, + "grad_norm": 0.07700051037162058, + "language_loss": 0.85932112, + "learning_rate": 0.0008091982129664908, + "loss": 0.87079388, + "num_input_tokens_seen": 132895936, + "router_z_loss_mlp": 0.40478516, + "step": 1607, + "time_per_iteration": 2.7032129764556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169169, + "balance_loss_mlp": 1.12427497, + "epoch": 0.30934974990380915, + "flos": 460325804544.0, + "grad_norm": 0.11394505928871175, + "language_loss": 0.83292013, + "learning_rate": 0.0008089533223554687, + "loss": 0.84461182, + "num_input_tokens_seen": 132968960, + "router_z_loss_mlp": 0.44897461, + "step": 1608, + "time_per_iteration": 2.6975207328796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161949, + "balance_loss_mlp": 1.12115526, + "epoch": 0.30954213158907273, + "flos": 553426297344.0, + "grad_norm": 0.06275490202685644, + "language_loss": 0.85402906, + "learning_rate": 0.0008087083117981294, + "loss": 0.86564851, + "num_input_tokens_seen": 133048448, + "router_z_loss_mlp": 0.40795898, + "step": 1609, + "time_per_iteration": 2.8709142208099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158469, + "balance_loss_mlp": 1.11402774, + "epoch": 0.30973451327433627, + "flos": 553043427840.0, + "grad_norm": 0.06357956742359384, + "language_loss": 0.88521934, + "learning_rate": 0.0008084631813895943, + "loss": 0.89680409, + "num_input_tokens_seen": 133121680, + "router_z_loss_mlp": 0.44433594, + "step": 1610, + "time_per_iteration": 2.7704904079437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148821, + "balance_loss_mlp": 1.1059773, + "epoch": 0.30992689495959985, + "flos": 565696535040.0, + "grad_norm": 0.07818022356789546, + "language_loss": 0.84349322, + "learning_rate": 0.0008082179312250315, + "loss": 0.85498142, + "num_input_tokens_seen": 133190176, + "router_z_loss_mlp": 0.42871094, + "step": 1611, + "time_per_iteration": 2.6352171897888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118188, + "balance_loss_mlp": 1.10588562, + "epoch": 0.3101192766448634, + "flos": 1442406776832.0, + "grad_norm": 0.03204939869531237, + "language_loss": 0.79855847, + "learning_rate": 0.0008079725613996555, + "loss": 0.8097403, + "num_input_tokens_seen": 133420512, + "router_z_loss_mlp": 0.12255859, + "step": 1612, + "time_per_iteration": 4.865812301635742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095093, + "balance_loss_mlp": 1.08288634, + "epoch": 0.31031165833012697, + "flos": 1531892570112.0, + "grad_norm": 0.024031397097536, + "language_loss": 0.76629329, + "learning_rate": 0.0008077270720087273, + "loss": 0.77724421, + "num_input_tokens_seen": 133651984, + "router_z_loss_mlp": 0.12207031, + "step": 1613, + "time_per_iteration": 5.057459831237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163336, + "balance_loss_mlp": 1.12020612, + "epoch": 0.31050404001539056, + "flos": 991952676864.0, + "grad_norm": 0.056757119691581794, + "language_loss": 0.82255232, + "learning_rate": 0.0008074814631475545, + "loss": 0.83418566, + "num_input_tokens_seen": 133741648, + "router_z_loss_mlp": 0.43139648, + "step": 1614, + "time_per_iteration": 3.3026204109191895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164621, + "balance_loss_mlp": 1.12153852, + "epoch": 0.3106964217006541, + "flos": 445992542208.0, + "grad_norm": 0.0685570598787085, + "language_loss": 0.79806983, + "learning_rate": 0.0008072357349114907, + "loss": 0.80971605, + "num_input_tokens_seen": 133813344, + "router_z_loss_mlp": 0.4309082, + "step": 1615, + "time_per_iteration": 2.663853645324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187873, + "balance_loss_mlp": 1.14369345, + "epoch": 0.3108888033859177, + "flos": 510505804800.0, + "grad_norm": 0.06371446427292905, + "language_loss": 0.8904891, + "learning_rate": 0.0008069898873959363, + "loss": 0.90236783, + "num_input_tokens_seen": 133884192, + "router_z_loss_mlp": 0.44189453, + "step": 1616, + "time_per_iteration": 2.675607919692993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199097, + "balance_loss_mlp": 1.15773141, + "epoch": 0.3110811850711812, + "flos": 520732468224.0, + "grad_norm": 0.10138062428343411, + "language_loss": 0.8626408, + "learning_rate": 0.0008067439206963375, + "loss": 0.87463176, + "num_input_tokens_seen": 133954848, + "router_z_loss_mlp": 0.41381836, + "step": 1617, + "time_per_iteration": 2.6264841556549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193178, + "balance_loss_mlp": 1.15119278, + "epoch": 0.3112735667564448, + "flos": 686413315584.0, + "grad_norm": 0.06654120721966555, + "language_loss": 0.8650856, + "learning_rate": 0.0008064978349081873, + "loss": 0.87701744, + "num_input_tokens_seen": 134031824, + "router_z_loss_mlp": 0.41967773, + "step": 1618, + "time_per_iteration": 2.9114232063293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180658, + "balance_loss_mlp": 1.13712287, + "epoch": 0.31146594844170833, + "flos": 533061803520.0, + "grad_norm": 0.06279818174684408, + "language_loss": 0.86905777, + "learning_rate": 0.0008062516301270245, + "loss": 0.88086432, + "num_input_tokens_seen": 134104480, + "router_z_loss_mlp": 0.43530273, + "step": 1619, + "time_per_iteration": 2.697016477584839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174783, + "balance_loss_mlp": 1.13341749, + "epoch": 0.3116583301269719, + "flos": 679517227008.0, + "grad_norm": 0.07259268941115717, + "language_loss": 0.89074606, + "learning_rate": 0.0008060053064484343, + "loss": 0.90249389, + "num_input_tokens_seen": 134185632, + "router_z_loss_mlp": 0.41381836, + "step": 1620, + "time_per_iteration": 2.9220941066741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160759, + "balance_loss_mlp": 1.11996579, + "epoch": 0.31185071181223545, + "flos": 586149861888.0, + "grad_norm": 0.054906942105454146, + "language_loss": 0.85286081, + "learning_rate": 0.0008057588639680482, + "loss": 0.8644684, + "num_input_tokens_seen": 134261600, + "router_z_loss_mlp": 0.40795898, + "step": 1621, + "time_per_iteration": 2.7432475090026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161698, + "balance_loss_mlp": 1.11754274, + "epoch": 0.31204309349749904, + "flos": 725403050496.0, + "grad_norm": 0.08428579582226577, + "language_loss": 0.83045304, + "learning_rate": 0.0008055123027815434, + "loss": 0.84207004, + "num_input_tokens_seen": 134334368, + "router_z_loss_mlp": 0.44165039, + "step": 1622, + "time_per_iteration": 2.888124465942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149406, + "balance_loss_mlp": 1.10947073, + "epoch": 0.3122354751827626, + "flos": 576825131520.0, + "grad_norm": 0.06442378780427988, + "language_loss": 0.85635763, + "learning_rate": 0.0008052656229846436, + "loss": 0.86785173, + "num_input_tokens_seen": 134403824, + "router_z_loss_mlp": 0.39916992, + "step": 1623, + "time_per_iteration": 2.7215354442596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154937, + "balance_loss_mlp": 1.11259365, + "epoch": 0.31242785686802615, + "flos": 575943022080.0, + "grad_norm": 0.1013205930173775, + "language_loss": 0.90875685, + "learning_rate": 0.0008050188246731182, + "loss": 0.92030621, + "num_input_tokens_seen": 134471296, + "router_z_loss_mlp": 0.42333984, + "step": 1624, + "time_per_iteration": 2.6636321544647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146822, + "balance_loss_mlp": 1.10655355, + "epoch": 0.31262023855328974, + "flos": 736830452736.0, + "grad_norm": 0.08961406202901398, + "language_loss": 0.82641953, + "learning_rate": 0.0008047719079427834, + "loss": 0.83788776, + "num_input_tokens_seen": 134551360, + "router_z_loss_mlp": 0.40283203, + "step": 1625, + "time_per_iteration": 2.9943442344665527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067888, + "balance_loss_mlp": 1.05425012, + "epoch": 0.3128126202385533, + "flos": 1559232073728.0, + "grad_norm": 0.02225722433359613, + "language_loss": 0.74351704, + "learning_rate": 0.0008045248728895, + "loss": 0.75419593, + "num_input_tokens_seen": 134761328, + "router_z_loss_mlp": 0.13671875, + "step": 1626, + "time_per_iteration": 4.865052700042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124122, + "balance_loss_mlp": 1.0819937, + "epoch": 0.31300500192381686, + "flos": 514921121280.0, + "grad_norm": 0.05828883069087806, + "language_loss": 0.86570215, + "learning_rate": 0.0008042777196091757, + "loss": 0.87694335, + "num_input_tokens_seen": 134833136, + "router_z_loss_mlp": 0.42138672, + "step": 1627, + "time_per_iteration": 2.668349266052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127492, + "balance_loss_mlp": 1.08481538, + "epoch": 0.3131973836090804, + "flos": 526627878912.0, + "grad_norm": 0.08399253674550058, + "language_loss": 0.82332879, + "learning_rate": 0.0008040304481977643, + "loss": 0.83460367, + "num_input_tokens_seen": 134904352, + "router_z_loss_mlp": 0.42675781, + "step": 1628, + "time_per_iteration": 2.6445093154907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130913, + "balance_loss_mlp": 1.09224153, + "epoch": 0.313389765294344, + "flos": 822820114944.0, + "grad_norm": 0.06122809929096989, + "language_loss": 0.86751842, + "learning_rate": 0.0008037830587512649, + "loss": 0.87882763, + "num_input_tokens_seen": 134984880, + "router_z_loss_mlp": 0.38671875, + "step": 1629, + "time_per_iteration": 3.0830209255218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131503, + "balance_loss_mlp": 1.09068549, + "epoch": 0.31358214697960757, + "flos": 393823669248.0, + "grad_norm": 0.06235185724616104, + "language_loss": 0.7940957, + "learning_rate": 0.0008035355513657224, + "loss": 0.80541074, + "num_input_tokens_seen": 135047456, + "router_z_loss_mlp": 0.40820312, + "step": 1630, + "time_per_iteration": 2.4804115295410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135326, + "balance_loss_mlp": 1.09326935, + "epoch": 0.3137745286648711, + "flos": 571908003840.0, + "grad_norm": 0.06249119555349938, + "language_loss": 0.9321425, + "learning_rate": 0.0008032879261372279, + "loss": 0.94349587, + "num_input_tokens_seen": 135124256, + "router_z_loss_mlp": 0.42089844, + "step": 1631, + "time_per_iteration": 2.7995047569274902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0103074, + "balance_loss_mlp": 1.01777005, + "epoch": 0.3139669103501347, + "flos": 1498415376384.0, + "grad_norm": 0.019617221588718974, + "language_loss": 0.79635841, + "learning_rate": 0.0008030401831619178, + "loss": 0.80666578, + "num_input_tokens_seen": 135353024, + "router_z_loss_mlp": 0.12988281, + "step": 1632, + "time_per_iteration": 5.3968565464019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149415, + "balance_loss_mlp": 1.10959888, + "epoch": 0.3141592920353982, + "flos": 525343076352.0, + "grad_norm": 0.05783646939860944, + "language_loss": 0.87576675, + "learning_rate": 0.0008027923225359748, + "loss": 0.88726091, + "num_input_tokens_seen": 135422464, + "router_z_loss_mlp": 0.39819336, + "step": 1633, + "time_per_iteration": 2.5933566093444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153635, + "balance_loss_mlp": 1.11145878, + "epoch": 0.3143516737206618, + "flos": 593268406272.0, + "grad_norm": 0.05944909670445279, + "language_loss": 0.88579285, + "learning_rate": 0.0008025443443556267, + "loss": 0.89732921, + "num_input_tokens_seen": 135490928, + "router_z_loss_mlp": 0.421875, + "step": 1634, + "time_per_iteration": 2.728522777557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149168, + "balance_loss_mlp": 1.109519, + "epoch": 0.31454405540592534, + "flos": 648362589696.0, + "grad_norm": 0.0772983201911997, + "language_loss": 0.88333809, + "learning_rate": 0.000802296248717147, + "loss": 0.89482975, + "num_input_tokens_seen": 135576288, + "router_z_loss_mlp": 0.39648438, + "step": 1635, + "time_per_iteration": 2.9030401706695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140791, + "balance_loss_mlp": 1.0971607, + "epoch": 0.3147364370911889, + "flos": 642847850496.0, + "grad_norm": 0.06629024784700413, + "language_loss": 0.7930302, + "learning_rate": 0.0008020480357168554, + "loss": 0.80443811, + "num_input_tokens_seen": 135652320, + "router_z_loss_mlp": 0.43603516, + "step": 1636, + "time_per_iteration": 2.839134931564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145583, + "balance_loss_mlp": 1.1038121, + "epoch": 0.31492881877645246, + "flos": 471849753600.0, + "grad_norm": 0.06656267016529639, + "language_loss": 0.88396037, + "learning_rate": 0.0008017997054511165, + "loss": 0.89541626, + "num_input_tokens_seen": 135719632, + "router_z_loss_mlp": 0.41796875, + "step": 1637, + "time_per_iteration": 2.5937085151672363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148115, + "balance_loss_mlp": 1.10424566, + "epoch": 0.31512120046171604, + "flos": 629433773568.0, + "grad_norm": 0.06622170213435077, + "language_loss": 0.85649616, + "learning_rate": 0.0008015512580163407, + "loss": 0.86797726, + "num_input_tokens_seen": 135796544, + "router_z_loss_mlp": 0.43896484, + "step": 1638, + "time_per_iteration": 2.8432726860046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138121, + "balance_loss_mlp": 1.09639752, + "epoch": 0.31531358214697963, + "flos": 703778347008.0, + "grad_norm": 0.06676164925493694, + "language_loss": 0.81149763, + "learning_rate": 0.0008013026935089838, + "loss": 0.82287884, + "num_input_tokens_seen": 135871344, + "router_z_loss_mlp": 0.41699219, + "step": 1639, + "time_per_iteration": 2.8703761100769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142857, + "balance_loss_mlp": 1.1031127, + "epoch": 0.31550596383224316, + "flos": 572545635840.0, + "grad_norm": 0.060786667538297263, + "language_loss": 0.84702241, + "learning_rate": 0.0008010540120255472, + "loss": 0.85845095, + "num_input_tokens_seen": 135944320, + "router_z_loss_mlp": 0.3972168, + "step": 1640, + "time_per_iteration": 2.6741273403167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136768, + "balance_loss_mlp": 1.09511614, + "epoch": 0.31569834551750675, + "flos": 658340006400.0, + "grad_norm": 0.06934658167266547, + "language_loss": 0.86723542, + "learning_rate": 0.0008008052136625774, + "loss": 0.8786031, + "num_input_tokens_seen": 136019456, + "router_z_loss_mlp": 0.41650391, + "step": 1641, + "time_per_iteration": 2.8395094871520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135427, + "balance_loss_mlp": 1.09272623, + "epoch": 0.3158907272027703, + "flos": 566282036736.0, + "grad_norm": 0.07613576058544219, + "language_loss": 0.87025082, + "learning_rate": 0.0008005562985166666, + "loss": 0.88160515, + "num_input_tokens_seen": 136091232, + "router_z_loss_mlp": 0.42675781, + "step": 1642, + "time_per_iteration": 2.708812713623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127626, + "balance_loss_mlp": 1.08621287, + "epoch": 0.31608310888803387, + "flos": 536891618304.0, + "grad_norm": 0.05118616143218352, + "language_loss": 0.85440576, + "learning_rate": 0.0008003072666844524, + "loss": 0.86568201, + "num_input_tokens_seen": 136165088, + "router_z_loss_mlp": 0.41430664, + "step": 1643, + "time_per_iteration": 2.74019193649292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127922, + "balance_loss_mlp": 1.08746231, + "epoch": 0.3162754905732974, + "flos": 486669772800.0, + "grad_norm": 0.07457594622010144, + "language_loss": 0.82632107, + "learning_rate": 0.0008000581182626173, + "loss": 0.83760029, + "num_input_tokens_seen": 136230368, + "router_z_loss_mlp": 0.40478516, + "step": 1644, + "time_per_iteration": 2.6125125885009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011327, + "balance_loss_mlp": 1.09159672, + "epoch": 0.316467872258561, + "flos": 530052429312.0, + "grad_norm": 0.0586598658040055, + "language_loss": 0.86714005, + "learning_rate": 0.0007998088533478894, + "loss": 0.87846708, + "num_input_tokens_seen": 136302512, + "router_z_loss_mlp": 0.41137695, + "step": 1645, + "time_per_iteration": 2.674678087234497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130403, + "balance_loss_mlp": 1.08805966, + "epoch": 0.3166602539438245, + "flos": 443440189440.0, + "grad_norm": 0.10428151324619617, + "language_loss": 0.84319067, + "learning_rate": 0.000799559472037042, + "loss": 0.85449469, + "num_input_tokens_seen": 136368064, + "router_z_loss_mlp": 0.4230957, + "step": 1646, + "time_per_iteration": 2.5389983654022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130022, + "balance_loss_mlp": 1.08939528, + "epoch": 0.3168526356290881, + "flos": 645830060544.0, + "grad_norm": 0.05498023868715711, + "language_loss": 0.8798641, + "learning_rate": 0.0007993099744268932, + "loss": 0.8911643, + "num_input_tokens_seen": 136451520, + "router_z_loss_mlp": 0.40625, + "step": 1647, + "time_per_iteration": 2.919410467147827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127448, + "balance_loss_mlp": 1.0858674, + "epoch": 0.3170450173143517, + "flos": 586162344960.0, + "grad_norm": 0.07648109375468225, + "language_loss": 0.88298547, + "learning_rate": 0.000799060360614307, + "loss": 0.89425999, + "num_input_tokens_seen": 136521184, + "router_z_loss_mlp": 0.41577148, + "step": 1648, + "time_per_iteration": 2.679098606109619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132184, + "balance_loss_mlp": 1.09117627, + "epoch": 0.3172373989996152, + "flos": 827124203520.0, + "grad_norm": 0.17676844539598618, + "language_loss": 0.83707428, + "learning_rate": 0.0007988106306961917, + "loss": 0.84839618, + "num_input_tokens_seen": 136612592, + "router_z_loss_mlp": 0.41015625, + "step": 1649, + "time_per_iteration": 3.1304876804351807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139719, + "balance_loss_mlp": 1.09809113, + "epoch": 0.3174297806848788, + "flos": 527408672256.0, + "grad_norm": 0.06731506602110418, + "language_loss": 0.84557772, + "learning_rate": 0.0007985607847695014, + "loss": 0.85697484, + "num_input_tokens_seen": 136684336, + "router_z_loss_mlp": 0.41625977, + "step": 1650, + "time_per_iteration": 2.6152966022491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151307, + "balance_loss_mlp": 1.11087108, + "epoch": 0.31762216237014235, + "flos": 713179800576.0, + "grad_norm": 0.08658277444707524, + "language_loss": 0.83160597, + "learning_rate": 0.0007983108229312345, + "loss": 0.84311903, + "num_input_tokens_seen": 136766400, + "router_z_loss_mlp": 0.40454102, + "step": 1651, + "time_per_iteration": 2.9157605171203613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180085, + "balance_loss_mlp": 1.13864803, + "epoch": 0.31781454405540593, + "flos": 483813471744.0, + "grad_norm": 0.12326743545136284, + "language_loss": 0.86631948, + "learning_rate": 0.0007980607452784351, + "loss": 0.8781203, + "num_input_tokens_seen": 136834016, + "router_z_loss_mlp": 0.4140625, + "step": 1652, + "time_per_iteration": 2.5533528327941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170234, + "balance_loss_mlp": 1.12798643, + "epoch": 0.31800692574066947, + "flos": 548746679808.0, + "grad_norm": 0.07656805667485655, + "language_loss": 0.90550399, + "learning_rate": 0.0007978105519081919, + "loss": 0.91720629, + "num_input_tokens_seen": 136906288, + "router_z_loss_mlp": 0.42236328, + "step": 1653, + "time_per_iteration": 2.683962821960449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162895, + "balance_loss_mlp": 1.12088561, + "epoch": 0.31819930742593305, + "flos": 516895312896.0, + "grad_norm": 0.06859901935764132, + "language_loss": 0.88378012, + "learning_rate": 0.0007975602429176385, + "loss": 0.89540899, + "num_input_tokens_seen": 136972416, + "router_z_loss_mlp": 0.42041016, + "step": 1654, + "time_per_iteration": 2.563507556915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165514, + "balance_loss_mlp": 1.12421989, + "epoch": 0.31839168911119664, + "flos": 455991980544.0, + "grad_norm": 0.07830522948057009, + "language_loss": 0.81779003, + "learning_rate": 0.0007973098184039536, + "loss": 0.82944512, + "num_input_tokens_seen": 137044576, + "router_z_loss_mlp": 0.4128418, + "step": 1655, + "time_per_iteration": 2.6503560543060303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154556, + "balance_loss_mlp": 1.11433494, + "epoch": 0.3185840707964602, + "flos": 626033816064.0, + "grad_norm": 0.07004293098644994, + "language_loss": 0.87212098, + "learning_rate": 0.0007970592784643602, + "loss": 0.88366652, + "num_input_tokens_seen": 137125120, + "router_z_loss_mlp": 0.40185547, + "step": 1656, + "time_per_iteration": 2.8598649501800537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167315, + "balance_loss_mlp": 1.12366056, + "epoch": 0.31877645248172376, + "flos": 567478006272.0, + "grad_norm": 0.08267452239342069, + "language_loss": 0.8563, + "learning_rate": 0.0007968086231961272, + "loss": 0.86797309, + "num_input_tokens_seen": 137195344, + "router_z_loss_mlp": 0.43676758, + "step": 1657, + "time_per_iteration": 2.637216806411743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158327, + "balance_loss_mlp": 1.11343288, + "epoch": 0.3189688341669873, + "flos": 489580402176.0, + "grad_norm": 0.09173012098392071, + "language_loss": 0.83764172, + "learning_rate": 0.0007965578526965671, + "loss": 0.84922498, + "num_input_tokens_seen": 137261040, + "router_z_loss_mlp": 0.44897461, + "step": 1658, + "time_per_iteration": 2.607729911804199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154777, + "balance_loss_mlp": 1.11307764, + "epoch": 0.3191612158522509, + "flos": 576234487296.0, + "grad_norm": 0.08650327787833377, + "language_loss": 0.86397582, + "learning_rate": 0.0007963069670630377, + "loss": 0.87552357, + "num_input_tokens_seen": 137334400, + "router_z_loss_mlp": 0.41723633, + "step": 1659, + "time_per_iteration": 2.7385904788970947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154696, + "balance_loss_mlp": 1.11175728, + "epoch": 0.3193535975375144, + "flos": 538132004352.0, + "grad_norm": 0.06815630012467462, + "language_loss": 0.88107586, + "learning_rate": 0.0007960559663929416, + "loss": 0.89262283, + "num_input_tokens_seen": 137405344, + "router_z_loss_mlp": 0.4296875, + "step": 1660, + "time_per_iteration": 2.696936845779419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155709, + "balance_loss_mlp": 1.11372399, + "epoch": 0.319545979222778, + "flos": 734288011776.0, + "grad_norm": 0.07443207173064395, + "language_loss": 0.8773188, + "learning_rate": 0.0007958048507837259, + "loss": 0.88887584, + "num_input_tokens_seen": 137486016, + "router_z_loss_mlp": 0.41992188, + "step": 1661, + "time_per_iteration": 3.0276992321014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165796, + "balance_loss_mlp": 1.12168884, + "epoch": 0.31973836090804153, + "flos": 764461794816.0, + "grad_norm": 0.07361086812440759, + "language_loss": 0.87900233, + "learning_rate": 0.0007955536203328822, + "loss": 0.89066029, + "num_input_tokens_seen": 137562304, + "router_z_loss_mlp": 0.44116211, + "step": 1662, + "time_per_iteration": 2.9181947708129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167079, + "balance_loss_mlp": 1.12497449, + "epoch": 0.3199307425933051, + "flos": 560549611008.0, + "grad_norm": 0.0536049497981301, + "language_loss": 0.8375597, + "learning_rate": 0.0007953022751379469, + "loss": 0.84923047, + "num_input_tokens_seen": 137639248, + "router_z_loss_mlp": 0.42089844, + "step": 1663, + "time_per_iteration": 2.8502774238586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160364, + "balance_loss_mlp": 1.11749601, + "epoch": 0.3201231242785687, + "flos": 751349094912.0, + "grad_norm": 0.09076105210561375, + "language_loss": 0.82297581, + "learning_rate": 0.000795050815296501, + "loss": 0.83457941, + "num_input_tokens_seen": 137718256, + "router_z_loss_mlp": 0.42871094, + "step": 1664, + "time_per_iteration": 2.990253210067749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149395, + "balance_loss_mlp": 1.10821986, + "epoch": 0.32031550596383224, + "flos": 496402338816.0, + "grad_norm": 0.05392034602485258, + "language_loss": 0.93401325, + "learning_rate": 0.0007947992409061695, + "loss": 0.94550717, + "num_input_tokens_seen": 137785216, + "router_z_loss_mlp": 0.41162109, + "step": 1665, + "time_per_iteration": 2.5734803676605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146101, + "balance_loss_mlp": 1.10456824, + "epoch": 0.3205078876490958, + "flos": 731609750016.0, + "grad_norm": 0.07147454481835314, + "language_loss": 0.86398005, + "learning_rate": 0.0007945475520646226, + "loss": 0.87544107, + "num_input_tokens_seen": 137863424, + "router_z_loss_mlp": 0.4152832, + "step": 1666, + "time_per_iteration": 2.9147067070007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144126, + "balance_loss_mlp": 1.10156846, + "epoch": 0.32070026933435936, + "flos": 549436068864.0, + "grad_norm": 0.08541845552139904, + "language_loss": 0.85159481, + "learning_rate": 0.0007942957488695743, + "loss": 0.8630361, + "num_input_tokens_seen": 137930384, + "router_z_loss_mlp": 0.42578125, + "step": 1667, + "time_per_iteration": 2.6842408180236816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138005, + "balance_loss_mlp": 1.09725952, + "epoch": 0.32089265101962294, + "flos": 745295468544.0, + "grad_norm": 0.06001483498827303, + "language_loss": 0.81309706, + "learning_rate": 0.0007940438314187833, + "loss": 0.82447714, + "num_input_tokens_seen": 138017200, + "router_z_loss_mlp": 0.4074707, + "step": 1668, + "time_per_iteration": 3.0340676307678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128156, + "balance_loss_mlp": 1.08769631, + "epoch": 0.3210850327048865, + "flos": 494188439040.0, + "grad_norm": 0.06998559069767052, + "language_loss": 0.81191337, + "learning_rate": 0.0007937917998100529, + "loss": 0.82319492, + "num_input_tokens_seen": 138084048, + "router_z_loss_mlp": 0.40454102, + "step": 1669, + "time_per_iteration": 2.635629177093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138644, + "balance_loss_mlp": 1.09313023, + "epoch": 0.32127741439015006, + "flos": 530640502272.0, + "grad_norm": 0.08304565240235381, + "language_loss": 0.79254091, + "learning_rate": 0.0007935396541412302, + "loss": 0.80392736, + "num_input_tokens_seen": 138153280, + "router_z_loss_mlp": 0.45532227, + "step": 1670, + "time_per_iteration": 2.6226065158843994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141669, + "balance_loss_mlp": 1.09896851, + "epoch": 0.3214697960754136, + "flos": 501203096064.0, + "grad_norm": 0.07816166477955887, + "language_loss": 0.85914934, + "learning_rate": 0.0007932873945102068, + "loss": 0.87056601, + "num_input_tokens_seen": 138222320, + "router_z_loss_mlp": 0.42724609, + "step": 1671, + "time_per_iteration": 2.559443473815918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047646, + "balance_loss_mlp": 1.03238678, + "epoch": 0.3216621777606772, + "flos": 1383341815296.0, + "grad_norm": 0.025388272809080015, + "language_loss": 0.75761777, + "learning_rate": 0.0007930350210149188, + "loss": 0.76809424, + "num_input_tokens_seen": 138449488, + "router_z_loss_mlp": 0.15234375, + "step": 1672, + "time_per_iteration": 4.8329596519470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176113, + "balance_loss_mlp": 1.13319826, + "epoch": 0.32185455944594077, + "flos": 571535046144.0, + "grad_norm": 0.10680060394368475, + "language_loss": 0.86589128, + "learning_rate": 0.0007927825337533461, + "loss": 0.87765247, + "num_input_tokens_seen": 138522496, + "router_z_loss_mlp": 0.42895508, + "step": 1673, + "time_per_iteration": 2.670067071914673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117015, + "balance_loss_mlp": 1.12651968, + "epoch": 0.3220469411312043, + "flos": 543908846592.0, + "grad_norm": 0.0659920492524482, + "language_loss": 0.84953517, + "learning_rate": 0.0007925299328235131, + "loss": 0.86123669, + "num_input_tokens_seen": 138590096, + "router_z_loss_mlp": 0.43652344, + "step": 1674, + "time_per_iteration": 2.6559884548187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169615, + "balance_loss_mlp": 1.12543643, + "epoch": 0.3222393228164679, + "flos": 491139417600.0, + "grad_norm": 0.10142438885407562, + "language_loss": 0.85307467, + "learning_rate": 0.000792277218323488, + "loss": 0.86477083, + "num_input_tokens_seen": 138658224, + "router_z_loss_mlp": 0.44189453, + "step": 1675, + "time_per_iteration": 2.5843372344970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158765, + "balance_loss_mlp": 1.11673164, + "epoch": 0.3224317045017314, + "flos": 490388359680.0, + "grad_norm": 0.06840501438298492, + "language_loss": 0.85418063, + "learning_rate": 0.0007920243903513833, + "loss": 0.86576831, + "num_input_tokens_seen": 138722864, + "router_z_loss_mlp": 0.4206543, + "step": 1676, + "time_per_iteration": 2.562697649002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138179, + "balance_loss_mlp": 1.09280825, + "epoch": 0.322624086186995, + "flos": 575777465856.0, + "grad_norm": 0.06731593225582447, + "language_loss": 0.84609574, + "learning_rate": 0.0007917714490053556, + "loss": 0.85747755, + "num_input_tokens_seen": 138791472, + "router_z_loss_mlp": 0.45361328, + "step": 1677, + "time_per_iteration": 2.685854434967041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131879, + "balance_loss_mlp": 1.09029913, + "epoch": 0.32281646787225854, + "flos": 629292810240.0, + "grad_norm": 0.06310440112326268, + "language_loss": 0.86562228, + "learning_rate": 0.0007915183943836055, + "loss": 0.87694108, + "num_input_tokens_seen": 138873424, + "router_z_loss_mlp": 0.41601562, + "step": 1678, + "time_per_iteration": 2.8568227291107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128366, + "balance_loss_mlp": 1.08466363, + "epoch": 0.3230088495575221, + "flos": 781389255168.0, + "grad_norm": 0.07690366782162197, + "language_loss": 0.84428912, + "learning_rate": 0.0007912652265843773, + "loss": 0.85557282, + "num_input_tokens_seen": 138956880, + "router_z_loss_mlp": 0.43725586, + "step": 1679, + "time_per_iteration": 3.079998254776001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110817, + "balance_loss_mlp": 1.06930852, + "epoch": 0.3232012312427857, + "flos": 536110824960.0, + "grad_norm": 0.07712564159484636, + "language_loss": 0.8213551, + "learning_rate": 0.0007910119457059597, + "loss": 0.83246326, + "num_input_tokens_seen": 139031296, + "router_z_loss_mlp": 0.4152832, + "step": 1680, + "time_per_iteration": 2.6812973022460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112077, + "balance_loss_mlp": 1.06975782, + "epoch": 0.32339361292804925, + "flos": 704857946112.0, + "grad_norm": 0.10745693955939492, + "language_loss": 0.81109858, + "learning_rate": 0.0007907585518466849, + "loss": 0.82221937, + "num_input_tokens_seen": 139109776, + "router_z_loss_mlp": 0.42333984, + "step": 1681, + "time_per_iteration": 2.9406683444976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115262, + "balance_loss_mlp": 1.07265627, + "epoch": 0.32358599461331283, + "flos": 452330293248.0, + "grad_norm": 0.07157404686533678, + "language_loss": 0.89948541, + "learning_rate": 0.000790505045104929, + "loss": 0.91063797, + "num_input_tokens_seen": 139174736, + "router_z_loss_mlp": 0.42602539, + "step": 1682, + "time_per_iteration": 2.5241646766662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119953, + "balance_loss_mlp": 1.07606041, + "epoch": 0.32377837629857636, + "flos": 600895729152.0, + "grad_norm": 0.06937214564576595, + "language_loss": 0.87034553, + "learning_rate": 0.0007902514255791125, + "loss": 0.88154507, + "num_input_tokens_seen": 139252064, + "router_z_loss_mlp": 0.43896484, + "step": 1683, + "time_per_iteration": 2.8741068840026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111917, + "balance_loss_mlp": 1.076231, + "epoch": 0.32397075798383995, + "flos": 807523250688.0, + "grad_norm": 0.06778435640114842, + "language_loss": 0.87994444, + "learning_rate": 0.0007899976933676986, + "loss": 0.89113617, + "num_input_tokens_seen": 139333328, + "router_z_loss_mlp": 0.42919922, + "step": 1684, + "time_per_iteration": 2.959290027618408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117467, + "balance_loss_mlp": 1.07469463, + "epoch": 0.3241631396691035, + "flos": 601689005568.0, + "grad_norm": 0.06453517439379398, + "language_loss": 0.87573123, + "learning_rate": 0.0007897438485691955, + "loss": 0.88690597, + "num_input_tokens_seen": 139400976, + "router_z_loss_mlp": 0.42773438, + "step": 1685, + "time_per_iteration": 2.6591978073120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131588, + "balance_loss_mlp": 1.08655035, + "epoch": 0.32435552135436707, + "flos": 474219297792.0, + "grad_norm": 0.13512041919643347, + "language_loss": 0.82386112, + "learning_rate": 0.0007894898912821542, + "loss": 0.835177, + "num_input_tokens_seen": 139465664, + "router_z_loss_mlp": 0.45043945, + "step": 1686, + "time_per_iteration": 2.5375750064849854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134689, + "balance_loss_mlp": 1.09201205, + "epoch": 0.3245479030396306, + "flos": 538102268928.0, + "grad_norm": 0.07414292899066016, + "language_loss": 0.8748548, + "learning_rate": 0.0007892358216051695, + "loss": 0.88620168, + "num_input_tokens_seen": 139541984, + "router_z_loss_mlp": 0.42675781, + "step": 1687, + "time_per_iteration": 2.73968243598938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132707, + "balance_loss_mlp": 1.09098339, + "epoch": 0.3247402847248942, + "flos": 547654597632.0, + "grad_norm": 0.06337992950379638, + "language_loss": 0.92269105, + "learning_rate": 0.0007889816396368803, + "loss": 0.93401814, + "num_input_tokens_seen": 139607408, + "router_z_loss_mlp": 0.41699219, + "step": 1688, + "time_per_iteration": 2.6067299842834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131771, + "balance_loss_mlp": 1.08961868, + "epoch": 0.3249326664101578, + "flos": 378151276032.0, + "grad_norm": 0.07885708031147778, + "language_loss": 0.85782814, + "learning_rate": 0.0007887273454759687, + "loss": 0.86914587, + "num_input_tokens_seen": 139670000, + "router_z_loss_mlp": 0.421875, + "step": 1689, + "time_per_iteration": 2.484260320663452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122278, + "balance_loss_mlp": 1.08031607, + "epoch": 0.3251250480954213, + "flos": 528078237696.0, + "grad_norm": 0.06527022407794938, + "language_loss": 0.82859224, + "learning_rate": 0.0007884729392211603, + "loss": 0.83981502, + "num_input_tokens_seen": 139739872, + "router_z_loss_mlp": 0.41943359, + "step": 1690, + "time_per_iteration": 2.642786741256714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129634, + "balance_loss_mlp": 1.08812594, + "epoch": 0.3253174297806849, + "flos": 449659372032.0, + "grad_norm": 0.09568065131307975, + "language_loss": 0.86132944, + "learning_rate": 0.0007882184209712245, + "loss": 0.87262577, + "num_input_tokens_seen": 139802032, + "router_z_loss_mlp": 0.41503906, + "step": 1691, + "time_per_iteration": 2.5199530124664307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123067, + "balance_loss_mlp": 1.08234525, + "epoch": 0.32550981146594843, + "flos": 704181040128.0, + "grad_norm": 0.06282055281729462, + "language_loss": 0.86132228, + "learning_rate": 0.000787963790824974, + "loss": 0.87255299, + "num_input_tokens_seen": 139885648, + "router_z_loss_mlp": 0.40722656, + "step": 1692, + "time_per_iteration": 2.9768075942993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124522, + "balance_loss_mlp": 1.08427668, + "epoch": 0.325702193151212, + "flos": 392704422912.0, + "grad_norm": 0.07612118071262816, + "language_loss": 0.89543802, + "learning_rate": 0.0007877090488812651, + "loss": 0.90668321, + "num_input_tokens_seen": 139947920, + "router_z_loss_mlp": 0.40258789, + "step": 1693, + "time_per_iteration": 2.4604485034942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124012, + "balance_loss_mlp": 1.08207428, + "epoch": 0.32589457483647555, + "flos": 577494696960.0, + "grad_norm": 0.1035661329718289, + "language_loss": 0.83982152, + "learning_rate": 0.0007874541952389973, + "loss": 0.85106164, + "num_input_tokens_seen": 140020048, + "router_z_loss_mlp": 0.41943359, + "step": 1694, + "time_per_iteration": 2.6709587574005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113814, + "balance_loss_mlp": 1.09753752, + "epoch": 0.32608695652173914, + "flos": 498339454464.0, + "grad_norm": 0.08446561178027004, + "language_loss": 0.86949492, + "learning_rate": 0.0007871992299971136, + "loss": 0.8808763, + "num_input_tokens_seen": 140085600, + "router_z_loss_mlp": 0.40625, + "step": 1695, + "time_per_iteration": 2.5585403442382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150743, + "balance_loss_mlp": 1.11023593, + "epoch": 0.32627933820700267, + "flos": 591145910784.0, + "grad_norm": 0.05830689117178756, + "language_loss": 0.84793502, + "learning_rate": 0.0007869441532546001, + "loss": 0.85944247, + "num_input_tokens_seen": 140155152, + "router_z_loss_mlp": 0.4050293, + "step": 1696, + "time_per_iteration": 2.7510788440704346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148317, + "balance_loss_mlp": 1.1100266, + "epoch": 0.32647171989226625, + "flos": 609086532096.0, + "grad_norm": 0.06976949490853021, + "language_loss": 0.79791546, + "learning_rate": 0.0007866889651104867, + "loss": 0.80939865, + "num_input_tokens_seen": 140228560, + "router_z_loss_mlp": 0.38256836, + "step": 1697, + "time_per_iteration": 2.7944459915161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152152, + "balance_loss_mlp": 1.11114383, + "epoch": 0.32666410157752984, + "flos": 477154520064.0, + "grad_norm": 0.06767982610774756, + "language_loss": 0.83777177, + "learning_rate": 0.000786433665663846, + "loss": 0.84929335, + "num_input_tokens_seen": 140297952, + "router_z_loss_mlp": 0.40991211, + "step": 1698, + "time_per_iteration": 2.6864194869995117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167936, + "balance_loss_mlp": 1.12514019, + "epoch": 0.3268564832627934, + "flos": 718385822208.0, + "grad_norm": 0.0725657973515617, + "language_loss": 0.87005848, + "learning_rate": 0.0007861782550137942, + "loss": 0.88173789, + "num_input_tokens_seen": 140373408, + "router_z_loss_mlp": 0.42797852, + "step": 1699, + "time_per_iteration": 2.896897792816162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160393, + "balance_loss_mlp": 1.11986172, + "epoch": 0.32704886494805696, + "flos": 768795618816.0, + "grad_norm": 0.0774952835645251, + "language_loss": 0.86092401, + "learning_rate": 0.0007859227332594901, + "loss": 0.87252796, + "num_input_tokens_seen": 140451840, + "router_z_loss_mlp": 0.40527344, + "step": 1700, + "time_per_iteration": 2.8986380100250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165908, + "balance_loss_mlp": 1.12449527, + "epoch": 0.3272412466333205, + "flos": 849912569856.0, + "grad_norm": 0.09509515836767467, + "language_loss": 0.85007191, + "learning_rate": 0.0007856671005001365, + "loss": 0.86173105, + "num_input_tokens_seen": 140537696, + "router_z_loss_mlp": 0.41381836, + "step": 1701, + "time_per_iteration": 3.148084878921509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168963, + "balance_loss_mlp": 1.12726378, + "epoch": 0.3274336283185841, + "flos": 831586507776.0, + "grad_norm": 0.07560076292899535, + "language_loss": 0.82363045, + "learning_rate": 0.0007854113568349787, + "loss": 0.83532006, + "num_input_tokens_seen": 140623536, + "router_z_loss_mlp": 0.41699219, + "step": 1702, + "time_per_iteration": 3.1411454677581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191314, + "balance_loss_mlp": 1.14882779, + "epoch": 0.3276260100038476, + "flos": 692027172864.0, + "grad_norm": 0.08142047178498793, + "language_loss": 0.81090933, + "learning_rate": 0.0007851555023633052, + "loss": 0.82282251, + "num_input_tokens_seen": 140700688, + "router_z_loss_mlp": 0.42504883, + "step": 1703, + "time_per_iteration": 2.9109766483306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197058, + "balance_loss_mlp": 1.1559788, + "epoch": 0.3278183916891112, + "flos": 436059915264.0, + "grad_norm": 0.07993965020483434, + "language_loss": 0.82561779, + "learning_rate": 0.0007848995371844474, + "loss": 0.83758843, + "num_input_tokens_seen": 140765808, + "router_z_loss_mlp": 0.41088867, + "step": 1704, + "time_per_iteration": 2.531611680984497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197334, + "balance_loss_mlp": 1.15267849, + "epoch": 0.3280107733743748, + "flos": 461109169152.0, + "grad_norm": 0.11293951672356671, + "language_loss": 0.81012988, + "learning_rate": 0.0007846434613977801, + "loss": 0.82210326, + "num_input_tokens_seen": 140830512, + "router_z_loss_mlp": 0.44677734, + "step": 1705, + "time_per_iteration": 2.5413970947265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175519, + "balance_loss_mlp": 1.1340816, + "epoch": 0.3282031550596383, + "flos": 679319737344.0, + "grad_norm": 0.10106481858624654, + "language_loss": 0.78958142, + "learning_rate": 0.0007843872751027203, + "loss": 0.80133665, + "num_input_tokens_seen": 140902816, + "router_z_loss_mlp": 0.41455078, + "step": 1706, + "time_per_iteration": 2.817387580871582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158115, + "balance_loss_mlp": 1.1166296, + "epoch": 0.3283955367449019, + "flos": 545107014144.0, + "grad_norm": 0.06764312208644677, + "language_loss": 0.87366319, + "learning_rate": 0.0007841309783987287, + "loss": 0.88524431, + "num_input_tokens_seen": 140975488, + "router_z_loss_mlp": 0.41503906, + "step": 1707, + "time_per_iteration": 2.7335729598999023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155907, + "balance_loss_mlp": 1.11117959, + "epoch": 0.32858791843016544, + "flos": 481261118976.0, + "grad_norm": 0.06220723681544313, + "language_loss": 0.89445031, + "learning_rate": 0.0007838745713853084, + "loss": 0.90600932, + "num_input_tokens_seen": 141043248, + "router_z_loss_mlp": 0.44702148, + "step": 1708, + "time_per_iteration": 2.6179606914520264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114633, + "balance_loss_mlp": 1.10207939, + "epoch": 0.328780300115429, + "flos": 566805869568.0, + "grad_norm": 0.09473479000062662, + "language_loss": 0.84092307, + "learning_rate": 0.0007836180541620053, + "loss": 0.85238636, + "num_input_tokens_seen": 141119408, + "router_z_loss_mlp": 0.44213867, + "step": 1709, + "time_per_iteration": 2.703660249710083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160153, + "balance_loss_mlp": 1.11723721, + "epoch": 0.32897268180069256, + "flos": 476027933184.0, + "grad_norm": 0.06816782803484764, + "language_loss": 0.86778289, + "learning_rate": 0.0007833614268284082, + "loss": 0.8793844, + "num_input_tokens_seen": 141184112, + "router_z_loss_mlp": 0.42944336, + "step": 1710, + "time_per_iteration": 2.548859119415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077221, + "balance_loss_mlp": 1.06558585, + "epoch": 0.32916506348595614, + "flos": 1577301548544.0, + "grad_norm": 0.029019472878356288, + "language_loss": 0.74109769, + "learning_rate": 0.0007831046894841489, + "loss": 0.75186992, + "num_input_tokens_seen": 141414960, + "router_z_loss_mlp": 0.11621094, + "step": 1711, + "time_per_iteration": 4.9234619140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117236, + "balance_loss_mlp": 1.12934983, + "epoch": 0.3293574451712197, + "flos": 482886945792.0, + "grad_norm": 0.10714861433418864, + "language_loss": 0.78928375, + "learning_rate": 0.0007828478422289016, + "loss": 0.80100739, + "num_input_tokens_seen": 141485744, + "router_z_loss_mlp": 0.43017578, + "step": 1712, + "time_per_iteration": 2.584307909011841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167703, + "balance_loss_mlp": 1.12228465, + "epoch": 0.32954982685648326, + "flos": 622557508608.0, + "grad_norm": 0.08165577234876795, + "language_loss": 0.89409995, + "learning_rate": 0.0007825908851623833, + "loss": 0.90577698, + "num_input_tokens_seen": 141560592, + "router_z_loss_mlp": 0.45410156, + "step": 1713, + "time_per_iteration": 2.7400283813476562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158648, + "balance_loss_mlp": 1.11475515, + "epoch": 0.32974220854174685, + "flos": 544971193344.0, + "grad_norm": 0.08464988169520862, + "language_loss": 0.85764992, + "learning_rate": 0.0007823338183843533, + "loss": 0.86923635, + "num_input_tokens_seen": 141630400, + "router_z_loss_mlp": 0.43896484, + "step": 1714, + "time_per_iteration": 2.671375036239624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157012, + "balance_loss_mlp": 1.11419201, + "epoch": 0.3299345902270104, + "flos": 982155870720.0, + "grad_norm": 0.0730773907324959, + "language_loss": 0.81870985, + "learning_rate": 0.0007820766419946141, + "loss": 0.83028001, + "num_input_tokens_seen": 141721552, + "router_z_loss_mlp": 0.4284668, + "step": 1715, + "time_per_iteration": 3.3361854553222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027512, + "balance_loss_mlp": 1.01473284, + "epoch": 0.33012697191227397, + "flos": 1403664090624.0, + "grad_norm": 0.017749933707714268, + "language_loss": 0.7967248, + "learning_rate": 0.0007818193560930102, + "loss": 0.80699992, + "num_input_tokens_seen": 141956464, + "router_z_loss_mlp": 0.12792969, + "step": 1716, + "time_per_iteration": 4.933880567550659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193401, + "balance_loss_mlp": 1.14895988, + "epoch": 0.3303193535975375, + "flos": 505151479296.0, + "grad_norm": 0.1003306893312863, + "language_loss": 0.76434684, + "learning_rate": 0.0007815619607794288, + "loss": 0.77628088, + "num_input_tokens_seen": 142029552, + "router_z_loss_mlp": 0.4440918, + "step": 1717, + "time_per_iteration": 2.6259148120880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191125, + "balance_loss_mlp": 1.14823365, + "epoch": 0.3305117352828011, + "flos": 937977739776.0, + "grad_norm": 0.07927399877074098, + "language_loss": 0.83156073, + "learning_rate": 0.0007813044561538001, + "loss": 0.84347194, + "num_input_tokens_seen": 142117344, + "router_z_loss_mlp": 0.42895508, + "step": 1718, + "time_per_iteration": 3.1473774909973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190014, + "balance_loss_mlp": 1.145239, + "epoch": 0.3307041169680646, + "flos": 721499083776.0, + "grad_norm": 0.06905487251407855, + "language_loss": 0.88941157, + "learning_rate": 0.0007810468423160958, + "loss": 0.9013117, + "num_input_tokens_seen": 142190096, + "router_z_loss_mlp": 0.44799805, + "step": 1719, + "time_per_iteration": 2.895155906677246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181044, + "balance_loss_mlp": 1.13943982, + "epoch": 0.3308964986533282, + "flos": 583614761472.0, + "grad_norm": 0.06204943336400955, + "language_loss": 0.82643551, + "learning_rate": 0.0007807891193663306, + "loss": 0.83824587, + "num_input_tokens_seen": 142265584, + "router_z_loss_mlp": 0.41625977, + "step": 1720, + "time_per_iteration": 2.7824859619140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165341, + "balance_loss_mlp": 1.12357068, + "epoch": 0.33108888033859174, + "flos": 473576896512.0, + "grad_norm": 0.07732363095630222, + "language_loss": 0.82492876, + "learning_rate": 0.0007805312874045614, + "loss": 0.83658212, + "num_input_tokens_seen": 142330352, + "router_z_loss_mlp": 0.41796875, + "step": 1721, + "time_per_iteration": 2.5710601806640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170989, + "balance_loss_mlp": 1.12807381, + "epoch": 0.3312812620238553, + "flos": 386129534976.0, + "grad_norm": 0.07358039625922873, + "language_loss": 0.86639178, + "learning_rate": 0.0007802733465308874, + "loss": 0.87810171, + "num_input_tokens_seen": 142392208, + "router_z_loss_mlp": 0.42895508, + "step": 1722, + "time_per_iteration": 2.4402778148651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171295, + "balance_loss_mlp": 1.12632966, + "epoch": 0.3314736437091189, + "flos": 494554056192.0, + "grad_norm": 0.06616160911514579, + "language_loss": 0.8424235, + "learning_rate": 0.0007800152968454501, + "loss": 0.85413647, + "num_input_tokens_seen": 142462112, + "router_z_loss_mlp": 0.44970703, + "step": 1723, + "time_per_iteration": 2.689309597015381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115688, + "balance_loss_mlp": 1.11634886, + "epoch": 0.33166602539438245, + "flos": 653662586880.0, + "grad_norm": 0.06191321033146657, + "language_loss": 0.90671206, + "learning_rate": 0.0007797571384484334, + "loss": 0.91828084, + "num_input_tokens_seen": 142539120, + "router_z_loss_mlp": 0.40527344, + "step": 1724, + "time_per_iteration": 2.8473238945007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147699, + "balance_loss_mlp": 1.10421109, + "epoch": 0.33185840707964603, + "flos": 520806620160.0, + "grad_norm": 0.06062690844208358, + "language_loss": 0.92524576, + "learning_rate": 0.0007794988714400633, + "loss": 0.93672276, + "num_input_tokens_seen": 142611520, + "router_z_loss_mlp": 0.43530273, + "step": 1725, + "time_per_iteration": 2.62685227394104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146389, + "balance_loss_mlp": 1.10118532, + "epoch": 0.33205078876490957, + "flos": 436949365248.0, + "grad_norm": 0.09351886782013036, + "language_loss": 0.85586655, + "learning_rate": 0.0007792404959206079, + "loss": 0.86733043, + "num_input_tokens_seen": 142676064, + "router_z_loss_mlp": 0.45214844, + "step": 1726, + "time_per_iteration": 2.487520694732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150134, + "balance_loss_mlp": 1.10707533, + "epoch": 0.33224317045017315, + "flos": 768738719232.0, + "grad_norm": 0.09481341164405561, + "language_loss": 0.81825417, + "learning_rate": 0.0007789820119903774, + "loss": 0.82975549, + "num_input_tokens_seen": 142750944, + "router_z_loss_mlp": 0.4309082, + "step": 1727, + "time_per_iteration": 2.9732954502105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118165, + "balance_loss_mlp": 1.16734493, + "epoch": 0.3324355521354367, + "flos": 1466381574144.0, + "grad_norm": 0.0769954731958624, + "language_loss": 0.78492665, + "learning_rate": 0.0007787234197497242, + "loss": 0.79674315, + "num_input_tokens_seen": 142974032, + "router_z_loss_mlp": 0.14257812, + "step": 1728, + "time_per_iteration": 4.8314409255981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149054, + "balance_loss_mlp": 1.10599601, + "epoch": 0.3326279338207003, + "flos": 496691232768.0, + "grad_norm": 0.06765949793064117, + "language_loss": 0.84123361, + "learning_rate": 0.0007784647192990428, + "loss": 0.85272419, + "num_input_tokens_seen": 143047280, + "router_z_loss_mlp": 0.43041992, + "step": 1729, + "time_per_iteration": 2.715163230895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147649, + "balance_loss_mlp": 1.10799968, + "epoch": 0.33282031550596386, + "flos": 635890093056.0, + "grad_norm": 0.06156065876328187, + "language_loss": 0.80939102, + "learning_rate": 0.0007782059107387696, + "loss": 0.82086754, + "num_input_tokens_seen": 143124224, + "router_z_loss_mlp": 0.39672852, + "step": 1730, + "time_per_iteration": 2.865858554840088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165768, + "balance_loss_mlp": 1.12247074, + "epoch": 0.3330126971912274, + "flos": 689511896064.0, + "grad_norm": 0.07708666526094303, + "language_loss": 0.88668191, + "learning_rate": 0.0007779469941693826, + "loss": 0.89833963, + "num_input_tokens_seen": 143194048, + "router_z_loss_mlp": 0.43261719, + "step": 1731, + "time_per_iteration": 2.8640921115875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166075, + "balance_loss_mlp": 1.12351775, + "epoch": 0.333205078876491, + "flos": 566457504768.0, + "grad_norm": 0.08600344935746515, + "language_loss": 0.76943499, + "learning_rate": 0.0007776879696914029, + "loss": 0.78109574, + "num_input_tokens_seen": 143272976, + "router_z_loss_mlp": 0.42553711, + "step": 1732, + "time_per_iteration": 2.8162899017333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159987, + "balance_loss_mlp": 1.11745262, + "epoch": 0.3333974605617545, + "flos": 640927987200.0, + "grad_norm": 0.07534435583192022, + "language_loss": 0.89131331, + "learning_rate": 0.000777428837405392, + "loss": 0.90291321, + "num_input_tokens_seen": 143346496, + "router_z_loss_mlp": 0.42553711, + "step": 1733, + "time_per_iteration": 2.869436740875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151668, + "balance_loss_mlp": 1.11042213, + "epoch": 0.3335898422470181, + "flos": 461833062912.0, + "grad_norm": 0.0649827105829465, + "language_loss": 0.87220478, + "learning_rate": 0.0007771695974119544, + "loss": 0.88372147, + "num_input_tokens_seen": 143410448, + "router_z_loss_mlp": 0.41259766, + "step": 1734, + "time_per_iteration": 2.5153088569641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138148, + "balance_loss_mlp": 1.0959959, + "epoch": 0.33378222393228163, + "flos": 852870187008.0, + "grad_norm": 0.07614790264044081, + "language_loss": 0.76295686, + "learning_rate": 0.0007769102498117359, + "loss": 0.77433836, + "num_input_tokens_seen": 143492416, + "router_z_loss_mlp": 0.42163086, + "step": 1735, + "time_per_iteration": 3.1105504035949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136381, + "balance_loss_mlp": 1.09430027, + "epoch": 0.3339746056175452, + "flos": 954665491968.0, + "grad_norm": 0.06230250245944302, + "language_loss": 0.80020654, + "learning_rate": 0.000776650794705424, + "loss": 0.81157035, + "num_input_tokens_seen": 143590096, + "router_z_loss_mlp": 0.42089844, + "step": 1736, + "time_per_iteration": 3.269490957260132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141082, + "balance_loss_mlp": 1.09890568, + "epoch": 0.33416698730280875, + "flos": 544825460736.0, + "grad_norm": 0.053956568858798265, + "language_loss": 0.82610357, + "learning_rate": 0.0007763912321937483, + "loss": 0.8375144, + "num_input_tokens_seen": 143663344, + "router_z_loss_mlp": 0.421875, + "step": 1737, + "time_per_iteration": 2.6871769428253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126175, + "balance_loss_mlp": 1.0870508, + "epoch": 0.33435936898807234, + "flos": 1014096070656.0, + "grad_norm": 0.06336651482337263, + "language_loss": 0.82955027, + "learning_rate": 0.0007761315623774799, + "loss": 0.84081209, + "num_input_tokens_seen": 143753072, + "router_z_loss_mlp": 0.39111328, + "step": 1738, + "time_per_iteration": 3.4055540561676025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113225, + "balance_loss_mlp": 1.09088469, + "epoch": 0.3345517506733359, + "flos": 615221650944.0, + "grad_norm": 0.08278309899958468, + "language_loss": 0.88244802, + "learning_rate": 0.0007758717853574313, + "loss": 0.89377058, + "num_input_tokens_seen": 143827280, + "router_z_loss_mlp": 0.41381836, + "step": 1739, + "time_per_iteration": 2.7666313648223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120554, + "balance_loss_mlp": 1.08114362, + "epoch": 0.33474413235859946, + "flos": 494593703424.0, + "grad_norm": 0.0696820530517557, + "language_loss": 0.90798807, + "learning_rate": 0.0007756119012344571, + "loss": 0.91919363, + "num_input_tokens_seen": 143895072, + "router_z_loss_mlp": 0.39404297, + "step": 1740, + "time_per_iteration": 2.5491223335266113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115915, + "balance_loss_mlp": 1.07428706, + "epoch": 0.33493651404386304, + "flos": 628381338624.0, + "grad_norm": 0.06589349032225494, + "language_loss": 0.85103011, + "learning_rate": 0.0007753519101094535, + "loss": 0.86218929, + "num_input_tokens_seen": 143965728, + "router_z_loss_mlp": 0.41625977, + "step": 1741, + "time_per_iteration": 2.765583038330078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112401, + "balance_loss_mlp": 1.0837177, + "epoch": 0.3351288957291266, + "flos": 513727723008.0, + "grad_norm": 0.0662644502369307, + "language_loss": 0.86452365, + "learning_rate": 0.0007750918120833575, + "loss": 0.87576377, + "num_input_tokens_seen": 144030272, + "router_z_loss_mlp": 0.40283203, + "step": 1742, + "time_per_iteration": 2.6085479259490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140409, + "balance_loss_mlp": 1.10240483, + "epoch": 0.33532127741439016, + "flos": 647302814208.0, + "grad_norm": 0.07280628286033199, + "language_loss": 0.87783647, + "learning_rate": 0.0007748316072571485, + "loss": 0.88924056, + "num_input_tokens_seen": 144104048, + "router_z_loss_mlp": 0.37963867, + "step": 1743, + "time_per_iteration": 2.793119192123413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133272, + "balance_loss_mlp": 1.09259784, + "epoch": 0.3355136590996537, + "flos": 768464506368.0, + "grad_norm": 0.0850070564381928, + "language_loss": 0.79522568, + "learning_rate": 0.0007745712957318467, + "loss": 0.80655837, + "num_input_tokens_seen": 144180432, + "router_z_loss_mlp": 0.40698242, + "step": 1744, + "time_per_iteration": 2.943847417831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137205, + "balance_loss_mlp": 1.09700739, + "epoch": 0.3357060407849173, + "flos": 595536634368.0, + "grad_norm": 0.06831295126385283, + "language_loss": 0.86807823, + "learning_rate": 0.0007743108776085141, + "loss": 0.87945032, + "num_input_tokens_seen": 144258704, + "router_z_loss_mlp": 0.40136719, + "step": 1745, + "time_per_iteration": 2.771634101867676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011368, + "balance_loss_mlp": 1.09743714, + "epoch": 0.3358984224701808, + "flos": 598590425088.0, + "grad_norm": 0.05902486087385494, + "language_loss": 0.83364028, + "learning_rate": 0.0007740503529882543, + "loss": 0.84500825, + "num_input_tokens_seen": 144335104, + "router_z_loss_mlp": 0.39331055, + "step": 1746, + "time_per_iteration": 2.7896366119384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139374, + "balance_loss_mlp": 1.09831822, + "epoch": 0.3360908041554444, + "flos": 578329818624.0, + "grad_norm": 0.061665767711377016, + "language_loss": 0.90955931, + "learning_rate": 0.0007737897219722114, + "loss": 0.92095304, + "num_input_tokens_seen": 144402912, + "router_z_loss_mlp": 0.41088867, + "step": 1747, + "time_per_iteration": 2.7088165283203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129332, + "balance_loss_mlp": 1.08725071, + "epoch": 0.336283185840708, + "flos": 513589330944.0, + "grad_norm": 0.08528813851267185, + "language_loss": 0.81553382, + "learning_rate": 0.0007735289846615716, + "loss": 0.82682711, + "num_input_tokens_seen": 144475328, + "router_z_loss_mlp": 0.42089844, + "step": 1748, + "time_per_iteration": 2.635098934173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129982, + "balance_loss_mlp": 1.09119081, + "epoch": 0.3364755675259715, + "flos": 524974887936.0, + "grad_norm": 0.09169024401551043, + "language_loss": 0.82026851, + "learning_rate": 0.0007732681411575621, + "loss": 0.83156836, + "num_input_tokens_seen": 144548288, + "router_z_loss_mlp": 0.38818359, + "step": 1749, + "time_per_iteration": 2.6693224906921387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134785, + "balance_loss_mlp": 1.09437299, + "epoch": 0.3366679492112351, + "flos": 554869315584.0, + "grad_norm": 0.0698579909367107, + "language_loss": 0.88035583, + "learning_rate": 0.0007730071915614514, + "loss": 0.89170372, + "num_input_tokens_seen": 144619488, + "router_z_loss_mlp": 0.40405273, + "step": 1750, + "time_per_iteration": 2.6900789737701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137991, + "balance_loss_mlp": 1.09800839, + "epoch": 0.33686033089649864, + "flos": 427273698816.0, + "grad_norm": 0.09938227861633823, + "language_loss": 0.89158392, + "learning_rate": 0.0007727461359745489, + "loss": 0.90296388, + "num_input_tokens_seen": 144682560, + "router_z_loss_mlp": 0.3996582, + "step": 1751, + "time_per_iteration": 2.5086123943328857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154901, + "balance_loss_mlp": 1.1132257, + "epoch": 0.3370527125817622, + "flos": 541729451520.0, + "grad_norm": 0.06249007419708336, + "language_loss": 0.86569941, + "learning_rate": 0.0007724849744982056, + "loss": 0.87724847, + "num_input_tokens_seen": 144753328, + "router_z_loss_mlp": 0.41674805, + "step": 1752, + "time_per_iteration": 2.700474739074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169913, + "balance_loss_mlp": 1.12737882, + "epoch": 0.33724509426702576, + "flos": 542114892288.0, + "grad_norm": 0.06015013269361517, + "language_loss": 0.8195309, + "learning_rate": 0.0007722237072338131, + "loss": 0.83123004, + "num_input_tokens_seen": 144827312, + "router_z_loss_mlp": 0.42529297, + "step": 1753, + "time_per_iteration": 2.7111313343048096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119284, + "balance_loss_mlp": 1.14816022, + "epoch": 0.33743747595228935, + "flos": 472796103168.0, + "grad_norm": 0.11537307258838475, + "language_loss": 0.85648489, + "learning_rate": 0.0007719623342828046, + "loss": 0.86841327, + "num_input_tokens_seen": 144893488, + "router_z_loss_mlp": 0.44726562, + "step": 1754, + "time_per_iteration": 2.517010450363159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191581, + "balance_loss_mlp": 1.14685392, + "epoch": 0.33762985763755293, + "flos": 469818662400.0, + "grad_norm": 0.06847069318075473, + "language_loss": 0.84535718, + "learning_rate": 0.000771700855746654, + "loss": 0.85727292, + "num_input_tokens_seen": 144961152, + "router_z_loss_mlp": 0.44750977, + "step": 1755, + "time_per_iteration": 2.5961217880249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164795, + "balance_loss_mlp": 1.1231432, + "epoch": 0.33782223932281646, + "flos": 492251323392.0, + "grad_norm": 0.05626734330263072, + "language_loss": 0.8872534, + "learning_rate": 0.0007714392717268763, + "loss": 0.89890134, + "num_input_tokens_seen": 145030576, + "router_z_loss_mlp": 0.41674805, + "step": 1756, + "time_per_iteration": 2.5784223079681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166558, + "balance_loss_mlp": 1.12185431, + "epoch": 0.33801462100808005, + "flos": 465064892928.0, + "grad_norm": 0.07105398160496887, + "language_loss": 0.8649826, + "learning_rate": 0.0007711775823250273, + "loss": 0.87664813, + "num_input_tokens_seen": 145095648, + "router_z_loss_mlp": 0.44702148, + "step": 1757, + "time_per_iteration": 2.5373613834381104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115594, + "balance_loss_mlp": 1.11207056, + "epoch": 0.3382070026933436, + "flos": 795668189184.0, + "grad_norm": 0.06341765106008965, + "language_loss": 0.83797616, + "learning_rate": 0.0007709157876427039, + "loss": 0.84953558, + "num_input_tokens_seen": 145181248, + "router_z_loss_mlp": 0.43896484, + "step": 1758, + "time_per_iteration": 3.1393754482269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144144, + "balance_loss_mlp": 1.10027504, + "epoch": 0.33839938437860717, + "flos": 508430297088.0, + "grad_norm": 0.0573406658982909, + "language_loss": 0.85933769, + "learning_rate": 0.0007706538877815439, + "loss": 0.8707791, + "num_input_tokens_seen": 145252944, + "router_z_loss_mlp": 0.4387207, + "step": 1759, + "time_per_iteration": 2.6080896854400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152987, + "balance_loss_mlp": 1.11054862, + "epoch": 0.3385917660638707, + "flos": 484243329024.0, + "grad_norm": 0.06135171113161323, + "language_loss": 0.83615482, + "learning_rate": 0.0007703918828432259, + "loss": 0.84768468, + "num_input_tokens_seen": 145323168, + "router_z_loss_mlp": 0.42456055, + "step": 1760, + "time_per_iteration": 2.5886309146881104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148897, + "balance_loss_mlp": 1.10464644, + "epoch": 0.3387841477491343, + "flos": 545339381760.0, + "grad_norm": 0.05937499082636783, + "language_loss": 0.88942921, + "learning_rate": 0.000770129772929469, + "loss": 0.90091813, + "num_input_tokens_seen": 145395776, + "router_z_loss_mlp": 0.44238281, + "step": 1761, + "time_per_iteration": 2.645293951034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140708, + "balance_loss_mlp": 1.09629107, + "epoch": 0.3389765294343978, + "flos": 719801676288.0, + "grad_norm": 0.07244625367361128, + "language_loss": 0.88504505, + "learning_rate": 0.0007698675581420334, + "loss": 0.89645213, + "num_input_tokens_seen": 145470576, + "router_z_loss_mlp": 0.4440918, + "step": 1762, + "time_per_iteration": 2.849560022354126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149138, + "balance_loss_mlp": 1.10469711, + "epoch": 0.3391689111196614, + "flos": 699928708608.0, + "grad_norm": 0.06385607916775927, + "language_loss": 0.79163915, + "learning_rate": 0.0007696052385827199, + "loss": 0.80313051, + "num_input_tokens_seen": 145548896, + "router_z_loss_mlp": 0.44458008, + "step": 1763, + "time_per_iteration": 2.9164280891418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138684, + "balance_loss_mlp": 1.09765172, + "epoch": 0.339361292804925, + "flos": 627093964800.0, + "grad_norm": 0.07477333876977248, + "language_loss": 0.78203613, + "learning_rate": 0.00076934281435337, + "loss": 0.79342294, + "num_input_tokens_seen": 145617136, + "router_z_loss_mlp": 0.41040039, + "step": 1764, + "time_per_iteration": 2.7213284969329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131547, + "balance_loss_mlp": 1.08922768, + "epoch": 0.33955367449018853, + "flos": 609600453120.0, + "grad_norm": 0.0661700543843282, + "language_loss": 0.86476332, + "learning_rate": 0.0007690802855558658, + "loss": 0.87607884, + "num_input_tokens_seen": 145696416, + "router_z_loss_mlp": 0.4230957, + "step": 1765, + "time_per_iteration": 2.8648691177368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144213, + "balance_loss_mlp": 1.12981212, + "epoch": 0.3397460561754521, + "flos": 1453310346240.0, + "grad_norm": 0.0393682164062729, + "language_loss": 0.76374954, + "learning_rate": 0.0007688176522921302, + "loss": 0.77519166, + "num_input_tokens_seen": 145919680, + "router_z_loss_mlp": 0.14355469, + "step": 1766, + "time_per_iteration": 4.883134603500366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138855, + "balance_loss_mlp": 1.09441423, + "epoch": 0.33993843786071565, + "flos": 487312174080.0, + "grad_norm": 0.06478844738748038, + "language_loss": 0.89260793, + "learning_rate": 0.0007685549146641262, + "loss": 0.90399647, + "num_input_tokens_seen": 145984272, + "router_z_loss_mlp": 0.44458008, + "step": 1767, + "time_per_iteration": 2.5584475994110107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138308, + "balance_loss_mlp": 1.09780085, + "epoch": 0.34013081954597923, + "flos": 417338500608.0, + "grad_norm": 0.0552886410345199, + "language_loss": 0.8865279, + "learning_rate": 0.0007682920727738579, + "loss": 0.89791095, + "num_input_tokens_seen": 146047248, + "router_z_loss_mlp": 0.4050293, + "step": 1768, + "time_per_iteration": 2.462104558944702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113405, + "balance_loss_mlp": 1.09170651, + "epoch": 0.34032320123124277, + "flos": 437520185856.0, + "grad_norm": 0.07550967393636049, + "language_loss": 0.84987569, + "learning_rate": 0.000768029126723369, + "loss": 0.86121619, + "num_input_tokens_seen": 146111872, + "router_z_loss_mlp": 0.42333984, + "step": 1769, + "time_per_iteration": 2.5362985134124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134777, + "balance_loss_mlp": 1.09360242, + "epoch": 0.34051558291650635, + "flos": 457590643200.0, + "grad_norm": 0.0745429404709064, + "language_loss": 0.82167029, + "learning_rate": 0.0007677660766147447, + "loss": 0.83301806, + "num_input_tokens_seen": 146172608, + "router_z_loss_mlp": 0.41186523, + "step": 1770, + "time_per_iteration": 2.516824960708618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079488, + "balance_loss_mlp": 1.06356168, + "epoch": 0.3407079646017699, + "flos": 1558849204224.0, + "grad_norm": 0.02503514207226814, + "language_loss": 0.72470945, + "learning_rate": 0.0007675029225501102, + "loss": 0.73550433, + "num_input_tokens_seen": 146413584, + "router_z_loss_mlp": 0.15917969, + "step": 1771, + "time_per_iteration": 4.943475008010864 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137395, + "balance_loss_mlp": 1.09543359, + "epoch": 0.3409003462870335, + "flos": 492555271680.0, + "grad_norm": 0.06960763795190199, + "language_loss": 0.80136019, + "learning_rate": 0.0007672396646316306, + "loss": 0.81273413, + "num_input_tokens_seen": 146476992, + "router_z_loss_mlp": 0.41918945, + "step": 1772, + "time_per_iteration": 2.5425803661346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145424, + "balance_loss_mlp": 1.10341442, + "epoch": 0.34109272797229706, + "flos": 808479512064.0, + "grad_norm": 0.05748114386543088, + "language_loss": 0.80760133, + "learning_rate": 0.000766976302961512, + "loss": 0.81905556, + "num_input_tokens_seen": 146552848, + "router_z_loss_mlp": 0.42041016, + "step": 1773, + "time_per_iteration": 2.982287645339966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155937, + "balance_loss_mlp": 1.11330807, + "epoch": 0.3412851096575606, + "flos": 470142434304.0, + "grad_norm": 0.06912006035569716, + "language_loss": 0.81549138, + "learning_rate": 0.0007667128376420003, + "loss": 0.82705075, + "num_input_tokens_seen": 146617504, + "router_z_loss_mlp": 0.42626953, + "step": 1774, + "time_per_iteration": 2.5396063327789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151156, + "balance_loss_mlp": 1.10926604, + "epoch": 0.3414774913428242, + "flos": 595675026432.0, + "grad_norm": 0.07768471353958366, + "language_loss": 0.84963071, + "learning_rate": 0.0007664492687753817, + "loss": 0.86114228, + "num_input_tokens_seen": 146691568, + "router_z_loss_mlp": 0.41894531, + "step": 1775, + "time_per_iteration": 2.7326042652130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139013, + "balance_loss_mlp": 1.09845805, + "epoch": 0.3416698730280877, + "flos": 527463000576.0, + "grad_norm": 0.10552495092435867, + "language_loss": 0.81927752, + "learning_rate": 0.000766185596463983, + "loss": 0.83066773, + "num_input_tokens_seen": 146764208, + "router_z_loss_mlp": 0.40551758, + "step": 1776, + "time_per_iteration": 2.622465133666992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126657, + "balance_loss_mlp": 1.08455205, + "epoch": 0.3418622547133513, + "flos": 874640623104.0, + "grad_norm": 0.06005887645947995, + "language_loss": 0.77224028, + "learning_rate": 0.0007659218208101706, + "loss": 0.78350687, + "num_input_tokens_seen": 146847744, + "router_z_loss_mlp": 0.42114258, + "step": 1777, + "time_per_iteration": 3.099862575531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124902, + "balance_loss_mlp": 1.0852288, + "epoch": 0.34205463639861483, + "flos": 603744689664.0, + "grad_norm": 0.057585659974550854, + "language_loss": 0.85272229, + "learning_rate": 0.0007656579419163515, + "loss": 0.86397129, + "num_input_tokens_seen": 146918336, + "router_z_loss_mlp": 0.39672852, + "step": 1778, + "time_per_iteration": 2.7696709632873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129572, + "balance_loss_mlp": 1.08794475, + "epoch": 0.3422470180838784, + "flos": 463780090368.0, + "grad_norm": 0.07376046533358642, + "language_loss": 0.77272999, + "learning_rate": 0.0007653939598849724, + "loss": 0.78402567, + "num_input_tokens_seen": 146982496, + "router_z_loss_mlp": 0.41650391, + "step": 1779, + "time_per_iteration": 2.5601556301116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131616, + "balance_loss_mlp": 1.11511779, + "epoch": 0.34243939976914195, + "flos": 1586428416000.0, + "grad_norm": 0.05276839393693404, + "language_loss": 0.82880205, + "learning_rate": 0.0007651298748185204, + "loss": 0.84011823, + "num_input_tokens_seen": 147213600, + "router_z_loss_mlp": 0.16503906, + "step": 1780, + "time_per_iteration": 4.96061897277832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112473, + "balance_loss_mlp": 1.08267307, + "epoch": 0.34263178145440554, + "flos": 873017367552.0, + "grad_norm": 0.07129012841004771, + "language_loss": 0.80831903, + "learning_rate": 0.000764865686819522, + "loss": 0.81956631, + "num_input_tokens_seen": 147287664, + "router_z_loss_mlp": 0.4206543, + "step": 1781, + "time_per_iteration": 3.089735507965088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126433, + "balance_loss_mlp": 1.08492422, + "epoch": 0.3428241631396691, + "flos": 506878622208.0, + "grad_norm": 0.0622927262326037, + "language_loss": 0.86375809, + "learning_rate": 0.0007646013959905449, + "loss": 0.87502241, + "num_input_tokens_seen": 147356800, + "router_z_loss_mlp": 0.41503906, + "step": 1782, + "time_per_iteration": 2.6112704277038574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123127, + "balance_loss_mlp": 1.08130884, + "epoch": 0.34301654482493266, + "flos": 880039365120.0, + "grad_norm": 0.10167310682771787, + "language_loss": 0.81018484, + "learning_rate": 0.0007643370024341949, + "loss": 0.82141614, + "num_input_tokens_seen": 147432496, + "router_z_loss_mlp": 0.41821289, + "step": 1783, + "time_per_iteration": 3.1074132919311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115046, + "balance_loss_mlp": 1.07563567, + "epoch": 0.34320892651019624, + "flos": 431763167232.0, + "grad_norm": 0.057781870331099924, + "language_loss": 0.83518296, + "learning_rate": 0.0007640725062531195, + "loss": 0.84633338, + "num_input_tokens_seen": 147495856, + "router_z_loss_mlp": 0.39404297, + "step": 1784, + "time_per_iteration": 2.491313934326172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112121, + "balance_loss_mlp": 1.07228112, + "epoch": 0.3434013081954598, + "flos": 463641698304.0, + "grad_norm": 0.12476428026998775, + "language_loss": 0.86600161, + "learning_rate": 0.0007638079075500047, + "loss": 0.87712288, + "num_input_tokens_seen": 147559632, + "router_z_loss_mlp": 0.39819336, + "step": 1785, + "time_per_iteration": 2.5236706733703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070785, + "balance_loss_mlp": 1.05457258, + "epoch": 0.34359368988072336, + "flos": 1557332034048.0, + "grad_norm": 0.032988320908807454, + "language_loss": 0.75180668, + "learning_rate": 0.0007635432064275772, + "loss": 0.76251453, + "num_input_tokens_seen": 147794576, + "router_z_loss_mlp": 0.16210938, + "step": 1786, + "time_per_iteration": 4.938300609588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011343, + "balance_loss_mlp": 1.09274352, + "epoch": 0.3437860715659869, + "flos": 495527569920.0, + "grad_norm": 0.06899034270313556, + "language_loss": 0.83409935, + "learning_rate": 0.0007632784029886026, + "loss": 0.84544241, + "num_input_tokens_seen": 147866960, + "router_z_loss_mlp": 0.41552734, + "step": 1787, + "time_per_iteration": 2.6218347549438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140121, + "balance_loss_mlp": 1.09968519, + "epoch": 0.3439784532512505, + "flos": 718274594304.0, + "grad_norm": 0.05777013506444436, + "language_loss": 0.85674673, + "learning_rate": 0.0007630134973358873, + "loss": 0.86814797, + "num_input_tokens_seen": 147947808, + "router_z_loss_mlp": 0.40429688, + "step": 1788, + "time_per_iteration": 2.9675180912017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156957, + "balance_loss_mlp": 1.11780846, + "epoch": 0.34417083493651407, + "flos": 565862091264.0, + "grad_norm": 0.11323624876812292, + "language_loss": 0.86969185, + "learning_rate": 0.0007627484895722763, + "loss": 0.88126147, + "num_input_tokens_seen": 148015936, + "router_z_loss_mlp": 0.39160156, + "step": 1789, + "time_per_iteration": 2.6400198936462402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164783, + "balance_loss_mlp": 1.1222018, + "epoch": 0.3443632166217776, + "flos": 796330414080.0, + "grad_norm": 0.06957715435201431, + "language_loss": 0.80509681, + "learning_rate": 0.0007624833798006552, + "loss": 0.81674469, + "num_input_tokens_seen": 148099776, + "router_z_loss_mlp": 0.42602539, + "step": 1790, + "time_per_iteration": 3.042621374130249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162497, + "balance_loss_mlp": 1.11924767, + "epoch": 0.3445555983070412, + "flos": 569313805824.0, + "grad_norm": 0.09367673394256656, + "language_loss": 0.84194326, + "learning_rate": 0.0007622181681239483, + "loss": 0.85356832, + "num_input_tokens_seen": 148169616, + "router_z_loss_mlp": 0.43261719, + "step": 1791, + "time_per_iteration": 2.642648220062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140416, + "balance_loss_mlp": 1.09907472, + "epoch": 0.3447479799923047, + "flos": 568814565888.0, + "grad_norm": 0.07487034842421487, + "language_loss": 0.84962463, + "learning_rate": 0.0007619528546451202, + "loss": 0.86102873, + "num_input_tokens_seen": 148247824, + "router_z_loss_mlp": 0.41333008, + "step": 1792, + "time_per_iteration": 2.8014347553253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130282, + "balance_loss_mlp": 1.08941662, + "epoch": 0.3449403616775683, + "flos": 967723863552.0, + "grad_norm": 0.05771787988130437, + "language_loss": 0.84187096, + "learning_rate": 0.0007616874394671745, + "loss": 0.85317373, + "num_input_tokens_seen": 148333040, + "router_z_loss_mlp": 0.40869141, + "step": 1793, + "time_per_iteration": 3.336076498031616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137422, + "balance_loss_mlp": 1.09276664, + "epoch": 0.34513274336283184, + "flos": 568607164416.0, + "grad_norm": 0.08239177777048284, + "language_loss": 0.85433841, + "learning_rate": 0.0007614219226931547, + "loss": 0.86571258, + "num_input_tokens_seen": 148401840, + "router_z_loss_mlp": 0.44677734, + "step": 1794, + "time_per_iteration": 2.6596035957336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136792, + "balance_loss_mlp": 1.0951401, + "epoch": 0.3453251250480954, + "flos": 460943612928.0, + "grad_norm": 0.06809904369873732, + "language_loss": 0.85092592, + "learning_rate": 0.0007611563044261435, + "loss": 0.86229378, + "num_input_tokens_seen": 148466576, + "router_z_loss_mlp": 0.41674805, + "step": 1795, + "time_per_iteration": 2.545440435409546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140576, + "balance_loss_mlp": 1.09601521, + "epoch": 0.34551750673335896, + "flos": 415621269504.0, + "grad_norm": 0.08865061616635866, + "language_loss": 0.8722235, + "learning_rate": 0.0007608905847692631, + "loss": 0.88362932, + "num_input_tokens_seen": 148530016, + "router_z_loss_mlp": 0.44555664, + "step": 1796, + "time_per_iteration": 2.471306800842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112492, + "balance_loss_mlp": 1.08486605, + "epoch": 0.34570988841862255, + "flos": 587854609920.0, + "grad_norm": 0.07442154430907115, + "language_loss": 0.86828166, + "learning_rate": 0.0007606247638256749, + "loss": 0.87953079, + "num_input_tokens_seen": 148610064, + "router_z_loss_mlp": 0.40039062, + "step": 1797, + "time_per_iteration": 2.8728272914886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045486, + "balance_loss_mlp": 1.03099036, + "epoch": 0.34590227010388613, + "flos": 1567694518272.0, + "grad_norm": 0.022391201486326673, + "language_loss": 0.78170294, + "learning_rate": 0.0007603588416985798, + "loss": 0.79215777, + "num_input_tokens_seen": 148835872, + "router_z_loss_mlp": 0.14453125, + "step": 1798, + "time_per_iteration": 4.99533486366272 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036587, + "balance_loss_mlp": 1.0224725, + "epoch": 0.34609465178914967, + "flos": 1537743564288.0, + "grad_norm": 0.020693498138200886, + "language_loss": 0.79327202, + "learning_rate": 0.0007600928184912179, + "loss": 0.80363786, + "num_input_tokens_seen": 149066864, + "router_z_loss_mlp": 0.14160156, + "step": 1799, + "time_per_iteration": 4.871920347213745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131321, + "balance_loss_mlp": 1.086761, + "epoch": 0.34628703347441325, + "flos": 609363316224.0, + "grad_norm": 0.06425687332848114, + "language_loss": 0.8622126, + "learning_rate": 0.0007598266943068686, + "loss": 0.8735258, + "num_input_tokens_seen": 149141600, + "router_z_loss_mlp": 0.44555664, + "step": 1800, + "time_per_iteration": 2.7352967262268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128705, + "balance_loss_mlp": 1.0892942, + "epoch": 0.3464794151596768, + "flos": 473319936000.0, + "grad_norm": 0.06122285990583016, + "language_loss": 0.84089196, + "learning_rate": 0.0007595604692488507, + "loss": 0.85217899, + "num_input_tokens_seen": 149205888, + "router_z_loss_mlp": 0.39404297, + "step": 1801, + "time_per_iteration": 2.520047664642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145052, + "balance_loss_mlp": 1.10182643, + "epoch": 0.34667179684494037, + "flos": 605681805312.0, + "grad_norm": 0.08959882775364528, + "language_loss": 0.83156121, + "learning_rate": 0.0007592941434205215, + "loss": 0.84301168, + "num_input_tokens_seen": 149281280, + "router_z_loss_mlp": 0.43237305, + "step": 1802, + "time_per_iteration": 2.774533987045288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102388, + "balance_loss_mlp": 1.01191127, + "epoch": 0.3468641785302039, + "flos": 1564912369152.0, + "grad_norm": 0.0173366039721641, + "language_loss": 0.73571062, + "learning_rate": 0.0007590277169252782, + "loss": 0.74594939, + "num_input_tokens_seen": 149525008, + "router_z_loss_mlp": 0.11962891, + "step": 1803, + "time_per_iteration": 5.441190004348755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130945, + "balance_loss_mlp": 1.08481145, + "epoch": 0.3470565602154675, + "flos": 907265442816.0, + "grad_norm": 0.07392614166366455, + "language_loss": 0.80754089, + "learning_rate": 0.0007587611898665566, + "loss": 0.81885034, + "num_input_tokens_seen": 149600624, + "router_z_loss_mlp": 0.4609375, + "step": 1804, + "time_per_iteration": 3.0738565921783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126694, + "balance_loss_mlp": 1.08320653, + "epoch": 0.347248941900731, + "flos": 638902038528.0, + "grad_norm": 0.052717282161679486, + "language_loss": 0.82365519, + "learning_rate": 0.0007584945623478315, + "loss": 0.83492208, + "num_input_tokens_seen": 149674224, + "router_z_loss_mlp": 0.43530273, + "step": 1805, + "time_per_iteration": 2.810065269470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130527, + "balance_loss_mlp": 1.08773112, + "epoch": 0.3474413235859946, + "flos": 847362788352.0, + "grad_norm": 0.0654216117506123, + "language_loss": 0.81839657, + "learning_rate": 0.000758227834472617, + "loss": 0.8297019, + "num_input_tokens_seen": 149758688, + "router_z_loss_mlp": 0.42822266, + "step": 1806, + "time_per_iteration": 3.0400753021240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129234, + "balance_loss_mlp": 1.08631909, + "epoch": 0.3476337052712582, + "flos": 515654926848.0, + "grad_norm": 0.06780310502945991, + "language_loss": 0.77468187, + "learning_rate": 0.0007579610063444664, + "loss": 0.78597426, + "num_input_tokens_seen": 149831648, + "router_z_loss_mlp": 0.42895508, + "step": 1807, + "time_per_iteration": 2.720200538635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123608, + "balance_loss_mlp": 1.0805254, + "epoch": 0.34782608695652173, + "flos": 913551063552.0, + "grad_norm": 0.056464817781099026, + "language_loss": 0.87875664, + "learning_rate": 0.0007576940780669712, + "loss": 0.88999271, + "num_input_tokens_seen": 149919440, + "router_z_loss_mlp": 0.4309082, + "step": 1808, + "time_per_iteration": 3.1972455978393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119319, + "balance_loss_mlp": 1.07723832, + "epoch": 0.3480184686417853, + "flos": 773714944512.0, + "grad_norm": 0.06350201854913072, + "language_loss": 0.84194762, + "learning_rate": 0.0007574270497437624, + "loss": 0.85314083, + "num_input_tokens_seen": 150001632, + "router_z_loss_mlp": 0.42089844, + "step": 1809, + "time_per_iteration": 2.956308364868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112252, + "balance_loss_mlp": 1.08036816, + "epoch": 0.34821085032704885, + "flos": 576839812608.0, + "grad_norm": 0.05949268624371524, + "language_loss": 0.88030243, + "learning_rate": 0.000757159921478509, + "loss": 0.89152765, + "num_input_tokens_seen": 150077552, + "router_z_loss_mlp": 0.42138672, + "step": 1810, + "time_per_iteration": 2.7515318393707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058755, + "balance_loss_mlp": 1.04769194, + "epoch": 0.34840323201231244, + "flos": 1524947295744.0, + "grad_norm": 0.027450813841054106, + "language_loss": 0.74450636, + "learning_rate": 0.0007568926933749201, + "loss": 0.75509393, + "num_input_tokens_seen": 150295328, + "router_z_loss_mlp": 0.11083984, + "step": 1811, + "time_per_iteration": 4.719837427139282 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136641, + "balance_loss_mlp": 1.09272385, + "epoch": 0.34859561369757597, + "flos": 509164102656.0, + "grad_norm": 0.06099509375847796, + "language_loss": 0.87676752, + "learning_rate": 0.0007566253655367423, + "loss": 0.88813394, + "num_input_tokens_seen": 150360496, + "router_z_loss_mlp": 0.43896484, + "step": 1812, + "time_per_iteration": 2.6117310523986816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145498, + "balance_loss_mlp": 1.10196316, + "epoch": 0.34878799538283956, + "flos": 548662616064.0, + "grad_norm": 0.26075237363376164, + "language_loss": 0.90086293, + "learning_rate": 0.000756357938067762, + "loss": 0.91231787, + "num_input_tokens_seen": 150432064, + "router_z_loss_mlp": 0.43554688, + "step": 1813, + "time_per_iteration": 2.6537845134735107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137235, + "balance_loss_mlp": 1.09305573, + "epoch": 0.34898037706810314, + "flos": 983638536192.0, + "grad_norm": 0.07803772738029488, + "language_loss": 0.8299284, + "learning_rate": 0.0007560904110718033, + "loss": 0.84130079, + "num_input_tokens_seen": 150512176, + "router_z_loss_mlp": 0.44165039, + "step": 1814, + "time_per_iteration": 3.2229981422424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131299, + "balance_loss_mlp": 1.08549881, + "epoch": 0.3491727587533667, + "flos": 681605217792.0, + "grad_norm": 0.06602375994559181, + "language_loss": 0.83648008, + "learning_rate": 0.0007558227846527297, + "loss": 0.8477931, + "num_input_tokens_seen": 150586416, + "router_z_loss_mlp": 0.45751953, + "step": 1815, + "time_per_iteration": 2.8217966556549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137186, + "balance_loss_mlp": 1.09300709, + "epoch": 0.34936514043863026, + "flos": 394026301440.0, + "grad_norm": 0.06552880481969095, + "language_loss": 0.83563447, + "learning_rate": 0.0007555550589144429, + "loss": 0.84700632, + "num_input_tokens_seen": 150648944, + "router_z_loss_mlp": 0.44189453, + "step": 1816, + "time_per_iteration": 2.4231276512145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148289, + "balance_loss_mlp": 1.1026082, + "epoch": 0.3495575221238938, + "flos": 461363558400.0, + "grad_norm": 0.05960251663438414, + "language_loss": 0.84705317, + "learning_rate": 0.000755287233960883, + "loss": 0.85853606, + "num_input_tokens_seen": 150717200, + "router_z_loss_mlp": 0.45678711, + "step": 1817, + "time_per_iteration": 2.5598244667053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148754, + "balance_loss_mlp": 1.10297787, + "epoch": 0.3497499038091574, + "flos": 724172576256.0, + "grad_norm": 0.06564730471203778, + "language_loss": 0.78051704, + "learning_rate": 0.0007550193098960292, + "loss": 0.79200459, + "num_input_tokens_seen": 150790368, + "router_z_loss_mlp": 0.45751953, + "step": 1818, + "time_per_iteration": 2.8570642471313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115418, + "balance_loss_mlp": 1.11033523, + "epoch": 0.3499422854944209, + "flos": 827729528832.0, + "grad_norm": 0.05538445579726575, + "language_loss": 0.8654325, + "learning_rate": 0.0007547512868238988, + "loss": 0.87697428, + "num_input_tokens_seen": 150879872, + "router_z_loss_mlp": 0.43847656, + "step": 1819, + "time_per_iteration": 3.1437833309173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170578, + "balance_loss_mlp": 1.12499213, + "epoch": 0.3501346671796845, + "flos": 493479226368.0, + "grad_norm": 0.0822966351911203, + "language_loss": 0.83893883, + "learning_rate": 0.0007544831648485473, + "loss": 0.85064459, + "num_input_tokens_seen": 150953712, + "router_z_loss_mlp": 0.45605469, + "step": 1820, + "time_per_iteration": 2.660233736038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162235, + "balance_loss_mlp": 1.11684048, + "epoch": 0.35032704886494803, + "flos": 578752335360.0, + "grad_norm": 0.06443547558053964, + "language_loss": 0.81439716, + "learning_rate": 0.0007542149440740694, + "loss": 0.82601953, + "num_input_tokens_seen": 151026192, + "router_z_loss_mlp": 0.45385742, + "step": 1821, + "time_per_iteration": 2.6618528366088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154684, + "balance_loss_mlp": 1.10938418, + "epoch": 0.3505194305502116, + "flos": 584672338944.0, + "grad_norm": 0.06960442221541481, + "language_loss": 0.86201102, + "learning_rate": 0.000753946624604597, + "loss": 0.87355781, + "num_input_tokens_seen": 151100720, + "router_z_loss_mlp": 0.45288086, + "step": 1822, + "time_per_iteration": 2.7180583477020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138697, + "balance_loss_mlp": 1.09466076, + "epoch": 0.3507118122354752, + "flos": 526958991360.0, + "grad_norm": 0.11840223630221765, + "language_loss": 0.88456279, + "learning_rate": 0.0007536782065443015, + "loss": 0.89594972, + "num_input_tokens_seen": 151166032, + "router_z_loss_mlp": 0.44042969, + "step": 1823, + "time_per_iteration": 2.6035680770874023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147734, + "balance_loss_mlp": 1.1024822, + "epoch": 0.35090419392073874, + "flos": 511523735040.0, + "grad_norm": 0.08971754998357863, + "language_loss": 0.75357497, + "learning_rate": 0.0007534096899973919, + "loss": 0.76505232, + "num_input_tokens_seen": 151232208, + "router_z_loss_mlp": 0.45263672, + "step": 1824, + "time_per_iteration": 2.592313528060913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136405, + "balance_loss_mlp": 1.095397, + "epoch": 0.3510965756060023, + "flos": 564021149184.0, + "grad_norm": 0.056380284358423516, + "language_loss": 0.8296026, + "learning_rate": 0.0007531410750681154, + "loss": 0.84096658, + "num_input_tokens_seen": 151308128, + "router_z_loss_mlp": 0.41015625, + "step": 1825, + "time_per_iteration": 2.7599031925201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149352, + "balance_loss_mlp": 1.10710466, + "epoch": 0.35128895729126586, + "flos": 1020535137792.0, + "grad_norm": 0.06329210930184016, + "language_loss": 0.8686763, + "learning_rate": 0.0007528723618607575, + "loss": 0.88016987, + "num_input_tokens_seen": 151402560, + "router_z_loss_mlp": 0.42236328, + "step": 1826, + "time_per_iteration": 3.423145055770874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156709, + "balance_loss_mlp": 1.11808527, + "epoch": 0.35148133897652944, + "flos": 588262445568.0, + "grad_norm": 0.05752886424443174, + "language_loss": 0.8293525, + "learning_rate": 0.0007526035504796422, + "loss": 0.84091961, + "num_input_tokens_seen": 151478816, + "router_z_loss_mlp": 0.38598633, + "step": 1827, + "time_per_iteration": 2.774202346801758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164193, + "balance_loss_mlp": 1.12080038, + "epoch": 0.351673720661793, + "flos": 495300344832.0, + "grad_norm": 0.08334994788856638, + "language_loss": 0.87348354, + "learning_rate": 0.0007523346410291312, + "loss": 0.8851254, + "num_input_tokens_seen": 151554528, + "router_z_loss_mlp": 0.43408203, + "step": 1828, + "time_per_iteration": 2.7933921813964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172191, + "balance_loss_mlp": 1.13127816, + "epoch": 0.35186610234705656, + "flos": 762670411776.0, + "grad_norm": 0.05847449829546615, + "language_loss": 0.85163879, + "learning_rate": 0.0007520656336136245, + "loss": 0.86336064, + "num_input_tokens_seen": 151629440, + "router_z_loss_mlp": 0.40942383, + "step": 1829, + "time_per_iteration": 2.9654810428619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167386, + "balance_loss_mlp": 1.12675905, + "epoch": 0.3520584840323201, + "flos": 626135132160.0, + "grad_norm": 0.06508844853371867, + "language_loss": 0.88540596, + "learning_rate": 0.0007517965283375599, + "loss": 0.89707983, + "num_input_tokens_seen": 151708544, + "router_z_loss_mlp": 0.40625, + "step": 1830, + "time_per_iteration": 2.833653211593628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161789, + "balance_loss_mlp": 1.12078059, + "epoch": 0.3522508657175837, + "flos": 537388286976.0, + "grad_norm": 0.05306701185260888, + "language_loss": 0.89636958, + "learning_rate": 0.0007515273253054132, + "loss": 0.90798748, + "num_input_tokens_seen": 151779152, + "router_z_loss_mlp": 0.41015625, + "step": 1831, + "time_per_iteration": 2.648688554763794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162371, + "balance_loss_mlp": 1.11788237, + "epoch": 0.35244324740284727, + "flos": 567384030720.0, + "grad_norm": 0.060637132075448665, + "language_loss": 0.8317945, + "learning_rate": 0.0007512580246216988, + "loss": 0.84341824, + "num_input_tokens_seen": 151853216, + "router_z_loss_mlp": 0.44482422, + "step": 1832, + "time_per_iteration": 2.695558786392212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152178, + "balance_loss_mlp": 1.11288619, + "epoch": 0.3526356290881108, + "flos": 513058157568.0, + "grad_norm": 0.06652239867864222, + "language_loss": 0.8520152, + "learning_rate": 0.000750988626390968, + "loss": 0.86353695, + "num_input_tokens_seen": 151920416, + "router_z_loss_mlp": 0.39306641, + "step": 1833, + "time_per_iteration": 2.5903215408325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114923, + "balance_loss_mlp": 1.10810232, + "epoch": 0.3528280107733744, + "flos": 595791023616.0, + "grad_norm": 0.05520517467567221, + "language_loss": 0.85274744, + "learning_rate": 0.0007507191307178108, + "loss": 0.86423969, + "num_input_tokens_seen": 151990848, + "router_z_loss_mlp": 0.41137695, + "step": 1834, + "time_per_iteration": 2.7567453384399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132557, + "balance_loss_mlp": 1.0890696, + "epoch": 0.3530203924586379, + "flos": 551234792448.0, + "grad_norm": 0.06897138795442613, + "language_loss": 0.75032014, + "learning_rate": 0.0007504495377068543, + "loss": 0.76164567, + "num_input_tokens_seen": 152064864, + "router_z_loss_mlp": 0.43481445, + "step": 1835, + "time_per_iteration": 2.7309370040893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134622, + "balance_loss_mlp": 1.08972788, + "epoch": 0.3532127741439015, + "flos": 652990450176.0, + "grad_norm": 0.09099083327189633, + "language_loss": 0.81936944, + "learning_rate": 0.0007501798474627642, + "loss": 0.8307156, + "num_input_tokens_seen": 152150096, + "router_z_loss_mlp": 0.44873047, + "step": 1836, + "time_per_iteration": 2.9126806259155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113171, + "balance_loss_mlp": 1.08853245, + "epoch": 0.35340515582916504, + "flos": 722791226880.0, + "grad_norm": 0.058808043239055564, + "language_loss": 0.8375026, + "learning_rate": 0.0007499100600902433, + "loss": 0.84881973, + "num_input_tokens_seen": 152232528, + "router_z_loss_mlp": 0.43164062, + "step": 1837, + "time_per_iteration": 2.9810633659362793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124171, + "balance_loss_mlp": 1.08118403, + "epoch": 0.35359753751442863, + "flos": 594894233088.0, + "grad_norm": 0.08552727697149294, + "language_loss": 0.8450433, + "learning_rate": 0.0007496401756940324, + "loss": 0.85628498, + "num_input_tokens_seen": 152299584, + "router_z_loss_mlp": 0.43017578, + "step": 1838, + "time_per_iteration": 2.670412540435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130914, + "balance_loss_mlp": 1.08897638, + "epoch": 0.3537899191996922, + "flos": 632668174848.0, + "grad_norm": 0.06964876492363449, + "language_loss": 0.82608843, + "learning_rate": 0.0007493701943789098, + "loss": 0.83739758, + "num_input_tokens_seen": 152370368, + "router_z_loss_mlp": 0.41967773, + "step": 1839, + "time_per_iteration": 2.772620677947998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113855, + "balance_loss_mlp": 1.09537208, + "epoch": 0.35398230088495575, + "flos": 506364701184.0, + "grad_norm": 0.07045943234490067, + "language_loss": 0.83116889, + "learning_rate": 0.000749100116249692, + "loss": 0.84255433, + "num_input_tokens_seen": 152436928, + "router_z_loss_mlp": 0.43188477, + "step": 1840, + "time_per_iteration": 2.6031582355499268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144616, + "balance_loss_mlp": 1.10110414, + "epoch": 0.35417468257021933, + "flos": 508034944512.0, + "grad_norm": 0.08424265710124153, + "language_loss": 0.86582088, + "learning_rate": 0.0007488299414112321, + "loss": 0.87726706, + "num_input_tokens_seen": 152505952, + "router_z_loss_mlp": 0.43505859, + "step": 1841, + "time_per_iteration": 2.5864784717559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151277, + "balance_loss_mlp": 1.10726476, + "epoch": 0.35436706425548287, + "flos": 656437395456.0, + "grad_norm": 0.058600000923872894, + "language_loss": 0.77847576, + "learning_rate": 0.0007485596699684215, + "loss": 0.78998852, + "num_input_tokens_seen": 152577408, + "router_z_loss_mlp": 0.43994141, + "step": 1842, + "time_per_iteration": 2.8149642944335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156484, + "balance_loss_mlp": 1.11266279, + "epoch": 0.35455944594074645, + "flos": 652634744832.0, + "grad_norm": 0.055073821734726955, + "language_loss": 0.85694617, + "learning_rate": 0.000748289302026189, + "loss": 0.86851102, + "num_input_tokens_seen": 152654480, + "router_z_loss_mlp": 0.43823242, + "step": 1843, + "time_per_iteration": 2.8475751876831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158372, + "balance_loss_mlp": 1.11688685, + "epoch": 0.35475182762601, + "flos": 848593262592.0, + "grad_norm": 0.057565803102883874, + "language_loss": 0.85718876, + "learning_rate": 0.0007480188376895004, + "loss": 0.86877251, + "num_input_tokens_seen": 152732304, + "router_z_loss_mlp": 0.41479492, + "step": 1844, + "time_per_iteration": 3.0344529151916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140478, + "balance_loss_mlp": 1.12693632, + "epoch": 0.3549442093112736, + "flos": 1521468043776.0, + "grad_norm": 0.05127204690943662, + "language_loss": 0.7381134, + "learning_rate": 0.0007477482770633596, + "loss": 0.74951822, + "num_input_tokens_seen": 152965952, + "router_z_loss_mlp": 0.13574219, + "step": 1845, + "time_per_iteration": 4.8589537143707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176931, + "balance_loss_mlp": 1.13518405, + "epoch": 0.3551365909965371, + "flos": 651411611136.0, + "grad_norm": 0.08988090291235612, + "language_loss": 0.78641856, + "learning_rate": 0.0007474776202528074, + "loss": 0.79818785, + "num_input_tokens_seen": 153053088, + "router_z_loss_mlp": 0.41772461, + "step": 1846, + "time_per_iteration": 2.9269866943359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184559, + "balance_loss_mlp": 1.14243031, + "epoch": 0.3553289726818007, + "flos": 897458724864.0, + "grad_norm": 0.08000045078310114, + "language_loss": 0.81513619, + "learning_rate": 0.000747206867362922, + "loss": 0.82698178, + "num_input_tokens_seen": 153129216, + "router_z_loss_mlp": 0.42114258, + "step": 1847, + "time_per_iteration": 3.067870616912842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169442, + "balance_loss_mlp": 1.12573957, + "epoch": 0.3555213543670643, + "flos": 688491394560.0, + "grad_norm": 0.0760432300690223, + "language_loss": 0.84328806, + "learning_rate": 0.0007469360184988194, + "loss": 0.85498255, + "num_input_tokens_seen": 153199360, + "router_z_loss_mlp": 0.43701172, + "step": 1848, + "time_per_iteration": 2.8130369186401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159569, + "balance_loss_mlp": 1.11837053, + "epoch": 0.3557137360523278, + "flos": 538564432896.0, + "grad_norm": 0.08168000095068725, + "language_loss": 0.86707914, + "learning_rate": 0.0007466650737656518, + "loss": 0.87867486, + "num_input_tokens_seen": 153269168, + "router_z_loss_mlp": 0.41162109, + "step": 1849, + "time_per_iteration": 2.592503309249878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115621, + "balance_loss_mlp": 1.11324644, + "epoch": 0.3559061177375914, + "flos": 402261520896.0, + "grad_norm": 0.06757272046168854, + "language_loss": 0.89898217, + "learning_rate": 0.0007463940332686098, + "loss": 0.91054422, + "num_input_tokens_seen": 153333120, + "router_z_loss_mlp": 0.42944336, + "step": 1850, + "time_per_iteration": 2.4776744842529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148398, + "balance_loss_mlp": 1.10607898, + "epoch": 0.35609849942285493, + "flos": 696568398336.0, + "grad_norm": 0.05922624538442341, + "language_loss": 0.84461212, + "learning_rate": 0.0007461228971129205, + "loss": 0.85609609, + "num_input_tokens_seen": 153407600, + "router_z_loss_mlp": 0.42358398, + "step": 1851, + "time_per_iteration": 2.9012656211853027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154694, + "balance_loss_mlp": 1.11387658, + "epoch": 0.3562908811081185, + "flos": 568928365056.0, + "grad_norm": 0.058626739978073765, + "language_loss": 0.85743707, + "learning_rate": 0.0007458516654038483, + "loss": 0.86898398, + "num_input_tokens_seen": 153477408, + "router_z_loss_mlp": 0.40820312, + "step": 1852, + "time_per_iteration": 2.666947603225708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165665, + "balance_loss_mlp": 1.12160563, + "epoch": 0.35648326279338205, + "flos": 682386011136.0, + "grad_norm": 0.06798765543406252, + "language_loss": 0.86475062, + "learning_rate": 0.0007455803382466946, + "loss": 0.87640727, + "num_input_tokens_seen": 153551888, + "router_z_loss_mlp": 0.44042969, + "step": 1853, + "time_per_iteration": 2.804776191711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162987, + "balance_loss_mlp": 1.11985719, + "epoch": 0.35667564447864564, + "flos": 629139737088.0, + "grad_norm": 0.07311152518110202, + "language_loss": 0.87308323, + "learning_rate": 0.0007453089157467979, + "loss": 0.88471317, + "num_input_tokens_seen": 153626912, + "router_z_loss_mlp": 0.43139648, + "step": 1854, + "time_per_iteration": 2.8038864135742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159292, + "balance_loss_mlp": 1.1161381, + "epoch": 0.35686802616390917, + "flos": 814048579584.0, + "grad_norm": 0.06621845487790666, + "language_loss": 0.82129812, + "learning_rate": 0.0007450373980095341, + "loss": 0.83289105, + "num_input_tokens_seen": 153711312, + "router_z_loss_mlp": 0.43164062, + "step": 1855, + "time_per_iteration": 3.0980496406555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154286, + "balance_loss_mlp": 1.11268187, + "epoch": 0.35706040784917276, + "flos": 526178198016.0, + "grad_norm": 0.05908088829108725, + "language_loss": 0.87076378, + "learning_rate": 0.0007447657851403155, + "loss": 0.88230669, + "num_input_tokens_seen": 153780208, + "router_z_loss_mlp": 0.41601562, + "step": 1856, + "time_per_iteration": 2.6393351554870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148054, + "balance_loss_mlp": 1.10609269, + "epoch": 0.35725278953443634, + "flos": 511970844672.0, + "grad_norm": 0.07116077808597938, + "language_loss": 0.79415643, + "learning_rate": 0.0007444940772445915, + "loss": 0.805637, + "num_input_tokens_seen": 153853152, + "router_z_loss_mlp": 0.41943359, + "step": 1857, + "time_per_iteration": 2.7049038410186768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148544, + "balance_loss_mlp": 1.10770321, + "epoch": 0.3574451712196999, + "flos": 487428171264.0, + "grad_norm": 0.06303496934817837, + "language_loss": 0.80443203, + "learning_rate": 0.0007442222744278484, + "loss": 0.81591749, + "num_input_tokens_seen": 153924160, + "router_z_loss_mlp": 0.40844727, + "step": 1858, + "time_per_iteration": 2.6416029930114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140624, + "balance_loss_mlp": 1.10056937, + "epoch": 0.35763755290496346, + "flos": 550671312384.0, + "grad_norm": 0.06290523981550739, + "language_loss": 0.84690839, + "learning_rate": 0.0007439503767956099, + "loss": 0.85831463, + "num_input_tokens_seen": 153998688, + "router_z_loss_mlp": 0.40063477, + "step": 1859, + "time_per_iteration": 2.697295665740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095769, + "balance_loss_mlp": 1.08213139, + "epoch": 0.357829934590227, + "flos": 1504083561984.0, + "grad_norm": 0.02707100394521806, + "language_loss": 0.79671603, + "learning_rate": 0.0007436783844534352, + "loss": 0.80767375, + "num_input_tokens_seen": 154230960, + "router_z_loss_mlp": 0.13671875, + "step": 1860, + "time_per_iteration": 4.896381139755249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157881, + "balance_loss_mlp": 1.11744571, + "epoch": 0.3580223162754906, + "flos": 568695997440.0, + "grad_norm": 0.054355964588402354, + "language_loss": 0.86204398, + "learning_rate": 0.000743406297506922, + "loss": 0.87362283, + "num_input_tokens_seen": 154309104, + "router_z_loss_mlp": 0.40478516, + "step": 1861, + "time_per_iteration": 2.7121450901031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154988, + "balance_loss_mlp": 1.11362243, + "epoch": 0.3582146979607541, + "flos": 626473585152.0, + "grad_norm": 0.056412092641732435, + "language_loss": 0.8442747, + "learning_rate": 0.0007431341160617031, + "loss": 0.85582459, + "num_input_tokens_seen": 154387424, + "router_z_loss_mlp": 0.41381836, + "step": 1862, + "time_per_iteration": 2.902806520462036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172251, + "balance_loss_mlp": 1.13052833, + "epoch": 0.3584070796460177, + "flos": 507271403520.0, + "grad_norm": 0.06986467819319542, + "language_loss": 0.88734752, + "learning_rate": 0.0007428618402234491, + "loss": 0.89907002, + "num_input_tokens_seen": 154459952, + "router_z_loss_mlp": 0.41723633, + "step": 1863, + "time_per_iteration": 2.644352436065674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159657, + "balance_loss_mlp": 1.11831546, + "epoch": 0.3585994613312813, + "flos": 606479851008.0, + "grad_norm": 0.06293448628505635, + "language_loss": 0.8061077, + "learning_rate": 0.0007425894700978668, + "loss": 0.81770432, + "num_input_tokens_seen": 154535456, + "router_z_loss_mlp": 0.41357422, + "step": 1864, + "time_per_iteration": 2.782757043838501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148168, + "balance_loss_mlp": 1.10699308, + "epoch": 0.3587918430165448, + "flos": 1412886799872.0, + "grad_norm": 0.056888458094662434, + "language_loss": 0.79858804, + "learning_rate": 0.0007423170057906996, + "loss": 0.81006974, + "num_input_tokens_seen": 154627568, + "router_z_loss_mlp": 0.41162109, + "step": 1865, + "time_per_iteration": 3.848773956298828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133926, + "balance_loss_mlp": 1.09391952, + "epoch": 0.3589842247018084, + "flos": 478553121792.0, + "grad_norm": 0.06447904861600703, + "language_loss": 0.86500657, + "learning_rate": 0.0007420444474077275, + "loss": 0.87634581, + "num_input_tokens_seen": 154694640, + "router_z_loss_mlp": 0.40014648, + "step": 1866, + "time_per_iteration": 2.542572498321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126566, + "balance_loss_mlp": 1.0855341, + "epoch": 0.35917660638707194, + "flos": 504711710208.0, + "grad_norm": 0.07300351460408123, + "language_loss": 0.8986578, + "learning_rate": 0.0007417717950547671, + "loss": 0.90992349, + "num_input_tokens_seen": 154762048, + "router_z_loss_mlp": 0.41040039, + "step": 1867, + "time_per_iteration": 2.5633254051208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073925, + "balance_loss_mlp": 1.06143153, + "epoch": 0.3593689880723355, + "flos": 1492129382400.0, + "grad_norm": 0.026482390846264015, + "language_loss": 0.75996608, + "learning_rate": 0.0007414990488376713, + "loss": 0.77070534, + "num_input_tokens_seen": 154989952, + "router_z_loss_mlp": 0.125, + "step": 1868, + "time_per_iteration": 4.904905557632446 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111694, + "balance_loss_mlp": 1.07345176, + "epoch": 0.35956136975759906, + "flos": 528629234688.0, + "grad_norm": 0.053992922509511466, + "language_loss": 0.850173, + "learning_rate": 0.0007412262088623299, + "loss": 0.86128998, + "num_input_tokens_seen": 155066992, + "router_z_loss_mlp": 0.38232422, + "step": 1869, + "time_per_iteration": 2.7310874462127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110773, + "balance_loss_mlp": 1.07200575, + "epoch": 0.35975375144286265, + "flos": 534917426688.0, + "grad_norm": 0.08370102618564679, + "language_loss": 0.79675972, + "learning_rate": 0.0007409532752346684, + "loss": 0.80786741, + "num_input_tokens_seen": 155137616, + "router_z_loss_mlp": 0.38769531, + "step": 1870, + "time_per_iteration": 2.6629347801208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110981, + "balance_loss_mlp": 1.07166612, + "epoch": 0.3599461331281262, + "flos": 504941506560.0, + "grad_norm": 0.06403903481871269, + "language_loss": 0.88829064, + "learning_rate": 0.0007406802480606491, + "loss": 0.89940047, + "num_input_tokens_seen": 155209248, + "router_z_loss_mlp": 0.39306641, + "step": 1871, + "time_per_iteration": 2.6200008392333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107121, + "balance_loss_mlp": 1.06835461, + "epoch": 0.36013851481338977, + "flos": 511533646848.0, + "grad_norm": 0.0729370697679506, + "language_loss": 0.90798759, + "learning_rate": 0.0007404071274462707, + "loss": 0.9190588, + "num_input_tokens_seen": 155274176, + "router_z_loss_mlp": 0.38769531, + "step": 1872, + "time_per_iteration": 2.5693628787994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111805, + "balance_loss_mlp": 1.07978415, + "epoch": 0.36033089649865335, + "flos": 547590357504.0, + "grad_norm": 0.06627703814726228, + "language_loss": 0.84024733, + "learning_rate": 0.0007401339134975682, + "loss": 0.85142779, + "num_input_tokens_seen": 155343232, + "router_z_loss_mlp": 0.38208008, + "step": 1873, + "time_per_iteration": 2.7031140327453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127585, + "balance_loss_mlp": 1.08760262, + "epoch": 0.3605232781839169, + "flos": 458655561216.0, + "grad_norm": 0.06845959531373838, + "language_loss": 0.84298885, + "learning_rate": 0.0007398606063206122, + "loss": 0.85426462, + "num_input_tokens_seen": 155410080, + "router_z_loss_mlp": 0.39990234, + "step": 1874, + "time_per_iteration": 2.6090316772460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115185, + "balance_loss_mlp": 1.07598901, + "epoch": 0.36071565986918047, + "flos": 509559455232.0, + "grad_norm": 0.06521397848462201, + "language_loss": 0.78764814, + "learning_rate": 0.0007395872060215101, + "loss": 0.79879999, + "num_input_tokens_seen": 155476240, + "router_z_loss_mlp": 0.3918457, + "step": 1875, + "time_per_iteration": 2.620976448059082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122162, + "balance_loss_mlp": 1.0831089, + "epoch": 0.360908041554444, + "flos": 559195799040.0, + "grad_norm": 0.06345733178575377, + "language_loss": 0.88705117, + "learning_rate": 0.0007393137127064056, + "loss": 0.89827275, + "num_input_tokens_seen": 155543392, + "router_z_loss_mlp": 0.39013672, + "step": 1876, + "time_per_iteration": 2.7320597171783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125006, + "balance_loss_mlp": 1.08511841, + "epoch": 0.3611004232397076, + "flos": 523845729792.0, + "grad_norm": 0.056097062255587686, + "language_loss": 0.84576774, + "learning_rate": 0.0007390401264814779, + "loss": 0.85701776, + "num_input_tokens_seen": 155613264, + "router_z_loss_mlp": 0.39868164, + "step": 1877, + "time_per_iteration": 2.605865478515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123607, + "balance_loss_mlp": 1.08503079, + "epoch": 0.3612928049249711, + "flos": 540988305408.0, + "grad_norm": 0.06159732683880817, + "language_loss": 0.84937686, + "learning_rate": 0.0007387664474529427, + "loss": 0.86061299, + "num_input_tokens_seen": 155683712, + "router_z_loss_mlp": 0.38598633, + "step": 1878, + "time_per_iteration": 2.6548514366149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138034, + "balance_loss_mlp": 1.09750319, + "epoch": 0.3614851866102347, + "flos": 552556670976.0, + "grad_norm": 0.05796680079252983, + "language_loss": 0.91768891, + "learning_rate": 0.0007384926757270518, + "loss": 0.92906928, + "num_input_tokens_seen": 155751760, + "router_z_loss_mlp": 0.40527344, + "step": 1879, + "time_per_iteration": 2.6339149475097656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137039, + "balance_loss_mlp": 1.09791493, + "epoch": 0.36167756829549824, + "flos": 772071865344.0, + "grad_norm": 0.05405313293747941, + "language_loss": 0.79881001, + "learning_rate": 0.0007382188114100924, + "loss": 0.81018037, + "num_input_tokens_seen": 155830464, + "router_z_loss_mlp": 0.39111328, + "step": 1880, + "time_per_iteration": 2.983384132385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139197, + "balance_loss_mlp": 1.09964395, + "epoch": 0.36186994998076183, + "flos": 711885086208.0, + "grad_norm": 0.12141150358978081, + "language_loss": 0.82206392, + "learning_rate": 0.0007379448546083884, + "loss": 0.83345592, + "num_input_tokens_seen": 155906208, + "router_z_loss_mlp": 0.39575195, + "step": 1881, + "time_per_iteration": 2.9186532497406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140707, + "balance_loss_mlp": 1.10127282, + "epoch": 0.3620623316660254, + "flos": 747546444288.0, + "grad_norm": 0.06284373597557333, + "language_loss": 0.88377333, + "learning_rate": 0.0007376708054282992, + "loss": 0.8951804, + "num_input_tokens_seen": 155983584, + "router_z_loss_mlp": 0.39428711, + "step": 1882, + "time_per_iteration": 2.9895970821380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144635, + "balance_loss_mlp": 1.10605919, + "epoch": 0.36225471335128895, + "flos": 482555833344.0, + "grad_norm": 0.05224621202588268, + "language_loss": 0.84316945, + "learning_rate": 0.0007373966639762201, + "loss": 0.85461575, + "num_input_tokens_seen": 156052464, + "router_z_loss_mlp": 0.38574219, + "step": 1883, + "time_per_iteration": 2.623133659362793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147786, + "balance_loss_mlp": 1.10620606, + "epoch": 0.36244709503655254, + "flos": 506905786368.0, + "grad_norm": 0.06751899300287477, + "language_loss": 0.89170045, + "learning_rate": 0.0007371224303585822, + "loss": 0.90317833, + "num_input_tokens_seen": 156121424, + "router_z_loss_mlp": 0.41577148, + "step": 1884, + "time_per_iteration": 2.628394842147827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021984, + "balance_loss_mlp": 1.01154125, + "epoch": 0.36263947672181607, + "flos": 1394050466304.0, + "grad_norm": 0.007236456832270123, + "language_loss": 0.80357069, + "learning_rate": 0.0007368481046818524, + "loss": 0.8137905, + "num_input_tokens_seen": 156346144, + "router_z_loss_mlp": 0.10449219, + "step": 1885, + "time_per_iteration": 4.717620849609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114112, + "balance_loss_mlp": 1.10049307, + "epoch": 0.36283185840707965, + "flos": 653296969728.0, + "grad_norm": 0.057116908748179596, + "language_loss": 0.82560247, + "learning_rate": 0.0007365736870525335, + "loss": 0.83701366, + "num_input_tokens_seen": 156420880, + "router_z_loss_mlp": 0.40625, + "step": 1886, + "time_per_iteration": 2.8198611736297607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132227, + "balance_loss_mlp": 1.09310222, + "epoch": 0.3630242400923432, + "flos": 488863848960.0, + "grad_norm": 0.06530442713985495, + "language_loss": 0.83123338, + "learning_rate": 0.000736299177577164, + "loss": 0.84255564, + "num_input_tokens_seen": 156485616, + "router_z_loss_mlp": 0.39135742, + "step": 1887, + "time_per_iteration": 2.613863945007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128864, + "balance_loss_mlp": 1.08992994, + "epoch": 0.3632166217776068, + "flos": 517159613952.0, + "grad_norm": 0.0666501464088242, + "language_loss": 0.84363097, + "learning_rate": 0.0007360245763623174, + "loss": 0.85491955, + "num_input_tokens_seen": 156557840, + "router_z_loss_mlp": 0.3894043, + "step": 1888, + "time_per_iteration": 2.6378068923950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115221, + "balance_loss_mlp": 1.07702661, + "epoch": 0.36340900346287036, + "flos": 646173656064.0, + "grad_norm": 0.06993226621121658, + "language_loss": 0.90142351, + "learning_rate": 0.0007357498835146039, + "loss": 0.91257572, + "num_input_tokens_seen": 156632496, + "router_z_loss_mlp": 0.38183594, + "step": 1889, + "time_per_iteration": 2.8125081062316895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128167, + "balance_loss_mlp": 1.08878016, + "epoch": 0.3636013851481339, + "flos": 553327552512.0, + "grad_norm": 0.07359030033413445, + "language_loss": 0.87316656, + "learning_rate": 0.0007354750991406684, + "loss": 0.88444823, + "num_input_tokens_seen": 156705296, + "router_z_loss_mlp": 0.39379883, + "step": 1890, + "time_per_iteration": 2.714569568634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121285, + "balance_loss_mlp": 1.0807066, + "epoch": 0.3637937668333975, + "flos": 546653919744.0, + "grad_norm": 0.07836036923074335, + "language_loss": 0.80991101, + "learning_rate": 0.0007352002233471919, + "loss": 0.8211239, + "num_input_tokens_seen": 156773376, + "router_z_loss_mlp": 0.40576172, + "step": 1891, + "time_per_iteration": 2.6287412643432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121974, + "balance_loss_mlp": 1.08180022, + "epoch": 0.363986148518661, + "flos": 538112180736.0, + "grad_norm": 0.058839902089765785, + "language_loss": 0.79524523, + "learning_rate": 0.0007349252562408906, + "loss": 0.80646491, + "num_input_tokens_seen": 156844336, + "router_z_loss_mlp": 0.40161133, + "step": 1892, + "time_per_iteration": 2.669903039932251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125098, + "balance_loss_mlp": 1.08449531, + "epoch": 0.3641785302039246, + "flos": 660217651200.0, + "grad_norm": 0.057079030651025625, + "language_loss": 0.81590033, + "learning_rate": 0.0007346501979285158, + "loss": 0.8271513, + "num_input_tokens_seen": 156918848, + "router_z_loss_mlp": 0.40600586, + "step": 1893, + "time_per_iteration": 2.9146764278411865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083448, + "balance_loss_mlp": 1.07238543, + "epoch": 0.36437091188918813, + "flos": 1468743031296.0, + "grad_norm": 0.036364529291757694, + "language_loss": 0.80539101, + "learning_rate": 0.0007343750485168551, + "loss": 0.81622547, + "num_input_tokens_seen": 157134736, + "router_z_loss_mlp": 0.11083984, + "step": 1894, + "time_per_iteration": 4.784435272216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126267, + "balance_loss_mlp": 1.08444858, + "epoch": 0.3645632935744517, + "flos": 597298281984.0, + "grad_norm": 0.06549610472034906, + "language_loss": 0.86352968, + "learning_rate": 0.0007340998081127308, + "loss": 0.87479234, + "num_input_tokens_seen": 157211920, + "router_z_loss_mlp": 0.41796875, + "step": 1895, + "time_per_iteration": 2.7702367305755615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130662, + "balance_loss_mlp": 1.09113181, + "epoch": 0.36475567525971525, + "flos": 599509610496.0, + "grad_norm": 0.06520113052193731, + "language_loss": 0.91046786, + "learning_rate": 0.0007338244768230007, + "loss": 0.92177445, + "num_input_tokens_seen": 157284224, + "router_z_loss_mlp": 0.39550781, + "step": 1896, + "time_per_iteration": 2.7612760066986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133468, + "balance_loss_mlp": 1.09315181, + "epoch": 0.36494805694497884, + "flos": 798403350528.0, + "grad_norm": 0.058734972315737245, + "language_loss": 0.89108521, + "learning_rate": 0.0007335490547545578, + "loss": 0.90241992, + "num_input_tokens_seen": 157367920, + "router_z_loss_mlp": 0.40307617, + "step": 1897, + "time_per_iteration": 3.024462938308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135084, + "balance_loss_mlp": 1.09343266, + "epoch": 0.3651404386302424, + "flos": 637313287680.0, + "grad_norm": 0.06208128991116815, + "language_loss": 0.82833707, + "learning_rate": 0.0007332735420143308, + "loss": 0.83968788, + "num_input_tokens_seen": 157438672, + "router_z_loss_mlp": 0.41650391, + "step": 1898, + "time_per_iteration": 2.725468158721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112873, + "balance_loss_mlp": 1.08669686, + "epoch": 0.36533282031550596, + "flos": 491581757952.0, + "grad_norm": 0.09645190116324148, + "language_loss": 0.86573303, + "learning_rate": 0.0007329979387092826, + "loss": 0.8770203, + "num_input_tokens_seen": 157505888, + "router_z_loss_mlp": 0.42016602, + "step": 1899, + "time_per_iteration": 2.6357531547546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133626, + "balance_loss_mlp": 1.09259379, + "epoch": 0.36552520200076954, + "flos": 855970965504.0, + "grad_norm": 0.06150604002201611, + "language_loss": 0.84294677, + "learning_rate": 0.0007327222449464124, + "loss": 0.85428298, + "num_input_tokens_seen": 157601568, + "router_z_loss_mlp": 0.41040039, + "step": 1900, + "time_per_iteration": 3.2381174564361572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136855, + "balance_loss_mlp": 1.09382069, + "epoch": 0.3657175836860331, + "flos": 483702243840.0, + "grad_norm": 0.07567830151973255, + "language_loss": 0.89052904, + "learning_rate": 0.0007324464608327538, + "loss": 0.90189761, + "num_input_tokens_seen": 157670992, + "router_z_loss_mlp": 0.4309082, + "step": 1901, + "time_per_iteration": 2.597569227218628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150007, + "balance_loss_mlp": 1.10814035, + "epoch": 0.36590996537129666, + "flos": 434792365056.0, + "grad_norm": 0.07712085030005716, + "language_loss": 0.88794601, + "learning_rate": 0.0007321705864753758, + "loss": 0.89944601, + "num_input_tokens_seen": 157743616, + "router_z_loss_mlp": 0.41870117, + "step": 1902, + "time_per_iteration": 2.6877686977386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151954, + "balance_loss_mlp": 1.11097002, + "epoch": 0.3661023470565602, + "flos": 712206286848.0, + "grad_norm": 0.05591922142148154, + "language_loss": 0.84586883, + "learning_rate": 0.0007318946219813823, + "loss": 0.85738844, + "num_input_tokens_seen": 157823520, + "router_z_loss_mlp": 0.40991211, + "step": 1903, + "time_per_iteration": 3.0283257961273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151611, + "balance_loss_mlp": 1.11341679, + "epoch": 0.3662947287418238, + "flos": 564760097280.0, + "grad_norm": 0.0702623940180467, + "language_loss": 0.90117764, + "learning_rate": 0.000731618567457912, + "loss": 0.91269374, + "num_input_tokens_seen": 157893248, + "router_z_loss_mlp": 0.38208008, + "step": 1904, + "time_per_iteration": 2.651491165161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114788, + "balance_loss_mlp": 1.10522676, + "epoch": 0.3664871104270873, + "flos": 789752954880.0, + "grad_norm": 0.07047012066076976, + "language_loss": 0.87036794, + "learning_rate": 0.000731342423012139, + "loss": 0.88184673, + "num_input_tokens_seen": 157973216, + "router_z_loss_mlp": 0.42700195, + "step": 1905, + "time_per_iteration": 3.0361618995666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143776, + "balance_loss_mlp": 1.10331631, + "epoch": 0.3666794921123509, + "flos": 752557174272.0, + "grad_norm": 0.06969182334255739, + "language_loss": 0.82982039, + "learning_rate": 0.0007310661887512722, + "loss": 0.84125817, + "num_input_tokens_seen": 158051088, + "router_z_loss_mlp": 0.40478516, + "step": 1906, + "time_per_iteration": 3.020333766937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134716, + "balance_loss_mlp": 1.09592557, + "epoch": 0.3668718737976145, + "flos": 523531869696.0, + "grad_norm": 0.056548054453958524, + "language_loss": 0.82503444, + "learning_rate": 0.0007307898647825549, + "loss": 0.83638155, + "num_input_tokens_seen": 158124368, + "router_z_loss_mlp": 0.38793945, + "step": 1907, + "time_per_iteration": 2.6819958686828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128623, + "balance_loss_mlp": 1.08568358, + "epoch": 0.367064255482878, + "flos": 571967474688.0, + "grad_norm": 0.0662764931561561, + "language_loss": 0.89910614, + "learning_rate": 0.0007305134512132659, + "loss": 0.9103924, + "num_input_tokens_seen": 158191472, + "router_z_loss_mlp": 0.42944336, + "step": 1908, + "time_per_iteration": 2.688716411590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120758, + "balance_loss_mlp": 1.08063269, + "epoch": 0.3672566371681416, + "flos": 447114359808.0, + "grad_norm": 0.07972147303822336, + "language_loss": 0.83329952, + "learning_rate": 0.0007302369481507183, + "loss": 0.8445071, + "num_input_tokens_seen": 158254384, + "router_z_loss_mlp": 0.40136719, + "step": 1909, + "time_per_iteration": 2.520551919937134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043123, + "balance_loss_mlp": 1.03272831, + "epoch": 0.36744901885340514, + "flos": 1540090713600.0, + "grad_norm": 0.028970701382128577, + "language_loss": 0.79961759, + "learning_rate": 0.00072996035570226, + "loss": 0.81004882, + "num_input_tokens_seen": 158486160, + "router_z_loss_mlp": 0.10400391, + "step": 1910, + "time_per_iteration": 4.862990140914917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111533, + "balance_loss_mlp": 1.07534695, + "epoch": 0.36764140053866873, + "flos": 563685267456.0, + "grad_norm": 0.0535153553246422, + "language_loss": 0.85860741, + "learning_rate": 0.000729683673975274, + "loss": 0.86976075, + "num_input_tokens_seen": 158555616, + "router_z_loss_mlp": 0.3996582, + "step": 1911, + "time_per_iteration": 2.6834514141082764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117796, + "balance_loss_mlp": 1.07783747, + "epoch": 0.36783378222393226, + "flos": 1216663981056.0, + "grad_norm": 0.07394300555179863, + "language_loss": 0.83108044, + "learning_rate": 0.0007294069030771774, + "loss": 0.84225845, + "num_input_tokens_seen": 158653984, + "router_z_loss_mlp": 0.39941406, + "step": 1912, + "time_per_iteration": 3.6458523273468018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124936, + "balance_loss_mlp": 1.08483398, + "epoch": 0.36802616390919585, + "flos": 498724895232.0, + "grad_norm": 0.05916806609098389, + "language_loss": 0.90897858, + "learning_rate": 0.0007291300431154224, + "loss": 0.920228, + "num_input_tokens_seen": 158719728, + "router_z_loss_mlp": 0.40112305, + "step": 1913, + "time_per_iteration": 2.5737557411193848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0103288, + "balance_loss_mlp": 1.02157927, + "epoch": 0.36821854559445943, + "flos": 1582146349056.0, + "grad_norm": 0.013681752942923219, + "language_loss": 0.70389736, + "learning_rate": 0.0007288530941974955, + "loss": 0.71422619, + "num_input_tokens_seen": 158952544, + "router_z_loss_mlp": 0.11279297, + "step": 1914, + "time_per_iteration": 5.031456232070923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113953, + "balance_loss_mlp": 1.07499564, + "epoch": 0.36841092727972297, + "flos": 835626295296.0, + "grad_norm": 0.06158754254944219, + "language_loss": 0.79961407, + "learning_rate": 0.0007285760564309179, + "loss": 0.81075364, + "num_input_tokens_seen": 159039680, + "router_z_loss_mlp": 0.38964844, + "step": 1915, + "time_per_iteration": 3.152339458465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122924, + "balance_loss_mlp": 1.08346629, + "epoch": 0.36860330896498655, + "flos": 689855118336.0, + "grad_norm": 0.10197178679971165, + "language_loss": 0.85308397, + "learning_rate": 0.0007282989299232448, + "loss": 0.86431319, + "num_input_tokens_seen": 159128128, + "router_z_loss_mlp": 0.39453125, + "step": 1916, + "time_per_iteration": 3.0152268409729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119949, + "balance_loss_mlp": 1.08013296, + "epoch": 0.3687956906502501, + "flos": 554182497792.0, + "grad_norm": 0.05980283450468872, + "language_loss": 0.8385278, + "learning_rate": 0.0007280217147820668, + "loss": 0.84972733, + "num_input_tokens_seen": 159193248, + "router_z_loss_mlp": 0.39794922, + "step": 1917, + "time_per_iteration": 2.625802755355835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114962, + "balance_loss_mlp": 1.07512259, + "epoch": 0.3689880723355137, + "flos": 576703991808.0, + "grad_norm": 0.06755957483710798, + "language_loss": 0.79489267, + "learning_rate": 0.0007277444111150079, + "loss": 0.80604231, + "num_input_tokens_seen": 159265824, + "router_z_loss_mlp": 0.3984375, + "step": 1918, + "time_per_iteration": 2.6753525733947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112846, + "balance_loss_mlp": 1.08785725, + "epoch": 0.3691804540207772, + "flos": 528868942848.0, + "grad_norm": 0.07157808177363079, + "language_loss": 0.84730321, + "learning_rate": 0.0007274670190297272, + "loss": 0.8585878, + "num_input_tokens_seen": 159332992, + "router_z_loss_mlp": 0.40576172, + "step": 1919, + "time_per_iteration": 2.6149959564208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142135, + "balance_loss_mlp": 1.09986341, + "epoch": 0.3693728357060408, + "flos": 561019115520.0, + "grad_norm": 0.05944559747374387, + "language_loss": 0.8264004, + "learning_rate": 0.0007271895386339179, + "loss": 0.83782172, + "num_input_tokens_seen": 159409808, + "router_z_loss_mlp": 0.42285156, + "step": 1920, + "time_per_iteration": 2.7611513137817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140712, + "balance_loss_mlp": 1.09970427, + "epoch": 0.3695652173913043, + "flos": 579770265600.0, + "grad_norm": 0.059089751588204814, + "language_loss": 0.83542717, + "learning_rate": 0.0007269119700353073, + "loss": 0.8468343, + "num_input_tokens_seen": 159486128, + "router_z_loss_mlp": 0.41015625, + "step": 1921, + "time_per_iteration": 2.782167911529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148229, + "balance_loss_mlp": 1.10738814, + "epoch": 0.3697575990765679, + "flos": 512914622976.0, + "grad_norm": 0.06644949508392005, + "language_loss": 0.85268104, + "learning_rate": 0.0007266343133416571, + "loss": 0.8641634, + "num_input_tokens_seen": 159562224, + "router_z_loss_mlp": 0.40844727, + "step": 1922, + "time_per_iteration": 2.7218997478485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076086, + "balance_loss_mlp": 1.06340241, + "epoch": 0.3699499807618315, + "flos": 1570640025600.0, + "grad_norm": 0.03214674667569998, + "language_loss": 0.77116919, + "learning_rate": 0.0007263565686607632, + "loss": 0.78192997, + "num_input_tokens_seen": 159784768, + "router_z_loss_mlp": 0.12695312, + "step": 1923, + "time_per_iteration": 4.837427854537964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145902, + "balance_loss_mlp": 1.1028676, + "epoch": 0.37014236244709503, + "flos": 497338776576.0, + "grad_norm": 0.07518583721861193, + "language_loss": 0.84417462, + "learning_rate": 0.0007260787361004556, + "loss": 0.85563368, + "num_input_tokens_seen": 159848608, + "router_z_loss_mlp": 0.43041992, + "step": 1924, + "time_per_iteration": 2.5874598026275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050683, + "balance_loss_mlp": 1.03880954, + "epoch": 0.3703347441323586, + "flos": 1444368485376.0, + "grad_norm": 0.023888622594867324, + "language_loss": 0.73761505, + "learning_rate": 0.0007258008157685987, + "loss": 0.74812186, + "num_input_tokens_seen": 160080928, + "router_z_loss_mlp": 0.11865234, + "step": 1925, + "time_per_iteration": 4.961286544799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137865, + "balance_loss_mlp": 1.09571242, + "epoch": 0.37052712581762215, + "flos": 563601203712.0, + "grad_norm": 0.05584746966952834, + "language_loss": 0.87657702, + "learning_rate": 0.0007255228077730903, + "loss": 0.88795567, + "num_input_tokens_seen": 160148976, + "router_z_loss_mlp": 0.42163086, + "step": 1926, + "time_per_iteration": 2.663482666015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140781, + "balance_loss_mlp": 1.09786606, + "epoch": 0.37071950750288574, + "flos": 926078261760.0, + "grad_norm": 0.05562014185368244, + "language_loss": 0.81976974, + "learning_rate": 0.0007252447122218632, + "loss": 0.83117759, + "num_input_tokens_seen": 160233504, + "router_z_loss_mlp": 0.42919922, + "step": 1927, + "time_per_iteration": 3.1484758853912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138853, + "balance_loss_mlp": 1.09655809, + "epoch": 0.37091188918814927, + "flos": 418312014336.0, + "grad_norm": 0.06601877155853234, + "language_loss": 0.88791764, + "learning_rate": 0.0007249665292228834, + "loss": 0.89930612, + "num_input_tokens_seen": 160299696, + "router_z_loss_mlp": 0.4230957, + "step": 1928, + "time_per_iteration": 2.5840864181518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140563, + "balance_loss_mlp": 1.09872091, + "epoch": 0.37110427087341286, + "flos": 463182105600.0, + "grad_norm": 0.05314866644458525, + "language_loss": 0.83534646, + "learning_rate": 0.000724688258884151, + "loss": 0.84675211, + "num_input_tokens_seen": 160367904, + "router_z_loss_mlp": 0.41845703, + "step": 1929, + "time_per_iteration": 2.6063482761383057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129765, + "balance_loss_mlp": 1.09166527, + "epoch": 0.3712966525586764, + "flos": 849658180608.0, + "grad_norm": 0.06946275153671234, + "language_loss": 0.86767673, + "learning_rate": 0.0007244099013137002, + "loss": 0.87897444, + "num_input_tokens_seen": 160453600, + "router_z_loss_mlp": 0.38085938, + "step": 1930, + "time_per_iteration": 3.0539071559906006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127594, + "balance_loss_mlp": 1.0873971, + "epoch": 0.37148903424394, + "flos": 925954550784.0, + "grad_norm": 0.05696415350586704, + "language_loss": 0.89040637, + "learning_rate": 0.0007241314566195993, + "loss": 0.90168232, + "num_input_tokens_seen": 160543472, + "router_z_loss_mlp": 0.40185547, + "step": 1931, + "time_per_iteration": 3.2625389099121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111855, + "balance_loss_mlp": 1.07861531, + "epoch": 0.37168141592920356, + "flos": 519815854080.0, + "grad_norm": 0.08463017827171934, + "language_loss": 0.85909784, + "learning_rate": 0.0007238529249099496, + "loss": 0.87028337, + "num_input_tokens_seen": 160614016, + "router_z_loss_mlp": 0.39941406, + "step": 1932, + "time_per_iteration": 2.6740944385528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101582, + "balance_loss_mlp": 1.09080601, + "epoch": 0.3718737976144671, + "flos": 1445895567360.0, + "grad_norm": 0.046016525030599324, + "language_loss": 0.77856874, + "learning_rate": 0.0007235743062928872, + "loss": 0.78958464, + "num_input_tokens_seen": 160828640, + "router_z_loss_mlp": 0.10791016, + "step": 1933, + "time_per_iteration": 4.862685203552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125745, + "balance_loss_mlp": 1.08347321, + "epoch": 0.3720661792997307, + "flos": 759564490752.0, + "grad_norm": 0.10032321862894769, + "language_loss": 0.80747449, + "learning_rate": 0.000723295600876581, + "loss": 0.81873196, + "num_input_tokens_seen": 160913088, + "router_z_loss_mlp": 0.42285156, + "step": 1934, + "time_per_iteration": 2.990391969680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125218, + "balance_loss_mlp": 1.08406699, + "epoch": 0.3722585609849942, + "flos": 516956981760.0, + "grad_norm": 0.057414096803471676, + "language_loss": 0.87956464, + "learning_rate": 0.0007230168087692344, + "loss": 0.89081681, + "num_input_tokens_seen": 160982960, + "router_z_loss_mlp": 0.41162109, + "step": 1935, + "time_per_iteration": 2.656625270843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119009, + "balance_loss_mlp": 1.07924092, + "epoch": 0.3724509426702578, + "flos": 782464084992.0, + "grad_norm": 0.060205825913767164, + "language_loss": 0.82307911, + "learning_rate": 0.0007227379300790839, + "loss": 0.83426917, + "num_input_tokens_seen": 161066000, + "router_z_loss_mlp": 0.39770508, + "step": 1936, + "time_per_iteration": 2.997037649154663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114267, + "balance_loss_mlp": 1.07218599, + "epoch": 0.37264332435552133, + "flos": 391720997376.0, + "grad_norm": 0.06128365804507508, + "language_loss": 0.86067426, + "learning_rate": 0.0007224589649143997, + "loss": 0.87181687, + "num_input_tokens_seen": 161131040, + "router_z_loss_mlp": 0.4206543, + "step": 1937, + "time_per_iteration": 2.5290677547454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124508, + "balance_loss_mlp": 1.08228397, + "epoch": 0.3728357060407849, + "flos": 542861180928.0, + "grad_norm": 0.06605047879793914, + "language_loss": 0.81297445, + "learning_rate": 0.0007221799133834861, + "loss": 0.82421947, + "num_input_tokens_seen": 161201248, + "router_z_loss_mlp": 0.42236328, + "step": 1938, + "time_per_iteration": 2.613140106201172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122203, + "balance_loss_mlp": 1.08195794, + "epoch": 0.3730280877260485, + "flos": 433571802624.0, + "grad_norm": 0.09318016716004435, + "language_loss": 0.8198092, + "learning_rate": 0.00072190077559468, + "loss": 0.83103126, + "num_input_tokens_seen": 161266288, + "router_z_loss_mlp": 0.40209961, + "step": 1939, + "time_per_iteration": 2.517237424850464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115818, + "balance_loss_mlp": 1.07578754, + "epoch": 0.37322046941131204, + "flos": 531485535744.0, + "grad_norm": 0.0553068133661429, + "language_loss": 0.8932575, + "learning_rate": 0.0007216215516563527, + "loss": 0.90441567, + "num_input_tokens_seen": 161335648, + "router_z_loss_mlp": 0.40014648, + "step": 1940, + "time_per_iteration": 2.7175915241241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111901, + "balance_loss_mlp": 1.07089305, + "epoch": 0.3734128510965756, + "flos": 531549775872.0, + "grad_norm": 0.06982995582267476, + "language_loss": 0.83827746, + "learning_rate": 0.0007213422416769083, + "loss": 0.84939647, + "num_input_tokens_seen": 161403440, + "router_z_loss_mlp": 0.41015625, + "step": 1941, + "time_per_iteration": 2.5922279357910156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116664, + "balance_loss_mlp": 1.07684803, + "epoch": 0.37360523278183916, + "flos": 500442126336.0, + "grad_norm": 0.050249137281424494, + "language_loss": 0.75479639, + "learning_rate": 0.0007210628457647849, + "loss": 0.76596296, + "num_input_tokens_seen": 161472864, + "router_z_loss_mlp": 0.39819336, + "step": 1942, + "time_per_iteration": 2.583151340484619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118751, + "balance_loss_mlp": 1.07781446, + "epoch": 0.37379761446710275, + "flos": 547943491584.0, + "grad_norm": 0.0794488438004998, + "language_loss": 0.79022861, + "learning_rate": 0.000720783364028453, + "loss": 0.8014161, + "num_input_tokens_seen": 161548096, + "router_z_loss_mlp": 0.40942383, + "step": 1943, + "time_per_iteration": 2.7737677097320557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114071, + "balance_loss_mlp": 1.07418346, + "epoch": 0.3739899961523663, + "flos": 475761060864.0, + "grad_norm": 0.05694655733140731, + "language_loss": 0.87941283, + "learning_rate": 0.0007205037965764177, + "loss": 0.89055347, + "num_input_tokens_seen": 161615600, + "router_z_loss_mlp": 0.39868164, + "step": 1944, + "time_per_iteration": 2.558089256286621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123121, + "balance_loss_mlp": 1.08430672, + "epoch": 0.37418237783762986, + "flos": 611915668992.0, + "grad_norm": 0.07621334150317126, + "language_loss": 0.85730159, + "learning_rate": 0.0007202241435172161, + "loss": 0.86853278, + "num_input_tokens_seen": 161687408, + "router_z_loss_mlp": 0.38769531, + "step": 1945, + "time_per_iteration": 2.7602779865264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125439, + "balance_loss_mlp": 1.08574176, + "epoch": 0.3743747595228934, + "flos": 766287682560.0, + "grad_norm": 0.07927003262790512, + "language_loss": 0.88465476, + "learning_rate": 0.0007199444049594198, + "loss": 0.89590919, + "num_input_tokens_seen": 161764224, + "router_z_loss_mlp": 0.39697266, + "step": 1946, + "time_per_iteration": 2.9583580493927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119027, + "balance_loss_mlp": 1.07665968, + "epoch": 0.374567141208157, + "flos": 524394155520.0, + "grad_norm": 0.055396154938164174, + "language_loss": 0.8346498, + "learning_rate": 0.0007196645810116322, + "loss": 0.8458401, + "num_input_tokens_seen": 161835520, + "router_z_loss_mlp": 0.42382812, + "step": 1947, + "time_per_iteration": 2.6851320266723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131178, + "balance_loss_mlp": 1.09045637, + "epoch": 0.37475952289342057, + "flos": 681375421440.0, + "grad_norm": 0.05889971918419499, + "language_loss": 0.84302223, + "learning_rate": 0.0007193846717824912, + "loss": 0.854334, + "num_input_tokens_seen": 161912000, + "router_z_loss_mlp": 0.40698242, + "step": 1948, + "time_per_iteration": 2.9035325050354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129423, + "balance_loss_mlp": 1.08848619, + "epoch": 0.3749519045786841, + "flos": 460291299840.0, + "grad_norm": 0.07994215642664601, + "language_loss": 0.88549483, + "learning_rate": 0.0007191046773806669, + "loss": 0.89678907, + "num_input_tokens_seen": 161977296, + "router_z_loss_mlp": 0.40942383, + "step": 1949, + "time_per_iteration": 2.574697256088257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135159, + "balance_loss_mlp": 1.09224343, + "epoch": 0.3751442862639477, + "flos": 954853443072.0, + "grad_norm": 0.07615017139071276, + "language_loss": 0.8356899, + "learning_rate": 0.0007188245979148631, + "loss": 0.84704149, + "num_input_tokens_seen": 162051888, + "router_z_loss_mlp": 0.42919922, + "step": 1950, + "time_per_iteration": 3.216397285461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137761, + "balance_loss_mlp": 1.09475029, + "epoch": 0.3753366679492112, + "flos": 527747125248.0, + "grad_norm": 0.061651705216508604, + "language_loss": 0.87894762, + "learning_rate": 0.0007185444334938157, + "loss": 0.89032525, + "num_input_tokens_seen": 162124384, + "router_z_loss_mlp": 0.43041992, + "step": 1951, + "time_per_iteration": 2.6782584190368652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127424, + "balance_loss_mlp": 1.08879972, + "epoch": 0.3755290496344748, + "flos": 521797386240.0, + "grad_norm": 0.07782676746029546, + "language_loss": 0.84900033, + "learning_rate": 0.0007182641842262947, + "loss": 0.86027455, + "num_input_tokens_seen": 162191440, + "router_z_loss_mlp": 0.38647461, + "step": 1952, + "time_per_iteration": 2.639446258544922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125752, + "balance_loss_mlp": 1.08603168, + "epoch": 0.37572143131973834, + "flos": 621121830912.0, + "grad_norm": 0.05954692469221933, + "language_loss": 0.78027642, + "learning_rate": 0.0007179838502211022, + "loss": 0.79153389, + "num_input_tokens_seen": 162268480, + "router_z_loss_mlp": 0.3972168, + "step": 1953, + "time_per_iteration": 2.84329891204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131364, + "balance_loss_mlp": 1.09028411, + "epoch": 0.37591381300500193, + "flos": 770962530816.0, + "grad_norm": 0.10232430816689406, + "language_loss": 0.86411202, + "learning_rate": 0.0007177034315870738, + "loss": 0.8754257, + "num_input_tokens_seen": 162346752, + "router_z_loss_mlp": 0.41064453, + "step": 1954, + "time_per_iteration": 2.957648992538452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124051, + "balance_loss_mlp": 1.08325803, + "epoch": 0.37610619469026546, + "flos": 520448343552.0, + "grad_norm": 0.06271313302399782, + "language_loss": 0.91398948, + "learning_rate": 0.0007174229284330773, + "loss": 0.92523003, + "num_input_tokens_seen": 162415120, + "router_z_loss_mlp": 0.40795898, + "step": 1955, + "time_per_iteration": 2.5879859924316406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128257, + "balance_loss_mlp": 1.08879828, + "epoch": 0.37629857637552905, + "flos": 598812880896.0, + "grad_norm": 0.06607511431735706, + "language_loss": 0.86850858, + "learning_rate": 0.0007171423408680141, + "loss": 0.87979114, + "num_input_tokens_seen": 162493280, + "router_z_loss_mlp": 0.39453125, + "step": 1956, + "time_per_iteration": 2.7903566360473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123297, + "balance_loss_mlp": 1.08295655, + "epoch": 0.37649095806079264, + "flos": 564952817664.0, + "grad_norm": 0.06886679209235984, + "language_loss": 0.90041375, + "learning_rate": 0.0007168616690008176, + "loss": 0.91164672, + "num_input_tokens_seen": 162560736, + "router_z_loss_mlp": 0.40356445, + "step": 1957, + "time_per_iteration": 2.6327474117279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116463, + "balance_loss_mlp": 1.07705224, + "epoch": 0.37668333974605617, + "flos": 592470360576.0, + "grad_norm": 0.062429689069725576, + "language_loss": 0.85725892, + "learning_rate": 0.0007165809129404545, + "loss": 0.86842352, + "num_input_tokens_seen": 162630688, + "router_z_loss_mlp": 0.39404297, + "step": 1958, + "time_per_iteration": 2.7385900020599365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124898, + "balance_loss_mlp": 1.08527279, + "epoch": 0.37687572143131975, + "flos": 419478248448.0, + "grad_norm": 0.05793527093847313, + "language_loss": 0.85962278, + "learning_rate": 0.0007163000727959239, + "loss": 0.87087178, + "num_input_tokens_seen": 162694304, + "router_z_loss_mlp": 0.39624023, + "step": 1959, + "time_per_iteration": 2.485438585281372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046527, + "balance_loss_mlp": 1.0320313, + "epoch": 0.3770681031165833, + "flos": 1357262148096.0, + "grad_norm": 0.027906108498427614, + "language_loss": 0.77959073, + "learning_rate": 0.0007160191486762575, + "loss": 0.79005599, + "num_input_tokens_seen": 162920336, + "router_z_loss_mlp": 0.14453125, + "step": 1960, + "time_per_iteration": 4.834578275680542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117854, + "balance_loss_mlp": 1.07865775, + "epoch": 0.3772604848018469, + "flos": 644903534592.0, + "grad_norm": 0.05325294699236946, + "language_loss": 0.84349847, + "learning_rate": 0.00071573814069052, + "loss": 0.85467696, + "num_input_tokens_seen": 163000720, + "router_z_loss_mlp": 0.39208984, + "step": 1961, + "time_per_iteration": 2.9086802005767822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120534, + "balance_loss_mlp": 1.08219612, + "epoch": 0.3774528664871104, + "flos": 901651585536.0, + "grad_norm": 0.09498383670658105, + "language_loss": 0.88074362, + "learning_rate": 0.0007154570489478081, + "loss": 0.89194894, + "num_input_tokens_seen": 163085680, + "router_z_loss_mlp": 0.38330078, + "step": 1962, + "time_per_iteration": 3.2217841148376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117183, + "balance_loss_mlp": 1.07889283, + "epoch": 0.377645248172374, + "flos": 788065459200.0, + "grad_norm": 0.05466788938828107, + "language_loss": 0.86278516, + "learning_rate": 0.0007151758735572514, + "loss": 0.87395698, + "num_input_tokens_seen": 163162224, + "router_z_loss_mlp": 0.38232422, + "step": 1963, + "time_per_iteration": 3.01104998588562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130106, + "balance_loss_mlp": 1.08921766, + "epoch": 0.3778376298576376, + "flos": 586718111232.0, + "grad_norm": 0.06218420858169212, + "language_loss": 0.81413925, + "learning_rate": 0.0007148946146280119, + "loss": 0.82544029, + "num_input_tokens_seen": 163237920, + "router_z_loss_mlp": 0.40893555, + "step": 1964, + "time_per_iteration": 2.8039112091064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026607, + "balance_loss_mlp": 1.01440012, + "epoch": 0.3780300115429011, + "flos": 1396743782400.0, + "grad_norm": 0.022738468700431315, + "language_loss": 0.72192144, + "learning_rate": 0.000714613272269284, + "loss": 0.73218751, + "num_input_tokens_seen": 163455760, + "router_z_loss_mlp": 0.12207031, + "step": 1965, + "time_per_iteration": 4.8600172996521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024213, + "balance_loss_mlp": 1.0124352, + "epoch": 0.3782223932281647, + "flos": 1357672555008.0, + "grad_norm": 0.018349030303600054, + "language_loss": 0.75341946, + "learning_rate": 0.0007143318465902943, + "loss": 0.76366156, + "num_input_tokens_seen": 163678064, + "router_z_loss_mlp": 0.11767578, + "step": 1966, + "time_per_iteration": 4.918729782104492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135372, + "balance_loss_mlp": 1.09648633, + "epoch": 0.37841477491342823, + "flos": 704151304704.0, + "grad_norm": 0.2766921299066869, + "language_loss": 0.83812642, + "learning_rate": 0.0007140503377003022, + "loss": 0.84948009, + "num_input_tokens_seen": 163764320, + "router_z_loss_mlp": 0.38891602, + "step": 1967, + "time_per_iteration": 3.015761613845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149326, + "balance_loss_mlp": 1.10862756, + "epoch": 0.3786071565986918, + "flos": 529115991552.0, + "grad_norm": 0.07158509383086724, + "language_loss": 0.8519339, + "learning_rate": 0.000713768745708599, + "loss": 0.8634271, + "num_input_tokens_seen": 163831808, + "router_z_loss_mlp": 0.40698242, + "step": 1968, + "time_per_iteration": 2.6109209060668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140905, + "balance_loss_mlp": 1.09996843, + "epoch": 0.37879953828395535, + "flos": 993277126656.0, + "grad_norm": 0.05954158443482363, + "language_loss": 0.774553, + "learning_rate": 0.0007134870707245085, + "loss": 0.78596205, + "num_input_tokens_seen": 163918128, + "router_z_loss_mlp": 0.40893555, + "step": 1969, + "time_per_iteration": 3.2631757259368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150809, + "balance_loss_mlp": 1.11008716, + "epoch": 0.37899191996921894, + "flos": 626644283904.0, + "grad_norm": 0.05763521765218817, + "language_loss": 0.84313977, + "learning_rate": 0.0007132053128573864, + "loss": 0.85464787, + "num_input_tokens_seen": 163987552, + "router_z_loss_mlp": 0.40698242, + "step": 1970, + "time_per_iteration": 2.7791051864624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143919, + "balance_loss_mlp": 1.10353041, + "epoch": 0.37918430165448247, + "flos": 686307230208.0, + "grad_norm": 0.06905446326925666, + "language_loss": 0.84168518, + "learning_rate": 0.0007129234722166211, + "loss": 0.85312432, + "num_input_tokens_seen": 164063248, + "router_z_loss_mlp": 0.40356445, + "step": 1971, + "time_per_iteration": 2.8210554122924805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149932, + "balance_loss_mlp": 1.11152232, + "epoch": 0.37937668333974606, + "flos": 475622668800.0, + "grad_norm": 0.07023460279096982, + "language_loss": 0.91057038, + "learning_rate": 0.0007126415489116328, + "loss": 0.92206967, + "num_input_tokens_seen": 164133776, + "router_z_loss_mlp": 0.3840332, + "step": 1972, + "time_per_iteration": 2.672755002975464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153021, + "balance_loss_mlp": 1.11210799, + "epoch": 0.37956906502500964, + "flos": 707580997632.0, + "grad_norm": 0.06814261110374484, + "language_loss": 0.81719398, + "learning_rate": 0.0007123595430518736, + "loss": 0.82872415, + "num_input_tokens_seen": 164206672, + "router_z_loss_mlp": 0.40917969, + "step": 1973, + "time_per_iteration": 2.8325109481811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139649, + "balance_loss_mlp": 1.10081029, + "epoch": 0.3797614467102732, + "flos": 426648549888.0, + "grad_norm": 0.06503005991167149, + "language_loss": 0.86840981, + "learning_rate": 0.0007120774547468282, + "loss": 0.87980628, + "num_input_tokens_seen": 164271968, + "router_z_loss_mlp": 0.38793945, + "step": 1974, + "time_per_iteration": 2.6115715503692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148199, + "balance_loss_mlp": 1.10781133, + "epoch": 0.37995382839553676, + "flos": 481846620672.0, + "grad_norm": 0.05441443516000103, + "language_loss": 0.81729043, + "learning_rate": 0.0007117952841060128, + "loss": 0.82877243, + "num_input_tokens_seen": 164342800, + "router_z_loss_mlp": 0.40380859, + "step": 1975, + "time_per_iteration": 2.6378135681152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135454, + "balance_loss_mlp": 1.09389758, + "epoch": 0.3801462100808003, + "flos": 560562094080.0, + "grad_norm": 0.08133175482890537, + "language_loss": 0.83869064, + "learning_rate": 0.0007115130312389756, + "loss": 0.85004514, + "num_input_tokens_seen": 164414928, + "router_z_loss_mlp": 0.41552734, + "step": 1976, + "time_per_iteration": 2.664318084716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139177, + "balance_loss_mlp": 1.0974772, + "epoch": 0.3803385917660639, + "flos": 464936412672.0, + "grad_norm": 0.06620518382871708, + "language_loss": 0.79781663, + "learning_rate": 0.0007112306962552973, + "loss": 0.80920839, + "num_input_tokens_seen": 164483312, + "router_z_loss_mlp": 0.41699219, + "step": 1977, + "time_per_iteration": 2.6198599338531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129586, + "balance_loss_mlp": 1.0891974, + "epoch": 0.3805309734513274, + "flos": 521871538176.0, + "grad_norm": 0.05972767263520316, + "language_loss": 0.85605282, + "learning_rate": 0.0007109482792645896, + "loss": 0.86734867, + "num_input_tokens_seen": 164555760, + "router_z_loss_mlp": 0.40356445, + "step": 1978, + "time_per_iteration": 2.728576898574829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132218, + "balance_loss_mlp": 1.09066188, + "epoch": 0.380723355136591, + "flos": 591412783104.0, + "grad_norm": 0.09572440125940551, + "language_loss": 0.84308225, + "learning_rate": 0.0007106657803764969, + "loss": 0.85440445, + "num_input_tokens_seen": 164626768, + "router_z_loss_mlp": 0.41552734, + "step": 1979, + "time_per_iteration": 2.7279720306396484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126537, + "balance_loss_mlp": 1.08340704, + "epoch": 0.38091573682185453, + "flos": 622685988864.0, + "grad_norm": 0.05862837672704736, + "language_loss": 0.82269728, + "learning_rate": 0.0007103831997006948, + "loss": 0.83396262, + "num_input_tokens_seen": 164698016, + "router_z_loss_mlp": 0.43164062, + "step": 1980, + "time_per_iteration": 2.746915817260742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127621, + "balance_loss_mlp": 1.08663654, + "epoch": 0.3811081185071181, + "flos": 569007286272.0, + "grad_norm": 0.05821983888794681, + "language_loss": 0.85798764, + "learning_rate": 0.0007101005373468908, + "loss": 0.86926389, + "num_input_tokens_seen": 164780320, + "router_z_loss_mlp": 0.40991211, + "step": 1981, + "time_per_iteration": 2.878394365310669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131348, + "balance_loss_mlp": 1.09060264, + "epoch": 0.3813005001923817, + "flos": 584837895168.0, + "grad_norm": 0.057148713710776886, + "language_loss": 0.86977971, + "learning_rate": 0.0007098177934248242, + "loss": 0.88109326, + "num_input_tokens_seen": 164854400, + "router_z_loss_mlp": 0.40771484, + "step": 1982, + "time_per_iteration": 2.7281908988952637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142672, + "balance_loss_mlp": 1.09918451, + "epoch": 0.38149288187764524, + "flos": 621591335424.0, + "grad_norm": 0.07304374640444197, + "language_loss": 0.85583997, + "learning_rate": 0.0007095349680442661, + "loss": 0.86726665, + "num_input_tokens_seen": 164932896, + "router_z_loss_mlp": 0.43505859, + "step": 1983, + "time_per_iteration": 2.831989288330078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132213, + "balance_loss_mlp": 1.09015596, + "epoch": 0.3816852635629088, + "flos": 570690012672.0, + "grad_norm": 0.059661631452858944, + "language_loss": 0.79073238, + "learning_rate": 0.0007092520613150188, + "loss": 0.80205452, + "num_input_tokens_seen": 165002896, + "router_z_loss_mlp": 0.4206543, + "step": 1984, + "time_per_iteration": 2.6566810607910156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136369, + "balance_loss_mlp": 1.09416926, + "epoch": 0.38187764524817236, + "flos": 565585307136.0, + "grad_norm": 0.0624399969319272, + "language_loss": 0.81395054, + "learning_rate": 0.0007089690733469165, + "loss": 0.82531422, + "num_input_tokens_seen": 165074704, + "router_z_loss_mlp": 0.42236328, + "step": 1985, + "time_per_iteration": 2.713041067123413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133128, + "balance_loss_mlp": 1.09023643, + "epoch": 0.38207002693343595, + "flos": 631225156608.0, + "grad_norm": 0.0833415836593691, + "language_loss": 0.83054602, + "learning_rate": 0.000708686004249825, + "loss": 0.84187728, + "num_input_tokens_seen": 165149136, + "router_z_loss_mlp": 0.42895508, + "step": 1986, + "time_per_iteration": 2.7708489894866943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135389, + "balance_loss_mlp": 1.09311724, + "epoch": 0.3822624086186995, + "flos": 548773843968.0, + "grad_norm": 0.050231849807362665, + "language_loss": 0.91983181, + "learning_rate": 0.0007084028541336413, + "loss": 0.93118572, + "num_input_tokens_seen": 165220864, + "router_z_loss_mlp": 0.42260742, + "step": 1987, + "time_per_iteration": 2.7049031257629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135282, + "balance_loss_mlp": 1.09205675, + "epoch": 0.38245479030396307, + "flos": 613870036992.0, + "grad_norm": 0.07987509930436443, + "language_loss": 0.86416399, + "learning_rate": 0.0007081196231082942, + "loss": 0.87551689, + "num_input_tokens_seen": 165301568, + "router_z_loss_mlp": 0.43212891, + "step": 1988, + "time_per_iteration": 2.769559860229492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143055, + "balance_loss_mlp": 1.09949565, + "epoch": 0.38264717198922665, + "flos": 668089824768.0, + "grad_norm": 0.09872496004335095, + "language_loss": 0.80492568, + "learning_rate": 0.0007078363112837436, + "loss": 0.81635618, + "num_input_tokens_seen": 165373152, + "router_z_loss_mlp": 0.43579102, + "step": 1989, + "time_per_iteration": 2.836904525756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144237, + "balance_loss_mlp": 1.10065365, + "epoch": 0.3828395536744902, + "flos": 454754165760.0, + "grad_norm": 0.05755280117815587, + "language_loss": 0.85391158, + "learning_rate": 0.000707552918769981, + "loss": 0.86535394, + "num_input_tokens_seen": 165439136, + "router_z_loss_mlp": 0.43579102, + "step": 1990, + "time_per_iteration": 2.552560806274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114164, + "balance_loss_mlp": 1.09846199, + "epoch": 0.3830319353597538, + "flos": 499448788992.0, + "grad_norm": 0.058237292508227935, + "language_loss": 0.83844453, + "learning_rate": 0.000707269445677029, + "loss": 0.84986091, + "num_input_tokens_seen": 165514624, + "router_z_loss_mlp": 0.43188477, + "step": 1991, + "time_per_iteration": 2.717240571975708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155192, + "balance_loss_mlp": 1.11270583, + "epoch": 0.3832243170450173, + "flos": 744121893888.0, + "grad_norm": 0.08345502818850435, + "language_loss": 0.85774487, + "learning_rate": 0.0007069858921149416, + "loss": 0.86929679, + "num_input_tokens_seen": 165594512, + "router_z_loss_mlp": 0.42480469, + "step": 1992, + "time_per_iteration": 2.937901496887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143498, + "balance_loss_mlp": 1.10120225, + "epoch": 0.3834166987302809, + "flos": 578218590720.0, + "grad_norm": 0.06679457573221616, + "language_loss": 0.86415881, + "learning_rate": 0.0007067022581938043, + "loss": 0.87559378, + "num_input_tokens_seen": 165673968, + "router_z_loss_mlp": 0.4230957, + "step": 1993, + "time_per_iteration": 2.8283159732818604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147458, + "balance_loss_mlp": 1.10614026, + "epoch": 0.3836090804155444, + "flos": 536476442112.0, + "grad_norm": 0.06079929242541683, + "language_loss": 0.83476102, + "learning_rate": 0.0007064185440237334, + "loss": 0.84623557, + "num_input_tokens_seen": 165747664, + "router_z_loss_mlp": 0.41333008, + "step": 1994, + "time_per_iteration": 2.738664150238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148789, + "balance_loss_mlp": 1.10627878, + "epoch": 0.383801462100808, + "flos": 601879154688.0, + "grad_norm": 0.05320553517563596, + "language_loss": 0.8495338, + "learning_rate": 0.0007061347497148764, + "loss": 0.8610217, + "num_input_tokens_seen": 165824624, + "router_z_loss_mlp": 0.42504883, + "step": 1995, + "time_per_iteration": 2.7379775047302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147554, + "balance_loss_mlp": 1.10444832, + "epoch": 0.38399384378607154, + "flos": 572701280256.0, + "grad_norm": 0.059351713178290334, + "language_loss": 0.86747766, + "learning_rate": 0.0007058508753774122, + "loss": 0.87895322, + "num_input_tokens_seen": 165896304, + "router_z_loss_mlp": 0.4309082, + "step": 1996, + "time_per_iteration": 2.6882424354553223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144268, + "balance_loss_mlp": 1.10242534, + "epoch": 0.38418622547133513, + "flos": 536765709312.0, + "grad_norm": 0.08780844300106258, + "language_loss": 0.87086272, + "learning_rate": 0.0007055669211215505, + "loss": 0.88230544, + "num_input_tokens_seen": 165961312, + "router_z_loss_mlp": 0.41870117, + "step": 1997, + "time_per_iteration": 2.5902607440948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136259, + "balance_loss_mlp": 1.09236586, + "epoch": 0.3843786071565987, + "flos": 572940988416.0, + "grad_norm": 0.0743501008638896, + "language_loss": 0.77852333, + "learning_rate": 0.0007052828870575322, + "loss": 0.78988594, + "num_input_tokens_seen": 166028064, + "router_z_loss_mlp": 0.43896484, + "step": 1998, + "time_per_iteration": 2.643887519836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113691, + "balance_loss_mlp": 1.09521055, + "epoch": 0.38457098884186225, + "flos": 728703889920.0, + "grad_norm": 0.05655172042288627, + "language_loss": 0.87035221, + "learning_rate": 0.0007049987732956291, + "loss": 0.88172132, + "num_input_tokens_seen": 166110272, + "router_z_loss_mlp": 0.41723633, + "step": 1999, + "time_per_iteration": 2.9655773639678955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132979, + "balance_loss_mlp": 1.09325886, + "epoch": 0.38476337052712584, + "flos": 583422041088.0, + "grad_norm": 0.061738893850828154, + "language_loss": 0.83046496, + "learning_rate": 0.0007047145799461439, + "loss": 0.84179473, + "num_input_tokens_seen": 166193088, + "router_z_loss_mlp": 0.39746094, + "step": 2000, + "time_per_iteration": 2.8509583473205566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132875, + "balance_loss_mlp": 1.0917958, + "epoch": 0.38495575221238937, + "flos": 553060680192.0, + "grad_norm": 0.06203375299954445, + "language_loss": 0.82530397, + "learning_rate": 0.00070443030711941, + "loss": 0.83663273, + "num_input_tokens_seen": 166271776, + "router_z_loss_mlp": 0.41088867, + "step": 2001, + "time_per_iteration": 2.759324312210083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134639, + "balance_loss_mlp": 1.09386945, + "epoch": 0.38514813389765296, + "flos": 654473115648.0, + "grad_norm": 0.05757301433327453, + "language_loss": 0.83082199, + "learning_rate": 0.0007041459549257924, + "loss": 0.84216839, + "num_input_tokens_seen": 166350000, + "router_z_loss_mlp": 0.40771484, + "step": 2002, + "time_per_iteration": 2.8542449474334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121155, + "balance_loss_mlp": 1.08014655, + "epoch": 0.3853405155829165, + "flos": 868100239872.0, + "grad_norm": 0.07528883527847323, + "language_loss": 0.78547823, + "learning_rate": 0.0007038615234756859, + "loss": 0.79668975, + "num_input_tokens_seen": 166434336, + "router_z_loss_mlp": 0.41015625, + "step": 2003, + "time_per_iteration": 3.211712598800659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125228, + "balance_loss_mlp": 1.08257461, + "epoch": 0.3855328972681801, + "flos": 546424123392.0, + "grad_norm": 0.05751633762771481, + "language_loss": 0.83558142, + "learning_rate": 0.000703577012879517, + "loss": 0.84683371, + "num_input_tokens_seen": 166503952, + "router_z_loss_mlp": 0.42651367, + "step": 2004, + "time_per_iteration": 2.628211498260498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130283, + "balance_loss_mlp": 1.08956099, + "epoch": 0.3857252789534436, + "flos": 534074964480.0, + "grad_norm": 0.08619617913051347, + "language_loss": 0.89379585, + "learning_rate": 0.0007032924232477423, + "loss": 0.90509868, + "num_input_tokens_seen": 166575168, + "router_z_loss_mlp": 0.40722656, + "step": 2005, + "time_per_iteration": 2.631619930267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128848, + "balance_loss_mlp": 1.08743477, + "epoch": 0.3859176606387072, + "flos": 491764566528.0, + "grad_norm": 0.06586843636176778, + "language_loss": 0.80831605, + "learning_rate": 0.0007030077546908493, + "loss": 0.81960452, + "num_input_tokens_seen": 166647552, + "router_z_loss_mlp": 0.4140625, + "step": 2006, + "time_per_iteration": 2.6160101890563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336479, + "balance_loss_mlp": 1.3253212, + "epoch": 0.3861100423239708, + "flos": 1487052214272.0, + "grad_norm": 0.11294410837330418, + "language_loss": 0.83064663, + "learning_rate": 0.0007027230073193561, + "loss": 0.84401143, + "num_input_tokens_seen": 166875088, + "router_z_loss_mlp": 0.11181641, + "step": 2007, + "time_per_iteration": 4.7873475551605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131514, + "balance_loss_mlp": 1.09014845, + "epoch": 0.3863024240092343, + "flos": 473732540928.0, + "grad_norm": 0.06382618687285554, + "language_loss": 0.79329109, + "learning_rate": 0.0007024381812438117, + "loss": 0.8046062, + "num_input_tokens_seen": 166939344, + "router_z_loss_mlp": 0.41381836, + "step": 2008, + "time_per_iteration": 2.5387141704559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152986, + "balance_loss_mlp": 1.11390948, + "epoch": 0.3864948056944979, + "flos": 716601779712.0, + "grad_norm": 0.0811673363837608, + "language_loss": 0.83681285, + "learning_rate": 0.0007021532765747951, + "loss": 0.84834278, + "num_input_tokens_seen": 167014992, + "router_z_loss_mlp": 0.390625, + "step": 2009, + "time_per_iteration": 2.9795420169830322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164171, + "balance_loss_mlp": 1.12082672, + "epoch": 0.38668718737976143, + "flos": 727631631360.0, + "grad_norm": 0.11123688030830275, + "language_loss": 0.7961666, + "learning_rate": 0.0007018682934229162, + "loss": 0.80780828, + "num_input_tokens_seen": 167092096, + "router_z_loss_mlp": 0.43334961, + "step": 2010, + "time_per_iteration": 2.9108352661132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164881, + "balance_loss_mlp": 1.1216315, + "epoch": 0.386879569065025, + "flos": 525471556608.0, + "grad_norm": 0.07913719393788664, + "language_loss": 0.83099723, + "learning_rate": 0.0007015832318988152, + "loss": 0.842646, + "num_input_tokens_seen": 167162144, + "router_z_loss_mlp": 0.43237305, + "step": 2011, + "time_per_iteration": 2.605280637741089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082789, + "balance_loss_mlp": 1.07096386, + "epoch": 0.38707195075028855, + "flos": 1527771663360.0, + "grad_norm": 0.024547203760462325, + "language_loss": 0.73890078, + "learning_rate": 0.000701298092113163, + "loss": 0.74972868, + "num_input_tokens_seen": 167391536, + "router_z_loss_mlp": 0.11816406, + "step": 2012, + "time_per_iteration": 4.955415964126587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161774, + "balance_loss_mlp": 1.12167192, + "epoch": 0.38726433243555214, + "flos": 557313011712.0, + "grad_norm": 0.062010894867637535, + "language_loss": 0.84259552, + "learning_rate": 0.0007010128741766604, + "loss": 0.85421324, + "num_input_tokens_seen": 167466000, + "router_z_loss_mlp": 0.40112305, + "step": 2013, + "time_per_iteration": 2.738905906677246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162675, + "balance_loss_mlp": 1.12080884, + "epoch": 0.38745671412081567, + "flos": 553695740928.0, + "grad_norm": 0.08443522979585812, + "language_loss": 0.84619504, + "learning_rate": 0.0007007275782000391, + "loss": 0.85782182, + "num_input_tokens_seen": 167536144, + "router_z_loss_mlp": 0.41870117, + "step": 2014, + "time_per_iteration": 2.6049582958221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178912, + "balance_loss_mlp": 1.13528132, + "epoch": 0.38764909580607926, + "flos": 458408512512.0, + "grad_norm": 0.05901822901260885, + "language_loss": 0.84836662, + "learning_rate": 0.0007004422042940605, + "loss": 0.8601557, + "num_input_tokens_seen": 167600064, + "router_z_loss_mlp": 0.43603516, + "step": 2015, + "time_per_iteration": 2.5449817180633545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174031, + "balance_loss_mlp": 1.13106763, + "epoch": 0.38784147749134285, + "flos": 522229814784.0, + "grad_norm": 0.07137462797198264, + "language_loss": 0.89881837, + "learning_rate": 0.0007001567525695169, + "loss": 0.9105587, + "num_input_tokens_seen": 167666576, + "router_z_loss_mlp": 0.42993164, + "step": 2016, + "time_per_iteration": 2.5804128646850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191346, + "balance_loss_mlp": 1.14921737, + "epoch": 0.3880338591766064, + "flos": 666036338688.0, + "grad_norm": 0.11416128839824946, + "language_loss": 0.84030014, + "learning_rate": 0.0006998712231372303, + "loss": 0.85221362, + "num_input_tokens_seen": 167753296, + "router_z_loss_mlp": 0.42138672, + "step": 2017, + "time_per_iteration": 2.9779462814331055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182085, + "balance_loss_mlp": 1.13845432, + "epoch": 0.38822624086186996, + "flos": 593962564608.0, + "grad_norm": 0.06300984009010882, + "language_loss": 0.86622429, + "learning_rate": 0.0006995856161080532, + "loss": 0.87804508, + "num_input_tokens_seen": 167834080, + "router_z_loss_mlp": 0.43652344, + "step": 2018, + "time_per_iteration": 2.8405675888061523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160301, + "balance_loss_mlp": 1.11588371, + "epoch": 0.3884186225471335, + "flos": 612540817920.0, + "grad_norm": 0.0764923139512956, + "language_loss": 0.8250891, + "learning_rate": 0.0006992999315928679, + "loss": 0.83669221, + "num_input_tokens_seen": 167912368, + "router_z_loss_mlp": 0.44433594, + "step": 2019, + "time_per_iteration": 2.7929439544677734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146323, + "balance_loss_mlp": 1.10407472, + "epoch": 0.3886110042323971, + "flos": 607038188544.0, + "grad_norm": 0.09156853050649941, + "language_loss": 0.86159158, + "learning_rate": 0.0006990141697025871, + "loss": 0.8730548, + "num_input_tokens_seen": 167991968, + "router_z_loss_mlp": 0.42236328, + "step": 2020, + "time_per_iteration": 2.7913589477539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137863, + "balance_loss_mlp": 1.12422562, + "epoch": 0.3888033859176606, + "flos": 1528067897856.0, + "grad_norm": 0.035838926183426385, + "language_loss": 0.76359642, + "learning_rate": 0.0006987283305481533, + "loss": 0.77497506, + "num_input_tokens_seen": 168212128, + "router_z_loss_mlp": 0.13671875, + "step": 2021, + "time_per_iteration": 4.727250576019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011348, + "balance_loss_mlp": 1.09398317, + "epoch": 0.3889957676029242, + "flos": 692449689600.0, + "grad_norm": 0.0717580829802053, + "language_loss": 0.82676983, + "learning_rate": 0.0006984424142405392, + "loss": 0.8381179, + "num_input_tokens_seen": 168287440, + "router_z_loss_mlp": 0.40771484, + "step": 2022, + "time_per_iteration": 2.810420513153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130527, + "balance_loss_mlp": 1.09006715, + "epoch": 0.3891881492881878, + "flos": 515187993600.0, + "grad_norm": 0.11474151925346394, + "language_loss": 0.8263585, + "learning_rate": 0.0006981564208907474, + "loss": 0.83766377, + "num_input_tokens_seen": 168354704, + "router_z_loss_mlp": 0.40454102, + "step": 2023, + "time_per_iteration": 2.604849100112915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139234, + "balance_loss_mlp": 1.09763026, + "epoch": 0.3893805309734513, + "flos": 629050904064.0, + "grad_norm": 0.05701984367640102, + "language_loss": 0.90312237, + "learning_rate": 0.0006978703506098102, + "loss": 0.91451472, + "num_input_tokens_seen": 168424272, + "router_z_loss_mlp": 0.41601562, + "step": 2024, + "time_per_iteration": 2.7345082759857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115758, + "balance_loss_mlp": 1.11683416, + "epoch": 0.3895729126587149, + "flos": 544155895296.0, + "grad_norm": 0.06830457595999238, + "language_loss": 0.87819719, + "learning_rate": 0.00069758420350879, + "loss": 0.88977301, + "num_input_tokens_seen": 168488912, + "router_z_loss_mlp": 0.40722656, + "step": 2025, + "time_per_iteration": 2.6252336502075195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160672, + "balance_loss_mlp": 1.11689889, + "epoch": 0.38976529434397844, + "flos": 618270672384.0, + "grad_norm": 0.07405760759256953, + "language_loss": 0.8637889, + "learning_rate": 0.000697297979698779, + "loss": 0.87539566, + "num_input_tokens_seen": 168563248, + "router_z_loss_mlp": 0.43774414, + "step": 2026, + "time_per_iteration": 2.709831476211548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151416, + "balance_loss_mlp": 1.11291099, + "epoch": 0.38995767602924203, + "flos": 834882577920.0, + "grad_norm": 0.06812366476721117, + "language_loss": 0.83983821, + "learning_rate": 0.0006970116792908992, + "loss": 0.85135239, + "num_input_tokens_seen": 168648272, + "router_z_loss_mlp": 0.38500977, + "step": 2027, + "time_per_iteration": 3.0651228427886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149777, + "balance_loss_mlp": 1.10976994, + "epoch": 0.39015005771450556, + "flos": 541603542528.0, + "grad_norm": 0.06881031116362346, + "language_loss": 0.82086015, + "learning_rate": 0.000696725302396302, + "loss": 0.832358, + "num_input_tokens_seen": 168721760, + "router_z_loss_mlp": 0.39990234, + "step": 2028, + "time_per_iteration": 2.6441640853881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134129, + "balance_loss_mlp": 1.09400284, + "epoch": 0.39034243939976915, + "flos": 1007509072896.0, + "grad_norm": 0.05768401763088921, + "language_loss": 0.86036873, + "learning_rate": 0.0006964388491261692, + "loss": 0.87171006, + "num_input_tokens_seen": 168803664, + "router_z_loss_mlp": 0.40136719, + "step": 2029, + "time_per_iteration": 3.3004355430603027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129182, + "balance_loss_mlp": 1.08941352, + "epoch": 0.3905348210850327, + "flos": 679025700864.0, + "grad_norm": 0.06928638271863855, + "language_loss": 0.87596297, + "learning_rate": 0.0006961523195917114, + "loss": 0.88725477, + "num_input_tokens_seen": 168879184, + "router_z_loss_mlp": 0.39770508, + "step": 2030, + "time_per_iteration": 2.8312549591064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112269, + "balance_loss_mlp": 1.08041883, + "epoch": 0.39072720277029627, + "flos": 548882500608.0, + "grad_norm": 0.06430070846126967, + "language_loss": 0.78209358, + "learning_rate": 0.0006958657139041696, + "loss": 0.79332048, + "num_input_tokens_seen": 168957808, + "router_z_loss_mlp": 0.4230957, + "step": 2031, + "time_per_iteration": 2.789843797683716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172125, + "balance_loss_mlp": 1.1593461, + "epoch": 0.39091958445555985, + "flos": 1547737860096.0, + "grad_norm": 0.04676690558545683, + "language_loss": 0.76712966, + "learning_rate": 0.0006955790321748136, + "loss": 0.77885091, + "num_input_tokens_seen": 169194416, + "router_z_loss_mlp": 0.12792969, + "step": 2032, + "time_per_iteration": 4.9584527015686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118419, + "balance_loss_mlp": 1.07781672, + "epoch": 0.3911119661408234, + "flos": 504002497536.0, + "grad_norm": 0.06222398192409584, + "language_loss": 0.78433788, + "learning_rate": 0.0006952922745149434, + "loss": 0.79552209, + "num_input_tokens_seen": 169263552, + "router_z_loss_mlp": 0.40600586, + "step": 2033, + "time_per_iteration": 2.6696994304656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125088, + "balance_loss_mlp": 1.08288765, + "epoch": 0.391304347826087, + "flos": 557238859776.0, + "grad_norm": 0.06080690179225973, + "language_loss": 0.88040847, + "learning_rate": 0.000695005441035888, + "loss": 0.89165938, + "num_input_tokens_seen": 169333696, + "router_z_loss_mlp": 0.421875, + "step": 2034, + "time_per_iteration": 2.675685167312622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126781, + "balance_loss_mlp": 1.11333418, + "epoch": 0.3914967295113505, + "flos": 1500034235904.0, + "grad_norm": 0.02489517999219278, + "language_loss": 0.73723435, + "learning_rate": 0.0006947185318490064, + "loss": 0.74850214, + "num_input_tokens_seen": 169556416, + "router_z_loss_mlp": 0.13476562, + "step": 2035, + "time_per_iteration": 4.8780670166015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114086, + "balance_loss_mlp": 1.10006714, + "epoch": 0.3916891111966141, + "flos": 707037341184.0, + "grad_norm": 0.09902005838056731, + "language_loss": 0.81387436, + "learning_rate": 0.0006944315470656863, + "loss": 0.82528299, + "num_input_tokens_seen": 169643312, + "router_z_loss_mlp": 0.40795898, + "step": 2036, + "time_per_iteration": 3.04048228263855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132051, + "balance_loss_mlp": 1.08858752, + "epoch": 0.3918814928818776, + "flos": 556349409792.0, + "grad_norm": 0.07431126960541347, + "language_loss": 0.91352618, + "learning_rate": 0.000694144486797345, + "loss": 0.92484671, + "num_input_tokens_seen": 169712560, + "router_z_loss_mlp": 0.43432617, + "step": 2037, + "time_per_iteration": 2.692013740539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110594, + "balance_loss_mlp": 1.09695601, + "epoch": 0.3920738745671412, + "flos": 1538610992640.0, + "grad_norm": 0.027663679576331687, + "language_loss": 0.79520434, + "learning_rate": 0.0006938573511554296, + "loss": 0.8063103, + "num_input_tokens_seen": 169914912, + "router_z_loss_mlp": 0.13671875, + "step": 2038, + "time_per_iteration": 4.626150369644165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128452, + "balance_loss_mlp": 1.08796859, + "epoch": 0.39226625625240474, + "flos": 498836123136.0, + "grad_norm": 0.06987974305662424, + "language_loss": 0.90060711, + "learning_rate": 0.0006935701402514156, + "loss": 0.91189158, + "num_input_tokens_seen": 169978848, + "router_z_loss_mlp": 0.40454102, + "step": 2039, + "time_per_iteration": 2.5738487243652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099838, + "balance_loss_mlp": 1.0864867, + "epoch": 0.39245863793766833, + "flos": 1347260138496.0, + "grad_norm": 0.03469500580229188, + "language_loss": 0.73034894, + "learning_rate": 0.0006932828541968083, + "loss": 0.74134731, + "num_input_tokens_seen": 170211488, + "router_z_loss_mlp": 0.13378906, + "step": 2040, + "time_per_iteration": 4.957871437072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140825, + "balance_loss_mlp": 1.10112846, + "epoch": 0.3926510196229319, + "flos": 1346113022976.0, + "grad_norm": 0.08036310752647091, + "language_loss": 0.84965599, + "learning_rate": 0.0006929954931031422, + "loss": 0.86106431, + "num_input_tokens_seen": 170298528, + "router_z_loss_mlp": 0.39672852, + "step": 2041, + "time_per_iteration": 4.232867956161499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129686, + "balance_loss_mlp": 1.09039509, + "epoch": 0.39284340130819545, + "flos": 499587181056.0, + "grad_norm": 0.05705738410966496, + "language_loss": 0.8864727, + "learning_rate": 0.0006927080570819805, + "loss": 0.89776957, + "num_input_tokens_seen": 170365680, + "router_z_loss_mlp": 0.39282227, + "step": 2042, + "time_per_iteration": 2.6111459732055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143323, + "balance_loss_mlp": 1.10252953, + "epoch": 0.39303578299345904, + "flos": 520329775104.0, + "grad_norm": 0.08862983476083965, + "language_loss": 0.81371272, + "learning_rate": 0.0006924205462449161, + "loss": 0.82514596, + "num_input_tokens_seen": 170432224, + "router_z_loss_mlp": 0.40795898, + "step": 2043, + "time_per_iteration": 2.6160669326782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128783, + "balance_loss_mlp": 1.08932424, + "epoch": 0.39322816467872257, + "flos": 907929865728.0, + "grad_norm": 0.06601435567751561, + "language_loss": 0.82073617, + "learning_rate": 0.0006921329607035702, + "loss": 0.83202398, + "num_input_tokens_seen": 170517920, + "router_z_loss_mlp": 0.39453125, + "step": 2044, + "time_per_iteration": 3.2338860034942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121467, + "balance_loss_mlp": 1.08441699, + "epoch": 0.39342054636398616, + "flos": 517592042496.0, + "grad_norm": 0.06846789620147704, + "language_loss": 0.88441163, + "learning_rate": 0.0006918453005695938, + "loss": 0.89562631, + "num_input_tokens_seen": 170589072, + "router_z_loss_mlp": 0.37011719, + "step": 2045, + "time_per_iteration": 2.6499555110931396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135104, + "balance_loss_mlp": 1.09426332, + "epoch": 0.3936129280492497, + "flos": 547918898688.0, + "grad_norm": 0.05142411082006327, + "language_loss": 0.84655213, + "learning_rate": 0.0006915575659546662, + "loss": 0.85790318, + "num_input_tokens_seen": 170657856, + "router_z_loss_mlp": 0.40869141, + "step": 2046, + "time_per_iteration": 2.652902364730835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133716, + "balance_loss_mlp": 1.09339929, + "epoch": 0.3938053097345133, + "flos": 526113957888.0, + "grad_norm": 0.08744808643608758, + "language_loss": 0.80837369, + "learning_rate": 0.0006912697569704959, + "loss": 0.81971085, + "num_input_tokens_seen": 170723696, + "router_z_loss_mlp": 0.40307617, + "step": 2047, + "time_per_iteration": 2.6129064559936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131331, + "balance_loss_mlp": 1.09158659, + "epoch": 0.39399769141977686, + "flos": 471629869056.0, + "grad_norm": 0.07468037026935817, + "language_loss": 0.86945641, + "learning_rate": 0.0006909818737288205, + "loss": 0.88076973, + "num_input_tokens_seen": 170789536, + "router_z_loss_mlp": 0.3972168, + "step": 2048, + "time_per_iteration": 2.5576181411743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146235, + "balance_loss_mlp": 1.10632348, + "epoch": 0.3941900731050404, + "flos": 501736840704.0, + "grad_norm": 0.07110132916922086, + "language_loss": 0.81226838, + "learning_rate": 0.000690693916341406, + "loss": 0.82373071, + "num_input_tokens_seen": 170859232, + "router_z_loss_mlp": 0.39916992, + "step": 2049, + "time_per_iteration": 2.5884814262390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154156, + "balance_loss_mlp": 1.11398268, + "epoch": 0.394382454790304, + "flos": 580862347776.0, + "grad_norm": 0.05472880535545416, + "language_loss": 0.82429487, + "learning_rate": 0.0006904058849200475, + "loss": 0.83583641, + "num_input_tokens_seen": 170931568, + "router_z_loss_mlp": 0.40185547, + "step": 2050, + "time_per_iteration": 2.7662599086761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144327, + "balance_loss_mlp": 1.10565519, + "epoch": 0.3945748364755675, + "flos": 513819127296.0, + "grad_norm": 0.06127353443593348, + "language_loss": 0.85204089, + "learning_rate": 0.0006901177795765683, + "loss": 0.86348414, + "num_input_tokens_seen": 170999856, + "router_z_loss_mlp": 0.38647461, + "step": 2051, + "time_per_iteration": 2.577353000640869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011475, + "balance_loss_mlp": 1.10768366, + "epoch": 0.3947672181608311, + "flos": 593957795328.0, + "grad_norm": 0.10882102145067868, + "language_loss": 0.81508064, + "learning_rate": 0.0006898296004228213, + "loss": 0.82655561, + "num_input_tokens_seen": 171072320, + "router_z_loss_mlp": 0.39819336, + "step": 2052, + "time_per_iteration": 2.7242588996887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118361, + "balance_loss_mlp": 1.10605848, + "epoch": 0.39495959984609463, + "flos": 1547671048704.0, + "grad_norm": 0.03880030883121314, + "language_loss": 0.7812674, + "learning_rate": 0.0006895413475706873, + "loss": 0.79245102, + "num_input_tokens_seen": 171304128, + "router_z_loss_mlp": 0.12304688, + "step": 2053, + "time_per_iteration": 4.852335691452026 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160033, + "balance_loss_mlp": 1.1204555, + "epoch": 0.3951519815313582, + "flos": 496520907264.0, + "grad_norm": 0.06533456514809383, + "language_loss": 0.79943091, + "learning_rate": 0.0006892530211320763, + "loss": 0.81103128, + "num_input_tokens_seen": 171377392, + "router_z_loss_mlp": 0.39575195, + "step": 2054, + "time_per_iteration": 2.726592779159546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163981, + "balance_loss_mlp": 1.12528563, + "epoch": 0.39534436321662175, + "flos": 531191499264.0, + "grad_norm": 0.06955061494726521, + "language_loss": 0.8399905, + "learning_rate": 0.000688964621218926, + "loss": 0.85163033, + "num_input_tokens_seen": 171447424, + "router_z_loss_mlp": 0.38696289, + "step": 2055, + "time_per_iteration": 2.6089248657226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156737, + "balance_loss_mlp": 1.11737382, + "epoch": 0.39553674490188534, + "flos": 702523279872.0, + "grad_norm": 0.06754212988294535, + "language_loss": 0.80637926, + "learning_rate": 0.0006886761479432037, + "loss": 0.81794661, + "num_input_tokens_seen": 171519920, + "router_z_loss_mlp": 0.39379883, + "step": 2056, + "time_per_iteration": 2.8334691524505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169364, + "balance_loss_mlp": 1.12866604, + "epoch": 0.3957291265871489, + "flos": 409772846592.0, + "grad_norm": 0.08783588969410645, + "language_loss": 0.85058302, + "learning_rate": 0.0006883876014169045, + "loss": 0.86227667, + "num_input_tokens_seen": 171583856, + "router_z_loss_mlp": 0.40698242, + "step": 2057, + "time_per_iteration": 2.4859981536865234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163618, + "balance_loss_mlp": 1.12344468, + "epoch": 0.39592150827241246, + "flos": 618490556928.0, + "grad_norm": 0.07066278036752763, + "language_loss": 0.90527105, + "learning_rate": 0.000688098981752052, + "loss": 0.91690719, + "num_input_tokens_seen": 171656064, + "router_z_loss_mlp": 0.40161133, + "step": 2058, + "time_per_iteration": 2.737825393676758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169191, + "balance_loss_mlp": 1.12849319, + "epoch": 0.39611388995767605, + "flos": 821332680192.0, + "grad_norm": 0.08574741875980238, + "language_loss": 0.80283022, + "learning_rate": 0.0006878102890606982, + "loss": 0.81452215, + "num_input_tokens_seen": 171738800, + "router_z_loss_mlp": 0.40722656, + "step": 2059, + "time_per_iteration": 3.0589451789855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159966, + "balance_loss_mlp": 1.12034082, + "epoch": 0.3963062716429396, + "flos": 492224159232.0, + "grad_norm": 0.07158976818793618, + "language_loss": 0.81510139, + "learning_rate": 0.0006875215234549239, + "loss": 0.8267011, + "num_input_tokens_seen": 171803664, + "router_z_loss_mlp": 0.39648438, + "step": 2060, + "time_per_iteration": 2.5404529571533203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151611, + "balance_loss_mlp": 1.11150885, + "epoch": 0.39649865332820317, + "flos": 584739150336.0, + "grad_norm": 0.11168111879418678, + "language_loss": 0.86092877, + "learning_rate": 0.0006872326850468376, + "loss": 0.87244487, + "num_input_tokens_seen": 171871968, + "router_z_loss_mlp": 0.40087891, + "step": 2061, + "time_per_iteration": 2.6653215885162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153197, + "balance_loss_mlp": 1.11133087, + "epoch": 0.3966910350134667, + "flos": 458564156928.0, + "grad_norm": 0.0731410886524803, + "language_loss": 0.79433036, + "learning_rate": 0.0006869437739485762, + "loss": 0.80586231, + "num_input_tokens_seen": 171942368, + "router_z_loss_mlp": 0.41870117, + "step": 2062, + "time_per_iteration": 2.6032299995422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147299, + "balance_loss_mlp": 1.1086272, + "epoch": 0.3968834166987303, + "flos": 508632929280.0, + "grad_norm": 0.06685158443863869, + "language_loss": 0.9296748, + "learning_rate": 0.0006866547902723053, + "loss": 0.9411478, + "num_input_tokens_seen": 172012336, + "router_z_loss_mlp": 0.38647461, + "step": 2063, + "time_per_iteration": 2.676166534423828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150184, + "balance_loss_mlp": 1.11148858, + "epoch": 0.3970757983839938, + "flos": 572627128320.0, + "grad_norm": 0.10136223850880095, + "language_loss": 0.80330342, + "learning_rate": 0.000686365734130218, + "loss": 0.81480527, + "num_input_tokens_seen": 172084640, + "router_z_loss_mlp": 0.38696289, + "step": 2064, + "time_per_iteration": 2.6844232082366943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143564, + "balance_loss_mlp": 1.10420108, + "epoch": 0.3972681800692574, + "flos": 481629307392.0, + "grad_norm": 0.06083764513088428, + "language_loss": 0.84282482, + "learning_rate": 0.000686076605634536, + "loss": 0.85426044, + "num_input_tokens_seen": 172152992, + "router_z_loss_mlp": 0.39379883, + "step": 2065, + "time_per_iteration": 2.6315250396728516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156007, + "balance_loss_mlp": 1.11704922, + "epoch": 0.397460561754521, + "flos": 487927411200.0, + "grad_norm": 0.07154960647229537, + "language_loss": 0.84777498, + "learning_rate": 0.0006857874048975088, + "loss": 0.85933506, + "num_input_tokens_seen": 172219312, + "router_z_loss_mlp": 0.38964844, + "step": 2066, + "time_per_iteration": 2.651740074157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144331, + "balance_loss_mlp": 1.10298944, + "epoch": 0.3976529434397845, + "flos": 421993525248.0, + "grad_norm": 0.06215318135177391, + "language_loss": 0.87357152, + "learning_rate": 0.0006854981320314142, + "loss": 0.88501477, + "num_input_tokens_seen": 172282112, + "router_z_loss_mlp": 0.41381836, + "step": 2067, + "time_per_iteration": 2.5062263011932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150736, + "balance_loss_mlp": 1.11089611, + "epoch": 0.3978453251250481, + "flos": 545589001728.0, + "grad_norm": 0.07144157906743025, + "language_loss": 0.87282014, + "learning_rate": 0.0006852087871485579, + "loss": 0.88432747, + "num_input_tokens_seen": 172347872, + "router_z_loss_mlp": 0.3984375, + "step": 2068, + "time_per_iteration": 2.6593010425567627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141379, + "balance_loss_mlp": 1.10206354, + "epoch": 0.39803770681031164, + "flos": 650838592512.0, + "grad_norm": 0.08492249089395289, + "language_loss": 0.82224536, + "learning_rate": 0.0006849193703612735, + "loss": 0.83365911, + "num_input_tokens_seen": 172418560, + "router_z_loss_mlp": 0.39282227, + "step": 2069, + "time_per_iteration": 2.755782127380371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137654, + "balance_loss_mlp": 1.09817159, + "epoch": 0.39823008849557523, + "flos": 740072194560.0, + "grad_norm": 0.07327967142242812, + "language_loss": 0.78054988, + "learning_rate": 0.0006846298817819225, + "loss": 0.79192644, + "num_input_tokens_seen": 172497984, + "router_z_loss_mlp": 0.39477539, + "step": 2070, + "time_per_iteration": 2.987943410873413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148271, + "balance_loss_mlp": 1.10909855, + "epoch": 0.39842247018083876, + "flos": 385037452800.0, + "grad_norm": 0.08050617332568782, + "language_loss": 0.81162381, + "learning_rate": 0.0006843403215228945, + "loss": 0.82310653, + "num_input_tokens_seen": 172560112, + "router_z_loss_mlp": 0.3918457, + "step": 2071, + "time_per_iteration": 2.4827940464019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165055, + "balance_loss_mlp": 1.12585878, + "epoch": 0.39861485186610235, + "flos": 533696864256.0, + "grad_norm": 0.07083437878036915, + "language_loss": 0.80721962, + "learning_rate": 0.0006840506896966065, + "loss": 0.81887019, + "num_input_tokens_seen": 172636192, + "router_z_loss_mlp": 0.3918457, + "step": 2072, + "time_per_iteration": 2.6827309131622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166963, + "balance_loss_mlp": 1.12621748, + "epoch": 0.39880723355136594, + "flos": 643149227520.0, + "grad_norm": 0.06725102297232902, + "language_loss": 0.8278873, + "learning_rate": 0.0006837609864155038, + "loss": 0.83955693, + "num_input_tokens_seen": 172715264, + "router_z_loss_mlp": 0.40771484, + "step": 2073, + "time_per_iteration": 2.9130313396453857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116361, + "balance_loss_mlp": 1.12584436, + "epoch": 0.39899961523662947, + "flos": 515847647232.0, + "grad_norm": 0.07471059517929624, + "language_loss": 0.8375988, + "learning_rate": 0.0006834712117920592, + "loss": 0.84923482, + "num_input_tokens_seen": 172783456, + "router_z_loss_mlp": 0.37768555, + "step": 2074, + "time_per_iteration": 2.61501145362854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162616, + "balance_loss_mlp": 1.12325335, + "epoch": 0.39919199692189306, + "flos": 464385415680.0, + "grad_norm": 0.13245970923224126, + "language_loss": 0.85901093, + "learning_rate": 0.0006831813659387729, + "loss": 0.87063706, + "num_input_tokens_seen": 172848928, + "router_z_loss_mlp": 0.39331055, + "step": 2075, + "time_per_iteration": 2.563549041748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149354, + "balance_loss_mlp": 1.11075377, + "epoch": 0.3993843786071566, + "flos": 531641180160.0, + "grad_norm": 0.06732512968880089, + "language_loss": 0.84738618, + "learning_rate": 0.0006828914489681733, + "loss": 0.85887969, + "num_input_tokens_seen": 172921152, + "router_z_loss_mlp": 0.38574219, + "step": 2076, + "time_per_iteration": 2.7011008262634277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142979, + "balance_loss_mlp": 1.10440326, + "epoch": 0.3995767602924202, + "flos": 503965421568.0, + "grad_norm": 0.050728888200014394, + "language_loss": 0.85780215, + "learning_rate": 0.0006826014609928162, + "loss": 0.86923194, + "num_input_tokens_seen": 172998864, + "router_z_loss_mlp": 0.38598633, + "step": 2077, + "time_per_iteration": 2.699880838394165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026884, + "balance_loss_mlp": 1.01472485, + "epoch": 0.3997691419776837, + "flos": 1454516600832.0, + "grad_norm": 0.012471286598558728, + "language_loss": 0.83199388, + "learning_rate": 0.0006823114021252846, + "loss": 0.84226274, + "num_input_tokens_seen": 173219216, + "router_z_loss_mlp": 0.12158203, + "step": 2078, + "time_per_iteration": 4.819272518157959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112436, + "balance_loss_mlp": 1.08549809, + "epoch": 0.3999615236629473, + "flos": 530684918784.0, + "grad_norm": 0.08765386089658693, + "language_loss": 0.80571902, + "learning_rate": 0.0006820212724781896, + "loss": 0.81696254, + "num_input_tokens_seen": 173292000, + "router_z_loss_mlp": 0.38842773, + "step": 2079, + "time_per_iteration": 2.6927945613861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112693, + "balance_loss_mlp": 1.07526088, + "epoch": 0.4001539053482108, + "flos": 695130522624.0, + "grad_norm": 0.06830833334646268, + "language_loss": 0.84229112, + "learning_rate": 0.0006817310721641694, + "loss": 0.85341799, + "num_input_tokens_seen": 173365568, + "router_z_loss_mlp": 0.37402344, + "step": 2080, + "time_per_iteration": 2.8158507347106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114446, + "balance_loss_mlp": 1.07422495, + "epoch": 0.4003462870334744, + "flos": 520356939264.0, + "grad_norm": 0.0821477508940244, + "language_loss": 0.84532309, + "learning_rate": 0.00068144080129589, + "loss": 0.85646749, + "num_input_tokens_seen": 173430144, + "router_z_loss_mlp": 0.40234375, + "step": 2081, + "time_per_iteration": 2.665823221206665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111145, + "balance_loss_mlp": 1.07206321, + "epoch": 0.400538668718738, + "flos": 492518195712.0, + "grad_norm": 0.06681211266265834, + "language_loss": 0.83178174, + "learning_rate": 0.0006811504599860441, + "loss": 0.84289622, + "num_input_tokens_seen": 173494464, + "router_z_loss_mlp": 0.39379883, + "step": 2082, + "time_per_iteration": 2.517651081085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112401, + "balance_loss_mlp": 1.07382464, + "epoch": 0.40073105040400153, + "flos": 490356052992.0, + "grad_norm": 0.04646658923847655, + "language_loss": 0.86172366, + "learning_rate": 0.0006808600483473526, + "loss": 0.87284768, + "num_input_tokens_seen": 173577168, + "router_z_loss_mlp": 0.38549805, + "step": 2083, + "time_per_iteration": 2.85060715675354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106871, + "balance_loss_mlp": 1.06743646, + "epoch": 0.4009234320892651, + "flos": 562378070016.0, + "grad_norm": 0.05030907040360332, + "language_loss": 0.86459124, + "learning_rate": 0.0006805695664925629, + "loss": 0.87565994, + "num_input_tokens_seen": 173655632, + "router_z_loss_mlp": 0.39379883, + "step": 2084, + "time_per_iteration": 2.775911808013916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117346, + "balance_loss_mlp": 1.07810271, + "epoch": 0.40111581377452865, + "flos": 425998808064.0, + "grad_norm": 0.06453737345570608, + "language_loss": 0.84040797, + "learning_rate": 0.0006802790145344506, + "loss": 0.85158145, + "num_input_tokens_seen": 173719040, + "router_z_loss_mlp": 0.39233398, + "step": 2085, + "time_per_iteration": 2.4470229148864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112227, + "balance_loss_mlp": 1.08459997, + "epoch": 0.40130819545979224, + "flos": 612446842368.0, + "grad_norm": 0.07025741726477988, + "language_loss": 0.87659204, + "learning_rate": 0.0006799883925858176, + "loss": 0.8878147, + "num_input_tokens_seen": 173796704, + "router_z_loss_mlp": 0.37646484, + "step": 2086, + "time_per_iteration": 2.861490249633789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136148, + "balance_loss_mlp": 1.09709549, + "epoch": 0.40150057714505577, + "flos": 523433124864.0, + "grad_norm": 0.06341077230687828, + "language_loss": 0.85575259, + "learning_rate": 0.0006796977007594933, + "loss": 0.86711407, + "num_input_tokens_seen": 173862352, + "router_z_loss_mlp": 0.39038086, + "step": 2087, + "time_per_iteration": 2.619633197784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150049, + "balance_loss_mlp": 1.10920811, + "epoch": 0.40169295883031936, + "flos": 561424379904.0, + "grad_norm": 0.0625455511079972, + "language_loss": 0.86956239, + "learning_rate": 0.0006794069391683345, + "loss": 0.88106287, + "num_input_tokens_seen": 173935408, + "router_z_loss_mlp": 0.40844727, + "step": 2088, + "time_per_iteration": 4.210111618041992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145795, + "balance_loss_mlp": 1.10683715, + "epoch": 0.4018853405155829, + "flos": 518997984768.0, + "grad_norm": 0.0705312667092641, + "language_loss": 0.81334388, + "learning_rate": 0.0006791161079252248, + "loss": 0.8248018, + "num_input_tokens_seen": 174007152, + "router_z_loss_mlp": 0.38916016, + "step": 2089, + "time_per_iteration": 2.614766836166382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141064, + "balance_loss_mlp": 1.10286903, + "epoch": 0.4020777222008465, + "flos": 526222614528.0, + "grad_norm": 0.084499094041807, + "language_loss": 0.82758236, + "learning_rate": 0.0006788252071430747, + "loss": 0.83899295, + "num_input_tokens_seen": 174074976, + "router_z_loss_mlp": 0.38183594, + "step": 2090, + "time_per_iteration": 2.617656707763672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135863, + "balance_loss_mlp": 1.09490228, + "epoch": 0.40227010388611006, + "flos": 525763021824.0, + "grad_norm": 0.0700927477934208, + "language_loss": 0.8703053, + "learning_rate": 0.0006785342369348222, + "loss": 0.88166392, + "num_input_tokens_seen": 174149392, + "router_z_loss_mlp": 0.40942383, + "step": 2091, + "time_per_iteration": 2.7607271671295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122396, + "balance_loss_mlp": 1.08513117, + "epoch": 0.4024624855713736, + "flos": 432304252416.0, + "grad_norm": 0.09140990562062702, + "language_loss": 0.8009733, + "learning_rate": 0.0006782431974134316, + "loss": 0.81219733, + "num_input_tokens_seen": 174214656, + "router_z_loss_mlp": 0.37280273, + "step": 2092, + "time_per_iteration": 2.5610032081604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118064, + "balance_loss_mlp": 1.07889199, + "epoch": 0.4026548672566372, + "flos": 766660640256.0, + "grad_norm": 0.054626907115785994, + "language_loss": 0.89608824, + "learning_rate": 0.0006779520886918949, + "loss": 0.90726894, + "num_input_tokens_seen": 174296064, + "router_z_loss_mlp": 0.3918457, + "step": 2093, + "time_per_iteration": 3.064581871032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110103, + "balance_loss_mlp": 1.07279015, + "epoch": 0.4028472489419007, + "flos": 642931914240.0, + "grad_norm": 0.057101365791561574, + "language_loss": 0.81741238, + "learning_rate": 0.0006776609108832301, + "loss": 0.82851338, + "num_input_tokens_seen": 174370896, + "router_z_loss_mlp": 0.37304688, + "step": 2094, + "time_per_iteration": 2.77875018119812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100298, + "balance_loss_mlp": 1.06403446, + "epoch": 0.4030396306271643, + "flos": 491838718464.0, + "grad_norm": 0.06401566733015203, + "language_loss": 0.85612595, + "learning_rate": 0.0006773696641004828, + "loss": 0.86712897, + "num_input_tokens_seen": 174438448, + "router_z_loss_mlp": 0.36254883, + "step": 2095, + "time_per_iteration": 2.5543506145477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101796, + "balance_loss_mlp": 1.06522298, + "epoch": 0.40323201231242783, + "flos": 901728308736.0, + "grad_norm": 0.06439261414673134, + "language_loss": 0.77821416, + "learning_rate": 0.0006770783484567247, + "loss": 0.78923213, + "num_input_tokens_seen": 174525952, + "router_z_loss_mlp": 0.36572266, + "step": 2096, + "time_per_iteration": 3.14194393157959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114941, + "balance_loss_mlp": 1.07862973, + "epoch": 0.4034243939976914, + "flos": 570558961152.0, + "grad_norm": 0.051673087984505275, + "language_loss": 0.86408114, + "learning_rate": 0.000676786964065055, + "loss": 0.87523055, + "num_input_tokens_seen": 174607200, + "router_z_loss_mlp": 0.36328125, + "step": 2097, + "time_per_iteration": 2.796668529510498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109226, + "balance_loss_mlp": 1.07270014, + "epoch": 0.403616775682955, + "flos": 507456783360.0, + "grad_norm": 0.07558073774647381, + "language_loss": 0.79608446, + "learning_rate": 0.0006764955110385986, + "loss": 0.80717671, + "num_input_tokens_seen": 174680976, + "router_z_loss_mlp": 0.36547852, + "step": 2098, + "time_per_iteration": 2.721588134765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117865, + "balance_loss_mlp": 1.07998002, + "epoch": 0.40380915736821854, + "flos": 519383425536.0, + "grad_norm": 0.06754969850087679, + "language_loss": 0.80409288, + "learning_rate": 0.0006762039894905083, + "loss": 0.8152715, + "num_input_tokens_seen": 174753152, + "router_z_loss_mlp": 0.37890625, + "step": 2099, + "time_per_iteration": 2.6286327838897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126862, + "balance_loss_mlp": 1.08728456, + "epoch": 0.40400153905348213, + "flos": 441925590528.0, + "grad_norm": 0.06639046911061866, + "language_loss": 0.80760598, + "learning_rate": 0.000675912399533962, + "loss": 0.8188746, + "num_input_tokens_seen": 174817184, + "router_z_loss_mlp": 0.39599609, + "step": 2100, + "time_per_iteration": 2.5150249004364014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110396, + "balance_loss_mlp": 1.07420361, + "epoch": 0.40419392073874566, + "flos": 772309002240.0, + "grad_norm": 0.05652757132031041, + "language_loss": 0.85431337, + "learning_rate": 0.0006756207412821656, + "loss": 0.86541736, + "num_input_tokens_seen": 174898128, + "router_z_loss_mlp": 0.36206055, + "step": 2101, + "time_per_iteration": 2.9816384315490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107271, + "balance_loss_mlp": 1.06962454, + "epoch": 0.40438630242400925, + "flos": 766569235968.0, + "grad_norm": 0.08079981189537652, + "language_loss": 0.81269771, + "learning_rate": 0.0006753290148483505, + "loss": 0.8237704, + "num_input_tokens_seen": 174981872, + "router_z_loss_mlp": 0.37670898, + "step": 2102, + "time_per_iteration": 3.0291824340820312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111488, + "balance_loss_mlp": 1.07458103, + "epoch": 0.4045786841092728, + "flos": 415235828736.0, + "grad_norm": 0.07115498960503684, + "language_loss": 0.79040611, + "learning_rate": 0.0006750372203457752, + "loss": 0.80152106, + "num_input_tokens_seen": 175044976, + "router_z_loss_mlp": 0.36914062, + "step": 2103, + "time_per_iteration": 2.5193490982055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111376, + "balance_loss_mlp": 1.07458746, + "epoch": 0.40477106579453637, + "flos": 539214174720.0, + "grad_norm": 0.049732783973711246, + "language_loss": 0.87039417, + "learning_rate": 0.0006747453578877242, + "loss": 0.88150793, + "num_input_tokens_seen": 175121104, + "router_z_loss_mlp": 0.36767578, + "step": 2104, + "time_per_iteration": 2.691030979156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116651, + "balance_loss_mlp": 1.07998228, + "epoch": 0.4049634474797999, + "flos": 826704258048.0, + "grad_norm": 0.06592833756650988, + "language_loss": 0.83420014, + "learning_rate": 0.0006744534275875085, + "loss": 0.8453666, + "num_input_tokens_seen": 175194512, + "router_z_loss_mlp": 0.36669922, + "step": 2105, + "time_per_iteration": 2.9842946529388428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118356, + "balance_loss_mlp": 1.08099532, + "epoch": 0.4051558291650635, + "flos": 572684027904.0, + "grad_norm": 0.07270624559080442, + "language_loss": 0.85434729, + "learning_rate": 0.0006741614295584657, + "loss": 0.86553085, + "num_input_tokens_seen": 175264176, + "router_z_loss_mlp": 0.3737793, + "step": 2106, + "time_per_iteration": 2.63811993598938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117316, + "balance_loss_mlp": 1.08057594, + "epoch": 0.4053482108503271, + "flos": 731881391616.0, + "grad_norm": 0.05922552771988275, + "language_loss": 0.78890157, + "learning_rate": 0.0006738693639139595, + "loss": 0.80007476, + "num_input_tokens_seen": 175347488, + "router_z_loss_mlp": 0.36743164, + "step": 2107, + "time_per_iteration": 2.9618351459503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116371, + "balance_loss_mlp": 1.07746077, + "epoch": 0.4055405925355906, + "flos": 1213059193344.0, + "grad_norm": 0.06915522511623486, + "language_loss": 0.77808583, + "learning_rate": 0.0006735772307673796, + "loss": 0.78924954, + "num_input_tokens_seen": 175438336, + "router_z_loss_mlp": 0.38916016, + "step": 2108, + "time_per_iteration": 3.575981855392456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111094, + "balance_loss_mlp": 1.07380557, + "epoch": 0.4057329742208542, + "flos": 715863204864.0, + "grad_norm": 0.06309901973905298, + "language_loss": 0.83742046, + "learning_rate": 0.0006732850302321421, + "loss": 0.84853137, + "num_input_tokens_seen": 175510912, + "router_z_loss_mlp": 0.37280273, + "step": 2109, + "time_per_iteration": 3.045565605163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114415, + "balance_loss_mlp": 1.0778178, + "epoch": 0.4059253559061177, + "flos": 564888577536.0, + "grad_norm": 0.060704196703692835, + "language_loss": 0.84782875, + "learning_rate": 0.00067299276242169, + "loss": 0.85897285, + "num_input_tokens_seen": 175583040, + "router_z_loss_mlp": 0.3659668, + "step": 2110, + "time_per_iteration": 2.6868693828582764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047442, + "balance_loss_mlp": 1.03666544, + "epoch": 0.4061177375913813, + "flos": 1593744450048.0, + "grad_norm": 0.029253972882140933, + "language_loss": 0.74382168, + "learning_rate": 0.0006727004274494908, + "loss": 0.75429612, + "num_input_tokens_seen": 175817952, + "router_z_loss_mlp": 0.10791016, + "step": 2111, + "time_per_iteration": 4.918604612350464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110434, + "balance_loss_mlp": 1.07281184, + "epoch": 0.40631011927664484, + "flos": 615421711872.0, + "grad_norm": 0.06207465310904933, + "language_loss": 0.78018594, + "learning_rate": 0.0006724080254290395, + "loss": 0.79129028, + "num_input_tokens_seen": 175896352, + "router_z_loss_mlp": 0.37597656, + "step": 2112, + "time_per_iteration": 2.798377752304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116483, + "balance_loss_mlp": 1.08012438, + "epoch": 0.40650250096190843, + "flos": 557661376512.0, + "grad_norm": 0.07195778929743761, + "language_loss": 0.89838338, + "learning_rate": 0.0006721155564738566, + "loss": 0.90954828, + "num_input_tokens_seen": 175967152, + "router_z_loss_mlp": 0.36401367, + "step": 2113, + "time_per_iteration": 2.721280813217163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034826, + "balance_loss_mlp": 1.02395451, + "epoch": 0.40669488264717196, + "flos": 1580147564544.0, + "grad_norm": 0.019551827625956694, + "language_loss": 0.78622639, + "learning_rate": 0.0006718230206974884, + "loss": 0.79657471, + "num_input_tokens_seen": 176205248, + "router_z_loss_mlp": 0.10888672, + "step": 2114, + "time_per_iteration": 4.956322193145752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110507, + "balance_loss_mlp": 1.07269359, + "epoch": 0.40688726433243555, + "flos": 507649503744.0, + "grad_norm": 0.052092004512661015, + "language_loss": 0.85970294, + "learning_rate": 0.0006715304182135078, + "loss": 0.87080801, + "num_input_tokens_seen": 176276208, + "router_z_loss_mlp": 0.37792969, + "step": 2115, + "time_per_iteration": 2.611116647720337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114836, + "balance_loss_mlp": 1.07611692, + "epoch": 0.40707964601769914, + "flos": 589075172352.0, + "grad_norm": 0.051206353593090614, + "language_loss": 0.89130676, + "learning_rate": 0.0006712377491355127, + "loss": 0.90245515, + "num_input_tokens_seen": 176355072, + "router_z_loss_mlp": 0.38696289, + "step": 2116, + "time_per_iteration": 2.8788397312164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120118, + "balance_loss_mlp": 1.0829246, + "epoch": 0.40727202770296267, + "flos": 580437259776.0, + "grad_norm": 0.049235441975469474, + "language_loss": 0.81475073, + "learning_rate": 0.0006709450135771274, + "loss": 0.82595193, + "num_input_tokens_seen": 176444592, + "router_z_loss_mlp": 0.37182617, + "step": 2117, + "time_per_iteration": 2.944436550140381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118262, + "balance_loss_mlp": 1.08233273, + "epoch": 0.40746440938822626, + "flos": 504076649472.0, + "grad_norm": 0.05682697017745506, + "language_loss": 0.86693907, + "learning_rate": 0.0006706522116520023, + "loss": 0.87812167, + "num_input_tokens_seen": 176516144, + "router_z_loss_mlp": 0.35913086, + "step": 2118, + "time_per_iteration": 2.6161422729492188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125881, + "balance_loss_mlp": 1.08766294, + "epoch": 0.4076567910734898, + "flos": 605600312832.0, + "grad_norm": 0.060179733914174166, + "language_loss": 0.83147317, + "learning_rate": 0.0006703593434738127, + "loss": 0.84273201, + "num_input_tokens_seen": 176585712, + "router_z_loss_mlp": 0.38208008, + "step": 2119, + "time_per_iteration": 2.719313383102417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123164, + "balance_loss_mlp": 1.0857563, + "epoch": 0.4078491727587534, + "flos": 479553799680.0, + "grad_norm": 0.06825324786035328, + "language_loss": 0.78421569, + "learning_rate": 0.0006700664091562604, + "loss": 0.79544735, + "num_input_tokens_seen": 176654736, + "router_z_loss_mlp": 0.37402344, + "step": 2120, + "time_per_iteration": 2.569246530532837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125736, + "balance_loss_mlp": 1.09090257, + "epoch": 0.4080415544440169, + "flos": 510384665088.0, + "grad_norm": 0.051920603902655335, + "language_loss": 0.85211694, + "learning_rate": 0.0006697734088130725, + "loss": 0.86337435, + "num_input_tokens_seen": 176722800, + "router_z_loss_mlp": 0.34863281, + "step": 2121, + "time_per_iteration": 2.67394757270813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124636, + "balance_loss_mlp": 1.08732319, + "epoch": 0.4082339361292805, + "flos": 734638947840.0, + "grad_norm": 0.05791753235244458, + "language_loss": 0.85750419, + "learning_rate": 0.0006694803425580018, + "loss": 0.86875051, + "num_input_tokens_seen": 176800320, + "router_z_loss_mlp": 0.37304688, + "step": 2122, + "time_per_iteration": 2.9812121391296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129477, + "balance_loss_mlp": 1.09178257, + "epoch": 0.4084263178145441, + "flos": 457472074752.0, + "grad_norm": 0.06590998571054847, + "language_loss": 0.84986377, + "learning_rate": 0.0006691872105048268, + "loss": 0.86115849, + "num_input_tokens_seen": 176867440, + "router_z_loss_mlp": 0.37646484, + "step": 2123, + "time_per_iteration": 2.56272292137146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137814, + "balance_loss_mlp": 1.10157394, + "epoch": 0.4086186994998076, + "flos": 562931638272.0, + "grad_norm": 0.05742584890727743, + "language_loss": 0.84864831, + "learning_rate": 0.0006688940127673513, + "loss": 0.86002642, + "num_input_tokens_seen": 176942048, + "router_z_loss_mlp": 0.36254883, + "step": 2124, + "time_per_iteration": 2.6935954093933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113197, + "balance_loss_mlp": 1.09642184, + "epoch": 0.4088110811850712, + "flos": 573669651456.0, + "grad_norm": 0.05672589959491125, + "language_loss": 0.85339016, + "learning_rate": 0.0006686007494594049, + "loss": 0.86470985, + "num_input_tokens_seen": 177025104, + "router_z_loss_mlp": 0.35571289, + "step": 2125, + "time_per_iteration": 2.8291172981262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128385, + "balance_loss_mlp": 1.09097719, + "epoch": 0.40900346287033473, + "flos": 456930989568.0, + "grad_norm": 0.06786502616833631, + "language_loss": 0.81025755, + "learning_rate": 0.0006683074206948425, + "loss": 0.82154143, + "num_input_tokens_seen": 177089296, + "router_z_loss_mlp": 0.37402344, + "step": 2126, + "time_per_iteration": 2.5193305015563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126549, + "balance_loss_mlp": 1.09095287, + "epoch": 0.4091958445555983, + "flos": 617395903488.0, + "grad_norm": 0.06065849070073351, + "language_loss": 0.81971312, + "learning_rate": 0.0006680140265875443, + "loss": 0.83097857, + "num_input_tokens_seen": 177163648, + "router_z_loss_mlp": 0.35595703, + "step": 2127, + "time_per_iteration": 2.8254714012145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127015, + "balance_loss_mlp": 1.09184861, + "epoch": 0.40938822624086185, + "flos": 472400750592.0, + "grad_norm": 0.054477830354085016, + "language_loss": 0.95947516, + "learning_rate": 0.0006677205672514162, + "loss": 0.97074527, + "num_input_tokens_seen": 177233856, + "router_z_loss_mlp": 0.35205078, + "step": 2128, + "time_per_iteration": 2.6226608753204346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120165, + "balance_loss_mlp": 1.0867151, + "epoch": 0.40958060792612544, + "flos": 570010535424.0, + "grad_norm": 0.047090391860463696, + "language_loss": 0.88730562, + "learning_rate": 0.000667427042800389, + "loss": 0.8985073, + "num_input_tokens_seen": 177309824, + "router_z_loss_mlp": 0.3347168, + "step": 2129, + "time_per_iteration": 2.7718160152435303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118491, + "balance_loss_mlp": 1.0833478, + "epoch": 0.40977298961138897, + "flos": 609346063872.0, + "grad_norm": 0.05934025192817406, + "language_loss": 0.83200449, + "learning_rate": 0.0006671334533484192, + "loss": 0.84318936, + "num_input_tokens_seen": 177380592, + "router_z_loss_mlp": 0.3515625, + "step": 2130, + "time_per_iteration": 2.7164061069488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126113, + "balance_loss_mlp": 1.09199548, + "epoch": 0.40996537129665256, + "flos": 581744457216.0, + "grad_norm": 0.04849724567471186, + "language_loss": 0.83465552, + "learning_rate": 0.0006668397990094881, + "loss": 0.84591663, + "num_input_tokens_seen": 177454720, + "router_z_loss_mlp": 0.34130859, + "step": 2131, + "time_per_iteration": 2.684115171432495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124098, + "balance_loss_mlp": 1.08738196, + "epoch": 0.41015775298191615, + "flos": 516546948096.0, + "grad_norm": 0.059898700524732326, + "language_loss": 0.84854865, + "learning_rate": 0.0006665460798976027, + "loss": 0.85978961, + "num_input_tokens_seen": 177528224, + "router_z_loss_mlp": 0.3671875, + "step": 2132, + "time_per_iteration": 2.748350143432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114613, + "balance_loss_mlp": 1.07899356, + "epoch": 0.4103501346671797, + "flos": 510354929664.0, + "grad_norm": 0.057665198388541644, + "language_loss": 0.81392014, + "learning_rate": 0.0006662522961267947, + "loss": 0.82506627, + "num_input_tokens_seen": 177598176, + "router_z_loss_mlp": 0.35620117, + "step": 2133, + "time_per_iteration": 2.696699619293213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117281, + "balance_loss_mlp": 1.08192313, + "epoch": 0.41054251635244327, + "flos": 549752126976.0, + "grad_norm": 0.05272213252392562, + "language_loss": 0.87773252, + "learning_rate": 0.0006659584478111211, + "loss": 0.88890535, + "num_input_tokens_seen": 177675840, + "router_z_loss_mlp": 0.35351562, + "step": 2134, + "time_per_iteration": 2.793302536010742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118998, + "balance_loss_mlp": 1.08249605, + "epoch": 0.4107348980377068, + "flos": 839898450432.0, + "grad_norm": 0.06878890228068688, + "language_loss": 0.83315176, + "learning_rate": 0.000665664535064664, + "loss": 0.84434175, + "num_input_tokens_seen": 177751376, + "router_z_loss_mlp": 0.36499023, + "step": 2135, + "time_per_iteration": 3.0627260208129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104736, + "balance_loss_mlp": 1.06987929, + "epoch": 0.4109272797229704, + "flos": 503708461056.0, + "grad_norm": 0.05984370507865806, + "language_loss": 0.83017695, + "learning_rate": 0.0006653705580015303, + "loss": 0.84122425, + "num_input_tokens_seen": 177825264, + "router_z_loss_mlp": 0.34863281, + "step": 2136, + "time_per_iteration": 2.6851253509521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103645, + "balance_loss_mlp": 1.0668807, + "epoch": 0.4111196614082339, + "flos": 610830927360.0, + "grad_norm": 0.07790160743926922, + "language_loss": 0.86554241, + "learning_rate": 0.0006650765167358523, + "loss": 0.87657887, + "num_input_tokens_seen": 177901680, + "router_z_loss_mlp": 0.36743164, + "step": 2137, + "time_per_iteration": 2.7750425338745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111084, + "balance_loss_mlp": 1.07579243, + "epoch": 0.4113120430934975, + "flos": 453165414912.0, + "grad_norm": 0.06074101962252474, + "language_loss": 0.9028185, + "learning_rate": 0.0006647824113817864, + "loss": 0.91392696, + "num_input_tokens_seen": 177965264, + "router_z_loss_mlp": 0.3503418, + "step": 2138, + "time_per_iteration": 2.5466508865356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120271, + "balance_loss_mlp": 1.08348298, + "epoch": 0.41150442477876104, + "flos": 541600971264.0, + "grad_norm": 0.0860402389983067, + "language_loss": 0.81677365, + "learning_rate": 0.000664488242053515, + "loss": 0.82797635, + "num_input_tokens_seen": 178039712, + "router_z_loss_mlp": 0.36767578, + "step": 2139, + "time_per_iteration": 2.7149875164031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114944, + "balance_loss_mlp": 1.08108878, + "epoch": 0.4116968064640246, + "flos": 576291386880.0, + "grad_norm": 0.05168082296105111, + "language_loss": 0.83784723, + "learning_rate": 0.0006641940088652445, + "loss": 0.84899676, + "num_input_tokens_seen": 178114080, + "router_z_loss_mlp": 0.33886719, + "step": 2140, + "time_per_iteration": 2.7871952056884766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118503, + "balance_loss_mlp": 1.08130932, + "epoch": 0.4118891881492882, + "flos": 496115642880.0, + "grad_norm": 0.07036618696819374, + "language_loss": 0.8248812, + "learning_rate": 0.0006638997119312065, + "loss": 0.83606619, + "num_input_tokens_seen": 178188032, + "router_z_loss_mlp": 0.37207031, + "step": 2141, + "time_per_iteration": 2.7391679286956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038617, + "balance_loss_mlp": 1.02841258, + "epoch": 0.41208156983455174, + "flos": 1538395877376.0, + "grad_norm": 0.01970513212166274, + "language_loss": 0.75063306, + "learning_rate": 0.0006636053513656568, + "loss": 0.76101923, + "num_input_tokens_seen": 178395328, + "router_z_loss_mlp": 0.10205078, + "step": 2142, + "time_per_iteration": 4.920190095901489 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113096, + "balance_loss_mlp": 1.09562647, + "epoch": 0.41227395151981533, + "flos": 584968946688.0, + "grad_norm": 0.07114532863779677, + "language_loss": 0.8524918, + "learning_rate": 0.000663310927282877, + "loss": 0.86380136, + "num_input_tokens_seen": 178471952, + "router_z_loss_mlp": 0.35327148, + "step": 2143, + "time_per_iteration": 2.762634515762329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126888, + "balance_loss_mlp": 1.09098172, + "epoch": 0.41246633320507886, + "flos": 442926268416.0, + "grad_norm": 0.06302616573108136, + "language_loss": 0.86451441, + "learning_rate": 0.000663016439797172, + "loss": 0.87578332, + "num_input_tokens_seen": 178542192, + "router_z_loss_mlp": 0.35913086, + "step": 2144, + "time_per_iteration": 2.623093366622925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117577, + "balance_loss_mlp": 1.082816, + "epoch": 0.41265871489034245, + "flos": 579962985984.0, + "grad_norm": 0.054034946771414454, + "language_loss": 0.80777407, + "learning_rate": 0.0006627218890228724, + "loss": 0.81894982, + "num_input_tokens_seen": 178622736, + "router_z_loss_mlp": 0.34765625, + "step": 2145, + "time_per_iteration": 2.79042911529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118351, + "balance_loss_mlp": 1.08373237, + "epoch": 0.412851096575606, + "flos": 761229964800.0, + "grad_norm": 0.06837741268569841, + "language_loss": 0.83587825, + "learning_rate": 0.0006624272750743326, + "loss": 0.84706175, + "num_input_tokens_seen": 178705808, + "router_z_loss_mlp": 0.34643555, + "step": 2146, + "time_per_iteration": 3.0066065788269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110844, + "balance_loss_mlp": 1.07591534, + "epoch": 0.41304347826086957, + "flos": 555353501184.0, + "grad_norm": 0.052525216454956766, + "language_loss": 0.83126348, + "learning_rate": 0.0006621325980659322, + "loss": 0.84237194, + "num_input_tokens_seen": 178781200, + "router_z_loss_mlp": 0.34912109, + "step": 2147, + "time_per_iteration": 2.77634334564209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110797, + "balance_loss_mlp": 1.07429504, + "epoch": 0.41323585994613315, + "flos": 665712940032.0, + "grad_norm": 0.06743799661442922, + "language_loss": 0.82004929, + "learning_rate": 0.000661837858112075, + "loss": 0.83115721, + "num_input_tokens_seen": 178855072, + "router_z_loss_mlp": 0.36499023, + "step": 2148, + "time_per_iteration": 2.8309879302978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108806, + "balance_loss_mlp": 1.07156515, + "epoch": 0.4134282416313967, + "flos": 548699692032.0, + "grad_norm": 0.060878143567582824, + "language_loss": 0.88845801, + "learning_rate": 0.0006615430553271888, + "loss": 0.89954603, + "num_input_tokens_seen": 178927936, + "router_z_loss_mlp": 0.37231445, + "step": 2149, + "time_per_iteration": 2.7831413745880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110838, + "balance_loss_mlp": 1.0737617, + "epoch": 0.4136206233166603, + "flos": 646262489088.0, + "grad_norm": 0.05890657915946428, + "language_loss": 0.85358977, + "learning_rate": 0.0006612481898257264, + "loss": 0.86467361, + "num_input_tokens_seen": 179007792, + "router_z_loss_mlp": 0.34619141, + "step": 2150, + "time_per_iteration": 2.8594231605529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116144, + "balance_loss_mlp": 1.08021438, + "epoch": 0.4138130050019238, + "flos": 517354905600.0, + "grad_norm": 0.0708787663681645, + "language_loss": 0.85383213, + "learning_rate": 0.000660953261722165, + "loss": 0.86499357, + "num_input_tokens_seen": 179075200, + "router_z_loss_mlp": 0.359375, + "step": 2151, + "time_per_iteration": 2.616218090057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110269, + "balance_loss_mlp": 1.07512605, + "epoch": 0.4140053866871874, + "flos": 609254659584.0, + "grad_norm": 0.05740780888166335, + "language_loss": 0.82834315, + "learning_rate": 0.0006606582711310055, + "loss": 0.83944577, + "num_input_tokens_seen": 179144448, + "router_z_loss_mlp": 0.3515625, + "step": 2152, + "time_per_iteration": 2.752922773361206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116071, + "balance_loss_mlp": 1.07918727, + "epoch": 0.4141977683724509, + "flos": 579762925056.0, + "grad_norm": 0.062483875204726216, + "language_loss": 0.83428371, + "learning_rate": 0.0006603632181667736, + "loss": 0.84544444, + "num_input_tokens_seen": 179215776, + "router_z_loss_mlp": 0.36865234, + "step": 2153, + "time_per_iteration": 2.6699299812316895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093007, + "balance_loss_mlp": 1.0828501, + "epoch": 0.4143901500577145, + "flos": 1307312317440.0, + "grad_norm": 0.03944020407638644, + "language_loss": 0.78943324, + "learning_rate": 0.0006600681029440187, + "loss": 0.8003633, + "num_input_tokens_seen": 179436688, + "router_z_loss_mlp": 0.1015625, + "step": 2154, + "time_per_iteration": 4.931839227676392 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117724, + "balance_loss_mlp": 1.0825572, + "epoch": 0.41458253174297804, + "flos": 460189983744.0, + "grad_norm": 0.08977793466970029, + "language_loss": 0.82004881, + "learning_rate": 0.0006597729255773153, + "loss": 0.83122605, + "num_input_tokens_seen": 179503264, + "router_z_loss_mlp": 0.3515625, + "step": 2155, + "time_per_iteration": 2.5100300312042236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114058, + "balance_loss_mlp": 1.07769895, + "epoch": 0.41477491342824163, + "flos": 553364628480.0, + "grad_norm": 0.24704033930451297, + "language_loss": 0.82534748, + "learning_rate": 0.0006594776861812608, + "loss": 0.83648813, + "num_input_tokens_seen": 179574864, + "router_z_loss_mlp": 0.36328125, + "step": 2156, + "time_per_iteration": 2.652275562286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124067, + "balance_loss_mlp": 1.0867784, + "epoch": 0.4149672951135052, + "flos": 697771708416.0, + "grad_norm": 0.053182178449683815, + "language_loss": 0.86615425, + "learning_rate": 0.0006591823848704776, + "loss": 0.87739491, + "num_input_tokens_seen": 179658208, + "router_z_loss_mlp": 0.37280273, + "step": 2157, + "time_per_iteration": 2.958137273788452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123111, + "balance_loss_mlp": 1.08653796, + "epoch": 0.41515967679876875, + "flos": 565750863360.0, + "grad_norm": 0.05319975052329094, + "language_loss": 0.81529272, + "learning_rate": 0.0006588870217596117, + "loss": 0.82652378, + "num_input_tokens_seen": 179732320, + "router_z_loss_mlp": 0.36572266, + "step": 2158, + "time_per_iteration": 2.739755392074585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136072, + "balance_loss_mlp": 1.09847283, + "epoch": 0.41535205848403234, + "flos": 501185843712.0, + "grad_norm": 0.06859141393857857, + "language_loss": 0.85955006, + "learning_rate": 0.0006585915969633334, + "loss": 0.87091076, + "num_input_tokens_seen": 179801616, + "router_z_loss_mlp": 0.37573242, + "step": 2159, + "time_per_iteration": 2.561397075653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138332, + "balance_loss_mlp": 1.1019969, + "epoch": 0.41554444016929587, + "flos": 607554680832.0, + "grad_norm": 0.06079365960323944, + "language_loss": 0.89314306, + "learning_rate": 0.0006582961105963366, + "loss": 0.90452635, + "num_input_tokens_seen": 179876112, + "router_z_loss_mlp": 0.36328125, + "step": 2160, + "time_per_iteration": 2.791609287261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141319, + "balance_loss_mlp": 1.10546052, + "epoch": 0.41573682185455946, + "flos": 529115991552.0, + "grad_norm": 0.06462553372591408, + "language_loss": 0.77976739, + "learning_rate": 0.0006580005627733395, + "loss": 0.79118055, + "num_input_tokens_seen": 179949936, + "router_z_loss_mlp": 0.35913086, + "step": 2161, + "time_per_iteration": 2.6615841388702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152655, + "balance_loss_mlp": 1.11536634, + "epoch": 0.415929203539823, + "flos": 504956187648.0, + "grad_norm": 0.06713934338553489, + "language_loss": 0.82142949, + "learning_rate": 0.0006577049536090838, + "loss": 0.83295602, + "num_input_tokens_seen": 180023184, + "router_z_loss_mlp": 0.37280273, + "step": 2162, + "time_per_iteration": 2.7025601863861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114864, + "balance_loss_mlp": 1.11163712, + "epoch": 0.4161215852250866, + "flos": 582737794560.0, + "grad_norm": 0.06389110494138472, + "language_loss": 0.8567937, + "learning_rate": 0.000657409283218335, + "loss": 0.86828005, + "num_input_tokens_seen": 180091728, + "router_z_loss_mlp": 0.37011719, + "step": 2163, + "time_per_iteration": 2.6993329524993896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160123, + "balance_loss_mlp": 1.12352586, + "epoch": 0.4163139669103501, + "flos": 490697077248.0, + "grad_norm": 0.058431004936437055, + "language_loss": 0.81466627, + "learning_rate": 0.0006571135517158829, + "loss": 0.82626748, + "num_input_tokens_seen": 180162096, + "router_z_loss_mlp": 0.3659668, + "step": 2164, + "time_per_iteration": 2.6519243717193604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114432, + "balance_loss_mlp": 1.13316202, + "epoch": 0.4165063485956137, + "flos": 1288158474240.0, + "grad_norm": 0.04824937130362004, + "language_loss": 0.76764059, + "learning_rate": 0.0006568177592165404, + "loss": 0.77908379, + "num_input_tokens_seen": 180380912, + "router_z_loss_mlp": 0.11181641, + "step": 2165, + "time_per_iteration": 4.770123481750488 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155561, + "balance_loss_mlp": 1.11765289, + "epoch": 0.4166987302808773, + "flos": 495263268864.0, + "grad_norm": 0.07363984603082524, + "language_loss": 0.83210087, + "learning_rate": 0.0006565219058351444, + "loss": 0.84365654, + "num_input_tokens_seen": 180447424, + "router_z_loss_mlp": 0.37866211, + "step": 2166, + "time_per_iteration": 2.6601247787475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144571, + "balance_loss_mlp": 1.10470724, + "epoch": 0.4168911119661408, + "flos": 464071555584.0, + "grad_norm": 0.06568932383648114, + "language_loss": 0.83008349, + "learning_rate": 0.0006562259916865553, + "loss": 0.84152913, + "num_input_tokens_seen": 180516336, + "router_z_loss_mlp": 0.39868164, + "step": 2167, + "time_per_iteration": 2.5785412788391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137223, + "balance_loss_mlp": 1.0999341, + "epoch": 0.4170834936514044, + "flos": 536787730944.0, + "grad_norm": 0.06458514122378838, + "language_loss": 0.79966152, + "learning_rate": 0.0006559300168856573, + "loss": 0.81103373, + "num_input_tokens_seen": 180589824, + "router_z_loss_mlp": 0.37255859, + "step": 2168, + "time_per_iteration": 2.7237303256988525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140045, + "balance_loss_mlp": 1.10316169, + "epoch": 0.41727587533666793, + "flos": 550683795456.0, + "grad_norm": 0.050633821406227124, + "language_loss": 0.86603534, + "learning_rate": 0.0006556339815473577, + "loss": 0.8774358, + "num_input_tokens_seen": 180661296, + "router_z_loss_mlp": 0.36889648, + "step": 2169, + "time_per_iteration": 2.6403653621673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140714, + "balance_loss_mlp": 1.10254359, + "epoch": 0.4174682570219315, + "flos": 631111357440.0, + "grad_norm": 0.05999280354484277, + "language_loss": 0.86559451, + "learning_rate": 0.000655337885786588, + "loss": 0.87700164, + "num_input_tokens_seen": 180744896, + "router_z_loss_mlp": 0.3815918, + "step": 2170, + "time_per_iteration": 2.927175283432007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144977, + "balance_loss_mlp": 1.10737872, + "epoch": 0.41766063870719505, + "flos": 519751613952.0, + "grad_norm": 0.06541761902088469, + "language_loss": 0.85292417, + "learning_rate": 0.0006550417297183025, + "loss": 0.86437398, + "num_input_tokens_seen": 180813008, + "router_z_loss_mlp": 0.37597656, + "step": 2171, + "time_per_iteration": 2.617203950881958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139774, + "balance_loss_mlp": 1.10174668, + "epoch": 0.41785302039245864, + "flos": 557935589376.0, + "grad_norm": 0.06470887192105082, + "language_loss": 0.81668884, + "learning_rate": 0.0006547455134574793, + "loss": 0.82808661, + "num_input_tokens_seen": 180886480, + "router_z_loss_mlp": 0.37988281, + "step": 2172, + "time_per_iteration": 2.6800732612609863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145151, + "balance_loss_mlp": 1.10817289, + "epoch": 0.41804540207772223, + "flos": 788529821184.0, + "grad_norm": 0.06060457888036509, + "language_loss": 0.84434199, + "learning_rate": 0.0006544492371191198, + "loss": 0.85579354, + "num_input_tokens_seen": 180973776, + "router_z_loss_mlp": 0.36962891, + "step": 2173, + "time_per_iteration": 3.134876251220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140851, + "balance_loss_mlp": 1.10113096, + "epoch": 0.41823778376298576, + "flos": 904332418560.0, + "grad_norm": 0.09700819760133231, + "language_loss": 0.83721489, + "learning_rate": 0.0006541529008182485, + "loss": 0.84862345, + "num_input_tokens_seen": 181062768, + "router_z_loss_mlp": 0.39697266, + "step": 2174, + "time_per_iteration": 3.1724131107330322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113477, + "balance_loss_mlp": 1.09893537, + "epoch": 0.41843016544824935, + "flos": 511560811008.0, + "grad_norm": 0.060160949925642034, + "language_loss": 0.87700981, + "learning_rate": 0.0006538565046699136, + "loss": 0.88835752, + "num_input_tokens_seen": 181129872, + "router_z_loss_mlp": 0.3581543, + "step": 2175, + "time_per_iteration": 2.5730292797088623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133428, + "balance_loss_mlp": 1.09683084, + "epoch": 0.4186225471335129, + "flos": 653077085184.0, + "grad_norm": 0.06692113802371265, + "language_loss": 0.81824857, + "learning_rate": 0.0006535600487891862, + "loss": 0.82958287, + "num_input_tokens_seen": 181208112, + "router_z_loss_mlp": 0.3659668, + "step": 2176, + "time_per_iteration": 2.7692394256591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121709, + "balance_loss_mlp": 1.08651876, + "epoch": 0.41881492881877647, + "flos": 569158161408.0, + "grad_norm": 0.07459509047969586, + "language_loss": 0.89445305, + "learning_rate": 0.0006532635332911603, + "loss": 0.90567011, + "num_input_tokens_seen": 181278736, + "router_z_loss_mlp": 0.3515625, + "step": 2177, + "time_per_iteration": 2.668281078338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122309, + "balance_loss_mlp": 1.08449602, + "epoch": 0.41900731050404, + "flos": 911878248960.0, + "grad_norm": 0.054056674099833946, + "language_loss": 0.80669487, + "learning_rate": 0.0006529669582909541, + "loss": 0.81791794, + "num_input_tokens_seen": 181362512, + "router_z_loss_mlp": 0.37744141, + "step": 2178, + "time_per_iteration": 3.234210729598999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132134, + "balance_loss_mlp": 1.0946312, + "epoch": 0.4191996921893036, + "flos": 535755119616.0, + "grad_norm": 0.13706718234639897, + "language_loss": 0.85650241, + "learning_rate": 0.0006526703239037077, + "loss": 0.86782372, + "num_input_tokens_seen": 181432080, + "router_z_loss_mlp": 0.37475586, + "step": 2179, + "time_per_iteration": 2.6495871543884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129835, + "balance_loss_mlp": 1.09094954, + "epoch": 0.4193920738745671, + "flos": 582636478464.0, + "grad_norm": 0.09871097727336539, + "language_loss": 0.86649984, + "learning_rate": 0.0006523736302445851, + "loss": 0.8777982, + "num_input_tokens_seen": 181507296, + "router_z_loss_mlp": 0.38891602, + "step": 2180, + "time_per_iteration": 2.7558817863464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133221, + "balance_loss_mlp": 1.09390545, + "epoch": 0.4195844555598307, + "flos": 1335782472192.0, + "grad_norm": 0.05706426412838818, + "language_loss": 0.77595234, + "learning_rate": 0.0006520768774287728, + "loss": 0.78728461, + "num_input_tokens_seen": 181599408, + "router_z_loss_mlp": 0.39306641, + "step": 2181, + "time_per_iteration": 3.7205944061279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143371, + "balance_loss_mlp": 1.10436535, + "epoch": 0.4197768372450943, + "flos": 598783145472.0, + "grad_norm": 0.06053658357019196, + "language_loss": 0.85689628, + "learning_rate": 0.0006517800655714806, + "loss": 0.86832994, + "num_input_tokens_seen": 181674944, + "router_z_loss_mlp": 0.39013672, + "step": 2182, + "time_per_iteration": 2.8325769901275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140717, + "balance_loss_mlp": 1.10218823, + "epoch": 0.4199692189303578, + "flos": 735261525504.0, + "grad_norm": 0.07751994631636654, + "language_loss": 0.85342467, + "learning_rate": 0.0006514831947879407, + "loss": 0.86483186, + "num_input_tokens_seen": 181756704, + "router_z_loss_mlp": 0.38500977, + "step": 2183, + "time_per_iteration": 2.930466890335083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154155, + "balance_loss_mlp": 1.11531675, + "epoch": 0.4201616006156214, + "flos": 750214794240.0, + "grad_norm": 0.061313063449444025, + "language_loss": 0.78360265, + "learning_rate": 0.0006511862651934091, + "loss": 0.7951442, + "num_input_tokens_seen": 181837952, + "router_z_loss_mlp": 0.38842773, + "step": 2184, + "time_per_iteration": 3.0874462127685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168235, + "balance_loss_mlp": 1.1299212, + "epoch": 0.42035398230088494, + "flos": 547029448704.0, + "grad_norm": 0.07362784353092817, + "language_loss": 0.820894, + "learning_rate": 0.0006508892769031638, + "loss": 0.83257627, + "num_input_tokens_seen": 181906896, + "router_z_loss_mlp": 0.3828125, + "step": 2185, + "time_per_iteration": 2.6239352226257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152043, + "balance_loss_mlp": 1.11551726, + "epoch": 0.42054636398614853, + "flos": 616911717888.0, + "grad_norm": 0.06908564705964859, + "language_loss": 0.87278891, + "learning_rate": 0.000650592230032506, + "loss": 0.88430935, + "num_input_tokens_seen": 181974976, + "router_z_loss_mlp": 0.36523438, + "step": 2186, + "time_per_iteration": 2.7282140254974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149658, + "balance_loss_mlp": 1.11079597, + "epoch": 0.42073874567141206, + "flos": 640394242560.0, + "grad_norm": 0.0823679101553184, + "language_loss": 0.85327846, + "learning_rate": 0.0006502951246967595, + "loss": 0.86477506, + "num_input_tokens_seen": 182054704, + "router_z_loss_mlp": 0.38891602, + "step": 2187, + "time_per_iteration": 2.8729426860809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154068, + "balance_loss_mlp": 1.1164453, + "epoch": 0.42093112735667565, + "flos": 493783174656.0, + "grad_norm": 0.05336445965116177, + "language_loss": 0.86749196, + "learning_rate": 0.0006499979610112706, + "loss": 0.87903261, + "num_input_tokens_seen": 182129696, + "router_z_loss_mlp": 0.3762207, + "step": 2188, + "time_per_iteration": 2.7119579315185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151369, + "balance_loss_mlp": 1.1127454, + "epoch": 0.4211235090419392, + "flos": 542364512256.0, + "grad_norm": 0.055701229884667774, + "language_loss": 0.84561181, + "learning_rate": 0.000649700739091409, + "loss": 0.85712552, + "num_input_tokens_seen": 182203792, + "router_z_loss_mlp": 0.38623047, + "step": 2189, + "time_per_iteration": 2.7023189067840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108128, + "balance_loss_mlp": 1.07126629, + "epoch": 0.42131589072720277, + "flos": 1532149530624.0, + "grad_norm": 0.037864476589066096, + "language_loss": 0.73836273, + "learning_rate": 0.0006494034590525657, + "loss": 0.74917555, + "num_input_tokens_seen": 182432080, + "router_z_loss_mlp": 0.10009766, + "step": 2190, + "time_per_iteration": 4.808679103851318 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142515, + "balance_loss_mlp": 1.10751486, + "epoch": 0.42150827241246636, + "flos": 566852857344.0, + "grad_norm": 0.07155258064415941, + "language_loss": 0.85762346, + "learning_rate": 0.0006491061210101557, + "loss": 0.8690486, + "num_input_tokens_seen": 182500256, + "router_z_loss_mlp": 0.35009766, + "step": 2191, + "time_per_iteration": 2.7315032482147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146027, + "balance_loss_mlp": 1.10880995, + "epoch": 0.4217006540977299, + "flos": 707561174016.0, + "grad_norm": 0.05430057095490736, + "language_loss": 0.84269011, + "learning_rate": 0.0006488087250796157, + "loss": 0.85415035, + "num_input_tokens_seen": 182582912, + "router_z_loss_mlp": 0.37231445, + "step": 2192, + "time_per_iteration": 2.91867995262146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140476, + "balance_loss_mlp": 1.10154223, + "epoch": 0.4218930357829935, + "flos": 627291454464.0, + "grad_norm": 0.05336306174245454, + "language_loss": 0.81998622, + "learning_rate": 0.0006485112713764049, + "loss": 0.83139098, + "num_input_tokens_seen": 182670304, + "router_z_loss_mlp": 0.38916016, + "step": 2193, + "time_per_iteration": 2.954740047454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139123, + "balance_loss_mlp": 1.10178626, + "epoch": 0.422085417468257, + "flos": 460345628160.0, + "grad_norm": 0.05416843930927548, + "language_loss": 0.83712393, + "learning_rate": 0.0006482137600160051, + "loss": 0.84851515, + "num_input_tokens_seen": 182735024, + "router_z_loss_mlp": 0.3737793, + "step": 2194, + "time_per_iteration": 2.4989676475524902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144403, + "balance_loss_mlp": 1.10573113, + "epoch": 0.4222777991535206, + "flos": 474026577408.0, + "grad_norm": 0.05184002865736912, + "language_loss": 0.8501671, + "learning_rate": 0.0006479161911139206, + "loss": 0.86161113, + "num_input_tokens_seen": 182805024, + "router_z_loss_mlp": 0.38671875, + "step": 2195, + "time_per_iteration": 2.5739963054656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135341, + "balance_loss_mlp": 1.09721804, + "epoch": 0.4224701808387841, + "flos": 470886151680.0, + "grad_norm": 0.08063840338659255, + "language_loss": 0.85617948, + "learning_rate": 0.0006476185647856778, + "loss": 0.86753291, + "num_input_tokens_seen": 182871360, + "router_z_loss_mlp": 0.38134766, + "step": 2196, + "time_per_iteration": 2.578218936920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124656, + "balance_loss_mlp": 1.08808231, + "epoch": 0.4226625625240477, + "flos": 677525783040.0, + "grad_norm": 0.05804099842364966, + "language_loss": 0.82180464, + "learning_rate": 0.0006473208811468255, + "loss": 0.8330512, + "num_input_tokens_seen": 182952912, + "router_z_loss_mlp": 0.36547852, + "step": 2197, + "time_per_iteration": 2.8833000659942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123675, + "balance_loss_mlp": 1.08707809, + "epoch": 0.4228549442093113, + "flos": 503525652480.0, + "grad_norm": 0.058050592535879256, + "language_loss": 0.84475237, + "learning_rate": 0.0006470231403129347, + "loss": 0.8559891, + "num_input_tokens_seen": 183022016, + "router_z_loss_mlp": 0.36621094, + "step": 2198, + "time_per_iteration": 2.590959072113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124319, + "balance_loss_mlp": 1.08781683, + "epoch": 0.42304732589457483, + "flos": 611848857600.0, + "grad_norm": 0.05086119187590394, + "language_loss": 0.82252729, + "learning_rate": 0.0006467253423995988, + "loss": 0.83377045, + "num_input_tokens_seen": 183101776, + "router_z_loss_mlp": 0.36499023, + "step": 2199, + "time_per_iteration": 2.8386192321777344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128105, + "balance_loss_mlp": 1.0917697, + "epoch": 0.4232397075798384, + "flos": 515570863104.0, + "grad_norm": 0.06679650853448169, + "language_loss": 0.79627949, + "learning_rate": 0.000646427487522433, + "loss": 0.8075605, + "num_input_tokens_seen": 183171392, + "router_z_loss_mlp": 0.36352539, + "step": 2200, + "time_per_iteration": 2.635103464126587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120926, + "balance_loss_mlp": 1.08423305, + "epoch": 0.42343208926510195, + "flos": 589796868096.0, + "grad_norm": 0.0831390189187338, + "language_loss": 0.83172977, + "learning_rate": 0.0006461295757970749, + "loss": 0.84293896, + "num_input_tokens_seen": 183253936, + "router_z_loss_mlp": 0.36669922, + "step": 2201, + "time_per_iteration": 2.819474697113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118378, + "balance_loss_mlp": 1.07891917, + "epoch": 0.42362447095036554, + "flos": 640636521984.0, + "grad_norm": 0.062417347947693186, + "language_loss": 0.81792694, + "learning_rate": 0.0006458316073391839, + "loss": 0.82911074, + "num_input_tokens_seen": 183333744, + "router_z_loss_mlp": 0.39428711, + "step": 2202, + "time_per_iteration": 2.871166229248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122162, + "balance_loss_mlp": 1.0872103, + "epoch": 0.42381685263562907, + "flos": 512680057344.0, + "grad_norm": 0.05500893378921445, + "language_loss": 0.88072616, + "learning_rate": 0.0006455335822644422, + "loss": 0.89194781, + "num_input_tokens_seen": 183401904, + "router_z_loss_mlp": 0.34936523, + "step": 2203, + "time_per_iteration": 2.6111316680908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123624, + "balance_loss_mlp": 1.08683574, + "epoch": 0.42400923432089266, + "flos": 546782400000.0, + "grad_norm": 0.06843699867702463, + "language_loss": 0.78204858, + "learning_rate": 0.0006452355006885527, + "loss": 0.79328489, + "num_input_tokens_seen": 183471312, + "router_z_loss_mlp": 0.36791992, + "step": 2204, + "time_per_iteration": 2.6248953342437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119975, + "balance_loss_mlp": 1.08209014, + "epoch": 0.4242016160061562, + "flos": 622154815488.0, + "grad_norm": 0.07209183527246785, + "language_loss": 0.87310261, + "learning_rate": 0.0006449373627272412, + "loss": 0.88430238, + "num_input_tokens_seen": 183539184, + "router_z_loss_mlp": 0.37866211, + "step": 2205, + "time_per_iteration": 2.703838348388672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119116, + "balance_loss_mlp": 1.08197045, + "epoch": 0.4243939976914198, + "flos": 571913146368.0, + "grad_norm": 0.07114514004539872, + "language_loss": 0.82698691, + "learning_rate": 0.0006446391684962553, + "loss": 0.8381781, + "num_input_tokens_seen": 183607504, + "router_z_loss_mlp": 0.37158203, + "step": 2206, + "time_per_iteration": 2.6619176864624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115514, + "balance_loss_mlp": 1.08022797, + "epoch": 0.42458637937668336, + "flos": 448740186624.0, + "grad_norm": 0.05684297237550015, + "language_loss": 0.83361518, + "learning_rate": 0.000644340918111364, + "loss": 0.84477031, + "num_input_tokens_seen": 183674720, + "router_z_loss_mlp": 0.3527832, + "step": 2207, + "time_per_iteration": 2.5367140769958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126169, + "balance_loss_mlp": 1.09016824, + "epoch": 0.4247787610619469, + "flos": 435407602176.0, + "grad_norm": 0.07504639835111325, + "language_loss": 0.8513602, + "learning_rate": 0.0006440426116883585, + "loss": 0.8626219, + "num_input_tokens_seen": 183740448, + "router_z_loss_mlp": 0.36010742, + "step": 2208, + "time_per_iteration": 2.5879015922546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118301, + "balance_loss_mlp": 1.08129835, + "epoch": 0.4249711427472105, + "flos": 496078566912.0, + "grad_norm": 0.06421639244231503, + "language_loss": 0.86279738, + "learning_rate": 0.0006437442493430519, + "loss": 0.8739804, + "num_input_tokens_seen": 183812640, + "router_z_loss_mlp": 0.37011719, + "step": 2209, + "time_per_iteration": 2.6396701335906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114919, + "balance_loss_mlp": 1.07741535, + "epoch": 0.425163524432474, + "flos": 655819587072.0, + "grad_norm": 0.06478280605491378, + "language_loss": 0.87082028, + "learning_rate": 0.000643445831191278, + "loss": 0.88196945, + "num_input_tokens_seen": 183895312, + "router_z_loss_mlp": 0.37524414, + "step": 2210, + "time_per_iteration": 2.902726173400879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109225, + "balance_loss_mlp": 1.07265139, + "epoch": 0.4253559061177376, + "flos": 650608796160.0, + "grad_norm": 0.0604627940505335, + "language_loss": 0.81718135, + "learning_rate": 0.0006431473573488937, + "loss": 0.82827359, + "num_input_tokens_seen": 183966384, + "router_z_loss_mlp": 0.3659668, + "step": 2211, + "time_per_iteration": 2.756131887435913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101882, + "balance_loss_mlp": 1.06492758, + "epoch": 0.42554828780300114, + "flos": 554155333632.0, + "grad_norm": 0.0751061946408966, + "language_loss": 0.84961367, + "learning_rate": 0.0006428488279317765, + "loss": 0.86063254, + "num_input_tokens_seen": 184031728, + "router_z_loss_mlp": 0.36938477, + "step": 2212, + "time_per_iteration": 2.6532113552093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100031, + "balance_loss_mlp": 1.06541276, + "epoch": 0.4257406694882647, + "flos": 514407200256.0, + "grad_norm": 0.06889274289933833, + "language_loss": 0.87372804, + "learning_rate": 0.0006425502430558259, + "loss": 0.88472843, + "num_input_tokens_seen": 184096160, + "router_z_loss_mlp": 0.34619141, + "step": 2213, + "time_per_iteration": 2.6332669258117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105988, + "balance_loss_mlp": 1.06874728, + "epoch": 0.42593305117352825, + "flos": 515645015040.0, + "grad_norm": 0.08165118310272598, + "language_loss": 0.84992635, + "learning_rate": 0.0006422516028369628, + "loss": 0.86098623, + "num_input_tokens_seen": 184169664, + "router_z_loss_mlp": 0.37231445, + "step": 2214, + "time_per_iteration": 2.618557929992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098802, + "balance_loss_mlp": 1.06237185, + "epoch": 0.42612543285879184, + "flos": 588059813376.0, + "grad_norm": 0.05512742279801928, + "language_loss": 0.8369562, + "learning_rate": 0.0006419529073911296, + "loss": 0.84794426, + "num_input_tokens_seen": 184249152, + "router_z_loss_mlp": 0.36425781, + "step": 2215, + "time_per_iteration": 2.833543062210083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095611, + "balance_loss_mlp": 1.06166017, + "epoch": 0.42631781454405543, + "flos": 635472345600.0, + "grad_norm": 0.0818108199754697, + "language_loss": 0.85651129, + "learning_rate": 0.0006416541568342901, + "loss": 0.8674674, + "num_input_tokens_seen": 184326816, + "router_z_loss_mlp": 0.33935547, + "step": 2216, + "time_per_iteration": 2.8430728912353516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097148, + "balance_loss_mlp": 1.0622437, + "epoch": 0.42651019622931896, + "flos": 541161202176.0, + "grad_norm": 0.05864229124252446, + "language_loss": 0.84272695, + "learning_rate": 0.0006413553512824297, + "loss": 0.85369843, + "num_input_tokens_seen": 184404336, + "router_z_loss_mlp": 0.34912109, + "step": 2217, + "time_per_iteration": 2.7368276119232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095005, + "balance_loss_mlp": 1.05943322, + "epoch": 0.42670257791458255, + "flos": 558158045184.0, + "grad_norm": 0.06419705252846208, + "language_loss": 0.84589773, + "learning_rate": 0.0006410564908515549, + "loss": 0.85684776, + "num_input_tokens_seen": 184472320, + "router_z_loss_mlp": 0.35595703, + "step": 2218, + "time_per_iteration": 2.650841236114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096054, + "balance_loss_mlp": 1.06052935, + "epoch": 0.4268949595998461, + "flos": 621309782016.0, + "grad_norm": 0.06892642653628764, + "language_loss": 0.85406113, + "learning_rate": 0.0006407575756576935, + "loss": 0.86502165, + "num_input_tokens_seen": 184544704, + "router_z_loss_mlp": 0.35546875, + "step": 2219, + "time_per_iteration": 2.7199461460113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103421, + "balance_loss_mlp": 1.0681113, + "epoch": 0.42708734128510967, + "flos": 537919460352.0, + "grad_norm": 0.055123892223664483, + "language_loss": 0.88112384, + "learning_rate": 0.0006404586058168951, + "loss": 0.89215803, + "num_input_tokens_seen": 184622544, + "router_z_loss_mlp": 0.35327148, + "step": 2220, + "time_per_iteration": 2.7125062942504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106703, + "balance_loss_mlp": 1.07129836, + "epoch": 0.4272797229703732, + "flos": 502865998848.0, + "grad_norm": 0.06740071030395202, + "language_loss": 0.86848915, + "learning_rate": 0.0006401595814452296, + "loss": 0.87955624, + "num_input_tokens_seen": 184692544, + "router_z_loss_mlp": 0.35424805, + "step": 2221, + "time_per_iteration": 2.6037752628326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108589, + "balance_loss_mlp": 1.07349372, + "epoch": 0.4274721046556368, + "flos": 492453955584.0, + "grad_norm": 0.06763062065124635, + "language_loss": 0.81391692, + "learning_rate": 0.000639860502658789, + "loss": 0.82500279, + "num_input_tokens_seen": 184760480, + "router_z_loss_mlp": 0.35131836, + "step": 2222, + "time_per_iteration": 2.620530366897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110206, + "balance_loss_mlp": 1.07475281, + "epoch": 0.4276644863409004, + "flos": 568367456256.0, + "grad_norm": 0.07514934658842116, + "language_loss": 0.85168004, + "learning_rate": 0.0006395613695736853, + "loss": 0.86278212, + "num_input_tokens_seen": 184834080, + "router_z_loss_mlp": 0.35449219, + "step": 2223, + "time_per_iteration": 2.67494797706604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106777, + "balance_loss_mlp": 1.07015634, + "epoch": 0.4278568680261639, + "flos": 607436112384.0, + "grad_norm": 0.06258659729032073, + "language_loss": 0.81998539, + "learning_rate": 0.0006392621823060529, + "loss": 0.83105314, + "num_input_tokens_seen": 184905872, + "router_z_loss_mlp": 0.36621094, + "step": 2224, + "time_per_iteration": 2.729048490524292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107838, + "balance_loss_mlp": 1.07197976, + "epoch": 0.4280492497114275, + "flos": 560527589376.0, + "grad_norm": 0.07791132694448914, + "language_loss": 0.85259843, + "learning_rate": 0.0006389629409720465, + "loss": 0.86367679, + "num_input_tokens_seen": 184972320, + "router_z_loss_mlp": 0.35839844, + "step": 2225, + "time_per_iteration": 2.6461989879608154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102526, + "balance_loss_mlp": 1.06836081, + "epoch": 0.428241631396691, + "flos": 720646709760.0, + "grad_norm": 0.06694393428490365, + "language_loss": 0.88831687, + "learning_rate": 0.0006386636456878417, + "loss": 0.89934212, + "num_input_tokens_seen": 185051040, + "router_z_loss_mlp": 0.34155273, + "step": 2226, + "time_per_iteration": 2.8701326847076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106393, + "balance_loss_mlp": 1.07091641, + "epoch": 0.4284340130819546, + "flos": 429467774976.0, + "grad_norm": 0.07990341915486338, + "language_loss": 0.92087269, + "learning_rate": 0.0006383642965696353, + "loss": 0.93193656, + "num_input_tokens_seen": 185113552, + "router_z_loss_mlp": 0.35473633, + "step": 2227, + "time_per_iteration": 2.4640464782714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102589, + "balance_loss_mlp": 1.06544292, + "epoch": 0.42862639476721814, + "flos": 525016733184.0, + "grad_norm": 0.053395147694407376, + "language_loss": 0.82962096, + "learning_rate": 0.000638064893733645, + "loss": 0.84064686, + "num_input_tokens_seen": 185185056, + "router_z_loss_mlp": 0.37158203, + "step": 2228, + "time_per_iteration": 2.783597946166992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117505, + "balance_loss_mlp": 1.08198094, + "epoch": 0.42881877645248173, + "flos": 465346446336.0, + "grad_norm": 0.07356604001224937, + "language_loss": 0.89838171, + "learning_rate": 0.000637765437296109, + "loss": 0.90955675, + "num_input_tokens_seen": 185257248, + "router_z_loss_mlp": 0.35522461, + "step": 2229, + "time_per_iteration": 2.6639621257781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112327, + "balance_loss_mlp": 1.07644475, + "epoch": 0.42901115813774526, + "flos": 560297793024.0, + "grad_norm": 0.05563237387214821, + "language_loss": 0.85128897, + "learning_rate": 0.000637465927373287, + "loss": 0.86241227, + "num_input_tokens_seen": 185324800, + "router_z_loss_mlp": 0.35913086, + "step": 2230, + "time_per_iteration": 2.6883277893066406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107316, + "balance_loss_mlp": 1.07253075, + "epoch": 0.42920353982300885, + "flos": 561454115328.0, + "grad_norm": 0.06522010118943229, + "language_loss": 0.78980476, + "learning_rate": 0.000637166364081459, + "loss": 0.80087787, + "num_input_tokens_seen": 185393408, + "router_z_loss_mlp": 0.34790039, + "step": 2231, + "time_per_iteration": 2.711379051208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111807, + "balance_loss_mlp": 1.07814288, + "epoch": 0.42939592150827244, + "flos": 556248093696.0, + "grad_norm": 0.06512604260411947, + "language_loss": 0.84616333, + "learning_rate": 0.0006368667475369256, + "loss": 0.85728139, + "num_input_tokens_seen": 185467968, + "router_z_loss_mlp": 0.33666992, + "step": 2232, + "time_per_iteration": 2.7521519660949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083825, + "balance_loss_mlp": 1.07271492, + "epoch": 0.42958830319353597, + "flos": 1521623688192.0, + "grad_norm": 0.03367734377341464, + "language_loss": 0.78527778, + "learning_rate": 0.0006365670778560084, + "loss": 0.79611605, + "num_input_tokens_seen": 185705232, + "router_z_loss_mlp": 0.11132812, + "step": 2233, + "time_per_iteration": 4.941352605819702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106502, + "balance_loss_mlp": 1.05414832, + "epoch": 0.42978068487879956, + "flos": 1495813837824.0, + "grad_norm": 0.027928692850204096, + "language_loss": 0.78895426, + "learning_rate": 0.0006362673551550494, + "loss": 0.79960448, + "num_input_tokens_seen": 185932672, + "router_z_loss_mlp": 0.10888672, + "step": 2234, + "time_per_iteration": 4.825460910797119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117593, + "balance_loss_mlp": 1.08302259, + "epoch": 0.4299730665640631, + "flos": 546992372736.0, + "grad_norm": 0.05150642259295507, + "language_loss": 0.86345804, + "learning_rate": 0.0006359675795504112, + "loss": 0.87463403, + "num_input_tokens_seen": 186006288, + "router_z_loss_mlp": 0.34619141, + "step": 2235, + "time_per_iteration": 2.662977695465088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127167, + "balance_loss_mlp": 1.09099901, + "epoch": 0.4301654482493267, + "flos": 1129293342720.0, + "grad_norm": 0.07348370683515035, + "language_loss": 0.74711537, + "learning_rate": 0.0006356677511584775, + "loss": 0.75838703, + "num_input_tokens_seen": 186097168, + "router_z_loss_mlp": 0.36181641, + "step": 2236, + "time_per_iteration": 3.51220965385437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127234, + "balance_loss_mlp": 1.09337878, + "epoch": 0.4303578299345902, + "flos": 495750025728.0, + "grad_norm": 0.061045373266899905, + "language_loss": 0.86476523, + "learning_rate": 0.0006353678700956511, + "loss": 0.8760376, + "num_input_tokens_seen": 186163904, + "router_z_loss_mlp": 0.33886719, + "step": 2237, + "time_per_iteration": 2.60677170753479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125139, + "balance_loss_mlp": 1.09085464, + "epoch": 0.4305502116198538, + "flos": 615762736128.0, + "grad_norm": 0.06413862374233106, + "language_loss": 0.83745819, + "learning_rate": 0.0006350679364783569, + "loss": 0.84870958, + "num_input_tokens_seen": 186233888, + "router_z_loss_mlp": 0.34326172, + "step": 2238, + "time_per_iteration": 2.7771050930023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117438, + "balance_loss_mlp": 1.08212781, + "epoch": 0.4307425933051173, + "flos": 559260039168.0, + "grad_norm": 0.057478588831895126, + "language_loss": 0.85746336, + "learning_rate": 0.0006347679504230393, + "loss": 0.86863768, + "num_input_tokens_seen": 186301168, + "router_z_loss_mlp": 0.35351562, + "step": 2239, + "time_per_iteration": 2.6826984882354736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120648, + "balance_loss_mlp": 1.08405077, + "epoch": 0.4309349749903809, + "flos": 972166344192.0, + "grad_norm": 0.0566935574955873, + "language_loss": 0.76113296, + "learning_rate": 0.0006344679120461632, + "loss": 0.7723394, + "num_input_tokens_seen": 186392096, + "router_z_loss_mlp": 0.36621094, + "step": 2240, + "time_per_iteration": 3.3756330013275146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122418, + "balance_loss_mlp": 1.0843904, + "epoch": 0.4311273566756445, + "flos": 541924743168.0, + "grad_norm": 0.06383187448999712, + "language_loss": 0.80362582, + "learning_rate": 0.0006341678214642134, + "loss": 0.81484997, + "num_input_tokens_seen": 186458000, + "router_z_loss_mlp": 0.38037109, + "step": 2241, + "time_per_iteration": 2.6837639808654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121026, + "balance_loss_mlp": 1.08633661, + "epoch": 0.43131973836090803, + "flos": 761674503168.0, + "grad_norm": 0.06213603676301435, + "language_loss": 0.82894886, + "learning_rate": 0.0006338676787936963, + "loss": 0.84015912, + "num_input_tokens_seen": 186544992, + "router_z_loss_mlp": 0.34716797, + "step": 2242, + "time_per_iteration": 3.0835442543029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126247, + "balance_loss_mlp": 1.09019864, + "epoch": 0.4315121200461716, + "flos": 554530862592.0, + "grad_norm": 0.06026893794725229, + "language_loss": 0.83885002, + "learning_rate": 0.0006335674841511367, + "loss": 0.85011244, + "num_input_tokens_seen": 186614960, + "router_z_loss_mlp": 0.36035156, + "step": 2243, + "time_per_iteration": 2.6649861335754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054007, + "balance_loss_mlp": 1.04466057, + "epoch": 0.43170450173143515, + "flos": 1485334609920.0, + "grad_norm": 0.029651379922801115, + "language_loss": 0.7918117, + "learning_rate": 0.000633267237653081, + "loss": 0.80235171, + "num_input_tokens_seen": 186854288, + "router_z_loss_mlp": 0.09326172, + "step": 2244, + "time_per_iteration": 5.015843868255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043427, + "balance_loss_mlp": 1.03412855, + "epoch": 0.43189688341669874, + "flos": 1473697234944.0, + "grad_norm": 0.025217175998849217, + "language_loss": 0.77365553, + "learning_rate": 0.0006329669394160953, + "loss": 0.7840898, + "num_input_tokens_seen": 187090272, + "router_z_loss_mlp": 0.09277344, + "step": 2245, + "time_per_iteration": 4.923234939575195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118877, + "balance_loss_mlp": 1.08282828, + "epoch": 0.43208926510196227, + "flos": 492938141184.0, + "grad_norm": 0.05723795681410829, + "language_loss": 0.83027297, + "learning_rate": 0.0006326665895567652, + "loss": 0.84146178, + "num_input_tokens_seen": 187157584, + "router_z_loss_mlp": 0.3605957, + "step": 2246, + "time_per_iteration": 2.6065175533294678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112241, + "balance_loss_mlp": 1.08652771, + "epoch": 0.43228164678722586, + "flos": 520235799552.0, + "grad_norm": 0.06570844887047847, + "language_loss": 0.87358153, + "learning_rate": 0.0006323661881916976, + "loss": 0.88480568, + "num_input_tokens_seen": 187229408, + "router_z_loss_mlp": 0.35864258, + "step": 2247, + "time_per_iteration": 2.682924509048462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124487, + "balance_loss_mlp": 1.08996427, + "epoch": 0.4324740284724894, + "flos": 796056201216.0, + "grad_norm": 0.05864327339271887, + "language_loss": 0.8139447, + "learning_rate": 0.0006320657354375179, + "loss": 0.82518953, + "num_input_tokens_seen": 187304384, + "router_z_loss_mlp": 0.34521484, + "step": 2248, + "time_per_iteration": 2.9315433502197266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112819, + "balance_loss_mlp": 1.09125865, + "epoch": 0.432666410157753, + "flos": 482153140224.0, + "grad_norm": 0.05550733837968219, + "language_loss": 0.87244421, + "learning_rate": 0.0006317652314108726, + "loss": 0.88372612, + "num_input_tokens_seen": 187368064, + "router_z_loss_mlp": 0.36938477, + "step": 2249, + "time_per_iteration": 2.5357820987701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125911, + "balance_loss_mlp": 1.09186506, + "epoch": 0.43285879184301657, + "flos": 500212329984.0, + "grad_norm": 0.06944226399680122, + "language_loss": 0.91745955, + "learning_rate": 0.0006314646762284277, + "loss": 0.92871869, + "num_input_tokens_seen": 187436320, + "router_z_loss_mlp": 0.34057617, + "step": 2250, + "time_per_iteration": 2.650629997253418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010844, + "balance_loss_mlp": 1.00116396, + "epoch": 0.4330511735282801, + "flos": 1510448103936.0, + "grad_norm": 0.012503035455709091, + "language_loss": 0.75425828, + "learning_rate": 0.0006311640700068691, + "loss": 0.76436675, + "num_input_tokens_seen": 187670912, + "router_z_loss_mlp": 0.09667969, + "step": 2251, + "time_per_iteration": 4.895758867263794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118011, + "balance_loss_mlp": 1.08341658, + "epoch": 0.4332435552135437, + "flos": 699582915072.0, + "grad_norm": 0.05843208138947643, + "language_loss": 0.77784407, + "learning_rate": 0.0006308634128629022, + "loss": 0.78902417, + "num_input_tokens_seen": 187746432, + "router_z_loss_mlp": 0.34570312, + "step": 2252, + "time_per_iteration": 2.916623592376709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112901, + "balance_loss_mlp": 1.09289002, + "epoch": 0.4334359368988072, + "flos": 592292321280.0, + "grad_norm": 0.0729174620046303, + "language_loss": 0.87908506, + "learning_rate": 0.0006305627049132531, + "loss": 0.89037514, + "num_input_tokens_seen": 187820032, + "router_z_loss_mlp": 0.36132812, + "step": 2253, + "time_per_iteration": 2.741239070892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121412, + "balance_loss_mlp": 1.08660293, + "epoch": 0.4336283185840708, + "flos": 842806508544.0, + "grad_norm": 0.05583951255628595, + "language_loss": 0.8599245, + "learning_rate": 0.0006302619462746662, + "loss": 0.87113857, + "num_input_tokens_seen": 187904400, + "router_z_loss_mlp": 0.34814453, + "step": 2254, + "time_per_iteration": 3.1628546714782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123282, + "balance_loss_mlp": 1.08966494, + "epoch": 0.43382070026933434, + "flos": 626258843136.0, + "grad_norm": 0.05704174545577272, + "language_loss": 0.90291667, + "learning_rate": 0.0006299611370639069, + "loss": 0.91414952, + "num_input_tokens_seen": 187973264, + "router_z_loss_mlp": 0.33618164, + "step": 2255, + "time_per_iteration": 2.7106690406799316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125975, + "balance_loss_mlp": 1.09157157, + "epoch": 0.4340130819545979, + "flos": 591111406080.0, + "grad_norm": 0.06008787734976465, + "language_loss": 0.79589838, + "learning_rate": 0.0006296602773977593, + "loss": 0.80715805, + "num_input_tokens_seen": 188039984, + "router_z_loss_mlp": 0.34423828, + "step": 2256, + "time_per_iteration": 2.673064947128296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121887, + "balance_loss_mlp": 1.08652973, + "epoch": 0.4342054636398615, + "flos": 490889797632.0, + "grad_norm": 0.05906133720876415, + "language_loss": 0.87730187, + "learning_rate": 0.0006293593673930277, + "loss": 0.88852072, + "num_input_tokens_seen": 188113456, + "router_z_loss_mlp": 0.35400391, + "step": 2257, + "time_per_iteration": 2.6278131008148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115203, + "balance_loss_mlp": 1.08010745, + "epoch": 0.43439784532512504, + "flos": 698994842112.0, + "grad_norm": 0.07846710421999975, + "language_loss": 0.7888447, + "learning_rate": 0.0006290584071665358, + "loss": 0.79999673, + "num_input_tokens_seen": 188192480, + "router_z_loss_mlp": 0.35107422, + "step": 2258, + "time_per_iteration": 2.8708009719848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112425, + "balance_loss_mlp": 1.07709181, + "epoch": 0.43459022701038863, + "flos": 485824739328.0, + "grad_norm": 0.06520269446334741, + "language_loss": 0.82244253, + "learning_rate": 0.0006287573968351266, + "loss": 0.83356678, + "num_input_tokens_seen": 188258784, + "router_z_loss_mlp": 0.35351562, + "step": 2259, + "time_per_iteration": 2.5682222843170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113367, + "balance_loss_mlp": 1.07729471, + "epoch": 0.43478260869565216, + "flos": 643107382272.0, + "grad_norm": 0.07246583855612315, + "language_loss": 0.82777989, + "learning_rate": 0.0006284563365156626, + "loss": 0.83891356, + "num_input_tokens_seen": 188331312, + "router_z_loss_mlp": 0.3605957, + "step": 2260, + "time_per_iteration": 2.827087879180908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108747, + "balance_loss_mlp": 1.07148242, + "epoch": 0.43497499038091575, + "flos": 426097552896.0, + "grad_norm": 0.12125557864683041, + "language_loss": 0.87600839, + "learning_rate": 0.0006281552263250261, + "loss": 0.88709581, + "num_input_tokens_seen": 188393712, + "router_z_loss_mlp": 0.37255859, + "step": 2261, + "time_per_iteration": 2.479753017425537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01031263, + "balance_loss_mlp": 1.02072453, + "epoch": 0.4351673720661793, + "flos": 1538378625024.0, + "grad_norm": 0.029168664611412945, + "language_loss": 0.80691534, + "learning_rate": 0.000627854066380118, + "loss": 0.81722796, + "num_input_tokens_seen": 188621152, + "router_z_loss_mlp": 0.10546875, + "step": 2262, + "time_per_iteration": 4.812009334564209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106503, + "balance_loss_mlp": 1.07104969, + "epoch": 0.43535975375144287, + "flos": 749155018752.0, + "grad_norm": 0.06614620097740347, + "language_loss": 0.81361771, + "learning_rate": 0.0006275528567978593, + "loss": 0.82468277, + "num_input_tokens_seen": 188697120, + "router_z_loss_mlp": 0.35449219, + "step": 2263, + "time_per_iteration": 2.903029203414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115264, + "balance_loss_mlp": 1.07923913, + "epoch": 0.4355521354367064, + "flos": 861280874496.0, + "grad_norm": 0.07895665669601973, + "language_loss": 0.82951373, + "learning_rate": 0.0006272515976951898, + "loss": 0.84066635, + "num_input_tokens_seen": 188778480, + "router_z_loss_mlp": 0.3605957, + "step": 2264, + "time_per_iteration": 3.066096544265747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109411, + "balance_loss_mlp": 1.07300496, + "epoch": 0.43574451712197, + "flos": 734527719936.0, + "grad_norm": 0.06560373300441709, + "language_loss": 0.79299462, + "learning_rate": 0.0006269502891890687, + "loss": 0.80408877, + "num_input_tokens_seen": 188863616, + "router_z_loss_mlp": 0.36425781, + "step": 2265, + "time_per_iteration": 3.036302089691162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098467, + "balance_loss_mlp": 1.06504071, + "epoch": 0.4359368988072336, + "flos": 570578784768.0, + "grad_norm": 0.05296436812265497, + "language_loss": 0.88411891, + "learning_rate": 0.0006266489313964743, + "loss": 0.89510357, + "num_input_tokens_seen": 188933984, + "router_z_loss_mlp": 0.33447266, + "step": 2266, + "time_per_iteration": 2.766963481903076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105293, + "balance_loss_mlp": 1.06907725, + "epoch": 0.4361292804924971, + "flos": 555528969216.0, + "grad_norm": 0.057339134399699385, + "language_loss": 0.85443783, + "learning_rate": 0.0006263475244344041, + "loss": 0.86549073, + "num_input_tokens_seen": 189012976, + "router_z_loss_mlp": 0.36230469, + "step": 2267, + "time_per_iteration": 2.8397552967071533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104848, + "balance_loss_mlp": 1.0681076, + "epoch": 0.4363216621777607, + "flos": 557285847552.0, + "grad_norm": 0.06097162500725226, + "language_loss": 0.84725475, + "learning_rate": 0.0006260460684198746, + "loss": 0.85830331, + "num_input_tokens_seen": 189079664, + "router_z_loss_mlp": 0.36743164, + "step": 2268, + "time_per_iteration": 2.725037097930908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105446, + "balance_loss_mlp": 1.06901538, + "epoch": 0.4365140438630242, + "flos": 478222009344.0, + "grad_norm": 0.07238177879654556, + "language_loss": 0.84404624, + "learning_rate": 0.0006257445634699213, + "loss": 0.85510075, + "num_input_tokens_seen": 189144688, + "router_z_loss_mlp": 0.36425781, + "step": 2269, + "time_per_iteration": 2.623194456100464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105034, + "balance_loss_mlp": 1.06855631, + "epoch": 0.4367064255482878, + "flos": 578917891584.0, + "grad_norm": 0.060050482587473634, + "language_loss": 0.83212304, + "learning_rate": 0.0006254430097015993, + "loss": 0.84317344, + "num_input_tokens_seen": 189213984, + "router_z_loss_mlp": 0.36499023, + "step": 2270, + "time_per_iteration": 2.6570417881011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037679, + "balance_loss_mlp": 1.02752221, + "epoch": 0.43689880723355135, + "flos": 1458946225152.0, + "grad_norm": 0.021802814945167073, + "language_loss": 0.76479089, + "learning_rate": 0.0006251414072319815, + "loss": 0.7751677, + "num_input_tokens_seen": 189434416, + "router_z_loss_mlp": 0.1015625, + "step": 2271, + "time_per_iteration": 4.800662517547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109594, + "balance_loss_mlp": 1.07299662, + "epoch": 0.43709118891881493, + "flos": 667610408448.0, + "grad_norm": 0.08079345415457889, + "language_loss": 0.85730046, + "learning_rate": 0.0006248397561781609, + "loss": 0.8683964, + "num_input_tokens_seen": 189513248, + "router_z_loss_mlp": 0.3659668, + "step": 2272, + "time_per_iteration": 2.879779815673828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110506, + "balance_loss_mlp": 1.07312167, + "epoch": 0.43728357060407846, + "flos": 544872448512.0, + "grad_norm": 0.06456885574264018, + "language_loss": 0.86308181, + "learning_rate": 0.0006245380566572482, + "loss": 0.87418681, + "num_input_tokens_seen": 189585392, + "router_z_loss_mlp": 0.37402344, + "step": 2273, + "time_per_iteration": 2.671515703201294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108969, + "balance_loss_mlp": 1.07227635, + "epoch": 0.43747595228934205, + "flos": 746839802880.0, + "grad_norm": 0.07977356034675265, + "language_loss": 0.76295209, + "learning_rate": 0.0006242363087863744, + "loss": 0.77404177, + "num_input_tokens_seen": 189667552, + "router_z_loss_mlp": 0.36669922, + "step": 2274, + "time_per_iteration": 3.0036468505859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110424, + "balance_loss_mlp": 1.07430363, + "epoch": 0.43766833397460564, + "flos": 631353636864.0, + "grad_norm": 0.06387432282930158, + "language_loss": 0.86488557, + "learning_rate": 0.0006239345126826878, + "loss": 0.87598979, + "num_input_tokens_seen": 189742048, + "router_z_loss_mlp": 0.36132812, + "step": 2275, + "time_per_iteration": 2.8046963214874268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113113, + "balance_loss_mlp": 1.07551455, + "epoch": 0.43786071565986917, + "flos": 530986295808.0, + "grad_norm": 0.06304446482372832, + "language_loss": 0.84217036, + "learning_rate": 0.0006236326684633561, + "loss": 0.85330147, + "num_input_tokens_seen": 189817968, + "router_z_loss_mlp": 0.37597656, + "step": 2276, + "time_per_iteration": 2.8238136768341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113871, + "balance_loss_mlp": 1.07725024, + "epoch": 0.43805309734513276, + "flos": 538547180544.0, + "grad_norm": 0.07202298424456109, + "language_loss": 0.75335848, + "learning_rate": 0.0006233307762455658, + "loss": 0.76449716, + "num_input_tokens_seen": 189882608, + "router_z_loss_mlp": 0.36645508, + "step": 2277, + "time_per_iteration": 2.6191978454589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121381, + "balance_loss_mlp": 1.08576083, + "epoch": 0.4382454790303963, + "flos": 864542439936.0, + "grad_norm": 0.053405108271766075, + "language_loss": 0.8389169, + "learning_rate": 0.0006230288361465216, + "loss": 0.85013068, + "num_input_tokens_seen": 189960608, + "router_z_loss_mlp": 0.35644531, + "step": 2278, + "time_per_iteration": 3.0405595302581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113147, + "balance_loss_mlp": 1.09399056, + "epoch": 0.4384378607156599, + "flos": 765499548672.0, + "grad_norm": 0.06317085407877503, + "language_loss": 0.85187429, + "learning_rate": 0.0006227268482834473, + "loss": 0.86318898, + "num_input_tokens_seen": 190035472, + "router_z_loss_mlp": 0.37475586, + "step": 2279, + "time_per_iteration": 2.884791135787964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140517, + "balance_loss_mlp": 1.10272789, + "epoch": 0.4386302424009234, + "flos": 668566669824.0, + "grad_norm": 0.08374351035766264, + "language_loss": 0.87551039, + "learning_rate": 0.000622424812773585, + "loss": 0.88691556, + "num_input_tokens_seen": 190109312, + "router_z_loss_mlp": 0.37768555, + "step": 2280, + "time_per_iteration": 2.790846824645996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129266, + "balance_loss_mlp": 1.09150028, + "epoch": 0.438822624086187, + "flos": 485182338048.0, + "grad_norm": 0.07881944372222376, + "language_loss": 0.79747838, + "learning_rate": 0.000622122729734195, + "loss": 0.80877101, + "num_input_tokens_seen": 190174176, + "router_z_loss_mlp": 0.37744141, + "step": 2281, + "time_per_iteration": 2.5392401218414307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130236, + "balance_loss_mlp": 1.09404397, + "epoch": 0.4390150057714506, + "flos": 499218992640.0, + "grad_norm": 0.06512890224106707, + "language_loss": 0.87574816, + "learning_rate": 0.0006218205992825566, + "loss": 0.88705051, + "num_input_tokens_seen": 190243888, + "router_z_loss_mlp": 0.36206055, + "step": 2282, + "time_per_iteration": 2.6409003734588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130264, + "balance_loss_mlp": 1.09304714, + "epoch": 0.4392073874567141, + "flos": 558219714048.0, + "grad_norm": 0.058092029820517505, + "language_loss": 0.82094592, + "learning_rate": 0.0006215184215359671, + "loss": 0.83224851, + "num_input_tokens_seen": 190317504, + "router_z_loss_mlp": 0.37207031, + "step": 2283, + "time_per_iteration": 2.798405647277832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112171, + "balance_loss_mlp": 1.08506513, + "epoch": 0.4393997691419777, + "flos": 605306276352.0, + "grad_norm": 0.06799742884418125, + "language_loss": 0.86864793, + "learning_rate": 0.0006212161966117425, + "loss": 0.87986505, + "num_input_tokens_seen": 190390160, + "router_z_loss_mlp": 0.36669922, + "step": 2284, + "time_per_iteration": 2.7305543422698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120327, + "balance_loss_mlp": 1.0823704, + "epoch": 0.43959215082724123, + "flos": 804145688064.0, + "grad_norm": 0.0718064317498989, + "language_loss": 0.81899178, + "learning_rate": 0.0006209139246272164, + "loss": 0.83019507, + "num_input_tokens_seen": 190467600, + "router_z_loss_mlp": 0.37939453, + "step": 2285, + "time_per_iteration": 2.9496707916259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114672, + "balance_loss_mlp": 1.07569027, + "epoch": 0.4397845325125048, + "flos": 487643286528.0, + "grad_norm": 0.0666339573323591, + "language_loss": 0.81558084, + "learning_rate": 0.0006206116056997421, + "loss": 0.82672757, + "num_input_tokens_seen": 190534192, + "router_z_loss_mlp": 0.38964844, + "step": 2286, + "time_per_iteration": 2.56559681892395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112208, + "balance_loss_mlp": 1.08414793, + "epoch": 0.43997691419776835, + "flos": 480811438080.0, + "grad_norm": 0.07939984369379535, + "language_loss": 0.82495737, + "learning_rate": 0.0006203092399466892, + "loss": 0.83617818, + "num_input_tokens_seen": 190601440, + "router_z_loss_mlp": 0.37915039, + "step": 2287, + "time_per_iteration": 2.614211082458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119992, + "balance_loss_mlp": 1.08368051, + "epoch": 0.44016929588303194, + "flos": 483124082688.0, + "grad_norm": 0.05953237575059506, + "language_loss": 0.85318255, + "learning_rate": 0.0006200068274854473, + "loss": 0.86438239, + "num_input_tokens_seen": 190672528, + "router_z_loss_mlp": 0.36303711, + "step": 2288, + "time_per_iteration": 2.6718688011169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123012, + "balance_loss_mlp": 1.08679628, + "epoch": 0.4403616775682955, + "flos": 571853675520.0, + "grad_norm": 0.0828196201385275, + "language_loss": 0.86406159, + "learning_rate": 0.0006197043684334229, + "loss": 0.87529171, + "num_input_tokens_seen": 190750704, + "router_z_loss_mlp": 0.36230469, + "step": 2289, + "time_per_iteration": 2.7540907859802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128468, + "balance_loss_mlp": 1.09158421, + "epoch": 0.44055405925355906, + "flos": 630849627648.0, + "grad_norm": 0.11266642339430595, + "language_loss": 0.79650962, + "learning_rate": 0.0006194018629080411, + "loss": 0.80779433, + "num_input_tokens_seen": 190821664, + "router_z_loss_mlp": 0.36865234, + "step": 2290, + "time_per_iteration": 2.7200653553009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127999, + "balance_loss_mlp": 1.09099627, + "epoch": 0.44074644093882265, + "flos": 536782961664.0, + "grad_norm": 0.0658560511601545, + "language_loss": 0.81793892, + "learning_rate": 0.0006190993110267451, + "loss": 0.82921886, + "num_input_tokens_seen": 190893888, + "router_z_loss_mlp": 0.36987305, + "step": 2291, + "time_per_iteration": 2.709512233734131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130933, + "balance_loss_mlp": 1.09311938, + "epoch": 0.4409388226240862, + "flos": 463229093376.0, + "grad_norm": 0.0787223425712205, + "language_loss": 0.84518313, + "learning_rate": 0.0006187967129069958, + "loss": 0.85649246, + "num_input_tokens_seen": 190956800, + "router_z_loss_mlp": 0.37792969, + "step": 2292, + "time_per_iteration": 2.4924299716949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124784, + "balance_loss_mlp": 1.08935523, + "epoch": 0.44113120430934977, + "flos": 566005252608.0, + "grad_norm": 0.07162475848736369, + "language_loss": 0.87490463, + "learning_rate": 0.0006184940686662722, + "loss": 0.88615251, + "num_input_tokens_seen": 191032048, + "router_z_loss_mlp": 0.35449219, + "step": 2293, + "time_per_iteration": 2.751055955886841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119268, + "balance_loss_mlp": 1.08445859, + "epoch": 0.4413235859946133, + "flos": 543585074688.0, + "grad_norm": 0.06340812224100711, + "language_loss": 0.9041853, + "learning_rate": 0.0006181913784220714, + "loss": 0.91537791, + "num_input_tokens_seen": 191099952, + "router_z_loss_mlp": 0.34838867, + "step": 2294, + "time_per_iteration": 2.64821457862854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037423, + "balance_loss_mlp": 1.0290786, + "epoch": 0.4415159676798769, + "flos": 1569871342080.0, + "grad_norm": 0.025861242717412188, + "language_loss": 0.80553782, + "learning_rate": 0.0006178886422919078, + "loss": 0.81591213, + "num_input_tokens_seen": 191335968, + "router_z_loss_mlp": 0.08349609, + "step": 2295, + "time_per_iteration": 4.885660171508789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119684, + "balance_loss_mlp": 1.08537531, + "epoch": 0.4417083493651404, + "flos": 658740128256.0, + "grad_norm": 0.10155164806079009, + "language_loss": 0.80041152, + "learning_rate": 0.0006175858603933146, + "loss": 0.81160837, + "num_input_tokens_seen": 191410112, + "router_z_loss_mlp": 0.34326172, + "step": 2296, + "time_per_iteration": 2.881615400314331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129433, + "balance_loss_mlp": 1.09393275, + "epoch": 0.441900731050404, + "flos": 740457635328.0, + "grad_norm": 0.0685445546464461, + "language_loss": 0.81208229, + "learning_rate": 0.0006172830328438416, + "loss": 0.82337666, + "num_input_tokens_seen": 191491552, + "router_z_loss_mlp": 0.35498047, + "step": 2297, + "time_per_iteration": 2.940401315689087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123665, + "balance_loss_mlp": 1.08680558, + "epoch": 0.44209311273566754, + "flos": 539441399808.0, + "grad_norm": 0.09103818832157724, + "language_loss": 0.87286425, + "learning_rate": 0.0006169801597610572, + "loss": 0.88410091, + "num_input_tokens_seen": 191567872, + "router_z_loss_mlp": 0.36889648, + "step": 2298, + "time_per_iteration": 2.7739408016204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115616, + "balance_loss_mlp": 1.08195138, + "epoch": 0.4422854944209311, + "flos": 621613730304.0, + "grad_norm": 0.1052787532551667, + "language_loss": 0.9040001, + "learning_rate": 0.0006166772412625469, + "loss": 0.91515625, + "num_input_tokens_seen": 191638032, + "router_z_loss_mlp": 0.33666992, + "step": 2299, + "time_per_iteration": 2.734384298324585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112173, + "balance_loss_mlp": 1.07710147, + "epoch": 0.4424778761061947, + "flos": 658824192000.0, + "grad_norm": 0.07592361192988976, + "language_loss": 0.81779516, + "learning_rate": 0.0006163742774659141, + "loss": 0.82891691, + "num_input_tokens_seen": 191709104, + "router_z_loss_mlp": 0.35107422, + "step": 2300, + "time_per_iteration": 2.8436357975006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107602, + "balance_loss_mlp": 1.07410455, + "epoch": 0.44267025779145824, + "flos": 568577428992.0, + "grad_norm": 0.0790889900730028, + "language_loss": 0.86033177, + "learning_rate": 0.0006160712684887801, + "loss": 0.87140775, + "num_input_tokens_seen": 191787072, + "router_z_loss_mlp": 0.33496094, + "step": 2301, + "time_per_iteration": 2.816479206085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104306, + "balance_loss_mlp": 1.07118952, + "epoch": 0.44286263947672183, + "flos": 496738220544.0, + "grad_norm": 0.0554513610730849, + "language_loss": 0.82599401, + "learning_rate": 0.0006157682144487832, + "loss": 0.83703709, + "num_input_tokens_seen": 191863040, + "router_z_loss_mlp": 0.33129883, + "step": 2302, + "time_per_iteration": 2.7371127605438232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112573, + "balance_loss_mlp": 1.07776368, + "epoch": 0.44305502116198536, + "flos": 609397820928.0, + "grad_norm": 0.08617173815320239, + "language_loss": 0.83484352, + "learning_rate": 0.0006154651154635793, + "loss": 0.84596926, + "num_input_tokens_seen": 191940352, + "router_z_loss_mlp": 0.34838867, + "step": 2303, + "time_per_iteration": 2.822388172149658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122213, + "balance_loss_mlp": 1.08664048, + "epoch": 0.44324740284724895, + "flos": 470794747392.0, + "grad_norm": 0.06891313471916412, + "language_loss": 0.85087454, + "learning_rate": 0.0006151619716508421, + "loss": 0.86209667, + "num_input_tokens_seen": 192006896, + "router_z_loss_mlp": 0.35571289, + "step": 2304, + "time_per_iteration": 2.5669307708740234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113601, + "balance_loss_mlp": 1.07905424, + "epoch": 0.4434397845325125, + "flos": 578725171200.0, + "grad_norm": 0.0676174746334525, + "language_loss": 0.87354678, + "learning_rate": 0.0006148587831282625, + "loss": 0.88468277, + "num_input_tokens_seen": 192075312, + "router_z_loss_mlp": 0.34545898, + "step": 2305, + "time_per_iteration": 2.7296478748321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042108, + "balance_loss_mlp": 1.03257155, + "epoch": 0.44363216621777607, + "flos": 1496608939008.0, + "grad_norm": 0.03035679683037383, + "language_loss": 0.79176068, + "learning_rate": 0.0006145555500135483, + "loss": 0.80218178, + "num_input_tokens_seen": 192304816, + "router_z_loss_mlp": 0.09521484, + "step": 2306, + "time_per_iteration": 4.932115077972412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132739, + "balance_loss_mlp": 1.09490204, + "epoch": 0.44382454790303966, + "flos": 477322647552.0, + "grad_norm": 0.0708853860960667, + "language_loss": 0.87972111, + "learning_rate": 0.0006142522724244255, + "loss": 0.89104849, + "num_input_tokens_seen": 192369232, + "router_z_loss_mlp": 0.37817383, + "step": 2307, + "time_per_iteration": 2.5106770992279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027391, + "balance_loss_mlp": 1.01785433, + "epoch": 0.4440169295883032, + "flos": 1544115820032.0, + "grad_norm": 0.02287011405410123, + "language_loss": 0.76484716, + "learning_rate": 0.0006139489504786368, + "loss": 0.77512109, + "num_input_tokens_seen": 192600176, + "router_z_loss_mlp": 0.09521484, + "step": 2308, + "time_per_iteration": 4.842617034912109 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120011, + "balance_loss_mlp": 1.08405757, + "epoch": 0.4442093112735668, + "flos": 591089011200.0, + "grad_norm": 0.07624843376245131, + "language_loss": 0.77539825, + "learning_rate": 0.000613645584293942, + "loss": 0.78659838, + "num_input_tokens_seen": 192675424, + "router_z_loss_mlp": 0.35986328, + "step": 2309, + "time_per_iteration": 2.8661446571350098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125005, + "balance_loss_mlp": 1.08933806, + "epoch": 0.4444016929588303, + "flos": 530272313856.0, + "grad_norm": 0.0700550632478262, + "language_loss": 0.83505249, + "learning_rate": 0.0006133421739881185, + "loss": 0.84630251, + "num_input_tokens_seen": 192747552, + "router_z_loss_mlp": 0.35693359, + "step": 2310, + "time_per_iteration": 2.6644127368927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118668, + "balance_loss_mlp": 1.08319092, + "epoch": 0.4445940746440939, + "flos": 620234952192.0, + "grad_norm": 0.11928760190169391, + "language_loss": 0.83116257, + "learning_rate": 0.0006130387196789605, + "loss": 0.84234929, + "num_input_tokens_seen": 192819984, + "router_z_loss_mlp": 0.35522461, + "step": 2311, + "time_per_iteration": 2.7157018184661865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111828, + "balance_loss_mlp": 1.07699549, + "epoch": 0.4447864563293574, + "flos": 629100089856.0, + "grad_norm": 0.05741887786628051, + "language_loss": 0.84819949, + "learning_rate": 0.0006127352214842795, + "loss": 0.85931778, + "num_input_tokens_seen": 192906080, + "router_z_loss_mlp": 0.34838867, + "step": 2312, + "time_per_iteration": 2.9459052085876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118701, + "balance_loss_mlp": 1.08293796, + "epoch": 0.444978838014621, + "flos": 650838592512.0, + "grad_norm": 0.07350143541661519, + "language_loss": 0.85691726, + "learning_rate": 0.0006124316795219041, + "loss": 0.86810434, + "num_input_tokens_seen": 192972336, + "router_z_loss_mlp": 0.35742188, + "step": 2313, + "time_per_iteration": 2.772299289703369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131037, + "balance_loss_mlp": 1.0956552, + "epoch": 0.44517121969988455, + "flos": 612439501824.0, + "grad_norm": 0.06263706285199609, + "language_loss": 0.82505524, + "learning_rate": 0.0006121280939096794, + "loss": 0.83636558, + "num_input_tokens_seen": 193045744, + "router_z_loss_mlp": 0.35424805, + "step": 2314, + "time_per_iteration": 2.7951674461364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114668, + "balance_loss_mlp": 1.11020195, + "epoch": 0.44536360138514813, + "flos": 488735368704.0, + "grad_norm": 0.0720052818844606, + "language_loss": 0.88360798, + "learning_rate": 0.000611824464765468, + "loss": 0.89507478, + "num_input_tokens_seen": 193115248, + "router_z_loss_mlp": 0.36499023, + "step": 2315, + "time_per_iteration": 2.5895602703094482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067507, + "balance_loss_mlp": 1.05682635, + "epoch": 0.4455559830704117, + "flos": 1516148969472.0, + "grad_norm": 0.0344692196546668, + "language_loss": 0.78594941, + "learning_rate": 0.0006115207922071492, + "loss": 0.79662448, + "num_input_tokens_seen": 193330816, + "router_z_loss_mlp": 0.10693359, + "step": 2316, + "time_per_iteration": 4.6560447216033936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137929, + "balance_loss_mlp": 1.1022377, + "epoch": 0.44574836475567525, + "flos": 615614432256.0, + "grad_norm": 0.06826351361083724, + "language_loss": 0.85665047, + "learning_rate": 0.000611217076352619, + "loss": 0.86802971, + "num_input_tokens_seen": 193407616, + "router_z_loss_mlp": 0.35693359, + "step": 2317, + "time_per_iteration": 2.7965078353881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132041, + "balance_loss_mlp": 1.09835279, + "epoch": 0.44594074644093884, + "flos": 506342306304.0, + "grad_norm": 0.06652231411845559, + "language_loss": 0.83542907, + "learning_rate": 0.0006109133173197905, + "loss": 0.84674948, + "num_input_tokens_seen": 193482624, + "router_z_loss_mlp": 0.33691406, + "step": 2318, + "time_per_iteration": 2.678832769393921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124047, + "balance_loss_mlp": 1.08897519, + "epoch": 0.44613312812620237, + "flos": 726979318272.0, + "grad_norm": 0.06811942389724822, + "language_loss": 0.85992062, + "learning_rate": 0.0006106095152265935, + "loss": 0.8711611, + "num_input_tokens_seen": 193555952, + "router_z_loss_mlp": 0.35107422, + "step": 2319, + "time_per_iteration": 2.9018518924713135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111639, + "balance_loss_mlp": 1.08060324, + "epoch": 0.44632550981146596, + "flos": 635746558464.0, + "grad_norm": 0.06308491230142964, + "language_loss": 0.85126555, + "learning_rate": 0.0006103056701909739, + "loss": 0.8624295, + "num_input_tokens_seen": 193636672, + "router_z_loss_mlp": 0.3581543, + "step": 2320, + "time_per_iteration": 2.927619218826294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111434, + "balance_loss_mlp": 1.07869673, + "epoch": 0.4465178914967295, + "flos": 827074644480.0, + "grad_norm": 0.08034132862269446, + "language_loss": 0.83192152, + "learning_rate": 0.0006100017823308956, + "loss": 0.8430649, + "num_input_tokens_seen": 193721728, + "router_z_loss_mlp": 0.35644531, + "step": 2321, + "time_per_iteration": 3.1759355068206787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111575, + "balance_loss_mlp": 1.07645655, + "epoch": 0.4467102731819931, + "flos": 665831508480.0, + "grad_norm": 0.0688182521177716, + "language_loss": 0.79684091, + "learning_rate": 0.0006096978517643377, + "loss": 0.8079567, + "num_input_tokens_seen": 193795456, + "router_z_loss_mlp": 0.35131836, + "step": 2322, + "time_per_iteration": 2.791020154953003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109472, + "balance_loss_mlp": 1.07337499, + "epoch": 0.4469026548672566, + "flos": 512946929664.0, + "grad_norm": 0.08831218810897808, + "language_loss": 0.83671057, + "learning_rate": 0.0006093938786092968, + "loss": 0.84780538, + "num_input_tokens_seen": 193865520, + "router_z_loss_mlp": 0.36108398, + "step": 2323, + "time_per_iteration": 2.614248037338257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107948, + "balance_loss_mlp": 1.0734967, + "epoch": 0.4470950365525202, + "flos": 684076078080.0, + "grad_norm": 0.06554008035854059, + "language_loss": 0.90401232, + "learning_rate": 0.0006090898629837857, + "loss": 0.91509175, + "num_input_tokens_seen": 193935040, + "router_z_loss_mlp": 0.34448242, + "step": 2324, + "time_per_iteration": 2.7988476753234863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114598, + "balance_loss_mlp": 1.07950234, + "epoch": 0.4472874182377838, + "flos": 627321189888.0, + "grad_norm": 0.05596676685861875, + "language_loss": 0.87779921, + "learning_rate": 0.0006087858050058337, + "loss": 0.88894522, + "num_input_tokens_seen": 194009120, + "router_z_loss_mlp": 0.35083008, + "step": 2325, + "time_per_iteration": 2.8598742485046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106952, + "balance_loss_mlp": 1.07309675, + "epoch": 0.4474797999230473, + "flos": 547204916736.0, + "grad_norm": 0.08404177014968839, + "language_loss": 0.82489681, + "learning_rate": 0.0006084817047934866, + "loss": 0.83596623, + "num_input_tokens_seen": 194076672, + "router_z_loss_mlp": 0.33886719, + "step": 2326, + "time_per_iteration": 2.6458888053894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112181, + "balance_loss_mlp": 1.07780075, + "epoch": 0.4476721816083109, + "flos": 455819083776.0, + "grad_norm": 0.07155239810176077, + "language_loss": 0.89966661, + "learning_rate": 0.0006081775624648066, + "loss": 0.91078842, + "num_input_tokens_seen": 194142320, + "router_z_loss_mlp": 0.34399414, + "step": 2327, + "time_per_iteration": 2.580366373062134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120962, + "balance_loss_mlp": 1.08689189, + "epoch": 0.44786456329357444, + "flos": 481518079488.0, + "grad_norm": 0.06301539333332261, + "language_loss": 0.83119273, + "learning_rate": 0.0006078733781378721, + "loss": 0.8424024, + "num_input_tokens_seen": 194208560, + "router_z_loss_mlp": 0.34082031, + "step": 2328, + "time_per_iteration": 2.54127836227417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110302, + "balance_loss_mlp": 1.07594562, + "epoch": 0.448056944978838, + "flos": 552104418816.0, + "grad_norm": 0.057204005558127505, + "language_loss": 0.82213807, + "learning_rate": 0.0006075691519307781, + "loss": 0.83324105, + "num_input_tokens_seen": 194288080, + "router_z_loss_mlp": 0.34375, + "step": 2329, + "time_per_iteration": 2.8602964878082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117341, + "balance_loss_mlp": 1.08193612, + "epoch": 0.44824932666410156, + "flos": 550839439872.0, + "grad_norm": 0.055534005363494426, + "language_loss": 0.81606597, + "learning_rate": 0.0006072648839616356, + "loss": 0.82723939, + "num_input_tokens_seen": 194358464, + "router_z_loss_mlp": 0.35400391, + "step": 2330, + "time_per_iteration": 2.662810802459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119722, + "balance_loss_mlp": 1.08565211, + "epoch": 0.44844170834936514, + "flos": 988582454784.0, + "grad_norm": 0.050779766652796585, + "language_loss": 0.82901573, + "learning_rate": 0.0006069605743485718, + "loss": 0.84021294, + "num_input_tokens_seen": 194456112, + "router_z_loss_mlp": 0.34057617, + "step": 2331, + "time_per_iteration": 3.3678483963012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128598, + "balance_loss_mlp": 1.0950762, + "epoch": 0.44863409003462873, + "flos": 591321378816.0, + "grad_norm": 0.04918059080846435, + "language_loss": 0.83280981, + "learning_rate": 0.0006066562232097303, + "loss": 0.84409571, + "num_input_tokens_seen": 194526880, + "router_z_loss_mlp": 0.33544922, + "step": 2332, + "time_per_iteration": 2.7449440956115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123187, + "balance_loss_mlp": 1.08785367, + "epoch": 0.44882647171989226, + "flos": 724646850048.0, + "grad_norm": 0.052836841401222294, + "language_loss": 0.86161315, + "learning_rate": 0.0006063518306632708, + "loss": 0.87284505, + "num_input_tokens_seen": 194606800, + "router_z_loss_mlp": 0.35375977, + "step": 2333, + "time_per_iteration": 2.9690473079681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127444, + "balance_loss_mlp": 1.09220576, + "epoch": 0.44901885340515585, + "flos": 534927338496.0, + "grad_norm": 0.06707958703687776, + "language_loss": 0.82286978, + "learning_rate": 0.0006060473968273688, + "loss": 0.83414423, + "num_input_tokens_seen": 194679856, + "router_z_loss_mlp": 0.35229492, + "step": 2334, + "time_per_iteration": 2.665539026260376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142277, + "balance_loss_mlp": 1.13331211, + "epoch": 0.4492112350904194, + "flos": 1555300942848.0, + "grad_norm": 0.036352477885187424, + "language_loss": 0.77879542, + "learning_rate": 0.000605742921820216, + "loss": 0.79021817, + "num_input_tokens_seen": 194906320, + "router_z_loss_mlp": 0.08984375, + "step": 2335, + "time_per_iteration": 4.888899326324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115384, + "balance_loss_mlp": 1.10641909, + "epoch": 0.44940361677568297, + "flos": 1523358171648.0, + "grad_norm": 0.027581232823365703, + "language_loss": 0.81005216, + "learning_rate": 0.0006054384057600202, + "loss": 0.82120597, + "num_input_tokens_seen": 195129152, + "router_z_loss_mlp": 0.08984375, + "step": 2336, + "time_per_iteration": 4.835580348968506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126219, + "balance_loss_mlp": 1.09155297, + "epoch": 0.4495959984609465, + "flos": 382495011840.0, + "grad_norm": 0.06484007747623576, + "language_loss": 0.88115394, + "learning_rate": 0.0006051338487650047, + "loss": 0.89241612, + "num_input_tokens_seen": 195189792, + "router_z_loss_mlp": 0.34667969, + "step": 2337, + "time_per_iteration": 2.4327657222747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125299, + "balance_loss_mlp": 1.08846319, + "epoch": 0.4497883801462101, + "flos": 497879861760.0, + "grad_norm": 0.06762371666749806, + "language_loss": 0.82857472, + "learning_rate": 0.0006048292509534095, + "loss": 0.83982766, + "num_input_tokens_seen": 195258640, + "router_z_loss_mlp": 0.3684082, + "step": 2338, + "time_per_iteration": 2.583315372467041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122898, + "balance_loss_mlp": 1.08851767, + "epoch": 0.4499807618314736, + "flos": 614450769408.0, + "grad_norm": 0.06288042140328122, + "language_loss": 0.78114402, + "learning_rate": 0.0006045246124434895, + "loss": 0.792373, + "num_input_tokens_seen": 195327984, + "router_z_loss_mlp": 0.34350586, + "step": 2339, + "time_per_iteration": 2.718944787979126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111671, + "balance_loss_mlp": 1.08223438, + "epoch": 0.4501731435167372, + "flos": 1005510288384.0, + "grad_norm": 0.06455240115792976, + "language_loss": 0.86995041, + "learning_rate": 0.0006042199333535162, + "loss": 0.88111752, + "num_input_tokens_seen": 195409504, + "router_z_loss_mlp": 0.3449707, + "step": 2340, + "time_per_iteration": 3.280731439590454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120556, + "balance_loss_mlp": 1.08803582, + "epoch": 0.4503655252020008, + "flos": 820880428032.0, + "grad_norm": 0.06119421780994794, + "language_loss": 0.83960807, + "learning_rate": 0.0006039152138017763, + "loss": 0.85081363, + "num_input_tokens_seen": 195489424, + "router_z_loss_mlp": 0.32519531, + "step": 2341, + "time_per_iteration": 3.042808771133423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116997, + "balance_loss_mlp": 1.08285511, + "epoch": 0.4505579068872643, + "flos": 486373165056.0, + "grad_norm": 0.06181422787629511, + "language_loss": 0.83835328, + "learning_rate": 0.0006036104539065726, + "loss": 0.84952325, + "num_input_tokens_seen": 195562128, + "router_z_loss_mlp": 0.34155273, + "step": 2342, + "time_per_iteration": 2.671872138977051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117893, + "balance_loss_mlp": 1.08208227, + "epoch": 0.4507502885725279, + "flos": 884803046400.0, + "grad_norm": 0.05413998463628708, + "language_loss": 0.84596831, + "learning_rate": 0.000603305653786223, + "loss": 0.85714728, + "num_input_tokens_seen": 195646800, + "router_z_loss_mlp": 0.3581543, + "step": 2343, + "time_per_iteration": 3.153627395629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116847, + "balance_loss_mlp": 1.08182287, + "epoch": 0.45094267025779144, + "flos": 578339730432.0, + "grad_norm": 0.06019885466307642, + "language_loss": 0.84242773, + "learning_rate": 0.0006030008135590622, + "loss": 0.85359621, + "num_input_tokens_seen": 195719648, + "router_z_loss_mlp": 0.35058594, + "step": 2344, + "time_per_iteration": 2.724281072616577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109974, + "balance_loss_mlp": 1.07564187, + "epoch": 0.45113505194305503, + "flos": 525387492864.0, + "grad_norm": 0.06173385406680834, + "language_loss": 0.80783409, + "learning_rate": 0.0006026959333434387, + "loss": 0.81893378, + "num_input_tokens_seen": 195794800, + "router_z_loss_mlp": 0.34350586, + "step": 2345, + "time_per_iteration": 2.7752277851104736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107914, + "balance_loss_mlp": 1.07336736, + "epoch": 0.45132743362831856, + "flos": 502055470080.0, + "grad_norm": 0.04677974400708639, + "language_loss": 0.77811158, + "learning_rate": 0.0006023910132577181, + "loss": 0.78919077, + "num_input_tokens_seen": 195866848, + "router_z_loss_mlp": 0.34545898, + "step": 2346, + "time_per_iteration": 2.663447141647339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101638, + "balance_loss_mlp": 1.06802082, + "epoch": 0.45151981531358215, + "flos": 431918811648.0, + "grad_norm": 0.060558646022808645, + "language_loss": 0.85310882, + "learning_rate": 0.0006020860534202806, + "loss": 0.86412525, + "num_input_tokens_seen": 195930640, + "router_z_loss_mlp": 0.33618164, + "step": 2347, + "time_per_iteration": 2.480811595916748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108815, + "balance_loss_mlp": 1.07388651, + "epoch": 0.4517121969988457, + "flos": 712159299072.0, + "grad_norm": 0.06606096221098971, + "language_loss": 0.81316173, + "learning_rate": 0.0006017810539495224, + "loss": 0.82424992, + "num_input_tokens_seen": 196014240, + "router_z_loss_mlp": 0.34960938, + "step": 2348, + "time_per_iteration": 2.9476070404052734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098582, + "balance_loss_mlp": 1.06415427, + "epoch": 0.45190457868410927, + "flos": 579468888576.0, + "grad_norm": 0.0571113923067653, + "language_loss": 0.82774842, + "learning_rate": 0.0006014760149638547, + "loss": 0.83873427, + "num_input_tokens_seen": 196083296, + "router_z_loss_mlp": 0.34423828, + "step": 2349, + "time_per_iteration": 2.6655263900756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103718, + "balance_loss_mlp": 1.07005334, + "epoch": 0.45209696036937286, + "flos": 482657149440.0, + "grad_norm": 0.06475243948679671, + "language_loss": 0.88831103, + "learning_rate": 0.000601170936581704, + "loss": 0.89934826, + "num_input_tokens_seen": 196147840, + "router_z_loss_mlp": 0.33666992, + "step": 2350, + "time_per_iteration": 2.5269417762756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097788, + "balance_loss_mlp": 1.06343222, + "epoch": 0.4522893420546364, + "flos": 540207512064.0, + "grad_norm": 0.06432650174878703, + "language_loss": 0.84562814, + "learning_rate": 0.0006008658189215121, + "loss": 0.85660601, + "num_input_tokens_seen": 196219008, + "router_z_loss_mlp": 0.34399414, + "step": 2351, + "time_per_iteration": 2.621596097946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110179, + "balance_loss_mlp": 1.07267594, + "epoch": 0.4524817237399, + "flos": 496676551680.0, + "grad_norm": 0.3016755485520666, + "language_loss": 0.8046757, + "learning_rate": 0.0006005606621017366, + "loss": 0.81577748, + "num_input_tokens_seen": 196287792, + "router_z_loss_mlp": 0.375, + "step": 2352, + "time_per_iteration": 2.561138153076172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111286, + "balance_loss_mlp": 1.07564211, + "epoch": 0.4526741054251635, + "flos": 652550681088.0, + "grad_norm": 0.055264843638134026, + "language_loss": 0.80770934, + "learning_rate": 0.0006002554662408496, + "loss": 0.81882215, + "num_input_tokens_seen": 196371776, + "router_z_loss_mlp": 0.35644531, + "step": 2353, + "time_per_iteration": 2.87947940826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118454, + "balance_loss_mlp": 1.08180928, + "epoch": 0.4528664871104271, + "flos": 570939632640.0, + "grad_norm": 0.06003231312298175, + "language_loss": 0.91710508, + "learning_rate": 0.0005999502314573388, + "loss": 0.92828965, + "num_input_tokens_seen": 196441840, + "router_z_loss_mlp": 0.36645508, + "step": 2354, + "time_per_iteration": 2.703589916229248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127767, + "balance_loss_mlp": 1.09119391, + "epoch": 0.45305886879569063, + "flos": 458719801344.0, + "grad_norm": 0.06522748471040672, + "language_loss": 0.86741221, + "learning_rate": 0.0005996449578697066, + "loss": 0.87868989, + "num_input_tokens_seen": 196510464, + "router_z_loss_mlp": 0.36547852, + "step": 2355, + "time_per_iteration": 2.6407227516174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114254, + "balance_loss_mlp": 1.10627651, + "epoch": 0.4532512504809542, + "flos": 505178643456.0, + "grad_norm": 0.05645244306136207, + "language_loss": 0.81587362, + "learning_rate": 0.0005993396455964709, + "loss": 0.827299, + "num_input_tokens_seen": 196583888, + "router_z_loss_mlp": 0.36279297, + "step": 2356, + "time_per_iteration": 2.7260916233062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159041, + "balance_loss_mlp": 1.12263405, + "epoch": 0.4534436321662178, + "flos": 582213961728.0, + "grad_norm": 0.0574643396084849, + "language_loss": 0.81904489, + "learning_rate": 0.0005990342947561647, + "loss": 0.83063525, + "num_input_tokens_seen": 196652816, + "router_z_loss_mlp": 0.36401367, + "step": 2357, + "time_per_iteration": 2.763461112976074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158122, + "balance_loss_mlp": 1.12109542, + "epoch": 0.45363601385148133, + "flos": 549720193536.0, + "grad_norm": 0.06627350558163068, + "language_loss": 0.78124607, + "learning_rate": 0.0005987289054673351, + "loss": 0.79282725, + "num_input_tokens_seen": 196720208, + "router_z_loss_mlp": 0.37011719, + "step": 2358, + "time_per_iteration": 2.7317159175872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172658, + "balance_loss_mlp": 1.16121387, + "epoch": 0.4538283955367449, + "flos": 1474559520768.0, + "grad_norm": 0.05600708096364228, + "language_loss": 0.76575738, + "learning_rate": 0.0005984234778485451, + "loss": 0.77748394, + "num_input_tokens_seen": 196947696, + "router_z_loss_mlp": 0.11425781, + "step": 2359, + "time_per_iteration": 4.815205335617065 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168714, + "balance_loss_mlp": 1.13257003, + "epoch": 0.45402077722200845, + "flos": 584711986176.0, + "grad_norm": 0.0832511333205401, + "language_loss": 0.91429126, + "learning_rate": 0.0005981180120183722, + "loss": 0.92597842, + "num_input_tokens_seen": 197015712, + "router_z_loss_mlp": 0.36206055, + "step": 2360, + "time_per_iteration": 2.675994873046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154784, + "balance_loss_mlp": 1.11825836, + "epoch": 0.45421315890727204, + "flos": 531747265536.0, + "grad_norm": 0.06456101952662723, + "language_loss": 0.85450256, + "learning_rate": 0.0005978125080954089, + "loss": 0.86605042, + "num_input_tokens_seen": 197094880, + "router_z_loss_mlp": 0.36523438, + "step": 2361, + "time_per_iteration": 2.844592332839966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134938, + "balance_loss_mlp": 1.0997715, + "epoch": 0.4544055405925356, + "flos": 785221641216.0, + "grad_norm": 0.06943573222196867, + "language_loss": 0.77225572, + "learning_rate": 0.000597506966198262, + "loss": 0.7836051, + "num_input_tokens_seen": 197176448, + "router_z_loss_mlp": 0.35180664, + "step": 2362, + "time_per_iteration": 2.990652322769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127189, + "balance_loss_mlp": 1.09216547, + "epoch": 0.45459792227779916, + "flos": 518199939072.0, + "grad_norm": 0.07387250459530678, + "language_loss": 0.84014916, + "learning_rate": 0.0005972013864455536, + "loss": 0.85142106, + "num_input_tokens_seen": 197243520, + "router_z_loss_mlp": 0.3503418, + "step": 2363, + "time_per_iteration": 2.589594841003418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124933, + "balance_loss_mlp": 1.09141088, + "epoch": 0.4547903039630627, + "flos": 537563755008.0, + "grad_norm": 0.06451639193106218, + "language_loss": 0.85711533, + "learning_rate": 0.0005968957689559203, + "loss": 0.86836469, + "num_input_tokens_seen": 197311536, + "router_z_loss_mlp": 0.33544922, + "step": 2364, + "time_per_iteration": 2.6682167053222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119758, + "balance_loss_mlp": 1.08585453, + "epoch": 0.4549826856483263, + "flos": 528676222464.0, + "grad_norm": 0.06239355206550831, + "language_loss": 0.89508158, + "learning_rate": 0.0005965901138480131, + "loss": 0.90627909, + "num_input_tokens_seen": 197382752, + "router_z_loss_mlp": 0.33911133, + "step": 2365, + "time_per_iteration": 2.6365487575531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125783, + "balance_loss_mlp": 1.08816087, + "epoch": 0.45517506733358987, + "flos": 520915276800.0, + "grad_norm": 0.07086256306792152, + "language_loss": 0.87331104, + "learning_rate": 0.0005962844212404982, + "loss": 0.88456881, + "num_input_tokens_seen": 197456592, + "router_z_loss_mlp": 0.37597656, + "step": 2366, + "time_per_iteration": 2.6617612838745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123043, + "balance_loss_mlp": 1.08763838, + "epoch": 0.4553674490188534, + "flos": 451052831232.0, + "grad_norm": 0.05743086206543283, + "language_loss": 0.87604624, + "learning_rate": 0.0005959786912520558, + "loss": 0.88727665, + "num_input_tokens_seen": 197525408, + "router_z_loss_mlp": 0.35400391, + "step": 2367, + "time_per_iteration": 2.5842456817626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112429, + "balance_loss_mlp": 1.08878994, + "epoch": 0.455559830704117, + "flos": 546594448896.0, + "grad_norm": 0.05541530908978363, + "language_loss": 0.84261698, + "learning_rate": 0.0005956729240013806, + "loss": 0.8538599, + "num_input_tokens_seen": 197608480, + "router_z_loss_mlp": 0.35522461, + "step": 2368, + "time_per_iteration": 2.8338305950164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131752, + "balance_loss_mlp": 1.09880257, + "epoch": 0.4557522123893805, + "flos": 583765636608.0, + "grad_norm": 0.06117437276065272, + "language_loss": 0.91673207, + "learning_rate": 0.0005953671196071824, + "loss": 0.92804956, + "num_input_tokens_seen": 197678416, + "router_z_loss_mlp": 0.32958984, + "step": 2369, + "time_per_iteration": 2.6954920291900635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140492, + "balance_loss_mlp": 1.10089099, + "epoch": 0.4559445940746441, + "flos": 526415334912.0, + "grad_norm": 0.05874804832244865, + "language_loss": 0.80540514, + "learning_rate": 0.0005950612781881846, + "loss": 0.81681007, + "num_input_tokens_seen": 197753424, + "router_z_loss_mlp": 0.39575195, + "step": 2370, + "time_per_iteration": 2.695518732070923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133052, + "balance_loss_mlp": 1.09526241, + "epoch": 0.45613697575990764, + "flos": 652120823808.0, + "grad_norm": 0.054922750315337415, + "language_loss": 0.76194978, + "learning_rate": 0.0005947553998631259, + "loss": 0.77328038, + "num_input_tokens_seen": 197832080, + "router_z_loss_mlp": 0.37792969, + "step": 2371, + "time_per_iteration": 2.854757070541382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133988, + "balance_loss_mlp": 1.09777188, + "epoch": 0.4563293574451712, + "flos": 867119385600.0, + "grad_norm": 0.04850294692755014, + "language_loss": 0.79227567, + "learning_rate": 0.000594449484750758, + "loss": 0.80361551, + "num_input_tokens_seen": 197919536, + "router_z_loss_mlp": 0.36206055, + "step": 2372, + "time_per_iteration": 3.2277348041534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128775, + "balance_loss_mlp": 1.09263051, + "epoch": 0.45652173913043476, + "flos": 498079922688.0, + "grad_norm": 0.06286219474212958, + "language_loss": 0.83387208, + "learning_rate": 0.0005941435329698484, + "loss": 0.84515989, + "num_input_tokens_seen": 197991872, + "router_z_loss_mlp": 0.36132812, + "step": 2373, + "time_per_iteration": 2.676492929458618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126424, + "balance_loss_mlp": 1.09025562, + "epoch": 0.45671412081569834, + "flos": 560856130560.0, + "grad_norm": 0.05768590484176838, + "language_loss": 0.83615124, + "learning_rate": 0.0005938375446391778, + "loss": 0.84741557, + "num_input_tokens_seen": 198063392, + "router_z_loss_mlp": 0.36181641, + "step": 2374, + "time_per_iteration": 2.7465567588806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137671, + "balance_loss_mlp": 1.09969115, + "epoch": 0.45690650250096193, + "flos": 503122959360.0, + "grad_norm": 0.05745321957635053, + "language_loss": 0.89048398, + "learning_rate": 0.0005935315198775415, + "loss": 0.90186071, + "num_input_tokens_seen": 198131232, + "router_z_loss_mlp": 0.38012695, + "step": 2375, + "time_per_iteration": 2.6580095291137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128206, + "balance_loss_mlp": 1.09320593, + "epoch": 0.45709888418622546, + "flos": 430698249216.0, + "grad_norm": 0.06107240600749233, + "language_loss": 0.87175268, + "learning_rate": 0.0005932254588037486, + "loss": 0.88303471, + "num_input_tokens_seen": 198194944, + "router_z_loss_mlp": 0.35009766, + "step": 2376, + "time_per_iteration": 2.488588333129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121963, + "balance_loss_mlp": 1.08600903, + "epoch": 0.45729126587148905, + "flos": 525654365184.0, + "grad_norm": 0.05478508122440065, + "language_loss": 0.86331463, + "learning_rate": 0.000592919361536623, + "loss": 0.87453431, + "num_input_tokens_seen": 198265728, + "router_z_loss_mlp": 0.35961914, + "step": 2377, + "time_per_iteration": 2.644374132156372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127537, + "balance_loss_mlp": 1.09196472, + "epoch": 0.4574836475567526, + "flos": 638002676736.0, + "grad_norm": 0.05713052679154174, + "language_loss": 0.89246452, + "learning_rate": 0.0005926132281950017, + "loss": 0.90373993, + "num_input_tokens_seen": 198336640, + "router_z_loss_mlp": 0.35571289, + "step": 2378, + "time_per_iteration": 2.7563676834106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121403, + "balance_loss_mlp": 1.08406663, + "epoch": 0.45767602924201617, + "flos": 649588294656.0, + "grad_norm": 0.05503863795363348, + "language_loss": 0.85310149, + "learning_rate": 0.0005923070588977367, + "loss": 0.86431557, + "num_input_tokens_seen": 198413552, + "router_z_loss_mlp": 0.37329102, + "step": 2379, + "time_per_iteration": 2.8923282623291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123743, + "balance_loss_mlp": 1.087098, + "epoch": 0.4578684109272797, + "flos": 746676817920.0, + "grad_norm": 0.05441682742417314, + "language_loss": 0.86308765, + "learning_rate": 0.0005920008537636931, + "loss": 0.8743251, + "num_input_tokens_seen": 198490864, + "router_z_loss_mlp": 0.3659668, + "step": 2380, + "time_per_iteration": 2.8928191661834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121741, + "balance_loss_mlp": 1.0852387, + "epoch": 0.4580607926125433, + "flos": 641469072384.0, + "grad_norm": 0.0522540937039379, + "language_loss": 0.86756825, + "learning_rate": 0.0005916946129117504, + "loss": 0.87878567, + "num_input_tokens_seen": 198571200, + "router_z_loss_mlp": 0.36523438, + "step": 2381, + "time_per_iteration": 2.9031155109405518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129507, + "balance_loss_mlp": 1.09281409, + "epoch": 0.4582531742978069, + "flos": 801857636352.0, + "grad_norm": 0.055637229661903514, + "language_loss": 0.80852348, + "learning_rate": 0.0005913883364608017, + "loss": 0.8198185, + "num_input_tokens_seen": 198658624, + "router_z_loss_mlp": 0.36694336, + "step": 2382, + "time_per_iteration": 3.0779874324798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123213, + "balance_loss_mlp": 1.088094, + "epoch": 0.4584455559830704, + "flos": 684295962624.0, + "grad_norm": 0.05906328885450196, + "language_loss": 0.88737094, + "learning_rate": 0.0005910820245297542, + "loss": 0.89860308, + "num_input_tokens_seen": 198731312, + "router_z_loss_mlp": 0.35131836, + "step": 2383, + "time_per_iteration": 2.889805555343628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119246, + "balance_loss_mlp": 1.0824585, + "epoch": 0.458637937668334, + "flos": 518177544192.0, + "grad_norm": 0.06990697064707628, + "language_loss": 0.80825961, + "learning_rate": 0.000590775677237529, + "loss": 0.81945217, + "num_input_tokens_seen": 198805296, + "router_z_loss_mlp": 0.36791992, + "step": 2384, + "time_per_iteration": 2.7286477088928223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011127, + "balance_loss_mlp": 1.07562566, + "epoch": 0.4588303193535975, + "flos": 505499844096.0, + "grad_norm": 0.06044507930671915, + "language_loss": 0.80186594, + "learning_rate": 0.0005904692947030601, + "loss": 0.81299293, + "num_input_tokens_seen": 198872112, + "router_z_loss_mlp": 0.37084961, + "step": 2385, + "time_per_iteration": 2.6249661445617676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112584, + "balance_loss_mlp": 1.07446146, + "epoch": 0.4590227010388611, + "flos": 495905670144.0, + "grad_norm": 0.06266023003425206, + "language_loss": 0.89858609, + "learning_rate": 0.0005901628770452963, + "loss": 0.90971196, + "num_input_tokens_seen": 198938480, + "router_z_loss_mlp": 0.38110352, + "step": 2386, + "time_per_iteration": 2.628052234649658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106229, + "balance_loss_mlp": 1.06925035, + "epoch": 0.45921508272412465, + "flos": 493620189696.0, + "grad_norm": 0.05741151930163357, + "language_loss": 0.87304425, + "learning_rate": 0.000589856424383199, + "loss": 0.88410658, + "num_input_tokens_seen": 199008608, + "router_z_loss_mlp": 0.36987305, + "step": 2387, + "time_per_iteration": 2.6852517127990723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116091, + "balance_loss_mlp": 1.07863569, + "epoch": 0.45940746440938823, + "flos": 691394683392.0, + "grad_norm": 0.06606538590283985, + "language_loss": 0.83553612, + "learning_rate": 0.000589549936835744, + "loss": 0.84669703, + "num_input_tokens_seen": 199084592, + "router_z_loss_mlp": 0.37451172, + "step": 2388, + "time_per_iteration": 2.8861043453216553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106236, + "balance_loss_mlp": 1.07135534, + "epoch": 0.45959984609465176, + "flos": 503738196480.0, + "grad_norm": 0.06160096974470471, + "language_loss": 0.79523546, + "learning_rate": 0.0005892434145219202, + "loss": 0.80629778, + "num_input_tokens_seen": 199151504, + "router_z_loss_mlp": 0.34912109, + "step": 2389, + "time_per_iteration": 2.6016130447387695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097243, + "balance_loss_mlp": 1.06219506, + "epoch": 0.45979222777991535, + "flos": 676638904320.0, + "grad_norm": 0.07218042116864783, + "language_loss": 0.82768381, + "learning_rate": 0.0005889368575607303, + "loss": 0.83865625, + "num_input_tokens_seen": 199224528, + "router_z_loss_mlp": 0.35058594, + "step": 2390, + "time_per_iteration": 2.806382894515991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102796, + "balance_loss_mlp": 1.06791568, + "epoch": 0.45998460946517894, + "flos": 777653415936.0, + "grad_norm": 0.06321076421250729, + "language_loss": 0.78347147, + "learning_rate": 0.00058863026607119, + "loss": 0.7944994, + "num_input_tokens_seen": 199312512, + "router_z_loss_mlp": 0.34912109, + "step": 2391, + "time_per_iteration": 3.0679373741149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102101, + "balance_loss_mlp": 1.06800711, + "epoch": 0.46017699115044247, + "flos": 851461673472.0, + "grad_norm": 0.07981135891264553, + "language_loss": 0.80153728, + "learning_rate": 0.0005883236401723287, + "loss": 0.81255829, + "num_input_tokens_seen": 199397216, + "router_z_loss_mlp": 0.34130859, + "step": 2392, + "time_per_iteration": 3.178016185760498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102518, + "balance_loss_mlp": 1.06830466, + "epoch": 0.46036937283570606, + "flos": 575878781952.0, + "grad_norm": 0.05809686694512272, + "language_loss": 0.8436439, + "learning_rate": 0.0005880169799831893, + "loss": 0.85466909, + "num_input_tokens_seen": 199464288, + "router_z_loss_mlp": 0.34204102, + "step": 2393, + "time_per_iteration": 2.7394168376922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099974, + "balance_loss_mlp": 1.06537914, + "epoch": 0.4605617545209696, + "flos": 611866109952.0, + "grad_norm": 0.05496993027151255, + "language_loss": 0.81652063, + "learning_rate": 0.0005877102856228278, + "loss": 0.82752037, + "num_input_tokens_seen": 199538096, + "router_z_loss_mlp": 0.34594727, + "step": 2394, + "time_per_iteration": 2.857044219970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107902, + "balance_loss_mlp": 1.07225823, + "epoch": 0.4607541362062332, + "flos": 533138526720.0, + "grad_norm": 0.0685378240754912, + "language_loss": 0.84987622, + "learning_rate": 0.0005874035572103133, + "loss": 0.86095524, + "num_input_tokens_seen": 199609504, + "router_z_loss_mlp": 0.35644531, + "step": 2395, + "time_per_iteration": 2.6805660724639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102808, + "balance_loss_mlp": 1.06699777, + "epoch": 0.4609465178914967, + "flos": 647312726016.0, + "grad_norm": 0.07818612590839771, + "language_loss": 0.82504952, + "learning_rate": 0.0005870967948647288, + "loss": 0.83607757, + "num_input_tokens_seen": 199678960, + "router_z_loss_mlp": 0.35839844, + "step": 2396, + "time_per_iteration": 2.7740094661712646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114998, + "balance_loss_mlp": 1.13801181, + "epoch": 0.4611388995767603, + "flos": 1466287225344.0, + "grad_norm": 0.06620078890509219, + "language_loss": 0.743083, + "learning_rate": 0.0005867899987051693, + "loss": 0.75458288, + "num_input_tokens_seen": 199903568, + "router_z_loss_mlp": 0.11962891, + "step": 2397, + "time_per_iteration": 5.407956838607788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106991, + "balance_loss_mlp": 1.07158542, + "epoch": 0.46133128126202383, + "flos": 723112427520.0, + "grad_norm": 0.05578291602549768, + "language_loss": 0.85959148, + "learning_rate": 0.0005864831688507443, + "loss": 0.87066138, + "num_input_tokens_seen": 199988672, + "router_z_loss_mlp": 0.35424805, + "step": 2398, + "time_per_iteration": 3.000498056411743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108167, + "balance_loss_mlp": 1.07342887, + "epoch": 0.4615236629472874, + "flos": 548010302976.0, + "grad_norm": 0.0567470157783756, + "language_loss": 0.7555595, + "learning_rate": 0.0005861763054205754, + "loss": 0.7666412, + "num_input_tokens_seen": 200062304, + "router_z_loss_mlp": 0.34765625, + "step": 2399, + "time_per_iteration": 2.7206692695617676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108701, + "balance_loss_mlp": 1.07303381, + "epoch": 0.461716044632551, + "flos": 602244771840.0, + "grad_norm": 0.054446102099669776, + "language_loss": 0.80056608, + "learning_rate": 0.0005858694085337976, + "loss": 0.81165302, + "num_input_tokens_seen": 200138464, + "router_z_loss_mlp": 0.35668945, + "step": 2400, + "time_per_iteration": 2.8272197246551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107355, + "balance_loss_mlp": 1.07090116, + "epoch": 0.46190842631781454, + "flos": 474476258304.0, + "grad_norm": 0.06783884534527172, + "language_loss": 0.83774948, + "learning_rate": 0.0005855624783095589, + "loss": 0.84882307, + "num_input_tokens_seen": 200205728, + "router_z_loss_mlp": 0.36425781, + "step": 2401, + "time_per_iteration": 2.6019625663757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102889, + "balance_loss_mlp": 1.06812799, + "epoch": 0.4621008080030781, + "flos": 437483109888.0, + "grad_norm": 0.05559222161472476, + "language_loss": 0.8541491, + "learning_rate": 0.00058525551486702, + "loss": 0.86517805, + "num_input_tokens_seen": 200269824, + "router_z_loss_mlp": 0.34790039, + "step": 2402, + "time_per_iteration": 2.5166754722595215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106757, + "balance_loss_mlp": 1.07058895, + "epoch": 0.46229318968834165, + "flos": 525461644800.0, + "grad_norm": 0.07030933336499708, + "language_loss": 0.80856764, + "learning_rate": 0.0005849485183253548, + "loss": 0.81963521, + "num_input_tokens_seen": 200341264, + "router_z_loss_mlp": 0.36206055, + "step": 2403, + "time_per_iteration": 2.6906049251556396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110586, + "balance_loss_mlp": 1.07090759, + "epoch": 0.46248557137360524, + "flos": 439622857728.0, + "grad_norm": 0.057304610397081915, + "language_loss": 0.87811077, + "learning_rate": 0.0005846414888037501, + "loss": 0.88916934, + "num_input_tokens_seen": 200405632, + "router_z_loss_mlp": 0.34960938, + "step": 2404, + "time_per_iteration": 2.488797426223755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102205, + "balance_loss_mlp": 1.06899309, + "epoch": 0.4626779530588688, + "flos": 617608447488.0, + "grad_norm": 0.05034114049250231, + "language_loss": 0.82261539, + "learning_rate": 0.0005843344264214049, + "loss": 0.83363742, + "num_input_tokens_seen": 200479312, + "router_z_loss_mlp": 0.33203125, + "step": 2405, + "time_per_iteration": 2.746372938156128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110347, + "balance_loss_mlp": 1.07068777, + "epoch": 0.46287033474413236, + "flos": 670108432896.0, + "grad_norm": 0.10060755415937467, + "language_loss": 0.85092008, + "learning_rate": 0.0005840273312975317, + "loss": 0.86195481, + "num_input_tokens_seen": 200552976, + "router_z_loss_mlp": 0.32788086, + "step": 2406, + "time_per_iteration": 2.834230661392212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112626, + "balance_loss_mlp": 1.07829416, + "epoch": 0.46306271642939595, + "flos": 480233276928.0, + "grad_norm": 0.06610522075480575, + "language_loss": 0.90376371, + "learning_rate": 0.0005837202035513555, + "loss": 0.91489005, + "num_input_tokens_seen": 200621088, + "router_z_loss_mlp": 0.34326172, + "step": 2407, + "time_per_iteration": 2.577099084854126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112101, + "balance_loss_mlp": 1.07693422, + "epoch": 0.4632550981146595, + "flos": 580686879744.0, + "grad_norm": 0.06799718927162632, + "language_loss": 0.81987119, + "learning_rate": 0.0005834130433021136, + "loss": 0.83099222, + "num_input_tokens_seen": 200698400, + "router_z_loss_mlp": 0.3515625, + "step": 2408, + "time_per_iteration": 2.751481771469116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111259, + "balance_loss_mlp": 1.07537687, + "epoch": 0.46344747979992307, + "flos": 523964298240.0, + "grad_norm": 0.07576984187058394, + "language_loss": 0.73707795, + "learning_rate": 0.0005831058506690563, + "loss": 0.74819058, + "num_input_tokens_seen": 200767264, + "router_z_loss_mlp": 0.359375, + "step": 2409, + "time_per_iteration": 2.6351587772369385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104428, + "balance_loss_mlp": 1.0719074, + "epoch": 0.4636398614851866, + "flos": 746501349888.0, + "grad_norm": 0.06066453040155937, + "language_loss": 0.86246306, + "learning_rate": 0.0005827986257714464, + "loss": 0.87350732, + "num_input_tokens_seen": 200841440, + "router_z_loss_mlp": 0.32519531, + "step": 2410, + "time_per_iteration": 2.9171712398529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106385, + "balance_loss_mlp": 1.07334006, + "epoch": 0.4638322431704502, + "flos": 596547224064.0, + "grad_norm": 0.05632663018450853, + "language_loss": 0.8897202, + "learning_rate": 0.0005824913687285591, + "loss": 0.90078408, + "num_input_tokens_seen": 200911296, + "router_z_loss_mlp": 0.33032227, + "step": 2411, + "time_per_iteration": 2.6863625049591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104253, + "balance_loss_mlp": 1.07056427, + "epoch": 0.4640246248557137, + "flos": 539443971072.0, + "grad_norm": 0.09102731097831396, + "language_loss": 0.81903768, + "learning_rate": 0.0005821840796596821, + "loss": 0.83008015, + "num_input_tokens_seen": 200981920, + "router_z_loss_mlp": 0.3371582, + "step": 2412, + "time_per_iteration": 2.658602714538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108642, + "balance_loss_mlp": 1.07605052, + "epoch": 0.4642170065409773, + "flos": 562625118720.0, + "grad_norm": 0.04905521047169809, + "language_loss": 0.8043226, + "learning_rate": 0.0005818767586841158, + "loss": 0.81540906, + "num_input_tokens_seen": 201059392, + "router_z_loss_mlp": 0.32592773, + "step": 2413, + "time_per_iteration": 2.7577285766601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108976, + "balance_loss_mlp": 1.07655096, + "epoch": 0.46440938822624084, + "flos": 530959131648.0, + "grad_norm": 0.06302213894221746, + "language_loss": 0.865412, + "learning_rate": 0.0005815694059211726, + "loss": 0.8765018, + "num_input_tokens_seen": 201130192, + "router_z_loss_mlp": 0.32421875, + "step": 2414, + "time_per_iteration": 2.6655328273773193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174358, + "balance_loss_mlp": 1.16362953, + "epoch": 0.4646017699115044, + "flos": 1526325700608.0, + "grad_norm": 0.06384975588330166, + "language_loss": 0.80873632, + "learning_rate": 0.0005812620214901778, + "loss": 0.82047987, + "num_input_tokens_seen": 201354720, + "router_z_loss_mlp": 0.10742188, + "step": 2415, + "time_per_iteration": 4.795905828475952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101653, + "balance_loss_mlp": 1.09135294, + "epoch": 0.464794151596768, + "flos": 1540831859712.0, + "grad_norm": 0.035706806463564576, + "language_loss": 0.7694506, + "learning_rate": 0.000580954605510468, + "loss": 0.78046715, + "num_input_tokens_seen": 201592096, + "router_z_loss_mlp": 0.10302734, + "step": 2416, + "time_per_iteration": 4.964730978012085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100793, + "balance_loss_mlp": 1.06910706, + "epoch": 0.46498653328203154, + "flos": 501467397120.0, + "grad_norm": 0.054161288123553565, + "language_loss": 0.8669647, + "learning_rate": 0.0005806471581013931, + "loss": 0.8779726, + "num_input_tokens_seen": 201666160, + "router_z_loss_mlp": 0.31640625, + "step": 2417, + "time_per_iteration": 2.7034828662872314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106596, + "balance_loss_mlp": 1.07221591, + "epoch": 0.46517891496729513, + "flos": 676144806912.0, + "grad_norm": 0.05684649238509572, + "language_loss": 0.78830767, + "learning_rate": 0.0005803396793823146, + "loss": 0.79937363, + "num_input_tokens_seen": 201733552, + "router_z_loss_mlp": 0.34375, + "step": 2418, + "time_per_iteration": 2.810929536819458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112917, + "balance_loss_mlp": 1.07848907, + "epoch": 0.46537129665255866, + "flos": 585351816192.0, + "grad_norm": 0.07858966703970842, + "language_loss": 0.86256903, + "learning_rate": 0.0005800321694726065, + "loss": 0.87369823, + "num_input_tokens_seen": 201806128, + "router_z_loss_mlp": 0.34423828, + "step": 2419, + "time_per_iteration": 2.797091484069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113087, + "balance_loss_mlp": 1.07880187, + "epoch": 0.46556367833782225, + "flos": 587704108032.0, + "grad_norm": 0.06627504844203173, + "language_loss": 0.86954433, + "learning_rate": 0.0005797246284916545, + "loss": 0.8806752, + "num_input_tokens_seen": 201874224, + "router_z_loss_mlp": 0.34277344, + "step": 2420, + "time_per_iteration": 2.689190149307251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103612, + "balance_loss_mlp": 1.09355068, + "epoch": 0.4657560600230858, + "flos": 1485453551616.0, + "grad_norm": 0.047662019725998206, + "language_loss": 0.77505189, + "learning_rate": 0.0005794170565588569, + "loss": 0.78608793, + "num_input_tokens_seen": 202111648, + "router_z_loss_mlp": 0.10058594, + "step": 2421, + "time_per_iteration": 6.38897705078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112649, + "balance_loss_mlp": 1.09318316, + "epoch": 0.46594844170834937, + "flos": 580247110656.0, + "grad_norm": 0.06710074217369558, + "language_loss": 0.88096154, + "learning_rate": 0.0005791094537936233, + "loss": 0.8922264, + "num_input_tokens_seen": 202183344, + "router_z_loss_mlp": 0.33325195, + "step": 2422, + "time_per_iteration": 4.209144353866577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126187, + "balance_loss_mlp": 1.09340453, + "epoch": 0.4661408233936129, + "flos": 512571400704.0, + "grad_norm": 0.0626199173608307, + "language_loss": 0.82125473, + "learning_rate": 0.0005788018203153762, + "loss": 0.83251661, + "num_input_tokens_seen": 202252512, + "router_z_loss_mlp": 0.32788086, + "step": 2423, + "time_per_iteration": 2.583918333053589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138161, + "balance_loss_mlp": 1.10540235, + "epoch": 0.4663332050788765, + "flos": 491077748736.0, + "grad_norm": 0.07666207610831233, + "language_loss": 0.85944337, + "learning_rate": 0.000578494156243549, + "loss": 0.87082505, + "num_input_tokens_seen": 202320096, + "router_z_loss_mlp": 0.32763672, + "step": 2424, + "time_per_iteration": 2.582838296890259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142028, + "balance_loss_mlp": 1.10779119, + "epoch": 0.4665255867641401, + "flos": 512623157760.0, + "grad_norm": 0.11745148991984863, + "language_loss": 0.89446878, + "learning_rate": 0.0005781864616975878, + "loss": 0.90588903, + "num_input_tokens_seen": 202391552, + "router_z_loss_mlp": 0.3425293, + "step": 2425, + "time_per_iteration": 2.6464650630950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135149, + "balance_loss_mlp": 1.10081649, + "epoch": 0.4667179684494036, + "flos": 424812750336.0, + "grad_norm": 0.07242740344873133, + "language_loss": 0.84278369, + "learning_rate": 0.0005778787367969502, + "loss": 0.85413516, + "num_input_tokens_seen": 202457328, + "router_z_loss_mlp": 0.34375, + "step": 2426, + "time_per_iteration": 2.5785605907440186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131581, + "balance_loss_mlp": 1.09822595, + "epoch": 0.4669103501346672, + "flos": 707956526592.0, + "grad_norm": 0.06251358549871673, + "language_loss": 0.81181312, + "learning_rate": 0.0005775709816611053, + "loss": 0.82312894, + "num_input_tokens_seen": 202535888, + "router_z_loss_mlp": 0.33374023, + "step": 2427, + "time_per_iteration": 2.9622879028320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125428, + "balance_loss_mlp": 1.09100056, + "epoch": 0.4671027318199307, + "flos": 554832239616.0, + "grad_norm": 0.06013841542134278, + "language_loss": 0.83607411, + "learning_rate": 0.0005772631964095346, + "loss": 0.84732836, + "num_input_tokens_seen": 202608400, + "router_z_loss_mlp": 0.34448242, + "step": 2428, + "time_per_iteration": 2.681161403656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123616, + "balance_loss_mlp": 1.08990407, + "epoch": 0.4672951135051943, + "flos": 567109817856.0, + "grad_norm": 0.05815575913312505, + "language_loss": 0.85975552, + "learning_rate": 0.000576955381161731, + "loss": 0.87099165, + "num_input_tokens_seen": 202677712, + "router_z_loss_mlp": 0.3371582, + "step": 2429, + "time_per_iteration": 2.670814275741577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122337, + "balance_loss_mlp": 1.08843446, + "epoch": 0.46748749519045785, + "flos": 424518713856.0, + "grad_norm": 0.07250877112671852, + "language_loss": 0.86541677, + "learning_rate": 0.0005766475360371985, + "loss": 0.8766402, + "num_input_tokens_seen": 202743824, + "router_z_loss_mlp": 0.33935547, + "step": 2430, + "time_per_iteration": 2.5907814502716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118824, + "balance_loss_mlp": 1.08368063, + "epoch": 0.46767987687572143, + "flos": 538344548352.0, + "grad_norm": 0.0946745942266809, + "language_loss": 0.84659714, + "learning_rate": 0.0005763396611554536, + "loss": 0.85778534, + "num_input_tokens_seen": 202813072, + "router_z_loss_mlp": 0.3515625, + "step": 2431, + "time_per_iteration": 2.679352045059204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123862, + "balance_loss_mlp": 1.0890286, + "epoch": 0.467872258560985, + "flos": 823702224384.0, + "grad_norm": 0.06880905442669231, + "language_loss": 0.80567783, + "learning_rate": 0.0005760317566360237, + "loss": 0.81691647, + "num_input_tokens_seen": 202886576, + "router_z_loss_mlp": 0.34838867, + "step": 2432, + "time_per_iteration": 3.0134341716766357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116239, + "balance_loss_mlp": 1.08090591, + "epoch": 0.46806464024624855, + "flos": 661663240704.0, + "grad_norm": 0.09211359876128772, + "language_loss": 0.85498667, + "learning_rate": 0.000575723822598448, + "loss": 0.86614907, + "num_input_tokens_seen": 202956736, + "router_z_loss_mlp": 0.35375977, + "step": 2433, + "time_per_iteration": 2.807387351989746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113775, + "balance_loss_mlp": 1.07882285, + "epoch": 0.46825702193151214, + "flos": 755700171264.0, + "grad_norm": 0.07984033993726149, + "language_loss": 0.81515086, + "learning_rate": 0.0005754158591622773, + "loss": 0.82628858, + "num_input_tokens_seen": 203036432, + "router_z_loss_mlp": 0.35009766, + "step": 2434, + "time_per_iteration": 2.9610190391540527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108856, + "balance_loss_mlp": 1.07335579, + "epoch": 0.4684494036167757, + "flos": 439393061376.0, + "grad_norm": 0.08173781127815187, + "language_loss": 0.83058012, + "learning_rate": 0.0005751078664470732, + "loss": 0.84166867, + "num_input_tokens_seen": 203101904, + "router_z_loss_mlp": 0.35522461, + "step": 2435, + "time_per_iteration": 2.5381393432617188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105873, + "balance_loss_mlp": 1.07125473, + "epoch": 0.46864178530203926, + "flos": 532706098176.0, + "grad_norm": 0.06625067078188727, + "language_loss": 0.86156499, + "learning_rate": 0.0005747998445724094, + "loss": 0.87262368, + "num_input_tokens_seen": 203170272, + "router_z_loss_mlp": 0.34643555, + "step": 2436, + "time_per_iteration": 2.5991244316101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110631, + "balance_loss_mlp": 1.0730263, + "epoch": 0.4688341669873028, + "flos": 576627268608.0, + "grad_norm": 0.06922366477490534, + "language_loss": 0.8967731, + "learning_rate": 0.0005744917936578707, + "loss": 0.90783614, + "num_input_tokens_seen": 203243920, + "router_z_loss_mlp": 0.33276367, + "step": 2437, + "time_per_iteration": 2.7876076698303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110478, + "balance_loss_mlp": 1.07087731, + "epoch": 0.4690265486725664, + "flos": 539579791872.0, + "grad_norm": 0.05346939801811538, + "language_loss": 0.83987176, + "learning_rate": 0.0005741837138230526, + "loss": 0.8509196, + "num_input_tokens_seen": 203321760, + "router_z_loss_mlp": 0.33911133, + "step": 2438, + "time_per_iteration": 2.7089829444885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110533, + "balance_loss_mlp": 1.07063985, + "epoch": 0.4692189303578299, + "flos": 770510278656.0, + "grad_norm": 0.06113216144822436, + "language_loss": 0.8632471, + "learning_rate": 0.0005738756051875627, + "loss": 0.87430036, + "num_input_tokens_seen": 203409088, + "router_z_loss_mlp": 0.34692383, + "step": 2439, + "time_per_iteration": 3.10072922706604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106031, + "balance_loss_mlp": 1.07031631, + "epoch": 0.4694113120430935, + "flos": 571396654080.0, + "grad_norm": 0.054040954813727636, + "language_loss": 0.83196378, + "learning_rate": 0.0005735674678710192, + "loss": 0.84302408, + "num_input_tokens_seen": 203481680, + "router_z_loss_mlp": 0.35668945, + "step": 2440, + "time_per_iteration": 2.6844449043273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101989, + "balance_loss_mlp": 1.06644058, + "epoch": 0.4696036937283571, + "flos": 748816565760.0, + "grad_norm": 0.06378034204188901, + "language_loss": 0.81315678, + "learning_rate": 0.0005732593019930517, + "loss": 0.82417667, + "num_input_tokens_seen": 203554848, + "router_z_loss_mlp": 0.35571289, + "step": 2441, + "time_per_iteration": 2.8945391178131104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113581, + "balance_loss_mlp": 1.0766257, + "epoch": 0.4697960754136206, + "flos": 493454633472.0, + "grad_norm": 0.0589509513637404, + "language_loss": 0.88047123, + "learning_rate": 0.0005729511076733008, + "loss": 0.89160711, + "num_input_tokens_seen": 203624816, + "router_z_loss_mlp": 0.36962891, + "step": 2442, + "time_per_iteration": 2.6688244342803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119163, + "balance_loss_mlp": 1.08199334, + "epoch": 0.4699884570988842, + "flos": 725118925824.0, + "grad_norm": 0.06849073497169517, + "language_loss": 0.84747314, + "learning_rate": 0.000572642885031418, + "loss": 0.85866475, + "num_input_tokens_seen": 203698256, + "router_z_loss_mlp": 0.37207031, + "step": 2443, + "time_per_iteration": 2.9179134368896484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108245, + "balance_loss_mlp": 1.07276881, + "epoch": 0.47018083878414774, + "flos": 555427653120.0, + "grad_norm": 0.0584848920178353, + "language_loss": 0.80748844, + "learning_rate": 0.0005723346341870662, + "loss": 0.81857085, + "num_input_tokens_seen": 203772672, + "router_z_loss_mlp": 0.35522461, + "step": 2444, + "time_per_iteration": 2.701399087905884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129757, + "balance_loss_mlp": 1.09277797, + "epoch": 0.4703732204694113, + "flos": 424069032960.0, + "grad_norm": 0.11865712100152984, + "language_loss": 0.86692929, + "learning_rate": 0.0005720263552599188, + "loss": 0.87822688, + "num_input_tokens_seen": 203835904, + "router_z_loss_mlp": 0.36962891, + "step": 2445, + "time_per_iteration": 2.4486730098724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121458, + "balance_loss_mlp": 1.08500421, + "epoch": 0.47056560215467486, + "flos": 703494222336.0, + "grad_norm": 0.08366602087356424, + "language_loss": 0.79955238, + "learning_rate": 0.0005717180483696604, + "loss": 0.81076699, + "num_input_tokens_seen": 203914704, + "router_z_loss_mlp": 0.36499023, + "step": 2446, + "time_per_iteration": 2.8785839080810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120985, + "balance_loss_mlp": 1.08486462, + "epoch": 0.47075798383993844, + "flos": 554963291136.0, + "grad_norm": 0.0682417361382486, + "language_loss": 0.83352333, + "learning_rate": 0.0005714097136359862, + "loss": 0.84473318, + "num_input_tokens_seen": 203985072, + "router_z_loss_mlp": 0.36157227, + "step": 2447, + "time_per_iteration": 2.6363351345062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118201, + "balance_loss_mlp": 1.08296275, + "epoch": 0.470950365525202, + "flos": 564305273856.0, + "grad_norm": 0.051381339811927676, + "language_loss": 0.86498094, + "learning_rate": 0.0005711013511786027, + "loss": 0.87616301, + "num_input_tokens_seen": 204061904, + "router_z_loss_mlp": 0.35253906, + "step": 2448, + "time_per_iteration": 2.762845993041992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111685, + "balance_loss_mlp": 1.08170676, + "epoch": 0.47114274721046556, + "flos": 534450493440.0, + "grad_norm": 0.058854412729412026, + "language_loss": 0.84082228, + "learning_rate": 0.0005707929611172263, + "loss": 0.85199082, + "num_input_tokens_seen": 204137392, + "router_z_loss_mlp": 0.3515625, + "step": 2449, + "time_per_iteration": 2.7246243953704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115864, + "balance_loss_mlp": 1.08007717, + "epoch": 0.47133512889572915, + "flos": 473117303808.0, + "grad_norm": 0.11039935923903105, + "language_loss": 0.84227139, + "learning_rate": 0.000570484543571585, + "loss": 0.85343003, + "num_input_tokens_seen": 204202752, + "router_z_loss_mlp": 0.35791016, + "step": 2450, + "time_per_iteration": 2.610919237136841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113904, + "balance_loss_mlp": 1.0777123, + "epoch": 0.4715275105809927, + "flos": 459013837824.0, + "grad_norm": 0.0667594391398321, + "language_loss": 0.82813287, + "learning_rate": 0.0005701760986614171, + "loss": 0.8392719, + "num_input_tokens_seen": 204266960, + "router_z_loss_mlp": 0.36181641, + "step": 2451, + "time_per_iteration": 2.5151522159576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120641, + "balance_loss_mlp": 1.08590317, + "epoch": 0.47171989226625627, + "flos": 422003437056.0, + "grad_norm": 0.0603467987943219, + "language_loss": 0.87650943, + "learning_rate": 0.0005698676265064714, + "loss": 0.88771582, + "num_input_tokens_seen": 204331216, + "router_z_loss_mlp": 0.34765625, + "step": 2452, + "time_per_iteration": 2.5722150802612305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114487, + "balance_loss_mlp": 1.07920074, + "epoch": 0.4719122739515198, + "flos": 457434998784.0, + "grad_norm": 0.07549274937771847, + "language_loss": 0.89053345, + "learning_rate": 0.0005695591272265074, + "loss": 0.90167832, + "num_input_tokens_seen": 204397216, + "router_z_loss_mlp": 0.35327148, + "step": 2453, + "time_per_iteration": 2.5431923866271973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109778, + "balance_loss_mlp": 1.07384801, + "epoch": 0.4721046556367834, + "flos": 514975449600.0, + "grad_norm": 0.05406998074400625, + "language_loss": 0.82143486, + "learning_rate": 0.0005692506009412954, + "loss": 0.83253264, + "num_input_tokens_seen": 204469952, + "router_z_loss_mlp": 0.359375, + "step": 2454, + "time_per_iteration": 2.6976101398468018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176153, + "balance_loss_mlp": 1.16375494, + "epoch": 0.4722970373220469, + "flos": 1572258138624.0, + "grad_norm": 0.047894752053778404, + "language_loss": 0.7755127, + "learning_rate": 0.0005689420477706156, + "loss": 0.78727424, + "num_input_tokens_seen": 204701152, + "router_z_loss_mlp": 0.12402344, + "step": 2455, + "time_per_iteration": 5.006427049636841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103739, + "balance_loss_mlp": 1.07000232, + "epoch": 0.4724894190073105, + "flos": 586214102016.0, + "grad_norm": 0.07748007609747588, + "language_loss": 0.89838475, + "learning_rate": 0.0005686334678342593, + "loss": 0.90942216, + "num_input_tokens_seen": 204778144, + "router_z_loss_mlp": 0.3371582, + "step": 2456, + "time_per_iteration": 2.88089919090271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110083, + "balance_loss_mlp": 1.07586968, + "epoch": 0.4726818006925741, + "flos": 867645789696.0, + "grad_norm": 0.053591450648947214, + "language_loss": 0.81747675, + "learning_rate": 0.0005683248612520274, + "loss": 0.82857764, + "num_input_tokens_seen": 204853376, + "router_z_loss_mlp": 0.34204102, + "step": 2457, + "time_per_iteration": 3.0411272048950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111421, + "balance_loss_mlp": 1.07811391, + "epoch": 0.4728741823778376, + "flos": 752967581184.0, + "grad_norm": 0.10239407628225645, + "language_loss": 0.84273934, + "learning_rate": 0.0005680162281437321, + "loss": 0.85388148, + "num_input_tokens_seen": 204925280, + "router_z_loss_mlp": 0.36083984, + "step": 2458, + "time_per_iteration": 2.8898301124572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120752, + "balance_loss_mlp": 1.08608592, + "epoch": 0.4730665640631012, + "flos": 538571773440.0, + "grad_norm": 0.0555075738071769, + "language_loss": 0.85104299, + "learning_rate": 0.000567707568629195, + "loss": 0.86225057, + "num_input_tokens_seen": 205000592, + "router_z_loss_mlp": 0.34692383, + "step": 2459, + "time_per_iteration": 2.706040143966675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122742, + "balance_loss_mlp": 1.08778977, + "epoch": 0.47325894574836475, + "flos": 491653338624.0, + "grad_norm": 0.06127780861136823, + "language_loss": 0.82619834, + "learning_rate": 0.0005673988828282486, + "loss": 0.83742571, + "num_input_tokens_seen": 205073968, + "router_z_loss_mlp": 0.34985352, + "step": 2460, + "time_per_iteration": 2.674525499343872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111568, + "balance_loss_mlp": 1.07668757, + "epoch": 0.47345132743362833, + "flos": 764459223552.0, + "grad_norm": 0.05574274236604154, + "language_loss": 0.81308633, + "learning_rate": 0.0005670901708607352, + "loss": 0.82420194, + "num_input_tokens_seen": 205153536, + "router_z_loss_mlp": 0.34912109, + "step": 2461, + "time_per_iteration": 2.982827663421631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109707, + "balance_loss_mlp": 1.0753746, + "epoch": 0.47364370911889186, + "flos": 540173007360.0, + "grad_norm": 0.15434207723638854, + "language_loss": 0.84411561, + "learning_rate": 0.0005667814328465076, + "loss": 0.85521269, + "num_input_tokens_seen": 205220944, + "router_z_loss_mlp": 0.34350586, + "step": 2462, + "time_per_iteration": 2.639051914215088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106499, + "balance_loss_mlp": 1.07245243, + "epoch": 0.47383609080415545, + "flos": 406219815936.0, + "grad_norm": 0.07072772635633937, + "language_loss": 0.81988347, + "learning_rate": 0.0005664726689054285, + "loss": 0.83094847, + "num_input_tokens_seen": 205282688, + "router_z_loss_mlp": 0.34033203, + "step": 2463, + "time_per_iteration": 2.4655356407165527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112663, + "balance_loss_mlp": 1.07973766, + "epoch": 0.474028472489419, + "flos": 453476703744.0, + "grad_norm": 0.06987107232693553, + "language_loss": 0.8107388, + "learning_rate": 0.0005661638791573704, + "loss": 0.82186544, + "num_input_tokens_seen": 205357360, + "router_z_loss_mlp": 0.32958984, + "step": 2464, + "time_per_iteration": 2.7433135509490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111686, + "balance_loss_mlp": 1.07742512, + "epoch": 0.47422085417468257, + "flos": 492177171456.0, + "grad_norm": 0.060845328276789123, + "language_loss": 0.87247777, + "learning_rate": 0.0005658550637222164, + "loss": 0.88359463, + "num_input_tokens_seen": 205424352, + "router_z_loss_mlp": 0.34277344, + "step": 2465, + "time_per_iteration": 2.615755558013916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113074, + "balance_loss_mlp": 1.07762074, + "epoch": 0.47441323585994616, + "flos": 738854203392.0, + "grad_norm": 0.05153784391367151, + "language_loss": 0.82349539, + "learning_rate": 0.0005655462227198592, + "loss": 0.83462608, + "num_input_tokens_seen": 205502912, + "router_z_loss_mlp": 0.35473633, + "step": 2466, + "time_per_iteration": 2.91003680229187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109891, + "balance_loss_mlp": 1.07460487, + "epoch": 0.4746056175452097, + "flos": 484685669376.0, + "grad_norm": 0.055186067432112955, + "language_loss": 0.84493053, + "learning_rate": 0.0005652373562702016, + "loss": 0.85602945, + "num_input_tokens_seen": 205571168, + "router_z_loss_mlp": 0.3527832, + "step": 2467, + "time_per_iteration": 2.6209630966186523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117567, + "balance_loss_mlp": 1.07982516, + "epoch": 0.4747979992304733, + "flos": 461052269568.0, + "grad_norm": 0.06952200013405305, + "language_loss": 0.88642848, + "learning_rate": 0.000564928464493156, + "loss": 0.89760423, + "num_input_tokens_seen": 205639648, + "router_z_loss_mlp": 0.37744141, + "step": 2468, + "time_per_iteration": 2.609154224395752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117938, + "balance_loss_mlp": 1.0807451, + "epoch": 0.4749903809157368, + "flos": 864431212032.0, + "grad_norm": 0.05705018138682977, + "language_loss": 0.81856024, + "learning_rate": 0.000564619547508645, + "loss": 0.82973957, + "num_input_tokens_seen": 205721536, + "router_z_loss_mlp": 0.37158203, + "step": 2469, + "time_per_iteration": 3.041351556777954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117314, + "balance_loss_mlp": 1.07849944, + "epoch": 0.4751827626010004, + "flos": 505546831872.0, + "grad_norm": 0.08036472839994792, + "language_loss": 0.83256048, + "learning_rate": 0.0005643106054366008, + "loss": 0.84373355, + "num_input_tokens_seen": 205788512, + "router_z_loss_mlp": 0.38818359, + "step": 2470, + "time_per_iteration": 2.5631182193756104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108966, + "balance_loss_mlp": 1.07258332, + "epoch": 0.47537514428626393, + "flos": 559388519424.0, + "grad_norm": 0.05805518051262763, + "language_loss": 0.79916292, + "learning_rate": 0.000564001638396965, + "loss": 0.81025255, + "num_input_tokens_seen": 205863104, + "router_z_loss_mlp": 0.36376953, + "step": 2471, + "time_per_iteration": 2.7381579875946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110653, + "balance_loss_mlp": 1.0717926, + "epoch": 0.4755675259715275, + "flos": 834260000256.0, + "grad_norm": 0.0665112346682766, + "language_loss": 0.82313401, + "learning_rate": 0.0005636926465096897, + "loss": 0.83419931, + "num_input_tokens_seen": 205940688, + "router_z_loss_mlp": 0.34741211, + "step": 2472, + "time_per_iteration": 3.0346837043762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111103, + "balance_loss_mlp": 1.07622218, + "epoch": 0.47575990765679105, + "flos": 508237576704.0, + "grad_norm": 0.06532220540392095, + "language_loss": 0.87808621, + "learning_rate": 0.0005633836298947363, + "loss": 0.88919711, + "num_input_tokens_seen": 206008352, + "router_z_loss_mlp": 0.34912109, + "step": 2473, + "time_per_iteration": 2.587581157684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122307, + "balance_loss_mlp": 1.08716393, + "epoch": 0.47595228934205464, + "flos": 591845211648.0, + "grad_norm": 0.09099011339346055, + "language_loss": 0.70947754, + "learning_rate": 0.000563074588672075, + "loss": 0.72070062, + "num_input_tokens_seen": 206078240, + "router_z_loss_mlp": 0.3515625, + "step": 2474, + "time_per_iteration": 2.7112982273101807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125507, + "balance_loss_mlp": 1.09012604, + "epoch": 0.4761446710273182, + "flos": 580607958528.0, + "grad_norm": 0.06360669353624634, + "language_loss": 0.85420531, + "learning_rate": 0.0005627655229616868, + "loss": 0.8654604, + "num_input_tokens_seen": 206148896, + "router_z_loss_mlp": 0.35400391, + "step": 2475, + "time_per_iteration": 2.7166192531585693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131445, + "balance_loss_mlp": 1.09532499, + "epoch": 0.47633705271258175, + "flos": 672893153280.0, + "grad_norm": 0.05566651470752815, + "language_loss": 0.90219474, + "learning_rate": 0.0005624564328835616, + "loss": 0.91350919, + "num_input_tokens_seen": 206223792, + "router_z_loss_mlp": 0.36132812, + "step": 2476, + "time_per_iteration": 2.8342158794403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119281, + "balance_loss_mlp": 1.08339906, + "epoch": 0.47652943439784534, + "flos": 541857931776.0, + "grad_norm": 0.06751222051526788, + "language_loss": 0.8450973, + "learning_rate": 0.0005621473185576986, + "loss": 0.85629016, + "num_input_tokens_seen": 206299376, + "router_z_loss_mlp": 0.35913086, + "step": 2477, + "time_per_iteration": 2.727320432662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126891, + "balance_loss_mlp": 1.0915097, + "epoch": 0.4767218160831089, + "flos": 524819243520.0, + "grad_norm": 0.06498777385437565, + "language_loss": 0.87181318, + "learning_rate": 0.0005618381801041068, + "loss": 0.88308215, + "num_input_tokens_seen": 206367936, + "router_z_loss_mlp": 0.35400391, + "step": 2478, + "time_per_iteration": 2.622197389602661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136775, + "balance_loss_mlp": 1.09965336, + "epoch": 0.47691419776837246, + "flos": 568056167424.0, + "grad_norm": 0.0693017023966873, + "language_loss": 0.83176625, + "learning_rate": 0.0005615290176428044, + "loss": 0.84313405, + "num_input_tokens_seen": 206438864, + "router_z_loss_mlp": 0.37084961, + "step": 2479, + "time_per_iteration": 2.6874895095825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137347, + "balance_loss_mlp": 1.10275292, + "epoch": 0.477106579453636, + "flos": 530931967488.0, + "grad_norm": 0.06633902685884922, + "language_loss": 0.85015559, + "learning_rate": 0.0005612198312938187, + "loss": 0.86152905, + "num_input_tokens_seen": 206516656, + "router_z_loss_mlp": 0.34619141, + "step": 2480, + "time_per_iteration": 2.7283356189727783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143197, + "balance_loss_mlp": 1.10717165, + "epoch": 0.4772989611388996, + "flos": 594283765248.0, + "grad_norm": 0.08700724997250119, + "language_loss": 0.79558903, + "learning_rate": 0.0005609106211771868, + "loss": 0.80702102, + "num_input_tokens_seen": 206595040, + "router_z_loss_mlp": 0.36035156, + "step": 2481, + "time_per_iteration": 2.8008668422698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155105, + "balance_loss_mlp": 1.11857891, + "epoch": 0.4774913428241631, + "flos": 544622828544.0, + "grad_norm": 0.07115217474866456, + "language_loss": 0.89249581, + "learning_rate": 0.0005606013874129543, + "loss": 0.90404689, + "num_input_tokens_seen": 206670192, + "router_z_loss_mlp": 0.36523438, + "step": 2482, + "time_per_iteration": 2.746906280517578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146439, + "balance_loss_mlp": 1.11027122, + "epoch": 0.4776837245094267, + "flos": 540079031808.0, + "grad_norm": 0.052135079835272054, + "language_loss": 0.80459106, + "learning_rate": 0.0005602921301211768, + "loss": 0.81605548, + "num_input_tokens_seen": 206746992, + "router_z_loss_mlp": 0.36181641, + "step": 2483, + "time_per_iteration": 2.760091543197632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133668, + "balance_loss_mlp": 1.09895456, + "epoch": 0.4778761061946903, + "flos": 471785513472.0, + "grad_norm": 0.06775745953777351, + "language_loss": 0.82220864, + "learning_rate": 0.0005599828494219185, + "loss": 0.83354533, + "num_input_tokens_seen": 206813584, + "router_z_loss_mlp": 0.34716797, + "step": 2484, + "time_per_iteration": 2.5458662509918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113545, + "balance_loss_mlp": 1.10004473, + "epoch": 0.4780684878799538, + "flos": 726082527744.0, + "grad_norm": 0.08200141457856946, + "language_loss": 0.89550984, + "learning_rate": 0.0005596735454352527, + "loss": 0.90686429, + "num_input_tokens_seen": 206885840, + "router_z_loss_mlp": 0.35424805, + "step": 2485, + "time_per_iteration": 2.8570785522460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143886, + "balance_loss_mlp": 1.1075511, + "epoch": 0.4782608695652174, + "flos": 548922147840.0, + "grad_norm": 0.07792091337932193, + "language_loss": 0.85635722, + "learning_rate": 0.0005593642182812619, + "loss": 0.86779606, + "num_input_tokens_seen": 206955104, + "router_z_loss_mlp": 0.36352539, + "step": 2486, + "time_per_iteration": 2.630213975906372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139867, + "balance_loss_mlp": 1.10369921, + "epoch": 0.47845325125048094, + "flos": 829923604992.0, + "grad_norm": 0.06102595686098437, + "language_loss": 0.83692348, + "learning_rate": 0.0005590548680800378, + "loss": 0.84832209, + "num_input_tokens_seen": 207039792, + "router_z_loss_mlp": 0.36206055, + "step": 2487, + "time_per_iteration": 3.1342179775238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139371, + "balance_loss_mlp": 1.10389483, + "epoch": 0.4786456329357445, + "flos": 514164920832.0, + "grad_norm": 0.0657277256500081, + "language_loss": 0.76383913, + "learning_rate": 0.0005587454949516804, + "loss": 0.77523285, + "num_input_tokens_seen": 207115632, + "router_z_loss_mlp": 0.35498047, + "step": 2488, + "time_per_iteration": 2.6958112716674805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145548, + "balance_loss_mlp": 1.10833097, + "epoch": 0.47883801462100806, + "flos": 564658781184.0, + "grad_norm": 0.061160167550216256, + "language_loss": 0.88185161, + "learning_rate": 0.0005584360990162993, + "loss": 0.89330709, + "num_input_tokens_seen": 207184336, + "router_z_loss_mlp": 0.37255859, + "step": 2489, + "time_per_iteration": 2.61667537689209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133811, + "balance_loss_mlp": 1.09881115, + "epoch": 0.47903039630627164, + "flos": 579577545216.0, + "grad_norm": 0.0507120714137282, + "language_loss": 0.85551566, + "learning_rate": 0.0005581266803940124, + "loss": 0.86685371, + "num_input_tokens_seen": 207258720, + "router_z_loss_mlp": 0.35009766, + "step": 2490, + "time_per_iteration": 2.7139766216278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133471, + "balance_loss_mlp": 1.09649253, + "epoch": 0.47922277799153523, + "flos": 618950149632.0, + "grad_norm": 0.0583035541715914, + "language_loss": 0.87154239, + "learning_rate": 0.0005578172392049471, + "loss": 0.88287711, + "num_input_tokens_seen": 207329216, + "router_z_loss_mlp": 0.36987305, + "step": 2491, + "time_per_iteration": 2.7481577396392822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134521, + "balance_loss_mlp": 1.09918737, + "epoch": 0.47941515967679876, + "flos": 639653096448.0, + "grad_norm": 0.08141144255217014, + "language_loss": 0.84311044, + "learning_rate": 0.0005575077755692386, + "loss": 0.85445559, + "num_input_tokens_seen": 207403712, + "router_z_loss_mlp": 0.35351562, + "step": 2492, + "time_per_iteration": 2.7962934970855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132576, + "balance_loss_mlp": 1.09793389, + "epoch": 0.47960754136206235, + "flos": 519823194624.0, + "grad_norm": 0.053456927876726165, + "language_loss": 0.86199152, + "learning_rate": 0.0005571982896070316, + "loss": 0.87331724, + "num_input_tokens_seen": 207477120, + "router_z_loss_mlp": 0.34692383, + "step": 2493, + "time_per_iteration": 2.6755988597869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131371, + "balance_loss_mlp": 1.09534633, + "epoch": 0.4797999230473259, + "flos": 475044507648.0, + "grad_norm": 0.059320296473078654, + "language_loss": 0.89793247, + "learning_rate": 0.0005568887814384792, + "loss": 0.90924621, + "num_input_tokens_seen": 207544592, + "router_z_loss_mlp": 0.36035156, + "step": 2494, + "time_per_iteration": 2.5790021419525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139931, + "balance_loss_mlp": 1.1042639, + "epoch": 0.47999230473258947, + "flos": 532026620928.0, + "grad_norm": 0.061123462827233396, + "language_loss": 0.87048668, + "learning_rate": 0.000556579251183743, + "loss": 0.88188601, + "num_input_tokens_seen": 207613808, + "router_z_loss_mlp": 0.35693359, + "step": 2495, + "time_per_iteration": 2.6916205883026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134229, + "balance_loss_mlp": 1.0992769, + "epoch": 0.480184686417853, + "flos": 601486373376.0, + "grad_norm": 0.05789705924573782, + "language_loss": 0.80256224, + "learning_rate": 0.0005562696989629936, + "loss": 0.81390452, + "num_input_tokens_seen": 207684464, + "router_z_loss_mlp": 0.34960938, + "step": 2496, + "time_per_iteration": 2.690638542175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133544, + "balance_loss_mlp": 1.0990684, + "epoch": 0.4803770681031166, + "flos": 528196806144.0, + "grad_norm": 0.06114364023526716, + "language_loss": 0.82642174, + "learning_rate": 0.0005559601248964095, + "loss": 0.83775711, + "num_input_tokens_seen": 207754016, + "router_z_loss_mlp": 0.34521484, + "step": 2497, + "time_per_iteration": 2.6249618530273438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135944, + "balance_loss_mlp": 1.10249412, + "epoch": 0.4805694497883801, + "flos": 511192622592.0, + "grad_norm": 0.06899971908711858, + "language_loss": 0.85956562, + "learning_rate": 0.0005556505291041783, + "loss": 0.87092507, + "num_input_tokens_seen": 207827104, + "router_z_loss_mlp": 0.33447266, + "step": 2498, + "time_per_iteration": 2.7098748683929443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135161, + "balance_loss_mlp": 1.10097158, + "epoch": 0.4807618314736437, + "flos": 600342160896.0, + "grad_norm": 0.055207166893370456, + "language_loss": 0.84689957, + "learning_rate": 0.0005553409117064954, + "loss": 0.85825121, + "num_input_tokens_seen": 207907824, + "router_z_loss_mlp": 0.34228516, + "step": 2499, + "time_per_iteration": 2.8708267211914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136874, + "balance_loss_mlp": 1.10242295, + "epoch": 0.4809542131589073, + "flos": 568965441024.0, + "grad_norm": 0.06687134527330599, + "language_loss": 0.8476308, + "learning_rate": 0.0005550312728235654, + "loss": 0.85899949, + "num_input_tokens_seen": 207975632, + "router_z_loss_mlp": 0.34448242, + "step": 2500, + "time_per_iteration": 2.6980721950531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128863, + "balance_loss_mlp": 1.09500802, + "epoch": 0.4811465948441708, + "flos": 575994779136.0, + "grad_norm": 0.07829313389837793, + "language_loss": 0.83860761, + "learning_rate": 0.0005547216125756003, + "loss": 0.84989619, + "num_input_tokens_seen": 208048000, + "router_z_loss_mlp": 0.33862305, + "step": 2501, + "time_per_iteration": 2.737539291381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140174, + "balance_loss_mlp": 1.10729611, + "epoch": 0.4813389765294344, + "flos": 823865209344.0, + "grad_norm": 0.06644553638954338, + "language_loss": 0.82266629, + "learning_rate": 0.0005544119310828211, + "loss": 0.83406806, + "num_input_tokens_seen": 208132592, + "router_z_loss_mlp": 0.32885742, + "step": 2502, + "time_per_iteration": 3.082392930984497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125836, + "balance_loss_mlp": 1.09245706, + "epoch": 0.48153135821469795, + "flos": 635531816448.0, + "grad_norm": 0.061244964440333595, + "language_loss": 0.85365945, + "learning_rate": 0.0005541022284654568, + "loss": 0.86491781, + "num_input_tokens_seen": 208215824, + "router_z_loss_mlp": 0.33398438, + "step": 2503, + "time_per_iteration": 2.9372761249542236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125034, + "balance_loss_mlp": 1.09189391, + "epoch": 0.48172373989996153, + "flos": 503701120512.0, + "grad_norm": 0.06168262746563105, + "language_loss": 0.84156048, + "learning_rate": 0.0005537925048437446, + "loss": 0.8528108, + "num_input_tokens_seen": 208284304, + "router_z_loss_mlp": 0.33154297, + "step": 2504, + "time_per_iteration": 2.589538097381592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052506, + "balance_loss_mlp": 1.04296899, + "epoch": 0.48191612158522507, + "flos": 1532362074624.0, + "grad_norm": 0.02361726537833674, + "language_loss": 0.75751472, + "learning_rate": 0.00055348276033793, + "loss": 0.7680397, + "num_input_tokens_seen": 208510224, + "router_z_loss_mlp": 0.09521484, + "step": 2505, + "time_per_iteration": 4.908772230148315 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111219, + "balance_loss_mlp": 1.07819104, + "epoch": 0.48210850327048865, + "flos": 702424161792.0, + "grad_norm": 0.056356974386017084, + "language_loss": 0.88423991, + "learning_rate": 0.0005531729950682664, + "loss": 0.89536178, + "num_input_tokens_seen": 208596816, + "router_z_loss_mlp": 0.34008789, + "step": 2506, + "time_per_iteration": 3.003096580505371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108061, + "balance_loss_mlp": 1.0739913, + "epoch": 0.4823008849557522, + "flos": 439778502144.0, + "grad_norm": 0.08388532833554185, + "language_loss": 0.85083711, + "learning_rate": 0.000552863209155015, + "loss": 0.86191773, + "num_input_tokens_seen": 208659616, + "router_z_loss_mlp": 0.34082031, + "step": 2507, + "time_per_iteration": 2.511463165283203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106022, + "balance_loss_mlp": 1.07331145, + "epoch": 0.48249326664101577, + "flos": 471859665408.0, + "grad_norm": 0.05856414722035687, + "language_loss": 0.82348502, + "learning_rate": 0.0005525534027184461, + "loss": 0.83454525, + "num_input_tokens_seen": 208728080, + "router_z_loss_mlp": 0.32714844, + "step": 2508, + "time_per_iteration": 2.6477487087249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102119, + "balance_loss_mlp": 1.06993294, + "epoch": 0.48268564832627936, + "flos": 563225674752.0, + "grad_norm": 0.054304228935087996, + "language_loss": 0.83357495, + "learning_rate": 0.0005522435758788365, + "loss": 0.84459615, + "num_input_tokens_seen": 208803376, + "router_z_loss_mlp": 0.32177734, + "step": 2509, + "time_per_iteration": 2.715082883834839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102008, + "balance_loss_mlp": 1.06741309, + "epoch": 0.4828780300115429, + "flos": 629606670336.0, + "grad_norm": 0.07316920081788965, + "language_loss": 0.80354846, + "learning_rate": 0.0005519337287564721, + "loss": 0.81456852, + "num_input_tokens_seen": 208876656, + "router_z_loss_mlp": 0.34594727, + "step": 2510, + "time_per_iteration": 2.8216280937194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103913, + "balance_loss_mlp": 1.07225132, + "epoch": 0.4830704116968065, + "flos": 631850305536.0, + "grad_norm": 0.07052632360826482, + "language_loss": 0.83703697, + "learning_rate": 0.000551623861471646, + "loss": 0.84807611, + "num_input_tokens_seen": 208950224, + "router_z_loss_mlp": 0.31640625, + "step": 2511, + "time_per_iteration": 2.7521867752075195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028829, + "balance_loss_mlp": 1.01886296, + "epoch": 0.48326279338207, + "flos": 1569268588032.0, + "grad_norm": 0.02307493847576384, + "language_loss": 0.78818834, + "learning_rate": 0.0005513139741446594, + "loss": 0.79847658, + "num_input_tokens_seen": 209173984, + "router_z_loss_mlp": 0.09960938, + "step": 2512, + "time_per_iteration": 4.850410461425781 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110192, + "balance_loss_mlp": 1.06877947, + "epoch": 0.4834551750673336, + "flos": 509238254592.0, + "grad_norm": 0.060960940408773784, + "language_loss": 0.86943817, + "learning_rate": 0.0005510040668958211, + "loss": 0.88045734, + "num_input_tokens_seen": 209242832, + "router_z_loss_mlp": 0.33154297, + "step": 2513, + "time_per_iteration": 2.5581674575805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011471, + "balance_loss_mlp": 1.00145698, + "epoch": 0.48364755675259713, + "flos": 1528663311360.0, + "grad_norm": 0.01573295897448314, + "language_loss": 0.77760583, + "learning_rate": 0.0005506941398454483, + "loss": 0.78772056, + "num_input_tokens_seen": 209473520, + "router_z_loss_mlp": 0.10009766, + "step": 2514, + "time_per_iteration": 4.821207523345947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101449, + "balance_loss_mlp": 1.06876206, + "epoch": 0.4838399384378607, + "flos": 564989893632.0, + "grad_norm": 0.06635931409503217, + "language_loss": 0.8316704, + "learning_rate": 0.0005503841931138645, + "loss": 0.84268492, + "num_input_tokens_seen": 209544208, + "router_z_loss_mlp": 0.3269043, + "step": 2515, + "time_per_iteration": 2.6826930046081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109492, + "balance_loss_mlp": 1.06247151, + "epoch": 0.4840323201231243, + "flos": 387691121664.0, + "grad_norm": 0.07963111819885421, + "language_loss": 0.81975293, + "learning_rate": 0.0005500742268214025, + "loss": 0.83070219, + "num_input_tokens_seen": 209607408, + "router_z_loss_mlp": 0.32446289, + "step": 2516, + "time_per_iteration": 2.4913811683654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109084, + "balance_loss_mlp": 1.07763672, + "epoch": 0.48422470180838784, + "flos": 630995360256.0, + "grad_norm": 0.057140457991015275, + "language_loss": 0.85559756, + "learning_rate": 0.0005497642410884014, + "loss": 0.86668837, + "num_input_tokens_seen": 209683392, + "router_z_loss_mlp": 0.31420898, + "step": 2517, + "time_per_iteration": 2.7807135581970215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101598, + "balance_loss_mlp": 1.06855321, + "epoch": 0.4844170834936514, + "flos": 499226333184.0, + "grad_norm": 0.05176470538316484, + "language_loss": 0.85257566, + "learning_rate": 0.0005494542360352085, + "loss": 0.86359167, + "num_input_tokens_seen": 209753184, + "router_z_loss_mlp": 0.33056641, + "step": 2518, + "time_per_iteration": 2.653507947921753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115114, + "balance_loss_mlp": 1.08285642, + "epoch": 0.48460946517891496, + "flos": 551076576768.0, + "grad_norm": 0.0599313447084905, + "language_loss": 0.85512084, + "learning_rate": 0.0005491442117821783, + "loss": 0.86627203, + "num_input_tokens_seen": 209829568, + "router_z_loss_mlp": 0.32226562, + "step": 2519, + "time_per_iteration": 2.717984676361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116939, + "balance_loss_mlp": 1.08325005, + "epoch": 0.48480184686417854, + "flos": 529390204416.0, + "grad_norm": 0.0649010079315795, + "language_loss": 0.87622237, + "learning_rate": 0.0005488341684496732, + "loss": 0.88739175, + "num_input_tokens_seen": 209902176, + "router_z_loss_mlp": 0.33691406, + "step": 2520, + "time_per_iteration": 2.652135133743286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108566, + "balance_loss_mlp": 1.07606971, + "epoch": 0.4849942285494421, + "flos": 531912821760.0, + "grad_norm": 0.06559854904132026, + "language_loss": 0.92200404, + "learning_rate": 0.0005485241061580624, + "loss": 0.93308973, + "num_input_tokens_seen": 209969168, + "router_z_loss_mlp": 0.32495117, + "step": 2521, + "time_per_iteration": 2.7108826637268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102755, + "balance_loss_mlp": 1.07037747, + "epoch": 0.48518661023470566, + "flos": 722578682880.0, + "grad_norm": 0.055876909605250345, + "language_loss": 0.84836948, + "learning_rate": 0.0005482140250277228, + "loss": 0.85939705, + "num_input_tokens_seen": 210049616, + "router_z_loss_mlp": 0.32373047, + "step": 2522, + "time_per_iteration": 2.997586965560913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105629, + "balance_loss_mlp": 1.07408667, + "epoch": 0.4853789919199692, + "flos": 506105169408.0, + "grad_norm": 0.07027884549034326, + "language_loss": 0.87641776, + "learning_rate": 0.0005479039251790387, + "loss": 0.88747412, + "num_input_tokens_seen": 210118512, + "router_z_loss_mlp": 0.31518555, + "step": 2523, + "time_per_iteration": 2.611929416656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096851, + "balance_loss_mlp": 1.06478369, + "epoch": 0.4855713736052328, + "flos": 660487094784.0, + "grad_norm": 0.061509725516535926, + "language_loss": 0.8502717, + "learning_rate": 0.0005475938067324014, + "loss": 0.86124021, + "num_input_tokens_seen": 210193728, + "router_z_loss_mlp": 0.32055664, + "step": 2524, + "time_per_iteration": 2.8200628757476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105888, + "balance_loss_mlp": 1.07339168, + "epoch": 0.48576375529049637, + "flos": 436959277056.0, + "grad_norm": 0.064836171654712, + "language_loss": 0.83736813, + "learning_rate": 0.0005472836698082098, + "loss": 0.84842694, + "num_input_tokens_seen": 210258832, + "router_z_loss_mlp": 0.32495117, + "step": 2525, + "time_per_iteration": 2.4986329078674316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100608, + "balance_loss_mlp": 1.06763458, + "epoch": 0.4859561369757599, + "flos": 581707381248.0, + "grad_norm": 0.05406459595211624, + "language_loss": 0.8394289, + "learning_rate": 0.0005469735145268694, + "loss": 0.8504349, + "num_input_tokens_seen": 210335280, + "router_z_loss_mlp": 0.32983398, + "step": 2526, + "time_per_iteration": 2.7246296405792236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107934, + "balance_loss_mlp": 1.07455492, + "epoch": 0.4861485186610235, + "flos": 487964487168.0, + "grad_norm": 0.0623071528474554, + "language_loss": 0.8099308, + "learning_rate": 0.0005466633410087933, + "loss": 0.82101017, + "num_input_tokens_seen": 210407072, + "router_z_loss_mlp": 0.33398438, + "step": 2527, + "time_per_iteration": 2.660274028778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049375, + "balance_loss_mlp": 1.03955197, + "epoch": 0.486340900346287, + "flos": 1557734727168.0, + "grad_norm": 0.029762737629489368, + "language_loss": 0.77260822, + "learning_rate": 0.0005463531493744017, + "loss": 0.78310198, + "num_input_tokens_seen": 210644544, + "router_z_loss_mlp": 0.09814453, + "step": 2528, + "time_per_iteration": 4.886114835739136 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098205, + "balance_loss_mlp": 1.06663859, + "epoch": 0.4865332820315506, + "flos": 483005514240.0, + "grad_norm": 0.05348067523581763, + "language_loss": 0.88341582, + "learning_rate": 0.0005460429397441214, + "loss": 0.89439785, + "num_input_tokens_seen": 210711760, + "router_z_loss_mlp": 0.31542969, + "step": 2529, + "time_per_iteration": 2.556168794631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096764, + "balance_loss_mlp": 1.06572175, + "epoch": 0.48672566371681414, + "flos": 535809447936.0, + "grad_norm": 0.07361694113297405, + "language_loss": 0.86787206, + "learning_rate": 0.0005457327122383866, + "loss": 0.87883973, + "num_input_tokens_seen": 210783040, + "router_z_loss_mlp": 0.31030273, + "step": 2530, + "time_per_iteration": 2.6101198196411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032017, + "balance_loss_mlp": 1.02248013, + "epoch": 0.4869180454020777, + "flos": 1412665422336.0, + "grad_norm": 0.016416545513431694, + "language_loss": 0.74636483, + "learning_rate": 0.0005454224669776385, + "loss": 0.75668502, + "num_input_tokens_seen": 211002128, + "router_z_loss_mlp": 0.09521484, + "step": 2531, + "time_per_iteration": 4.807017803192139 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102878, + "balance_loss_mlp": 1.071383, + "epoch": 0.48711042708734126, + "flos": 573113885184.0, + "grad_norm": 0.061169122564006716, + "language_loss": 0.75803703, + "learning_rate": 0.0005451122040823244, + "loss": 0.7690658, + "num_input_tokens_seen": 211080080, + "router_z_loss_mlp": 0.31469727, + "step": 2532, + "time_per_iteration": 2.778230667114258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110046, + "balance_loss_mlp": 1.07611895, + "epoch": 0.48730280877260485, + "flos": 626547737088.0, + "grad_norm": 0.05283553568044795, + "language_loss": 0.77404439, + "learning_rate": 0.0005448019236728997, + "loss": 0.78514493, + "num_input_tokens_seen": 211162944, + "router_z_loss_mlp": 0.33959961, + "step": 2533, + "time_per_iteration": 2.8531336784362793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106047, + "balance_loss_mlp": 1.07521987, + "epoch": 0.48749519045786843, + "flos": 512479996416.0, + "grad_norm": 0.06480756266699016, + "language_loss": 0.84952033, + "learning_rate": 0.0005444916258698255, + "loss": 0.8605808, + "num_input_tokens_seen": 211230448, + "router_z_loss_mlp": 0.30810547, + "step": 2534, + "time_per_iteration": 2.5989930629730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108181, + "balance_loss_mlp": 1.07701969, + "epoch": 0.48768757214313196, + "flos": 525414657024.0, + "grad_norm": 0.058540646847924545, + "language_loss": 0.8623631, + "learning_rate": 0.0005441813107935704, + "loss": 0.87344491, + "num_input_tokens_seen": 211301248, + "router_z_loss_mlp": 0.31152344, + "step": 2535, + "time_per_iteration": 2.6970572471618652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115883, + "balance_loss_mlp": 1.0836966, + "epoch": 0.48787995382839555, + "flos": 505032910848.0, + "grad_norm": 0.06249509461195645, + "language_loss": 0.85908329, + "learning_rate": 0.0005438709785646091, + "loss": 0.87024212, + "num_input_tokens_seen": 211369888, + "router_z_loss_mlp": 0.32177734, + "step": 2536, + "time_per_iteration": 2.5461835861206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109072, + "balance_loss_mlp": 1.07688498, + "epoch": 0.4880723355136591, + "flos": 575172140544.0, + "grad_norm": 0.06859245202813889, + "language_loss": 0.87149572, + "learning_rate": 0.0005435606293034234, + "loss": 0.88258648, + "num_input_tokens_seen": 211441808, + "router_z_loss_mlp": 0.32177734, + "step": 2537, + "time_per_iteration": 2.6585540771484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100316, + "balance_loss_mlp": 1.07018018, + "epoch": 0.48826471719892267, + "flos": 561444203520.0, + "grad_norm": 0.07107602922960535, + "language_loss": 0.84916604, + "learning_rate": 0.0005432502631305016, + "loss": 0.86016917, + "num_input_tokens_seen": 211511216, + "router_z_loss_mlp": 0.30126953, + "step": 2538, + "time_per_iteration": 2.6976583003997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103363, + "balance_loss_mlp": 1.07055688, + "epoch": 0.4884570988841862, + "flos": 726188613120.0, + "grad_norm": 0.04961852862161645, + "language_loss": 0.83663356, + "learning_rate": 0.0005429398801663386, + "loss": 0.84766722, + "num_input_tokens_seen": 211589264, + "router_z_loss_mlp": 0.32788086, + "step": 2539, + "time_per_iteration": 2.9294815063476562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101134, + "balance_loss_mlp": 1.06916165, + "epoch": 0.4886494805694498, + "flos": 431019449856.0, + "grad_norm": 0.06193008336457455, + "language_loss": 0.83023834, + "learning_rate": 0.0005426294805314355, + "loss": 0.84124964, + "num_input_tokens_seen": 211652928, + "router_z_loss_mlp": 0.31958008, + "step": 2540, + "time_per_iteration": 2.5207223892211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099167, + "balance_loss_mlp": 1.06593108, + "epoch": 0.4888418622547134, + "flos": 673006579200.0, + "grad_norm": 0.0603925409034683, + "language_loss": 0.80357647, + "learning_rate": 0.0005423190643463003, + "loss": 0.8145681, + "num_input_tokens_seen": 211741664, + "router_z_loss_mlp": 0.33251953, + "step": 2541, + "time_per_iteration": 3.0720365047454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101012, + "balance_loss_mlp": 1.06915879, + "epoch": 0.4890342439399769, + "flos": 541897579008.0, + "grad_norm": 0.0609118032347285, + "language_loss": 0.83149743, + "learning_rate": 0.0005420086317314473, + "loss": 0.84250748, + "num_input_tokens_seen": 211809136, + "router_z_loss_mlp": 0.31835938, + "step": 2542, + "time_per_iteration": 2.7291080951690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098222, + "balance_loss_mlp": 1.06470084, + "epoch": 0.4892266256252405, + "flos": 590676406272.0, + "grad_norm": 0.056070719415307675, + "language_loss": 0.81426919, + "learning_rate": 0.0005416981828073971, + "loss": 0.8252514, + "num_input_tokens_seen": 211883136, + "router_z_loss_mlp": 0.33544922, + "step": 2543, + "time_per_iteration": 2.7784368991851807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033858, + "balance_loss_mlp": 1.02441669, + "epoch": 0.48941900731050403, + "flos": 1516296526848.0, + "grad_norm": 0.02316516352555082, + "language_loss": 0.77115011, + "learning_rate": 0.0005413877176946765, + "loss": 0.78148878, + "num_input_tokens_seen": 212117488, + "router_z_loss_mlp": 0.09423828, + "step": 2544, + "time_per_iteration": 4.838131666183472 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102685, + "balance_loss_mlp": 1.07023609, + "epoch": 0.4896113889957676, + "flos": 470564951040.0, + "grad_norm": 0.07449943721079477, + "language_loss": 0.85317016, + "learning_rate": 0.000541077236513819, + "loss": 0.86419702, + "num_input_tokens_seen": 212181952, + "router_z_loss_mlp": 0.32446289, + "step": 2545, + "time_per_iteration": 2.5264503955841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101803, + "balance_loss_mlp": 1.07071328, + "epoch": 0.48980377068103115, + "flos": 496557983232.0, + "grad_norm": 0.056060473734182076, + "language_loss": 0.82499588, + "learning_rate": 0.0005407667393853638, + "loss": 0.83601391, + "num_input_tokens_seen": 212252608, + "router_z_loss_mlp": 0.31054688, + "step": 2546, + "time_per_iteration": 2.66180157661438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099582, + "balance_loss_mlp": 1.06699038, + "epoch": 0.48999615236629473, + "flos": 692852382720.0, + "grad_norm": 0.06590134685442105, + "language_loss": 0.83337891, + "learning_rate": 0.0005404562264298569, + "loss": 0.84437472, + "num_input_tokens_seen": 212328560, + "router_z_loss_mlp": 0.32592773, + "step": 2547, + "time_per_iteration": 2.8525304794311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098759, + "balance_loss_mlp": 1.06390238, + "epoch": 0.49018853405155827, + "flos": 541694946816.0, + "grad_norm": 0.05425762766620139, + "language_loss": 0.83855128, + "learning_rate": 0.0005401456977678498, + "loss": 0.84953886, + "num_input_tokens_seen": 212399616, + "router_z_loss_mlp": 0.34838867, + "step": 2548, + "time_per_iteration": 2.6519198417663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098656, + "balance_loss_mlp": 1.06561112, + "epoch": 0.49038091573682185, + "flos": 695663894016.0, + "grad_norm": 0.06384769679028596, + "language_loss": 0.77718782, + "learning_rate": 0.0005398351535199008, + "loss": 0.78817439, + "num_input_tokens_seen": 212482352, + "router_z_loss_mlp": 0.33056641, + "step": 2549, + "time_per_iteration": 3.0877339839935303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096351, + "balance_loss_mlp": 1.06499887, + "epoch": 0.49057329742208544, + "flos": 596902929408.0, + "grad_norm": 0.053089286344054805, + "language_loss": 0.83930391, + "learning_rate": 0.0005395245938065735, + "loss": 0.85026741, + "num_input_tokens_seen": 212559504, + "router_z_loss_mlp": 0.31347656, + "step": 2550, + "time_per_iteration": 2.8241264820098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099597, + "balance_loss_mlp": 1.06669557, + "epoch": 0.490765679107349, + "flos": 513406522368.0, + "grad_norm": 0.0641036113016549, + "language_loss": 0.82636213, + "learning_rate": 0.0005392140187484379, + "loss": 0.83735812, + "num_input_tokens_seen": 212625664, + "router_z_loss_mlp": 0.32885742, + "step": 2551, + "time_per_iteration": 2.593710422515869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105531, + "balance_loss_mlp": 1.07332087, + "epoch": 0.49095806079261256, + "flos": 629606670336.0, + "grad_norm": 0.06156906510059403, + "language_loss": 0.89866853, + "learning_rate": 0.0005389034284660701, + "loss": 0.90972388, + "num_input_tokens_seen": 212702000, + "router_z_loss_mlp": 0.32202148, + "step": 2552, + "time_per_iteration": 2.8167800903320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112217, + "balance_loss_mlp": 1.07957709, + "epoch": 0.4911504424778761, + "flos": 915307941888.0, + "grad_norm": 0.06543971253041776, + "language_loss": 0.82440078, + "learning_rate": 0.000538592823080052, + "loss": 0.83552289, + "num_input_tokens_seen": 212785376, + "router_z_loss_mlp": 0.32641602, + "step": 2553, + "time_per_iteration": 3.190459966659546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110985, + "balance_loss_mlp": 1.07817876, + "epoch": 0.4913428241631397, + "flos": 438943380480.0, + "grad_norm": 0.061393832790464745, + "language_loss": 0.85407627, + "learning_rate": 0.000538282202710971, + "loss": 0.8651861, + "num_input_tokens_seen": 212848176, + "router_z_loss_mlp": 0.328125, + "step": 2554, + "time_per_iteration": 2.5911953449249268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111409, + "balance_loss_mlp": 1.07907963, + "epoch": 0.4915352058484032, + "flos": 636092725248.0, + "grad_norm": 0.06886607309109279, + "language_loss": 0.82350785, + "learning_rate": 0.000537971567479421, + "loss": 0.83462197, + "num_input_tokens_seen": 212917888, + "router_z_loss_mlp": 0.32324219, + "step": 2555, + "time_per_iteration": 2.7882654666900635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110188, + "balance_loss_mlp": 1.07783484, + "epoch": 0.4917275875336668, + "flos": 504518989824.0, + "grad_norm": 0.07781814230506547, + "language_loss": 0.87956369, + "learning_rate": 0.0005376609175060011, + "loss": 0.89066565, + "num_input_tokens_seen": 212986288, + "router_z_loss_mlp": 0.32348633, + "step": 2556, + "time_per_iteration": 2.6131739616394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121597, + "balance_loss_mlp": 1.08850408, + "epoch": 0.49191996921893033, + "flos": 654547267584.0, + "grad_norm": 0.07736545907619681, + "language_loss": 0.80871999, + "learning_rate": 0.0005373502529113162, + "loss": 0.81993598, + "num_input_tokens_seen": 213059504, + "router_z_loss_mlp": 0.33105469, + "step": 2557, + "time_per_iteration": 2.8115434646606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125925, + "balance_loss_mlp": 1.09154499, + "epoch": 0.4921123509041939, + "flos": 492359980032.0, + "grad_norm": 0.06369400741363575, + "language_loss": 0.81534445, + "learning_rate": 0.0005370395738159773, + "loss": 0.82660365, + "num_input_tokens_seen": 213129984, + "router_z_loss_mlp": 0.34375, + "step": 2558, + "time_per_iteration": 2.645482063293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134081, + "balance_loss_mlp": 1.10120285, + "epoch": 0.4923047325894575, + "flos": 546167162880.0, + "grad_norm": 0.06840745530844954, + "language_loss": 0.83582544, + "learning_rate": 0.0005367288803406003, + "loss": 0.84716624, + "num_input_tokens_seen": 213199184, + "router_z_loss_mlp": 0.32885742, + "step": 2559, + "time_per_iteration": 2.6290056705474854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113348, + "balance_loss_mlp": 1.09895754, + "epoch": 0.49249711427472104, + "flos": 596473072128.0, + "grad_norm": 0.06026988921967747, + "language_loss": 0.81393933, + "learning_rate": 0.0005364181726058073, + "loss": 0.82527417, + "num_input_tokens_seen": 213272480, + "router_z_loss_mlp": 0.34545898, + "step": 2560, + "time_per_iteration": 2.683072805404663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113164, + "balance_loss_mlp": 1.09771323, + "epoch": 0.4926894959599846, + "flos": 497825533440.0, + "grad_norm": 0.10364093826622443, + "language_loss": 0.8257041, + "learning_rate": 0.0005361074507322261, + "loss": 0.83702052, + "num_input_tokens_seen": 213338704, + "router_z_loss_mlp": 0.33935547, + "step": 2561, + "time_per_iteration": 2.5988388061523438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127176, + "balance_loss_mlp": 1.09420276, + "epoch": 0.49288187764524816, + "flos": 536130648576.0, + "grad_norm": 0.08607714934124724, + "language_loss": 0.81995922, + "learning_rate": 0.000535796714840489, + "loss": 0.831231, + "num_input_tokens_seen": 213406016, + "router_z_loss_mlp": 0.32983398, + "step": 2562, + "time_per_iteration": 2.617560625076294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124157, + "balance_loss_mlp": 1.09099317, + "epoch": 0.49307425933051174, + "flos": 641555707392.0, + "grad_norm": 0.06602924000575079, + "language_loss": 0.84137893, + "learning_rate": 0.0005354859650512348, + "loss": 0.85262048, + "num_input_tokens_seen": 213474016, + "router_z_loss_mlp": 0.33154297, + "step": 2563, + "time_per_iteration": 2.7547245025634766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118883, + "balance_loss_mlp": 1.08707833, + "epoch": 0.4932666410157753, + "flos": 516252911616.0, + "grad_norm": 0.060127327089604984, + "language_loss": 0.87529951, + "learning_rate": 0.0005351752014851074, + "loss": 0.88648832, + "num_input_tokens_seen": 213539696, + "router_z_loss_mlp": 0.31787109, + "step": 2564, + "time_per_iteration": 2.5543923377990723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115017, + "balance_loss_mlp": 1.08199644, + "epoch": 0.49345902270103886, + "flos": 601503625728.0, + "grad_norm": 0.057267908508465526, + "language_loss": 0.83867848, + "learning_rate": 0.0005348644242627553, + "loss": 0.84982872, + "num_input_tokens_seen": 213609504, + "router_z_loss_mlp": 0.33032227, + "step": 2565, + "time_per_iteration": 2.7361738681793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074248, + "balance_loss_mlp": 1.06585574, + "epoch": 0.49365140438630245, + "flos": 1493673716736.0, + "grad_norm": 0.028047824457769776, + "language_loss": 0.75286627, + "learning_rate": 0.0005345536335048336, + "loss": 0.76360869, + "num_input_tokens_seen": 213846064, + "router_z_loss_mlp": 0.08398438, + "step": 2566, + "time_per_iteration": 4.955476760864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126385, + "balance_loss_mlp": 1.09605825, + "epoch": 0.493843786071566, + "flos": 629599329792.0, + "grad_norm": 0.0818104780923525, + "language_loss": 0.81442422, + "learning_rate": 0.0005342428293320013, + "loss": 0.825688, + "num_input_tokens_seen": 213923216, + "router_z_loss_mlp": 0.30297852, + "step": 2567, + "time_per_iteration": 2.7417242527008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133289, + "balance_loss_mlp": 1.10160363, + "epoch": 0.49403616775682957, + "flos": 617564030976.0, + "grad_norm": 0.06602747501781048, + "language_loss": 0.83786738, + "learning_rate": 0.0005339320118649238, + "loss": 0.84920025, + "num_input_tokens_seen": 213994096, + "router_z_loss_mlp": 0.31665039, + "step": 2568, + "time_per_iteration": 2.6943705081939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141867, + "balance_loss_mlp": 1.11111128, + "epoch": 0.4942285494420931, + "flos": 577647770112.0, + "grad_norm": 0.08080827100230976, + "language_loss": 0.86562729, + "learning_rate": 0.000533621181224271, + "loss": 0.87704599, + "num_input_tokens_seen": 214069104, + "router_z_loss_mlp": 0.30737305, + "step": 2569, + "time_per_iteration": 2.7706520557403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140818, + "balance_loss_mlp": 1.10748696, + "epoch": 0.4944209311273567, + "flos": 630211995648.0, + "grad_norm": 0.0686138609954652, + "language_loss": 0.81810164, + "learning_rate": 0.0005333103375307182, + "loss": 0.82950985, + "num_input_tokens_seen": 214150368, + "router_z_loss_mlp": 0.33349609, + "step": 2570, + "time_per_iteration": 2.86440372467041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114456, + "balance_loss_mlp": 1.11196864, + "epoch": 0.4946133128126202, + "flos": 587612703744.0, + "grad_norm": 0.06689740684779927, + "language_loss": 0.86211395, + "learning_rate": 0.0005329994809049451, + "loss": 0.87355959, + "num_input_tokens_seen": 214220112, + "router_z_loss_mlp": 0.32592773, + "step": 2571, + "time_per_iteration": 2.693652629852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137339, + "balance_loss_mlp": 1.10243487, + "epoch": 0.4948056944978838, + "flos": 583718648832.0, + "grad_norm": 0.10119095251173513, + "language_loss": 0.87867194, + "learning_rate": 0.0005326886114676375, + "loss": 0.89004534, + "num_input_tokens_seen": 214294480, + "router_z_loss_mlp": 0.34936523, + "step": 2572, + "time_per_iteration": 2.7414114475250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122524, + "balance_loss_mlp": 1.09017086, + "epoch": 0.49499807618314734, + "flos": 481822027776.0, + "grad_norm": 0.06560191593845013, + "language_loss": 0.8820219, + "learning_rate": 0.0005323777293394854, + "loss": 0.89324713, + "num_input_tokens_seen": 214359568, + "router_z_loss_mlp": 0.32348633, + "step": 2573, + "time_per_iteration": 2.5354294776916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120926, + "balance_loss_mlp": 1.08838177, + "epoch": 0.4951904578684109, + "flos": 518978161152.0, + "grad_norm": 0.057507807941180766, + "language_loss": 0.8235743, + "learning_rate": 0.000532066834641184, + "loss": 0.83478361, + "num_input_tokens_seen": 214432032, + "router_z_loss_mlp": 0.32543945, + "step": 2574, + "time_per_iteration": 2.6555819511413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110618, + "balance_loss_mlp": 1.07401729, + "epoch": 0.4953828395536745, + "flos": 535505499648.0, + "grad_norm": 0.06325814646706406, + "language_loss": 0.85261214, + "learning_rate": 0.0005317559274934334, + "loss": 0.86367393, + "num_input_tokens_seen": 214504096, + "router_z_loss_mlp": 0.3215332, + "step": 2575, + "time_per_iteration": 2.7056500911712646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109208, + "balance_loss_mlp": 1.07559085, + "epoch": 0.49557522123893805, + "flos": 528564994560.0, + "grad_norm": 0.06593319291759459, + "language_loss": 0.81090045, + "learning_rate": 0.0005314450080169382, + "loss": 0.82199252, + "num_input_tokens_seen": 214575920, + "router_z_loss_mlp": 0.33642578, + "step": 2576, + "time_per_iteration": 2.6029012203216553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096802, + "balance_loss_mlp": 1.06544995, + "epoch": 0.49576760292420163, + "flos": 428007504384.0, + "grad_norm": 0.07692863745295915, + "language_loss": 0.80917549, + "learning_rate": 0.0005311340763324083, + "loss": 0.82014352, + "num_input_tokens_seen": 214641664, + "router_z_loss_mlp": 0.31323242, + "step": 2577, + "time_per_iteration": 2.5615715980529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092477, + "balance_loss_mlp": 1.06081462, + "epoch": 0.49595998460946517, + "flos": 565236942336.0, + "grad_norm": 0.06627487899009786, + "language_loss": 0.82433712, + "learning_rate": 0.0005308231325605578, + "loss": 0.83526182, + "num_input_tokens_seen": 214711744, + "router_z_loss_mlp": 0.31665039, + "step": 2578, + "time_per_iteration": 2.7060065269470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096329, + "balance_loss_mlp": 1.06473827, + "epoch": 0.49615236629472875, + "flos": 702490973184.0, + "grad_norm": 0.053568999050238396, + "language_loss": 0.77453893, + "learning_rate": 0.0005305121768221061, + "loss": 0.7855022, + "num_input_tokens_seen": 214802256, + "router_z_loss_mlp": 0.31542969, + "step": 2579, + "time_per_iteration": 3.0817010402679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008957, + "balance_loss_mlp": 1.00046897, + "epoch": 0.4963447479799923, + "flos": 1441665630720.0, + "grad_norm": 0.016247003132607515, + "language_loss": 0.75038326, + "learning_rate": 0.000530201209237777, + "loss": 0.76047277, + "num_input_tokens_seen": 215023648, + "router_z_loss_mlp": 0.08496094, + "step": 2580, + "time_per_iteration": 4.813999176025391 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082898, + "balance_loss_mlp": 1.05099821, + "epoch": 0.49653712966525587, + "flos": 537627995136.0, + "grad_norm": 0.06693938938040958, + "language_loss": 0.92087269, + "learning_rate": 0.0005298902299282984, + "loss": 0.93170166, + "num_input_tokens_seen": 215094080, + "router_z_loss_mlp": 0.3190918, + "step": 2581, + "time_per_iteration": 2.622823715209961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096273, + "balance_loss_mlp": 1.0638243, + "epoch": 0.4967295113505194, + "flos": 607280467968.0, + "grad_norm": 0.06032323910602905, + "language_loss": 0.84543586, + "learning_rate": 0.0005295792390144033, + "loss": 0.85639858, + "num_input_tokens_seen": 215165456, + "router_z_loss_mlp": 0.32446289, + "step": 2582, + "time_per_iteration": 2.68511962890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110589, + "balance_loss_mlp": 1.07236862, + "epoch": 0.496921893035783, + "flos": 474577574400.0, + "grad_norm": 0.06277392630469315, + "language_loss": 0.84023589, + "learning_rate": 0.0005292682366168294, + "loss": 0.85129476, + "num_input_tokens_seen": 215229344, + "router_z_loss_mlp": 0.33544922, + "step": 2583, + "time_per_iteration": 2.5309059619903564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095632, + "balance_loss_mlp": 1.06256378, + "epoch": 0.4971142747210466, + "flos": 597463838208.0, + "grad_norm": 0.06727867389441711, + "language_loss": 0.79973817, + "learning_rate": 0.0005289572228563181, + "loss": 0.81069446, + "num_input_tokens_seen": 215305616, + "router_z_loss_mlp": 0.33081055, + "step": 2584, + "time_per_iteration": 4.178269386291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095977, + "balance_loss_mlp": 1.06362402, + "epoch": 0.4973066564063101, + "flos": 599603586048.0, + "grad_norm": 0.05530053735156927, + "language_loss": 0.83410299, + "learning_rate": 0.000528646197853616, + "loss": 0.84506273, + "num_input_tokens_seen": 215378128, + "router_z_loss_mlp": 0.32373047, + "step": 2585, + "time_per_iteration": 2.706878900527954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101894, + "balance_loss_mlp": 1.07032776, + "epoch": 0.4974990380915737, + "flos": 649474495488.0, + "grad_norm": 0.05706454291548468, + "language_loss": 0.86111611, + "learning_rate": 0.0005283351617294735, + "loss": 0.87213504, + "num_input_tokens_seen": 215453536, + "router_z_loss_mlp": 0.31567383, + "step": 2586, + "time_per_iteration": 2.9042582511901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017241, + "balance_loss_mlp": 1.00732255, + "epoch": 0.49769141977683723, + "flos": 1529278548480.0, + "grad_norm": 0.020630801148902787, + "language_loss": 0.7663666, + "learning_rate": 0.0005280241146046456, + "loss": 0.77653909, + "num_input_tokens_seen": 215689440, + "router_z_loss_mlp": 0.09912109, + "step": 2587, + "time_per_iteration": 4.9974682331085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099928, + "balance_loss_mlp": 1.06676388, + "epoch": 0.4978838014621008, + "flos": 536370356736.0, + "grad_norm": 0.07253805127360792, + "language_loss": 0.86542678, + "learning_rate": 0.0005277130565998916, + "loss": 0.87642598, + "num_input_tokens_seen": 215759600, + "router_z_loss_mlp": 0.33178711, + "step": 2588, + "time_per_iteration": 2.7639453411102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092536, + "balance_loss_mlp": 1.06144667, + "epoch": 0.49807618314736435, + "flos": 539616867840.0, + "grad_norm": 0.05247127963424023, + "language_loss": 0.82351577, + "learning_rate": 0.0005274019878359748, + "loss": 0.83444113, + "num_input_tokens_seen": 215833920, + "router_z_loss_mlp": 0.31054688, + "step": 2589, + "time_per_iteration": 2.706843137741089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109391, + "balance_loss_mlp": 1.05943429, + "epoch": 0.49826856483262794, + "flos": 542475740160.0, + "grad_norm": 0.06499700543891603, + "language_loss": 0.87299156, + "learning_rate": 0.0005270909084336628, + "loss": 0.88393074, + "num_input_tokens_seen": 215903616, + "router_z_loss_mlp": 0.34472656, + "step": 2590, + "time_per_iteration": 2.627092123031616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095191, + "balance_loss_mlp": 1.06174052, + "epoch": 0.4984609465178915, + "flos": 522321219072.0, + "grad_norm": 0.06358626343280155, + "language_loss": 0.89192379, + "learning_rate": 0.0005267798185137276, + "loss": 0.90287566, + "num_input_tokens_seen": 215974832, + "router_z_loss_mlp": 0.33447266, + "step": 2591, + "time_per_iteration": 2.6053519248962402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098275, + "balance_loss_mlp": 1.06434834, + "epoch": 0.49865332820315506, + "flos": 574544420352.0, + "grad_norm": 0.06851868017892651, + "language_loss": 0.89230084, + "learning_rate": 0.0005264687181969444, + "loss": 0.9032836, + "num_input_tokens_seen": 216045024, + "router_z_loss_mlp": 0.33959961, + "step": 2592, + "time_per_iteration": 2.7227771282196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097456, + "balance_loss_mlp": 1.06255198, + "epoch": 0.49884570988841864, + "flos": 1013607115776.0, + "grad_norm": 0.06920907227035335, + "language_loss": 0.75419706, + "learning_rate": 0.0005261576076040937, + "loss": 0.76517165, + "num_input_tokens_seen": 216129024, + "router_z_loss_mlp": 0.34936523, + "step": 2593, + "time_per_iteration": 3.2559545040130615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096802, + "balance_loss_mlp": 1.06430554, + "epoch": 0.4990380915736822, + "flos": 559581239808.0, + "grad_norm": 0.06727068797895107, + "language_loss": 0.84462249, + "learning_rate": 0.0005258464868559591, + "loss": 0.85559052, + "num_input_tokens_seen": 216197648, + "router_z_loss_mlp": 0.32519531, + "step": 2594, + "time_per_iteration": 2.6402478218078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096341, + "balance_loss_mlp": 1.06432104, + "epoch": 0.49923047325894576, + "flos": 498954691584.0, + "grad_norm": 0.05920105575352037, + "language_loss": 0.88943779, + "learning_rate": 0.0005255353560733284, + "loss": 0.90040118, + "num_input_tokens_seen": 216263904, + "router_z_loss_mlp": 0.32006836, + "step": 2595, + "time_per_iteration": 2.5696520805358887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038056, + "balance_loss_mlp": 1.02894819, + "epoch": 0.4994228549442093, + "flos": 1496636476416.0, + "grad_norm": 0.021649763717819466, + "language_loss": 0.75578642, + "learning_rate": 0.0005252242153769931, + "loss": 0.76616704, + "num_input_tokens_seen": 216493152, + "router_z_loss_mlp": 0.09130859, + "step": 2596, + "time_per_iteration": 4.785402059555054 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096305, + "balance_loss_mlp": 1.06354642, + "epoch": 0.4996152366294729, + "flos": 557374680576.0, + "grad_norm": 0.055871474183400556, + "language_loss": 0.83429074, + "learning_rate": 0.0005249130648877492, + "loss": 0.84525383, + "num_input_tokens_seen": 216567216, + "router_z_loss_mlp": 0.32763672, + "step": 2597, + "time_per_iteration": 2.768077850341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096166, + "balance_loss_mlp": 1.0628823, + "epoch": 0.4998076183147364, + "flos": 415594105344.0, + "grad_norm": 0.06479225622172138, + "language_loss": 0.85305572, + "learning_rate": 0.0005246019047263953, + "loss": 0.86401737, + "num_input_tokens_seen": 216630624, + "router_z_loss_mlp": 0.33300781, + "step": 2598, + "time_per_iteration": 2.4575202465057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109757, + "balance_loss_mlp": 1.06471562, + "epoch": 0.5, + "flos": 467350373376.0, + "grad_norm": 0.06552285864087816, + "language_loss": 0.82716942, + "learning_rate": 0.0005242907350137353, + "loss": 0.83814514, + "num_input_tokens_seen": 216696576, + "router_z_loss_mlp": 0.32836914, + "step": 2599, + "time_per_iteration": 2.545402765274048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109995, + "balance_loss_mlp": 1.06773996, + "epoch": 0.5001923816852636, + "flos": 482718818304.0, + "grad_norm": 0.060184934170799446, + "language_loss": 0.79316103, + "learning_rate": 0.0005239795558705754, + "loss": 0.80416048, + "num_input_tokens_seen": 216767584, + "router_z_loss_mlp": 0.32202148, + "step": 2600, + "time_per_iteration": 2.6259560585021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094505, + "balance_loss_mlp": 1.06279588, + "epoch": 0.5003847633705272, + "flos": 533798180352.0, + "grad_norm": 0.07180292739942261, + "language_loss": 0.89506614, + "learning_rate": 0.0005236683674177264, + "loss": 0.90601116, + "num_input_tokens_seen": 216834320, + "router_z_loss_mlp": 0.31713867, + "step": 2601, + "time_per_iteration": 2.6216633319854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098766, + "balance_loss_mlp": 1.06531632, + "epoch": 0.5005771450557907, + "flos": 737789285376.0, + "grad_norm": 0.05820446715743302, + "language_loss": 0.82377899, + "learning_rate": 0.0005233571697760021, + "loss": 0.83476663, + "num_input_tokens_seen": 216907312, + "router_z_loss_mlp": 0.3347168, + "step": 2602, + "time_per_iteration": 2.8286540508270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107785, + "balance_loss_mlp": 1.07447851, + "epoch": 0.5007695267410542, + "flos": 778977865728.0, + "grad_norm": 0.06262770013006937, + "language_loss": 0.83391154, + "learning_rate": 0.0005230459630662203, + "loss": 0.84498942, + "num_input_tokens_seen": 216979872, + "router_z_loss_mlp": 0.33325195, + "step": 2603, + "time_per_iteration": 2.9667811393737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107928, + "balance_loss_mlp": 1.07562184, + "epoch": 0.5009619084263178, + "flos": 623476694016.0, + "grad_norm": 0.06520686758548196, + "language_loss": 0.81425881, + "learning_rate": 0.0005227347474092022, + "loss": 0.82533813, + "num_input_tokens_seen": 217054000, + "router_z_loss_mlp": 0.32250977, + "step": 2604, + "time_per_iteration": 2.7840375900268555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109186, + "balance_loss_mlp": 1.07616544, + "epoch": 0.5011542901115814, + "flos": 531087611904.0, + "grad_norm": 0.04693444517106987, + "language_loss": 0.83613992, + "learning_rate": 0.0005224235229257724, + "loss": 0.84723175, + "num_input_tokens_seen": 217126784, + "router_z_loss_mlp": 0.33032227, + "step": 2605, + "time_per_iteration": 2.6730735301971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101481, + "balance_loss_mlp": 1.06970012, + "epoch": 0.5013466717968449, + "flos": 527534581248.0, + "grad_norm": 0.05305580167320912, + "language_loss": 0.87095463, + "learning_rate": 0.0005221122897367589, + "loss": 0.88196945, + "num_input_tokens_seen": 217203056, + "router_z_loss_mlp": 0.31762695, + "step": 2606, + "time_per_iteration": 2.804161310195923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106275, + "balance_loss_mlp": 1.07384968, + "epoch": 0.5015390534821085, + "flos": 566017735680.0, + "grad_norm": 0.07402045106641765, + "language_loss": 0.81512845, + "learning_rate": 0.0005218010479629932, + "loss": 0.82619125, + "num_input_tokens_seen": 217273280, + "router_z_loss_mlp": 0.32421875, + "step": 2607, + "time_per_iteration": 2.6673223972320557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111463, + "balance_loss_mlp": 1.0777508, + "epoch": 0.5017314351673721, + "flos": 566697212928.0, + "grad_norm": 0.06695708261577327, + "language_loss": 0.82331049, + "learning_rate": 0.0005214897977253102, + "loss": 0.83442515, + "num_input_tokens_seen": 217345568, + "router_z_loss_mlp": 0.33740234, + "step": 2608, + "time_per_iteration": 2.641615390777588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109683, + "balance_loss_mlp": 1.06538224, + "epoch": 0.5019238168526357, + "flos": 522291483648.0, + "grad_norm": 0.057424183285493445, + "language_loss": 0.84299719, + "learning_rate": 0.0005211785391445473, + "loss": 0.85396552, + "num_input_tokens_seen": 217422848, + "router_z_loss_mlp": 0.31445312, + "step": 2609, + "time_per_iteration": 2.736565589904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098049, + "balance_loss_mlp": 1.06381226, + "epoch": 0.5021161985378992, + "flos": 641434567680.0, + "grad_norm": 0.15505754048194495, + "language_loss": 0.79028511, + "learning_rate": 0.0005208672723415467, + "loss": 0.8012656, + "num_input_tokens_seen": 217502896, + "router_z_loss_mlp": 0.3425293, + "step": 2610, + "time_per_iteration": 2.7740700244903564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098403, + "balance_loss_mlp": 1.06371355, + "epoch": 0.5023085802231627, + "flos": 591284302848.0, + "grad_norm": 0.06293902841757802, + "language_loss": 0.79232705, + "learning_rate": 0.0005205559974371525, + "loss": 0.80331105, + "num_input_tokens_seen": 217575072, + "router_z_loss_mlp": 0.34716797, + "step": 2611, + "time_per_iteration": 2.7674527168273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096957, + "balance_loss_mlp": 1.06455564, + "epoch": 0.5025009619084263, + "flos": 472373586432.0, + "grad_norm": 0.06270244311506845, + "language_loss": 0.82445353, + "learning_rate": 0.0005202447145522123, + "loss": 0.83542311, + "num_input_tokens_seen": 217644976, + "router_z_loss_mlp": 0.32397461, + "step": 2612, + "time_per_iteration": 2.6602847576141357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100141, + "balance_loss_mlp": 1.06700087, + "epoch": 0.5026933435936899, + "flos": 455139606528.0, + "grad_norm": 0.1708463718003921, + "language_loss": 0.79453385, + "learning_rate": 0.0005199334238075769, + "loss": 0.80553526, + "num_input_tokens_seen": 217712816, + "router_z_loss_mlp": 0.33154297, + "step": 2613, + "time_per_iteration": 2.5568900108337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101548, + "balance_loss_mlp": 1.06802678, + "epoch": 0.5028857252789535, + "flos": 491747314176.0, + "grad_norm": 0.0528689770317124, + "language_loss": 0.92217171, + "learning_rate": 0.0005196221253241, + "loss": 0.93318725, + "num_input_tokens_seen": 217780256, + "router_z_loss_mlp": 0.3347168, + "step": 2614, + "time_per_iteration": 2.6126556396484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099044, + "balance_loss_mlp": 1.06490254, + "epoch": 0.503078106964217, + "flos": 625569454080.0, + "grad_norm": 0.060608661488991786, + "language_loss": 0.83149332, + "learning_rate": 0.0005193108192226383, + "loss": 0.84248376, + "num_input_tokens_seen": 217848496, + "router_z_loss_mlp": 0.34155273, + "step": 2615, + "time_per_iteration": 2.74265456199646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099568, + "balance_loss_mlp": 1.06599879, + "epoch": 0.5032704886494805, + "flos": 579046371840.0, + "grad_norm": 0.05036532075051116, + "language_loss": 0.87427437, + "learning_rate": 0.000518999505624052, + "loss": 0.88527, + "num_input_tokens_seen": 217919216, + "router_z_loss_mlp": 0.33569336, + "step": 2616, + "time_per_iteration": 2.6870973110198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098357, + "balance_loss_mlp": 1.06483543, + "epoch": 0.5034628703347441, + "flos": 471753206784.0, + "grad_norm": 0.047696485592571475, + "language_loss": 0.83320528, + "learning_rate": 0.000518688184649203, + "loss": 0.84418881, + "num_input_tokens_seen": 217996096, + "router_z_loss_mlp": 0.33544922, + "step": 2617, + "time_per_iteration": 2.8016743659973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097701, + "balance_loss_mlp": 1.06434643, + "epoch": 0.5036552520200077, + "flos": 489837362688.0, + "grad_norm": 0.046578345586746416, + "language_loss": 0.83902323, + "learning_rate": 0.0005183768564189577, + "loss": 0.85000026, + "num_input_tokens_seen": 218063072, + "router_z_loss_mlp": 0.33374023, + "step": 2618, + "time_per_iteration": 2.5473384857177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103149, + "balance_loss_mlp": 1.07158208, + "epoch": 0.5038476337052713, + "flos": 494235426816.0, + "grad_norm": 0.06435350107251939, + "language_loss": 0.81610096, + "learning_rate": 0.0005180655210541838, + "loss": 0.82713246, + "num_input_tokens_seen": 218131056, + "router_z_loss_mlp": 0.31542969, + "step": 2619, + "time_per_iteration": 2.6063601970672607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109319, + "balance_loss_mlp": 1.07362747, + "epoch": 0.5040400153905348, + "flos": 600604263936.0, + "grad_norm": 0.07554849641883571, + "language_loss": 0.83428431, + "learning_rate": 0.0005177541786757527, + "loss": 0.8453775, + "num_input_tokens_seen": 218203536, + "router_z_loss_mlp": 0.35693359, + "step": 2620, + "time_per_iteration": 2.7651278972625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109868, + "balance_loss_mlp": 1.07589293, + "epoch": 0.5042323970757984, + "flos": 811525962240.0, + "grad_norm": 0.07801269652965341, + "language_loss": 0.8344717, + "learning_rate": 0.000517442829404538, + "loss": 0.84557039, + "num_input_tokens_seen": 218283008, + "router_z_loss_mlp": 0.33959961, + "step": 2621, + "time_per_iteration": 2.991288661956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110723, + "balance_loss_mlp": 1.07401848, + "epoch": 0.504424778761062, + "flos": 627308706816.0, + "grad_norm": 0.07509105999805234, + "language_loss": 0.87522292, + "learning_rate": 0.0005171314733614166, + "loss": 0.8862952, + "num_input_tokens_seen": 218362096, + "router_z_loss_mlp": 0.33227539, + "step": 2622, + "time_per_iteration": 2.8980941772460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107621, + "balance_loss_mlp": 1.07357442, + "epoch": 0.5046171604463255, + "flos": 515911887360.0, + "grad_norm": 0.05402993794527385, + "language_loss": 0.78464615, + "learning_rate": 0.0005168201106672671, + "loss": 0.79572237, + "num_input_tokens_seen": 218439440, + "router_z_loss_mlp": 0.34057617, + "step": 2623, + "time_per_iteration": 2.7572929859161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106537, + "balance_loss_mlp": 1.07394505, + "epoch": 0.504809542131589, + "flos": 527831188992.0, + "grad_norm": 0.0666138467605724, + "language_loss": 0.85413206, + "learning_rate": 0.0005165087414429717, + "loss": 0.86519742, + "num_input_tokens_seen": 218505936, + "router_z_loss_mlp": 0.32592773, + "step": 2624, + "time_per_iteration": 2.6197690963745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104325, + "balance_loss_mlp": 1.07178128, + "epoch": 0.5050019238168526, + "flos": 554118257664.0, + "grad_norm": 0.0890371890087988, + "language_loss": 0.83553296, + "learning_rate": 0.0005161973658094144, + "loss": 0.84657621, + "num_input_tokens_seen": 218573824, + "router_z_loss_mlp": 0.32543945, + "step": 2625, + "time_per_iteration": 2.688664436340332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114503, + "balance_loss_mlp": 1.08188796, + "epoch": 0.5051943055021162, + "flos": 574774216704.0, + "grad_norm": 0.10293664596100507, + "language_loss": 0.82534152, + "learning_rate": 0.000515885983887482, + "loss": 0.83648658, + "num_input_tokens_seen": 218648016, + "router_z_loss_mlp": 0.32592773, + "step": 2626, + "time_per_iteration": 2.7382290363311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117287, + "balance_loss_mlp": 1.08467126, + "epoch": 0.5053866871873798, + "flos": 496686463488.0, + "grad_norm": 0.06112991005583596, + "language_loss": 0.84654796, + "learning_rate": 0.0005155745957980636, + "loss": 0.85772085, + "num_input_tokens_seen": 218714128, + "router_z_loss_mlp": 0.32617188, + "step": 2627, + "time_per_iteration": 2.5833873748779297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117288, + "balance_loss_mlp": 1.0852921, + "epoch": 0.5055790688726434, + "flos": 502213685760.0, + "grad_norm": 0.05493055898841465, + "language_loss": 0.88454115, + "learning_rate": 0.000515263201662051, + "loss": 0.89571404, + "num_input_tokens_seen": 218784800, + "router_z_loss_mlp": 0.31982422, + "step": 2628, + "time_per_iteration": 2.6362485885620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112533, + "balance_loss_mlp": 1.09264278, + "epoch": 0.5057714505579068, + "flos": 845227809792.0, + "grad_norm": 0.05313724215790835, + "language_loss": 0.8271699, + "learning_rate": 0.0005149518016003378, + "loss": 0.83842319, + "num_input_tokens_seen": 218868256, + "router_z_loss_mlp": 0.3269043, + "step": 2629, + "time_per_iteration": 3.1579666137695312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121904, + "balance_loss_mlp": 1.09109998, + "epoch": 0.5059638322431704, + "flos": 497825533440.0, + "grad_norm": 0.05858406869857789, + "language_loss": 0.82627785, + "learning_rate": 0.0005146403957338206, + "loss": 0.83749688, + "num_input_tokens_seen": 218932496, + "router_z_loss_mlp": 0.30786133, + "step": 2630, + "time_per_iteration": 2.5554275512695312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128543, + "balance_loss_mlp": 1.09664297, + "epoch": 0.506156213928434, + "flos": 617843013120.0, + "grad_norm": 0.05139775445636508, + "language_loss": 0.82087231, + "learning_rate": 0.0005143289841833975, + "loss": 0.83215779, + "num_input_tokens_seen": 219010672, + "router_z_loss_mlp": 0.31884766, + "step": 2631, + "time_per_iteration": 2.866208076477051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136223, + "balance_loss_mlp": 1.10332084, + "epoch": 0.5063485956136976, + "flos": 424857166848.0, + "grad_norm": 0.07049680225310351, + "language_loss": 0.82485932, + "learning_rate": 0.0005140175670699696, + "loss": 0.83622158, + "num_input_tokens_seen": 219077104, + "router_z_loss_mlp": 0.32885742, + "step": 2632, + "time_per_iteration": 2.589662551879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136954, + "balance_loss_mlp": 1.10464883, + "epoch": 0.5065409772989612, + "flos": 569926471680.0, + "grad_norm": 0.04937719013853961, + "language_loss": 0.83023763, + "learning_rate": 0.0005137061445144395, + "loss": 0.84160721, + "num_input_tokens_seen": 219164880, + "router_z_loss_mlp": 0.32299805, + "step": 2633, + "time_per_iteration": 2.907914161682129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145225, + "balance_loss_mlp": 1.11308646, + "epoch": 0.5067333589842247, + "flos": 628801284096.0, + "grad_norm": 0.06298038708728138, + "language_loss": 0.87351924, + "learning_rate": 0.000513394716637712, + "loss": 0.8849715, + "num_input_tokens_seen": 219237376, + "router_z_loss_mlp": 0.32128906, + "step": 2634, + "time_per_iteration": 2.7392778396606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064259, + "balance_loss_mlp": 1.05677319, + "epoch": 0.5069257406694883, + "flos": 1447867187712.0, + "grad_norm": 0.03015814476855984, + "language_loss": 0.79191709, + "learning_rate": 0.0005130832835606946, + "loss": 0.80255967, + "num_input_tokens_seen": 219467632, + "router_z_loss_mlp": 0.07470703, + "step": 2635, + "time_per_iteration": 4.8476762771606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138418, + "balance_loss_mlp": 1.10549188, + "epoch": 0.5071181223547518, + "flos": 638835227136.0, + "grad_norm": 0.0660835824728649, + "language_loss": 0.80952996, + "learning_rate": 0.0005127718454042958, + "loss": 0.82091409, + "num_input_tokens_seen": 219545392, + "router_z_loss_mlp": 0.3293457, + "step": 2636, + "time_per_iteration": 2.801945447921753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122948, + "balance_loss_mlp": 1.09083319, + "epoch": 0.5073105040400154, + "flos": 713565241344.0, + "grad_norm": 0.06804864770708682, + "language_loss": 0.8454951, + "learning_rate": 0.0005124604022894269, + "loss": 0.85672456, + "num_input_tokens_seen": 219623104, + "router_z_loss_mlp": 0.32104492, + "step": 2637, + "time_per_iteration": 2.9412965774536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039176, + "balance_loss_mlp": 1.0316422, + "epoch": 0.5075028857252789, + "flos": 1436447126016.0, + "grad_norm": 0.020904454547095577, + "language_loss": 0.77188224, + "learning_rate": 0.000512148954337001, + "loss": 0.78227401, + "num_input_tokens_seen": 219853328, + "router_z_loss_mlp": 0.07519531, + "step": 2638, + "time_per_iteration": 4.857941389083862 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127594, + "balance_loss_mlp": 1.09507418, + "epoch": 0.5076952674105425, + "flos": 571147034112.0, + "grad_norm": 0.058859738864391845, + "language_loss": 0.83504963, + "learning_rate": 0.0005118375016679325, + "loss": 0.84632552, + "num_input_tokens_seen": 219925024, + "router_z_loss_mlp": 0.32495117, + "step": 2639, + "time_per_iteration": 2.7467126846313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115219, + "balance_loss_mlp": 1.08169687, + "epoch": 0.5078876490958061, + "flos": 516712504320.0, + "grad_norm": 0.06748446003243579, + "language_loss": 0.80393875, + "learning_rate": 0.0005115260444031382, + "loss": 0.81509095, + "num_input_tokens_seen": 219992752, + "router_z_loss_mlp": 0.33544922, + "step": 2640, + "time_per_iteration": 2.5831897258758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021123, + "balance_loss_mlp": 1.01354098, + "epoch": 0.5080800307810697, + "flos": 1584224428032.0, + "grad_norm": 0.011909310640322752, + "language_loss": 0.78731823, + "learning_rate": 0.000511214582663537, + "loss": 0.79752946, + "num_input_tokens_seen": 220224160, + "router_z_loss_mlp": 0.07568359, + "step": 2641, + "time_per_iteration": 4.96182656288147 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118257, + "balance_loss_mlp": 1.08506942, + "epoch": 0.5082724124663333, + "flos": 485209502208.0, + "grad_norm": 0.06566448453374539, + "language_loss": 0.87279713, + "learning_rate": 0.0005109031165700483, + "loss": 0.88397968, + "num_input_tokens_seen": 220289504, + "router_z_loss_mlp": 0.33178711, + "step": 2642, + "time_per_iteration": 2.5608396530151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114089, + "balance_loss_mlp": 1.08228409, + "epoch": 0.5084647941515967, + "flos": 682230366720.0, + "grad_norm": 0.07470030174236865, + "language_loss": 0.83423924, + "learning_rate": 0.0005105916462435945, + "loss": 0.84538019, + "num_input_tokens_seen": 220361376, + "router_z_loss_mlp": 0.31787109, + "step": 2643, + "time_per_iteration": 2.840092420578003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114248, + "balance_loss_mlp": 1.08272934, + "epoch": 0.5086571758368603, + "flos": 548736768000.0, + "grad_norm": 0.0540496938056118, + "language_loss": 0.8565858, + "learning_rate": 0.0005102801718050989, + "loss": 0.86772823, + "num_input_tokens_seen": 220434720, + "router_z_loss_mlp": 0.31494141, + "step": 2644, + "time_per_iteration": 2.687993288040161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111024, + "balance_loss_mlp": 1.08024383, + "epoch": 0.5088495575221239, + "flos": 564016379904.0, + "grad_norm": 0.0657522571772089, + "language_loss": 0.89181781, + "learning_rate": 0.0005099686933754867, + "loss": 0.90292799, + "num_input_tokens_seen": 220506208, + "router_z_loss_mlp": 0.30737305, + "step": 2645, + "time_per_iteration": 2.676555633544922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110963, + "balance_loss_mlp": 1.07589364, + "epoch": 0.5090419392073875, + "flos": 551407689216.0, + "grad_norm": 0.06525501329559952, + "language_loss": 0.84646904, + "learning_rate": 0.0005096572110756845, + "loss": 0.85756534, + "num_input_tokens_seen": 220577456, + "router_z_loss_mlp": 0.33740234, + "step": 2646, + "time_per_iteration": 2.722046136856079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098497, + "balance_loss_mlp": 1.06502318, + "epoch": 0.509234320892651, + "flos": 567779383296.0, + "grad_norm": 0.055343813231999515, + "language_loss": 0.85733652, + "learning_rate": 0.0005093457250266205, + "loss": 0.86832154, + "num_input_tokens_seen": 220649648, + "router_z_loss_mlp": 0.33496094, + "step": 2647, + "time_per_iteration": 2.726637363433838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105884, + "balance_loss_mlp": 1.07260132, + "epoch": 0.5094267025779146, + "flos": 582609314304.0, + "grad_norm": 0.07566246752622155, + "language_loss": 0.83174831, + "learning_rate": 0.000509034235349224, + "loss": 0.84280717, + "num_input_tokens_seen": 220721168, + "router_z_loss_mlp": 0.33276367, + "step": 2648, + "time_per_iteration": 2.7163400650024414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098588, + "balance_loss_mlp": 1.06480372, + "epoch": 0.5096190842631781, + "flos": 591990944256.0, + "grad_norm": 0.05726246002698667, + "language_loss": 0.81403017, + "learning_rate": 0.0005087227421644266, + "loss": 0.82501602, + "num_input_tokens_seen": 220796464, + "router_z_loss_mlp": 0.33813477, + "step": 2649, + "time_per_iteration": 2.753593683242798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090769, + "balance_loss_mlp": 1.05836821, + "epoch": 0.5098114659484417, + "flos": 513562166784.0, + "grad_norm": 0.062073163804743356, + "language_loss": 0.86567879, + "learning_rate": 0.0005084112455931602, + "loss": 0.87658644, + "num_input_tokens_seen": 220862976, + "router_z_loss_mlp": 0.32397461, + "step": 2650, + "time_per_iteration": 2.6115548610687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109227, + "balance_loss_mlp": 1.05986929, + "epoch": 0.5100038476337053, + "flos": 484631341056.0, + "grad_norm": 0.07224314043681272, + "language_loss": 0.85185993, + "learning_rate": 0.0005080997457563586, + "loss": 0.8627826, + "num_input_tokens_seen": 220926432, + "router_z_loss_mlp": 0.32397461, + "step": 2651, + "time_per_iteration": 2.562626361846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091424, + "balance_loss_mlp": 1.05797434, + "epoch": 0.5101962293189688, + "flos": 461603266560.0, + "grad_norm": 0.12059659360832554, + "language_loss": 0.79420835, + "learning_rate": 0.0005077882427749569, + "loss": 0.80512255, + "num_input_tokens_seen": 220993008, + "router_z_loss_mlp": 0.3347168, + "step": 2652, + "time_per_iteration": 2.532801866531372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094958, + "balance_loss_mlp": 1.06072092, + "epoch": 0.5103886110042324, + "flos": 587034542592.0, + "grad_norm": 0.09167141678281196, + "language_loss": 0.85065627, + "learning_rate": 0.0005074767367698913, + "loss": 0.86160588, + "num_input_tokens_seen": 221059248, + "router_z_loss_mlp": 0.34277344, + "step": 2653, + "time_per_iteration": 2.718952178955078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094006, + "balance_loss_mlp": 1.06184387, + "epoch": 0.510580992689496, + "flos": 845260116480.0, + "grad_norm": 0.05265423612140712, + "language_loss": 0.83726275, + "learning_rate": 0.0005071652278620988, + "loss": 0.84820282, + "num_input_tokens_seen": 221133712, + "router_z_loss_mlp": 0.3215332, + "step": 2654, + "time_per_iteration": 3.0578973293304443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093541, + "balance_loss_mlp": 1.06082976, + "epoch": 0.5107733743747596, + "flos": 658624131072.0, + "grad_norm": 0.057781922950613636, + "language_loss": 0.8368457, + "learning_rate": 0.0005068537161725186, + "loss": 0.84778106, + "num_input_tokens_seen": 221202192, + "router_z_loss_mlp": 0.32714844, + "step": 2655, + "time_per_iteration": 2.763050079345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109333, + "balance_loss_mlp": 1.06035662, + "epoch": 0.510965756060023, + "flos": 701732574720.0, + "grad_norm": 0.06748478853261292, + "language_loss": 0.84411526, + "learning_rate": 0.0005065422018220893, + "loss": 0.85504854, + "num_input_tokens_seen": 221277104, + "router_z_loss_mlp": 0.32983398, + "step": 2656, + "time_per_iteration": 2.8346335887908936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099653, + "balance_loss_mlp": 1.06744266, + "epoch": 0.5111581377452866, + "flos": 559731741696.0, + "grad_norm": 0.05948045399752535, + "language_loss": 0.80220234, + "learning_rate": 0.0005062306849317521, + "loss": 0.8131988, + "num_input_tokens_seen": 221352320, + "router_z_loss_mlp": 0.32226562, + "step": 2657, + "time_per_iteration": 2.8443868160247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011092, + "balance_loss_mlp": 1.07832527, + "epoch": 0.5113505194305502, + "flos": 609024863232.0, + "grad_norm": 0.06625791562361402, + "language_loss": 0.83381897, + "learning_rate": 0.0005059191656224487, + "loss": 0.84491098, + "num_input_tokens_seen": 221421056, + "router_z_loss_mlp": 0.30859375, + "step": 2658, + "time_per_iteration": 2.7093002796173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110636, + "balance_loss_mlp": 1.07883072, + "epoch": 0.5115429011158138, + "flos": 534477657600.0, + "grad_norm": 0.06672155578926672, + "language_loss": 0.88962573, + "learning_rate": 0.0005056076440151212, + "loss": 0.90073204, + "num_input_tokens_seen": 221492064, + "router_z_loss_mlp": 0.31787109, + "step": 2659, + "time_per_iteration": 2.6441903114318848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072549, + "balance_loss_mlp": 1.06272602, + "epoch": 0.5117352828010774, + "flos": 1362213780480.0, + "grad_norm": 0.032966871601824974, + "language_loss": 0.76288116, + "learning_rate": 0.0005052961202307133, + "loss": 0.77360666, + "num_input_tokens_seen": 221724672, + "router_z_loss_mlp": 0.09814453, + "step": 2660, + "time_per_iteration": 4.922346353530884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124135, + "balance_loss_mlp": 1.09111381, + "epoch": 0.5119276644863409, + "flos": 633740433408.0, + "grad_norm": 0.06875691586697516, + "language_loss": 0.87086922, + "learning_rate": 0.0005049845943901691, + "loss": 0.8821106, + "num_input_tokens_seen": 221800144, + "router_z_loss_mlp": 0.33032227, + "step": 2661, + "time_per_iteration": 2.8344130516052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107104, + "balance_loss_mlp": 1.07703924, + "epoch": 0.5121200461716044, + "flos": 585598864896.0, + "grad_norm": 0.06167047048505293, + "language_loss": 0.86829108, + "learning_rate": 0.0005046730666144338, + "loss": 0.87936211, + "num_input_tokens_seen": 221877168, + "router_z_loss_mlp": 0.30078125, + "step": 2662, + "time_per_iteration": 2.7832746505737305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110467, + "balance_loss_mlp": 1.07780349, + "epoch": 0.512312427856868, + "flos": 1032508767744.0, + "grad_norm": 0.05618387348962469, + "language_loss": 0.8811537, + "learning_rate": 0.0005043615370244532, + "loss": 0.89225835, + "num_input_tokens_seen": 221964208, + "router_z_loss_mlp": 0.32666016, + "step": 2663, + "time_per_iteration": 3.3585264682769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035614, + "balance_loss_mlp": 1.02664995, + "epoch": 0.5125048095421316, + "flos": 1537983645696.0, + "grad_norm": 0.02051261915929333, + "language_loss": 0.78244388, + "learning_rate": 0.0005040500057411736, + "loss": 0.79279995, + "num_input_tokens_seen": 222179264, + "router_z_loss_mlp": 0.08984375, + "step": 2664, + "time_per_iteration": 4.639116048812866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106628, + "balance_loss_mlp": 1.07670689, + "epoch": 0.5126971912273951, + "flos": 591116175360.0, + "grad_norm": 0.057959232824292994, + "language_loss": 0.85514903, + "learning_rate": 0.0005037384728855425, + "loss": 0.86621535, + "num_input_tokens_seen": 222259504, + "router_z_loss_mlp": 0.29882812, + "step": 2665, + "time_per_iteration": 2.7972493171691895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106635, + "balance_loss_mlp": 1.07456732, + "epoch": 0.5128895729126587, + "flos": 551657309184.0, + "grad_norm": 0.08985416920229425, + "language_loss": 0.84974313, + "learning_rate": 0.0005034269385785075, + "loss": 0.86080956, + "num_input_tokens_seen": 222330512, + "router_z_loss_mlp": 0.3203125, + "step": 2666, + "time_per_iteration": 2.6164255142211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113422, + "balance_loss_mlp": 1.08135498, + "epoch": 0.5130819545979223, + "flos": 481271030784.0, + "grad_norm": 0.09072509808708462, + "language_loss": 0.85031348, + "learning_rate": 0.0005031154029410168, + "loss": 0.86144769, + "num_input_tokens_seen": 222394000, + "router_z_loss_mlp": 0.32055664, + "step": 2667, + "time_per_iteration": 2.5188395977020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112022, + "balance_loss_mlp": 1.07873833, + "epoch": 0.5132743362831859, + "flos": 475798136832.0, + "grad_norm": 0.08345403251216076, + "language_loss": 0.86623496, + "learning_rate": 0.0005028038660940197, + "loss": 0.87735522, + "num_input_tokens_seen": 222459344, + "router_z_loss_mlp": 0.33300781, + "step": 2668, + "time_per_iteration": 2.5099217891693115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104597, + "balance_loss_mlp": 1.07360303, + "epoch": 0.5134667179684494, + "flos": 503827029504.0, + "grad_norm": 0.051835294009996306, + "language_loss": 0.8459934, + "learning_rate": 0.0005024923281584648, + "loss": 0.85703939, + "num_input_tokens_seen": 222528912, + "router_z_loss_mlp": 0.30981445, + "step": 2669, + "time_per_iteration": 2.6409177780151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113407, + "balance_loss_mlp": 1.08103013, + "epoch": 0.5136590996537129, + "flos": 503918433792.0, + "grad_norm": 0.05618222104131465, + "language_loss": 0.82660598, + "learning_rate": 0.0005021807892553026, + "loss": 0.83774006, + "num_input_tokens_seen": 222604704, + "router_z_loss_mlp": 0.32397461, + "step": 2670, + "time_per_iteration": 2.7168080806732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105439, + "balance_loss_mlp": 1.07458735, + "epoch": 0.5138514813389765, + "flos": 624623104512.0, + "grad_norm": 0.052268384876698444, + "language_loss": 0.84909296, + "learning_rate": 0.0005018692495054828, + "loss": 0.86014736, + "num_input_tokens_seen": 222677888, + "router_z_loss_mlp": 0.30834961, + "step": 2671, + "time_per_iteration": 2.769845485687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100504, + "balance_loss_mlp": 1.07063007, + "epoch": 0.5140438630242401, + "flos": 583545752064.0, + "grad_norm": 0.059994941655344296, + "language_loss": 0.80935681, + "learning_rate": 0.0005015577090299561, + "loss": 0.82036185, + "num_input_tokens_seen": 222751936, + "router_z_loss_mlp": 0.29833984, + "step": 2672, + "time_per_iteration": 2.681316375732422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110216, + "balance_loss_mlp": 1.07245326, + "epoch": 0.5142362447095037, + "flos": 487927411200.0, + "grad_norm": 0.05683100055240327, + "language_loss": 0.86631596, + "learning_rate": 0.0005012461679496729, + "loss": 0.87733757, + "num_input_tokens_seen": 222819616, + "router_z_loss_mlp": 0.29711914, + "step": 2673, + "time_per_iteration": 2.5961544513702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100162, + "balance_loss_mlp": 1.06883335, + "epoch": 0.5144286263947672, + "flos": 526857675264.0, + "grad_norm": 0.05638845856922635, + "language_loss": 0.88303345, + "learning_rate": 0.0005009346263855848, + "loss": 0.8940351, + "num_input_tokens_seen": 222888448, + "router_z_loss_mlp": 0.31323242, + "step": 2674, + "time_per_iteration": 2.607531785964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100725, + "balance_loss_mlp": 1.06903887, + "epoch": 0.5146210080800308, + "flos": 486518897664.0, + "grad_norm": 0.05523698149533188, + "language_loss": 0.84251857, + "learning_rate": 0.0005006230844586422, + "loss": 0.85352582, + "num_input_tokens_seen": 222964736, + "router_z_loss_mlp": 0.31665039, + "step": 2675, + "time_per_iteration": 2.766676664352417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106245, + "balance_loss_mlp": 1.07384396, + "epoch": 0.5148133897652943, + "flos": 515892063744.0, + "grad_norm": 0.054179282011379754, + "language_loss": 0.79421759, + "learning_rate": 0.0005003115422897968, + "loss": 0.80528009, + "num_input_tokens_seen": 223040944, + "router_z_loss_mlp": 0.32397461, + "step": 2676, + "time_per_iteration": 2.7511518001556396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101702, + "balance_loss_mlp": 1.0696342, + "epoch": 0.5150057714505579, + "flos": 511212446208.0, + "grad_norm": 0.06371145669365144, + "language_loss": 0.86998433, + "learning_rate": 0.0005, + "loss": 0.88100135, + "num_input_tokens_seen": 223109632, + "router_z_loss_mlp": 0.32055664, + "step": 2677, + "time_per_iteration": 2.6361911296844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096983, + "balance_loss_mlp": 1.06508231, + "epoch": 0.5151981531358215, + "flos": 910909877760.0, + "grad_norm": 0.06720272484805691, + "language_loss": 0.79773581, + "learning_rate": 0.0004996884577102033, + "loss": 0.80870569, + "num_input_tokens_seen": 223191648, + "router_z_loss_mlp": 0.3190918, + "step": 2678, + "time_per_iteration": 3.078381299972534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101165, + "balance_loss_mlp": 1.06726193, + "epoch": 0.515390534821085, + "flos": 471864434688.0, + "grad_norm": 0.05338815308435362, + "language_loss": 0.84963048, + "learning_rate": 0.000499376915541358, + "loss": 0.86064208, + "num_input_tokens_seen": 223265920, + "router_z_loss_mlp": 0.33911133, + "step": 2679, + "time_per_iteration": 2.6979198455810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096582, + "balance_loss_mlp": 1.06506324, + "epoch": 0.5155829165063486, + "flos": 650119468032.0, + "grad_norm": 0.0530977146452018, + "language_loss": 0.8140825, + "learning_rate": 0.0004990653736144155, + "loss": 0.82504833, + "num_input_tokens_seen": 223340688, + "router_z_loss_mlp": 0.31494141, + "step": 2680, + "time_per_iteration": 2.8514578342437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098157, + "balance_loss_mlp": 1.06547022, + "epoch": 0.5157752981916122, + "flos": 414262315008.0, + "grad_norm": 0.091547983778046, + "language_loss": 0.86229038, + "learning_rate": 0.0004987538320503271, + "loss": 0.87327194, + "num_input_tokens_seen": 223404064, + "router_z_loss_mlp": 0.3269043, + "step": 2681, + "time_per_iteration": 2.478638172149658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100486, + "balance_loss_mlp": 1.06798983, + "epoch": 0.5159676798768758, + "flos": 553841473536.0, + "grad_norm": 0.07018643811750969, + "language_loss": 0.83312553, + "learning_rate": 0.0004984422909700442, + "loss": 0.8441304, + "num_input_tokens_seen": 223476784, + "router_z_loss_mlp": 0.32495117, + "step": 2682, + "time_per_iteration": 2.6546084880828857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099092, + "balance_loss_mlp": 1.06783557, + "epoch": 0.5161600615621393, + "flos": 586510709760.0, + "grad_norm": 0.15069020701750013, + "language_loss": 0.84435642, + "learning_rate": 0.0004981307504945173, + "loss": 0.85534728, + "num_input_tokens_seen": 223542832, + "router_z_loss_mlp": 0.31225586, + "step": 2683, + "time_per_iteration": 2.71260929107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110147, + "balance_loss_mlp": 1.06914032, + "epoch": 0.5163524432474028, + "flos": 588843177984.0, + "grad_norm": 0.0559262102608404, + "language_loss": 0.89665949, + "learning_rate": 0.0004978192107446976, + "loss": 0.90767419, + "num_input_tokens_seen": 223617968, + "router_z_loss_mlp": 0.32324219, + "step": 2684, + "time_per_iteration": 2.767662763595581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097385, + "balance_loss_mlp": 1.06650972, + "epoch": 0.5165448249326664, + "flos": 503893840896.0, + "grad_norm": 0.06901755479732997, + "language_loss": 0.87345654, + "learning_rate": 0.0004975076718415353, + "loss": 0.88443041, + "num_input_tokens_seen": 223689504, + "router_z_loss_mlp": 0.30834961, + "step": 2685, + "time_per_iteration": 2.6287574768066406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110088, + "balance_loss_mlp": 1.06988525, + "epoch": 0.51673720661793, + "flos": 416760339456.0, + "grad_norm": 0.05502113672837593, + "language_loss": 0.91023147, + "learning_rate": 0.0004971961339059806, + "loss": 0.92124021, + "num_input_tokens_seen": 223752288, + "router_z_loss_mlp": 0.30957031, + "step": 2686, + "time_per_iteration": 2.4755725860595703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105178, + "balance_loss_mlp": 1.07256198, + "epoch": 0.5169295883031936, + "flos": 598971096576.0, + "grad_norm": 0.06476684801011888, + "language_loss": 0.84195554, + "learning_rate": 0.0004968845970589832, + "loss": 0.85300732, + "num_input_tokens_seen": 223822304, + "router_z_loss_mlp": 0.32617188, + "step": 2687, + "time_per_iteration": 2.6715877056121826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102414, + "balance_loss_mlp": 1.06896389, + "epoch": 0.517121969988457, + "flos": 556816343040.0, + "grad_norm": 0.0648303600022088, + "language_loss": 0.84613401, + "learning_rate": 0.0004965730614214926, + "loss": 0.85715812, + "num_input_tokens_seen": 223888592, + "router_z_loss_mlp": 0.3347168, + "step": 2688, + "time_per_iteration": 2.6734025478363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099959, + "balance_loss_mlp": 1.06720066, + "epoch": 0.5173143516737206, + "flos": 469445704704.0, + "grad_norm": 0.05675548235902804, + "language_loss": 0.85410345, + "learning_rate": 0.0004962615271144576, + "loss": 0.86510307, + "num_input_tokens_seen": 223952880, + "router_z_loss_mlp": 0.32739258, + "step": 2689, + "time_per_iteration": 2.4930050373077393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101842, + "balance_loss_mlp": 1.0703702, + "epoch": 0.5175067333589842, + "flos": 720065977344.0, + "grad_norm": 0.06418610502647971, + "language_loss": 0.82956815, + "learning_rate": 0.0004959499942588264, + "loss": 0.8405866, + "num_input_tokens_seen": 224030000, + "router_z_loss_mlp": 0.31469727, + "step": 2690, + "time_per_iteration": 2.904674768447876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078772, + "balance_loss_mlp": 1.070189, + "epoch": 0.5176991150442478, + "flos": 1466188480512.0, + "grad_norm": 0.04799778536167862, + "language_loss": 0.78200024, + "learning_rate": 0.0004956384629755469, + "loss": 0.79278797, + "num_input_tokens_seen": 224252384, + "router_z_loss_mlp": 0.0859375, + "step": 2691, + "time_per_iteration": 4.761531591415405 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105601, + "balance_loss_mlp": 1.07255602, + "epoch": 0.5178914967295114, + "flos": 612632222208.0, + "grad_norm": 0.051278898576550616, + "language_loss": 0.85877872, + "learning_rate": 0.0004953269333855661, + "loss": 0.86983472, + "num_input_tokens_seen": 224324640, + "router_z_loss_mlp": 0.33032227, + "step": 2692, + "time_per_iteration": 2.729318857192993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104257, + "balance_loss_mlp": 1.07328665, + "epoch": 0.5180838784147749, + "flos": 500926311936.0, + "grad_norm": 0.05911618517599564, + "language_loss": 0.84474307, + "learning_rate": 0.0004950154056098309, + "loss": 0.85578561, + "num_input_tokens_seen": 224398368, + "router_z_loss_mlp": 0.30932617, + "step": 2693, + "time_per_iteration": 2.6833436489105225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124437, + "balance_loss_mlp": 1.09158325, + "epoch": 0.5182762601000385, + "flos": 688832418816.0, + "grad_norm": 0.059128614865360495, + "language_loss": 0.83972096, + "learning_rate": 0.0004947038797692867, + "loss": 0.85096538, + "num_input_tokens_seen": 224465456, + "router_z_loss_mlp": 0.32861328, + "step": 2694, + "time_per_iteration": 2.82362961769104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119523, + "balance_loss_mlp": 1.08635902, + "epoch": 0.518468641785302, + "flos": 665611623936.0, + "grad_norm": 0.05692767589933962, + "language_loss": 0.77609885, + "learning_rate": 0.0004943923559848789, + "loss": 0.78729415, + "num_input_tokens_seen": 224540960, + "router_z_loss_mlp": 0.33178711, + "step": 2695, + "time_per_iteration": 2.7919468879699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123482, + "balance_loss_mlp": 1.09112859, + "epoch": 0.5186610234705656, + "flos": 566714465280.0, + "grad_norm": 0.06299979408052762, + "language_loss": 0.90267843, + "learning_rate": 0.0004940808343775515, + "loss": 0.91391325, + "num_input_tokens_seen": 224613200, + "router_z_loss_mlp": 0.32348633, + "step": 2696, + "time_per_iteration": 2.6863224506378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112015, + "balance_loss_mlp": 1.08748627, + "epoch": 0.5188534051558291, + "flos": 428879702016.0, + "grad_norm": 0.06289973384355804, + "language_loss": 0.82184958, + "learning_rate": 0.0004937693150682479, + "loss": 0.83305109, + "num_input_tokens_seen": 224677456, + "router_z_loss_mlp": 0.32666016, + "step": 2697, + "time_per_iteration": 2.5169589519500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124428, + "balance_loss_mlp": 1.09109747, + "epoch": 0.5190457868410927, + "flos": 546349971456.0, + "grad_norm": 0.0748090565006246, + "language_loss": 0.76575571, + "learning_rate": 0.0004934577981779107, + "loss": 0.77699995, + "num_input_tokens_seen": 224745600, + "router_z_loss_mlp": 0.33325195, + "step": 2698, + "time_per_iteration": 2.65891432762146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111403, + "balance_loss_mlp": 1.08103275, + "epoch": 0.5192381685263563, + "flos": 548605716480.0, + "grad_norm": 0.34709701447359415, + "language_loss": 0.81575179, + "learning_rate": 0.0004931462838274817, + "loss": 0.82689214, + "num_input_tokens_seen": 224826944, + "router_z_loss_mlp": 0.33007812, + "step": 2699, + "time_per_iteration": 2.829094648361206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113032, + "balance_loss_mlp": 1.09694147, + "epoch": 0.5194305502116199, + "flos": 575263544832.0, + "grad_norm": 0.06002024337813523, + "language_loss": 0.84538823, + "learning_rate": 0.0004928347721379011, + "loss": 0.85669148, + "num_input_tokens_seen": 224895280, + "router_z_loss_mlp": 0.33398438, + "step": 2700, + "time_per_iteration": 2.685887098312378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128853, + "balance_loss_mlp": 1.09499812, + "epoch": 0.5196229318968835, + "flos": 434258620416.0, + "grad_norm": 0.07280089907997458, + "language_loss": 0.82063133, + "learning_rate": 0.0004925232632301089, + "loss": 0.83191985, + "num_input_tokens_seen": 224961632, + "router_z_loss_mlp": 0.33886719, + "step": 2701, + "time_per_iteration": 2.5586745738983154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139592, + "balance_loss_mlp": 1.10711944, + "epoch": 0.5198153135821469, + "flos": 558881938944.0, + "grad_norm": 0.05869071142497867, + "language_loss": 0.7981168, + "learning_rate": 0.0004922117572250431, + "loss": 0.80951279, + "num_input_tokens_seen": 225032816, + "router_z_loss_mlp": 0.32495117, + "step": 2702, + "time_per_iteration": 2.652883768081665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154601, + "balance_loss_mlp": 1.12041199, + "epoch": 0.5200076952674105, + "flos": 565684051968.0, + "grad_norm": 0.08372395695209851, + "language_loss": 0.80792272, + "learning_rate": 0.0004919002542436414, + "loss": 0.8194688, + "num_input_tokens_seen": 225112736, + "router_z_loss_mlp": 0.34155273, + "step": 2703, + "time_per_iteration": 2.8069591522216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156311, + "balance_loss_mlp": 1.12131107, + "epoch": 0.5202000769526741, + "flos": 571186681344.0, + "grad_norm": 0.06918407740604555, + "language_loss": 0.81692028, + "learning_rate": 0.0004915887544068399, + "loss": 0.82848334, + "num_input_tokens_seen": 225182672, + "router_z_loss_mlp": 0.35009766, + "step": 2704, + "time_per_iteration": 2.6484997272491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159384, + "balance_loss_mlp": 1.12228656, + "epoch": 0.5203924586379377, + "flos": 694211337216.0, + "grad_norm": 0.0754612517988151, + "language_loss": 0.78528553, + "learning_rate": 0.0004912772578355736, + "loss": 0.79687935, + "num_input_tokens_seen": 225260272, + "router_z_loss_mlp": 0.37084961, + "step": 2705, + "time_per_iteration": 2.889177083969116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115407, + "balance_loss_mlp": 1.11825967, + "epoch": 0.5205848403232012, + "flos": 566509261824.0, + "grad_norm": 0.06509959827239385, + "language_loss": 0.83146906, + "learning_rate": 0.000490965764650776, + "loss": 0.84300983, + "num_input_tokens_seen": 225337120, + "router_z_loss_mlp": 0.3581543, + "step": 2706, + "time_per_iteration": 2.885923385620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115916, + "balance_loss_mlp": 1.12346911, + "epoch": 0.5207772220084648, + "flos": 1214259932160.0, + "grad_norm": 0.06296986612889613, + "language_loss": 0.82775491, + "learning_rate": 0.0004906542749733798, + "loss": 0.83934653, + "num_input_tokens_seen": 225433984, + "router_z_loss_mlp": 0.35693359, + "step": 2707, + "time_per_iteration": 3.6151185035705566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152032, + "balance_loss_mlp": 1.11653161, + "epoch": 0.5209696036937284, + "flos": 592843318272.0, + "grad_norm": 0.046885737032271585, + "language_loss": 0.85312223, + "learning_rate": 0.0004903427889243156, + "loss": 0.86464256, + "num_input_tokens_seen": 225512112, + "router_z_loss_mlp": 0.35498047, + "step": 2708, + "time_per_iteration": 2.8592212200164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169903, + "balance_loss_mlp": 1.13335371, + "epoch": 0.5211619853789919, + "flos": 522889468416.0, + "grad_norm": 0.07702072033180815, + "language_loss": 0.85470927, + "learning_rate": 0.0004900313066245134, + "loss": 0.86640829, + "num_input_tokens_seen": 225586944, + "router_z_loss_mlp": 0.36547852, + "step": 2709, + "time_per_iteration": 2.7046992778778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155719, + "balance_loss_mlp": 1.12145817, + "epoch": 0.5213543670642555, + "flos": 502799187456.0, + "grad_norm": 0.049948125939344834, + "language_loss": 0.80970949, + "learning_rate": 0.0004897198281949012, + "loss": 0.82126665, + "num_input_tokens_seen": 225657184, + "router_z_loss_mlp": 0.34277344, + "step": 2710, + "time_per_iteration": 2.728750228881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164738, + "balance_loss_mlp": 1.12837923, + "epoch": 0.521546748749519, + "flos": 585959712768.0, + "grad_norm": 0.06520397862885238, + "language_loss": 0.77818954, + "learning_rate": 0.0004894083537564057, + "loss": 0.78983688, + "num_input_tokens_seen": 225729968, + "router_z_loss_mlp": 0.36352539, + "step": 2711, + "time_per_iteration": 2.7362277507781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163972, + "balance_loss_mlp": 1.12913883, + "epoch": 0.5217391304347826, + "flos": 570119192064.0, + "grad_norm": 0.051241123094768644, + "language_loss": 0.81174654, + "learning_rate": 0.0004890968834299519, + "loss": 0.82338625, + "num_input_tokens_seen": 225801808, + "router_z_loss_mlp": 0.34838867, + "step": 2712, + "time_per_iteration": 2.768146514892578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156146, + "balance_loss_mlp": 1.12026405, + "epoch": 0.5219315121200462, + "flos": 542784457728.0, + "grad_norm": 0.05945211160457726, + "language_loss": 0.78877795, + "learning_rate": 0.0004887854173364633, + "loss": 0.80033934, + "num_input_tokens_seen": 225878576, + "router_z_loss_mlp": 0.35913086, + "step": 2713, + "time_per_iteration": 2.8356804847717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149792, + "balance_loss_mlp": 1.1157217, + "epoch": 0.5221238938053098, + "flos": 550310464512.0, + "grad_norm": 0.05274159181021226, + "language_loss": 0.81621301, + "learning_rate": 0.0004884739555968617, + "loss": 0.82771093, + "num_input_tokens_seen": 225960096, + "router_z_loss_mlp": 0.34057617, + "step": 2714, + "time_per_iteration": 2.831137180328369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102187, + "balance_loss_mlp": 1.09369898, + "epoch": 0.5223162754905732, + "flos": 1355174157312.0, + "grad_norm": 0.02923312307597506, + "language_loss": 0.78977054, + "learning_rate": 0.0004881624983320676, + "loss": 0.8007924, + "num_input_tokens_seen": 226184960, + "router_z_loss_mlp": 0.08496094, + "step": 2715, + "time_per_iteration": 4.95891547203064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149677, + "balance_loss_mlp": 1.11534512, + "epoch": 0.5225086571758368, + "flos": 567747076608.0, + "grad_norm": 0.06614932878153669, + "language_loss": 0.86865598, + "learning_rate": 0.0004878510456629992, + "loss": 0.88015276, + "num_input_tokens_seen": 226271328, + "router_z_loss_mlp": 0.34326172, + "step": 2716, + "time_per_iteration": 2.968658924102783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145792, + "balance_loss_mlp": 1.1120801, + "epoch": 0.5227010388611004, + "flos": 500158001664.0, + "grad_norm": 0.05224698034347332, + "language_loss": 0.8526777, + "learning_rate": 0.00048753959771057314, + "loss": 0.86413562, + "num_input_tokens_seen": 226340080, + "router_z_loss_mlp": 0.33740234, + "step": 2717, + "time_per_iteration": 2.6395833492279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140286, + "balance_loss_mlp": 1.10736012, + "epoch": 0.522893420546364, + "flos": 597656558592.0, + "grad_norm": 0.0584811227693513, + "language_loss": 0.83152837, + "learning_rate": 0.0004872281545957044, + "loss": 0.84293115, + "num_input_tokens_seen": 226415120, + "router_z_loss_mlp": 0.3293457, + "step": 2718, + "time_per_iteration": 2.7039849758148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135383, + "balance_loss_mlp": 1.10069275, + "epoch": 0.5230858022316276, + "flos": 664605803520.0, + "grad_norm": 0.050310473622198856, + "language_loss": 0.85946554, + "learning_rate": 0.0004869167164393055, + "loss": 0.87081933, + "num_input_tokens_seen": 226501200, + "router_z_loss_mlp": 0.34692383, + "step": 2719, + "time_per_iteration": 2.91475510597229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132518, + "balance_loss_mlp": 1.10028338, + "epoch": 0.5232781839168911, + "flos": 603843434496.0, + "grad_norm": 0.0697291023285212, + "language_loss": 0.89398658, + "learning_rate": 0.00048660528336228793, + "loss": 0.90531176, + "num_input_tokens_seen": 226582064, + "router_z_loss_mlp": 0.32226562, + "step": 2720, + "time_per_iteration": 2.792276620864868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124032, + "balance_loss_mlp": 1.09115386, + "epoch": 0.5234705656021547, + "flos": 550718300160.0, + "grad_norm": 0.05026677719306565, + "language_loss": 0.90367562, + "learning_rate": 0.0004862938554855606, + "loss": 0.91491592, + "num_input_tokens_seen": 226656448, + "router_z_loss_mlp": 0.32885742, + "step": 2721, + "time_per_iteration": 2.7964749336242676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129388, + "balance_loss_mlp": 1.09643817, + "epoch": 0.5236629472874182, + "flos": 504279281664.0, + "grad_norm": 0.0663768296863652, + "language_loss": 0.86310339, + "learning_rate": 0.0004859824329300304, + "loss": 0.87439728, + "num_input_tokens_seen": 226725568, + "router_z_loss_mlp": 0.32958984, + "step": 2722, + "time_per_iteration": 2.6039419174194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128053, + "balance_loss_mlp": 1.09403062, + "epoch": 0.5238553289726818, + "flos": 547654597632.0, + "grad_norm": 0.0581387375581185, + "language_loss": 0.84092689, + "learning_rate": 0.00048567101581660244, + "loss": 0.85220736, + "num_input_tokens_seen": 226795728, + "router_z_loss_mlp": 0.34033203, + "step": 2723, + "time_per_iteration": 2.5987517833709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125005, + "balance_loss_mlp": 1.09227037, + "epoch": 0.5240477106579453, + "flos": 531962380800.0, + "grad_norm": 0.06184026942262611, + "language_loss": 0.87479013, + "learning_rate": 0.00048535960426617956, + "loss": 0.88604021, + "num_input_tokens_seen": 226865344, + "router_z_loss_mlp": 0.32739258, + "step": 2724, + "time_per_iteration": 2.6038565635681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121549, + "balance_loss_mlp": 1.08724082, + "epoch": 0.5242400923432089, + "flos": 617939559936.0, + "grad_norm": 0.05825945935903347, + "language_loss": 0.81925243, + "learning_rate": 0.0004850481983996621, + "loss": 0.83046794, + "num_input_tokens_seen": 226936800, + "router_z_loss_mlp": 0.34350586, + "step": 2725, + "time_per_iteration": 2.7633490562438965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122144, + "balance_loss_mlp": 1.08907521, + "epoch": 0.5244324740284725, + "flos": 416686187520.0, + "grad_norm": 0.06367267368201004, + "language_loss": 0.88101065, + "learning_rate": 0.0004847367983379492, + "loss": 0.89223206, + "num_input_tokens_seen": 226998448, + "router_z_loss_mlp": 0.33081055, + "step": 2726, + "time_per_iteration": 2.520050287246704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119938, + "balance_loss_mlp": 1.08837104, + "epoch": 0.5246248557137361, + "flos": 626436509184.0, + "grad_norm": 0.059069616726974465, + "language_loss": 0.79169118, + "learning_rate": 0.00048442540420193643, + "loss": 0.80289054, + "num_input_tokens_seen": 227081872, + "router_z_loss_mlp": 0.31567383, + "step": 2727, + "time_per_iteration": 2.9363925457000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125304, + "balance_loss_mlp": 1.09278345, + "epoch": 0.5248172373989997, + "flos": 1248463590912.0, + "grad_norm": 0.06091521023817234, + "language_loss": 0.7936945, + "learning_rate": 0.0004841140161125182, + "loss": 0.8049475, + "num_input_tokens_seen": 227167744, + "router_z_loss_mlp": 0.32543945, + "step": 2728, + "time_per_iteration": 3.5786640644073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127178, + "balance_loss_mlp": 1.09666038, + "epoch": 0.5250096190842631, + "flos": 506868710400.0, + "grad_norm": 0.054648351094499156, + "language_loss": 0.85262787, + "learning_rate": 0.0004838026341905857, + "loss": 0.86389971, + "num_input_tokens_seen": 227239136, + "router_z_loss_mlp": 0.30517578, + "step": 2729, + "time_per_iteration": 2.7021641731262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113035, + "balance_loss_mlp": 1.09909368, + "epoch": 0.5252020007695267, + "flos": 611317684224.0, + "grad_norm": 0.06068419443661206, + "language_loss": 0.85131037, + "learning_rate": 0.00048349125855702844, + "loss": 0.8626138, + "num_input_tokens_seen": 227311968, + "router_z_loss_mlp": 0.3125, + "step": 2730, + "time_per_iteration": 2.794691562652588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129298, + "balance_loss_mlp": 1.09754109, + "epoch": 0.5253943824547903, + "flos": 539233998336.0, + "grad_norm": 0.0500444792759443, + "language_loss": 0.81508827, + "learning_rate": 0.00048317988933273287, + "loss": 0.82638121, + "num_input_tokens_seen": 227385248, + "router_z_loss_mlp": 0.31738281, + "step": 2731, + "time_per_iteration": 2.7251734733581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124341, + "balance_loss_mlp": 1.09291768, + "epoch": 0.5255867641400539, + "flos": 698038580736.0, + "grad_norm": 0.06596294225314246, + "language_loss": 0.82520533, + "learning_rate": 0.00048286852663858367, + "loss": 0.83644867, + "num_input_tokens_seen": 227464640, + "router_z_loss_mlp": 0.31420898, + "step": 2732, + "time_per_iteration": 2.972963571548462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120516, + "balance_loss_mlp": 1.0889498, + "epoch": 0.5257791458253175, + "flos": 667289207808.0, + "grad_norm": 0.055500139325311094, + "language_loss": 0.84107697, + "learning_rate": 0.000482557170595462, + "loss": 0.85228211, + "num_input_tokens_seen": 227542192, + "router_z_loss_mlp": 0.31542969, + "step": 2733, + "time_per_iteration": 2.858245849609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112503, + "balance_loss_mlp": 1.09401202, + "epoch": 0.525971527510581, + "flos": 483620751360.0, + "grad_norm": 0.13743136293517658, + "language_loss": 0.87933344, + "learning_rate": 0.0004822458213242475, + "loss": 0.89058375, + "num_input_tokens_seen": 227606096, + "router_z_loss_mlp": 0.31005859, + "step": 2734, + "time_per_iteration": 2.522383213043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112386, + "balance_loss_mlp": 1.08115363, + "epoch": 0.5261639091958445, + "flos": 829916264448.0, + "grad_norm": 0.05651199089550523, + "language_loss": 0.86197513, + "learning_rate": 0.00048193447894581627, + "loss": 0.87309897, + "num_input_tokens_seen": 227689552, + "router_z_loss_mlp": 0.31201172, + "step": 2735, + "time_per_iteration": 3.0866682529449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111368, + "balance_loss_mlp": 1.08235216, + "epoch": 0.5263562908811081, + "flos": 520715215872.0, + "grad_norm": 0.06879211849592783, + "language_loss": 0.88187921, + "learning_rate": 0.00048162314358104243, + "loss": 0.89301598, + "num_input_tokens_seen": 227760784, + "router_z_loss_mlp": 0.31298828, + "step": 2736, + "time_per_iteration": 2.5985138416290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108841, + "balance_loss_mlp": 1.07713127, + "epoch": 0.5265486725663717, + "flos": 574996672512.0, + "grad_norm": 0.05778047820427569, + "language_loss": 0.83687961, + "learning_rate": 0.0004813118153507969, + "loss": 0.84796798, + "num_input_tokens_seen": 227834304, + "router_z_loss_mlp": 0.31713867, + "step": 2737, + "time_per_iteration": 2.73371958732605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022846, + "balance_loss_mlp": 1.01416731, + "epoch": 0.5267410542516352, + "flos": 1547261015040.0, + "grad_norm": 0.01810308130118829, + "language_loss": 0.82447124, + "learning_rate": 0.0004810004943759482, + "loss": 0.83469975, + "num_input_tokens_seen": 228057232, + "router_z_loss_mlp": 0.08691406, + "step": 2738, + "time_per_iteration": 4.790890216827393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097772, + "balance_loss_mlp": 1.06670594, + "epoch": 0.5269334359368988, + "flos": 929952493056.0, + "grad_norm": 0.05745954748436515, + "language_loss": 0.83672923, + "learning_rate": 0.00048068918077736163, + "loss": 0.84770691, + "num_input_tokens_seen": 228140816, + "router_z_loss_mlp": 0.31030273, + "step": 2739, + "time_per_iteration": 3.239821195602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094061, + "balance_loss_mlp": 1.06309009, + "epoch": 0.5271258176221624, + "flos": 655389729792.0, + "grad_norm": 0.06477195420820829, + "language_loss": 0.81728363, + "learning_rate": 0.0004803778746759001, + "loss": 0.82822424, + "num_input_tokens_seen": 228216208, + "router_z_loss_mlp": 0.30932617, + "step": 2740, + "time_per_iteration": 2.942760944366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096968, + "balance_loss_mlp": 1.06614065, + "epoch": 0.527318199307426, + "flos": 543036648960.0, + "grad_norm": 0.05799868370730736, + "language_loss": 0.81935298, + "learning_rate": 0.00048006657619242317, + "loss": 0.83032262, + "num_input_tokens_seen": 228283184, + "router_z_loss_mlp": 0.30810547, + "step": 2741, + "time_per_iteration": 2.6397995948791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098549, + "balance_loss_mlp": 1.06550419, + "epoch": 0.5275105809926895, + "flos": 447882670080.0, + "grad_norm": 0.07558439368734231, + "language_loss": 0.78591353, + "learning_rate": 0.00047975528544778775, + "loss": 0.79689896, + "num_input_tokens_seen": 228351328, + "router_z_loss_mlp": 0.33056641, + "step": 2742, + "time_per_iteration": 2.694324493408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091173, + "balance_loss_mlp": 1.06058371, + "epoch": 0.527702962677953, + "flos": 578935143936.0, + "grad_norm": 0.06405052151098177, + "language_loss": 0.88749677, + "learning_rate": 0.00047944400256284754, + "loss": 0.89840853, + "num_input_tokens_seen": 228423632, + "router_z_loss_mlp": 0.30566406, + "step": 2743, + "time_per_iteration": 2.6816787719726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098065, + "balance_loss_mlp": 1.06809616, + "epoch": 0.5278953443632166, + "flos": 652773136896.0, + "grad_norm": 0.07088810283562207, + "language_loss": 0.80461031, + "learning_rate": 0.0004791327276584532, + "loss": 0.81559092, + "num_input_tokens_seen": 228498736, + "router_z_loss_mlp": 0.29956055, + "step": 2744, + "time_per_iteration": 2.8708317279815674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098246, + "balance_loss_mlp": 1.06596446, + "epoch": 0.5280877260484802, + "flos": 514001935872.0, + "grad_norm": 0.06685009455993486, + "language_loss": 0.8087393, + "learning_rate": 0.00047882146085545264, + "loss": 0.81972182, + "num_input_tokens_seen": 228569056, + "router_z_loss_mlp": 0.32250977, + "step": 2745, + "time_per_iteration": 2.610027551651001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021006, + "balance_loss_mlp": 1.01204121, + "epoch": 0.5282801077337438, + "flos": 1445460567552.0, + "grad_norm": 0.008936429220158798, + "language_loss": 0.75402379, + "learning_rate": 0.00047851020227469, + "loss": 0.76423383, + "num_input_tokens_seen": 228800560, + "router_z_loss_mlp": 0.08984375, + "step": 2746, + "time_per_iteration": 5.000555038452148 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097643, + "balance_loss_mlp": 1.06767416, + "epoch": 0.5284724894190073, + "flos": 604856595456.0, + "grad_norm": 0.06348628312729114, + "language_loss": 0.79553157, + "learning_rate": 0.00047819895203700684, + "loss": 0.806508, + "num_input_tokens_seen": 228869216, + "router_z_loss_mlp": 0.29907227, + "step": 2747, + "time_per_iteration": 2.7115635871887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017614, + "balance_loss_mlp": 1.0085541, + "epoch": 0.5286648711042709, + "flos": 1494956321280.0, + "grad_norm": 0.007776557121409109, + "language_loss": 0.75512433, + "learning_rate": 0.0004778877102632412, + "loss": 0.76530045, + "num_input_tokens_seen": 229085520, + "router_z_loss_mlp": 0.09082031, + "step": 2748, + "time_per_iteration": 4.672155141830444 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092605, + "balance_loss_mlp": 1.06263614, + "epoch": 0.5288572527895344, + "flos": 597616911360.0, + "grad_norm": 0.06781776650114792, + "language_loss": 0.8852309, + "learning_rate": 0.0004775764770742277, + "loss": 0.89615691, + "num_input_tokens_seen": 229160912, + "router_z_loss_mlp": 0.29931641, + "step": 2749, + "time_per_iteration": 2.8029801845550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097703, + "balance_loss_mlp": 1.06542146, + "epoch": 0.529049634474798, + "flos": 557320352256.0, + "grad_norm": 0.07126893850665976, + "language_loss": 0.86776084, + "learning_rate": 0.00047726525259079777, + "loss": 0.87873781, + "num_input_tokens_seen": 229235792, + "router_z_loss_mlp": 0.32299805, + "step": 2750, + "time_per_iteration": 2.7803709506988525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097184, + "balance_loss_mlp": 1.06568849, + "epoch": 0.5292420161600616, + "flos": 581274952704.0, + "grad_norm": 0.07487878206236488, + "language_loss": 0.88641649, + "learning_rate": 0.0004769540369337798, + "loss": 0.89738834, + "num_input_tokens_seen": 229309984, + "router_z_loss_mlp": 0.31469727, + "step": 2751, + "time_per_iteration": 2.7477662563323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103352, + "balance_loss_mlp": 1.07166588, + "epoch": 0.5294343978453251, + "flos": 608303167488.0, + "grad_norm": 0.06303354467879724, + "language_loss": 0.86111081, + "learning_rate": 0.00047664283022399794, + "loss": 0.87214434, + "num_input_tokens_seen": 229394000, + "router_z_loss_mlp": 0.31665039, + "step": 2752, + "time_per_iteration": 2.8321616649627686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111513, + "balance_loss_mlp": 1.08142424, + "epoch": 0.5296267795305887, + "flos": 646522020864.0, + "grad_norm": 0.1009265551294561, + "language_loss": 0.81372654, + "learning_rate": 0.00047633163258227376, + "loss": 0.82484162, + "num_input_tokens_seen": 229474320, + "router_z_loss_mlp": 0.30053711, + "step": 2753, + "time_per_iteration": 2.866710662841797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107072, + "balance_loss_mlp": 1.07536244, + "epoch": 0.5298191612158523, + "flos": 559746796032.0, + "grad_norm": 0.06597410250171662, + "language_loss": 0.85720521, + "learning_rate": 0.0004760204441294247, + "loss": 0.86827588, + "num_input_tokens_seen": 229543072, + "router_z_loss_mlp": 0.31689453, + "step": 2754, + "time_per_iteration": 2.635411500930786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123028, + "balance_loss_mlp": 1.09172344, + "epoch": 0.5300115429011159, + "flos": 514046352384.0, + "grad_norm": 0.06814428712155127, + "language_loss": 0.86859232, + "learning_rate": 0.00047570926498626486, + "loss": 0.87982261, + "num_input_tokens_seen": 229615296, + "router_z_loss_mlp": 0.31274414, + "step": 2755, + "time_per_iteration": 2.678027629852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139797, + "balance_loss_mlp": 1.10846841, + "epoch": 0.5302039245863793, + "flos": 672789265920.0, + "grad_norm": 0.05259166917973927, + "language_loss": 0.8179211, + "learning_rate": 0.00047539809527360474, + "loss": 0.82931906, + "num_input_tokens_seen": 229693728, + "router_z_loss_mlp": 0.31298828, + "step": 2756, + "time_per_iteration": 2.8505630493164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139868, + "balance_loss_mlp": 1.1087544, + "epoch": 0.5303963062716429, + "flos": 730836297216.0, + "grad_norm": 0.23589307030508885, + "language_loss": 0.82282543, + "learning_rate": 0.0004750869351122511, + "loss": 0.83422416, + "num_input_tokens_seen": 229772144, + "router_z_loss_mlp": 0.31079102, + "step": 2757, + "time_per_iteration": 3.007599353790283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114789, + "balance_loss_mlp": 1.11598992, + "epoch": 0.5305886879569065, + "flos": 573435085824.0, + "grad_norm": 0.06932827369161218, + "language_loss": 0.81883401, + "learning_rate": 0.00047477578462300685, + "loss": 0.83031291, + "num_input_tokens_seen": 229847024, + "router_z_loss_mlp": 0.31884766, + "step": 2758, + "time_per_iteration": 2.7112765312194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144814, + "balance_loss_mlp": 1.11215043, + "epoch": 0.5307810696421701, + "flos": 695335352832.0, + "grad_norm": 0.060390901611552056, + "language_loss": 0.79751188, + "learning_rate": 0.0004744646439266718, + "loss": 0.80895996, + "num_input_tokens_seen": 229932416, + "router_z_loss_mlp": 0.32641602, + "step": 2759, + "time_per_iteration": 2.9956624507904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141525, + "balance_loss_mlp": 1.10905194, + "epoch": 0.5309734513274337, + "flos": 648943322112.0, + "grad_norm": 0.0692957942514688, + "language_loss": 0.92371601, + "learning_rate": 0.000474153513144041, + "loss": 0.93513119, + "num_input_tokens_seen": 230010976, + "router_z_loss_mlp": 0.32470703, + "step": 2760, + "time_per_iteration": 2.902304172515869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114025, + "balance_loss_mlp": 1.10756326, + "epoch": 0.5311658330126972, + "flos": 604824288768.0, + "grad_norm": 0.06953135792158749, + "language_loss": 0.87208283, + "learning_rate": 0.00047384239239590633, + "loss": 0.88348538, + "num_input_tokens_seen": 230093344, + "router_z_loss_mlp": 0.3269043, + "step": 2761, + "time_per_iteration": 2.9197542667388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127002, + "balance_loss_mlp": 1.09414792, + "epoch": 0.5313582146979607, + "flos": 558259361280.0, + "grad_norm": 0.06520154041113266, + "language_loss": 0.89041948, + "learning_rate": 0.0004735312818030556, + "loss": 0.90168953, + "num_input_tokens_seen": 230165520, + "router_z_loss_mlp": 0.32861328, + "step": 2762, + "time_per_iteration": 2.699882745742798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128123, + "balance_loss_mlp": 1.0964613, + "epoch": 0.5315505963832243, + "flos": 508410473472.0, + "grad_norm": 0.0963196289257929, + "language_loss": 0.83125454, + "learning_rate": 0.0004732201814862727, + "loss": 0.84253573, + "num_input_tokens_seen": 230237808, + "router_z_loss_mlp": 0.31640625, + "step": 2763, + "time_per_iteration": 2.7439024448394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113884, + "balance_loss_mlp": 1.08155453, + "epoch": 0.5317429780684879, + "flos": 626439080448.0, + "grad_norm": 0.058489246415432364, + "language_loss": 0.81845987, + "learning_rate": 0.0004729090915663373, + "loss": 0.82959872, + "num_input_tokens_seen": 230321568, + "router_z_loss_mlp": 0.32324219, + "step": 2764, + "time_per_iteration": 2.880218029022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112044, + "balance_loss_mlp": 1.07930923, + "epoch": 0.5319353597537514, + "flos": 476744486400.0, + "grad_norm": 0.08176902294326427, + "language_loss": 0.85593212, + "learning_rate": 0.00047259801216402534, + "loss": 0.86705256, + "num_input_tokens_seen": 230385376, + "router_z_loss_mlp": 0.32739258, + "step": 2765, + "time_per_iteration": 2.5215423107147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113516, + "balance_loss_mlp": 1.0809716, + "epoch": 0.532127741439015, + "flos": 501635524608.0, + "grad_norm": 0.0984419544464696, + "language_loss": 0.86589384, + "learning_rate": 0.00047228694340010845, + "loss": 0.87702894, + "num_input_tokens_seen": 230449760, + "router_z_loss_mlp": 0.32543945, + "step": 2766, + "time_per_iteration": 2.615323781967163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106832, + "balance_loss_mlp": 1.07288122, + "epoch": 0.5323201231242786, + "flos": 1164586512384.0, + "grad_norm": 0.06857994992356635, + "language_loss": 0.85894436, + "learning_rate": 0.0004719758853953544, + "loss": 0.87001264, + "num_input_tokens_seen": 230536592, + "router_z_loss_mlp": 0.33984375, + "step": 2767, + "time_per_iteration": 3.580965042114258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109799, + "balance_loss_mlp": 1.07475162, + "epoch": 0.5325125048095422, + "flos": 378702273024.0, + "grad_norm": 0.07966941077078553, + "language_loss": 0.84403044, + "learning_rate": 0.00047166483827052645, + "loss": 0.85512847, + "num_input_tokens_seen": 230596688, + "router_z_loss_mlp": 0.35083008, + "step": 2768, + "time_per_iteration": 2.3937976360321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112761, + "balance_loss_mlp": 1.09797895, + "epoch": 0.5327048864948057, + "flos": 1541353121280.0, + "grad_norm": 0.05218838233145069, + "language_loss": 0.77078491, + "learning_rate": 0.00047135380214638413, + "loss": 0.78191251, + "num_input_tokens_seen": 230829408, + "router_z_loss_mlp": 0.14746094, + "step": 2769, + "time_per_iteration": 4.980372905731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112083, + "balance_loss_mlp": 1.07910919, + "epoch": 0.5328972681800692, + "flos": 911272923648.0, + "grad_norm": 0.05422451751257763, + "language_loss": 0.8393681, + "learning_rate": 0.000471042777143682, + "loss": 0.8504889, + "num_input_tokens_seen": 230912528, + "router_z_loss_mlp": 0.32958984, + "step": 2770, + "time_per_iteration": 3.2559990882873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109219, + "balance_loss_mlp": 1.07576907, + "epoch": 0.5330896498653328, + "flos": 473898097152.0, + "grad_norm": 0.05619534531580183, + "language_loss": 0.79500479, + "learning_rate": 0.0004707317633831707, + "loss": 0.80609697, + "num_input_tokens_seen": 230979424, + "router_z_loss_mlp": 0.3347168, + "step": 2771, + "time_per_iteration": 2.580369472503662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113861, + "balance_loss_mlp": 1.07976723, + "epoch": 0.5332820315505964, + "flos": 501635524608.0, + "grad_norm": 0.07426752742264173, + "language_loss": 0.78140616, + "learning_rate": 0.00047042076098559673, + "loss": 0.79254484, + "num_input_tokens_seen": 231046416, + "router_z_loss_mlp": 0.34130859, + "step": 2772, + "time_per_iteration": 2.656357765197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115026, + "balance_loss_mlp": 1.08131373, + "epoch": 0.53347441323586, + "flos": 924439951872.0, + "grad_norm": 0.07148667655520102, + "language_loss": 0.74185407, + "learning_rate": 0.00047010977007170174, + "loss": 0.75300431, + "num_input_tokens_seen": 231136064, + "router_z_loss_mlp": 0.3371582, + "step": 2773, + "time_per_iteration": 3.2167580127716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103553, + "balance_loss_mlp": 1.07079434, + "epoch": 0.5336667949211235, + "flos": 574455587328.0, + "grad_norm": 0.05649801417476766, + "language_loss": 0.82702589, + "learning_rate": 0.00046979879076222334, + "loss": 0.83806139, + "num_input_tokens_seen": 231203616, + "router_z_loss_mlp": 0.32763672, + "step": 2774, + "time_per_iteration": 2.6618025302886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109156, + "balance_loss_mlp": 1.07689798, + "epoch": 0.533859176606387, + "flos": 1064664082944.0, + "grad_norm": 0.05944272870619304, + "language_loss": 0.85247773, + "learning_rate": 0.0004694878231778939, + "loss": 0.86356932, + "num_input_tokens_seen": 231287008, + "router_z_loss_mlp": 0.32250977, + "step": 2775, + "time_per_iteration": 3.381577968597412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105801, + "balance_loss_mlp": 1.07459164, + "epoch": 0.5340515582916506, + "flos": 746602665984.0, + "grad_norm": 0.05869389504796052, + "language_loss": 0.84721255, + "learning_rate": 0.0004691768674394423, + "loss": 0.85827059, + "num_input_tokens_seen": 231365296, + "router_z_loss_mlp": 0.31176758, + "step": 2776, + "time_per_iteration": 2.9549882411956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041745, + "balance_loss_mlp": 1.03230345, + "epoch": 0.5342439399769142, + "flos": 1445685594624.0, + "grad_norm": 0.020468065913813137, + "language_loss": 0.84484011, + "learning_rate": 0.0004688659236675918, + "loss": 0.85525757, + "num_input_tokens_seen": 231579040, + "router_z_loss_mlp": 0.09423828, + "step": 2777, + "time_per_iteration": 4.780264139175415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039247, + "balance_loss_mlp": 1.03013933, + "epoch": 0.5344363216621778, + "flos": 1427569505280.0, + "grad_norm": 0.02045845897293101, + "language_loss": 0.76653534, + "learning_rate": 0.00046855499198306187, + "loss": 0.77692783, + "num_input_tokens_seen": 231812736, + "router_z_loss_mlp": 0.09130859, + "step": 2778, + "time_per_iteration": 5.030272960662842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101269, + "balance_loss_mlp": 1.06870127, + "epoch": 0.5346287033474413, + "flos": 527618644992.0, + "grad_norm": 0.06089610481991967, + "language_loss": 0.7961477, + "learning_rate": 0.00046824407250656676, + "loss": 0.80716044, + "num_input_tokens_seen": 231883840, + "router_z_loss_mlp": 0.32568359, + "step": 2779, + "time_per_iteration": 2.6681063175201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096395, + "balance_loss_mlp": 1.06537652, + "epoch": 0.5348210850327049, + "flos": 510762765312.0, + "grad_norm": 0.04990324067280663, + "language_loss": 0.83819127, + "learning_rate": 0.0004679331653588161, + "loss": 0.84915525, + "num_input_tokens_seen": 231955360, + "router_z_loss_mlp": 0.30981445, + "step": 2780, + "time_per_iteration": 2.635774612426758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092346, + "balance_loss_mlp": 1.05999231, + "epoch": 0.5350134667179685, + "flos": 462668184576.0, + "grad_norm": 0.06684745885443293, + "language_loss": 0.85806221, + "learning_rate": 0.0004676222706605147, + "loss": 0.86898565, + "num_input_tokens_seen": 232027088, + "router_z_loss_mlp": 0.32348633, + "step": 2781, + "time_per_iteration": 2.6137733459472656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092057, + "balance_loss_mlp": 1.05886936, + "epoch": 0.535205848403232, + "flos": 708875712000.0, + "grad_norm": 0.08426708268962642, + "language_loss": 0.85464495, + "learning_rate": 0.0004673113885323626, + "loss": 0.86556554, + "num_input_tokens_seen": 232099472, + "router_z_loss_mlp": 0.33203125, + "step": 2782, + "time_per_iteration": 2.861581802368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083804, + "balance_loss_mlp": 1.05083072, + "epoch": 0.5353982300884956, + "flos": 894241575936.0, + "grad_norm": 0.060311716473253056, + "language_loss": 0.78792584, + "learning_rate": 0.00046700051909505494, + "loss": 0.79876387, + "num_input_tokens_seen": 232182528, + "router_z_loss_mlp": 0.32983398, + "step": 2783, + "time_per_iteration": 3.182298183441162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089723, + "balance_loss_mlp": 1.05407953, + "epoch": 0.5355906117737591, + "flos": 535965092352.0, + "grad_norm": 0.06678042842361867, + "language_loss": 0.84239137, + "learning_rate": 0.000466689662469282, + "loss": 0.85328859, + "num_input_tokens_seen": 232253344, + "router_z_loss_mlp": 0.35644531, + "step": 2784, + "time_per_iteration": 2.6519503593444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082012, + "balance_loss_mlp": 1.04891968, + "epoch": 0.5357829934590227, + "flos": 868846528512.0, + "grad_norm": 0.06002174049054728, + "language_loss": 0.83905756, + "learning_rate": 0.00046637881877572917, + "loss": 0.84987772, + "num_input_tokens_seen": 232337232, + "router_z_loss_mlp": 0.33105469, + "step": 2785, + "time_per_iteration": 3.1058127880096436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084703, + "balance_loss_mlp": 1.051754, + "epoch": 0.5359753751442863, + "flos": 553287905280.0, + "grad_norm": 0.0580195679012457, + "language_loss": 0.8490684, + "learning_rate": 0.0004660679881350764, + "loss": 0.85991538, + "num_input_tokens_seen": 232412864, + "router_z_loss_mlp": 0.32958984, + "step": 2786, + "time_per_iteration": 2.77021861076355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053559, + "balance_loss_mlp": 1.0447371, + "epoch": 0.5361677568295499, + "flos": 1480499347968.0, + "grad_norm": 0.032864625150969516, + "language_loss": 0.75608146, + "learning_rate": 0.0004657571706679988, + "loss": 0.76661706, + "num_input_tokens_seen": 232639888, + "router_z_loss_mlp": 0.08837891, + "step": 2787, + "time_per_iteration": 5.029211044311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087215, + "balance_loss_mlp": 1.05335903, + "epoch": 0.5363601385148133, + "flos": 806255700480.0, + "grad_norm": 0.07679411484967892, + "language_loss": 0.77928644, + "learning_rate": 0.0004654463664951667, + "loss": 0.79015857, + "num_input_tokens_seen": 232719248, + "router_z_loss_mlp": 0.33886719, + "step": 2788, + "time_per_iteration": 2.9762089252471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088019, + "balance_loss_mlp": 1.05464029, + "epoch": 0.5365525202000769, + "flos": 507879300096.0, + "grad_norm": 0.06025701653165108, + "language_loss": 0.83150423, + "learning_rate": 0.0004651355757372447, + "loss": 0.84238434, + "num_input_tokens_seen": 232788464, + "router_z_loss_mlp": 0.33398438, + "step": 2789, + "time_per_iteration": 2.5971946716308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089252, + "balance_loss_mlp": 1.05604005, + "epoch": 0.5367449018853405, + "flos": 528930611712.0, + "grad_norm": 0.08338964083328992, + "language_loss": 0.8607617, + "learning_rate": 0.00046482479851489274, + "loss": 0.87165421, + "num_input_tokens_seen": 232859792, + "router_z_loss_mlp": 0.33227539, + "step": 2790, + "time_per_iteration": 2.6431193351745605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109405, + "balance_loss_mlp": 1.06119633, + "epoch": 0.5369372835706041, + "flos": 649934088192.0, + "grad_norm": 0.07763218475438792, + "language_loss": 0.77609432, + "learning_rate": 0.00046451403494876525, + "loss": 0.78703481, + "num_input_tokens_seen": 232941472, + "router_z_loss_mlp": 0.32861328, + "step": 2791, + "time_per_iteration": 2.860164165496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092942, + "balance_loss_mlp": 1.05918157, + "epoch": 0.5371296652558677, + "flos": 584489530368.0, + "grad_norm": 0.06279789357775317, + "language_loss": 0.84532517, + "learning_rate": 0.0004642032851595111, + "loss": 0.85625458, + "num_input_tokens_seen": 233017120, + "router_z_loss_mlp": 0.33789062, + "step": 2792, + "time_per_iteration": 2.7511003017425537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106074, + "balance_loss_mlp": 1.07081246, + "epoch": 0.5373220469411312, + "flos": 595872516096.0, + "grad_norm": 0.05029896863334896, + "language_loss": 0.85103881, + "learning_rate": 0.00046389254926777404, + "loss": 0.86209953, + "num_input_tokens_seen": 233095408, + "router_z_loss_mlp": 0.35253906, + "step": 2793, + "time_per_iteration": 2.7946324348449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105034, + "balance_loss_mlp": 1.07229924, + "epoch": 0.5375144286263948, + "flos": 1114426335744.0, + "grad_norm": 0.05473465194283574, + "language_loss": 0.78127646, + "learning_rate": 0.0004635818273941926, + "loss": 0.79232681, + "num_input_tokens_seen": 233191056, + "router_z_loss_mlp": 0.32739258, + "step": 2794, + "time_per_iteration": 3.5742921829223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109863, + "balance_loss_mlp": 1.07641304, + "epoch": 0.5377068103116583, + "flos": 595608215040.0, + "grad_norm": 0.07615315185796866, + "language_loss": 0.82079315, + "learning_rate": 0.0004632711196593997, + "loss": 0.83189178, + "num_input_tokens_seen": 233265536, + "router_z_loss_mlp": 0.3347168, + "step": 2795, + "time_per_iteration": 2.7694544792175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110741, + "balance_loss_mlp": 1.07907939, + "epoch": 0.5378991919969219, + "flos": 884200292352.0, + "grad_norm": 0.07020702036152926, + "language_loss": 0.85457337, + "learning_rate": 0.00046296042618402297, + "loss": 0.86568069, + "num_input_tokens_seen": 233348224, + "router_z_loss_mlp": 0.31640625, + "step": 2796, + "time_per_iteration": 3.0587034225463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109738, + "balance_loss_mlp": 1.07883883, + "epoch": 0.5380915736821854, + "flos": 710664523776.0, + "grad_norm": 0.06759922925686453, + "language_loss": 0.7969842, + "learning_rate": 0.0004626497470886839, + "loss": 0.80808163, + "num_input_tokens_seen": 233429344, + "router_z_loss_mlp": 0.30883789, + "step": 2797, + "time_per_iteration": 3.002824068069458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105945, + "balance_loss_mlp": 1.07299602, + "epoch": 0.538283955367449, + "flos": 556999151616.0, + "grad_norm": 0.07466819588637175, + "language_loss": 0.82158947, + "learning_rate": 0.00046233908249399897, + "loss": 0.83264899, + "num_input_tokens_seen": 233504944, + "router_z_loss_mlp": 0.32958984, + "step": 2798, + "time_per_iteration": 2.7746241092681885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097876, + "balance_loss_mlp": 1.06559372, + "epoch": 0.5384763370527126, + "flos": 513470762496.0, + "grad_norm": 0.05453000178981586, + "language_loss": 0.78238356, + "learning_rate": 0.00046202843252057905, + "loss": 0.79336226, + "num_input_tokens_seen": 233573072, + "router_z_loss_mlp": 0.32275391, + "step": 2799, + "time_per_iteration": 2.581350803375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097308, + "balance_loss_mlp": 1.06478727, + "epoch": 0.5386687187379762, + "flos": 489736046592.0, + "grad_norm": 0.06584834464031906, + "language_loss": 0.84020996, + "learning_rate": 0.00046171779728902896, + "loss": 0.85118306, + "num_input_tokens_seen": 233640896, + "router_z_loss_mlp": 0.32495117, + "step": 2800, + "time_per_iteration": 2.577760934829712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092142, + "balance_loss_mlp": 1.05988431, + "epoch": 0.5388611004232398, + "flos": 482657149440.0, + "grad_norm": 0.0769580423168035, + "language_loss": 0.85918987, + "learning_rate": 0.000461407176919948, + "loss": 0.87011129, + "num_input_tokens_seen": 233703904, + "router_z_loss_mlp": 0.32250977, + "step": 2801, + "time_per_iteration": 2.5490942001342773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093913, + "balance_loss_mlp": 1.06189322, + "epoch": 0.5390534821085032, + "flos": 560984610816.0, + "grad_norm": 0.05361052263899676, + "language_loss": 0.85168314, + "learning_rate": 0.00046109657153392997, + "loss": 0.86262226, + "num_input_tokens_seen": 233779248, + "router_z_loss_mlp": 0.32006836, + "step": 2802, + "time_per_iteration": 2.7699196338653564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095158, + "balance_loss_mlp": 1.06132686, + "epoch": 0.5392458637937668, + "flos": 488377092096.0, + "grad_norm": 0.07003946535384918, + "language_loss": 0.82877356, + "learning_rate": 0.0004607859812515622, + "loss": 0.83972514, + "num_input_tokens_seen": 233847520, + "router_z_loss_mlp": 0.33862305, + "step": 2803, + "time_per_iteration": 2.6007485389709473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093716, + "balance_loss_mlp": 1.06198251, + "epoch": 0.5394382454790304, + "flos": 512057479680.0, + "grad_norm": 0.06322278970979951, + "language_loss": 0.88066649, + "learning_rate": 0.00046047540619342667, + "loss": 0.89160359, + "num_input_tokens_seen": 233911328, + "router_z_loss_mlp": 0.31713867, + "step": 2804, + "time_per_iteration": 2.5943124294281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092603, + "balance_loss_mlp": 1.06163239, + "epoch": 0.539630627164294, + "flos": 567586662912.0, + "grad_norm": 0.060964528711389604, + "language_loss": 0.80115181, + "learning_rate": 0.00046016484648009933, + "loss": 0.81207782, + "num_input_tokens_seen": 233987104, + "router_z_loss_mlp": 0.30957031, + "step": 2805, + "time_per_iteration": 2.707387924194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096878, + "balance_loss_mlp": 1.0659312, + "epoch": 0.5398230088495575, + "flos": 526462322688.0, + "grad_norm": 0.05960799154457967, + "language_loss": 0.80838758, + "learning_rate": 0.0004598543022321501, + "loss": 0.81935638, + "num_input_tokens_seen": 234057216, + "router_z_loss_mlp": 0.30908203, + "step": 2806, + "time_per_iteration": 2.606360673904419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103257, + "balance_loss_mlp": 1.07080865, + "epoch": 0.5400153905348211, + "flos": 538764493824.0, + "grad_norm": 0.059370042319646085, + "language_loss": 0.80030453, + "learning_rate": 0.0004595437735701433, + "loss": 0.81133705, + "num_input_tokens_seen": 234129984, + "router_z_loss_mlp": 0.32446289, + "step": 2807, + "time_per_iteration": 2.674914836883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096509, + "balance_loss_mlp": 1.06448901, + "epoch": 0.5402077722200846, + "flos": 513539771904.0, + "grad_norm": 0.07129928038264445, + "language_loss": 0.83467567, + "learning_rate": 0.00045923326061463623, + "loss": 0.84564078, + "num_input_tokens_seen": 234203920, + "router_z_loss_mlp": 0.32006836, + "step": 2808, + "time_per_iteration": 2.7732136249542236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093728, + "balance_loss_mlp": 1.0615654, + "epoch": 0.5404001539053482, + "flos": 676258232832.0, + "grad_norm": 0.061183409959599915, + "language_loss": 0.81861985, + "learning_rate": 0.00045892276348618113, + "loss": 0.82955706, + "num_input_tokens_seen": 234285440, + "router_z_loss_mlp": 0.3215332, + "step": 2809, + "time_per_iteration": 2.9496963024139404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041005, + "balance_loss_mlp": 1.03318524, + "epoch": 0.5405925355906118, + "flos": 1554834009600.0, + "grad_norm": 0.03295349175272743, + "language_loss": 0.78260827, + "learning_rate": 0.0004586122823053235, + "loss": 0.79301834, + "num_input_tokens_seen": 234521424, + "router_z_loss_mlp": 0.078125, + "step": 2810, + "time_per_iteration": 4.980771064758301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095175, + "balance_loss_mlp": 1.06375122, + "epoch": 0.5407849172758753, + "flos": 647310154752.0, + "grad_norm": 0.048089637178950914, + "language_loss": 0.80807102, + "learning_rate": 0.000458301817192603, + "loss": 0.81902277, + "num_input_tokens_seen": 234601632, + "router_z_loss_mlp": 0.31396484, + "step": 2811, + "time_per_iteration": 2.819394111633301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014174, + "balance_loss_mlp": 1.00659227, + "epoch": 0.5409772989611389, + "flos": 1407407643648.0, + "grad_norm": 0.018125943247431338, + "language_loss": 0.8084178, + "learning_rate": 0.00045799136826855263, + "loss": 0.81855953, + "num_input_tokens_seen": 234825776, + "router_z_loss_mlp": 0.07568359, + "step": 2812, + "time_per_iteration": 4.830869197845459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094625, + "balance_loss_mlp": 1.06312966, + "epoch": 0.5411696806464025, + "flos": 554389899264.0, + "grad_norm": 0.07142535441885249, + "language_loss": 0.8774603, + "learning_rate": 0.00045768093565369983, + "loss": 0.88840652, + "num_input_tokens_seen": 234901504, + "router_z_loss_mlp": 0.31494141, + "step": 2813, + "time_per_iteration": 2.7351324558258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101251, + "balance_loss_mlp": 1.06911242, + "epoch": 0.5413620623316661, + "flos": 528122654208.0, + "grad_norm": 0.0566212514723048, + "language_loss": 0.82215679, + "learning_rate": 0.0004573705194685646, + "loss": 0.83316934, + "num_input_tokens_seen": 234970288, + "router_z_loss_mlp": 0.32128906, + "step": 2814, + "time_per_iteration": 2.6945576667785645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100716, + "balance_loss_mlp": 1.06860089, + "epoch": 0.5415544440169295, + "flos": 598741300224.0, + "grad_norm": 0.06333436634677812, + "language_loss": 0.85428321, + "learning_rate": 0.00045706011983366157, + "loss": 0.86529034, + "num_input_tokens_seen": 235039984, + "router_z_loss_mlp": 0.32080078, + "step": 2815, + "time_per_iteration": 2.681619882583618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108066, + "balance_loss_mlp": 1.07623768, + "epoch": 0.5417468257021931, + "flos": 470757671424.0, + "grad_norm": 0.068256039366798, + "language_loss": 0.8269453, + "learning_rate": 0.00045674973686949847, + "loss": 0.83802599, + "num_input_tokens_seen": 235105232, + "router_z_loss_mlp": 0.31835938, + "step": 2816, + "time_per_iteration": 2.5405073165893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109346, + "balance_loss_mlp": 1.07830381, + "epoch": 0.5419392073874567, + "flos": 680819281920.0, + "grad_norm": 0.0555657817841838, + "language_loss": 0.85590029, + "learning_rate": 0.0004564393706965766, + "loss": 0.86699367, + "num_input_tokens_seen": 235192560, + "router_z_loss_mlp": 0.31005859, + "step": 2817, + "time_per_iteration": 2.9834089279174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102481, + "balance_loss_mlp": 1.07079506, + "epoch": 0.5421315890727203, + "flos": 462374148096.0, + "grad_norm": 0.052731051534337416, + "language_loss": 0.81111342, + "learning_rate": 0.00045612902143539116, + "loss": 0.82213825, + "num_input_tokens_seen": 235258448, + "router_z_loss_mlp": 0.31665039, + "step": 2818, + "time_per_iteration": 2.5867249965667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099209, + "balance_loss_mlp": 1.06935942, + "epoch": 0.5423239707579839, + "flos": 436959277056.0, + "grad_norm": 0.08027643777933474, + "language_loss": 0.82169372, + "learning_rate": 0.00045581868920642986, + "loss": 0.83268583, + "num_input_tokens_seen": 235322176, + "router_z_loss_mlp": 0.29833984, + "step": 2819, + "time_per_iteration": 2.538219928741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100605, + "balance_loss_mlp": 1.06968212, + "epoch": 0.5425163524432474, + "flos": 458314536960.0, + "grad_norm": 0.056746529630016036, + "language_loss": 0.79290533, + "learning_rate": 0.00045550837413017457, + "loss": 0.80391139, + "num_input_tokens_seen": 235390960, + "router_z_loss_mlp": 0.30883789, + "step": 2820, + "time_per_iteration": 2.6461877822875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100089, + "balance_loss_mlp": 1.06995249, + "epoch": 0.542708734128511, + "flos": 419495500800.0, + "grad_norm": 0.06471497165860861, + "language_loss": 0.85196662, + "learning_rate": 0.0004551980763271005, + "loss": 0.86296749, + "num_input_tokens_seen": 235460976, + "router_z_loss_mlp": 0.30102539, + "step": 2821, + "time_per_iteration": 2.6883745193481445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102076, + "balance_loss_mlp": 1.07015133, + "epoch": 0.5429011158137745, + "flos": 678454880256.0, + "grad_norm": 0.058885459141671155, + "language_loss": 0.84080005, + "learning_rate": 0.0004548877959176756, + "loss": 0.85182083, + "num_input_tokens_seen": 235540912, + "router_z_loss_mlp": 0.3190918, + "step": 2822, + "time_per_iteration": 2.861867666244507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096714, + "balance_loss_mlp": 1.06595802, + "epoch": 0.5430934974990381, + "flos": 540924065280.0, + "grad_norm": 0.06589540393120931, + "language_loss": 0.86233151, + "learning_rate": 0.00045457753302236166, + "loss": 0.87329865, + "num_input_tokens_seen": 235608736, + "router_z_loss_mlp": 0.30737305, + "step": 2823, + "time_per_iteration": 2.687164068222046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097063, + "balance_loss_mlp": 1.06685555, + "epoch": 0.5432858791843016, + "flos": 658468486656.0, + "grad_norm": 0.07425338305054356, + "language_loss": 0.87034917, + "learning_rate": 0.00045426728776161353, + "loss": 0.88131976, + "num_input_tokens_seen": 235678720, + "router_z_loss_mlp": 0.30175781, + "step": 2824, + "time_per_iteration": 2.7938835620880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104052, + "balance_loss_mlp": 1.07224679, + "epoch": 0.5434782608695652, + "flos": 531935216640.0, + "grad_norm": 0.05711338707468448, + "language_loss": 0.81608665, + "learning_rate": 0.00045395706025587863, + "loss": 0.82712722, + "num_input_tokens_seen": 235748704, + "router_z_loss_mlp": 0.31787109, + "step": 2825, + "time_per_iteration": 2.6212074756622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099159, + "balance_loss_mlp": 1.06907105, + "epoch": 0.5436706425548288, + "flos": 608501030400.0, + "grad_norm": 0.07865669635555295, + "language_loss": 0.8299852, + "learning_rate": 0.00045364685062559843, + "loss": 0.84097683, + "num_input_tokens_seen": 235828224, + "router_z_loss_mlp": 0.30078125, + "step": 2826, + "time_per_iteration": 2.8868184089660645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104022, + "balance_loss_mlp": 1.07505381, + "epoch": 0.5438630242400924, + "flos": 705418854912.0, + "grad_norm": 0.06023434626032839, + "language_loss": 0.91765273, + "learning_rate": 0.0004533366589912067, + "loss": 0.92869294, + "num_input_tokens_seen": 235909392, + "router_z_loss_mlp": 0.28955078, + "step": 2827, + "time_per_iteration": 2.9981062412261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105445, + "balance_loss_mlp": 1.07557106, + "epoch": 0.544055405925356, + "flos": 856425788928.0, + "grad_norm": 0.06990968055660145, + "language_loss": 0.78070033, + "learning_rate": 0.0004530264854731306, + "loss": 0.79175478, + "num_input_tokens_seen": 235983888, + "router_z_loss_mlp": 0.29858398, + "step": 2828, + "time_per_iteration": 3.0054330825805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107215, + "balance_loss_mlp": 1.07605386, + "epoch": 0.5442477876106194, + "flos": 571779523584.0, + "grad_norm": 0.05020371190449787, + "language_loss": 0.84383601, + "learning_rate": 0.00045271633019179034, + "loss": 0.85490811, + "num_input_tokens_seen": 236063056, + "router_z_loss_mlp": 0.3112793, + "step": 2829, + "time_per_iteration": 2.775956630706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107068, + "balance_loss_mlp": 1.07605028, + "epoch": 0.544440169295883, + "flos": 625556971008.0, + "grad_norm": 0.05805566098722391, + "language_loss": 0.88203323, + "learning_rate": 0.0004524061932675986, + "loss": 0.8931039, + "num_input_tokens_seen": 236141104, + "router_z_loss_mlp": 0.30981445, + "step": 2830, + "time_per_iteration": 2.8221793174743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106595, + "balance_loss_mlp": 1.07555294, + "epoch": 0.5446325509811466, + "flos": 836244103680.0, + "grad_norm": 0.0740029895366448, + "language_loss": 0.87459874, + "learning_rate": 0.00045209607482096125, + "loss": 0.8856647, + "num_input_tokens_seen": 236220320, + "router_z_loss_mlp": 0.31005859, + "step": 2831, + "time_per_iteration": 3.0393142700195312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102187, + "balance_loss_mlp": 1.0710969, + "epoch": 0.5448249326664102, + "flos": 483381043200.0, + "grad_norm": 0.08209208283258153, + "language_loss": 0.84651136, + "learning_rate": 0.0004517859749722772, + "loss": 0.85753322, + "num_input_tokens_seen": 236288208, + "router_z_loss_mlp": 0.31054688, + "step": 2832, + "time_per_iteration": 2.6821095943450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105232, + "balance_loss_mlp": 1.07368898, + "epoch": 0.5450173143516738, + "flos": 561107948544.0, + "grad_norm": 0.07359331276456935, + "language_loss": 0.79572821, + "learning_rate": 0.0004514758938419376, + "loss": 0.80678058, + "num_input_tokens_seen": 236366864, + "router_z_loss_mlp": 0.31518555, + "step": 2833, + "time_per_iteration": 2.8375093936920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080375, + "balance_loss_mlp": 1.07288861, + "epoch": 0.5452096960369373, + "flos": 1470420988416.0, + "grad_norm": 0.03314547284214794, + "language_loss": 0.76920587, + "learning_rate": 0.0004511658315503268, + "loss": 0.78000963, + "num_input_tokens_seen": 236597120, + "router_z_loss_mlp": 0.07470703, + "step": 2834, + "time_per_iteration": 4.963228225708008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099746, + "balance_loss_mlp": 1.06930006, + "epoch": 0.5454020777222008, + "flos": 465064892928.0, + "grad_norm": 0.057187543491433894, + "language_loss": 0.83827722, + "learning_rate": 0.00045085578821782175, + "loss": 0.84927469, + "num_input_tokens_seen": 236664192, + "router_z_loss_mlp": 0.3046875, + "step": 2835, + "time_per_iteration": 2.5562217235565186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054355, + "balance_loss_mlp": 1.04696393, + "epoch": 0.5455944594074644, + "flos": 1469657820672.0, + "grad_norm": 0.02358753311446476, + "language_loss": 0.76134741, + "learning_rate": 0.0004505457639647917, + "loss": 0.77189088, + "num_input_tokens_seen": 236888784, + "router_z_loss_mlp": 0.07373047, + "step": 2836, + "time_per_iteration": 4.959676742553711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100094, + "balance_loss_mlp": 1.06983829, + "epoch": 0.545786841092728, + "flos": 533180371968.0, + "grad_norm": 0.0408398110042356, + "language_loss": 0.80949795, + "learning_rate": 0.00045023575891159866, + "loss": 0.82049894, + "num_input_tokens_seen": 236962528, + "router_z_loss_mlp": 0.30200195, + "step": 2837, + "time_per_iteration": 2.74700665473938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102638, + "balance_loss_mlp": 1.01894093, + "epoch": 0.5459792227779915, + "flos": 1352389810176.0, + "grad_norm": 0.01524116386105569, + "language_loss": 0.74763811, + "learning_rate": 0.00044992577317859764, + "loss": 0.75790191, + "num_input_tokens_seen": 237179360, + "router_z_loss_mlp": 0.07421875, + "step": 2838, + "time_per_iteration": 4.9733850955963135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103445, + "balance_loss_mlp": 1.07366681, + "epoch": 0.5461716044632551, + "flos": 637881537024.0, + "grad_norm": 0.05292635351535042, + "language_loss": 0.78244042, + "learning_rate": 0.0004496158068861354, + "loss": 0.79347491, + "num_input_tokens_seen": 237256240, + "router_z_loss_mlp": 0.29760742, + "step": 2839, + "time_per_iteration": 2.8023805618286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110962, + "balance_loss_mlp": 1.08010423, + "epoch": 0.5463639861485187, + "flos": 602751352320.0, + "grad_norm": 0.0535580092110964, + "language_loss": 0.80844593, + "learning_rate": 0.00044930586015455207, + "loss": 0.81954211, + "num_input_tokens_seen": 237334272, + "router_z_loss_mlp": 0.29492188, + "step": 2840, + "time_per_iteration": 2.816567897796631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118684, + "balance_loss_mlp": 1.08804703, + "epoch": 0.5465563678337823, + "flos": 642516738048.0, + "grad_norm": 0.06541969342762931, + "language_loss": 0.89212978, + "learning_rate": 0.000448995933104179, + "loss": 0.90331668, + "num_input_tokens_seen": 237415408, + "router_z_loss_mlp": 0.3059082, + "step": 2841, + "time_per_iteration": 2.903371810913086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115887, + "balance_loss_mlp": 1.08601356, + "epoch": 0.5467487495190458, + "flos": 614154161664.0, + "grad_norm": 0.06848140377985366, + "language_loss": 0.80388117, + "learning_rate": 0.00044868602585534077, + "loss": 0.81504011, + "num_input_tokens_seen": 237493232, + "router_z_loss_mlp": 0.29833984, + "step": 2842, + "time_per_iteration": 2.870833396911621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104882, + "balance_loss_mlp": 1.07519853, + "epoch": 0.5469411312043093, + "flos": 461190661632.0, + "grad_norm": 0.06871095275450309, + "language_loss": 0.89058006, + "learning_rate": 0.0004483761385283541, + "loss": 0.90162885, + "num_input_tokens_seen": 237556624, + "router_z_loss_mlp": 0.29663086, + "step": 2843, + "time_per_iteration": 2.5367324352264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099916, + "balance_loss_mlp": 1.06863523, + "epoch": 0.5471335128895729, + "flos": 561197154816.0, + "grad_norm": 0.05633892340966096, + "language_loss": 0.81610817, + "learning_rate": 0.0004480662712435281, + "loss": 0.82710731, + "num_input_tokens_seen": 237632048, + "router_z_loss_mlp": 0.3125, + "step": 2844, + "time_per_iteration": 2.8301496505737305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092142, + "balance_loss_mlp": 1.0627687, + "epoch": 0.5473258945748365, + "flos": 518686695936.0, + "grad_norm": 0.05986468354955699, + "language_loss": 0.88694894, + "learning_rate": 0.0004477564241211635, + "loss": 0.89787042, + "num_input_tokens_seen": 237699840, + "router_z_loss_mlp": 0.2935791, + "step": 2845, + "time_per_iteration": 2.5813820362091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086916, + "balance_loss_mlp": 1.05787718, + "epoch": 0.5475182762601001, + "flos": 433828763136.0, + "grad_norm": 0.059098326299960216, + "language_loss": 0.87329561, + "learning_rate": 0.0004474465972815541, + "loss": 0.88416475, + "num_input_tokens_seen": 237762560, + "router_z_loss_mlp": 0.2902832, + "step": 2846, + "time_per_iteration": 2.494866132736206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085322, + "balance_loss_mlp": 1.05730796, + "epoch": 0.5477106579453636, + "flos": 511560811008.0, + "grad_norm": 0.05595262091427783, + "language_loss": 0.87812984, + "learning_rate": 0.000447136790844985, + "loss": 0.88898313, + "num_input_tokens_seen": 237837152, + "router_z_loss_mlp": 0.28027344, + "step": 2847, + "time_per_iteration": 2.698451042175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086514, + "balance_loss_mlp": 1.05726016, + "epoch": 0.5479030396306271, + "flos": 675912439296.0, + "grad_norm": 0.06538513229207209, + "language_loss": 0.81294727, + "learning_rate": 0.00044682700493173385, + "loss": 0.82381248, + "num_input_tokens_seen": 237909488, + "router_z_loss_mlp": 0.29223633, + "step": 2848, + "time_per_iteration": 2.8252742290496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085079, + "balance_loss_mlp": 1.05441868, + "epoch": 0.5480954213158907, + "flos": 876090981888.0, + "grad_norm": 0.06259253721450928, + "language_loss": 0.80796725, + "learning_rate": 0.00044651723966207004, + "loss": 0.81881809, + "num_input_tokens_seen": 237991056, + "router_z_loss_mlp": 0.30639648, + "step": 2849, + "time_per_iteration": 3.093806505203247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083876, + "balance_loss_mlp": 1.05424023, + "epoch": 0.5482878030011543, + "flos": 622006511616.0, + "grad_norm": 0.05680096345280931, + "language_loss": 0.78538483, + "learning_rate": 0.00044620749515625536, + "loss": 0.79622364, + "num_input_tokens_seen": 238064576, + "router_z_loss_mlp": 0.29614258, + "step": 2850, + "time_per_iteration": 2.759477376937866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083552, + "balance_loss_mlp": 1.0532248, + "epoch": 0.5484801846864179, + "flos": 497207725056.0, + "grad_norm": 0.054672764420471885, + "language_loss": 0.85281622, + "learning_rate": 0.00044589777153454334, + "loss": 0.86365175, + "num_input_tokens_seen": 238136464, + "router_z_loss_mlp": 0.30297852, + "step": 2851, + "time_per_iteration": 2.7247886657714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082527, + "balance_loss_mlp": 1.0519855, + "epoch": 0.5486725663716814, + "flos": 442432171008.0, + "grad_norm": 0.05927586181396917, + "language_loss": 0.83792317, + "learning_rate": 0.00044558806891717895, + "loss": 0.84874845, + "num_input_tokens_seen": 238198912, + "router_z_loss_mlp": 0.30493164, + "step": 2852, + "time_per_iteration": 2.480499267578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078563, + "balance_loss_mlp": 1.04847419, + "epoch": 0.548864948056945, + "flos": 655162504704.0, + "grad_norm": 0.06995220773511122, + "language_loss": 0.79820019, + "learning_rate": 0.0004452783874243998, + "loss": 0.80898583, + "num_input_tokens_seen": 238275184, + "router_z_loss_mlp": 0.30053711, + "step": 2853, + "time_per_iteration": 2.815159559249878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083773, + "balance_loss_mlp": 1.05354142, + "epoch": 0.5490573297422086, + "flos": 546036111360.0, + "grad_norm": 0.0871194319773747, + "language_loss": 0.8473509, + "learning_rate": 0.00044496872717643475, + "loss": 0.85818863, + "num_input_tokens_seen": 238348496, + "router_z_loss_mlp": 0.30200195, + "step": 2854, + "time_per_iteration": 2.671760320663452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01029437, + "balance_loss_mlp": 1.02099681, + "epoch": 0.5492497114274721, + "flos": 1590309987840.0, + "grad_norm": 0.022692984636718958, + "language_loss": 0.77089292, + "learning_rate": 0.00044465908829350453, + "loss": 0.7811873, + "num_input_tokens_seen": 238578464, + "router_z_loss_mlp": 0.08447266, + "step": 2855, + "time_per_iteration": 4.943760633468628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080585, + "balance_loss_mlp": 1.05152166, + "epoch": 0.5494420931127356, + "flos": 750906754560.0, + "grad_norm": 0.08481580298187671, + "language_loss": 0.82385266, + "learning_rate": 0.0004443494708958217, + "loss": 0.83465844, + "num_input_tokens_seen": 238660256, + "router_z_loss_mlp": 0.2902832, + "step": 2856, + "time_per_iteration": 2.9592692852020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081194, + "balance_loss_mlp": 1.05131996, + "epoch": 0.5496344747979992, + "flos": 626023904256.0, + "grad_norm": 0.054737825906261944, + "language_loss": 0.81019336, + "learning_rate": 0.0004440398751035906, + "loss": 0.82100528, + "num_input_tokens_seen": 238745856, + "router_z_loss_mlp": 0.29858398, + "step": 2857, + "time_per_iteration": 2.8660449981689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086545, + "balance_loss_mlp": 1.05612314, + "epoch": 0.5498268564832628, + "flos": 523111924224.0, + "grad_norm": 0.07506614425197558, + "language_loss": 0.84203708, + "learning_rate": 0.00044373030103700645, + "loss": 0.85290253, + "num_input_tokens_seen": 238813888, + "router_z_loss_mlp": 0.30395508, + "step": 2858, + "time_per_iteration": 2.589571475982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086143, + "balance_loss_mlp": 1.05769968, + "epoch": 0.5500192381685264, + "flos": 604587151872.0, + "grad_norm": 0.06400511299844665, + "language_loss": 0.80211353, + "learning_rate": 0.000443420748816257, + "loss": 0.81297493, + "num_input_tokens_seen": 238885440, + "router_z_loss_mlp": 0.28442383, + "step": 2859, + "time_per_iteration": 2.775573492050171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089751, + "balance_loss_mlp": 1.05894732, + "epoch": 0.55021161985379, + "flos": 520527264768.0, + "grad_norm": 0.05990515883462961, + "language_loss": 0.78525764, + "learning_rate": 0.0004431112185615208, + "loss": 0.7961551, + "num_input_tokens_seen": 238960944, + "router_z_loss_mlp": 0.30786133, + "step": 2860, + "time_per_iteration": 2.79428768157959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099065, + "balance_loss_mlp": 1.06942964, + "epoch": 0.5504040015390534, + "flos": 489671806464.0, + "grad_norm": 0.08012396807897051, + "language_loss": 0.80142951, + "learning_rate": 0.00044280171039296845, + "loss": 0.81242013, + "num_input_tokens_seen": 239030592, + "router_z_loss_mlp": 0.29589844, + "step": 2861, + "time_per_iteration": 2.6075713634490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097414, + "balance_loss_mlp": 1.06808829, + "epoch": 0.550596383224317, + "flos": 575787377664.0, + "grad_norm": 0.055527438655266555, + "language_loss": 0.88317382, + "learning_rate": 0.0004424922244307616, + "loss": 0.89414799, + "num_input_tokens_seen": 239097440, + "router_z_loss_mlp": 0.29321289, + "step": 2862, + "time_per_iteration": 2.6453704833984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093253, + "balance_loss_mlp": 1.06306958, + "epoch": 0.5507887649095806, + "flos": 642445157376.0, + "grad_norm": 0.0988044596240084, + "language_loss": 0.82273299, + "learning_rate": 0.00044218276079505315, + "loss": 0.83366549, + "num_input_tokens_seen": 239179872, + "router_z_loss_mlp": 0.30151367, + "step": 2863, + "time_per_iteration": 2.8583548069000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093494, + "balance_loss_mlp": 1.0636915, + "epoch": 0.5509811465948442, + "flos": 531843812352.0, + "grad_norm": 0.15366013773450377, + "language_loss": 0.74783754, + "learning_rate": 0.0004418733196059876, + "loss": 0.75877243, + "num_input_tokens_seen": 239251264, + "router_z_loss_mlp": 0.29760742, + "step": 2864, + "time_per_iteration": 2.6593546867370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092739, + "balance_loss_mlp": 1.06398571, + "epoch": 0.5511735282801077, + "flos": 654747328512.0, + "grad_norm": 0.05392307081741782, + "language_loss": 0.80104017, + "learning_rate": 0.0004415639009837008, + "loss": 0.81196761, + "num_input_tokens_seen": 239326688, + "router_z_loss_mlp": 0.28759766, + "step": 2865, + "time_per_iteration": 2.8184585571289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096337, + "balance_loss_mlp": 1.06660628, + "epoch": 0.5513659099653713, + "flos": 529498861056.0, + "grad_norm": 0.0621710106813525, + "language_loss": 0.8235333, + "learning_rate": 0.00044125450504831955, + "loss": 0.83449662, + "num_input_tokens_seen": 239401248, + "router_z_loss_mlp": 0.29711914, + "step": 2866, + "time_per_iteration": 2.734349489212036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086542, + "balance_loss_mlp": 1.05592918, + "epoch": 0.5515582916506349, + "flos": 554869315584.0, + "grad_norm": 0.06271512147953057, + "language_loss": 0.82752901, + "learning_rate": 0.0004409451319199622, + "loss": 0.83839446, + "num_input_tokens_seen": 239471600, + "router_z_loss_mlp": 0.30566406, + "step": 2867, + "time_per_iteration": 2.683742046356201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095264, + "balance_loss_mlp": 1.06417394, + "epoch": 0.5517506733358984, + "flos": 735407258112.0, + "grad_norm": 0.07258101504897169, + "language_loss": 0.84457368, + "learning_rate": 0.0004406357817187381, + "loss": 0.85552633, + "num_input_tokens_seen": 239548592, + "router_z_loss_mlp": 0.31054688, + "step": 2868, + "time_per_iteration": 3.0147883892059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103312, + "balance_loss_mlp": 1.07379591, + "epoch": 0.551943055021162, + "flos": 1115325697536.0, + "grad_norm": 0.05164398294731223, + "language_loss": 0.81673765, + "learning_rate": 0.0004403264545647474, + "loss": 0.82777071, + "num_input_tokens_seen": 239644432, + "router_z_loss_mlp": 0.29492188, + "step": 2869, + "time_per_iteration": 3.5095975399017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107006, + "balance_loss_mlp": 1.07603574, + "epoch": 0.5521354367064255, + "flos": 544373208576.0, + "grad_norm": 0.04919714399659635, + "language_loss": 0.85006267, + "learning_rate": 0.00044001715057808154, + "loss": 0.86113274, + "num_input_tokens_seen": 239723392, + "router_z_loss_mlp": 0.30932617, + "step": 2870, + "time_per_iteration": 2.759791851043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114514, + "balance_loss_mlp": 1.08330536, + "epoch": 0.5523278183916891, + "flos": 936285101568.0, + "grad_norm": 0.06727866309699267, + "language_loss": 0.81942332, + "learning_rate": 0.0004397078698788232, + "loss": 0.83056843, + "num_input_tokens_seen": 239806896, + "router_z_loss_mlp": 0.31176758, + "step": 2871, + "time_per_iteration": 3.21431040763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104908, + "balance_loss_mlp": 1.09441757, + "epoch": 0.5525202000769527, + "flos": 1465911696384.0, + "grad_norm": 0.04310408533027141, + "language_loss": 0.80442369, + "learning_rate": 0.0004393986125870456, + "loss": 0.81547272, + "num_input_tokens_seen": 240037824, + "router_z_loss_mlp": 0.10498047, + "step": 2872, + "time_per_iteration": 4.941087484359741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114234, + "balance_loss_mlp": 1.082739, + "epoch": 0.5527125817622163, + "flos": 489800286720.0, + "grad_norm": 0.05898932962157328, + "language_loss": 0.78340954, + "learning_rate": 0.00043908937882281343, + "loss": 0.79455185, + "num_input_tokens_seen": 240107952, + "router_z_loss_mlp": 0.31469727, + "step": 2873, + "time_per_iteration": 2.577866554260254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116939, + "balance_loss_mlp": 1.08501506, + "epoch": 0.5529049634474797, + "flos": 634914008064.0, + "grad_norm": 0.05969066171006231, + "language_loss": 0.82846034, + "learning_rate": 0.0004387801687061814, + "loss": 0.83962971, + "num_input_tokens_seen": 240183824, + "router_z_loss_mlp": 0.3190918, + "step": 2874, + "time_per_iteration": 2.8184196949005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117603, + "balance_loss_mlp": 1.08489251, + "epoch": 0.5530973451327433, + "flos": 581274952704.0, + "grad_norm": 0.05481480847886404, + "language_loss": 0.80685902, + "learning_rate": 0.0004384709823571958, + "loss": 0.81803501, + "num_input_tokens_seen": 240259296, + "router_z_loss_mlp": 0.32714844, + "step": 2875, + "time_per_iteration": 2.7496426105499268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105595, + "balance_loss_mlp": 1.07519674, + "epoch": 0.5532897268180069, + "flos": 1122488658432.0, + "grad_norm": 0.0703745986604158, + "language_loss": 0.83230788, + "learning_rate": 0.0004381618198958932, + "loss": 0.84336388, + "num_input_tokens_seen": 240346768, + "router_z_loss_mlp": 0.30371094, + "step": 2876, + "time_per_iteration": 3.4905495643615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110669, + "balance_loss_mlp": 1.07662511, + "epoch": 0.5534821085032705, + "flos": 637273640448.0, + "grad_norm": 0.06448913307816859, + "language_loss": 0.84021735, + "learning_rate": 0.00043785268144230137, + "loss": 0.85128427, + "num_input_tokens_seen": 240429344, + "router_z_loss_mlp": 0.30029297, + "step": 2877, + "time_per_iteration": 2.907133102416992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102032, + "balance_loss_mlp": 1.07203865, + "epoch": 0.5536744901885341, + "flos": 571112529408.0, + "grad_norm": 0.0731230974557418, + "language_loss": 0.82496381, + "learning_rate": 0.00043754356711643837, + "loss": 0.83598411, + "num_input_tokens_seen": 240497008, + "router_z_loss_mlp": 0.29980469, + "step": 2878, + "time_per_iteration": 2.715023994445801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097061, + "balance_loss_mlp": 1.06609011, + "epoch": 0.5538668718737976, + "flos": 595716871680.0, + "grad_norm": 0.0760081782140183, + "language_loss": 0.83909559, + "learning_rate": 0.0004372344770383132, + "loss": 0.85006618, + "num_input_tokens_seen": 240578432, + "router_z_loss_mlp": 0.30932617, + "step": 2879, + "time_per_iteration": 2.822368621826172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097203, + "balance_loss_mlp": 1.06756735, + "epoch": 0.5540592535590612, + "flos": 532602210816.0, + "grad_norm": 0.06372253861737541, + "language_loss": 0.83305293, + "learning_rate": 0.00043692541132792507, + "loss": 0.84402496, + "num_input_tokens_seen": 240649136, + "router_z_loss_mlp": 0.29614258, + "step": 2880, + "time_per_iteration": 2.7154414653778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093507, + "balance_loss_mlp": 1.06349051, + "epoch": 0.5542516352443247, + "flos": 412619235840.0, + "grad_norm": 0.057885594640944824, + "language_loss": 0.83464789, + "learning_rate": 0.00043661637010526384, + "loss": 0.84558296, + "num_input_tokens_seen": 240714240, + "router_z_loss_mlp": 0.30004883, + "step": 2881, + "time_per_iteration": 2.507059097290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092859, + "balance_loss_mlp": 1.06255555, + "epoch": 0.5544440169295883, + "flos": 547607609856.0, + "grad_norm": 0.08329174894233551, + "language_loss": 0.83249325, + "learning_rate": 0.00043630735349031025, + "loss": 0.84342188, + "num_input_tokens_seen": 240786928, + "router_z_loss_mlp": 0.30273438, + "step": 2882, + "time_per_iteration": 2.644418478012085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090659, + "balance_loss_mlp": 1.06216836, + "epoch": 0.5546363986148518, + "flos": 621821131776.0, + "grad_norm": 0.047753182436236, + "language_loss": 0.81861913, + "learning_rate": 0.00043599836160303495, + "loss": 0.82952571, + "num_input_tokens_seen": 240865328, + "router_z_loss_mlp": 0.28491211, + "step": 2883, + "time_per_iteration": 2.8971407413482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090292, + "balance_loss_mlp": 1.06160986, + "epoch": 0.5548287803001154, + "flos": 705292945920.0, + "grad_norm": 0.057456379562134556, + "language_loss": 0.77759755, + "learning_rate": 0.0004356893945633995, + "loss": 0.78850043, + "num_input_tokens_seen": 240945680, + "router_z_loss_mlp": 0.28649902, + "step": 2884, + "time_per_iteration": 2.937133312225342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094576, + "balance_loss_mlp": 1.06620383, + "epoch": 0.555021161985379, + "flos": 504197789184.0, + "grad_norm": 0.05754228747661135, + "language_loss": 0.81617516, + "learning_rate": 0.0004353804524913551, + "loss": 0.82712096, + "num_input_tokens_seen": 241010800, + "router_z_loss_mlp": 0.28344727, + "step": 2885, + "time_per_iteration": 2.579535722732544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109656, + "balance_loss_mlp": 1.08028293, + "epoch": 0.5552135436706426, + "flos": 616066684416.0, + "grad_norm": 0.06485446309889223, + "language_loss": 0.81926423, + "learning_rate": 0.0004350715355068441, + "loss": 0.83036083, + "num_input_tokens_seen": 241085328, + "router_z_loss_mlp": 0.29345703, + "step": 2886, + "time_per_iteration": 2.709717273712158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111013, + "balance_loss_mlp": 1.08142567, + "epoch": 0.5554059253559062, + "flos": 463871494656.0, + "grad_norm": 0.066893347852213, + "language_loss": 0.7961694, + "learning_rate": 0.00043476264372979847, + "loss": 0.80727959, + "num_input_tokens_seen": 241149600, + "router_z_loss_mlp": 0.2956543, + "step": 2887, + "time_per_iteration": 2.5216078758239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113, + "balance_loss_mlp": 1.08441329, + "epoch": 0.5555983070411696, + "flos": 1562512384512.0, + "grad_norm": 0.0640996529430707, + "language_loss": 0.78604692, + "learning_rate": 0.0004344537772801408, + "loss": 0.7971769, + "num_input_tokens_seen": 241244832, + "router_z_loss_mlp": 0.28540039, + "step": 2888, + "time_per_iteration": 3.8132436275482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065874, + "balance_loss_mlp": 1.05838752, + "epoch": 0.5557906887264332, + "flos": 1467917821440.0, + "grad_norm": 0.028482200170008867, + "language_loss": 0.73422456, + "learning_rate": 0.0004341449362777836, + "loss": 0.7448833, + "num_input_tokens_seen": 241479728, + "router_z_loss_mlp": 0.07470703, + "step": 2889, + "time_per_iteration": 4.947216987609863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117814, + "balance_loss_mlp": 1.08801198, + "epoch": 0.5559830704116968, + "flos": 529832544768.0, + "grad_norm": 0.06792095354006551, + "language_loss": 0.83771884, + "learning_rate": 0.0004338361208426298, + "loss": 0.84889698, + "num_input_tokens_seen": 241545616, + "router_z_loss_mlp": 0.29760742, + "step": 2890, + "time_per_iteration": 2.631476879119873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113406, + "balance_loss_mlp": 1.08350825, + "epoch": 0.5561754520969604, + "flos": 651218890752.0, + "grad_norm": 0.05967481099781226, + "language_loss": 0.81602627, + "learning_rate": 0.00043352733109457164, + "loss": 0.82716036, + "num_input_tokens_seen": 241629040, + "router_z_loss_mlp": 0.29858398, + "step": 2891, + "time_per_iteration": 2.907500743865967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111722, + "balance_loss_mlp": 1.08194315, + "epoch": 0.556367833782224, + "flos": 734297923584.0, + "grad_norm": 0.04670195587242621, + "language_loss": 0.84789026, + "learning_rate": 0.00043321856715349244, + "loss": 0.85900748, + "num_input_tokens_seen": 241706272, + "router_z_loss_mlp": 0.29760742, + "step": 2892, + "time_per_iteration": 2.9401984214782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110696, + "balance_loss_mlp": 1.0810132, + "epoch": 0.5565602154674875, + "flos": 672423648768.0, + "grad_norm": 0.05439165995621742, + "language_loss": 0.80422115, + "learning_rate": 0.00043290982913926466, + "loss": 0.81532812, + "num_input_tokens_seen": 241782304, + "router_z_loss_mlp": 0.29614258, + "step": 2893, + "time_per_iteration": 2.7956430912017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113402, + "balance_loss_mlp": 1.08312285, + "epoch": 0.556752597152751, + "flos": 586228783104.0, + "grad_norm": 0.09922355360532673, + "language_loss": 0.8448714, + "learning_rate": 0.0004326011171717514, + "loss": 0.85600543, + "num_input_tokens_seen": 241868576, + "router_z_loss_mlp": 0.30297852, + "step": 2894, + "time_per_iteration": 2.8997769355773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108454, + "balance_loss_mlp": 1.07676816, + "epoch": 0.5569449788380146, + "flos": 437777146368.0, + "grad_norm": 0.06224988402754836, + "language_loss": 0.81240308, + "learning_rate": 0.0004322924313708051, + "loss": 0.82348764, + "num_input_tokens_seen": 241933696, + "router_z_loss_mlp": 0.31689453, + "step": 2895, + "time_per_iteration": 2.511643648147583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107604, + "balance_loss_mlp": 1.07825518, + "epoch": 0.5571373605232782, + "flos": 502250761728.0, + "grad_norm": 0.0621054083596477, + "language_loss": 0.84500259, + "learning_rate": 0.0004319837718562681, + "loss": 0.85607862, + "num_input_tokens_seen": 242003056, + "router_z_loss_mlp": 0.29321289, + "step": 2896, + "time_per_iteration": 2.580003023147583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106887, + "balance_loss_mlp": 1.07667959, + "epoch": 0.5573297422085417, + "flos": 577417973760.0, + "grad_norm": 0.05844968671659234, + "language_loss": 0.83570629, + "learning_rate": 0.0004316751387479726, + "loss": 0.84677517, + "num_input_tokens_seen": 242076368, + "router_z_loss_mlp": 0.30175781, + "step": 2897, + "time_per_iteration": 2.7676987648010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122549, + "balance_loss_mlp": 1.0925082, + "epoch": 0.5575221238938053, + "flos": 1344037515264.0, + "grad_norm": 0.06543352873326957, + "language_loss": 0.82800293, + "learning_rate": 0.0004313665321657409, + "loss": 0.83922845, + "num_input_tokens_seen": 242161600, + "router_z_loss_mlp": 0.30004883, + "step": 2898, + "time_per_iteration": 3.7584402561187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120576, + "balance_loss_mlp": 1.08917618, + "epoch": 0.5577145055790689, + "flos": 601963218432.0, + "grad_norm": 0.06787906742385669, + "language_loss": 0.80272007, + "learning_rate": 0.00043105795222938436, + "loss": 0.81392586, + "num_input_tokens_seen": 242237904, + "router_z_loss_mlp": 0.31396484, + "step": 2899, + "time_per_iteration": 2.718045711517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109661, + "balance_loss_mlp": 1.07795143, + "epoch": 0.5579068872643325, + "flos": 562620349440.0, + "grad_norm": 0.06698960298708169, + "language_loss": 0.78827435, + "learning_rate": 0.00043074939905870467, + "loss": 0.79937094, + "num_input_tokens_seen": 242306736, + "router_z_loss_mlp": 0.31713867, + "step": 2900, + "time_per_iteration": 2.639775514602661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111441, + "balance_loss_mlp": 1.08230579, + "epoch": 0.558099268949596, + "flos": 544551247872.0, + "grad_norm": 0.09759490745534659, + "language_loss": 0.80356312, + "learning_rate": 0.0004304408727734927, + "loss": 0.81467754, + "num_input_tokens_seen": 242376000, + "router_z_loss_mlp": 0.29125977, + "step": 2901, + "time_per_iteration": 2.6272940635681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107959, + "balance_loss_mlp": 1.07889545, + "epoch": 0.5582916506348595, + "flos": 552786467328.0, + "grad_norm": 0.06875313821095587, + "language_loss": 0.89200485, + "learning_rate": 0.0004301323734935288, + "loss": 0.9030844, + "num_input_tokens_seen": 242447056, + "router_z_loss_mlp": 0.29052734, + "step": 2902, + "time_per_iteration": 2.652219533920288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100657, + "balance_loss_mlp": 1.07164121, + "epoch": 0.5584840323201231, + "flos": 543385013760.0, + "grad_norm": 0.05706751216847301, + "language_loss": 0.87298477, + "learning_rate": 0.000429823901338583, + "loss": 0.8839913, + "num_input_tokens_seen": 242514400, + "router_z_loss_mlp": 0.2902832, + "step": 2903, + "time_per_iteration": 2.611798048019409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099623, + "balance_loss_mlp": 1.06872356, + "epoch": 0.5586764140053867, + "flos": 815573090304.0, + "grad_norm": 0.053536411753063035, + "language_loss": 0.87032712, + "learning_rate": 0.00042951545642841513, + "loss": 0.88132328, + "num_input_tokens_seen": 242601616, + "router_z_loss_mlp": 0.30883789, + "step": 2904, + "time_per_iteration": 3.067237377166748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099073, + "balance_loss_mlp": 1.06979561, + "epoch": 0.5588687956906503, + "flos": 486439976448.0, + "grad_norm": 0.04987560026618122, + "language_loss": 0.86746645, + "learning_rate": 0.0004292070388827737, + "loss": 0.87845719, + "num_input_tokens_seen": 242669648, + "router_z_loss_mlp": 0.29272461, + "step": 2905, + "time_per_iteration": 2.5981948375701904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093183, + "balance_loss_mlp": 1.06426287, + "epoch": 0.5590611773759138, + "flos": 452060849664.0, + "grad_norm": 0.06265536693897518, + "language_loss": 0.81292248, + "learning_rate": 0.00042889864882139753, + "loss": 0.82385433, + "num_input_tokens_seen": 242737456, + "router_z_loss_mlp": 0.2890625, + "step": 2906, + "time_per_iteration": 2.581113338470459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107989, + "balance_loss_mlp": 1.07811511, + "epoch": 0.5592535590611774, + "flos": 520945012224.0, + "grad_norm": 0.06493240221059006, + "language_loss": 0.81897962, + "learning_rate": 0.0004285902863640139, + "loss": 0.83005953, + "num_input_tokens_seen": 242807008, + "router_z_loss_mlp": 0.29858398, + "step": 2907, + "time_per_iteration": 2.6115305423736572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109856, + "balance_loss_mlp": 1.06973481, + "epoch": 0.5594459407464409, + "flos": 552519595008.0, + "grad_norm": 0.07849480564056018, + "language_loss": 0.8626982, + "learning_rate": 0.00042828195163033966, + "loss": 0.87368375, + "num_input_tokens_seen": 242877328, + "router_z_loss_mlp": 0.28833008, + "step": 2908, + "time_per_iteration": 2.6564390659332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110723, + "balance_loss_mlp": 1.07654572, + "epoch": 0.5596383224317045, + "flos": 484833973248.0, + "grad_norm": 0.07707498056388652, + "language_loss": 0.79454792, + "learning_rate": 0.0004279736447400812, + "loss": 0.80562025, + "num_input_tokens_seen": 242943152, + "router_z_loss_mlp": 0.30664062, + "step": 2909, + "time_per_iteration": 2.580448627471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102668, + "balance_loss_mlp": 1.07343817, + "epoch": 0.5598307041169681, + "flos": 611256015360.0, + "grad_norm": 0.055339920225342294, + "language_loss": 0.78979003, + "learning_rate": 0.00042766536581293385, + "loss": 0.80081677, + "num_input_tokens_seen": 243014656, + "router_z_loss_mlp": 0.29223633, + "step": 2910, + "time_per_iteration": 2.714306116104126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112106, + "balance_loss_mlp": 1.09004188, + "epoch": 0.5600230858022316, + "flos": 488851365888.0, + "grad_norm": 0.06660982321180627, + "language_loss": 0.79863673, + "learning_rate": 0.0004273571149685819, + "loss": 0.80984735, + "num_input_tokens_seen": 243089040, + "router_z_loss_mlp": 0.30981445, + "step": 2911, + "time_per_iteration": 2.738189220428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117749, + "balance_loss_mlp": 1.08794653, + "epoch": 0.5602154674874952, + "flos": 598869780480.0, + "grad_norm": 0.07453286241806684, + "language_loss": 0.83875954, + "learning_rate": 0.00042704889232669937, + "loss": 0.84993702, + "num_input_tokens_seen": 243162480, + "router_z_loss_mlp": 0.29785156, + "step": 2912, + "time_per_iteration": 2.7153878211975098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119265, + "balance_loss_mlp": 1.09003508, + "epoch": 0.5604078491727588, + "flos": 585969624576.0, + "grad_norm": 0.06505374842280261, + "language_loss": 0.85808718, + "learning_rate": 0.0004267406980069484, + "loss": 0.8692798, + "num_input_tokens_seen": 243232880, + "router_z_loss_mlp": 0.29248047, + "step": 2913, + "time_per_iteration": 2.7438042163848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105259, + "balance_loss_mlp": 1.07490873, + "epoch": 0.5606002308580224, + "flos": 541205618688.0, + "grad_norm": 0.045730944132966495, + "language_loss": 0.79707301, + "learning_rate": 0.0004264325321289808, + "loss": 0.80812562, + "num_input_tokens_seen": 243309168, + "router_z_loss_mlp": 0.30322266, + "step": 2914, + "time_per_iteration": 2.787429094314575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101375, + "balance_loss_mlp": 1.07131052, + "epoch": 0.5607926125432858, + "flos": 583938533376.0, + "grad_norm": 0.05941371213730478, + "language_loss": 0.8624413, + "learning_rate": 0.00042612439481243736, + "loss": 0.87345505, + "num_input_tokens_seen": 243382064, + "router_z_loss_mlp": 0.30078125, + "step": 2915, + "time_per_iteration": 2.7993295192718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090837, + "balance_loss_mlp": 1.06113064, + "epoch": 0.5609849942285494, + "flos": 627489317376.0, + "grad_norm": 0.06435914288601326, + "language_loss": 0.90124059, + "learning_rate": 0.00042581628617694735, + "loss": 0.91214895, + "num_input_tokens_seen": 243452064, + "router_z_loss_mlp": 0.296875, + "step": 2916, + "time_per_iteration": 2.744046449661255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089958, + "balance_loss_mlp": 1.06032228, + "epoch": 0.561177375913813, + "flos": 588366332928.0, + "grad_norm": 0.05771140503361017, + "language_loss": 0.81953394, + "learning_rate": 0.0004255082063421296, + "loss": 0.83043355, + "num_input_tokens_seen": 243525600, + "router_z_loss_mlp": 0.29638672, + "step": 2917, + "time_per_iteration": 2.705963134765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095529, + "balance_loss_mlp": 1.0655117, + "epoch": 0.5613697575990766, + "flos": 527047824384.0, + "grad_norm": 0.0764674514791775, + "language_loss": 0.84947777, + "learning_rate": 0.00042520015542759065, + "loss": 0.86043298, + "num_input_tokens_seen": 243605536, + "router_z_loss_mlp": 0.29980469, + "step": 2918, + "time_per_iteration": 2.9078075885772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085954, + "balance_loss_mlp": 1.05662882, + "epoch": 0.5615621392843402, + "flos": 642655130112.0, + "grad_norm": 0.049198929687541054, + "language_loss": 0.88353539, + "learning_rate": 0.00042489213355292687, + "loss": 0.89439487, + "num_input_tokens_seen": 243684208, + "router_z_loss_mlp": 0.29296875, + "step": 2919, + "time_per_iteration": 2.862194776535034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093265, + "balance_loss_mlp": 1.06300998, + "epoch": 0.5617545209696037, + "flos": 427750543872.0, + "grad_norm": 0.0619251317266344, + "language_loss": 0.81301886, + "learning_rate": 0.00042458414083772276, + "loss": 0.82395148, + "num_input_tokens_seen": 243749376, + "router_z_loss_mlp": 0.30224609, + "step": 2920, + "time_per_iteration": 2.5329933166503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095136, + "balance_loss_mlp": 1.0651195, + "epoch": 0.5619469026548672, + "flos": 568429125120.0, + "grad_norm": 0.05517349890350355, + "language_loss": 0.8525691, + "learning_rate": 0.000424276177401552, + "loss": 0.86352038, + "num_input_tokens_seen": 243828096, + "router_z_loss_mlp": 0.29956055, + "step": 2921, + "time_per_iteration": 2.787318468093872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092355, + "balance_loss_mlp": 1.06200445, + "epoch": 0.5621392843401308, + "flos": 505205807616.0, + "grad_norm": 0.06500569481536145, + "language_loss": 0.85831988, + "learning_rate": 0.0004239682433639763, + "loss": 0.86924338, + "num_input_tokens_seen": 243896752, + "router_z_loss_mlp": 0.3034668, + "step": 2922, + "time_per_iteration": 2.697091817855835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093283, + "balance_loss_mlp": 1.06386256, + "epoch": 0.5623316660253944, + "flos": 516996628992.0, + "grad_norm": 0.08309086608315261, + "language_loss": 0.85596514, + "learning_rate": 0.0004236603388445467, + "loss": 0.86689794, + "num_input_tokens_seen": 243964592, + "router_z_loss_mlp": 0.29394531, + "step": 2923, + "time_per_iteration": 2.5720105171203613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097102, + "balance_loss_mlp": 1.0683012, + "epoch": 0.5625240477106579, + "flos": 606012917760.0, + "grad_norm": 0.07246274776201297, + "language_loss": 0.82229364, + "learning_rate": 0.00042335246396280166, + "loss": 0.83326471, + "num_input_tokens_seen": 244036656, + "router_z_loss_mlp": 0.28808594, + "step": 2924, + "time_per_iteration": 2.7669975757598877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093178, + "balance_loss_mlp": 1.06320906, + "epoch": 0.5627164293959215, + "flos": 450430253568.0, + "grad_norm": 0.06414999121973448, + "language_loss": 0.90646857, + "learning_rate": 0.0004230446188382693, + "loss": 0.91740036, + "num_input_tokens_seen": 244102704, + "router_z_loss_mlp": 0.29956055, + "step": 2925, + "time_per_iteration": 2.5662741661071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088889, + "balance_loss_mlp": 1.0595876, + "epoch": 0.5629088110811851, + "flos": 742073550336.0, + "grad_norm": 0.05389869275215176, + "language_loss": 0.80918074, + "learning_rate": 0.0004227368035904654, + "loss": 0.82006967, + "num_input_tokens_seen": 244186640, + "router_z_loss_mlp": 0.29296875, + "step": 2926, + "time_per_iteration": 2.964599370956421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092675, + "balance_loss_mlp": 1.06249142, + "epoch": 0.5631011927664487, + "flos": 496970588160.0, + "grad_norm": 0.06261422618617216, + "language_loss": 0.82895541, + "learning_rate": 0.00042242901833889474, + "loss": 0.83988214, + "num_input_tokens_seen": 244257680, + "router_z_loss_mlp": 0.30151367, + "step": 2927, + "time_per_iteration": 2.6312665939331055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093424, + "balance_loss_mlp": 1.06376481, + "epoch": 0.5632935744517122, + "flos": 886137408000.0, + "grad_norm": 0.06041695665754469, + "language_loss": 0.86030155, + "learning_rate": 0.0004221212632030501, + "loss": 0.87123579, + "num_input_tokens_seen": 244331248, + "router_z_loss_mlp": 0.29614258, + "step": 2928, + "time_per_iteration": 3.0977063179016113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094061, + "balance_loss_mlp": 1.06351972, + "epoch": 0.5634859561369757, + "flos": 604792355328.0, + "grad_norm": 0.06366283736150324, + "language_loss": 0.80857551, + "learning_rate": 0.0004218135383024124, + "loss": 0.81951618, + "num_input_tokens_seen": 244403920, + "router_z_loss_mlp": 0.30541992, + "step": 2929, + "time_per_iteration": 2.749244213104248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088519, + "balance_loss_mlp": 1.0590266, + "epoch": 0.5636783378222393, + "flos": 453916472832.0, + "grad_norm": 0.12143433952472552, + "language_loss": 0.85715157, + "learning_rate": 0.0004215058437564511, + "loss": 0.86803675, + "num_input_tokens_seen": 244470464, + "router_z_loss_mlp": 0.29467773, + "step": 2930, + "time_per_iteration": 2.593238115310669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084288, + "balance_loss_mlp": 1.05512953, + "epoch": 0.5638707195075029, + "flos": 518456899584.0, + "grad_norm": 0.056033125460513485, + "language_loss": 0.82132083, + "learning_rate": 0.00042119817968462397, + "loss": 0.83216375, + "num_input_tokens_seen": 244536864, + "router_z_loss_mlp": 0.29125977, + "step": 2931, + "time_per_iteration": 2.591958522796631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092676, + "balance_loss_mlp": 1.06275427, + "epoch": 0.5640631011927665, + "flos": 564873896448.0, + "grad_norm": 0.07812059497351068, + "language_loss": 0.87152535, + "learning_rate": 0.0004208905462063766, + "loss": 0.88245207, + "num_input_tokens_seen": 244603344, + "router_z_loss_mlp": 0.29907227, + "step": 2932, + "time_per_iteration": 2.6288535594940186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086522, + "balance_loss_mlp": 1.0571723, + "epoch": 0.56425548287803, + "flos": 517033704960.0, + "grad_norm": 0.06389283518633071, + "language_loss": 0.84869772, + "learning_rate": 0.00042058294344114315, + "loss": 0.85956293, + "num_input_tokens_seen": 244671984, + "router_z_loss_mlp": 0.29345703, + "step": 2933, + "time_per_iteration": 2.6064674854278564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086595, + "balance_loss_mlp": 1.05672109, + "epoch": 0.5644478645632935, + "flos": 854258876928.0, + "grad_norm": 0.05718901807458546, + "language_loss": 0.77749109, + "learning_rate": 0.0004202753715083456, + "loss": 0.78835702, + "num_input_tokens_seen": 244754000, + "router_z_loss_mlp": 0.29858398, + "step": 2934, + "time_per_iteration": 3.075186014175415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093891, + "balance_loss_mlp": 1.0630157, + "epoch": 0.5646402462485571, + "flos": 553438780416.0, + "grad_norm": 0.07168087831316133, + "language_loss": 0.81911719, + "learning_rate": 0.0004199678305273936, + "loss": 0.83005607, + "num_input_tokens_seen": 244820896, + "router_z_loss_mlp": 0.30883789, + "step": 2935, + "time_per_iteration": 2.6289923191070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091967, + "balance_loss_mlp": 1.06316626, + "epoch": 0.5648326279338207, + "flos": 685990798848.0, + "grad_norm": 0.0664481148229904, + "language_loss": 0.81315005, + "learning_rate": 0.0004196603206176854, + "loss": 0.82406974, + "num_input_tokens_seen": 244904464, + "router_z_loss_mlp": 0.28808594, + "step": 2936, + "time_per_iteration": 2.941150426864624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093274, + "balance_loss_mlp": 1.06404424, + "epoch": 0.5650250096190843, + "flos": 803327818752.0, + "grad_norm": 0.07427925135014142, + "language_loss": 0.83779049, + "learning_rate": 0.000419352841898607, + "loss": 0.84872323, + "num_input_tokens_seen": 244983760, + "router_z_loss_mlp": 0.29199219, + "step": 2937, + "time_per_iteration": 2.977189302444458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092016, + "balance_loss_mlp": 1.06273842, + "epoch": 0.5652173913043478, + "flos": 582058317312.0, + "grad_norm": 0.061049572757767595, + "language_loss": 0.77780819, + "learning_rate": 0.000419045394489532, + "loss": 0.78872836, + "num_input_tokens_seen": 245053184, + "router_z_loss_mlp": 0.29296875, + "step": 2938, + "time_per_iteration": 2.6722819805145264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086194, + "balance_loss_mlp": 1.05622458, + "epoch": 0.5654097729896114, + "flos": 820648060416.0, + "grad_norm": 0.05727154642915785, + "language_loss": 0.77326584, + "learning_rate": 0.0004187379785098224, + "loss": 0.78412783, + "num_input_tokens_seen": 245137408, + "router_z_loss_mlp": 0.29931641, + "step": 2939, + "time_per_iteration": 3.100283622741699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086461, + "balance_loss_mlp": 1.05665886, + "epoch": 0.565602154674875, + "flos": 784156723200.0, + "grad_norm": 0.06949350877551969, + "language_loss": 0.83849806, + "learning_rate": 0.00041843059407882744, + "loss": 0.84936267, + "num_input_tokens_seen": 245215504, + "router_z_loss_mlp": 0.29785156, + "step": 2940, + "time_per_iteration": 2.9837162494659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082258, + "balance_loss_mlp": 1.05257499, + "epoch": 0.5657945363601385, + "flos": 549683117568.0, + "grad_norm": 0.068553917777786, + "language_loss": 0.82768112, + "learning_rate": 0.0004181232413158842, + "loss": 0.83850372, + "num_input_tokens_seen": 245286032, + "router_z_loss_mlp": 0.29638672, + "step": 2941, + "time_per_iteration": 2.636819839477539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083664, + "balance_loss_mlp": 1.05371857, + "epoch": 0.5659869180454021, + "flos": 668126900736.0, + "grad_norm": 0.06960240931548377, + "language_loss": 0.82932127, + "learning_rate": 0.0004178159203403179, + "loss": 0.84015793, + "num_input_tokens_seen": 245359040, + "router_z_loss_mlp": 0.29931641, + "step": 2942, + "time_per_iteration": 2.822134494781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083227, + "balance_loss_mlp": 1.0547837, + "epoch": 0.5661792997306656, + "flos": 499955369472.0, + "grad_norm": 0.05318601865014104, + "language_loss": 0.81807715, + "learning_rate": 0.0004175086312714409, + "loss": 0.8289094, + "num_input_tokens_seen": 245426384, + "router_z_loss_mlp": 0.28442383, + "step": 2943, + "time_per_iteration": 2.571985960006714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086509, + "balance_loss_mlp": 1.05625343, + "epoch": 0.5663716814159292, + "flos": 601209589248.0, + "grad_norm": 0.05713331418457596, + "language_loss": 0.84120524, + "learning_rate": 0.00041720137422855366, + "loss": 0.85207033, + "num_input_tokens_seen": 245501216, + "router_z_loss_mlp": 0.30224609, + "step": 2944, + "time_per_iteration": 2.7213711738586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086525, + "balance_loss_mlp": 1.05758142, + "epoch": 0.5665640631011928, + "flos": 540988305408.0, + "grad_norm": 0.1661240742061477, + "language_loss": 0.79230917, + "learning_rate": 0.00041689414933094383, + "loss": 0.80317438, + "num_input_tokens_seen": 245571600, + "router_z_loss_mlp": 0.28930664, + "step": 2945, + "time_per_iteration": 2.628525733947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088063, + "balance_loss_mlp": 1.05954862, + "epoch": 0.5667564447864564, + "flos": 601936054272.0, + "grad_norm": 0.06338169436240754, + "language_loss": 0.81427538, + "learning_rate": 0.00041658695669788653, + "loss": 0.82515597, + "num_input_tokens_seen": 245645632, + "router_z_loss_mlp": 0.28515625, + "step": 2946, + "time_per_iteration": 2.736955404281616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084859, + "balance_loss_mlp": 1.0541029, + "epoch": 0.5669488264717198, + "flos": 659523492864.0, + "grad_norm": 0.0612697940531113, + "language_loss": 0.81293368, + "learning_rate": 0.00041627979644864453, + "loss": 0.82378221, + "num_input_tokens_seen": 245715776, + "router_z_loss_mlp": 0.30712891, + "step": 2947, + "time_per_iteration": 2.780796766281128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083023, + "balance_loss_mlp": 1.05436563, + "epoch": 0.5671412081569834, + "flos": 485402222592.0, + "grad_norm": 0.06710047446863547, + "language_loss": 0.81410027, + "learning_rate": 0.0004159726687024683, + "loss": 0.82493049, + "num_input_tokens_seen": 245785328, + "router_z_loss_mlp": 0.28662109, + "step": 2948, + "time_per_iteration": 2.6072115898132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108621, + "balance_loss_mlp": 1.05757558, + "epoch": 0.567333589842247, + "flos": 729801114624.0, + "grad_norm": 0.06141378811636639, + "language_loss": 0.79485345, + "learning_rate": 0.00041566557357859506, + "loss": 0.80571556, + "num_input_tokens_seen": 245858000, + "router_z_loss_mlp": 0.28613281, + "step": 2949, + "time_per_iteration": 2.911865234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085023, + "balance_loss_mlp": 1.05443358, + "epoch": 0.5675259715275106, + "flos": 968887526400.0, + "grad_norm": 0.052257384193144164, + "language_loss": 0.79611081, + "learning_rate": 0.0004153585111962502, + "loss": 0.806961, + "num_input_tokens_seen": 245950640, + "router_z_loss_mlp": 0.30566406, + "step": 2950, + "time_per_iteration": 3.2808187007904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086724, + "balance_loss_mlp": 1.05606341, + "epoch": 0.5677183532127742, + "flos": 565145538048.0, + "grad_norm": 0.06672147261233864, + "language_loss": 0.84739614, + "learning_rate": 0.0004150514816746453, + "loss": 0.85826337, + "num_input_tokens_seen": 246019568, + "router_z_loss_mlp": 0.30639648, + "step": 2951, + "time_per_iteration": 2.680326461791992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089698, + "balance_loss_mlp": 1.0602051, + "epoch": 0.5679107348980377, + "flos": 551694385152.0, + "grad_norm": 0.06944116544696582, + "language_loss": 0.85944223, + "learning_rate": 0.0004147444851329802, + "loss": 0.87033927, + "num_input_tokens_seen": 246089520, + "router_z_loss_mlp": 0.29443359, + "step": 2952, + "time_per_iteration": 2.6477670669555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086586, + "balance_loss_mlp": 1.05680704, + "epoch": 0.5681031165833013, + "flos": 819459804672.0, + "grad_norm": 0.054427499920313586, + "language_loss": 0.86026949, + "learning_rate": 0.00041443752169044126, + "loss": 0.87113535, + "num_input_tokens_seen": 246165920, + "router_z_loss_mlp": 0.29736328, + "step": 2953, + "time_per_iteration": 2.997781276702881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092153, + "balance_loss_mlp": 1.061993, + "epoch": 0.5682954982685648, + "flos": 618013711872.0, + "grad_norm": 0.055407826164880256, + "language_loss": 0.84948021, + "learning_rate": 0.0004141305914662025, + "loss": 0.86040175, + "num_input_tokens_seen": 246238672, + "router_z_loss_mlp": 0.30126953, + "step": 2954, + "time_per_iteration": 2.704019069671631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087398, + "balance_loss_mlp": 1.05688024, + "epoch": 0.5684878799538284, + "flos": 647949984768.0, + "grad_norm": 0.0673052072270573, + "language_loss": 0.80911326, + "learning_rate": 0.0004138236945794246, + "loss": 0.81998718, + "num_input_tokens_seen": 246320208, + "router_z_loss_mlp": 0.30493164, + "step": 2955, + "time_per_iteration": 2.88403058052063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108918, + "balance_loss_mlp": 1.05911565, + "epoch": 0.5686802616390919, + "flos": 805961664000.0, + "grad_norm": 0.06799730214965168, + "language_loss": 0.8379457, + "learning_rate": 0.00041351683114925576, + "loss": 0.84883749, + "num_input_tokens_seen": 246406464, + "router_z_loss_mlp": 0.30053711, + "step": 2956, + "time_per_iteration": 3.0439462661743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087169, + "balance_loss_mlp": 1.0562458, + "epoch": 0.5688726433243555, + "flos": 547140676608.0, + "grad_norm": 0.06948923214023794, + "language_loss": 0.86469889, + "learning_rate": 0.0004132100012948308, + "loss": 0.87557054, + "num_input_tokens_seen": 246477456, + "router_z_loss_mlp": 0.30883789, + "step": 2957, + "time_per_iteration": 2.6431198120117188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090494, + "balance_loss_mlp": 1.05959463, + "epoch": 0.5690650250096191, + "flos": 486568456704.0, + "grad_norm": 0.0655655158566539, + "language_loss": 0.84699452, + "learning_rate": 0.00041290320513527145, + "loss": 0.85789943, + "num_input_tokens_seen": 246541744, + "router_z_loss_mlp": 0.30883789, + "step": 2958, + "time_per_iteration": 2.5978519916534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085083, + "balance_loss_mlp": 1.05528057, + "epoch": 0.5692574066948827, + "flos": 577457620992.0, + "grad_norm": 0.05333030562061355, + "language_loss": 0.8519215, + "learning_rate": 0.0004125964427896867, + "loss": 0.86277229, + "num_input_tokens_seen": 246611440, + "router_z_loss_mlp": 0.29760742, + "step": 2959, + "time_per_iteration": 2.671543836593628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084341, + "balance_loss_mlp": 1.05468178, + "epoch": 0.5694497883801463, + "flos": 454247585280.0, + "grad_norm": 0.06459683266000829, + "language_loss": 0.79222417, + "learning_rate": 0.0004122897143771723, + "loss": 0.80306756, + "num_input_tokens_seen": 246676496, + "router_z_loss_mlp": 0.29663086, + "step": 2960, + "time_per_iteration": 2.5457372665405273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087464, + "balance_loss_mlp": 1.05713725, + "epoch": 0.5696421700654097, + "flos": 559516999680.0, + "grad_norm": 0.057309213891239566, + "language_loss": 0.81961918, + "learning_rate": 0.0004119830200168109, + "loss": 0.83049381, + "num_input_tokens_seen": 246746464, + "router_z_loss_mlp": 0.30297852, + "step": 2961, + "time_per_iteration": 2.66658091545105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080369, + "balance_loss_mlp": 1.05180621, + "epoch": 0.5698345517506733, + "flos": 465551649792.0, + "grad_norm": 0.0578679247611712, + "language_loss": 0.88614476, + "learning_rate": 0.0004116763598276714, + "loss": 0.89694846, + "num_input_tokens_seen": 246811808, + "router_z_loss_mlp": 0.28564453, + "step": 2962, + "time_per_iteration": 2.5355417728424072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083269, + "balance_loss_mlp": 1.05394387, + "epoch": 0.5700269334359369, + "flos": 605953446912.0, + "grad_norm": 0.05524318032555551, + "language_loss": 0.81030452, + "learning_rate": 0.00041136973392881017, + "loss": 0.82113719, + "num_input_tokens_seen": 246890432, + "router_z_loss_mlp": 0.29345703, + "step": 2963, + "time_per_iteration": 2.8497612476348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085542, + "balance_loss_mlp": 1.05540633, + "epoch": 0.5702193151212005, + "flos": 562709182464.0, + "grad_norm": 0.06477225886122127, + "language_loss": 0.82179135, + "learning_rate": 0.00041106314243926983, + "loss": 0.83264679, + "num_input_tokens_seen": 246959616, + "router_z_loss_mlp": 0.30102539, + "step": 2964, + "time_per_iteration": 2.735269069671631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080389, + "balance_loss_mlp": 1.05103993, + "epoch": 0.570411696806464, + "flos": 523247745024.0, + "grad_norm": 0.05516182837620622, + "language_loss": 0.87329233, + "learning_rate": 0.0004107565854780798, + "loss": 0.88409621, + "num_input_tokens_seen": 247030656, + "router_z_loss_mlp": 0.29296875, + "step": 2965, + "time_per_iteration": 2.6157355308532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085364, + "balance_loss_mlp": 1.05596685, + "epoch": 0.5706040784917276, + "flos": 718222837248.0, + "grad_norm": 0.07414316825555053, + "language_loss": 0.81466991, + "learning_rate": 0.000410450063164256, + "loss": 0.82552361, + "num_input_tokens_seen": 247105872, + "router_z_loss_mlp": 0.29370117, + "step": 2966, + "time_per_iteration": 2.8378820419311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083362, + "balance_loss_mlp": 1.05291581, + "epoch": 0.5707964601769911, + "flos": 476707410432.0, + "grad_norm": 0.06746080357230834, + "language_loss": 0.82004952, + "learning_rate": 0.00041014357561680115, + "loss": 0.83088315, + "num_input_tokens_seen": 247170448, + "router_z_loss_mlp": 0.30395508, + "step": 2967, + "time_per_iteration": 2.51119065284729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085478, + "balance_loss_mlp": 1.05519855, + "epoch": 0.5709888418622547, + "flos": 580101378048.0, + "grad_norm": 0.053142332405834165, + "language_loss": 0.86128843, + "learning_rate": 0.0004098371229547039, + "loss": 0.87214315, + "num_input_tokens_seen": 247240400, + "router_z_loss_mlp": 0.30249023, + "step": 2968, + "time_per_iteration": 2.6994621753692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022253, + "balance_loss_mlp": 1.01390862, + "epoch": 0.5711812235475183, + "flos": 1579922910720.0, + "grad_norm": 0.025900339106917806, + "language_loss": 0.80010808, + "learning_rate": 0.0004095307052969399, + "loss": 0.81033063, + "num_input_tokens_seen": 247469136, + "router_z_loss_mlp": 0.08349609, + "step": 2969, + "time_per_iteration": 4.718291997909546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092241, + "balance_loss_mlp": 1.06179523, + "epoch": 0.5713736052327818, + "flos": 468506695680.0, + "grad_norm": 0.05366083523781242, + "language_loss": 0.80585647, + "learning_rate": 0.00040922432276247107, + "loss": 0.8167789, + "num_input_tokens_seen": 247537712, + "router_z_loss_mlp": 0.30419922, + "step": 2970, + "time_per_iteration": 2.55259108543396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091948, + "balance_loss_mlp": 1.0609777, + "epoch": 0.5715659869180454, + "flos": 537662499840.0, + "grad_norm": 0.049420251796361614, + "language_loss": 0.84874177, + "learning_rate": 0.0004089179754702457, + "loss": 0.85966122, + "num_input_tokens_seen": 247613872, + "router_z_loss_mlp": 0.30932617, + "step": 2971, + "time_per_iteration": 2.771068572998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109211, + "balance_loss_mlp": 1.06090152, + "epoch": 0.571758368603309, + "flos": 656071778304.0, + "grad_norm": 0.06283275659801735, + "language_loss": 0.7981565, + "learning_rate": 0.00040861166353919843, + "loss": 0.80907762, + "num_input_tokens_seen": 247686064, + "router_z_loss_mlp": 0.31176758, + "step": 2972, + "time_per_iteration": 2.7827725410461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091117, + "balance_loss_mlp": 1.06069493, + "epoch": 0.5719507502885726, + "flos": 667907016192.0, + "grad_norm": 0.06507135137823726, + "language_loss": 0.818784, + "learning_rate": 0.00040830538708824983, + "loss": 0.82969517, + "num_input_tokens_seen": 247760384, + "router_z_loss_mlp": 0.30395508, + "step": 2973, + "time_per_iteration": 2.845456600189209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108959, + "balance_loss_mlp": 1.05966854, + "epoch": 0.572143131973836, + "flos": 476321969664.0, + "grad_norm": 0.07493195148818688, + "language_loss": 0.81968939, + "learning_rate": 0.000407999146236307, + "loss": 0.8305853, + "num_input_tokens_seen": 247824768, + "router_z_loss_mlp": 0.29882812, + "step": 2974, + "time_per_iteration": 2.531430244445801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093173, + "balance_loss_mlp": 1.06284618, + "epoch": 0.5723355136590996, + "flos": 539510782464.0, + "grad_norm": 0.06121365308687838, + "language_loss": 0.8362776, + "learning_rate": 0.0004076929411022634, + "loss": 0.84720927, + "num_input_tokens_seen": 247894448, + "router_z_loss_mlp": 0.30322266, + "step": 2975, + "time_per_iteration": 2.645341634750366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096437, + "balance_loss_mlp": 1.06591964, + "epoch": 0.5725278953443632, + "flos": 824156674560.0, + "grad_norm": 0.05509159729755976, + "language_loss": 0.79606473, + "learning_rate": 0.0004073867718049982, + "loss": 0.80702913, + "num_input_tokens_seen": 247976432, + "router_z_loss_mlp": 0.30493164, + "step": 2976, + "time_per_iteration": 3.085145950317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102446, + "balance_loss_mlp": 1.07137978, + "epoch": 0.5727202770296268, + "flos": 587437235712.0, + "grad_norm": 0.06232705756749319, + "language_loss": 0.82691067, + "learning_rate": 0.00040708063846337704, + "loss": 0.83793509, + "num_input_tokens_seen": 248048800, + "router_z_loss_mlp": 0.31054688, + "step": 2977, + "time_per_iteration": 2.738443613052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099181, + "balance_loss_mlp": 1.06813931, + "epoch": 0.5729126587148904, + "flos": 446966055936.0, + "grad_norm": 0.061703964741206326, + "language_loss": 0.81214464, + "learning_rate": 0.00040677454119625143, + "loss": 0.82313639, + "num_input_tokens_seen": 248116496, + "router_z_loss_mlp": 0.31005859, + "step": 2978, + "time_per_iteration": 2.6232175827026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108887, + "balance_loss_mlp": 1.07758296, + "epoch": 0.5731050404001539, + "flos": 519457577472.0, + "grad_norm": 0.07073355508195153, + "language_loss": 0.83247018, + "learning_rate": 0.0004064684801224587, + "loss": 0.84355903, + "num_input_tokens_seen": 248184960, + "router_z_loss_mlp": 0.31274414, + "step": 2979, + "time_per_iteration": 2.577918767929077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101163, + "balance_loss_mlp": 1.07085991, + "epoch": 0.5732974220854175, + "flos": 504775950336.0, + "grad_norm": 0.05699497583041508, + "language_loss": 0.80492741, + "learning_rate": 0.00040616245536082224, + "loss": 0.81593907, + "num_input_tokens_seen": 248252208, + "router_z_loss_mlp": 0.30273438, + "step": 2980, + "time_per_iteration": 2.6298904418945312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101523, + "balance_loss_mlp": 1.07167256, + "epoch": 0.573489803770681, + "flos": 592485041664.0, + "grad_norm": 0.04979780559516064, + "language_loss": 0.81357765, + "learning_rate": 0.00040585646703015165, + "loss": 0.82459289, + "num_input_tokens_seen": 248333312, + "router_z_loss_mlp": 0.29833984, + "step": 2981, + "time_per_iteration": 2.8170647621154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102826, + "balance_loss_mlp": 1.07118809, + "epoch": 0.5736821854559446, + "flos": 489911514624.0, + "grad_norm": 0.07486213422343042, + "language_loss": 0.78689104, + "learning_rate": 0.0004055505152492419, + "loss": 0.79791927, + "num_input_tokens_seen": 248403808, + "router_z_loss_mlp": 0.31616211, + "step": 2982, + "time_per_iteration": 2.6379241943359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100032, + "balance_loss_mlp": 1.06825066, + "epoch": 0.5738745671412081, + "flos": 458156321280.0, + "grad_norm": 0.05681665302183781, + "language_loss": 0.74231875, + "learning_rate": 0.00040524460013687425, + "loss": 0.75331908, + "num_input_tokens_seen": 248477184, + "router_z_loss_mlp": 0.31762695, + "step": 2983, + "time_per_iteration": 2.7545318603515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097699, + "balance_loss_mlp": 1.0663712, + "epoch": 0.5740669488264717, + "flos": 580333372416.0, + "grad_norm": 0.04476617807489617, + "language_loss": 0.81250238, + "learning_rate": 0.0004049387218118155, + "loss": 0.82347941, + "num_input_tokens_seen": 248565552, + "router_z_loss_mlp": 0.31298828, + "step": 2984, + "time_per_iteration": 2.9756665229797363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108902, + "balance_loss_mlp": 1.05816841, + "epoch": 0.5742593305117353, + "flos": 524438572032.0, + "grad_norm": 0.07928255171477795, + "language_loss": 0.85245347, + "learning_rate": 0.00040463288039281777, + "loss": 0.8633436, + "num_input_tokens_seen": 248635456, + "router_z_loss_mlp": 0.30810547, + "step": 2985, + "time_per_iteration": 2.706669807434082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034493, + "balance_loss_mlp": 1.02681565, + "epoch": 0.5744517121969989, + "flos": 1553877748224.0, + "grad_norm": 0.02538869827055974, + "language_loss": 0.77876419, + "learning_rate": 0.0004043270759986194, + "loss": 0.78910911, + "num_input_tokens_seen": 248870160, + "router_z_loss_mlp": 0.07666016, + "step": 2986, + "time_per_iteration": 4.949368953704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108476, + "balance_loss_mlp": 1.05462396, + "epoch": 0.5746440938822625, + "flos": 751919915520.0, + "grad_norm": 0.060127228305881374, + "language_loss": 0.82366645, + "learning_rate": 0.0004040213087479444, + "loss": 0.83451408, + "num_input_tokens_seen": 248946960, + "router_z_loss_mlp": 0.30102539, + "step": 2987, + "time_per_iteration": 2.9205455780029297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086178, + "balance_loss_mlp": 1.05723405, + "epoch": 0.5748364755675259, + "flos": 501865320960.0, + "grad_norm": 0.05965622667733625, + "language_loss": 0.85328299, + "learning_rate": 0.0004037155787595018, + "loss": 0.86414474, + "num_input_tokens_seen": 249014128, + "router_z_loss_mlp": 0.2890625, + "step": 2988, + "time_per_iteration": 2.574509859085083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088474, + "balance_loss_mlp": 1.0593158, + "epoch": 0.5750288572527895, + "flos": 504044342784.0, + "grad_norm": 0.05784717048255493, + "language_loss": 0.80869853, + "learning_rate": 0.000403409886151987, + "loss": 0.8195833, + "num_input_tokens_seen": 249090016, + "router_z_loss_mlp": 0.29125977, + "step": 2989, + "time_per_iteration": 2.945080041885376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016432, + "balance_loss_mlp": 1.00894582, + "epoch": 0.5752212389380531, + "flos": 1541365604352.0, + "grad_norm": 0.009946927491071988, + "language_loss": 0.81999105, + "learning_rate": 0.0004031042310440799, + "loss": 0.83015537, + "num_input_tokens_seen": 249305552, + "router_z_loss_mlp": 0.07470703, + "step": 2990, + "time_per_iteration": 4.807205677032471 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015586, + "balance_loss_mlp": 1.00809932, + "epoch": 0.5754136206233167, + "flos": 1567331472384.0, + "grad_norm": 0.009078458393910433, + "language_loss": 0.781986, + "learning_rate": 0.00040279861355444656, + "loss": 0.79214191, + "num_input_tokens_seen": 249523408, + "router_z_loss_mlp": 0.07470703, + "step": 2991, + "time_per_iteration": 4.805190563201904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084133, + "balance_loss_mlp": 1.05380619, + "epoch": 0.5756060023085803, + "flos": 798156301824.0, + "grad_norm": 0.05637441563568418, + "language_loss": 0.76644433, + "learning_rate": 0.00040249303380173807, + "loss": 0.77728564, + "num_input_tokens_seen": 249616624, + "router_z_loss_mlp": 0.30322266, + "step": 2992, + "time_per_iteration": 3.049729108810425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090631, + "balance_loss_mlp": 1.05780125, + "epoch": 0.5757983839938438, + "flos": 587877004800.0, + "grad_norm": 0.06616333205601678, + "language_loss": 0.79290402, + "learning_rate": 0.00040218749190459126, + "loss": 0.80381036, + "num_input_tokens_seen": 249689936, + "router_z_loss_mlp": 0.32836914, + "step": 2993, + "time_per_iteration": 2.7314000129699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087925, + "balance_loss_mlp": 1.05795622, + "epoch": 0.5759907656791073, + "flos": 516831072768.0, + "grad_norm": 0.06422497492556134, + "language_loss": 0.82827115, + "learning_rate": 0.00040188198798162775, + "loss": 0.83915043, + "num_input_tokens_seen": 249759984, + "router_z_loss_mlp": 0.29956055, + "step": 2994, + "time_per_iteration": 2.605794668197632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089955, + "balance_loss_mlp": 1.06022453, + "epoch": 0.5761831473643709, + "flos": 587133287424.0, + "grad_norm": 0.05264744908201922, + "language_loss": 0.85358101, + "learning_rate": 0.000401576522151455, + "loss": 0.8644805, + "num_input_tokens_seen": 249837888, + "router_z_loss_mlp": 0.29711914, + "step": 2995, + "time_per_iteration": 2.8504650592803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085925, + "balance_loss_mlp": 1.05664682, + "epoch": 0.5763755290496345, + "flos": 543896363520.0, + "grad_norm": 0.05051873290535222, + "language_loss": 0.83133811, + "learning_rate": 0.0004012710945326651, + "loss": 0.8421973, + "num_input_tokens_seen": 249913584, + "router_z_loss_mlp": 0.29248047, + "step": 2996, + "time_per_iteration": 2.7823193073272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094348, + "balance_loss_mlp": 1.06545174, + "epoch": 0.576567910734898, + "flos": 626229107712.0, + "grad_norm": 0.0711371625716349, + "language_loss": 0.81093514, + "learning_rate": 0.0004009657052438355, + "loss": 0.82187867, + "num_input_tokens_seen": 249992144, + "router_z_loss_mlp": 0.28881836, + "step": 2997, + "time_per_iteration": 2.7743020057678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091289, + "balance_loss_mlp": 1.06184435, + "epoch": 0.5767602924201616, + "flos": 538243232256.0, + "grad_norm": 0.06367440987852575, + "language_loss": 0.85650682, + "learning_rate": 0.00040066035440352904, + "loss": 0.86741972, + "num_input_tokens_seen": 250060736, + "router_z_loss_mlp": 0.29418945, + "step": 2998, + "time_per_iteration": 2.6359331607818604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014946, + "balance_loss_mlp": 1.0071255, + "epoch": 0.5769526741054252, + "flos": 1559778301440.0, + "grad_norm": 0.01828635150904939, + "language_loss": 0.79293132, + "learning_rate": 0.0004003550421302934, + "loss": 0.8030808, + "num_input_tokens_seen": 250296864, + "router_z_loss_mlp": 0.078125, + "step": 2999, + "time_per_iteration": 4.881432056427002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104047, + "balance_loss_mlp": 1.07417345, + "epoch": 0.5771450557906888, + "flos": 468185495040.0, + "grad_norm": 0.0709915390631299, + "language_loss": 0.76451176, + "learning_rate": 0.00040004976854266145, + "loss": 0.77555221, + "num_input_tokens_seen": 250362528, + "router_z_loss_mlp": 0.2980957, + "step": 3000, + "time_per_iteration": 2.5374131202697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101005, + "balance_loss_mlp": 1.07017779, + "epoch": 0.5773374374759523, + "flos": 574556903424.0, + "grad_norm": 0.051209677129469174, + "language_loss": 0.81337965, + "learning_rate": 0.0003997445337591505, + "loss": 0.8243897, + "num_input_tokens_seen": 250432768, + "router_z_loss_mlp": 0.30810547, + "step": 3001, + "time_per_iteration": 2.647610902786255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102438, + "balance_loss_mlp": 1.07351804, + "epoch": 0.5775298191612158, + "flos": 528473590272.0, + "grad_norm": 0.0611265357111255, + "language_loss": 0.74261576, + "learning_rate": 0.0003994393378982635, + "loss": 0.75364017, + "num_input_tokens_seen": 250501504, + "router_z_loss_mlp": 0.28979492, + "step": 3002, + "time_per_iteration": 2.602245330810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013935, + "balance_loss_mlp": 1.00611448, + "epoch": 0.5777222008464794, + "flos": 1303919700480.0, + "grad_norm": 0.01032263408282017, + "language_loss": 0.79538, + "learning_rate": 0.00039913418107848786, + "loss": 0.80551934, + "num_input_tokens_seen": 250733632, + "router_z_loss_mlp": 0.078125, + "step": 3003, + "time_per_iteration": 4.818480968475342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104089, + "balance_loss_mlp": 1.07304692, + "epoch": 0.577914582531743, + "flos": 603633461760.0, + "grad_norm": 0.0604287320481862, + "language_loss": 0.88041145, + "learning_rate": 0.0003988290634182961, + "loss": 0.89145231, + "num_input_tokens_seen": 250809152, + "router_z_loss_mlp": 0.31005859, + "step": 3004, + "time_per_iteration": 2.7484169006347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102413, + "balance_loss_mlp": 1.07284904, + "epoch": 0.5781069642170066, + "flos": 486795681792.0, + "grad_norm": 0.06655998832299866, + "language_loss": 0.80592918, + "learning_rate": 0.0003985239850361453, + "loss": 0.81695324, + "num_input_tokens_seen": 250879152, + "router_z_loss_mlp": 0.29541016, + "step": 3005, + "time_per_iteration": 2.6148018836975098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102049, + "balance_loss_mlp": 1.07281876, + "epoch": 0.5782993459022701, + "flos": 506295318528.0, + "grad_norm": 0.0659443256400084, + "language_loss": 0.84734911, + "learning_rate": 0.0003982189460504777, + "loss": 0.85836959, + "num_input_tokens_seen": 250949904, + "router_z_loss_mlp": 0.29199219, + "step": 3006, + "time_per_iteration": 2.7011501789093018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105808, + "balance_loss_mlp": 1.07540917, + "epoch": 0.5784917275875336, + "flos": 602155938816.0, + "grad_norm": 0.06531961229333205, + "language_loss": 0.7939682, + "learning_rate": 0.00039791394657971935, + "loss": 0.80502629, + "num_input_tokens_seen": 251020976, + "router_z_loss_mlp": 0.30371094, + "step": 3007, + "time_per_iteration": 2.7082760334014893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102193, + "balance_loss_mlp": 1.07234263, + "epoch": 0.5786841092727972, + "flos": 521540425728.0, + "grad_norm": 0.06476760562978502, + "language_loss": 0.8421638, + "learning_rate": 0.00039760898674228205, + "loss": 0.85318571, + "num_input_tokens_seen": 251093280, + "router_z_loss_mlp": 0.29858398, + "step": 3008, + "time_per_iteration": 2.650878429412842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105056, + "balance_loss_mlp": 1.07475293, + "epoch": 0.5788764909580608, + "flos": 767404357632.0, + "grad_norm": 0.05525540739637584, + "language_loss": 0.80765337, + "learning_rate": 0.0003973040666565613, + "loss": 0.81870395, + "num_input_tokens_seen": 251181376, + "router_z_loss_mlp": 0.30273438, + "step": 3009, + "time_per_iteration": 3.1226985454559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100227, + "balance_loss_mlp": 1.07030547, + "epoch": 0.5790688726433244, + "flos": 599094434304.0, + "grad_norm": 0.06024611276807751, + "language_loss": 0.82195163, + "learning_rate": 0.000396999186440938, + "loss": 0.83295393, + "num_input_tokens_seen": 251256176, + "router_z_loss_mlp": 0.29882812, + "step": 3010, + "time_per_iteration": 2.844270944595337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096543, + "balance_loss_mlp": 1.06533396, + "epoch": 0.5792612543285879, + "flos": 523064936448.0, + "grad_norm": 0.06262665363935188, + "language_loss": 0.85208702, + "learning_rate": 0.000396694346213777, + "loss": 0.86305249, + "num_input_tokens_seen": 251325344, + "router_z_loss_mlp": 0.31176758, + "step": 3011, + "time_per_iteration": 2.613032817840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109368, + "balance_loss_mlp": 1.06492627, + "epoch": 0.5794536360138515, + "flos": 876557915136.0, + "grad_norm": 0.05937601459412264, + "language_loss": 0.83947617, + "learning_rate": 0.0003963895460934276, + "loss": 0.85041296, + "num_input_tokens_seen": 251406656, + "router_z_loss_mlp": 0.28735352, + "step": 3012, + "time_per_iteration": 3.124514102935791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091421, + "balance_loss_mlp": 1.05992579, + "epoch": 0.5796460176991151, + "flos": 401436311040.0, + "grad_norm": 0.07020347624432877, + "language_loss": 0.8493948, + "learning_rate": 0.00039608478619822376, + "loss": 0.86030906, + "num_input_tokens_seen": 251467760, + "router_z_loss_mlp": 0.31494141, + "step": 3013, + "time_per_iteration": 2.411346912384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084005, + "balance_loss_mlp": 1.05544281, + "epoch": 0.5798383993843786, + "flos": 618517721088.0, + "grad_norm": 0.05715104374994747, + "language_loss": 0.826662, + "learning_rate": 0.00039578006664648394, + "loss": 0.83750206, + "num_input_tokens_seen": 251542272, + "router_z_loss_mlp": 0.28564453, + "step": 3014, + "time_per_iteration": 2.7363553047180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082695, + "balance_loss_mlp": 1.05310702, + "epoch": 0.5800307810696421, + "flos": 844331019264.0, + "grad_norm": 0.06684904609650524, + "language_loss": 0.81588256, + "learning_rate": 0.0003954753875565105, + "loss": 0.82670951, + "num_input_tokens_seen": 251625584, + "router_z_loss_mlp": 0.2956543, + "step": 3015, + "time_per_iteration": 3.089769124984741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107747, + "balance_loss_mlp": 1.04890752, + "epoch": 0.5802231627549057, + "flos": 569276729856.0, + "grad_norm": 0.06478579772376787, + "language_loss": 0.82758343, + "learning_rate": 0.00039517074904659057, + "loss": 0.83835804, + "num_input_tokens_seen": 251696704, + "router_z_loss_mlp": 0.28564453, + "step": 3016, + "time_per_iteration": 2.7099101543426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084798, + "balance_loss_mlp": 1.05454302, + "epoch": 0.5804155444401693, + "flos": 660459930624.0, + "grad_norm": 0.05410468367994604, + "language_loss": 0.84939837, + "learning_rate": 0.00039486615123499535, + "loss": 0.8602463, + "num_input_tokens_seen": 251774784, + "router_z_loss_mlp": 0.30224609, + "step": 3017, + "time_per_iteration": 2.8504526615142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085654, + "balance_loss_mlp": 1.05532694, + "epoch": 0.5806079261254329, + "flos": 513992024064.0, + "grad_norm": 0.05526317953318916, + "language_loss": 0.85137427, + "learning_rate": 0.00039456159423997996, + "loss": 0.86223084, + "num_input_tokens_seen": 251844768, + "router_z_loss_mlp": 0.30297852, + "step": 3018, + "time_per_iteration": 2.633484363555908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082353, + "balance_loss_mlp": 1.0523833, + "epoch": 0.5808003078106965, + "flos": 528646487040.0, + "grad_norm": 0.07104600615119407, + "language_loss": 0.8999185, + "learning_rate": 0.00039425707817978406, + "loss": 0.91074204, + "num_input_tokens_seen": 251912736, + "router_z_loss_mlp": 0.29956055, + "step": 3019, + "time_per_iteration": 2.6299033164978027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082814, + "balance_loss_mlp": 1.05241609, + "epoch": 0.58099268949596, + "flos": 477028611072.0, + "grad_norm": 0.05724387536038855, + "language_loss": 0.83951199, + "learning_rate": 0.00039395260317263124, + "loss": 0.85034013, + "num_input_tokens_seen": 251979328, + "router_z_loss_mlp": 0.30395508, + "step": 3020, + "time_per_iteration": 2.5456759929656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080319, + "balance_loss_mlp": 1.04996824, + "epoch": 0.5811850711812235, + "flos": 517609294848.0, + "grad_norm": 0.07612516842687451, + "language_loss": 0.85048491, + "learning_rate": 0.0003936481693367291, + "loss": 0.86128807, + "num_input_tokens_seen": 252050928, + "router_z_loss_mlp": 0.3034668, + "step": 3021, + "time_per_iteration": 2.7192864418029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094567, + "balance_loss_mlp": 1.06259549, + "epoch": 0.5813774528664871, + "flos": 616422389760.0, + "grad_norm": 0.08707963459833061, + "language_loss": 0.882092, + "learning_rate": 0.0003933437767902697, + "loss": 0.89303768, + "num_input_tokens_seen": 252126496, + "router_z_loss_mlp": 0.31958008, + "step": 3022, + "time_per_iteration": 2.7938294410705566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088206, + "balance_loss_mlp": 1.05792677, + "epoch": 0.5815698345517507, + "flos": 567475435008.0, + "grad_norm": 0.07541432505918821, + "language_loss": 0.7834546, + "learning_rate": 0.00039303942565142825, + "loss": 0.79433668, + "num_input_tokens_seen": 252203008, + "router_z_loss_mlp": 0.30249023, + "step": 3023, + "time_per_iteration": 2.7417471408843994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091198, + "balance_loss_mlp": 1.06089532, + "epoch": 0.5817622162370142, + "flos": 563168775168.0, + "grad_norm": 0.05482425315239383, + "language_loss": 0.76731157, + "learning_rate": 0.0003927351160383644, + "loss": 0.77822357, + "num_input_tokens_seen": 252283440, + "router_z_loss_mlp": 0.30249023, + "step": 3024, + "time_per_iteration": 2.804474353790283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091546, + "balance_loss_mlp": 1.06193483, + "epoch": 0.5819545979222778, + "flos": 459216470016.0, + "grad_norm": 0.05202928961884776, + "language_loss": 0.77983212, + "learning_rate": 0.000392430848069222, + "loss": 0.79074758, + "num_input_tokens_seen": 252351760, + "router_z_loss_mlp": 0.29589844, + "step": 3025, + "time_per_iteration": 2.530200958251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097141, + "balance_loss_mlp": 1.06814933, + "epoch": 0.5821469796075414, + "flos": 541475062272.0, + "grad_norm": 0.058580785743037773, + "language_loss": 0.82503867, + "learning_rate": 0.00039212662186212795, + "loss": 0.8360101, + "num_input_tokens_seen": 252418480, + "router_z_loss_mlp": 0.28979492, + "step": 3026, + "time_per_iteration": 2.592423677444458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094902, + "balance_loss_mlp": 1.06676841, + "epoch": 0.582339361292805, + "flos": 552262634496.0, + "grad_norm": 0.04855017878997747, + "language_loss": 0.7719928, + "learning_rate": 0.0003918224375351934, + "loss": 0.78294182, + "num_input_tokens_seen": 252493712, + "router_z_loss_mlp": 0.28149414, + "step": 3027, + "time_per_iteration": 2.7347710132598877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101546, + "balance_loss_mlp": 1.0716958, + "epoch": 0.5825317429780685, + "flos": 496399767552.0, + "grad_norm": 0.05175541468331668, + "language_loss": 0.7881335, + "learning_rate": 0.0003915182952065135, + "loss": 0.79914892, + "num_input_tokens_seen": 252566096, + "router_z_loss_mlp": 0.29858398, + "step": 3028, + "time_per_iteration": 2.698678493499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105793, + "balance_loss_mlp": 1.07684946, + "epoch": 0.582724124663332, + "flos": 564162112512.0, + "grad_norm": 0.051679899573834884, + "language_loss": 0.87814313, + "learning_rate": 0.0003912141949941664, + "loss": 0.88920105, + "num_input_tokens_seen": 252639424, + "router_z_loss_mlp": 0.2890625, + "step": 3029, + "time_per_iteration": 2.703824520111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107968, + "balance_loss_mlp": 1.07675922, + "epoch": 0.5829165063485956, + "flos": 492132754944.0, + "grad_norm": 0.07311487113166662, + "language_loss": 0.82985795, + "learning_rate": 0.0003909101370162143, + "loss": 0.84093761, + "num_input_tokens_seen": 252706672, + "router_z_loss_mlp": 0.31201172, + "step": 3030, + "time_per_iteration": 2.601590633392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101355, + "balance_loss_mlp": 1.00611103, + "epoch": 0.5831088880338592, + "flos": 1528880997888.0, + "grad_norm": 0.01566462127280147, + "language_loss": 0.72433889, + "learning_rate": 0.00039060612139070326, + "loss": 0.73447442, + "num_input_tokens_seen": 252932464, + "router_z_loss_mlp": 0.07421875, + "step": 3031, + "time_per_iteration": 4.907916307449341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103812, + "balance_loss_mlp": 1.07403314, + "epoch": 0.5833012697191228, + "flos": 618011140608.0, + "grad_norm": 0.05748921462157389, + "language_loss": 0.8307178, + "learning_rate": 0.0003903021482356622, + "loss": 0.84175599, + "num_input_tokens_seen": 253011920, + "router_z_loss_mlp": 0.29760742, + "step": 3032, + "time_per_iteration": 2.8251240253448486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104484, + "balance_loss_mlp": 1.07525432, + "epoch": 0.5834936514043862, + "flos": 767920849920.0, + "grad_norm": 0.054780146703337314, + "language_loss": 0.82722723, + "learning_rate": 0.00038999821766910465, + "loss": 0.83827209, + "num_input_tokens_seen": 253091552, + "router_z_loss_mlp": 0.29248047, + "step": 3033, + "time_per_iteration": 2.9882729053497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108478, + "balance_loss_mlp": 1.07996285, + "epoch": 0.5836860330896498, + "flos": 458371436544.0, + "grad_norm": 0.08031037628307693, + "language_loss": 0.86154497, + "learning_rate": 0.00038969432980902606, + "loss": 0.87262976, + "num_input_tokens_seen": 253158608, + "router_z_loss_mlp": 0.28540039, + "step": 3034, + "time_per_iteration": 2.597313642501831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018692, + "balance_loss_mlp": 1.01149189, + "epoch": 0.5838784147749134, + "flos": 1361225585664.0, + "grad_norm": 0.013503469394203483, + "language_loss": 0.79784501, + "learning_rate": 0.0003893904847734068, + "loss": 0.80803192, + "num_input_tokens_seen": 253381184, + "router_z_loss_mlp": 0.07177734, + "step": 3035, + "time_per_iteration": 4.801652669906616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113026, + "balance_loss_mlp": 1.08374798, + "epoch": 0.584070796460177, + "flos": 567211133952.0, + "grad_norm": 0.0646542819028206, + "language_loss": 0.82506442, + "learning_rate": 0.00038908668268020953, + "loss": 0.83619463, + "num_input_tokens_seen": 253452880, + "router_z_loss_mlp": 0.29223633, + "step": 3036, + "time_per_iteration": 2.6857457160949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112518, + "balance_loss_mlp": 1.08381224, + "epoch": 0.5842631781454406, + "flos": 611483240448.0, + "grad_norm": 0.21422512196310703, + "language_loss": 0.85166728, + "learning_rate": 0.00038878292364738097, + "loss": 0.86279243, + "num_input_tokens_seen": 253530000, + "router_z_loss_mlp": 0.28662109, + "step": 3037, + "time_per_iteration": 2.776686191558838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106641, + "balance_loss_mlp": 1.07726789, + "epoch": 0.5844555598307041, + "flos": 463384737792.0, + "grad_norm": 0.0719771880124652, + "language_loss": 0.87355781, + "learning_rate": 0.0003884792077928508, + "loss": 0.88462424, + "num_input_tokens_seen": 253593504, + "router_z_loss_mlp": 0.29345703, + "step": 3038, + "time_per_iteration": 2.5682616233825684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102228, + "balance_loss_mlp": 1.07304573, + "epoch": 0.5846479415159677, + "flos": 410215186944.0, + "grad_norm": 0.06153670645771429, + "language_loss": 0.7661767, + "learning_rate": 0.0003881755352345322, + "loss": 0.77719897, + "num_input_tokens_seen": 253657904, + "router_z_loss_mlp": 0.29174805, + "step": 3039, + "time_per_iteration": 2.5531814098358154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104927, + "balance_loss_mlp": 1.07560194, + "epoch": 0.5848403232012312, + "flos": 491297633280.0, + "grad_norm": 0.05739173880603102, + "language_loss": 0.86896229, + "learning_rate": 0.0003878719060903207, + "loss": 0.88001162, + "num_input_tokens_seen": 253725280, + "router_z_loss_mlp": 0.29296875, + "step": 3040, + "time_per_iteration": 2.593386650085449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098868, + "balance_loss_mlp": 1.06908977, + "epoch": 0.5850327048864948, + "flos": 584417949696.0, + "grad_norm": 0.068924296543817, + "language_loss": 0.84256113, + "learning_rate": 0.0003875683204780961, + "loss": 0.85354984, + "num_input_tokens_seen": 253795040, + "router_z_loss_mlp": 0.29785156, + "step": 3041, + "time_per_iteration": 2.6921916007995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100828, + "balance_loss_mlp": 1.07145464, + "epoch": 0.5852250865717584, + "flos": 651545233920.0, + "grad_norm": 0.07404975426077917, + "language_loss": 0.85055083, + "learning_rate": 0.00038726477851572043, + "loss": 0.86155903, + "num_input_tokens_seen": 253866384, + "router_z_loss_mlp": 0.29394531, + "step": 3042, + "time_per_iteration": 2.76772403717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090937, + "balance_loss_mlp": 1.06249356, + "epoch": 0.5854174682570219, + "flos": 534588885504.0, + "grad_norm": 0.06423863125550561, + "language_loss": 0.80573255, + "learning_rate": 0.0003869612803210395, + "loss": 0.81664193, + "num_input_tokens_seen": 253935712, + "router_z_loss_mlp": 0.28442383, + "step": 3043, + "time_per_iteration": 2.6271820068359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092493, + "balance_loss_mlp": 1.06314421, + "epoch": 0.5856098499422855, + "flos": 509752175616.0, + "grad_norm": 0.07232729129784332, + "language_loss": 0.83455092, + "learning_rate": 0.0003866578260118817, + "loss": 0.84547591, + "num_input_tokens_seen": 254003152, + "router_z_loss_mlp": 0.29345703, + "step": 3044, + "time_per_iteration": 2.583698272705078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084561, + "balance_loss_mlp": 1.05616593, + "epoch": 0.5858022316275491, + "flos": 593893555200.0, + "grad_norm": 0.059856611418728146, + "language_loss": 0.83175647, + "learning_rate": 0.0003863544157060581, + "loss": 0.84260201, + "num_input_tokens_seen": 254072816, + "router_z_loss_mlp": 0.28369141, + "step": 3045, + "time_per_iteration": 2.656282663345337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090685, + "balance_loss_mlp": 1.06166923, + "epoch": 0.5859946133128127, + "flos": 559126416384.0, + "grad_norm": 0.05199684229497746, + "language_loss": 0.82254589, + "learning_rate": 0.0003860510495213634, + "loss": 0.8334527, + "num_input_tokens_seen": 254152800, + "router_z_loss_mlp": 0.28979492, + "step": 3046, + "time_per_iteration": 2.7998342514038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090034, + "balance_loss_mlp": 1.05946922, + "epoch": 0.5861869949980761, + "flos": 553695740928.0, + "grad_norm": 0.08208062967584176, + "language_loss": 0.78349328, + "learning_rate": 0.0003857477275755746, + "loss": 0.7943936, + "num_input_tokens_seen": 254224384, + "router_z_loss_mlp": 0.30517578, + "step": 3047, + "time_per_iteration": 2.6120448112487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088733, + "balance_loss_mlp": 1.05940795, + "epoch": 0.5863793766833397, + "flos": 718667375616.0, + "grad_norm": 0.0525859268526321, + "language_loss": 0.83988523, + "learning_rate": 0.00038544444998645167, + "loss": 0.8507725, + "num_input_tokens_seen": 254310960, + "router_z_loss_mlp": 0.29296875, + "step": 3048, + "time_per_iteration": 2.9847609996795654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085173, + "balance_loss_mlp": 1.0563724, + "epoch": 0.5865717583686033, + "flos": 472289522688.0, + "grad_norm": 0.06739730522499447, + "language_loss": 0.82059789, + "learning_rate": 0.00038514121687173767, + "loss": 0.83144969, + "num_input_tokens_seen": 254378336, + "router_z_loss_mlp": 0.28808594, + "step": 3049, + "time_per_iteration": 2.619170904159546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081984, + "balance_loss_mlp": 1.0529443, + "epoch": 0.5867641400538669, + "flos": 813482901504.0, + "grad_norm": 0.07072588382777995, + "language_loss": 0.82076973, + "learning_rate": 0.00038483802834915807, + "loss": 0.83158958, + "num_input_tokens_seen": 254454352, + "router_z_loss_mlp": 0.29003906, + "step": 3050, + "time_per_iteration": 2.9947521686553955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076603, + "balance_loss_mlp": 1.04742062, + "epoch": 0.5869565217391305, + "flos": 486531380736.0, + "grad_norm": 0.0556694240307722, + "language_loss": 0.7980268, + "learning_rate": 0.00038453488453642074, + "loss": 0.80879277, + "num_input_tokens_seen": 254526352, + "router_z_loss_mlp": 0.29174805, + "step": 3051, + "time_per_iteration": 2.659647226333618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081993, + "balance_loss_mlp": 1.05133235, + "epoch": 0.587148903424394, + "flos": 569385386496.0, + "grad_norm": 0.055022006168623364, + "language_loss": 0.8682425, + "learning_rate": 0.00038423178555121697, + "loss": 0.87906241, + "num_input_tokens_seen": 254598720, + "router_z_loss_mlp": 0.30664062, + "step": 3052, + "time_per_iteration": 2.682971954345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078016, + "balance_loss_mlp": 1.0489769, + "epoch": 0.5873412851096576, + "flos": 747296824320.0, + "grad_norm": 0.05776598371070369, + "language_loss": 0.85701603, + "learning_rate": 0.00038392873151121994, + "loss": 0.86779618, + "num_input_tokens_seen": 254683664, + "router_z_loss_mlp": 0.29052734, + "step": 3053, + "time_per_iteration": 3.060055732727051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077537, + "balance_loss_mlp": 1.04883146, + "epoch": 0.5875336667949211, + "flos": 528142477824.0, + "grad_norm": 0.06401371867882108, + "language_loss": 0.83262593, + "learning_rate": 0.0003836257225340859, + "loss": 0.84340131, + "num_input_tokens_seen": 254754688, + "router_z_loss_mlp": 0.28686523, + "step": 3054, + "time_per_iteration": 2.680649995803833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079048, + "balance_loss_mlp": 1.04853082, + "epoch": 0.5877260484801847, + "flos": 824166586368.0, + "grad_norm": 0.058869654242756926, + "language_loss": 0.82344568, + "learning_rate": 0.00038332275873745336, + "loss": 0.83423615, + "num_input_tokens_seen": 254838976, + "router_z_loss_mlp": 0.3046875, + "step": 3055, + "time_per_iteration": 3.036266565322876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108387, + "balance_loss_mlp": 1.05485463, + "epoch": 0.5879184301654482, + "flos": 591598162944.0, + "grad_norm": 0.05256953045681507, + "language_loss": 0.83349717, + "learning_rate": 0.0003830198402389431, + "loss": 0.84433585, + "num_input_tokens_seen": 254912912, + "router_z_loss_mlp": 0.2902832, + "step": 3056, + "time_per_iteration": 2.68835711479187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069074, + "balance_loss_mlp": 1.06163549, + "epoch": 0.5881108118507118, + "flos": 1545805513728.0, + "grad_norm": 0.04626706953255302, + "language_loss": 0.77348936, + "learning_rate": 0.0003827169671561585, + "loss": 0.78418016, + "num_input_tokens_seen": 255151488, + "router_z_loss_mlp": 0.07421875, + "step": 3057, + "time_per_iteration": 4.978636026382446 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082198, + "balance_loss_mlp": 1.05349255, + "epoch": 0.5883031935359754, + "flos": 489597654528.0, + "grad_norm": 0.07448060145489646, + "language_loss": 0.83136308, + "learning_rate": 0.0003824141396066855, + "loss": 0.84218502, + "num_input_tokens_seen": 255218896, + "router_z_loss_mlp": 0.28710938, + "step": 3058, + "time_per_iteration": 2.5531108379364014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088619, + "balance_loss_mlp": 1.05910254, + "epoch": 0.588495575221239, + "flos": 582836539392.0, + "grad_norm": 0.059082946906010764, + "language_loss": 0.82999164, + "learning_rate": 0.000382111357708092, + "loss": 0.84087777, + "num_input_tokens_seen": 255287408, + "router_z_loss_mlp": 0.29541016, + "step": 3059, + "time_per_iteration": 2.699920654296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088385, + "balance_loss_mlp": 1.05917883, + "epoch": 0.5886879569065026, + "flos": 661048003584.0, + "grad_norm": 0.071653907002528, + "language_loss": 0.84021831, + "learning_rate": 0.00038180862157792864, + "loss": 0.85110211, + "num_input_tokens_seen": 255358432, + "router_z_loss_mlp": 0.29174805, + "step": 3060, + "time_per_iteration": 2.8073549270629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084579, + "balance_loss_mlp": 1.05642152, + "epoch": 0.588880338591766, + "flos": 562657425408.0, + "grad_norm": 0.05679216094879844, + "language_loss": 0.82328987, + "learning_rate": 0.0003815059313337279, + "loss": 0.83413565, + "num_input_tokens_seen": 255425744, + "router_z_loss_mlp": 0.28198242, + "step": 3061, + "time_per_iteration": 2.6649534702301025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086963, + "balance_loss_mlp": 1.05906773, + "epoch": 0.5890727202770296, + "flos": 554730923520.0, + "grad_norm": 0.07322136366051005, + "language_loss": 0.78155029, + "learning_rate": 0.00038120328709300436, + "loss": 0.79241997, + "num_input_tokens_seen": 255505808, + "router_z_loss_mlp": 0.27905273, + "step": 3062, + "time_per_iteration": 2.9070422649383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091769, + "balance_loss_mlp": 1.06191885, + "epoch": 0.5892651019622932, + "flos": 655520781312.0, + "grad_norm": 0.07246450050077374, + "language_loss": 0.83913672, + "learning_rate": 0.0003809006889732549, + "loss": 0.85005438, + "num_input_tokens_seen": 255580160, + "router_z_loss_mlp": 0.29833984, + "step": 3063, + "time_per_iteration": 2.803724527359009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092624, + "balance_loss_mlp": 1.06420445, + "epoch": 0.5894574836475568, + "flos": 453202490880.0, + "grad_norm": 0.05969034427320992, + "language_loss": 0.88370293, + "learning_rate": 0.0003805981370919589, + "loss": 0.89462918, + "num_input_tokens_seen": 255644016, + "router_z_loss_mlp": 0.28442383, + "step": 3064, + "time_per_iteration": 2.495248556137085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086784, + "balance_loss_mlp": 1.05877018, + "epoch": 0.5896498653328203, + "flos": 519032489472.0, + "grad_norm": 0.05081424319280643, + "language_loss": 0.83982229, + "learning_rate": 0.0003802956315665771, + "loss": 0.85069013, + "num_input_tokens_seen": 255718192, + "router_z_loss_mlp": 0.28027344, + "step": 3065, + "time_per_iteration": 2.6511592864990234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091365, + "balance_loss_mlp": 1.06182539, + "epoch": 0.5898422470180839, + "flos": 549050628096.0, + "grad_norm": 0.06728201091458674, + "language_loss": 0.81791949, + "learning_rate": 0.0003799931725145529, + "loss": 0.8288331, + "num_input_tokens_seen": 255787696, + "router_z_loss_mlp": 0.29516602, + "step": 3066, + "time_per_iteration": 2.6066951751708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095665, + "balance_loss_mlp": 1.06729341, + "epoch": 0.5900346287033474, + "flos": 524312663040.0, + "grad_norm": 0.05193283223246739, + "language_loss": 0.86020327, + "learning_rate": 0.00037969076005331083, + "loss": 0.87115991, + "num_input_tokens_seen": 255862992, + "router_z_loss_mlp": 0.28369141, + "step": 3067, + "time_per_iteration": 2.763853073120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096344, + "balance_loss_mlp": 1.06713736, + "epoch": 0.590227010388611, + "flos": 567156805632.0, + "grad_norm": 0.05663918686290471, + "language_loss": 0.88129491, + "learning_rate": 0.00037938839430025817, + "loss": 0.89225829, + "num_input_tokens_seen": 255931872, + "router_z_loss_mlp": 0.29248047, + "step": 3068, + "time_per_iteration": 2.6258280277252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089417, + "balance_loss_mlp": 1.06092644, + "epoch": 0.5904193920738746, + "flos": 583333208064.0, + "grad_norm": 0.05275324094783275, + "language_loss": 0.85889924, + "learning_rate": 0.0003790860753727835, + "loss": 0.86979342, + "num_input_tokens_seen": 256004656, + "router_z_loss_mlp": 0.28491211, + "step": 3069, + "time_per_iteration": 2.7926387786865234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086892, + "balance_loss_mlp": 1.05799568, + "epoch": 0.5906117737591381, + "flos": 529701493248.0, + "grad_norm": 0.0573953914976859, + "language_loss": 0.8280952, + "learning_rate": 0.00037878380338825766, + "loss": 0.83896416, + "num_input_tokens_seen": 256076944, + "router_z_loss_mlp": 0.28881836, + "step": 3070, + "time_per_iteration": 2.6791534423828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089394, + "balance_loss_mlp": 1.06209493, + "epoch": 0.5908041554444017, + "flos": 684229151232.0, + "grad_norm": 0.054269754776710775, + "language_loss": 0.81082213, + "learning_rate": 0.00037848157846403287, + "loss": 0.82171613, + "num_input_tokens_seen": 256154768, + "router_z_loss_mlp": 0.2734375, + "step": 3071, + "time_per_iteration": 2.897139549255371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095364, + "balance_loss_mlp": 1.06792235, + "epoch": 0.5909965371296653, + "flos": 550001746944.0, + "grad_norm": 0.0725138562855444, + "language_loss": 0.83259237, + "learning_rate": 0.0003781794007174435, + "loss": 0.84354603, + "num_input_tokens_seen": 256230896, + "router_z_loss_mlp": 0.2746582, + "step": 3072, + "time_per_iteration": 2.724810838699341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0103656, + "balance_loss_mlp": 1.02988398, + "epoch": 0.5911889188149289, + "flos": 1492361750016.0, + "grad_norm": 0.01939748854391394, + "language_loss": 0.74074531, + "learning_rate": 0.0003778772702658051, + "loss": 0.75111091, + "num_input_tokens_seen": 256462336, + "router_z_loss_mlp": 0.06689453, + "step": 3073, + "time_per_iteration": 4.9330198764801025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090512, + "balance_loss_mlp": 1.06285512, + "epoch": 0.5913813005001923, + "flos": 487880423424.0, + "grad_norm": 0.048822002095482486, + "language_loss": 0.81208611, + "learning_rate": 0.0003775751872264152, + "loss": 0.82299125, + "num_input_tokens_seen": 256539376, + "router_z_loss_mlp": 0.27661133, + "step": 3074, + "time_per_iteration": 2.7631497383117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084721, + "balance_loss_mlp": 1.05599189, + "epoch": 0.5915736821854559, + "flos": 573331198464.0, + "grad_norm": 0.06348444489710649, + "language_loss": 0.86787391, + "learning_rate": 0.0003772731517165527, + "loss": 0.87872112, + "num_input_tokens_seen": 256617728, + "router_z_loss_mlp": 0.28710938, + "step": 3075, + "time_per_iteration": 2.7517099380493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089134, + "balance_loss_mlp": 1.06069052, + "epoch": 0.5917660638707195, + "flos": 789518389248.0, + "grad_norm": 0.059695821747375526, + "language_loss": 0.83545357, + "learning_rate": 0.0003769711638534784, + "loss": 0.84634489, + "num_input_tokens_seen": 256696032, + "router_z_loss_mlp": 0.28466797, + "step": 3076, + "time_per_iteration": 2.9352333545684814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090964, + "balance_loss_mlp": 1.06209183, + "epoch": 0.5919584455559831, + "flos": 528740462592.0, + "grad_norm": 0.08879190082108672, + "language_loss": 0.79118001, + "learning_rate": 0.00037666922375443446, + "loss": 0.80208963, + "num_input_tokens_seen": 256767360, + "router_z_loss_mlp": 0.28857422, + "step": 3077, + "time_per_iteration": 2.5947184562683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093967, + "balance_loss_mlp": 1.06578577, + "epoch": 0.5921508272412467, + "flos": 560606510592.0, + "grad_norm": 0.06374349472109522, + "language_loss": 0.81828058, + "learning_rate": 0.00037636733153664396, + "loss": 0.82922018, + "num_input_tokens_seen": 256844848, + "router_z_loss_mlp": 0.28149414, + "step": 3078, + "time_per_iteration": 2.8191051483154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109303, + "balance_loss_mlp": 1.0645864, + "epoch": 0.5923432089265102, + "flos": 563272662528.0, + "grad_norm": 0.06406278721713668, + "language_loss": 0.80298102, + "learning_rate": 0.0003760654873173124, + "loss": 0.81391132, + "num_input_tokens_seen": 256916688, + "router_z_loss_mlp": 0.28466797, + "step": 3079, + "time_per_iteration": 2.656822919845581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089541, + "balance_loss_mlp": 1.06081128, + "epoch": 0.5925355906117737, + "flos": 495740113920.0, + "grad_norm": 0.04854482848269962, + "language_loss": 0.82530022, + "learning_rate": 0.00037576369121362566, + "loss": 0.83619559, + "num_input_tokens_seen": 256985520, + "router_z_loss_mlp": 0.28759766, + "step": 3080, + "time_per_iteration": 2.589050531387329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097703, + "balance_loss_mlp": 1.06840181, + "epoch": 0.5927279722970373, + "flos": 566249730048.0, + "grad_norm": 0.05673956944694001, + "language_loss": 0.82090509, + "learning_rate": 0.0003754619433427516, + "loss": 0.83188212, + "num_input_tokens_seen": 257067552, + "router_z_loss_mlp": 0.29272461, + "step": 3081, + "time_per_iteration": 2.8826987743377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086639, + "balance_loss_mlp": 1.05845797, + "epoch": 0.5929203539823009, + "flos": 666970578432.0, + "grad_norm": 0.06493823771045844, + "language_loss": 0.78039849, + "learning_rate": 0.0003751602438218392, + "loss": 0.79126489, + "num_input_tokens_seen": 257138896, + "router_z_loss_mlp": 0.28222656, + "step": 3082, + "time_per_iteration": 2.815852642059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087731, + "balance_loss_mlp": 1.05952644, + "epoch": 0.5931127356675644, + "flos": 555744084480.0, + "grad_norm": 0.08102695368832301, + "language_loss": 0.83818078, + "learning_rate": 0.0003748585927680186, + "loss": 0.84905803, + "num_input_tokens_seen": 257210592, + "router_z_loss_mlp": 0.28198242, + "step": 3083, + "time_per_iteration": 2.6566061973571777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084121, + "balance_loss_mlp": 1.05651248, + "epoch": 0.593305117352828, + "flos": 535194210816.0, + "grad_norm": 0.0619003043193751, + "language_loss": 0.8314001, + "learning_rate": 0.00037455699029840086, + "loss": 0.84224129, + "num_input_tokens_seen": 257276208, + "router_z_loss_mlp": 0.27612305, + "step": 3084, + "time_per_iteration": 2.609382152557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081588, + "balance_loss_mlp": 1.05436099, + "epoch": 0.5934974990380916, + "flos": 593957795328.0, + "grad_norm": 0.05433571826648474, + "language_loss": 0.84684891, + "learning_rate": 0.0003742554365300787, + "loss": 0.85766476, + "num_input_tokens_seen": 257351920, + "router_z_loss_mlp": 0.27270508, + "step": 3085, + "time_per_iteration": 2.725409746170044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086595, + "balance_loss_mlp": 1.05927253, + "epoch": 0.5936898807233552, + "flos": 712673220096.0, + "grad_norm": 0.05832989485618193, + "language_loss": 0.79031849, + "learning_rate": 0.0003739539315801255, + "loss": 0.80118442, + "num_input_tokens_seen": 257430016, + "router_z_loss_mlp": 0.27331543, + "step": 3086, + "time_per_iteration": 2.9751360416412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092425, + "balance_loss_mlp": 1.06493533, + "epoch": 0.5938822624086187, + "flos": 391896465408.0, + "grad_norm": 0.05988774460659005, + "language_loss": 0.9182803, + "learning_rate": 0.000373652475565596, + "loss": 0.92920458, + "num_input_tokens_seen": 257492224, + "router_z_loss_mlp": 0.27490234, + "step": 3087, + "time_per_iteration": 2.535181999206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090266, + "balance_loss_mlp": 1.06144142, + "epoch": 0.5940746440938822, + "flos": 480285033984.0, + "grad_norm": 0.07303028521355714, + "language_loss": 0.81608456, + "learning_rate": 0.00037335106860352587, + "loss": 0.82698727, + "num_input_tokens_seen": 257567824, + "router_z_loss_mlp": 0.28808594, + "step": 3088, + "time_per_iteration": 2.6671407222747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094636, + "balance_loss_mlp": 1.06545377, + "epoch": 0.5942670257791458, + "flos": 483336626688.0, + "grad_norm": 0.0577260245362681, + "language_loss": 0.83174306, + "learning_rate": 0.00037304971081093146, + "loss": 0.84268945, + "num_input_tokens_seen": 257635488, + "router_z_loss_mlp": 0.29199219, + "step": 3089, + "time_per_iteration": 2.5568172931671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091399, + "balance_loss_mlp": 1.06479192, + "epoch": 0.5944594074644094, + "flos": 547936151040.0, + "grad_norm": 0.05440667028717182, + "language_loss": 0.80792761, + "learning_rate": 0.00037274840230481024, + "loss": 0.81884158, + "num_input_tokens_seen": 257709552, + "router_z_loss_mlp": 0.26635742, + "step": 3090, + "time_per_iteration": 2.7040512561798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089877, + "balance_loss_mlp": 1.06152868, + "epoch": 0.594651789149673, + "flos": 449179955712.0, + "grad_norm": 0.07197994401815008, + "language_loss": 0.79483205, + "learning_rate": 0.00037244714320214077, + "loss": 0.80573082, + "num_input_tokens_seen": 257775520, + "router_z_loss_mlp": 0.28369141, + "step": 3091, + "time_per_iteration": 2.527803659439087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091312, + "balance_loss_mlp": 1.06317902, + "epoch": 0.5948441708349365, + "flos": 596267868672.0, + "grad_norm": 0.06270949928795992, + "language_loss": 0.83166003, + "learning_rate": 0.000372145933619882, + "loss": 0.84257317, + "num_input_tokens_seen": 257858560, + "router_z_loss_mlp": 0.28137207, + "step": 3092, + "time_per_iteration": 2.869267225265503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092404, + "balance_loss_mlp": 1.06455636, + "epoch": 0.5950365525202, + "flos": 548516883456.0, + "grad_norm": 0.059066436199884755, + "language_loss": 0.82841283, + "learning_rate": 0.000371844773674974, + "loss": 0.83933693, + "num_input_tokens_seen": 257928048, + "router_z_loss_mlp": 0.27856445, + "step": 3093, + "time_per_iteration": 2.6301257610321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097042, + "balance_loss_mlp": 1.06793106, + "epoch": 0.5952289342054636, + "flos": 654700340736.0, + "grad_norm": 0.06442112613973276, + "language_loss": 0.82118666, + "learning_rate": 0.0003715436634843375, + "loss": 0.83215708, + "num_input_tokens_seen": 258003088, + "router_z_loss_mlp": 0.29101562, + "step": 3094, + "time_per_iteration": 2.8569583892822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091347, + "balance_loss_mlp": 1.06466842, + "epoch": 0.5954213158907272, + "flos": 603364018176.0, + "grad_norm": 0.04641072419683149, + "language_loss": 0.80758119, + "learning_rate": 0.00037124260316487355, + "loss": 0.81849468, + "num_input_tokens_seen": 258084880, + "router_z_loss_mlp": 0.26708984, + "step": 3095, + "time_per_iteration": 2.8417470455169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095419, + "balance_loss_mlp": 1.06838274, + "epoch": 0.5956136975759908, + "flos": 486331319808.0, + "grad_norm": 0.05475651988922655, + "language_loss": 0.89790189, + "learning_rate": 0.0003709415928334643, + "loss": 0.90885603, + "num_input_tokens_seen": 258152032, + "router_z_loss_mlp": 0.27075195, + "step": 3096, + "time_per_iteration": 2.5519328117370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092318, + "balance_loss_mlp": 1.06382728, + "epoch": 0.5958060792612543, + "flos": 658777204224.0, + "grad_norm": 0.09831894239475095, + "language_loss": 0.80721879, + "learning_rate": 0.00037064063260697233, + "loss": 0.818142, + "num_input_tokens_seen": 258228896, + "router_z_loss_mlp": 0.28491211, + "step": 3097, + "time_per_iteration": 2.8612656593322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099184, + "balance_loss_mlp": 1.07157493, + "epoch": 0.5959984609465179, + "flos": 723559537152.0, + "grad_norm": 0.058836420710008684, + "language_loss": 0.78798771, + "learning_rate": 0.0003703397226022407, + "loss": 0.79897952, + "num_input_tokens_seen": 258311152, + "router_z_loss_mlp": 0.27612305, + "step": 3098, + "time_per_iteration": 3.069542169570923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039258, + "balance_loss_mlp": 1.03243947, + "epoch": 0.5961908426317815, + "flos": 1519849557504.0, + "grad_norm": 0.024027627375554906, + "language_loss": 0.75499874, + "learning_rate": 0.00037003886293609335, + "loss": 0.76539135, + "num_input_tokens_seen": 258540656, + "router_z_loss_mlp": 0.06835938, + "step": 3099, + "time_per_iteration": 4.940065860748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109756, + "balance_loss_mlp": 1.06966519, + "epoch": 0.596383224317045, + "flos": 532614693888.0, + "grad_norm": 0.059128365336986094, + "language_loss": 0.83247489, + "learning_rate": 0.0003697380537253339, + "loss": 0.84345049, + "num_input_tokens_seen": 258608960, + "router_z_loss_mlp": 0.27929688, + "step": 3100, + "time_per_iteration": 2.638352632522583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098347, + "balance_loss_mlp": 1.06973624, + "epoch": 0.5965756060023086, + "flos": 591210150912.0, + "grad_norm": 0.05513129923941457, + "language_loss": 0.82084006, + "learning_rate": 0.0003694372950867471, + "loss": 0.83182353, + "num_input_tokens_seen": 258684304, + "router_z_loss_mlp": 0.28637695, + "step": 3101, + "time_per_iteration": 2.7355875968933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101199, + "balance_loss_mlp": 1.07282722, + "epoch": 0.5967679876875721, + "flos": 862054327296.0, + "grad_norm": 0.05863829677079808, + "language_loss": 0.77766848, + "learning_rate": 0.0003691365871370976, + "loss": 0.78868043, + "num_input_tokens_seen": 258769472, + "router_z_loss_mlp": 0.28393555, + "step": 3102, + "time_per_iteration": 3.0227084159851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110316, + "balance_loss_mlp": 1.07533622, + "epoch": 0.5969603693728357, + "flos": 553834132992.0, + "grad_norm": 0.06404713166930852, + "language_loss": 0.85323572, + "learning_rate": 0.00036883592999313093, + "loss": 0.86426735, + "num_input_tokens_seen": 258841696, + "router_z_loss_mlp": 0.27832031, + "step": 3103, + "time_per_iteration": 2.659637689590454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097572, + "balance_loss_mlp": 1.0700587, + "epoch": 0.5971527510580993, + "flos": 718662606336.0, + "grad_norm": 0.05340010645713243, + "language_loss": 0.79008019, + "learning_rate": 0.0003685353237715722, + "loss": 0.80105591, + "num_input_tokens_seen": 258915616, + "router_z_loss_mlp": 0.27563477, + "step": 3104, + "time_per_iteration": 2.9019625186920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109062, + "balance_loss_mlp": 1.06272471, + "epoch": 0.5973451327433629, + "flos": 647631355392.0, + "grad_norm": 0.053396202956180965, + "language_loss": 0.81746447, + "learning_rate": 0.0003682347685891274, + "loss": 0.82837057, + "num_input_tokens_seen": 258994080, + "router_z_loss_mlp": 0.27893066, + "step": 3105, + "time_per_iteration": 2.8479247093200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093319, + "balance_loss_mlp": 1.06535256, + "epoch": 0.5975375144286263, + "flos": 721716397056.0, + "grad_norm": 0.061940030050424234, + "language_loss": 0.80626607, + "learning_rate": 0.0003679342645624822, + "loss": 0.81719923, + "num_input_tokens_seen": 259075968, + "router_z_loss_mlp": 0.2800293, + "step": 3106, + "time_per_iteration": 2.988600015640259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088181, + "balance_loss_mlp": 1.06088209, + "epoch": 0.5977298961138899, + "flos": 750961082880.0, + "grad_norm": 0.06552701347411696, + "language_loss": 0.82154477, + "learning_rate": 0.0003676338118083025, + "loss": 0.83242655, + "num_input_tokens_seen": 259162512, + "router_z_loss_mlp": 0.2734375, + "step": 3107, + "time_per_iteration": 3.0211057662963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091455, + "balance_loss_mlp": 1.06372714, + "epoch": 0.5979222777991535, + "flos": 530961702912.0, + "grad_norm": 0.05808577111452716, + "language_loss": 0.79585344, + "learning_rate": 0.0003673334104432347, + "loss": 0.806768, + "num_input_tokens_seen": 259228752, + "router_z_loss_mlp": 0.27758789, + "step": 3108, + "time_per_iteration": 2.6277918815612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109043, + "balance_loss_mlp": 1.06255877, + "epoch": 0.5981146594844171, + "flos": 621749551104.0, + "grad_norm": 0.05782699460566696, + "language_loss": 0.83817154, + "learning_rate": 0.0003670330605839048, + "loss": 0.84907585, + "num_input_tokens_seen": 259303440, + "router_z_loss_mlp": 0.27856445, + "step": 3109, + "time_per_iteration": 2.786181926727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094155, + "balance_loss_mlp": 1.06685627, + "epoch": 0.5983070411696807, + "flos": 603589045248.0, + "grad_norm": 0.06234839208499282, + "language_loss": 0.76878405, + "learning_rate": 0.0003667327623469191, + "loss": 0.77972555, + "num_input_tokens_seen": 259378752, + "router_z_loss_mlp": 0.27319336, + "step": 3110, + "time_per_iteration": 2.731876850128174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089583, + "balance_loss_mlp": 1.0621767, + "epoch": 0.5984994228549442, + "flos": 633483472896.0, + "grad_norm": 0.06451414709321307, + "language_loss": 0.78028917, + "learning_rate": 0.00036643251584886333, + "loss": 0.79118496, + "num_input_tokens_seen": 259454336, + "router_z_loss_mlp": 0.27429199, + "step": 3111, + "time_per_iteration": 2.796886682510376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088313, + "balance_loss_mlp": 1.06072783, + "epoch": 0.5986918045402078, + "flos": 525278836224.0, + "grad_norm": 0.06854484980093518, + "language_loss": 0.82222939, + "learning_rate": 0.00036613232120630393, + "loss": 0.83311254, + "num_input_tokens_seen": 259518960, + "router_z_loss_mlp": 0.27587891, + "step": 3112, + "time_per_iteration": 2.6065847873687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084107, + "balance_loss_mlp": 1.05594933, + "epoch": 0.5988841862254713, + "flos": 483180982272.0, + "grad_norm": 0.06819300023171558, + "language_loss": 0.80318254, + "learning_rate": 0.00036583217853578643, + "loss": 0.81402361, + "num_input_tokens_seen": 259584352, + "router_z_loss_mlp": 0.28173828, + "step": 3113, + "time_per_iteration": 2.5723838806152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088563, + "balance_loss_mlp": 1.06200337, + "epoch": 0.5990765679107349, + "flos": 1140149924352.0, + "grad_norm": 0.05495468357602656, + "language_loss": 0.77783948, + "learning_rate": 0.000365532087953837, + "loss": 0.78872508, + "num_input_tokens_seen": 259693152, + "router_z_loss_mlp": 0.26586914, + "step": 3114, + "time_per_iteration": 3.622190475463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081565, + "balance_loss_mlp": 1.05359864, + "epoch": 0.5992689495959984, + "flos": 516986717184.0, + "grad_norm": 0.07841874273871757, + "language_loss": 0.89431345, + "learning_rate": 0.00036523204957696065, + "loss": 0.90512908, + "num_input_tokens_seen": 259762048, + "router_z_loss_mlp": 0.27978516, + "step": 3115, + "time_per_iteration": 2.6414806842803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084485, + "balance_loss_mlp": 1.05627978, + "epoch": 0.599461331281262, + "flos": 744618562560.0, + "grad_norm": 0.0586823821525485, + "language_loss": 0.80958188, + "learning_rate": 0.00036493206352164324, + "loss": 0.8204267, + "num_input_tokens_seen": 259843184, + "router_z_loss_mlp": 0.28222656, + "step": 3116, + "time_per_iteration": 2.896613121032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080524, + "balance_loss_mlp": 1.05184269, + "epoch": 0.5996537129665256, + "flos": 592359132672.0, + "grad_norm": 0.05558165654665051, + "language_loss": 0.85426074, + "learning_rate": 0.000364632129904349, + "loss": 0.86506593, + "num_input_tokens_seen": 259912720, + "router_z_loss_mlp": 0.28662109, + "step": 3117, + "time_per_iteration": 2.7053070068359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079719, + "balance_loss_mlp": 1.05215788, + "epoch": 0.5998460946517892, + "flos": 559010419200.0, + "grad_norm": 0.05806752486487043, + "language_loss": 0.78326154, + "learning_rate": 0.00036433224884152283, + "loss": 0.79405868, + "num_input_tokens_seen": 259985472, + "router_z_loss_mlp": 0.27587891, + "step": 3118, + "time_per_iteration": 2.6854429244995117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083264, + "balance_loss_mlp": 1.0547967, + "epoch": 0.6000384763370528, + "flos": 484567100928.0, + "grad_norm": 0.06710995797512392, + "language_loss": 0.78089821, + "learning_rate": 0.00036403242044958875, + "loss": 0.79173082, + "num_input_tokens_seen": 260050336, + "router_z_loss_mlp": 0.28466797, + "step": 3119, + "time_per_iteration": 2.53751540184021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077727, + "balance_loss_mlp": 1.04949808, + "epoch": 0.6002308580223162, + "flos": 596767108608.0, + "grad_norm": 0.059219046094812676, + "language_loss": 0.91922826, + "learning_rate": 0.0003637326448449507, + "loss": 0.93000555, + "num_input_tokens_seen": 260120304, + "router_z_loss_mlp": 0.28222656, + "step": 3120, + "time_per_iteration": 2.7070553302764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075191, + "balance_loss_mlp": 1.04855967, + "epoch": 0.6004232397075798, + "flos": 545146661376.0, + "grad_norm": 0.05784920643643932, + "language_loss": 0.86244148, + "learning_rate": 0.00036343292214399177, + "loss": 0.87319338, + "num_input_tokens_seen": 260198304, + "router_z_loss_mlp": 0.2668457, + "step": 3121, + "time_per_iteration": 2.790273904800415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108187, + "balance_loss_mlp": 1.05368924, + "epoch": 0.6006156213928434, + "flos": 629947694592.0, + "grad_norm": 0.061762558273937264, + "language_loss": 0.77535498, + "learning_rate": 0.00036313325246307456, + "loss": 0.78617358, + "num_input_tokens_seen": 260277664, + "router_z_loss_mlp": 0.28149414, + "step": 3122, + "time_per_iteration": 2.8160674571990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085554, + "balance_loss_mlp": 1.05804014, + "epoch": 0.600808003078107, + "flos": 582315277824.0, + "grad_norm": 0.06096373394010022, + "language_loss": 0.8757152, + "learning_rate": 0.0003628336359185411, + "loss": 0.88657075, + "num_input_tokens_seen": 260350096, + "router_z_loss_mlp": 0.27539062, + "step": 3123, + "time_per_iteration": 2.6819381713867188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083737, + "balance_loss_mlp": 1.05708146, + "epoch": 0.6010003847633705, + "flos": 635274855936.0, + "grad_norm": 0.07022869927973763, + "language_loss": 0.75776213, + "learning_rate": 0.000362534072626713, + "loss": 0.76859951, + "num_input_tokens_seen": 260421888, + "router_z_loss_mlp": 0.2668457, + "step": 3124, + "time_per_iteration": 2.740907907485962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083476, + "balance_loss_mlp": 1.05572367, + "epoch": 0.6011927664486341, + "flos": 718763922432.0, + "grad_norm": 0.05823250121923288, + "language_loss": 0.81532884, + "learning_rate": 0.00036223456270389093, + "loss": 0.82616365, + "num_input_tokens_seen": 260499616, + "router_z_loss_mlp": 0.27758789, + "step": 3125, + "time_per_iteration": 2.9345879554748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090578, + "balance_loss_mlp": 1.06254041, + "epoch": 0.6013851481338977, + "flos": 499036184064.0, + "grad_norm": 0.05438265607227417, + "language_loss": 0.81106913, + "learning_rate": 0.00036193510626635517, + "loss": 0.82197487, + "num_input_tokens_seen": 260572048, + "router_z_loss_mlp": 0.28076172, + "step": 3126, + "time_per_iteration": 2.719505786895752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092388, + "balance_loss_mlp": 1.06581664, + "epoch": 0.6015775298191612, + "flos": 749587447296.0, + "grad_norm": 0.06352965026992909, + "language_loss": 0.8166849, + "learning_rate": 0.0003616357034303649, + "loss": 0.82760876, + "num_input_tokens_seen": 260644720, + "router_z_loss_mlp": 0.26623535, + "step": 3127, + "time_per_iteration": 2.917137861251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101882, + "balance_loss_mlp": 1.0748688, + "epoch": 0.6017699115044248, + "flos": 593063202816.0, + "grad_norm": 0.06152140222119449, + "language_loss": 0.7902928, + "learning_rate": 0.0003613363543121584, + "loss": 0.80131161, + "num_input_tokens_seen": 260724864, + "router_z_loss_mlp": 0.27050781, + "step": 3128, + "time_per_iteration": 2.8336853981018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098033, + "balance_loss_mlp": 1.07082987, + "epoch": 0.6019622931896883, + "flos": 515111270400.0, + "grad_norm": 0.1105531777946672, + "language_loss": 0.85000741, + "learning_rate": 0.00036103705902795357, + "loss": 0.86098778, + "num_input_tokens_seen": 260800896, + "router_z_loss_mlp": 0.2722168, + "step": 3129, + "time_per_iteration": 2.6958324909210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107326, + "balance_loss_mlp": 1.07933569, + "epoch": 0.6021546748749519, + "flos": 490469852160.0, + "grad_norm": 0.08057315277867966, + "language_loss": 0.79751796, + "learning_rate": 0.0003607378176939471, + "loss": 0.80859125, + "num_input_tokens_seen": 260872736, + "router_z_loss_mlp": 0.2800293, + "step": 3130, + "time_per_iteration": 2.609400510787964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109547, + "balance_loss_mlp": 1.06817079, + "epoch": 0.6023470565602155, + "flos": 541032721920.0, + "grad_norm": 0.0756423011045038, + "language_loss": 0.82227194, + "learning_rate": 0.00036043863042631465, + "loss": 0.83322662, + "num_input_tokens_seen": 260943264, + "router_z_loss_mlp": 0.2734375, + "step": 3131, + "time_per_iteration": 2.6571097373962402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090865, + "balance_loss_mlp": 1.06409097, + "epoch": 0.6025394382454791, + "flos": 845020408320.0, + "grad_norm": 0.07645469837417121, + "language_loss": 0.76662207, + "learning_rate": 0.00036013949734121133, + "loss": 0.77753073, + "num_input_tokens_seen": 261030064, + "router_z_loss_mlp": 0.26782227, + "step": 3132, + "time_per_iteration": 3.118265390396118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096725, + "balance_loss_mlp": 1.06995106, + "epoch": 0.6027318199307425, + "flos": 577173496320.0, + "grad_norm": 0.0687931043319398, + "language_loss": 0.82291925, + "learning_rate": 0.00035984041855477043, + "loss": 0.83388644, + "num_input_tokens_seen": 261106496, + "router_z_loss_mlp": 0.26794434, + "step": 3133, + "time_per_iteration": 2.777459144592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019209, + "balance_loss_mlp": 1.01186562, + "epoch": 0.6029242016160061, + "flos": 1470976754688.0, + "grad_norm": 0.01616325084905853, + "language_loss": 0.78709894, + "learning_rate": 0.00035954139418310495, + "loss": 0.79729104, + "num_input_tokens_seen": 261343248, + "router_z_loss_mlp": 0.07324219, + "step": 3134, + "time_per_iteration": 4.925475597381592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083762, + "balance_loss_mlp": 1.05736887, + "epoch": 0.6031165833012697, + "flos": 480744626688.0, + "grad_norm": 0.06318710690497562, + "language_loss": 0.79746044, + "learning_rate": 0.00035924242434230637, + "loss": 0.80829811, + "num_input_tokens_seen": 261416704, + "router_z_loss_mlp": 0.2644043, + "step": 3135, + "time_per_iteration": 2.7011537551879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085192, + "balance_loss_mlp": 1.05767858, + "epoch": 0.6033089649865333, + "flos": 499468612608.0, + "grad_norm": 0.07716145908862651, + "language_loss": 0.79063201, + "learning_rate": 0.00035894350914844516, + "loss": 0.80148399, + "num_input_tokens_seen": 261486688, + "router_z_loss_mlp": 0.27514648, + "step": 3136, + "time_per_iteration": 2.6126935482025146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088376, + "balance_loss_mlp": 1.05995679, + "epoch": 0.6035013466717969, + "flos": 556613710848.0, + "grad_norm": 0.06075860838457364, + "language_loss": 0.827613, + "learning_rate": 0.0003586446487175703, + "loss": 0.83849669, + "num_input_tokens_seen": 261557344, + "router_z_loss_mlp": 0.28417969, + "step": 3137, + "time_per_iteration": 2.675171375274658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088716, + "balance_loss_mlp": 1.06041527, + "epoch": 0.6036937283570604, + "flos": 594827421696.0, + "grad_norm": 0.0544690611172434, + "language_loss": 0.85478795, + "learning_rate": 0.0003583458431657099, + "loss": 0.86567509, + "num_input_tokens_seen": 261626240, + "router_z_loss_mlp": 0.28320312, + "step": 3138, + "time_per_iteration": 2.7620253562927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089567, + "balance_loss_mlp": 1.06336451, + "epoch": 0.603886110042324, + "flos": 540958569984.0, + "grad_norm": 0.07515995216766168, + "language_loss": 0.83139801, + "learning_rate": 0.00035804709260887056, + "loss": 0.84229362, + "num_input_tokens_seen": 261696368, + "router_z_loss_mlp": 0.26220703, + "step": 3139, + "time_per_iteration": 2.6879465579986572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087318, + "balance_loss_mlp": 1.05985248, + "epoch": 0.6040784917275875, + "flos": 518582808576.0, + "grad_norm": 0.052915045266918946, + "language_loss": 0.89835536, + "learning_rate": 0.0003577483971630373, + "loss": 0.90922856, + "num_input_tokens_seen": 261769104, + "router_z_loss_mlp": 0.27514648, + "step": 3140, + "time_per_iteration": 2.6586039066314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091399, + "balance_loss_mlp": 1.06398129, + "epoch": 0.6042708734128511, + "flos": 660751395840.0, + "grad_norm": 0.045195992370632855, + "language_loss": 0.85010505, + "learning_rate": 0.00035744975694417414, + "loss": 0.86101902, + "num_input_tokens_seen": 261844880, + "router_z_loss_mlp": 0.27416992, + "step": 3141, + "time_per_iteration": 2.8448941707611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084418, + "balance_loss_mlp": 1.05757236, + "epoch": 0.6044632550981146, + "flos": 572330520576.0, + "grad_norm": 0.07912966064455412, + "language_loss": 0.82233572, + "learning_rate": 0.00035715117206822344, + "loss": 0.83317983, + "num_input_tokens_seen": 261923280, + "router_z_loss_mlp": 0.26867676, + "step": 3142, + "time_per_iteration": 2.7542483806610107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087563, + "balance_loss_mlp": 1.06026399, + "epoch": 0.6046556367833782, + "flos": 546681083904.0, + "grad_norm": 0.0701313453845953, + "language_loss": 0.80890429, + "learning_rate": 0.0003568526426511065, + "loss": 0.81977987, + "num_input_tokens_seen": 261990832, + "router_z_loss_mlp": 0.27331543, + "step": 3143, + "time_per_iteration": 2.6046767234802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081818, + "balance_loss_mlp": 1.05658114, + "epoch": 0.6048480184686418, + "flos": 776838117888.0, + "grad_norm": 0.07379330487049819, + "language_loss": 0.83015585, + "learning_rate": 0.000356554168808722, + "loss": 0.84097409, + "num_input_tokens_seen": 262063760, + "router_z_loss_mlp": 0.25244141, + "step": 3144, + "time_per_iteration": 2.9466705322265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087357, + "balance_loss_mlp": 1.06141686, + "epoch": 0.6050404001539054, + "flos": 657144036864.0, + "grad_norm": 0.06250797721947925, + "language_loss": 0.84944713, + "learning_rate": 0.00035625575065694837, + "loss": 0.86032069, + "num_input_tokens_seen": 262137968, + "router_z_loss_mlp": 0.25952148, + "step": 3145, + "time_per_iteration": 2.9049606323242188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083147, + "balance_loss_mlp": 1.05501366, + "epoch": 0.605232781839169, + "flos": 548983816704.0, + "grad_norm": 0.05947586112106144, + "language_loss": 0.77504069, + "learning_rate": 0.0003559573883116415, + "loss": 0.78587222, + "num_input_tokens_seen": 262211264, + "router_z_loss_mlp": 0.28125, + "step": 3146, + "time_per_iteration": 2.70141339302063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095869, + "balance_loss_mlp": 1.06964314, + "epoch": 0.6054251635244324, + "flos": 605402449920.0, + "grad_norm": 0.050725714839995426, + "language_loss": 0.85750544, + "learning_rate": 0.00035565908188863604, + "loss": 0.86846411, + "num_input_tokens_seen": 262289648, + "router_z_loss_mlp": 0.26269531, + "step": 3147, + "time_per_iteration": 2.822096586227417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097988, + "balance_loss_mlp": 1.07181001, + "epoch": 0.605617545209696, + "flos": 613679887872.0, + "grad_norm": 0.06536005907217222, + "language_loss": 0.79801714, + "learning_rate": 0.00035536083150374464, + "loss": 0.80899704, + "num_input_tokens_seen": 262362704, + "router_z_loss_mlp": 0.26220703, + "step": 3148, + "time_per_iteration": 2.883934736251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027241, + "balance_loss_mlp": 1.01980209, + "epoch": 0.6058099268949596, + "flos": 1498301577216.0, + "grad_norm": 0.01728788780398527, + "language_loss": 0.74747956, + "learning_rate": 0.00035506263727275893, + "loss": 0.75775194, + "num_input_tokens_seen": 262596864, + "router_z_loss_mlp": 0.07421875, + "step": 3149, + "time_per_iteration": 4.850924015045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105096, + "balance_loss_mlp": 1.07784474, + "epoch": 0.6060023085802232, + "flos": 670476621312.0, + "grad_norm": 0.06213460160212929, + "language_loss": 0.85822916, + "learning_rate": 0.0003547644993114475, + "loss": 0.8692801, + "num_input_tokens_seen": 262671088, + "router_z_loss_mlp": 0.27246094, + "step": 3150, + "time_per_iteration": 2.8107762336730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102124, + "balance_loss_mlp": 1.0744915, + "epoch": 0.6061946902654868, + "flos": 606168562176.0, + "grad_norm": 0.06674612399311457, + "language_loss": 0.79958618, + "learning_rate": 0.00035446641773555806, + "loss": 0.81060743, + "num_input_tokens_seen": 262743888, + "router_z_loss_mlp": 0.27636719, + "step": 3151, + "time_per_iteration": 2.7216579914093018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101102, + "balance_loss_mlp": 1.07332611, + "epoch": 0.6063870719507503, + "flos": 557844185088.0, + "grad_norm": 0.052040510091589255, + "language_loss": 0.87343258, + "learning_rate": 0.000354168392660816, + "loss": 0.88444364, + "num_input_tokens_seen": 262819616, + "router_z_loss_mlp": 0.27758789, + "step": 3152, + "time_per_iteration": 2.726529836654663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091719, + "balance_loss_mlp": 1.06484938, + "epoch": 0.6065794536360138, + "flos": 557154796032.0, + "grad_norm": 0.05990276634138019, + "language_loss": 0.82825845, + "learning_rate": 0.0003538704242029252, + "loss": 0.83917564, + "num_input_tokens_seen": 262893984, + "router_z_loss_mlp": 0.26879883, + "step": 3153, + "time_per_iteration": 2.695416212081909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109717, + "balance_loss_mlp": 1.06884539, + "epoch": 0.6067718353212774, + "flos": 690144385536.0, + "grad_norm": 0.07600523103772844, + "language_loss": 0.77972901, + "learning_rate": 0.0003535725124775672, + "loss": 0.79070067, + "num_input_tokens_seen": 262969648, + "router_z_loss_mlp": 0.28320312, + "step": 3154, + "time_per_iteration": 2.8397514820098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094038, + "balance_loss_mlp": 1.0649513, + "epoch": 0.606964217006541, + "flos": 521804726784.0, + "grad_norm": 0.058609076283542554, + "language_loss": 0.86659074, + "learning_rate": 0.00035327465760040126, + "loss": 0.87753117, + "num_input_tokens_seen": 263042048, + "router_z_loss_mlp": 0.29077148, + "step": 3155, + "time_per_iteration": 2.6624228954315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083273, + "balance_loss_mlp": 1.05640316, + "epoch": 0.6071565986918045, + "flos": 641555707392.0, + "grad_norm": 0.09292554424112281, + "language_loss": 0.84951353, + "learning_rate": 0.00035297685968706526, + "loss": 0.8603462, + "num_input_tokens_seen": 263108032, + "router_z_loss_mlp": 0.26867676, + "step": 3156, + "time_per_iteration": 2.7303812503814697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084205, + "balance_loss_mlp": 1.05590463, + "epoch": 0.6073489803770681, + "flos": 560581917696.0, + "grad_norm": 0.06445223486110697, + "language_loss": 0.83064741, + "learning_rate": 0.00035267911885317454, + "loss": 0.84148943, + "num_input_tokens_seen": 263175184, + "router_z_loss_mlp": 0.28271484, + "step": 3157, + "time_per_iteration": 2.6405463218688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088001, + "balance_loss_mlp": 1.06032109, + "epoch": 0.6075413620623317, + "flos": 586088193024.0, + "grad_norm": 0.05575059306658705, + "language_loss": 0.81712598, + "learning_rate": 0.0003523814352143222, + "loss": 0.82800603, + "num_input_tokens_seen": 263252768, + "router_z_loss_mlp": 0.27709961, + "step": 3158, + "time_per_iteration": 2.830343723297119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093502, + "balance_loss_mlp": 1.06400919, + "epoch": 0.6077337437475953, + "flos": 630812551680.0, + "grad_norm": 0.06682067437398732, + "language_loss": 0.91622639, + "learning_rate": 0.00035208380888607937, + "loss": 0.9271614, + "num_input_tokens_seen": 263328720, + "router_z_loss_mlp": 0.29455566, + "step": 3159, + "time_per_iteration": 2.796640634536743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026691, + "balance_loss_mlp": 1.01944304, + "epoch": 0.6079261254328588, + "flos": 1468503696384.0, + "grad_norm": 0.020734540297120695, + "language_loss": 0.79461986, + "learning_rate": 0.000351786239983995, + "loss": 0.80488676, + "num_input_tokens_seen": 263554656, + "router_z_loss_mlp": 0.07226562, + "step": 3160, + "time_per_iteration": 4.843371391296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021138, + "balance_loss_mlp": 1.01393795, + "epoch": 0.6081185071181223, + "flos": 1523024861184.0, + "grad_norm": 0.018390389893633168, + "language_loss": 0.7569223, + "learning_rate": 0.00035148872862359517, + "loss": 0.76713371, + "num_input_tokens_seen": 263791600, + "router_z_loss_mlp": 0.07177734, + "step": 3161, + "time_per_iteration": 5.065373659133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093496, + "balance_loss_mlp": 1.06481421, + "epoch": 0.6083108888033859, + "flos": 556319674368.0, + "grad_norm": 0.06046903146728731, + "language_loss": 0.81903481, + "learning_rate": 0.00035119127492038446, + "loss": 0.82996982, + "num_input_tokens_seen": 263869744, + "router_z_loss_mlp": 0.28637695, + "step": 3162, + "time_per_iteration": 2.7967278957366943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083192, + "balance_loss_mlp": 1.0550108, + "epoch": 0.6085032704886495, + "flos": 841166000640.0, + "grad_norm": 0.05880363430465999, + "language_loss": 0.82486427, + "learning_rate": 0.00035089387898984436, + "loss": 0.83569616, + "num_input_tokens_seen": 263946624, + "router_z_loss_mlp": 0.28198242, + "step": 3163, + "time_per_iteration": 3.0665948390960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089596, + "balance_loss_mlp": 1.06079483, + "epoch": 0.6086956521739131, + "flos": 684792631296.0, + "grad_norm": 0.064612412597244, + "language_loss": 0.8164137, + "learning_rate": 0.0003505965409474343, + "loss": 0.82730967, + "num_input_tokens_seen": 264022064, + "router_z_loss_mlp": 0.28808594, + "step": 3164, + "time_per_iteration": 2.9265527725219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078635, + "balance_loss_mlp": 1.05164599, + "epoch": 0.6088880338591766, + "flos": 535799536128.0, + "grad_norm": 0.0535577439830692, + "language_loss": 0.86276996, + "learning_rate": 0.0003502992609085913, + "loss": 0.87355632, + "num_input_tokens_seen": 264089520, + "router_z_loss_mlp": 0.27001953, + "step": 3165, + "time_per_iteration": 2.6794493198394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082317, + "balance_loss_mlp": 1.05463672, + "epoch": 0.6090804155444401, + "flos": 731533026816.0, + "grad_norm": 0.05150827346349905, + "language_loss": 0.82492924, + "learning_rate": 0.00035000203898872954, + "loss": 0.83575243, + "num_input_tokens_seen": 264173056, + "router_z_loss_mlp": 0.27734375, + "step": 3166, + "time_per_iteration": 2.9775314331054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081161, + "balance_loss_mlp": 1.0533855, + "epoch": 0.6092727972297037, + "flos": 699014665728.0, + "grad_norm": 0.0631204311292361, + "language_loss": 0.84789312, + "learning_rate": 0.0003497048753032406, + "loss": 0.85870469, + "num_input_tokens_seen": 264250912, + "router_z_loss_mlp": 0.27783203, + "step": 3167, + "time_per_iteration": 2.8659260272979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082152, + "balance_loss_mlp": 1.05567539, + "epoch": 0.6094651789149673, + "flos": 1051946735616.0, + "grad_norm": 0.05504676322369481, + "language_loss": 0.80827415, + "learning_rate": 0.000349407769967494, + "loss": 0.81909573, + "num_input_tokens_seen": 264342800, + "router_z_loss_mlp": 0.26525879, + "step": 3168, + "time_per_iteration": 3.3787014484405518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081057, + "balance_loss_mlp": 1.05447292, + "epoch": 0.6096575606002309, + "flos": 503085883392.0, + "grad_norm": 0.05919699008213893, + "language_loss": 0.84490019, + "learning_rate": 0.0003491107230968361, + "loss": 0.85571074, + "num_input_tokens_seen": 264413664, + "router_z_loss_mlp": 0.26611328, + "step": 3169, + "time_per_iteration": 2.6599555015563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078954, + "balance_loss_mlp": 1.05251288, + "epoch": 0.6098499422854944, + "flos": 585643281408.0, + "grad_norm": 0.05367575554300243, + "language_loss": 0.81929743, + "learning_rate": 0.00034881373480659085, + "loss": 0.83008707, + "num_input_tokens_seen": 264494944, + "router_z_loss_mlp": 0.26489258, + "step": 3170, + "time_per_iteration": 2.828599214553833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089157, + "balance_loss_mlp": 1.06092811, + "epoch": 0.610042323970758, + "flos": 469205996544.0, + "grad_norm": 0.07372507054287164, + "language_loss": 0.77562344, + "learning_rate": 0.0003485168052120594, + "loss": 0.78651506, + "num_input_tokens_seen": 264561664, + "router_z_loss_mlp": 0.28198242, + "step": 3171, + "time_per_iteration": 2.55070161819458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092713, + "balance_loss_mlp": 1.06579578, + "epoch": 0.6102347056560216, + "flos": 514177403904.0, + "grad_norm": 0.06238864549227849, + "language_loss": 0.80073476, + "learning_rate": 0.00034821993442851973, + "loss": 0.81166196, + "num_input_tokens_seen": 264626256, + "router_z_loss_mlp": 0.26940918, + "step": 3172, + "time_per_iteration": 2.585115909576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092343, + "balance_loss_mlp": 1.06593776, + "epoch": 0.6104270873412851, + "flos": 469013276160.0, + "grad_norm": 0.07089619767063425, + "language_loss": 0.82434714, + "learning_rate": 0.00034792312257122735, + "loss": 0.83527064, + "num_input_tokens_seen": 264692768, + "router_z_loss_mlp": 0.26428223, + "step": 3173, + "time_per_iteration": 2.6483352184295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109217, + "balance_loss_mlp": 1.06535971, + "epoch": 0.6106194690265486, + "flos": 549875837952.0, + "grad_norm": 0.06191738998776062, + "language_loss": 0.8083055, + "learning_rate": 0.00034762636975541506, + "loss": 0.81922722, + "num_input_tokens_seen": 264764816, + "router_z_loss_mlp": 0.26843262, + "step": 3174, + "time_per_iteration": 2.661529779434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097622, + "balance_loss_mlp": 1.07096648, + "epoch": 0.6108118507118122, + "flos": 472857772032.0, + "grad_norm": 0.07934443203127389, + "language_loss": 0.81213707, + "learning_rate": 0.0003473296760962923, + "loss": 0.82311332, + "num_input_tokens_seen": 264837968, + "router_z_loss_mlp": 0.2668457, + "step": 3175, + "time_per_iteration": 2.730571746826172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105783, + "balance_loss_mlp": 1.05005765, + "epoch": 0.6110042323970758, + "flos": 1445166904320.0, + "grad_norm": 0.03785121855584389, + "language_loss": 0.78533739, + "learning_rate": 0.00034703304170904617, + "loss": 0.79591566, + "num_input_tokens_seen": 265058336, + "router_z_loss_mlp": 0.07763672, + "step": 3176, + "time_per_iteration": 4.720510959625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094316, + "balance_loss_mlp": 1.06782722, + "epoch": 0.6111966140823394, + "flos": 794153590272.0, + "grad_norm": 0.05949259191251309, + "language_loss": 0.81672812, + "learning_rate": 0.00034673646670883976, + "loss": 0.82767129, + "num_input_tokens_seen": 265135920, + "router_z_loss_mlp": 0.26538086, + "step": 3177, + "time_per_iteration": 3.025146722793579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035513, + "balance_loss_mlp": 1.02812171, + "epoch": 0.611388995767603, + "flos": 1557650663424.0, + "grad_norm": 0.027049018431207196, + "language_loss": 0.75715023, + "learning_rate": 0.0003464399512108141, + "loss": 0.76750535, + "num_input_tokens_seen": 265374464, + "router_z_loss_mlp": 0.07373047, + "step": 3178, + "time_per_iteration": 5.000125885009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085963, + "balance_loss_mlp": 1.05978417, + "epoch": 0.6115813774528664, + "flos": 712169210880.0, + "grad_norm": 0.07013069416081287, + "language_loss": 0.81980824, + "learning_rate": 0.0003461434953300865, + "loss": 0.83066785, + "num_input_tokens_seen": 265450112, + "router_z_loss_mlp": 0.26220703, + "step": 3179, + "time_per_iteration": 2.922963857650757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081362, + "balance_loss_mlp": 1.05501699, + "epoch": 0.61177375913813, + "flos": 684308072448.0, + "grad_norm": 0.06339313471396843, + "language_loss": 0.81228697, + "learning_rate": 0.0003458470991817515, + "loss": 0.82310063, + "num_input_tokens_seen": 265534336, + "router_z_loss_mlp": 0.2635498, + "step": 3180, + "time_per_iteration": 2.9837453365325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088437, + "balance_loss_mlp": 1.06111443, + "epoch": 0.6119661408233936, + "flos": 511662127104.0, + "grad_norm": 0.05673755911203457, + "language_loss": 0.84994721, + "learning_rate": 0.0003455507628808802, + "loss": 0.86083156, + "num_input_tokens_seen": 265604480, + "router_z_loss_mlp": 0.27319336, + "step": 3181, + "time_per_iteration": 2.6381750106811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088853, + "balance_loss_mlp": 1.06133974, + "epoch": 0.6121585225086572, + "flos": 556809002496.0, + "grad_norm": 0.08338525943087875, + "language_loss": 0.85169065, + "learning_rate": 0.00034525448654252076, + "loss": 0.86257923, + "num_input_tokens_seen": 265670848, + "router_z_loss_mlp": 0.27539062, + "step": 3182, + "time_per_iteration": 2.6688461303710938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089639, + "balance_loss_mlp": 1.06263769, + "epoch": 0.6123509041939207, + "flos": 561849467904.0, + "grad_norm": 0.09017686395034887, + "language_loss": 0.83182716, + "learning_rate": 0.0003449582702816976, + "loss": 0.84272361, + "num_input_tokens_seen": 265739584, + "router_z_loss_mlp": 0.2701416, + "step": 3183, + "time_per_iteration": 2.6863620281219482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091522, + "balance_loss_mlp": 1.06479537, + "epoch": 0.6125432858791843, + "flos": 558056729088.0, + "grad_norm": 0.0548908554977987, + "language_loss": 0.82581168, + "learning_rate": 0.0003446621142134122, + "loss": 0.8367269, + "num_input_tokens_seen": 265810368, + "router_z_loss_mlp": 0.26757812, + "step": 3184, + "time_per_iteration": 2.673337459564209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093536, + "balance_loss_mlp": 1.06649971, + "epoch": 0.6127356675644479, + "flos": 415015944192.0, + "grad_norm": 0.06227229540090399, + "language_loss": 0.84346098, + "learning_rate": 0.0003443660184526424, + "loss": 0.85439634, + "num_input_tokens_seen": 265871616, + "router_z_loss_mlp": 0.27050781, + "step": 3185, + "time_per_iteration": 2.4706175327301025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092866, + "balance_loss_mlp": 1.06506586, + "epoch": 0.6129280492497114, + "flos": 603843434496.0, + "grad_norm": 0.05120570826610392, + "language_loss": 0.86619818, + "learning_rate": 0.0003440699831143429, + "loss": 0.87712687, + "num_input_tokens_seen": 265946672, + "router_z_loss_mlp": 0.27832031, + "step": 3186, + "time_per_iteration": 2.778033971786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095882, + "balance_loss_mlp": 1.06884551, + "epoch": 0.613120430934975, + "flos": 519766295040.0, + "grad_norm": 0.05392794478467523, + "language_loss": 0.82370943, + "learning_rate": 0.0003437740083134449, + "loss": 0.83466822, + "num_input_tokens_seen": 266020640, + "router_z_loss_mlp": 0.27050781, + "step": 3187, + "time_per_iteration": 2.6768150329589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091695, + "balance_loss_mlp": 1.06453919, + "epoch": 0.6133128126202385, + "flos": 511083965952.0, + "grad_norm": 0.07437759552236513, + "language_loss": 0.8353374, + "learning_rate": 0.00034347809416485574, + "loss": 0.84625435, + "num_input_tokens_seen": 266085776, + "router_z_loss_mlp": 0.27197266, + "step": 3188, + "time_per_iteration": 2.6008822917938232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085287, + "balance_loss_mlp": 1.05835748, + "epoch": 0.6135051943055021, + "flos": 607562021376.0, + "grad_norm": 0.053009634337547046, + "language_loss": 0.81880438, + "learning_rate": 0.0003431822407834597, + "loss": 0.82965726, + "num_input_tokens_seen": 266157104, + "router_z_loss_mlp": 0.26940918, + "step": 3189, + "time_per_iteration": 2.8121964931488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090882, + "balance_loss_mlp": 1.06315422, + "epoch": 0.6136975759907657, + "flos": 1160200931328.0, + "grad_norm": 0.06178667045305147, + "language_loss": 0.84739751, + "learning_rate": 0.00034288644828411706, + "loss": 0.85830629, + "num_input_tokens_seen": 266244144, + "router_z_loss_mlp": 0.27758789, + "step": 3190, + "time_per_iteration": 3.4740066528320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087763, + "balance_loss_mlp": 1.06052327, + "epoch": 0.6138899576760293, + "flos": 706938596352.0, + "grad_norm": 0.08089532706522883, + "language_loss": 0.75991279, + "learning_rate": 0.0003425907167816649, + "loss": 0.77079034, + "num_input_tokens_seen": 266319040, + "router_z_loss_mlp": 0.27258301, + "step": 3191, + "time_per_iteration": 2.8420307636260986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084533, + "balance_loss_mlp": 1.05866492, + "epoch": 0.6140823393612928, + "flos": 586443898368.0, + "grad_norm": 0.06652830958672214, + "language_loss": 0.84765488, + "learning_rate": 0.00034229504639091623, + "loss": 0.85850024, + "num_input_tokens_seen": 266390784, + "router_z_loss_mlp": 0.2590332, + "step": 3192, + "time_per_iteration": 2.805717945098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079151, + "balance_loss_mlp": 1.05240059, + "epoch": 0.6142747210465563, + "flos": 804130633728.0, + "grad_norm": 0.06825133592780937, + "language_loss": 0.80015457, + "learning_rate": 0.0003419994372266606, + "loss": 0.81094611, + "num_input_tokens_seen": 266483216, + "router_z_loss_mlp": 0.26782227, + "step": 3193, + "time_per_iteration": 3.0882303714752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084925, + "balance_loss_mlp": 1.05800796, + "epoch": 0.6144671027318199, + "flos": 529434620928.0, + "grad_norm": 0.061422659425354964, + "language_loss": 0.82002676, + "learning_rate": 0.00034170388940366335, + "loss": 0.83087599, + "num_input_tokens_seen": 266557344, + "router_z_loss_mlp": 0.26953125, + "step": 3194, + "time_per_iteration": 2.68253755569458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085529, + "balance_loss_mlp": 1.0581584, + "epoch": 0.6146594844170835, + "flos": 805425348096.0, + "grad_norm": 0.0574427380686639, + "language_loss": 0.801368, + "learning_rate": 0.0003414084030366667, + "loss": 0.81222332, + "num_input_tokens_seen": 266639488, + "router_z_loss_mlp": 0.27380371, + "step": 3195, + "time_per_iteration": 3.079050302505493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081131, + "balance_loss_mlp": 1.05421329, + "epoch": 0.6148518661023471, + "flos": 501697193472.0, + "grad_norm": 0.05079595978056556, + "language_loss": 0.83029908, + "learning_rate": 0.0003411129782403883, + "loss": 0.84111041, + "num_input_tokens_seen": 266711168, + "router_z_loss_mlp": 0.26953125, + "step": 3196, + "time_per_iteration": 2.632840871810913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086159, + "balance_loss_mlp": 1.05881214, + "epoch": 0.6150442477876106, + "flos": 510688613376.0, + "grad_norm": 0.06298738141067967, + "language_loss": 0.85384542, + "learning_rate": 0.0003408176151295225, + "loss": 0.86470699, + "num_input_tokens_seen": 266777632, + "router_z_loss_mlp": 0.27392578, + "step": 3197, + "time_per_iteration": 2.5977203845977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079258, + "balance_loss_mlp": 1.05186343, + "epoch": 0.6152366294728742, + "flos": 527005979136.0, + "grad_norm": 0.07239010053944613, + "language_loss": 0.77357507, + "learning_rate": 0.00034052231381873944, + "loss": 0.78436762, + "num_input_tokens_seen": 266842880, + "router_z_loss_mlp": 0.2746582, + "step": 3198, + "time_per_iteration": 2.604996919631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078982, + "balance_loss_mlp": 1.05149233, + "epoch": 0.6154290111581378, + "flos": 473300112384.0, + "grad_norm": 0.060831146063345755, + "language_loss": 0.85285568, + "learning_rate": 0.00034022707442268494, + "loss": 0.86364555, + "num_input_tokens_seen": 266909504, + "router_z_loss_mlp": 0.27514648, + "step": 3199, + "time_per_iteration": 2.6032421588897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079605, + "balance_loss_mlp": 1.05297375, + "epoch": 0.6156213928434013, + "flos": 550819616256.0, + "grad_norm": 0.04692312170218308, + "language_loss": 0.82051641, + "learning_rate": 0.0003399318970559813, + "loss": 0.83131248, + "num_input_tokens_seen": 266988880, + "router_z_loss_mlp": 0.26660156, + "step": 3200, + "time_per_iteration": 2.8085906505584717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074239, + "balance_loss_mlp": 1.0479418, + "epoch": 0.6158137745286649, + "flos": 750941259264.0, + "grad_norm": 0.057177124175777416, + "language_loss": 0.8485775, + "learning_rate": 0.00033963678183322656, + "loss": 0.85931993, + "num_input_tokens_seen": 267074512, + "router_z_loss_mlp": 0.26330566, + "step": 3201, + "time_per_iteration": 3.032761335372925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084727, + "balance_loss_mlp": 1.05809593, + "epoch": 0.6160061562139284, + "flos": 555815665152.0, + "grad_norm": 0.053866229864627496, + "language_loss": 0.82829523, + "learning_rate": 0.0003393417288689945, + "loss": 0.8391425, + "num_input_tokens_seen": 267147952, + "router_z_loss_mlp": 0.26623535, + "step": 3202, + "time_per_iteration": 2.6627390384674072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084716, + "balance_loss_mlp": 1.05858481, + "epoch": 0.616198537899192, + "flos": 742177437696.0, + "grad_norm": 0.08436696100910436, + "language_loss": 0.76289904, + "learning_rate": 0.00033904673827783504, + "loss": 0.77374619, + "num_input_tokens_seen": 267224368, + "router_z_loss_mlp": 0.26171875, + "step": 3203, + "time_per_iteration": 2.914370059967041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082648, + "balance_loss_mlp": 1.05615926, + "epoch": 0.6163909195844556, + "flos": 478810082304.0, + "grad_norm": 0.06562773431598554, + "language_loss": 0.81864727, + "learning_rate": 0.00033875181017427357, + "loss": 0.82947373, + "num_input_tokens_seen": 267292688, + "router_z_loss_mlp": 0.26501465, + "step": 3204, + "time_per_iteration": 2.5992236137390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078319, + "balance_loss_mlp": 1.05155659, + "epoch": 0.6165833012697192, + "flos": 531517469184.0, + "grad_norm": 0.05911238695185789, + "language_loss": 0.8101759, + "learning_rate": 0.00033845694467281133, + "loss": 0.82095909, + "num_input_tokens_seen": 267371888, + "router_z_loss_mlp": 0.26782227, + "step": 3205, + "time_per_iteration": 2.857226848602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079906, + "balance_loss_mlp": 1.05366778, + "epoch": 0.6167756829549826, + "flos": 807765156864.0, + "grad_norm": 0.056333384320929165, + "language_loss": 0.83590877, + "learning_rate": 0.00033816214188792516, + "loss": 0.84670782, + "num_input_tokens_seen": 267458784, + "router_z_loss_mlp": 0.26281738, + "step": 3206, + "time_per_iteration": 3.133683443069458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108227, + "balance_loss_mlp": 1.05523372, + "epoch": 0.6169680646402462, + "flos": 488928089088.0, + "grad_norm": 0.06870835299002895, + "language_loss": 0.85362953, + "learning_rate": 0.00033786740193406784, + "loss": 0.86445218, + "num_input_tokens_seen": 267528528, + "router_z_loss_mlp": 0.27050781, + "step": 3207, + "time_per_iteration": 2.5766866207122803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083932, + "balance_loss_mlp": 1.05775416, + "epoch": 0.6171604463255098, + "flos": 618954918912.0, + "grad_norm": 0.16525433487855157, + "language_loss": 0.81557286, + "learning_rate": 0.00033757272492566736, + "loss": 0.82641208, + "num_input_tokens_seen": 267611152, + "router_z_loss_mlp": 0.26184082, + "step": 3208, + "time_per_iteration": 2.8717997074127197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081434, + "balance_loss_mlp": 1.05363393, + "epoch": 0.6173528280107734, + "flos": 528859031040.0, + "grad_norm": 0.050446978523455026, + "language_loss": 0.8752228, + "learning_rate": 0.0003372781109771278, + "loss": 0.88603711, + "num_input_tokens_seen": 267681520, + "router_z_loss_mlp": 0.27832031, + "step": 3209, + "time_per_iteration": 2.740673303604126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087371, + "balance_loss_mlp": 1.05973852, + "epoch": 0.617545209696037, + "flos": 596581728768.0, + "grad_norm": 0.060596341147957054, + "language_loss": 0.76554525, + "learning_rate": 0.0003369835602028281, + "loss": 0.77641892, + "num_input_tokens_seen": 267758768, + "router_z_loss_mlp": 0.27661133, + "step": 3210, + "time_per_iteration": 2.813253164291382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078855, + "balance_loss_mlp": 1.05222404, + "epoch": 0.6177375913813005, + "flos": 475098835968.0, + "grad_norm": 0.060877692494739295, + "language_loss": 0.7966795, + "learning_rate": 0.0003366890727171232, + "loss": 0.80746806, + "num_input_tokens_seen": 267831056, + "router_z_loss_mlp": 0.26647949, + "step": 3211, + "time_per_iteration": 2.7572054862976074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083411, + "balance_loss_mlp": 1.05717349, + "epoch": 0.617929973066564, + "flos": 529812721152.0, + "grad_norm": 0.07113437774281188, + "language_loss": 0.78650188, + "learning_rate": 0.00033639464863434313, + "loss": 0.79733604, + "num_input_tokens_seen": 267898416, + "router_z_loss_mlp": 0.26257324, + "step": 3212, + "time_per_iteration": 2.616605520248413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035231, + "balance_loss_mlp": 1.0275538, + "epoch": 0.6181223547518276, + "flos": 1420053783552.0, + "grad_norm": 0.020977694975075144, + "language_loss": 0.78442466, + "learning_rate": 0.00033610028806879363, + "loss": 0.79477704, + "num_input_tokens_seen": 268112864, + "router_z_loss_mlp": 0.07666016, + "step": 3213, + "time_per_iteration": 4.706260919570923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077401, + "balance_loss_mlp": 1.05035281, + "epoch": 0.6183147364370912, + "flos": 740319243264.0, + "grad_norm": 0.055780003903401355, + "language_loss": 0.79908657, + "learning_rate": 0.00033580599113475543, + "loss": 0.80986065, + "num_input_tokens_seen": 268198368, + "router_z_loss_mlp": 0.27087402, + "step": 3214, + "time_per_iteration": 2.976040840148926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068298, + "balance_loss_mlp": 1.04207242, + "epoch": 0.6185071181223547, + "flos": 381649978368.0, + "grad_norm": 0.06538485262612419, + "language_loss": 0.86450571, + "learning_rate": 0.00033551175794648507, + "loss": 0.87518871, + "num_input_tokens_seen": 268260704, + "router_z_loss_mlp": 0.2623291, + "step": 3215, + "time_per_iteration": 2.5857200622558594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074795, + "balance_loss_mlp": 1.0478301, + "epoch": 0.6186994998076183, + "flos": 463347661824.0, + "grad_norm": 0.05115792818317019, + "language_loss": 0.81974953, + "learning_rate": 0.00033521758861821365, + "loss": 0.8304975, + "num_input_tokens_seen": 268328256, + "router_z_loss_mlp": 0.27001953, + "step": 3216, + "time_per_iteration": 2.6541965007781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070889, + "balance_loss_mlp": 1.04368544, + "epoch": 0.6188918814928819, + "flos": 485273742336.0, + "grad_norm": 0.053233870679950265, + "language_loss": 0.89132476, + "learning_rate": 0.0003349234832641479, + "loss": 0.90203357, + "num_input_tokens_seen": 268394016, + "router_z_loss_mlp": 0.27246094, + "step": 3217, + "time_per_iteration": 2.5898375511169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072387, + "balance_loss_mlp": 1.04567194, + "epoch": 0.6190842631781455, + "flos": 657307021824.0, + "grad_norm": 0.06188675281587152, + "language_loss": 0.81109393, + "learning_rate": 0.00033462944199846975, + "loss": 0.82181776, + "num_input_tokens_seen": 268478512, + "router_z_loss_mlp": 0.26721191, + "step": 3218, + "time_per_iteration": 3.049302101135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068592, + "balance_loss_mlp": 1.04186571, + "epoch": 0.619276644863409, + "flos": 403603223040.0, + "grad_norm": 0.07980114958498462, + "language_loss": 0.86682892, + "learning_rate": 0.00033433546493533606, + "loss": 0.87751484, + "num_input_tokens_seen": 268540304, + "router_z_loss_mlp": 0.26757812, + "step": 3219, + "time_per_iteration": 2.4988718032836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072803, + "balance_loss_mlp": 1.04562318, + "epoch": 0.6194690265486725, + "flos": 583093499904.0, + "grad_norm": 0.06437622918216304, + "language_loss": 0.84503907, + "learning_rate": 0.00033404155218887897, + "loss": 0.85576707, + "num_input_tokens_seen": 268611136, + "router_z_loss_mlp": 0.27246094, + "step": 3220, + "time_per_iteration": 2.755687952041626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069297, + "balance_loss_mlp": 1.04323733, + "epoch": 0.6196614082339361, + "flos": 504246974976.0, + "grad_norm": 0.054816937967161604, + "language_loss": 0.87677366, + "learning_rate": 0.00033374770387320534, + "loss": 0.88746661, + "num_input_tokens_seen": 268684992, + "router_z_loss_mlp": 0.26074219, + "step": 3221, + "time_per_iteration": 2.806687831878662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073987, + "balance_loss_mlp": 1.0476656, + "epoch": 0.6198537899191997, + "flos": 575409277440.0, + "grad_norm": 0.05319951203525016, + "language_loss": 0.85096419, + "learning_rate": 0.00033345392010239737, + "loss": 0.86170411, + "num_input_tokens_seen": 268758096, + "router_z_loss_mlp": 0.2635498, + "step": 3222, + "time_per_iteration": 2.726924419403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078737, + "balance_loss_mlp": 1.05248737, + "epoch": 0.6200461716044633, + "flos": 593157178368.0, + "grad_norm": 0.06204822794999188, + "language_loss": 0.82752097, + "learning_rate": 0.0003331602009905118, + "loss": 0.83830827, + "num_input_tokens_seen": 268834432, + "router_z_loss_mlp": 0.26245117, + "step": 3223, + "time_per_iteration": 2.8067080974578857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074324, + "balance_loss_mlp": 1.04770494, + "epoch": 0.6202385532897268, + "flos": 666093238272.0, + "grad_norm": 0.06248384558092708, + "language_loss": 0.83894855, + "learning_rate": 0.00033286654665158085, + "loss": 0.84969175, + "num_input_tokens_seen": 268921168, + "router_z_loss_mlp": 0.26635742, + "step": 3224, + "time_per_iteration": 2.973839044570923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071798, + "balance_loss_mlp": 1.04578674, + "epoch": 0.6204309349749904, + "flos": 484952541696.0, + "grad_norm": 0.058715923927156195, + "language_loss": 0.87385452, + "learning_rate": 0.0003325729571996109, + "loss": 0.88457251, + "num_input_tokens_seen": 268991440, + "router_z_loss_mlp": 0.26037598, + "step": 3225, + "time_per_iteration": 2.6299448013305664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079949, + "balance_loss_mlp": 1.05295992, + "epoch": 0.6206233166602539, + "flos": 584057101824.0, + "grad_norm": 0.05622680554800681, + "language_loss": 0.84078681, + "learning_rate": 0.000332279432748584, + "loss": 0.85158628, + "num_input_tokens_seen": 269061024, + "router_z_loss_mlp": 0.27001953, + "step": 3226, + "time_per_iteration": 2.713651180267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079454, + "balance_loss_mlp": 1.05334759, + "epoch": 0.6208156983455175, + "flos": 476917383168.0, + "grad_norm": 0.05334260963219639, + "language_loss": 0.8767364, + "learning_rate": 0.00033198597341245576, + "loss": 0.88753092, + "num_input_tokens_seen": 269130560, + "router_z_loss_mlp": 0.26147461, + "step": 3227, + "time_per_iteration": 2.5617635250091553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078643, + "balance_loss_mlp": 1.05208337, + "epoch": 0.6210080800307811, + "flos": 789066137088.0, + "grad_norm": 0.05016111197588362, + "language_loss": 0.82129073, + "learning_rate": 0.00033169257930515763, + "loss": 0.83207709, + "num_input_tokens_seen": 269213280, + "router_z_loss_mlp": 0.26611328, + "step": 3228, + "time_per_iteration": 3.025502920150757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080729, + "balance_loss_mlp": 1.0543834, + "epoch": 0.6212004617160446, + "flos": 607794388992.0, + "grad_norm": 0.08161989388785439, + "language_loss": 0.82274306, + "learning_rate": 0.0003313992505405951, + "loss": 0.83355033, + "num_input_tokens_seen": 269286384, + "router_z_loss_mlp": 0.26367188, + "step": 3229, + "time_per_iteration": 2.705948829650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083196, + "balance_loss_mlp": 1.05582547, + "epoch": 0.6213928434013082, + "flos": 586520621568.0, + "grad_norm": 0.06417417083544033, + "language_loss": 0.81270546, + "learning_rate": 0.0003311059872326487, + "loss": 0.82353741, + "num_input_tokens_seen": 269353296, + "router_z_loss_mlp": 0.27368164, + "step": 3230, + "time_per_iteration": 2.6827783584594727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080325, + "balance_loss_mlp": 1.05426574, + "epoch": 0.6215852250865718, + "flos": 536076320256.0, + "grad_norm": 0.060558133954529886, + "language_loss": 0.79513329, + "learning_rate": 0.0003308127894951734, + "loss": 0.80593657, + "num_input_tokens_seen": 269422304, + "router_z_loss_mlp": 0.26074219, + "step": 3231, + "time_per_iteration": 2.621156692504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079972, + "balance_loss_mlp": 1.05295873, + "epoch": 0.6217776067718354, + "flos": 618169356288.0, + "grad_norm": 0.05872122895707264, + "language_loss": 0.86601388, + "learning_rate": 0.00033051965744199834, + "loss": 0.87681365, + "num_input_tokens_seen": 269498784, + "router_z_loss_mlp": 0.27075195, + "step": 3232, + "time_per_iteration": 2.7616896629333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089938, + "balance_loss_mlp": 1.06414127, + "epoch": 0.6219699884570988, + "flos": 545875324416.0, + "grad_norm": 0.05765951293021458, + "language_loss": 0.90613365, + "learning_rate": 0.0003302265911869276, + "loss": 0.91703308, + "num_input_tokens_seen": 269581264, + "router_z_loss_mlp": 0.25830078, + "step": 3233, + "time_per_iteration": 2.911309242248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107614, + "balance_loss_mlp": 1.04950833, + "epoch": 0.6221623701423624, + "flos": 481149891072.0, + "grad_norm": 0.0568406918617455, + "language_loss": 0.84234416, + "learning_rate": 0.0003299335908437397, + "loss": 0.8531056, + "num_input_tokens_seen": 269649408, + "router_z_loss_mlp": 0.26660156, + "step": 3234, + "time_per_iteration": 2.5690464973449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083439, + "balance_loss_mlp": 1.05614042, + "epoch": 0.622354751827626, + "flos": 380024151552.0, + "grad_norm": 0.08458123774573062, + "language_loss": 0.79892743, + "learning_rate": 0.0003296406565261873, + "loss": 0.80976182, + "num_input_tokens_seen": 269711648, + "router_z_loss_mlp": 0.27294922, + "step": 3235, + "time_per_iteration": 2.519242763519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082513, + "balance_loss_mlp": 1.05619192, + "epoch": 0.6225471335128896, + "flos": 667869940224.0, + "grad_norm": 0.04986850206195379, + "language_loss": 0.85312378, + "learning_rate": 0.0003293477883479978, + "loss": 0.86394894, + "num_input_tokens_seen": 269787376, + "router_z_loss_mlp": 0.26367188, + "step": 3236, + "time_per_iteration": 2.8095037937164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083699, + "balance_loss_mlp": 1.05704379, + "epoch": 0.6227395151981532, + "flos": 771320807424.0, + "grad_norm": 0.105420899843356, + "language_loss": 0.79857445, + "learning_rate": 0.0003290549864228727, + "loss": 0.80941153, + "num_input_tokens_seen": 269863008, + "router_z_loss_mlp": 0.2668457, + "step": 3237, + "time_per_iteration": 2.9599437713623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092317, + "balance_loss_mlp": 1.0648514, + "epoch": 0.6229318968834167, + "flos": 484354556928.0, + "grad_norm": 0.05485346042827634, + "language_loss": 0.86677277, + "learning_rate": 0.0003287622508644875, + "loss": 0.87769592, + "num_input_tokens_seen": 269939552, + "router_z_loss_mlp": 0.27514648, + "step": 3238, + "time_per_iteration": 2.7735140323638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108562, + "balance_loss_mlp": 1.05971575, + "epoch": 0.6231242785686802, + "flos": 462935056896.0, + "grad_norm": 0.06697855581394702, + "language_loss": 0.86312807, + "learning_rate": 0.0003284695817864923, + "loss": 0.87398434, + "num_input_tokens_seen": 270002752, + "router_z_loss_mlp": 0.25939941, + "step": 3239, + "time_per_iteration": 2.5213680267333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086258, + "balance_loss_mlp": 1.05822039, + "epoch": 0.6233166602539438, + "flos": 609089103360.0, + "grad_norm": 0.0670229685198235, + "language_loss": 0.84466362, + "learning_rate": 0.0003281769793025116, + "loss": 0.85552621, + "num_input_tokens_seen": 270075696, + "router_z_loss_mlp": 0.28051758, + "step": 3240, + "time_per_iteration": 2.7121944427490234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084136, + "balance_loss_mlp": 1.05725467, + "epoch": 0.6235090419392074, + "flos": 439200340992.0, + "grad_norm": 0.0702959592195009, + "language_loss": 0.89746368, + "learning_rate": 0.00032788444352614346, + "loss": 0.90830505, + "num_input_tokens_seen": 270139872, + "router_z_loss_mlp": 0.2689209, + "step": 3241, + "time_per_iteration": 2.5015249252319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082571, + "balance_loss_mlp": 1.05672646, + "epoch": 0.6237014236244709, + "flos": 504904430592.0, + "grad_norm": 0.06846716492041297, + "language_loss": 0.80880868, + "learning_rate": 0.0003275919745709606, + "loss": 0.81963438, + "num_input_tokens_seen": 270206752, + "router_z_loss_mlp": 0.25793457, + "step": 3242, + "time_per_iteration": 2.5576865673065186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108516, + "balance_loss_mlp": 1.05925632, + "epoch": 0.6238938053097345, + "flos": 512917194240.0, + "grad_norm": 0.07943089105939449, + "language_loss": 0.82206035, + "learning_rate": 0.00032729957255050936, + "loss": 0.83291197, + "num_input_tokens_seen": 270275472, + "router_z_loss_mlp": 0.25939941, + "step": 3243, + "time_per_iteration": 2.6432876586914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088614, + "balance_loss_mlp": 1.06160164, + "epoch": 0.6240861869949981, + "flos": 736751531520.0, + "grad_norm": 0.05697537119999913, + "language_loss": 0.81798017, + "learning_rate": 0.0003270072375783102, + "loss": 0.82886636, + "num_input_tokens_seen": 270348336, + "router_z_loss_mlp": 0.2701416, + "step": 3244, + "time_per_iteration": 2.8988003730773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087163, + "balance_loss_mlp": 1.06048417, + "epoch": 0.6242785686802617, + "flos": 494712271872.0, + "grad_norm": 0.06396151885165319, + "language_loss": 0.79621661, + "learning_rate": 0.00032671496976785774, + "loss": 0.80708826, + "num_input_tokens_seen": 270416496, + "router_z_loss_mlp": 0.2668457, + "step": 3245, + "time_per_iteration": 2.619020938873291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079657, + "balance_loss_mlp": 1.054075, + "epoch": 0.6244709503655252, + "flos": 745846465536.0, + "grad_norm": 0.06315966353761295, + "language_loss": 0.75718981, + "learning_rate": 0.0003264227692326205, + "loss": 0.76798642, + "num_input_tokens_seen": 270501680, + "router_z_loss_mlp": 0.25610352, + "step": 3246, + "time_per_iteration": 3.0977470874786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092736, + "balance_loss_mlp": 1.0656991, + "epoch": 0.6246633320507887, + "flos": 492602259456.0, + "grad_norm": 0.05900529395790117, + "language_loss": 0.86342973, + "learning_rate": 0.00032613063608604055, + "loss": 0.8743571, + "num_input_tokens_seen": 270568656, + "router_z_loss_mlp": 0.27075195, + "step": 3247, + "time_per_iteration": 2.535694122314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088316, + "balance_loss_mlp": 1.06239939, + "epoch": 0.6248557137360523, + "flos": 517391981568.0, + "grad_norm": 0.06304682930858534, + "language_loss": 0.83798397, + "learning_rate": 0.0003258385704415343, + "loss": 0.84886706, + "num_input_tokens_seen": 270636160, + "router_z_loss_mlp": 0.25952148, + "step": 3248, + "time_per_iteration": 2.5745623111724854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108485, + "balance_loss_mlp": 1.05835032, + "epoch": 0.6250480954213159, + "flos": 519363601920.0, + "grad_norm": 0.05590667245526839, + "language_loss": 0.83388865, + "learning_rate": 0.0003255465724124915, + "loss": 0.84473717, + "num_input_tokens_seen": 270708816, + "router_z_loss_mlp": 0.26550293, + "step": 3249, + "time_per_iteration": 2.6889073848724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088994, + "balance_loss_mlp": 1.06236219, + "epoch": 0.6252404771065795, + "flos": 516060191232.0, + "grad_norm": 0.05421846052548684, + "language_loss": 0.83201844, + "learning_rate": 0.00032525464211227587, + "loss": 0.84290838, + "num_input_tokens_seen": 270778016, + "router_z_loss_mlp": 0.2668457, + "step": 3250, + "time_per_iteration": 2.579265594482422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089648, + "balance_loss_mlp": 1.0646019, + "epoch": 0.6254328587918431, + "flos": 576916535808.0, + "grad_norm": 0.05949618394649944, + "language_loss": 0.85687059, + "learning_rate": 0.0003249627796542249, + "loss": 0.8677671, + "num_input_tokens_seen": 270847072, + "router_z_loss_mlp": 0.25048828, + "step": 3251, + "time_per_iteration": 2.657060384750366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086593, + "balance_loss_mlp": 1.06070042, + "epoch": 0.6256252404771065, + "flos": 597930771456.0, + "grad_norm": 0.06427979506588448, + "language_loss": 0.84404004, + "learning_rate": 0.00032467098515164943, + "loss": 0.85490596, + "num_input_tokens_seen": 270926320, + "router_z_loss_mlp": 0.25927734, + "step": 3252, + "time_per_iteration": 2.849217414855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095705, + "balance_loss_mlp": 1.06928802, + "epoch": 0.6258176221623701, + "flos": 508299245568.0, + "grad_norm": 0.07156536364550367, + "language_loss": 0.8424539, + "learning_rate": 0.00032437925871783456, + "loss": 0.85341096, + "num_input_tokens_seen": 270997904, + "router_z_loss_mlp": 0.26428223, + "step": 3253, + "time_per_iteration": 2.6556756496429443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089818, + "balance_loss_mlp": 1.06392598, + "epoch": 0.6260100038476337, + "flos": 639645755904.0, + "grad_norm": 0.06713167353527402, + "language_loss": 0.84369826, + "learning_rate": 0.00032408760046603803, + "loss": 0.85459638, + "num_input_tokens_seen": 271074256, + "router_z_loss_mlp": 0.25915527, + "step": 3254, + "time_per_iteration": 2.8115572929382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088289, + "balance_loss_mlp": 1.06096649, + "epoch": 0.6262023855328973, + "flos": 841007784960.0, + "grad_norm": 0.057744790831179095, + "language_loss": 0.77781522, + "learning_rate": 0.00032379601050949193, + "loss": 0.78869808, + "num_input_tokens_seen": 271155152, + "router_z_loss_mlp": 0.27319336, + "step": 3255, + "time_per_iteration": 3.076742649078369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086608, + "balance_loss_mlp": 1.06120479, + "epoch": 0.6263947672181608, + "flos": 522138410496.0, + "grad_norm": 0.07189629851165658, + "language_loss": 0.88155556, + "learning_rate": 0.0003235044889614013, + "loss": 0.8924216, + "num_input_tokens_seen": 271224784, + "router_z_loss_mlp": 0.25390625, + "step": 3256, + "time_per_iteration": 2.5873968601226807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089869, + "balance_loss_mlp": 1.06373787, + "epoch": 0.6265871489034244, + "flos": 607055440896.0, + "grad_norm": 0.05771783178096878, + "language_loss": 0.83524096, + "learning_rate": 0.0003232130359349451, + "loss": 0.84613967, + "num_input_tokens_seen": 271303584, + "router_z_loss_mlp": 0.26147461, + "step": 3257, + "time_per_iteration": 2.819540500640869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079515, + "balance_loss_mlp": 1.05381322, + "epoch": 0.626779530588688, + "flos": 588484901376.0, + "grad_norm": 0.06862538521016108, + "language_loss": 0.81873524, + "learning_rate": 0.0003229216515432751, + "loss": 0.82953036, + "num_input_tokens_seen": 271379632, + "router_z_loss_mlp": 0.25732422, + "step": 3258, + "time_per_iteration": 2.7515103816986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081926, + "balance_loss_mlp": 1.05611742, + "epoch": 0.6269719122739515, + "flos": 438612268032.0, + "grad_norm": 0.0620904280551254, + "language_loss": 0.79984063, + "learning_rate": 0.0003226303358995174, + "loss": 0.81065989, + "num_input_tokens_seen": 271447808, + "router_z_loss_mlp": 0.25805664, + "step": 3259, + "time_per_iteration": 2.601327896118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108309, + "balance_loss_mlp": 1.05641103, + "epoch": 0.6271642939592151, + "flos": 562874738688.0, + "grad_norm": 0.06264498249495759, + "language_loss": 0.88746321, + "learning_rate": 0.00032233908911677, + "loss": 0.89829409, + "num_input_tokens_seen": 271526768, + "router_z_loss_mlp": 0.26672363, + "step": 3260, + "time_per_iteration": 2.8746490478515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108161, + "balance_loss_mlp": 1.0554074, + "epoch": 0.6273566756444786, + "flos": 514560273408.0, + "grad_norm": 0.05524835690099731, + "language_loss": 0.81054789, + "learning_rate": 0.0003220479113081053, + "loss": 0.82136405, + "num_input_tokens_seen": 271597840, + "router_z_loss_mlp": 0.26245117, + "step": 3261, + "time_per_iteration": 2.7250542640686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086528, + "balance_loss_mlp": 1.06051612, + "epoch": 0.6275490573297422, + "flos": 585472955904.0, + "grad_norm": 0.07333417495650836, + "language_loss": 0.79077941, + "learning_rate": 0.00032175680258656836, + "loss": 0.80164468, + "num_input_tokens_seen": 271668352, + "router_z_loss_mlp": 0.26049805, + "step": 3262, + "time_per_iteration": 2.7318856716156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084833, + "balance_loss_mlp": 1.0588572, + "epoch": 0.6277414390150058, + "flos": 559423024128.0, + "grad_norm": 0.054494688128012655, + "language_loss": 0.80530143, + "learning_rate": 0.00032146576306517794, + "loss": 0.81614971, + "num_input_tokens_seen": 271743936, + "router_z_loss_mlp": 0.26000977, + "step": 3263, + "time_per_iteration": 2.7811925411224365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080063, + "balance_loss_mlp": 1.05290699, + "epoch": 0.6279338207002694, + "flos": 612706374144.0, + "grad_norm": 0.056666242848552414, + "language_loss": 0.81309682, + "learning_rate": 0.0003211747928569255, + "loss": 0.82389748, + "num_input_tokens_seen": 271817008, + "router_z_loss_mlp": 0.27197266, + "step": 3264, + "time_per_iteration": 2.7700881958007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109074, + "balance_loss_mlp": 1.06416845, + "epoch": 0.6281262023855329, + "flos": 625685451264.0, + "grad_norm": 0.05464471596038141, + "language_loss": 0.82094646, + "learning_rate": 0.0003208838920747754, + "loss": 0.83185387, + "num_input_tokens_seen": 271896960, + "router_z_loss_mlp": 0.26599121, + "step": 3265, + "time_per_iteration": 2.8446507453918457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090884, + "balance_loss_mlp": 1.06463385, + "epoch": 0.6283185840707964, + "flos": 1123600564224.0, + "grad_norm": 0.056349937520850824, + "language_loss": 0.77076876, + "learning_rate": 0.0003205930608316656, + "loss": 0.7816776, + "num_input_tokens_seen": 271985008, + "router_z_loss_mlp": 0.26269531, + "step": 3266, + "time_per_iteration": 3.491666555404663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010984, + "balance_loss_mlp": 1.07074392, + "epoch": 0.62851096575606, + "flos": 515239750656.0, + "grad_norm": 0.06651261940051902, + "language_loss": 0.84897095, + "learning_rate": 0.00032030229924050673, + "loss": 0.85995495, + "num_input_tokens_seen": 272056368, + "router_z_loss_mlp": 0.27661133, + "step": 3267, + "time_per_iteration": 2.647298812866211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089439, + "balance_loss_mlp": 1.06264114, + "epoch": 0.6287033474413236, + "flos": 404171472384.0, + "grad_norm": 0.055917272399638666, + "language_loss": 0.8022815, + "learning_rate": 0.00032001160741418247, + "loss": 0.81317586, + "num_input_tokens_seen": 272123424, + "router_z_loss_mlp": 0.26843262, + "step": 3268, + "time_per_iteration": 2.652388334274292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094844, + "balance_loss_mlp": 1.06809378, + "epoch": 0.6288957291265872, + "flos": 525718605312.0, + "grad_norm": 0.059838942630291256, + "language_loss": 0.82543945, + "learning_rate": 0.0003197209854655494, + "loss": 0.83638787, + "num_input_tokens_seen": 272193008, + "router_z_loss_mlp": 0.26757812, + "step": 3269, + "time_per_iteration": 2.6375179290771484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099916, + "balance_loss_mlp": 1.07439375, + "epoch": 0.6290881108118507, + "flos": 603722294784.0, + "grad_norm": 0.061513094819297384, + "language_loss": 0.74642974, + "learning_rate": 0.0003194304335074371, + "loss": 0.75742888, + "num_input_tokens_seen": 272275328, + "router_z_loss_mlp": 0.25537109, + "step": 3270, + "time_per_iteration": 2.8767266273498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093664, + "balance_loss_mlp": 1.06736612, + "epoch": 0.6292804924971143, + "flos": 437675830272.0, + "grad_norm": 0.08816092137491774, + "language_loss": 0.8863402, + "learning_rate": 0.0003191399516526475, + "loss": 0.89727688, + "num_input_tokens_seen": 272339328, + "router_z_loss_mlp": 0.26342773, + "step": 3271, + "time_per_iteration": 2.4882290363311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103501, + "balance_loss_mlp": 1.07775187, + "epoch": 0.6294728741823779, + "flos": 606662659584.0, + "grad_norm": 0.05301391071022918, + "language_loss": 0.80040759, + "learning_rate": 0.0003188495400139559, + "loss": 0.81144261, + "num_input_tokens_seen": 272416336, + "router_z_loss_mlp": 0.25732422, + "step": 3272, + "time_per_iteration": 2.755535364151001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109861, + "balance_loss_mlp": 1.0714066, + "epoch": 0.6296652558676414, + "flos": 701529942528.0, + "grad_norm": 0.06865914840158399, + "language_loss": 0.84610647, + "learning_rate": 0.00031855919870411013, + "loss": 0.85709262, + "num_input_tokens_seen": 272490368, + "router_z_loss_mlp": 0.27246094, + "step": 3273, + "time_per_iteration": 2.8569116592407227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093778, + "balance_loss_mlp": 1.06794524, + "epoch": 0.6298576375529049, + "flos": 523909969920.0, + "grad_norm": 0.05727346843797417, + "language_loss": 0.84962982, + "learning_rate": 0.0003182689278358305, + "loss": 0.86056757, + "num_input_tokens_seen": 272562992, + "router_z_loss_mlp": 0.25866699, + "step": 3274, + "time_per_iteration": 2.690037727355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104729, + "balance_loss_mlp": 1.07783532, + "epoch": 0.6300500192381685, + "flos": 475963693056.0, + "grad_norm": 0.06020653166460469, + "language_loss": 0.80404747, + "learning_rate": 0.0003179787275218105, + "loss": 0.81509471, + "num_input_tokens_seen": 272629456, + "router_z_loss_mlp": 0.26928711, + "step": 3275, + "time_per_iteration": 2.5266408920288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100504, + "balance_loss_mlp": 1.07448089, + "epoch": 0.6302424009234321, + "flos": 520880772096.0, + "grad_norm": 0.052538589715391014, + "language_loss": 0.84480745, + "learning_rate": 0.0003176885978747155, + "loss": 0.85581249, + "num_input_tokens_seen": 272697440, + "router_z_loss_mlp": 0.26037598, + "step": 3276, + "time_per_iteration": 2.639855146408081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097356, + "balance_loss_mlp": 1.07041466, + "epoch": 0.6304347826086957, + "flos": 694596777984.0, + "grad_norm": 0.060305073155881905, + "language_loss": 0.82594693, + "learning_rate": 0.0003173985390071839, + "loss": 0.83692044, + "num_input_tokens_seen": 272774080, + "router_z_loss_mlp": 0.26977539, + "step": 3277, + "time_per_iteration": 2.860373020172119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033944, + "balance_loss_mlp": 1.02755451, + "epoch": 0.6306271642939593, + "flos": 1466858045952.0, + "grad_norm": 0.022211191249075446, + "language_loss": 0.77900457, + "learning_rate": 0.00031710855103182675, + "loss": 0.78934395, + "num_input_tokens_seen": 272998512, + "router_z_loss_mlp": 0.06396484, + "step": 3278, + "time_per_iteration": 4.8053810596466064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109346, + "balance_loss_mlp": 1.06688833, + "epoch": 0.6308195459792227, + "flos": 601740762624.0, + "grad_norm": 0.06392036419346926, + "language_loss": 0.8159709, + "learning_rate": 0.00031681863406122704, + "loss": 0.82690549, + "num_input_tokens_seen": 273074672, + "router_z_loss_mlp": 0.26574707, + "step": 3279, + "time_per_iteration": 2.7899298667907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090056, + "balance_loss_mlp": 1.06425917, + "epoch": 0.6310119276644863, + "flos": 726858178560.0, + "grad_norm": 0.08623088614213353, + "language_loss": 0.85931206, + "learning_rate": 0.00031652878820794087, + "loss": 0.87021261, + "num_input_tokens_seen": 273157904, + "router_z_loss_mlp": 0.25817871, + "step": 3280, + "time_per_iteration": 2.9887900352478027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099524, + "balance_loss_mlp": 1.07296467, + "epoch": 0.6312043093497499, + "flos": 519749042688.0, + "grad_norm": 0.06411033205686746, + "language_loss": 0.85853314, + "learning_rate": 0.00031623901358449627, + "loss": 0.86952841, + "num_input_tokens_seen": 273228160, + "router_z_loss_mlp": 0.26574707, + "step": 3281, + "time_per_iteration": 2.638303756713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088871, + "balance_loss_mlp": 1.06183434, + "epoch": 0.6313966910350135, + "flos": 531191499264.0, + "grad_norm": 0.058317756156366925, + "language_loss": 0.88884354, + "learning_rate": 0.0003159493103033936, + "loss": 0.89973223, + "num_input_tokens_seen": 273295872, + "router_z_loss_mlp": 0.27038574, + "step": 3282, + "time_per_iteration": 2.577678918838501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021333, + "balance_loss_mlp": 1.01494348, + "epoch": 0.631589072720277, + "flos": 1379887529472.0, + "grad_norm": 0.01678733827998209, + "language_loss": 0.79919052, + "learning_rate": 0.00031565967847710564, + "loss": 0.8094039, + "num_input_tokens_seen": 273524320, + "router_z_loss_mlp": 0.06396484, + "step": 3283, + "time_per_iteration": 4.869993209838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086941, + "balance_loss_mlp": 1.06035781, + "epoch": 0.6317814544055406, + "flos": 624677432832.0, + "grad_norm": 0.060116799925982296, + "language_loss": 0.82495177, + "learning_rate": 0.0003153701182180776, + "loss": 0.83582127, + "num_input_tokens_seen": 273598544, + "router_z_loss_mlp": 0.26611328, + "step": 3284, + "time_per_iteration": 2.792370319366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108947, + "balance_loss_mlp": 1.06271982, + "epoch": 0.6319738360908042, + "flos": 498119569920.0, + "grad_norm": 0.05700688218578944, + "language_loss": 0.81939638, + "learning_rate": 0.00031508062963872655, + "loss": 0.83029103, + "num_input_tokens_seen": 273666000, + "router_z_loss_mlp": 0.26757812, + "step": 3285, + "time_per_iteration": 2.5983989238739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080002, + "balance_loss_mlp": 1.05334699, + "epoch": 0.6321662177760677, + "flos": 579760353792.0, + "grad_norm": 0.06791630533273198, + "language_loss": 0.79373753, + "learning_rate": 0.0003147912128514423, + "loss": 0.80453753, + "num_input_tokens_seen": 273742672, + "router_z_loss_mlp": 0.2668457, + "step": 3286, + "time_per_iteration": 2.7027578353881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085262, + "balance_loss_mlp": 1.05848765, + "epoch": 0.6323585994613313, + "flos": 601486373376.0, + "grad_norm": 0.061011344504073056, + "language_loss": 0.87480241, + "learning_rate": 0.0003145018679685859, + "loss": 0.88565505, + "num_input_tokens_seen": 273813984, + "router_z_loss_mlp": 0.26831055, + "step": 3287, + "time_per_iteration": 2.7283802032470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081421, + "balance_loss_mlp": 1.05552864, + "epoch": 0.6325509811465948, + "flos": 528535259136.0, + "grad_norm": 0.05025789787573444, + "language_loss": 0.87796986, + "learning_rate": 0.00031421259510249134, + "loss": 0.88878405, + "num_input_tokens_seen": 273892848, + "router_z_loss_mlp": 0.25927734, + "step": 3288, + "time_per_iteration": 2.879518985748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089164, + "balance_loss_mlp": 1.06193662, + "epoch": 0.6327433628318584, + "flos": 574262866944.0, + "grad_norm": 0.06343698340560998, + "language_loss": 0.81597275, + "learning_rate": 0.00031392339436546414, + "loss": 0.82686442, + "num_input_tokens_seen": 273971696, + "router_z_loss_mlp": 0.27246094, + "step": 3289, + "time_per_iteration": 2.8542826175689697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083038, + "balance_loss_mlp": 1.05521417, + "epoch": 0.632935744517122, + "flos": 517088033280.0, + "grad_norm": 0.06408220142950623, + "language_loss": 0.83260751, + "learning_rate": 0.00031363426586978205, + "loss": 0.84343785, + "num_input_tokens_seen": 274048096, + "router_z_loss_mlp": 0.27832031, + "step": 3290, + "time_per_iteration": 2.79167103767395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075181, + "balance_loss_mlp": 1.04847813, + "epoch": 0.6331281262023856, + "flos": 617462714880.0, + "grad_norm": 0.05376353557308444, + "language_loss": 0.84848088, + "learning_rate": 0.0003133452097276947, + "loss": 0.85923266, + "num_input_tokens_seen": 274122848, + "router_z_loss_mlp": 0.26708984, + "step": 3291, + "time_per_iteration": 2.751204252243042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108266, + "balance_loss_mlp": 1.05583799, + "epoch": 0.633320507887649, + "flos": 592954546176.0, + "grad_norm": 0.07438043439458697, + "language_loss": 0.84737223, + "learning_rate": 0.0003130562260514238, + "loss": 0.85819882, + "num_input_tokens_seen": 274198320, + "router_z_loss_mlp": 0.26831055, + "step": 3292, + "time_per_iteration": 2.7716188430786133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083518, + "balance_loss_mlp": 1.05695808, + "epoch": 0.6335128895729126, + "flos": 582349782528.0, + "grad_norm": 0.050395454055006096, + "language_loss": 0.81929183, + "learning_rate": 0.0003127673149531626, + "loss": 0.830127, + "num_input_tokens_seen": 274274944, + "router_z_loss_mlp": 0.26550293, + "step": 3293, + "time_per_iteration": 2.7863051891326904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084421, + "balance_loss_mlp": 1.05757475, + "epoch": 0.6337052712581762, + "flos": 453036934656.0, + "grad_norm": 0.05747867938132279, + "language_loss": 0.8319236, + "learning_rate": 0.0003124784765450762, + "loss": 0.84276778, + "num_input_tokens_seen": 274342384, + "router_z_loss_mlp": 0.26867676, + "step": 3294, + "time_per_iteration": 2.551786184310913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109352, + "balance_loss_mlp": 1.0665071, + "epoch": 0.6338976529434398, + "flos": 573407921664.0, + "grad_norm": 0.0638400369710873, + "language_loss": 0.80339384, + "learning_rate": 0.0003121897109393017, + "loss": 0.81432903, + "num_input_tokens_seen": 274417568, + "router_z_loss_mlp": 0.27050781, + "step": 3295, + "time_per_iteration": 2.7408554553985596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010907, + "balance_loss_mlp": 1.06406879, + "epoch": 0.6340900346287034, + "flos": 508758838272.0, + "grad_norm": 0.05476078823788279, + "language_loss": 0.89262557, + "learning_rate": 0.0003119010182479481, + "loss": 0.90353251, + "num_input_tokens_seen": 274488960, + "router_z_loss_mlp": 0.26623535, + "step": 3296, + "time_per_iteration": 2.658127784729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088039, + "balance_loss_mlp": 1.06214714, + "epoch": 0.6342824163139669, + "flos": 479746520064.0, + "grad_norm": 0.062377346698915814, + "language_loss": 0.82762587, + "learning_rate": 0.00031161239858309563, + "loss": 0.83850628, + "num_input_tokens_seen": 274556880, + "router_z_loss_mlp": 0.25915527, + "step": 3297, + "time_per_iteration": 2.5747482776641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092867, + "balance_loss_mlp": 1.06669998, + "epoch": 0.6344747979992305, + "flos": 572031714816.0, + "grad_norm": 0.0650323770737515, + "language_loss": 0.83421898, + "learning_rate": 0.0003113238520567964, + "loss": 0.84514761, + "num_input_tokens_seen": 274624944, + "router_z_loss_mlp": 0.26208496, + "step": 3298, + "time_per_iteration": 2.6627304553985596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089642, + "balance_loss_mlp": 1.06351149, + "epoch": 0.634667179684494, + "flos": 605911601664.0, + "grad_norm": 0.06322814562663621, + "language_loss": 0.81827015, + "learning_rate": 0.00031103537878107403, + "loss": 0.82916659, + "num_input_tokens_seen": 274695152, + "router_z_loss_mlp": 0.26147461, + "step": 3299, + "time_per_iteration": 2.7386014461517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091865, + "balance_loss_mlp": 1.06578207, + "epoch": 0.6348595613697576, + "flos": 646944537600.0, + "grad_norm": 0.11045697323578996, + "language_loss": 0.80332845, + "learning_rate": 0.0003107469788679238, + "loss": 0.81424707, + "num_input_tokens_seen": 274767840, + "router_z_loss_mlp": 0.26086426, + "step": 3300, + "time_per_iteration": 2.7655692100524902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084258, + "balance_loss_mlp": 1.05724525, + "epoch": 0.6350519430550212, + "flos": 639074935296.0, + "grad_norm": 0.06273525226286222, + "language_loss": 0.8685059, + "learning_rate": 0.00031045865242931267, + "loss": 0.8793484, + "num_input_tokens_seen": 274839312, + "router_z_loss_mlp": 0.27026367, + "step": 3301, + "time_per_iteration": 2.8187057971954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092787, + "balance_loss_mlp": 1.06582153, + "epoch": 0.6352443247402847, + "flos": 686437908480.0, + "grad_norm": 0.06022790921544637, + "language_loss": 0.82959229, + "learning_rate": 0.00031017039957717877, + "loss": 0.84052014, + "num_input_tokens_seen": 274922704, + "router_z_loss_mlp": 0.27001953, + "step": 3302, + "time_per_iteration": 2.994527578353882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088849, + "balance_loss_mlp": 1.0623126, + "epoch": 0.6354367064255483, + "flos": 559442847744.0, + "grad_norm": 0.2662546903518702, + "language_loss": 0.8874619, + "learning_rate": 0.0003098822204234318, + "loss": 0.89835036, + "num_input_tokens_seen": 274992848, + "router_z_loss_mlp": 0.265625, + "step": 3303, + "time_per_iteration": 2.6759462356567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086338, + "balance_loss_mlp": 1.06104219, + "epoch": 0.6356290881108119, + "flos": 979487520768.0, + "grad_norm": 0.06306835331817585, + "language_loss": 0.87812388, + "learning_rate": 0.00030959411507995273, + "loss": 0.88898724, + "num_input_tokens_seen": 275071456, + "router_z_loss_mlp": 0.25317383, + "step": 3304, + "time_per_iteration": 3.2179057598114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089067, + "balance_loss_mlp": 1.06150627, + "epoch": 0.6358214697960755, + "flos": 528278298624.0, + "grad_norm": 0.09855035049223494, + "language_loss": 0.81458223, + "learning_rate": 0.00030930608365859407, + "loss": 0.82547283, + "num_input_tokens_seen": 275140512, + "router_z_loss_mlp": 0.27563477, + "step": 3305, + "time_per_iteration": 2.743131399154663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093486, + "balance_loss_mlp": 1.06724787, + "epoch": 0.6360138514813389, + "flos": 516811249152.0, + "grad_norm": 0.08448546978670643, + "language_loss": 0.87924969, + "learning_rate": 0.00030901812627117943, + "loss": 0.89018464, + "num_input_tokens_seen": 275210896, + "router_z_loss_mlp": 0.26257324, + "step": 3306, + "time_per_iteration": 2.6397995948791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090032, + "balance_loss_mlp": 1.06258953, + "epoch": 0.6362062331666025, + "flos": 466525163520.0, + "grad_norm": 0.06217165595181868, + "language_loss": 0.85291284, + "learning_rate": 0.000308730243029504, + "loss": 0.86381316, + "num_input_tokens_seen": 275279888, + "router_z_loss_mlp": 0.27416992, + "step": 3307, + "time_per_iteration": 2.604104995727539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091578, + "balance_loss_mlp": 1.06420732, + "epoch": 0.6363986148518661, + "flos": 549720193536.0, + "grad_norm": 0.05998324584382658, + "language_loss": 0.79413563, + "learning_rate": 0.0003084424340453339, + "loss": 0.80505145, + "num_input_tokens_seen": 275357056, + "router_z_loss_mlp": 0.27392578, + "step": 3308, + "time_per_iteration": 2.808955192565918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093812, + "balance_loss_mlp": 1.06555986, + "epoch": 0.6365909965371297, + "flos": 583049083392.0, + "grad_norm": 0.06232682903729, + "language_loss": 0.82260096, + "learning_rate": 0.0003081546994304064, + "loss": 0.83353913, + "num_input_tokens_seen": 275428240, + "router_z_loss_mlp": 0.28222656, + "step": 3309, + "time_per_iteration": 2.786863327026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090344, + "balance_loss_mlp": 1.06326008, + "epoch": 0.6367833782223933, + "flos": 531255739392.0, + "grad_norm": 0.059865329496528966, + "language_loss": 0.82539266, + "learning_rate": 0.0003078670392964298, + "loss": 0.83629608, + "num_input_tokens_seen": 275497568, + "router_z_loss_mlp": 0.27148438, + "step": 3310, + "time_per_iteration": 2.5970752239227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096243, + "balance_loss_mlp": 1.06832409, + "epoch": 0.6369757599076568, + "flos": 569506526208.0, + "grad_norm": 0.060559947779739796, + "language_loss": 0.82883835, + "learning_rate": 0.00030757945375508406, + "loss": 0.83980078, + "num_input_tokens_seen": 275569616, + "router_z_loss_mlp": 0.27929688, + "step": 3311, + "time_per_iteration": 2.6342813968658447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102084, + "balance_loss_mlp": 1.07375956, + "epoch": 0.6371681415929203, + "flos": 539957892096.0, + "grad_norm": 0.06259292774484726, + "language_loss": 0.81409383, + "learning_rate": 0.00030729194291801944, + "loss": 0.82511473, + "num_input_tokens_seen": 275641408, + "router_z_loss_mlp": 0.28283691, + "step": 3312, + "time_per_iteration": 2.6879191398620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102141, + "balance_loss_mlp": 1.07455623, + "epoch": 0.6373605232781839, + "flos": 483566423040.0, + "grad_norm": 0.07257562907286343, + "language_loss": 0.77341402, + "learning_rate": 0.00030700450689685787, + "loss": 0.78443545, + "num_input_tokens_seen": 275706608, + "router_z_loss_mlp": 0.27636719, + "step": 3313, + "time_per_iteration": 2.5379741191864014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093269, + "balance_loss_mlp": 1.06732869, + "epoch": 0.6375529049634475, + "flos": 578581636608.0, + "grad_norm": 0.05810286920956277, + "language_loss": 0.85484838, + "learning_rate": 0.00030671714580319186, + "loss": 0.86578107, + "num_input_tokens_seen": 275785952, + "router_z_loss_mlp": 0.25952148, + "step": 3314, + "time_per_iteration": 2.800306797027588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095409, + "balance_loss_mlp": 1.06806278, + "epoch": 0.637745286648711, + "flos": 682257530880.0, + "grad_norm": 0.07119187429341393, + "language_loss": 0.83300906, + "learning_rate": 0.0003064298597485846, + "loss": 0.84396315, + "num_input_tokens_seen": 275866240, + "router_z_loss_mlp": 0.27392578, + "step": 3315, + "time_per_iteration": 2.822500467300415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089294, + "balance_loss_mlp": 1.06213832, + "epoch": 0.6379376683339746, + "flos": 504637558272.0, + "grad_norm": 0.07085511575360878, + "language_loss": 0.84058923, + "learning_rate": 0.00030614264884457054, + "loss": 0.85148215, + "num_input_tokens_seen": 275936176, + "router_z_loss_mlp": 0.27197266, + "step": 3316, + "time_per_iteration": 2.670797348022461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090907, + "balance_loss_mlp": 1.06443071, + "epoch": 0.6381300500192382, + "flos": 502020965376.0, + "grad_norm": 0.0775113841286029, + "language_loss": 0.77307498, + "learning_rate": 0.000305855513202655, + "loss": 0.78398407, + "num_input_tokens_seen": 276004608, + "router_z_loss_mlp": 0.26477051, + "step": 3317, + "time_per_iteration": 2.585374355316162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088816, + "balance_loss_mlp": 1.06235111, + "epoch": 0.6383224317045018, + "flos": 400489961472.0, + "grad_norm": 0.06790961033266373, + "language_loss": 0.77846622, + "learning_rate": 0.0003055684529343138, + "loss": 0.78935432, + "num_input_tokens_seen": 276066688, + "router_z_loss_mlp": 0.26501465, + "step": 3318, + "time_per_iteration": 2.4445385932922363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085331, + "balance_loss_mlp": 1.0597012, + "epoch": 0.6385148133897653, + "flos": 499377208320.0, + "grad_norm": 0.06232442900596772, + "language_loss": 0.78594166, + "learning_rate": 0.00030528146815099374, + "loss": 0.79679501, + "num_input_tokens_seen": 276140000, + "router_z_loss_mlp": 0.25634766, + "step": 3319, + "time_per_iteration": 2.654273509979248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085377, + "balance_loss_mlp": 1.06078434, + "epoch": 0.6387071950750288, + "flos": 527665632768.0, + "grad_norm": 0.06473309855040241, + "language_loss": 0.72311449, + "learning_rate": 0.00030499455896411203, + "loss": 0.73396826, + "num_input_tokens_seen": 276209840, + "router_z_loss_mlp": 0.24597168, + "step": 3320, + "time_per_iteration": 2.60524320602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043431, + "balance_loss_mlp": 1.03561127, + "epoch": 0.6388995767602924, + "flos": 1455979069440.0, + "grad_norm": 0.03712674177895302, + "language_loss": 0.76300812, + "learning_rate": 0.0003047077254850568, + "loss": 0.77344245, + "num_input_tokens_seen": 276444784, + "router_z_loss_mlp": 0.078125, + "step": 3321, + "time_per_iteration": 4.941630601882935 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091368, + "balance_loss_mlp": 1.06535614, + "epoch": 0.639091958445556, + "flos": 603895191552.0, + "grad_norm": 0.06705543469002004, + "language_loss": 0.7662977, + "learning_rate": 0.0003044209678251865, + "loss": 0.77721143, + "num_input_tokens_seen": 276522768, + "router_z_loss_mlp": 0.26013184, + "step": 3322, + "time_per_iteration": 2.877448320388794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091614, + "balance_loss_mlp": 1.06602025, + "epoch": 0.6392843401308196, + "flos": 584516694528.0, + "grad_norm": 0.07788084148223126, + "language_loss": 0.84920502, + "learning_rate": 0.0003041342860958306, + "loss": 0.86012113, + "num_input_tokens_seen": 276597104, + "router_z_loss_mlp": 0.25610352, + "step": 3323, + "time_per_iteration": 2.8169727325439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093622, + "balance_loss_mlp": 1.06809974, + "epoch": 0.6394767218160831, + "flos": 514681413120.0, + "grad_norm": 0.09386152906491808, + "language_loss": 0.91524851, + "learning_rate": 0.00030384768040828857, + "loss": 0.92618477, + "num_input_tokens_seen": 276670256, + "router_z_loss_mlp": 0.25537109, + "step": 3324, + "time_per_iteration": 2.6935789585113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087009, + "balance_loss_mlp": 1.06294096, + "epoch": 0.6396691035013466, + "flos": 541732022784.0, + "grad_norm": 0.06024043560940697, + "language_loss": 0.85838866, + "learning_rate": 0.00030356115087383094, + "loss": 0.86925876, + "num_input_tokens_seen": 276737680, + "router_z_loss_mlp": 0.24047852, + "step": 3325, + "time_per_iteration": 2.645054340362549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108735, + "balance_loss_mlp": 1.06102872, + "epoch": 0.6398614851866102, + "flos": 525535796736.0, + "grad_norm": 0.054064191473810044, + "language_loss": 0.84931785, + "learning_rate": 0.00030327469760369803, + "loss": 0.86019135, + "num_input_tokens_seen": 276803808, + "router_z_loss_mlp": 0.26367188, + "step": 3326, + "time_per_iteration": 2.563873767852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085961, + "balance_loss_mlp": 1.05992579, + "epoch": 0.6400538668718738, + "flos": 622989937152.0, + "grad_norm": 0.06028685713056784, + "language_loss": 0.85342407, + "learning_rate": 0.0003029883207091009, + "loss": 0.86428368, + "num_input_tokens_seen": 276874752, + "router_z_loss_mlp": 0.26074219, + "step": 3327, + "time_per_iteration": 2.705343723297119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079972, + "balance_loss_mlp": 1.05283976, + "epoch": 0.6402462485571374, + "flos": 503367436800.0, + "grad_norm": 0.06637165202459654, + "language_loss": 0.78691089, + "learning_rate": 0.00030270202030122095, + "loss": 0.7977106, + "num_input_tokens_seen": 276947200, + "router_z_loss_mlp": 0.27172852, + "step": 3328, + "time_per_iteration": 2.708845853805542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081887, + "balance_loss_mlp": 1.05516016, + "epoch": 0.6404386302424009, + "flos": 819247260672.0, + "grad_norm": 0.06780948867889915, + "language_loss": 0.86619353, + "learning_rate": 0.00030241579649121, + "loss": 0.87701237, + "num_input_tokens_seen": 277025712, + "router_z_loss_mlp": 0.26782227, + "step": 3329, + "time_per_iteration": 2.9923856258392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084819, + "balance_loss_mlp": 1.05859339, + "epoch": 0.6406310119276645, + "flos": 471812677632.0, + "grad_norm": 0.052278869794255514, + "language_loss": 0.79563975, + "learning_rate": 0.00030212964939018994, + "loss": 0.80648792, + "num_input_tokens_seen": 277091264, + "router_z_loss_mlp": 0.26220703, + "step": 3330, + "time_per_iteration": 2.5270252227783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091276, + "balance_loss_mlp": 1.06483507, + "epoch": 0.6408233936129281, + "flos": 425583631872.0, + "grad_norm": 0.06193541615368343, + "language_loss": 0.85849935, + "learning_rate": 0.0003018435791092527, + "loss": 0.86941212, + "num_input_tokens_seen": 277154608, + "router_z_loss_mlp": 0.26489258, + "step": 3331, + "time_per_iteration": 2.4754018783569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081779, + "balance_loss_mlp": 1.05531454, + "epoch": 0.6410157752981916, + "flos": 549784433664.0, + "grad_norm": 0.08536903731672153, + "language_loss": 0.81342864, + "learning_rate": 0.00030155758575946083, + "loss": 0.82424641, + "num_input_tokens_seen": 277222176, + "router_z_loss_mlp": 0.26489258, + "step": 3332, + "time_per_iteration": 2.626554489135742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087465, + "balance_loss_mlp": 1.06135845, + "epoch": 0.6412081569834551, + "flos": 475899452928.0, + "grad_norm": 0.05880203513690982, + "language_loss": 0.83905303, + "learning_rate": 0.0003012716694518467, + "loss": 0.84992766, + "num_input_tokens_seen": 277289600, + "router_z_loss_mlp": 0.26135254, + "step": 3333, + "time_per_iteration": 2.563870906829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088918, + "balance_loss_mlp": 1.06233454, + "epoch": 0.6414005386687187, + "flos": 540921494016.0, + "grad_norm": 0.060655550998304664, + "language_loss": 0.85066408, + "learning_rate": 0.000300985830297413, + "loss": 0.86155331, + "num_input_tokens_seen": 277362784, + "router_z_loss_mlp": 0.26635742, + "step": 3334, + "time_per_iteration": 2.720207691192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085977, + "balance_loss_mlp": 1.05846334, + "epoch": 0.6415929203539823, + "flos": 1041317379072.0, + "grad_norm": 0.0660382374422698, + "language_loss": 0.87618732, + "learning_rate": 0.00030070006840713205, + "loss": 0.88704705, + "num_input_tokens_seen": 277449728, + "router_z_loss_mlp": 0.27563477, + "step": 3335, + "time_per_iteration": 3.3882405757904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086202, + "balance_loss_mlp": 1.06003511, + "epoch": 0.6417853020392459, + "flos": 648337996800.0, + "grad_norm": 0.05326551396050189, + "language_loss": 0.738437, + "learning_rate": 0.000300414383891947, + "loss": 0.74929905, + "num_input_tokens_seen": 277527552, + "router_z_loss_mlp": 0.26184082, + "step": 3336, + "time_per_iteration": 2.841377019882202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089325, + "balance_loss_mlp": 1.06317008, + "epoch": 0.6419776837245095, + "flos": 500899147776.0, + "grad_norm": 0.05652358101135248, + "language_loss": 0.88883501, + "learning_rate": 0.00030012877686276973, + "loss": 0.89972824, + "num_input_tokens_seen": 277603568, + "router_z_loss_mlp": 0.26196289, + "step": 3337, + "time_per_iteration": 2.729287624359131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109448, + "balance_loss_mlp": 1.06825364, + "epoch": 0.642170065409773, + "flos": 620620392960.0, + "grad_norm": 0.05602708574683237, + "language_loss": 0.87052727, + "learning_rate": 0.0002998432474304832, + "loss": 0.88147211, + "num_input_tokens_seen": 277679696, + "router_z_loss_mlp": 0.26269531, + "step": 3338, + "time_per_iteration": 2.763936996459961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033085, + "balance_loss_mlp": 1.02664769, + "epoch": 0.6423624470950365, + "flos": 1423539629568.0, + "grad_norm": 0.022262190661506177, + "language_loss": 0.79237342, + "learning_rate": 0.0002995577957059395, + "loss": 0.80270433, + "num_input_tokens_seen": 277913056, + "router_z_loss_mlp": 0.06445312, + "step": 3339, + "time_per_iteration": 4.899634838104248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085698, + "balance_loss_mlp": 1.06067634, + "epoch": 0.6425548287803001, + "flos": 562353477120.0, + "grad_norm": 0.05329063171196326, + "language_loss": 0.88739842, + "learning_rate": 0.00029927242179996107, + "loss": 0.89825541, + "num_input_tokens_seen": 277983168, + "router_z_loss_mlp": 0.25036621, + "step": 3340, + "time_per_iteration": 2.6731433868408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084705, + "balance_loss_mlp": 1.05887282, + "epoch": 0.6427472104655637, + "flos": 585443220480.0, + "grad_norm": 0.05323225781899137, + "language_loss": 0.83480287, + "learning_rate": 0.0002989871258233398, + "loss": 0.84564984, + "num_input_tokens_seen": 278057600, + "router_z_loss_mlp": 0.25830078, + "step": 3341, + "time_per_iteration": 2.7728755474090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092018, + "balance_loss_mlp": 1.06558967, + "epoch": 0.6429395921508272, + "flos": 404282700288.0, + "grad_norm": 0.07706425942828801, + "language_loss": 0.82514536, + "learning_rate": 0.0002987019078868373, + "loss": 0.83606553, + "num_input_tokens_seen": 278119232, + "router_z_loss_mlp": 0.26477051, + "step": 3342, + "time_per_iteration": 2.4401304721832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087083, + "balance_loss_mlp": 1.06178701, + "epoch": 0.6431319738360908, + "flos": 548783755776.0, + "grad_norm": 0.05844856820656981, + "language_loss": 0.81969512, + "learning_rate": 0.00029841676810118484, + "loss": 0.83056593, + "num_input_tokens_seen": 278187456, + "router_z_loss_mlp": 0.25317383, + "step": 3343, + "time_per_iteration": 2.662538766860962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081725, + "balance_loss_mlp": 1.05664337, + "epoch": 0.6433243555213544, + "flos": 793375368192.0, + "grad_norm": 0.059827459557400715, + "language_loss": 0.8744089, + "learning_rate": 0.0002981317065770839, + "loss": 0.88522613, + "num_input_tokens_seen": 278262176, + "router_z_loss_mlp": 0.25097656, + "step": 3344, + "time_per_iteration": 3.0547289848327637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084317, + "balance_loss_mlp": 1.05733991, + "epoch": 0.643516737206618, + "flos": 583031831040.0, + "grad_norm": 0.06660327590373825, + "language_loss": 0.80995148, + "learning_rate": 0.00029784672342520493, + "loss": 0.8207947, + "num_input_tokens_seen": 278328816, + "router_z_loss_mlp": 0.2701416, + "step": 3345, + "time_per_iteration": 2.665701389312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086547, + "balance_loss_mlp": 1.05967772, + "epoch": 0.6437091188918815, + "flos": 518750936064.0, + "grad_norm": 0.06646117456198827, + "language_loss": 0.83675933, + "learning_rate": 0.00029756181875618834, + "loss": 0.84762478, + "num_input_tokens_seen": 278395824, + "router_z_loss_mlp": 0.26904297, + "step": 3346, + "time_per_iteration": 2.5859789848327637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087095, + "balance_loss_mlp": 1.06036818, + "epoch": 0.643901500577145, + "flos": 384946048512.0, + "grad_norm": 0.0635179791741207, + "language_loss": 0.83513415, + "learning_rate": 0.0002972769926806439, + "loss": 0.84600508, + "num_input_tokens_seen": 278457696, + "router_z_loss_mlp": 0.26757812, + "step": 3347, + "time_per_iteration": 2.4656190872192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087894, + "balance_loss_mlp": 1.06159616, + "epoch": 0.6440938822624086, + "flos": 483722067456.0, + "grad_norm": 0.0627778117475219, + "language_loss": 0.89043599, + "learning_rate": 0.0002969922453091508, + "loss": 0.90131485, + "num_input_tokens_seen": 278526992, + "router_z_loss_mlp": 0.26342773, + "step": 3348, + "time_per_iteration": 2.5913443565368652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084136, + "balance_loss_mlp": 1.05721855, + "epoch": 0.6442862639476722, + "flos": 540469241856.0, + "grad_norm": 0.05415378993081624, + "language_loss": 0.85013533, + "learning_rate": 0.00029670757675225777, + "loss": 0.8609767, + "num_input_tokens_seen": 278601120, + "router_z_loss_mlp": 0.26953125, + "step": 3349, + "time_per_iteration": 2.739000082015991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085102, + "balance_loss_mlp": 1.05906665, + "epoch": 0.6444786456329358, + "flos": 526912003584.0, + "grad_norm": 0.06396799690402781, + "language_loss": 0.79375887, + "learning_rate": 0.0002964229871204831, + "loss": 0.80460995, + "num_input_tokens_seen": 278668208, + "router_z_loss_mlp": 0.26049805, + "step": 3350, + "time_per_iteration": 2.6291356086730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079691, + "balance_loss_mlp": 1.0546335, + "epoch": 0.6446710273181993, + "flos": 697892848128.0, + "grad_norm": 0.05949012862270097, + "language_loss": 0.83774936, + "learning_rate": 0.00029613847652431403, + "loss": 0.84854627, + "num_input_tokens_seen": 278742832, + "router_z_loss_mlp": 0.25073242, + "step": 3351, + "time_per_iteration": 2.839716672897339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077701, + "balance_loss_mlp": 1.05226183, + "epoch": 0.6448634090034628, + "flos": 625023226368.0, + "grad_norm": 0.056904070769954795, + "language_loss": 0.79438174, + "learning_rate": 0.0002958540450742078, + "loss": 0.80515873, + "num_input_tokens_seen": 278829744, + "router_z_loss_mlp": 0.2545166, + "step": 3352, + "time_per_iteration": 2.913639545440674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077873, + "balance_loss_mlp": 1.05242181, + "epoch": 0.6450557906887264, + "flos": 600950057472.0, + "grad_norm": 0.058859243742432434, + "language_loss": 0.77210569, + "learning_rate": 0.0002955696928805901, + "loss": 0.78288442, + "num_input_tokens_seen": 278908592, + "router_z_loss_mlp": 0.2545166, + "step": 3353, + "time_per_iteration": 2.923433780670166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081125, + "balance_loss_mlp": 1.05607951, + "epoch": 0.64524817237399, + "flos": 646200820224.0, + "grad_norm": 0.061599648682054316, + "language_loss": 0.8637355, + "learning_rate": 0.0002952854200538563, + "loss": 0.87454677, + "num_input_tokens_seen": 278986960, + "router_z_loss_mlp": 0.25061035, + "step": 3354, + "time_per_iteration": 2.8201682567596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082059, + "balance_loss_mlp": 1.05513, + "epoch": 0.6454405540592536, + "flos": 473411340288.0, + "grad_norm": 0.055453256805671876, + "language_loss": 0.82204401, + "learning_rate": 0.000295001226704371, + "loss": 0.83286464, + "num_input_tokens_seen": 279054896, + "router_z_loss_mlp": 0.26965332, + "step": 3355, + "time_per_iteration": 2.555814743041992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073686, + "balance_loss_mlp": 1.04755521, + "epoch": 0.6456329357445171, + "flos": 611841517056.0, + "grad_norm": 0.07998397222578815, + "language_loss": 0.83098918, + "learning_rate": 0.00029471711294246783, + "loss": 0.84172606, + "num_input_tokens_seen": 279126816, + "router_z_loss_mlp": 0.26171875, + "step": 3356, + "time_per_iteration": 2.7683768272399902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010734, + "balance_loss_mlp": 1.04853272, + "epoch": 0.6458253174297807, + "flos": 731683901952.0, + "grad_norm": 0.06636958548337468, + "language_loss": 0.82041395, + "learning_rate": 0.0002944330788784494, + "loss": 0.83114803, + "num_input_tokens_seen": 279197552, + "router_z_loss_mlp": 0.24865723, + "step": 3357, + "time_per_iteration": 2.9053289890289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070626, + "balance_loss_mlp": 1.04476953, + "epoch": 0.6460176991150443, + "flos": 570413228544.0, + "grad_norm": 0.05791600567825564, + "language_loss": 0.84893548, + "learning_rate": 0.00029414912462258786, + "loss": 0.85964179, + "num_input_tokens_seen": 279275440, + "router_z_loss_mlp": 0.25878906, + "step": 3358, + "time_per_iteration": 2.811368227005005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074683, + "balance_loss_mlp": 1.04814672, + "epoch": 0.6462100808003078, + "flos": 583160311296.0, + "grad_norm": 0.06332395198444683, + "language_loss": 0.81536913, + "learning_rate": 0.00029386525028512366, + "loss": 0.82611591, + "num_input_tokens_seen": 279349168, + "router_z_loss_mlp": 0.265625, + "step": 3359, + "time_per_iteration": 2.7373340129852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074991, + "balance_loss_mlp": 1.04820502, + "epoch": 0.6464024624855714, + "flos": 483919557120.0, + "grad_norm": 0.06353277324280042, + "language_loss": 0.87003738, + "learning_rate": 0.0002935814559762666, + "loss": 0.88078725, + "num_input_tokens_seen": 279427600, + "router_z_loss_mlp": 0.26794434, + "step": 3360, + "time_per_iteration": 2.7775824069976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071101, + "balance_loss_mlp": 1.04590034, + "epoch": 0.6465948441708349, + "flos": 527774289408.0, + "grad_norm": 0.05137930427454231, + "language_loss": 0.79679728, + "learning_rate": 0.0002932977418061957, + "loss": 0.80750829, + "num_input_tokens_seen": 279496608, + "router_z_loss_mlp": 0.25183105, + "step": 3361, + "time_per_iteration": 2.6293880939483643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073227, + "balance_loss_mlp": 1.04677427, + "epoch": 0.6467872258560985, + "flos": 669421615104.0, + "grad_norm": 0.06432809284202623, + "language_loss": 0.80709672, + "learning_rate": 0.00029301410788505833, + "loss": 0.81782901, + "num_input_tokens_seen": 279568448, + "router_z_loss_mlp": 0.26489258, + "step": 3362, + "time_per_iteration": 2.772700071334839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073962, + "balance_loss_mlp": 1.04715228, + "epoch": 0.6469796075413621, + "flos": 432101620224.0, + "grad_norm": 0.06908164950227988, + "language_loss": 0.81278014, + "learning_rate": 0.00029273055432297126, + "loss": 0.82351977, + "num_input_tokens_seen": 279631952, + "router_z_loss_mlp": 0.26782227, + "step": 3363, + "time_per_iteration": 2.479120969772339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068037, + "balance_loss_mlp": 1.04115558, + "epoch": 0.6471719892266257, + "flos": 803750335488.0, + "grad_norm": 0.06524076988807553, + "language_loss": 0.80934191, + "learning_rate": 0.00029244708123001917, + "loss": 0.82002234, + "num_input_tokens_seen": 279706880, + "router_z_loss_mlp": 0.26916504, + "step": 3364, + "time_per_iteration": 2.9441330432891846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068068, + "balance_loss_mlp": 1.04217577, + "epoch": 0.6473643709118891, + "flos": 577208001024.0, + "grad_norm": 0.06372584124812569, + "language_loss": 0.84562182, + "learning_rate": 0.0002921636887162565, + "loss": 0.8563025, + "num_input_tokens_seen": 279778864, + "router_z_loss_mlp": 0.25927734, + "step": 3365, + "time_per_iteration": 2.732980489730835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067865, + "balance_loss_mlp": 1.04277182, + "epoch": 0.6475567525971527, + "flos": 761420113920.0, + "grad_norm": 0.0749500155659675, + "language_loss": 0.83798963, + "learning_rate": 0.00029188037689170595, + "loss": 0.84866834, + "num_input_tokens_seen": 279853328, + "router_z_loss_mlp": 0.25109863, + "step": 3366, + "time_per_iteration": 2.9474096298217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068033, + "balance_loss_mlp": 1.04130602, + "epoch": 0.6477491342824163, + "flos": 843103116288.0, + "grad_norm": 0.06502471406083535, + "language_loss": 0.84043062, + "learning_rate": 0.0002915971458663586, + "loss": 0.85111088, + "num_input_tokens_seen": 279928464, + "router_z_loss_mlp": 0.26782227, + "step": 3367, + "time_per_iteration": 3.0719544887542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069519, + "balance_loss_mlp": 1.04331708, + "epoch": 0.6479415159676799, + "flos": 884820298752.0, + "grad_norm": 0.05257796695915082, + "language_loss": 0.81762713, + "learning_rate": 0.00029131399575017494, + "loss": 0.82832229, + "num_input_tokens_seen": 280015680, + "router_z_loss_mlp": 0.26245117, + "step": 3368, + "time_per_iteration": 3.195772171020508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071807, + "balance_loss_mlp": 1.04481828, + "epoch": 0.6481338976529435, + "flos": 615513116160.0, + "grad_norm": 0.05387315925396133, + "language_loss": 0.86003518, + "learning_rate": 0.0002910309266530836, + "loss": 0.87075323, + "num_input_tokens_seen": 280093904, + "router_z_loss_mlp": 0.27026367, + "step": 3369, + "time_per_iteration": 2.790093421936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075463, + "balance_loss_mlp": 1.04854584, + "epoch": 0.648326279338207, + "flos": 510009136128.0, + "grad_norm": 0.057981542205969905, + "language_loss": 0.85403055, + "learning_rate": 0.0002907479386849814, + "loss": 0.86478519, + "num_input_tokens_seen": 280161584, + "router_z_loss_mlp": 0.26977539, + "step": 3370, + "time_per_iteration": 2.628838062286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074904, + "balance_loss_mlp": 1.04926252, + "epoch": 0.6485186610234706, + "flos": 702498313728.0, + "grad_norm": 0.05712160703015161, + "language_loss": 0.80363882, + "learning_rate": 0.0002904650319557339, + "loss": 0.81438786, + "num_input_tokens_seen": 280248016, + "router_z_loss_mlp": 0.2565918, + "step": 3371, + "time_per_iteration": 2.9755005836486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073278, + "balance_loss_mlp": 1.04574049, + "epoch": 0.6487110427087341, + "flos": 560683233792.0, + "grad_norm": 0.07266117839515142, + "language_loss": 0.81511021, + "learning_rate": 0.0002901822065751758, + "loss": 0.82584298, + "num_input_tokens_seen": 280319024, + "router_z_loss_mlp": 0.27539062, + "step": 3372, + "time_per_iteration": 2.646740198135376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079407, + "balance_loss_mlp": 1.05310917, + "epoch": 0.6489034243939977, + "flos": 680100530688.0, + "grad_norm": 0.060084455172548096, + "language_loss": 0.85821176, + "learning_rate": 0.0002898994626531093, + "loss": 0.8690058, + "num_input_tokens_seen": 280393200, + "router_z_loss_mlp": 0.26318359, + "step": 3373, + "time_per_iteration": 2.8307554721832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079841, + "balance_loss_mlp": 1.05368662, + "epoch": 0.6490958060792612, + "flos": 474412018176.0, + "grad_norm": 0.06412256257505489, + "language_loss": 0.88422716, + "learning_rate": 0.00028961680029930526, + "loss": 0.89502561, + "num_input_tokens_seen": 280456944, + "router_z_loss_mlp": 0.26196289, + "step": 3374, + "time_per_iteration": 2.5427072048187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078278, + "balance_loss_mlp": 1.05246949, + "epoch": 0.6492881877645248, + "flos": 588850518528.0, + "grad_norm": 0.05984187516424017, + "language_loss": 0.77025837, + "learning_rate": 0.00028933421962350317, + "loss": 0.78104115, + "num_input_tokens_seen": 280534352, + "router_z_loss_mlp": 0.25830078, + "step": 3375, + "time_per_iteration": 2.732372999191284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076768, + "balance_loss_mlp": 1.05101824, + "epoch": 0.6494805694497884, + "flos": 642427905024.0, + "grad_norm": 0.06098588343511283, + "language_loss": 0.8395189, + "learning_rate": 0.0002890517207354104, + "loss": 0.8502866, + "num_input_tokens_seen": 280608912, + "router_z_loss_mlp": 0.2578125, + "step": 3376, + "time_per_iteration": 2.8559377193450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108488, + "balance_loss_mlp": 1.05872583, + "epoch": 0.649672951135052, + "flos": 531806736384.0, + "grad_norm": 0.061051185041057866, + "language_loss": 0.81743991, + "learning_rate": 0.0002887693037447029, + "loss": 0.82828867, + "num_input_tokens_seen": 280678848, + "router_z_loss_mlp": 0.26196289, + "step": 3377, + "time_per_iteration": 2.5842373371124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081133, + "balance_loss_mlp": 1.0550499, + "epoch": 0.6498653328203156, + "flos": 547387725312.0, + "grad_norm": 0.06328579672333946, + "language_loss": 0.82031405, + "learning_rate": 0.00028848696876102443, + "loss": 0.83112538, + "num_input_tokens_seen": 280750224, + "router_z_loss_mlp": 0.26086426, + "step": 3378, + "time_per_iteration": 2.6148552894592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085126, + "balance_loss_mlp": 1.05910289, + "epoch": 0.650057714505579, + "flos": 462228415488.0, + "grad_norm": 0.06296964534395977, + "language_loss": 0.83665496, + "learning_rate": 0.00028820471589398723, + "loss": 0.84750628, + "num_input_tokens_seen": 280817488, + "router_z_loss_mlp": 0.26062012, + "step": 3379, + "time_per_iteration": 2.5984256267547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087258, + "balance_loss_mlp": 1.06153309, + "epoch": 0.6502500961908426, + "flos": 510172121088.0, + "grad_norm": 0.06986995614305117, + "language_loss": 0.77549016, + "learning_rate": 0.00028792254525317196, + "loss": 0.78636277, + "num_input_tokens_seen": 280887440, + "router_z_loss_mlp": 0.25732422, + "step": 3380, + "time_per_iteration": 2.6670660972595215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091352, + "balance_loss_mlp": 1.06519723, + "epoch": 0.6504424778761062, + "flos": 579827165184.0, + "grad_norm": 0.07163565487878029, + "language_loss": 0.81605381, + "learning_rate": 0.00028764045694812645, + "loss": 0.82696736, + "num_input_tokens_seen": 280959072, + "router_z_loss_mlp": 0.26159668, + "step": 3381, + "time_per_iteration": 2.7534923553466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108911, + "balance_loss_mlp": 1.06213295, + "epoch": 0.6506348595613698, + "flos": 519457577472.0, + "grad_norm": 0.07829383117608732, + "language_loss": 0.76753044, + "learning_rate": 0.0002873584510883671, + "loss": 0.77842152, + "num_input_tokens_seen": 281025376, + "router_z_loss_mlp": 0.26989746, + "step": 3382, + "time_per_iteration": 2.5738234519958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089393, + "balance_loss_mlp": 1.0616889, + "epoch": 0.6508272412466333, + "flos": 510310513152.0, + "grad_norm": 0.05561178380226362, + "language_loss": 0.86494762, + "learning_rate": 0.0002870765277833788, + "loss": 0.87584156, + "num_input_tokens_seen": 281097616, + "router_z_loss_mlp": 0.27709961, + "step": 3383, + "time_per_iteration": 2.6669375896453857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080554, + "balance_loss_mlp": 1.05552006, + "epoch": 0.6510196229318969, + "flos": 625623782400.0, + "grad_norm": 0.06569130604090773, + "language_loss": 0.80749148, + "learning_rate": 0.00028679468714261347, + "loss": 0.81829703, + "num_input_tokens_seen": 281170192, + "router_z_loss_mlp": 0.25048828, + "step": 3384, + "time_per_iteration": 2.7443134784698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078779, + "balance_loss_mlp": 1.05354261, + "epoch": 0.6512120046171604, + "flos": 474696142848.0, + "grad_norm": 0.06683297733149338, + "language_loss": 0.76978695, + "learning_rate": 0.0002865129292754918, + "loss": 0.7805748, + "num_input_tokens_seen": 281238832, + "router_z_loss_mlp": 0.25256348, + "step": 3385, + "time_per_iteration": 2.553633213043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077167, + "balance_loss_mlp": 1.05206108, + "epoch": 0.651404386302424, + "flos": 551854798848.0, + "grad_norm": 0.07067523232573529, + "language_loss": 0.81812489, + "learning_rate": 0.00028623125429140105, + "loss": 0.82889658, + "num_input_tokens_seen": 281319472, + "router_z_loss_mlp": 0.25097656, + "step": 3386, + "time_per_iteration": 2.8174142837524414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081783, + "balance_loss_mlp": 1.05555665, + "epoch": 0.6515967679876876, + "flos": 523311985152.0, + "grad_norm": 0.06558978791095729, + "language_loss": 0.8706044, + "learning_rate": 0.00028594966229969785, + "loss": 0.88142228, + "num_input_tokens_seen": 281391168, + "router_z_loss_mlp": 0.2623291, + "step": 3387, + "time_per_iteration": 2.680281639099121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078521, + "balance_loss_mlp": 1.05267668, + "epoch": 0.6517891496729511, + "flos": 573874854912.0, + "grad_norm": 0.06492635522068706, + "language_loss": 0.81586945, + "learning_rate": 0.00028566815340970577, + "loss": 0.82665467, + "num_input_tokens_seen": 281465664, + "router_z_loss_mlp": 0.25878906, + "step": 3388, + "time_per_iteration": 2.732487916946411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075271, + "balance_loss_mlp": 1.05048704, + "epoch": 0.6519815313582147, + "flos": 555926893056.0, + "grad_norm": 0.06387919000258871, + "language_loss": 0.81219792, + "learning_rate": 0.0002853867277307162, + "loss": 0.8229506, + "num_input_tokens_seen": 281532928, + "router_z_loss_mlp": 0.2479248, + "step": 3389, + "time_per_iteration": 2.6404130458831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081113, + "balance_loss_mlp": 1.05424297, + "epoch": 0.6521739130434783, + "flos": 480487666176.0, + "grad_norm": 0.06082372499882378, + "language_loss": 0.82760382, + "learning_rate": 0.00028510538537198824, + "loss": 0.83841497, + "num_input_tokens_seen": 281601680, + "router_z_loss_mlp": 0.26928711, + "step": 3390, + "time_per_iteration": 2.5929770469665527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079433, + "balance_loss_mlp": 1.05408919, + "epoch": 0.6523662947287419, + "flos": 665707797504.0, + "grad_norm": 0.055684590981588886, + "language_loss": 0.86515808, + "learning_rate": 0.00028482412644274867, + "loss": 0.87595236, + "num_input_tokens_seen": 281679488, + "router_z_loss_mlp": 0.25366211, + "step": 3391, + "time_per_iteration": 2.9085311889648438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074445, + "balance_loss_mlp": 1.04809964, + "epoch": 0.6525586764140053, + "flos": 548655275520.0, + "grad_norm": 0.061522898278110257, + "language_loss": 0.74154258, + "learning_rate": 0.00028454295105219207, + "loss": 0.75228703, + "num_input_tokens_seen": 281751056, + "router_z_loss_mlp": 0.26367188, + "step": 3392, + "time_per_iteration": 2.604851245880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075224, + "balance_loss_mlp": 1.05011857, + "epoch": 0.6527510580992689, + "flos": 802900159488.0, + "grad_norm": 0.04678981860923424, + "language_loss": 0.79518068, + "learning_rate": 0.0002842618593094802, + "loss": 0.805933, + "num_input_tokens_seen": 281841008, + "router_z_loss_mlp": 0.25134277, + "step": 3393, + "time_per_iteration": 3.0968527793884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073633, + "balance_loss_mlp": 1.04734683, + "epoch": 0.6529434397845325, + "flos": 671166010368.0, + "grad_norm": 0.08516397934584916, + "language_loss": 0.80839396, + "learning_rate": 0.00028398085132374243, + "loss": 0.8191303, + "num_input_tokens_seen": 281908016, + "router_z_loss_mlp": 0.26306152, + "step": 3394, + "time_per_iteration": 2.802588701248169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071874, + "balance_loss_mlp": 1.04662573, + "epoch": 0.6531358214697961, + "flos": 828409006080.0, + "grad_norm": 0.059849085460161155, + "language_loss": 0.84382617, + "learning_rate": 0.0002836999272040761, + "loss": 0.85454488, + "num_input_tokens_seen": 281989072, + "router_z_loss_mlp": 0.25268555, + "step": 3395, + "time_per_iteration": 3.1209001541137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073974, + "balance_loss_mlp": 1.04781914, + "epoch": 0.6533282031550597, + "flos": 487403578368.0, + "grad_norm": 0.07079508853897194, + "language_loss": 0.84454936, + "learning_rate": 0.00028341908705954575, + "loss": 0.8552891, + "num_input_tokens_seen": 282053152, + "router_z_loss_mlp": 0.26196289, + "step": 3396, + "time_per_iteration": 2.5430474281311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014992, + "balance_loss_mlp": 1.00736308, + "epoch": 0.6535205848403232, + "flos": 1557744638976.0, + "grad_norm": 0.020137853963587818, + "language_loss": 0.81761813, + "learning_rate": 0.00028313833099918265, + "loss": 0.82776797, + "num_input_tokens_seen": 282283984, + "router_z_loss_mlp": 0.07617188, + "step": 3397, + "time_per_iteration": 4.857236862182617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073644, + "balance_loss_mlp": 1.04739439, + "epoch": 0.6537129665255867, + "flos": 493711593984.0, + "grad_norm": 0.05698390619804648, + "language_loss": 0.78328836, + "learning_rate": 0.00028285765913198604, + "loss": 0.79402483, + "num_input_tokens_seen": 282353008, + "router_z_loss_mlp": 0.26269531, + "step": 3398, + "time_per_iteration": 2.542471408843994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076746, + "balance_loss_mlp": 1.05030537, + "epoch": 0.6539053482108503, + "flos": 605002328064.0, + "grad_norm": 0.05420440820194427, + "language_loss": 0.821926, + "learning_rate": 0.0002825770715669227, + "loss": 0.83269352, + "num_input_tokens_seen": 282427648, + "router_z_loss_mlp": 0.26489258, + "step": 3399, + "time_per_iteration": 2.718555450439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106936, + "balance_loss_mlp": 1.04285991, + "epoch": 0.6540977298961139, + "flos": 577778821632.0, + "grad_norm": 0.06072932855544304, + "language_loss": 0.81462443, + "learning_rate": 0.00028229656841292634, + "loss": 0.82531804, + "num_input_tokens_seen": 282502128, + "router_z_loss_mlp": 0.26525879, + "step": 3400, + "time_per_iteration": 2.6755053997039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074211, + "balance_loss_mlp": 1.04766357, + "epoch": 0.6542901115813774, + "flos": 511753531392.0, + "grad_norm": 0.06986785605378391, + "language_loss": 0.762591, + "learning_rate": 0.0002820161497788979, + "loss": 0.77333307, + "num_input_tokens_seen": 282569360, + "router_z_loss_mlp": 0.265625, + "step": 3401, + "time_per_iteration": 2.56740140914917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076961, + "balance_loss_mlp": 1.05193925, + "epoch": 0.654482493266641, + "flos": 625495302144.0, + "grad_norm": 0.05855571008796804, + "language_loss": 0.87057543, + "learning_rate": 0.00028173581577370545, + "loss": 0.88134497, + "num_input_tokens_seen": 282645472, + "router_z_loss_mlp": 0.25036621, + "step": 3402, + "time_per_iteration": 2.7579104900360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074514, + "balance_loss_mlp": 1.04957581, + "epoch": 0.6546748749519046, + "flos": 523981550592.0, + "grad_norm": 0.05140393354142716, + "language_loss": 0.79220372, + "learning_rate": 0.0002814555665061844, + "loss": 0.80294883, + "num_input_tokens_seen": 282717568, + "router_z_loss_mlp": 0.24938965, + "step": 3403, + "time_per_iteration": 2.7005770206451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078214, + "balance_loss_mlp": 1.05273879, + "epoch": 0.6548672566371682, + "flos": 479210204160.0, + "grad_norm": 0.06470448826772422, + "language_loss": 0.77704948, + "learning_rate": 0.00028117540208513715, + "loss": 0.7878316, + "num_input_tokens_seen": 282791408, + "router_z_loss_mlp": 0.25476074, + "step": 3404, + "time_per_iteration": 2.6598384380340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078927, + "balance_loss_mlp": 1.05403566, + "epoch": 0.6550596383224317, + "flos": 616012356096.0, + "grad_norm": 0.06510521460794984, + "language_loss": 0.84932673, + "learning_rate": 0.00028089532261933313, + "loss": 0.860116, + "num_input_tokens_seen": 282862992, + "router_z_loss_mlp": 0.24890137, + "step": 3405, + "time_per_iteration": 2.693470001220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107722, + "balance_loss_mlp": 1.05179238, + "epoch": 0.6552520200076952, + "flos": 488836684800.0, + "grad_norm": 0.06574306959894075, + "language_loss": 0.85646415, + "learning_rate": 0.0002806153282175087, + "loss": 0.86723638, + "num_input_tokens_seen": 282930448, + "router_z_loss_mlp": 0.25439453, + "step": 3406, + "time_per_iteration": 2.5597920417785645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079604, + "balance_loss_mlp": 1.05415273, + "epoch": 0.6554444016929588, + "flos": 687619196928.0, + "grad_norm": 0.06979692390704297, + "language_loss": 0.83091819, + "learning_rate": 0.0002803354189883679, + "loss": 0.84171414, + "num_input_tokens_seen": 283010864, + "router_z_loss_mlp": 0.2545166, + "step": 3407, + "time_per_iteration": 2.8204212188720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078804, + "balance_loss_mlp": 1.05349612, + "epoch": 0.6556367833782224, + "flos": 543051330048.0, + "grad_norm": 0.05468628056475838, + "language_loss": 0.85987842, + "learning_rate": 0.00028005559504058053, + "loss": 0.8706665, + "num_input_tokens_seen": 283082240, + "router_z_loss_mlp": 0.2532959, + "step": 3408, + "time_per_iteration": 2.693559408187866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076206, + "balance_loss_mlp": 1.05038548, + "epoch": 0.655829165063486, + "flos": 673535554560.0, + "grad_norm": 0.07417771883494789, + "language_loss": 0.7684713, + "learning_rate": 0.0002797758564827838, + "loss": 0.77923334, + "num_input_tokens_seen": 283156656, + "router_z_loss_mlp": 0.25842285, + "step": 3409, + "time_per_iteration": 2.802828788757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084728, + "balance_loss_mlp": 1.05920529, + "epoch": 0.6560215467487496, + "flos": 531806736384.0, + "grad_norm": 0.06335346821862926, + "language_loss": 0.83560646, + "learning_rate": 0.0002794962034235824, + "loss": 0.84645367, + "num_input_tokens_seen": 283223584, + "router_z_loss_mlp": 0.25537109, + "step": 3410, + "time_per_iteration": 2.6147637367248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108148, + "balance_loss_mlp": 1.05519438, + "epoch": 0.656213928434013, + "flos": 591311467008.0, + "grad_norm": 0.06069626440640027, + "language_loss": 0.74793261, + "learning_rate": 0.00027921663597154695, + "loss": 0.7587474, + "num_input_tokens_seen": 283297680, + "router_z_loss_mlp": 0.26281738, + "step": 3411, + "time_per_iteration": 2.7347841262817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081407, + "balance_loss_mlp": 1.05633736, + "epoch": 0.6564063101192766, + "flos": 415786825728.0, + "grad_norm": 0.07186540610549816, + "language_loss": 0.81030178, + "learning_rate": 0.00027893715423521525, + "loss": 0.82111579, + "num_input_tokens_seen": 283359744, + "router_z_loss_mlp": 0.25085449, + "step": 3412, + "time_per_iteration": 2.4426064491271973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090629, + "balance_loss_mlp": 1.06429613, + "epoch": 0.6565986918045402, + "flos": 453321059328.0, + "grad_norm": 0.057164257181416274, + "language_loss": 0.83953196, + "learning_rate": 0.00027865775832309163, + "loss": 0.85043824, + "num_input_tokens_seen": 283430688, + "router_z_loss_mlp": 0.26379395, + "step": 3413, + "time_per_iteration": 2.661008358001709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089453, + "balance_loss_mlp": 1.06320286, + "epoch": 0.6567910734898038, + "flos": 547746001920.0, + "grad_norm": 0.059355909745470246, + "language_loss": 0.86547339, + "learning_rate": 0.00027837844834364733, + "loss": 0.87636793, + "num_input_tokens_seen": 283498048, + "router_z_loss_mlp": 0.26269531, + "step": 3414, + "time_per_iteration": 2.6107146739959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108986, + "balance_loss_mlp": 1.06451583, + "epoch": 0.6569834551750673, + "flos": 655518210048.0, + "grad_norm": 0.058864717061538036, + "language_loss": 0.86578488, + "learning_rate": 0.00027809922440532, + "loss": 0.87668347, + "num_input_tokens_seen": 283573040, + "router_z_loss_mlp": 0.25366211, + "step": 3415, + "time_per_iteration": 2.8214099407196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085792, + "balance_loss_mlp": 1.05929208, + "epoch": 0.6571758368603309, + "flos": 539681107968.0, + "grad_norm": 0.06707421916858435, + "language_loss": 0.80825239, + "learning_rate": 0.00027782008661651406, + "loss": 0.81911027, + "num_input_tokens_seen": 283651696, + "router_z_loss_mlp": 0.26513672, + "step": 3416, + "time_per_iteration": 2.772441864013672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087765, + "balance_loss_mlp": 1.06200361, + "epoch": 0.6573682185455945, + "flos": 497346117120.0, + "grad_norm": 0.054600094461814935, + "language_loss": 0.87535822, + "learning_rate": 0.00027754103508560013, + "loss": 0.88623583, + "num_input_tokens_seen": 283721824, + "router_z_loss_mlp": 0.25769043, + "step": 3417, + "time_per_iteration": 2.5883491039276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108732, + "balance_loss_mlp": 1.06016374, + "epoch": 0.657560600230858, + "flos": 447465295872.0, + "grad_norm": 0.057346286211937464, + "language_loss": 0.83059859, + "learning_rate": 0.0002772620699209163, + "loss": 0.84147179, + "num_input_tokens_seen": 283786960, + "router_z_loss_mlp": 0.27197266, + "step": 3418, + "time_per_iteration": 2.560173988342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080511, + "balance_loss_mlp": 1.05552435, + "epoch": 0.6577529819161216, + "flos": 481940596224.0, + "grad_norm": 0.07342594011001312, + "language_loss": 0.80011356, + "learning_rate": 0.0002769831912307658, + "loss": 0.81091869, + "num_input_tokens_seen": 283853808, + "router_z_loss_mlp": 0.24987793, + "step": 3419, + "time_per_iteration": 2.5090081691741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077943, + "balance_loss_mlp": 1.05116832, + "epoch": 0.6579453636013851, + "flos": 530843134464.0, + "grad_norm": 0.15397597060543888, + "language_loss": 0.80397606, + "learning_rate": 0.00027670439912341917, + "loss": 0.81475556, + "num_input_tokens_seen": 283920960, + "router_z_loss_mlp": 0.26782227, + "step": 3420, + "time_per_iteration": 2.6002025604248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107857, + "balance_loss_mlp": 1.05198634, + "epoch": 0.6581377452866487, + "flos": 628037743104.0, + "grad_norm": 0.05399899267227409, + "language_loss": 0.83793807, + "learning_rate": 0.0002764256937071129, + "loss": 0.84872377, + "num_input_tokens_seen": 283992416, + "router_z_loss_mlp": 0.26611328, + "step": 3421, + "time_per_iteration": 2.7873942852020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074524, + "balance_loss_mlp": 1.04920375, + "epoch": 0.6583301269719123, + "flos": 548618199552.0, + "grad_norm": 0.0598445160882451, + "language_loss": 0.87503046, + "learning_rate": 0.00027614707509005036, + "loss": 0.88577569, + "num_input_tokens_seen": 284061760, + "router_z_loss_mlp": 0.25341797, + "step": 3422, + "time_per_iteration": 2.659196615219116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079222, + "balance_loss_mlp": 1.05353248, + "epoch": 0.6585225086571759, + "flos": 427493583360.0, + "grad_norm": 0.05796849455801806, + "language_loss": 0.79051846, + "learning_rate": 0.0002758685433804008, + "loss": 0.80131066, + "num_input_tokens_seen": 284124848, + "router_z_loss_mlp": 0.25695801, + "step": 3423, + "time_per_iteration": 2.5024282932281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074298, + "balance_loss_mlp": 1.04835773, + "epoch": 0.6587148903424394, + "flos": 859620542976.0, + "grad_norm": 0.06008115307148776, + "language_loss": 0.79408616, + "learning_rate": 0.00027559009868630005, + "loss": 0.80482912, + "num_input_tokens_seen": 284206272, + "router_z_loss_mlp": 0.25964355, + "step": 3424, + "time_per_iteration": 3.0929386615753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073444, + "balance_loss_mlp": 1.0477066, + "epoch": 0.6589072720277029, + "flos": 805630551552.0, + "grad_norm": 0.05902981727550509, + "language_loss": 0.80511308, + "learning_rate": 0.0002753117411158491, + "loss": 0.81584746, + "num_input_tokens_seen": 284293696, + "router_z_loss_mlp": 0.25744629, + "step": 3425, + "time_per_iteration": 3.0452723503112793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083204, + "balance_loss_mlp": 1.05611944, + "epoch": 0.6590996537129665, + "flos": 548618199552.0, + "grad_norm": 0.053958914804285704, + "language_loss": 0.8972351, + "learning_rate": 0.0002750334707771168, + "loss": 0.90806711, + "num_input_tokens_seen": 284360192, + "router_z_loss_mlp": 0.27124023, + "step": 3426, + "time_per_iteration": 2.626776695251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108887, + "balance_loss_mlp": 1.06247699, + "epoch": 0.6592920353982301, + "flos": 454166092800.0, + "grad_norm": 0.06696403596262077, + "language_loss": 0.81474262, + "learning_rate": 0.0002747552877781369, + "loss": 0.82563138, + "num_input_tokens_seen": 284423680, + "router_z_loss_mlp": 0.26367188, + "step": 3427, + "time_per_iteration": 2.49870228767395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082041, + "balance_loss_mlp": 1.05622029, + "epoch": 0.6594844170834937, + "flos": 567174057984.0, + "grad_norm": 0.056641096462852314, + "language_loss": 0.82350707, + "learning_rate": 0.0002744771922269097, + "loss": 0.83432746, + "num_input_tokens_seen": 284495712, + "router_z_loss_mlp": 0.25805664, + "step": 3428, + "time_per_iteration": 2.76737117767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083165, + "balance_loss_mlp": 1.05777287, + "epoch": 0.6596767987687572, + "flos": 1187911194624.0, + "grad_norm": 0.05792922212348718, + "language_loss": 0.82232559, + "learning_rate": 0.0002741991842314015, + "loss": 0.83315718, + "num_input_tokens_seen": 284583440, + "router_z_loss_mlp": 0.25415039, + "step": 3429, + "time_per_iteration": 3.4959795475006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082387, + "balance_loss_mlp": 1.05617321, + "epoch": 0.6598691804540208, + "flos": 503491147776.0, + "grad_norm": 0.05913775342689391, + "language_loss": 0.86208242, + "learning_rate": 0.0002739212638995445, + "loss": 0.87290633, + "num_input_tokens_seen": 284649168, + "router_z_loss_mlp": 0.26220703, + "step": 3430, + "time_per_iteration": 2.552647113800049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091567, + "balance_loss_mlp": 1.06441104, + "epoch": 0.6600615621392844, + "flos": 531337231872.0, + "grad_norm": 0.06592703083279383, + "language_loss": 0.83386678, + "learning_rate": 0.00027364343133923696, + "loss": 0.84478247, + "num_input_tokens_seen": 284723136, + "router_z_loss_mlp": 0.27172852, + "step": 3431, + "time_per_iteration": 2.639110565185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080701, + "balance_loss_mlp": 1.05480886, + "epoch": 0.6602539438245479, + "flos": 565446915072.0, + "grad_norm": 0.06195217340834915, + "language_loss": 0.8308382, + "learning_rate": 0.0002733656866583431, + "loss": 0.84164518, + "num_input_tokens_seen": 284792752, + "router_z_loss_mlp": 0.25927734, + "step": 3432, + "time_per_iteration": 2.6898815631866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091031, + "balance_loss_mlp": 1.0637325, + "epoch": 0.6604463255098114, + "flos": 857159594496.0, + "grad_norm": 0.07646297806496907, + "language_loss": 0.83208609, + "learning_rate": 0.0002730880299646927, + "loss": 0.84299648, + "num_input_tokens_seen": 284871008, + "router_z_loss_mlp": 0.27307129, + "step": 3433, + "time_per_iteration": 3.0324153900146484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086317, + "balance_loss_mlp": 1.06028199, + "epoch": 0.660638707195075, + "flos": 674462080512.0, + "grad_norm": 0.09642118703773885, + "language_loss": 0.85385412, + "learning_rate": 0.0002728104613660821, + "loss": 0.8647173, + "num_input_tokens_seen": 284945184, + "router_z_loss_mlp": 0.26074219, + "step": 3434, + "time_per_iteration": 2.8242013454437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082794, + "balance_loss_mlp": 1.0578196, + "epoch": 0.6608310888803386, + "flos": 888961402368.0, + "grad_norm": 0.06046346369252319, + "language_loss": 0.83065814, + "learning_rate": 0.0002725329809702729, + "loss": 0.8414861, + "num_input_tokens_seen": 285029296, + "router_z_loss_mlp": 0.25012207, + "step": 3435, + "time_per_iteration": 3.208373546600342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086877, + "balance_loss_mlp": 1.06015027, + "epoch": 0.6610234705656022, + "flos": 1136347646976.0, + "grad_norm": 0.06729202687842574, + "language_loss": 0.76439357, + "learning_rate": 0.0002722555888849921, + "loss": 0.77526236, + "num_input_tokens_seen": 285124720, + "router_z_loss_mlp": 0.26757812, + "step": 3436, + "time_per_iteration": 3.455219030380249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108327, + "balance_loss_mlp": 1.05748534, + "epoch": 0.6612158522508658, + "flos": 468012598272.0, + "grad_norm": 0.06326694519745679, + "language_loss": 0.80694687, + "learning_rate": 0.00027197828521793334, + "loss": 0.8177796, + "num_input_tokens_seen": 285191360, + "router_z_loss_mlp": 0.25793457, + "step": 3437, + "time_per_iteration": 2.500117301940918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086414, + "balance_loss_mlp": 1.06089163, + "epoch": 0.6614082339361292, + "flos": 571653614592.0, + "grad_norm": 0.06352548474841713, + "language_loss": 0.84948212, + "learning_rate": 0.0002717010700767552, + "loss": 0.86034626, + "num_input_tokens_seen": 285262624, + "router_z_loss_mlp": 0.25549316, + "step": 3438, + "time_per_iteration": 2.7301025390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088348, + "balance_loss_mlp": 1.06205106, + "epoch": 0.6616006156213928, + "flos": 498467934720.0, + "grad_norm": 0.06533223637533662, + "language_loss": 0.75988388, + "learning_rate": 0.00027142394356908226, + "loss": 0.77076733, + "num_input_tokens_seen": 285328512, + "router_z_loss_mlp": 0.26318359, + "step": 3439, + "time_per_iteration": 2.5677285194396973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086811, + "balance_loss_mlp": 1.06116903, + "epoch": 0.6617929973066564, + "flos": 602420239872.0, + "grad_norm": 0.0569940621311471, + "language_loss": 0.85089839, + "learning_rate": 0.00027114690580250456, + "loss": 0.86176658, + "num_input_tokens_seen": 285406128, + "router_z_loss_mlp": 0.25646973, + "step": 3440, + "time_per_iteration": 2.738121509552002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093124, + "balance_loss_mlp": 1.06724405, + "epoch": 0.66198537899192, + "flos": 522983443968.0, + "grad_norm": 0.05472871432656112, + "language_loss": 0.86912161, + "learning_rate": 0.0002708699568845776, + "loss": 0.88005286, + "num_input_tokens_seen": 285474704, + "router_z_loss_mlp": 0.25891113, + "step": 3441, + "time_per_iteration": 2.611889600753784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041326, + "balance_loss_mlp": 1.03374481, + "epoch": 0.6621777606771835, + "flos": 1566256642560.0, + "grad_norm": 0.021890830835033067, + "language_loss": 0.79287779, + "learning_rate": 0.00027059309692282265, + "loss": 0.80329108, + "num_input_tokens_seen": 285698704, + "router_z_loss_mlp": 0.07568359, + "step": 3442, + "time_per_iteration": 4.8971052169799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090068, + "balance_loss_mlp": 1.06495047, + "epoch": 0.6623701423624471, + "flos": 526664954880.0, + "grad_norm": 0.064050238945667, + "language_loss": 0.83170366, + "learning_rate": 0.0002703163260247261, + "loss": 0.8426044, + "num_input_tokens_seen": 285767936, + "router_z_loss_mlp": 0.25134277, + "step": 3443, + "time_per_iteration": 2.5994081497192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109146, + "balance_loss_mlp": 1.06699824, + "epoch": 0.6625625240477107, + "flos": 528179553792.0, + "grad_norm": 0.06534456788919288, + "language_loss": 0.81938642, + "learning_rate": 0.0002700396442977399, + "loss": 0.83030105, + "num_input_tokens_seen": 285839456, + "router_z_loss_mlp": 0.24462891, + "step": 3444, + "time_per_iteration": 2.6017937660217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091879, + "balance_loss_mlp": 1.06627333, + "epoch": 0.6627549057329742, + "flos": 473122073088.0, + "grad_norm": 0.06451262067496133, + "language_loss": 0.84422678, + "learning_rate": 0.0002697630518492817, + "loss": 0.85514563, + "num_input_tokens_seen": 285905904, + "router_z_loss_mlp": 0.25634766, + "step": 3445, + "time_per_iteration": 2.628159523010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094697, + "balance_loss_mlp": 1.06956816, + "epoch": 0.6629472874182378, + "flos": 527996745216.0, + "grad_norm": 0.05416253097531709, + "language_loss": 0.85508287, + "learning_rate": 0.0002694865487867343, + "loss": 0.8660298, + "num_input_tokens_seen": 285975520, + "router_z_loss_mlp": 0.25134277, + "step": 3446, + "time_per_iteration": 2.604813814163208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088071, + "balance_loss_mlp": 1.06316853, + "epoch": 0.6631396691035013, + "flos": 613200471552.0, + "grad_norm": 0.052847331110623744, + "language_loss": 0.84946668, + "learning_rate": 0.0002692101352174453, + "loss": 0.86034739, + "num_input_tokens_seen": 286050320, + "router_z_loss_mlp": 0.24914551, + "step": 3447, + "time_per_iteration": 2.768223285675049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109622, + "balance_loss_mlp": 1.06981492, + "epoch": 0.6633320507887649, + "flos": 609318899712.0, + "grad_norm": 0.058874726069321814, + "language_loss": 0.8497262, + "learning_rate": 0.00026893381124872787, + "loss": 0.86068839, + "num_input_tokens_seen": 286120672, + "router_z_loss_mlp": 0.26452637, + "step": 3448, + "time_per_iteration": 2.6762025356292725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090166, + "balance_loss_mlp": 1.06560886, + "epoch": 0.6635244324740285, + "flos": 749700873216.0, + "grad_norm": 0.057817999010546496, + "language_loss": 0.80621779, + "learning_rate": 0.00026865757698786097, + "loss": 0.81711942, + "num_input_tokens_seen": 286201152, + "router_z_loss_mlp": 0.24584961, + "step": 3449, + "time_per_iteration": 3.0353593826293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088537, + "balance_loss_mlp": 1.06256163, + "epoch": 0.6637168141592921, + "flos": 664526882304.0, + "grad_norm": 0.06325061387502293, + "language_loss": 0.81828356, + "learning_rate": 0.000268381432542088, + "loss": 0.82916903, + "num_input_tokens_seen": 286274512, + "router_z_loss_mlp": 0.26000977, + "step": 3450, + "time_per_iteration": 2.8381845951080322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085731, + "balance_loss_mlp": 1.05967212, + "epoch": 0.6639091958445555, + "flos": 606783799296.0, + "grad_norm": 0.06107082028806233, + "language_loss": 0.80140352, + "learning_rate": 0.00026810537801861807, + "loss": 0.81226087, + "num_input_tokens_seen": 286349808, + "router_z_loss_mlp": 0.26074219, + "step": 3451, + "time_per_iteration": 2.755697727203369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091691, + "balance_loss_mlp": 1.06455863, + "epoch": 0.6641015775298191, + "flos": 476697498624.0, + "grad_norm": 0.05182534623872074, + "language_loss": 0.8148368, + "learning_rate": 0.0002678294135246243, + "loss": 0.82575375, + "num_input_tokens_seen": 286422912, + "router_z_loss_mlp": 0.27087402, + "step": 3452, + "time_per_iteration": 2.7235701084136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077401, + "balance_loss_mlp": 1.05224776, + "epoch": 0.6642939592150827, + "flos": 904115105280.0, + "grad_norm": 0.07490988727173932, + "language_loss": 0.86671561, + "learning_rate": 0.0002675535391672463, + "loss": 0.87748969, + "num_input_tokens_seen": 286501072, + "router_z_loss_mlp": 0.25170898, + "step": 3453, + "time_per_iteration": 3.0891692638397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080209, + "balance_loss_mlp": 1.05430508, + "epoch": 0.6644863409003463, + "flos": 581808697344.0, + "grad_norm": 0.05695144440492774, + "language_loss": 0.86551011, + "learning_rate": 0.0002672777550535877, + "loss": 0.8763122, + "num_input_tokens_seen": 286580480, + "router_z_loss_mlp": 0.25939941, + "step": 3454, + "time_per_iteration": 2.7647364139556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078097, + "balance_loss_mlp": 1.05288386, + "epoch": 0.6646787225856099, + "flos": 479002802688.0, + "grad_norm": 0.06003914399103326, + "language_loss": 0.85505843, + "learning_rate": 0.00026700206129071747, + "loss": 0.86583936, + "num_input_tokens_seen": 286646208, + "router_z_loss_mlp": 0.25231934, + "step": 3455, + "time_per_iteration": 2.5821306705474854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078808, + "balance_loss_mlp": 1.05316663, + "epoch": 0.6648711042708734, + "flos": 449906420736.0, + "grad_norm": 0.06471391174754697, + "language_loss": 0.88815629, + "learning_rate": 0.00026672645798566925, + "loss": 0.89894438, + "num_input_tokens_seen": 286710624, + "router_z_loss_mlp": 0.25671387, + "step": 3456, + "time_per_iteration": 2.536905288696289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073004, + "balance_loss_mlp": 1.04708791, + "epoch": 0.665063485956137, + "flos": 858960516096.0, + "grad_norm": 0.06322098786419635, + "language_loss": 0.79450369, + "learning_rate": 0.00026645094524544225, + "loss": 0.80523372, + "num_input_tokens_seen": 286799472, + "router_z_loss_mlp": 0.25927734, + "step": 3457, + "time_per_iteration": 3.346942663192749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080341, + "balance_loss_mlp": 1.05416238, + "epoch": 0.6652558676414005, + "flos": 604312939008.0, + "grad_norm": 0.07380509782774128, + "language_loss": 0.75270724, + "learning_rate": 0.00026617552317699945, + "loss": 0.76351058, + "num_input_tokens_seen": 286874752, + "router_z_loss_mlp": 0.26220703, + "step": 3458, + "time_per_iteration": 2.8174753189086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075769, + "balance_loss_mlp": 1.05087817, + "epoch": 0.6654482493266641, + "flos": 510394576896.0, + "grad_norm": 0.06466167118068906, + "language_loss": 0.87317026, + "learning_rate": 0.0002659001918872693, + "loss": 0.88392794, + "num_input_tokens_seen": 286943312, + "router_z_loss_mlp": 0.24890137, + "step": 3459, + "time_per_iteration": 2.620330810546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078289, + "balance_loss_mlp": 1.0529331, + "epoch": 0.6656406310119277, + "flos": 565605130752.0, + "grad_norm": 0.06328415655418428, + "language_loss": 0.81001127, + "learning_rate": 0.0002656249514831449, + "loss": 0.82079417, + "num_input_tokens_seen": 287010000, + "router_z_loss_mlp": 0.25378418, + "step": 3460, + "time_per_iteration": 2.6549599170684814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079023, + "balance_loss_mlp": 1.05377483, + "epoch": 0.6658330126971912, + "flos": 1024298141184.0, + "grad_norm": 0.054463111692168976, + "language_loss": 0.86972237, + "learning_rate": 0.00026534980207148416, + "loss": 0.8805126, + "num_input_tokens_seen": 287101456, + "router_z_loss_mlp": 0.25256348, + "step": 3461, + "time_per_iteration": 3.424241065979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086135, + "balance_loss_mlp": 1.05996895, + "epoch": 0.6660253943824548, + "flos": 816823388160.0, + "grad_norm": 0.06786500256083805, + "language_loss": 0.7389307, + "learning_rate": 0.0002650747437591097, + "loss": 0.7497921, + "num_input_tokens_seen": 287182848, + "router_z_loss_mlp": 0.26208496, + "step": 3462, + "time_per_iteration": 3.037792921066284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020491, + "balance_loss_mlp": 1.01310015, + "epoch": 0.6662177760677184, + "flos": 1496169169920.0, + "grad_norm": 0.010691660665593496, + "language_loss": 0.8187958, + "learning_rate": 0.00026479977665280806, + "loss": 0.82900071, + "num_input_tokens_seen": 287417920, + "router_z_loss_mlp": 0.07373047, + "step": 3463, + "time_per_iteration": 5.019932985305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077653, + "balance_loss_mlp": 1.05172443, + "epoch": 0.666410157752982, + "flos": 500120925696.0, + "grad_norm": 0.0677151355970307, + "language_loss": 0.86401796, + "learning_rate": 0.00026452490085933155, + "loss": 0.87479448, + "num_input_tokens_seen": 287483776, + "router_z_loss_mlp": 0.25952148, + "step": 3464, + "time_per_iteration": 2.577608346939087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010794, + "balance_loss_mlp": 1.05381727, + "epoch": 0.6666025394382454, + "flos": 481169714688.0, + "grad_norm": 0.06950705493870243, + "language_loss": 0.90135396, + "learning_rate": 0.00026425011648539614, + "loss": 0.91214788, + "num_input_tokens_seen": 287548176, + "router_z_loss_mlp": 0.25622559, + "step": 3465, + "time_per_iteration": 2.5207860469818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078468, + "balance_loss_mlp": 1.0527184, + "epoch": 0.666794921123509, + "flos": 546653919744.0, + "grad_norm": 0.06360289256438866, + "language_loss": 0.83105028, + "learning_rate": 0.00026397542363768267, + "loss": 0.84183496, + "num_input_tokens_seen": 287618496, + "router_z_loss_mlp": 0.25769043, + "step": 3466, + "time_per_iteration": 2.662781238555908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081527, + "balance_loss_mlp": 1.05476463, + "epoch": 0.6669873028087726, + "flos": 471988145664.0, + "grad_norm": 0.11778132677194894, + "language_loss": 0.8209849, + "learning_rate": 0.0002637008224228362, + "loss": 0.83180016, + "num_input_tokens_seen": 287684032, + "router_z_loss_mlp": 0.26794434, + "step": 3467, + "time_per_iteration": 2.5543577671051025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084486, + "balance_loss_mlp": 1.05868888, + "epoch": 0.6671796844940362, + "flos": 547395065856.0, + "grad_norm": 0.04775421920110858, + "language_loss": 0.8469578, + "learning_rate": 0.00026342631294746653, + "loss": 0.85780263, + "num_input_tokens_seen": 287757680, + "router_z_loss_mlp": 0.25842285, + "step": 3468, + "time_per_iteration": 2.7040185928344727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086094, + "balance_loss_mlp": 1.06041682, + "epoch": 0.6673720661792998, + "flos": 1070317214208.0, + "grad_norm": 0.049080807880720057, + "language_loss": 0.81080979, + "learning_rate": 0.0002631518953181476, + "loss": 0.82167077, + "num_input_tokens_seen": 287848992, + "router_z_loss_mlp": 0.25671387, + "step": 3469, + "time_per_iteration": 3.493414878845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011784, + "balance_loss_mlp": 1.00391626, + "epoch": 0.6675644478645633, + "flos": 1523790600192.0, + "grad_norm": 0.010939757170187329, + "language_loss": 0.76325285, + "learning_rate": 0.000262877569641418, + "loss": 0.77337068, + "num_input_tokens_seen": 288085680, + "router_z_loss_mlp": 0.07861328, + "step": 3470, + "time_per_iteration": 4.9387853145599365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087371, + "balance_loss_mlp": 1.06110907, + "epoch": 0.6677568295498268, + "flos": 579696113664.0, + "grad_norm": 0.0606952460981544, + "language_loss": 0.80340272, + "learning_rate": 0.00026260333602377985, + "loss": 0.81427646, + "num_input_tokens_seen": 288161568, + "router_z_loss_mlp": 0.26281738, + "step": 3471, + "time_per_iteration": 2.838916063308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109892, + "balance_loss_mlp": 1.0729208, + "epoch": 0.6679492112350904, + "flos": 383935458816.0, + "grad_norm": 0.06496239585891986, + "language_loss": 0.87351251, + "learning_rate": 0.0002623291945717007, + "loss": 0.88450176, + "num_input_tokens_seen": 288224032, + "router_z_loss_mlp": 0.26000977, + "step": 3472, + "time_per_iteration": 2.4870412349700928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097292, + "balance_loss_mlp": 1.07054186, + "epoch": 0.668141592920354, + "flos": 1150759830528.0, + "grad_norm": 0.04982364311813806, + "language_loss": 0.84127951, + "learning_rate": 0.00026205514539161175, + "loss": 0.85225236, + "num_input_tokens_seen": 288312912, + "router_z_loss_mlp": 0.26782227, + "step": 3473, + "time_per_iteration": 3.565732479095459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102422, + "balance_loss_mlp": 1.07651806, + "epoch": 0.6683339746056175, + "flos": 561100608000.0, + "grad_norm": 0.06841158179572154, + "language_loss": 0.84113353, + "learning_rate": 0.00026178118858990773, + "loss": 0.85215771, + "num_input_tokens_seen": 288394224, + "router_z_loss_mlp": 0.2590332, + "step": 3474, + "time_per_iteration": 2.8573057651519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087697, + "balance_loss_mlp": 1.0619719, + "epoch": 0.6685263562908811, + "flos": 514305884160.0, + "grad_norm": 0.07905158602596217, + "language_loss": 0.84220064, + "learning_rate": 0.0002615073242729483, + "loss": 0.85307765, + "num_input_tokens_seen": 288462976, + "router_z_loss_mlp": 0.25732422, + "step": 3475, + "time_per_iteration": 2.6173481941223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090195, + "balance_loss_mlp": 1.06363511, + "epoch": 0.6687187379761447, + "flos": 629772226560.0, + "grad_norm": 0.04794889281343623, + "language_loss": 0.84776723, + "learning_rate": 0.0002612335525470573, + "loss": 0.85866916, + "num_input_tokens_seen": 288542032, + "router_z_loss_mlp": 0.26586914, + "step": 3476, + "time_per_iteration": 2.819981575012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108965, + "balance_loss_mlp": 1.06361461, + "epoch": 0.6689111196614083, + "flos": 535586992128.0, + "grad_norm": 0.06414606112589924, + "language_loss": 0.7840637, + "learning_rate": 0.0002609598735185221, + "loss": 0.79496014, + "num_input_tokens_seen": 288610704, + "router_z_loss_mlp": 0.26062012, + "step": 3477, + "time_per_iteration": 2.6392619609832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085761, + "balance_loss_mlp": 1.0593915, + "epoch": 0.6691035013466718, + "flos": 603038048256.0, + "grad_norm": 0.054041595090679226, + "language_loss": 0.83408946, + "learning_rate": 0.00026068628729359445, + "loss": 0.8449471, + "num_input_tokens_seen": 288686080, + "router_z_loss_mlp": 0.26379395, + "step": 3478, + "time_per_iteration": 2.766197919845581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108263, + "balance_loss_mlp": 1.05621278, + "epoch": 0.6692958830319353, + "flos": 632855752704.0, + "grad_norm": 0.059772967228533376, + "language_loss": 0.76451987, + "learning_rate": 0.00026041279397848996, + "loss": 0.77534616, + "num_input_tokens_seen": 288764944, + "router_z_loss_mlp": 0.2644043, + "step": 3479, + "time_per_iteration": 2.8584389686584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077924, + "balance_loss_mlp": 1.05261552, + "epoch": 0.6694882647171989, + "flos": 645471783936.0, + "grad_norm": 0.051702403588613846, + "language_loss": 0.82616276, + "learning_rate": 0.00026013939367938797, + "loss": 0.83694196, + "num_input_tokens_seen": 288847856, + "router_z_loss_mlp": 0.25317383, + "step": 3480, + "time_per_iteration": 2.891376495361328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074221, + "balance_loss_mlp": 1.04828119, + "epoch": 0.6696806464024625, + "flos": 569585447424.0, + "grad_norm": 0.05419828241435922, + "language_loss": 0.81335235, + "learning_rate": 0.00025986608650243204, + "loss": 0.82409453, + "num_input_tokens_seen": 288929360, + "router_z_loss_mlp": 0.25952148, + "step": 3481, + "time_per_iteration": 2.77876353263855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073761, + "balance_loss_mlp": 1.04680765, + "epoch": 0.6698730280877261, + "flos": 622700669952.0, + "grad_norm": 0.051697162904794, + "language_loss": 0.79773414, + "learning_rate": 0.0002595928725537293, + "loss": 0.8084718, + "num_input_tokens_seen": 289010160, + "router_z_loss_mlp": 0.26965332, + "step": 3482, + "time_per_iteration": 2.8413639068603516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073841, + "balance_loss_mlp": 1.04836571, + "epoch": 0.6700654097729896, + "flos": 502507722240.0, + "grad_norm": 0.05767199414491062, + "language_loss": 0.88867986, + "learning_rate": 0.0002593197519393509, + "loss": 0.89941823, + "num_input_tokens_seen": 289077392, + "router_z_loss_mlp": 0.25500488, + "step": 3483, + "time_per_iteration": 2.603405475616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069253, + "balance_loss_mlp": 1.04446936, + "epoch": 0.6702577914582531, + "flos": 623876815872.0, + "grad_norm": 0.06697980614257329, + "language_loss": 0.79532218, + "learning_rate": 0.00025904672476533165, + "loss": 0.80601466, + "num_input_tokens_seen": 289157248, + "router_z_loss_mlp": 0.2479248, + "step": 3484, + "time_per_iteration": 2.84698224067688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070985, + "balance_loss_mlp": 1.04459202, + "epoch": 0.6704501731435167, + "flos": 456268764672.0, + "grad_norm": 0.05331322450394034, + "language_loss": 0.82924032, + "learning_rate": 0.0002587737911376704, + "loss": 0.83995014, + "num_input_tokens_seen": 289224864, + "router_z_loss_mlp": 0.26416016, + "step": 3485, + "time_per_iteration": 2.585921049118042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074192, + "balance_loss_mlp": 1.04729843, + "epoch": 0.6706425548287803, + "flos": 543229369344.0, + "grad_norm": 0.06756987561009595, + "language_loss": 0.84183806, + "learning_rate": 0.00025850095116232885, + "loss": 0.85257995, + "num_input_tokens_seen": 289293488, + "router_z_loss_mlp": 0.26953125, + "step": 3486, + "time_per_iteration": 2.7065019607543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075208, + "balance_loss_mlp": 1.04840994, + "epoch": 0.6708349365140439, + "flos": 633940494336.0, + "grad_norm": 0.05801175058434062, + "language_loss": 0.77675766, + "learning_rate": 0.000258228204945233, + "loss": 0.7875098, + "num_input_tokens_seen": 289370560, + "router_z_loss_mlp": 0.2677002, + "step": 3487, + "time_per_iteration": 2.8951704502105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071909, + "balance_loss_mlp": 1.04588532, + "epoch": 0.6710273181993074, + "flos": 640747749888.0, + "grad_norm": 0.05899101310847367, + "language_loss": 0.84739226, + "learning_rate": 0.00025795555259227254, + "loss": 0.85811132, + "num_input_tokens_seen": 289440096, + "router_z_loss_mlp": 0.26062012, + "step": 3488, + "time_per_iteration": 2.777141571044922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072765, + "balance_loss_mlp": 1.04677725, + "epoch": 0.671219699884571, + "flos": 553942789632.0, + "grad_norm": 0.0454202058547125, + "language_loss": 0.84104466, + "learning_rate": 0.00025768299420930046, + "loss": 0.85177231, + "num_input_tokens_seen": 289515808, + "router_z_loss_mlp": 0.2598877, + "step": 3489, + "time_per_iteration": 2.720435857772827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073548, + "balance_loss_mlp": 1.04736936, + "epoch": 0.6714120815698346, + "flos": 731508433920.0, + "grad_norm": 0.052981388085366045, + "language_loss": 0.83523858, + "learning_rate": 0.0002574105299021332, + "loss": 0.84597409, + "num_input_tokens_seen": 289591344, + "router_z_loss_mlp": 0.26220703, + "step": 3490, + "time_per_iteration": 2.874335289001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072546, + "balance_loss_mlp": 1.04605818, + "epoch": 0.6716044632550981, + "flos": 688664291328.0, + "grad_norm": 0.05653925915184199, + "language_loss": 0.84515595, + "learning_rate": 0.00025713815977655084, + "loss": 0.85588139, + "num_input_tokens_seen": 289672032, + "router_z_loss_mlp": 0.26501465, + "step": 3491, + "time_per_iteration": 2.857795000076294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107473, + "balance_loss_mlp": 1.04900455, + "epoch": 0.6717968449403616, + "flos": 460629752832.0, + "grad_norm": 0.0648375250519464, + "language_loss": 0.84809422, + "learning_rate": 0.0002568658839382969, + "loss": 0.85884148, + "num_input_tokens_seen": 289738304, + "router_z_loss_mlp": 0.25683594, + "step": 3492, + "time_per_iteration": 2.5480034351348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072741, + "balance_loss_mlp": 1.04731405, + "epoch": 0.6719892266256252, + "flos": 501608360448.0, + "grad_norm": 0.06366513295568171, + "language_loss": 0.84661782, + "learning_rate": 0.00025659370249307814, + "loss": 0.85734528, + "num_input_tokens_seen": 289804304, + "router_z_loss_mlp": 0.25439453, + "step": 3493, + "time_per_iteration": 2.602646589279175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072937, + "balance_loss_mlp": 1.04722357, + "epoch": 0.6721816083108888, + "flos": 683525081088.0, + "grad_norm": 0.05297099433671679, + "language_loss": 0.85274851, + "learning_rate": 0.00025632161554656473, + "loss": 0.86347795, + "num_input_tokens_seen": 289877696, + "router_z_loss_mlp": 0.25732422, + "step": 3494, + "time_per_iteration": 2.867612838745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071204, + "balance_loss_mlp": 1.04509759, + "epoch": 0.6723739899961524, + "flos": 585813980160.0, + "grad_norm": 0.05583877885035688, + "language_loss": 0.81951666, + "learning_rate": 0.00025604962320439017, + "loss": 0.83022875, + "num_input_tokens_seen": 289947296, + "router_z_loss_mlp": 0.26147461, + "step": 3495, + "time_per_iteration": 2.7493131160736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107172, + "balance_loss_mlp": 1.04625738, + "epoch": 0.672566371681416, + "flos": 506616519168.0, + "grad_norm": 0.056464737234764244, + "language_loss": 0.82197464, + "learning_rate": 0.0002557777255721516, + "loss": 0.83269185, + "num_input_tokens_seen": 290020080, + "router_z_loss_mlp": 0.2545166, + "step": 3496, + "time_per_iteration": 2.712113857269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068431, + "balance_loss_mlp": 1.04230046, + "epoch": 0.6727587533666795, + "flos": 535671055872.0, + "grad_norm": 0.0673285818829442, + "language_loss": 0.80758643, + "learning_rate": 0.0002555059227554087, + "loss": 0.8182708, + "num_input_tokens_seen": 290094544, + "router_z_loss_mlp": 0.26171875, + "step": 3497, + "time_per_iteration": 2.6871681213378906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107222, + "balance_loss_mlp": 1.04655433, + "epoch": 0.672951135051943, + "flos": 602832844800.0, + "grad_norm": 0.05408032607546607, + "language_loss": 0.7786265, + "learning_rate": 0.00025523421485968453, + "loss": 0.78934866, + "num_input_tokens_seen": 290173520, + "router_z_loss_mlp": 0.25695801, + "step": 3498, + "time_per_iteration": 2.822655439376831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072016, + "balance_loss_mlp": 1.04613543, + "epoch": 0.6731435167372066, + "flos": 811315989504.0, + "grad_norm": 0.05805760425239871, + "language_loss": 0.85567248, + "learning_rate": 0.00025496260199046585, + "loss": 0.86639267, + "num_input_tokens_seen": 290248240, + "router_z_loss_mlp": 0.25891113, + "step": 3499, + "time_per_iteration": 2.9368207454681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073883, + "balance_loss_mlp": 1.04759765, + "epoch": 0.6733358984224702, + "flos": 611594468352.0, + "grad_norm": 0.05807897060622665, + "language_loss": 0.84593326, + "learning_rate": 0.000254691084253202, + "loss": 0.85667205, + "num_input_tokens_seen": 290326288, + "router_z_loss_mlp": 0.26293945, + "step": 3500, + "time_per_iteration": 2.812175750732422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069637, + "balance_loss_mlp": 1.04343474, + "epoch": 0.6735282801077337, + "flos": 558901762560.0, + "grad_norm": 0.06730887087818041, + "language_loss": 0.77490008, + "learning_rate": 0.00025441966175330567, + "loss": 0.78559649, + "num_input_tokens_seen": 290395984, + "router_z_loss_mlp": 0.2623291, + "step": 3501, + "time_per_iteration": 2.6858127117156982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074737, + "balance_loss_mlp": 1.04904723, + "epoch": 0.6737206617929973, + "flos": 672433560576.0, + "grad_norm": 0.05973627548594562, + "language_loss": 0.7990756, + "learning_rate": 0.00025414833459615183, + "loss": 0.80982292, + "num_input_tokens_seen": 290470224, + "router_z_loss_mlp": 0.2565918, + "step": 3502, + "time_per_iteration": 2.792283296585083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079245, + "balance_loss_mlp": 1.05329359, + "epoch": 0.6739130434782609, + "flos": 633446396928.0, + "grad_norm": 0.054401429492937234, + "language_loss": 0.80582958, + "learning_rate": 0.0002538771028870796, + "loss": 0.81662202, + "num_input_tokens_seen": 290542864, + "router_z_loss_mlp": 0.2598877, + "step": 3503, + "time_per_iteration": 2.7585413455963135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073955, + "balance_loss_mlp": 1.04783654, + "epoch": 0.6741054251635245, + "flos": 531445888512.0, + "grad_norm": 0.064846065362636, + "language_loss": 0.81689268, + "learning_rate": 0.0002536059667313903, + "loss": 0.82763219, + "num_input_tokens_seen": 290617248, + "router_z_loss_mlp": 0.2611084, + "step": 3504, + "time_per_iteration": 2.71769118309021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074293, + "balance_loss_mlp": 1.04768562, + "epoch": 0.674297806848788, + "flos": 542604220416.0, + "grad_norm": 0.06348051765573881, + "language_loss": 0.89503717, + "learning_rate": 0.0002533349262343483, + "loss": 0.90578014, + "num_input_tokens_seen": 290690112, + "router_z_loss_mlp": 0.26635742, + "step": 3505, + "time_per_iteration": 2.660651445388794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079263, + "balance_loss_mlp": 1.05396676, + "epoch": 0.6744901885340515, + "flos": 463523129856.0, + "grad_norm": 0.07580313985305334, + "language_loss": 0.81963527, + "learning_rate": 0.0002530639815011807, + "loss": 0.83042789, + "num_input_tokens_seen": 290756352, + "router_z_loss_mlp": 0.25317383, + "step": 3506, + "time_per_iteration": 2.4884142875671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107741, + "balance_loss_mlp": 1.05192339, + "epoch": 0.6746825702193151, + "flos": 631830481920.0, + "grad_norm": 0.07059145793354948, + "language_loss": 0.85113943, + "learning_rate": 0.0002527931326370781, + "loss": 0.86191356, + "num_input_tokens_seen": 290829776, + "router_z_loss_mlp": 0.25512695, + "step": 3507, + "time_per_iteration": 2.7946653366088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078834, + "balance_loss_mlp": 1.05275106, + "epoch": 0.6748749519045787, + "flos": 671146186752.0, + "grad_norm": 0.06075343684572694, + "language_loss": 0.83284092, + "learning_rate": 0.00025252237974719276, + "loss": 0.84362924, + "num_input_tokens_seen": 290900736, + "router_z_loss_mlp": 0.26098633, + "step": 3508, + "time_per_iteration": 2.8548471927642822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108079, + "balance_loss_mlp": 1.05530286, + "epoch": 0.6750673335898423, + "flos": 767102980608.0, + "grad_norm": 0.06110839735898087, + "language_loss": 0.80529547, + "learning_rate": 0.00025225172293664056, + "loss": 0.81610334, + "num_input_tokens_seen": 290981696, + "router_z_loss_mlp": 0.25500488, + "step": 3509, + "time_per_iteration": 3.0396220684051514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013373, + "balance_loss_mlp": 1.00583911, + "epoch": 0.6752597152751059, + "flos": 1512607675392.0, + "grad_norm": 0.007570597102939453, + "language_loss": 0.76933134, + "learning_rate": 0.00025198116231049954, + "loss": 0.77946508, + "num_input_tokens_seen": 291217888, + "router_z_loss_mlp": 0.07519531, + "step": 3510, + "time_per_iteration": 4.9238317012786865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081964, + "balance_loss_mlp": 1.05588078, + "epoch": 0.6754520969603693, + "flos": 687297996288.0, + "grad_norm": 0.06266149701009033, + "language_loss": 0.85147846, + "learning_rate": 0.00025171069797381106, + "loss": 0.86229801, + "num_input_tokens_seen": 291287856, + "router_z_loss_mlp": 0.26123047, + "step": 3511, + "time_per_iteration": 2.842026948928833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107574, + "balance_loss_mlp": 1.05036068, + "epoch": 0.6756444786456329, + "flos": 500577947136.0, + "grad_norm": 0.05295851129049709, + "language_loss": 0.82071269, + "learning_rate": 0.00025144033003157864, + "loss": 0.83147007, + "num_input_tokens_seen": 291354912, + "router_z_loss_mlp": 0.25402832, + "step": 3512, + "time_per_iteration": 2.5853493213653564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087572, + "balance_loss_mlp": 1.06216824, + "epoch": 0.6758368603308965, + "flos": 492616940544.0, + "grad_norm": 0.10878048166540129, + "language_loss": 0.78940082, + "learning_rate": 0.00025117005858876806, + "loss": 0.80027652, + "num_input_tokens_seen": 291426816, + "router_z_loss_mlp": 0.25402832, + "step": 3513, + "time_per_iteration": 2.683076858520508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081602, + "balance_loss_mlp": 1.05658007, + "epoch": 0.6760292420161601, + "flos": 555934233600.0, + "grad_norm": 0.062477123984618736, + "language_loss": 0.8580628, + "learning_rate": 0.000250899883750308, + "loss": 0.86887884, + "num_input_tokens_seen": 291497648, + "router_z_loss_mlp": 0.25036621, + "step": 3514, + "time_per_iteration": 2.7132656574249268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081057, + "balance_loss_mlp": 1.05621386, + "epoch": 0.6762216237014236, + "flos": 607601668608.0, + "grad_norm": 0.06208222280166845, + "language_loss": 0.82150948, + "learning_rate": 0.00025062980562109006, + "loss": 0.83232003, + "num_input_tokens_seen": 291568080, + "router_z_loss_mlp": 0.24841309, + "step": 3515, + "time_per_iteration": 4.169267177581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080203, + "balance_loss_mlp": 1.0545373, + "epoch": 0.6764140053866872, + "flos": 533785697280.0, + "grad_norm": 0.06255733263360135, + "language_loss": 0.83099926, + "learning_rate": 0.0002503598243059677, + "loss": 0.84180129, + "num_input_tokens_seen": 291644896, + "router_z_loss_mlp": 0.25683594, + "step": 3516, + "time_per_iteration": 2.7749977111816406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085607, + "balance_loss_mlp": 1.05966699, + "epoch": 0.6766063870719508, + "flos": 504810455040.0, + "grad_norm": 0.06025675944988047, + "language_loss": 0.8034898, + "learning_rate": 0.0002500899399097568, + "loss": 0.8143459, + "num_input_tokens_seen": 291716864, + "router_z_loss_mlp": 0.25976562, + "step": 3517, + "time_per_iteration": 2.638920307159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087679, + "balance_loss_mlp": 1.06179833, + "epoch": 0.6767987687572143, + "flos": 513176726016.0, + "grad_norm": 0.06061041390288269, + "language_loss": 0.85528451, + "learning_rate": 0.0002498201525372359, + "loss": 0.86616129, + "num_input_tokens_seen": 291786000, + "router_z_loss_mlp": 0.25915527, + "step": 3518, + "time_per_iteration": 2.6280837059020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090365, + "balance_loss_mlp": 1.06465113, + "epoch": 0.6769911504424779, + "flos": 525039128064.0, + "grad_norm": 0.05678341042479038, + "language_loss": 0.83314502, + "learning_rate": 0.00024955046229314584, + "loss": 0.84404874, + "num_input_tokens_seen": 291854768, + "router_z_loss_mlp": 0.25732422, + "step": 3519, + "time_per_iteration": 2.598114013671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090498, + "balance_loss_mlp": 1.06486833, + "epoch": 0.6771835321277414, + "flos": 449896508928.0, + "grad_norm": 0.06076117053087645, + "language_loss": 0.87566268, + "learning_rate": 0.00024928086928218947, + "loss": 0.88656765, + "num_input_tokens_seen": 291918096, + "router_z_loss_mlp": 0.25646973, + "step": 3520, + "time_per_iteration": 2.4903347492218018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088735, + "balance_loss_mlp": 1.06373692, + "epoch": 0.677375913813005, + "flos": 709349985792.0, + "grad_norm": 0.07287675105407085, + "language_loss": 0.76298815, + "learning_rate": 0.00024901137360903216, + "loss": 0.77387547, + "num_input_tokens_seen": 291998752, + "router_z_loss_mlp": 0.25012207, + "step": 3521, + "time_per_iteration": 2.957127332687378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095619, + "balance_loss_mlp": 1.07063317, + "epoch": 0.6775682954982686, + "flos": 428420109312.0, + "grad_norm": 0.06312793336661301, + "language_loss": 0.80923325, + "learning_rate": 0.00024874197537830115, + "loss": 0.82018942, + "num_input_tokens_seen": 292065056, + "router_z_loss_mlp": 0.25, + "step": 3522, + "time_per_iteration": 2.5331904888153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088383, + "balance_loss_mlp": 1.06340837, + "epoch": 0.6777606771835322, + "flos": 437905626624.0, + "grad_norm": 0.06755999829243825, + "language_loss": 0.83245486, + "learning_rate": 0.00024847267469458684, + "loss": 0.84333861, + "num_input_tokens_seen": 292129248, + "router_z_loss_mlp": 0.24987793, + "step": 3523, + "time_per_iteration": 2.525132417678833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087699, + "balance_loss_mlp": 1.06227136, + "epoch": 0.6779530588687956, + "flos": 775442087424.0, + "grad_norm": 0.06413222868120108, + "language_loss": 0.7768755, + "learning_rate": 0.00024820347166244034, + "loss": 0.78775245, + "num_input_tokens_seen": 292206080, + "router_z_loss_mlp": 0.2545166, + "step": 3524, + "time_per_iteration": 2.981156826019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086699, + "balance_loss_mlp": 1.06202292, + "epoch": 0.6781454405540592, + "flos": 571782094848.0, + "grad_norm": 0.05504268755714505, + "language_loss": 0.85045242, + "learning_rate": 0.0002479343663863755, + "loss": 0.86131942, + "num_input_tokens_seen": 292280192, + "router_z_loss_mlp": 0.24682617, + "step": 3525, + "time_per_iteration": 2.8227763175964355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085391, + "balance_loss_mlp": 1.05880737, + "epoch": 0.6783378222393228, + "flos": 485026693632.0, + "grad_norm": 0.05863991257689852, + "language_loss": 0.76910073, + "learning_rate": 0.00024766535897086876, + "loss": 0.77995467, + "num_input_tokens_seen": 292347792, + "router_z_loss_mlp": 0.26623535, + "step": 3526, + "time_per_iteration": 2.5773653984069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088289, + "balance_loss_mlp": 1.06144333, + "epoch": 0.6785302039245864, + "flos": 482839958016.0, + "grad_norm": 0.09784293796140163, + "language_loss": 0.78738832, + "learning_rate": 0.0002473964495203578, + "loss": 0.79827124, + "num_input_tokens_seen": 292420032, + "router_z_loss_mlp": 0.26879883, + "step": 3527, + "time_per_iteration": 2.6880078315734863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084335, + "balance_loss_mlp": 1.0582881, + "epoch": 0.67872258560985, + "flos": 524732608512.0, + "grad_norm": 0.057535616480669176, + "language_loss": 0.85700953, + "learning_rate": 0.0002471276381392425, + "loss": 0.86785293, + "num_input_tokens_seen": 292497792, + "router_z_loss_mlp": 0.26062012, + "step": 3528, + "time_per_iteration": 2.7601518630981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028211, + "balance_loss_mlp": 1.02067733, + "epoch": 0.6789149672951135, + "flos": 1552605428736.0, + "grad_norm": 0.014996437557936866, + "language_loss": 0.78188634, + "learning_rate": 0.0002468589249318848, + "loss": 0.79216838, + "num_input_tokens_seen": 292726704, + "router_z_loss_mlp": 0.07519531, + "step": 3529, + "time_per_iteration": 4.95120096206665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088756, + "balance_loss_mlp": 1.06375766, + "epoch": 0.6791073489803771, + "flos": 741406556160.0, + "grad_norm": 0.06208247419260481, + "language_loss": 0.84420717, + "learning_rate": 0.00024659031000260826, + "loss": 0.85509473, + "num_input_tokens_seen": 292802320, + "router_z_loss_mlp": 0.25, + "step": 3530, + "time_per_iteration": 2.8619091510772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085263, + "balance_loss_mlp": 1.05816674, + "epoch": 0.6792997306656406, + "flos": 576365538816.0, + "grad_norm": 0.0739213834175869, + "language_loss": 0.80927098, + "learning_rate": 0.0002463217934556985, + "loss": 0.82012367, + "num_input_tokens_seen": 292870480, + "router_z_loss_mlp": 0.27111816, + "step": 3531, + "time_per_iteration": 2.6372668743133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015203, + "balance_loss_mlp": 1.00790787, + "epoch": 0.6794921123509042, + "flos": 1503337273344.0, + "grad_norm": 0.011067583088495437, + "language_loss": 0.7653209, + "learning_rate": 0.000246053375395403, + "loss": 0.77547294, + "num_input_tokens_seen": 293100752, + "router_z_loss_mlp": 0.07275391, + "step": 3532, + "time_per_iteration": 4.7275402545928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089408, + "balance_loss_mlp": 1.06364703, + "epoch": 0.6796844940361677, + "flos": 698923261440.0, + "grad_norm": 0.07509562800064129, + "language_loss": 0.83719718, + "learning_rate": 0.0002457850559259306, + "loss": 0.84809136, + "num_input_tokens_seen": 293178192, + "router_z_loss_mlp": 0.25769043, + "step": 3533, + "time_per_iteration": 2.9546730518341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082617, + "balance_loss_mlp": 1.05801249, + "epoch": 0.6798768757214313, + "flos": 552759303168.0, + "grad_norm": 0.058098360832657354, + "language_loss": 0.82016122, + "learning_rate": 0.00024551683515145275, + "loss": 0.83098733, + "num_input_tokens_seen": 293246368, + "router_z_loss_mlp": 0.24597168, + "step": 3534, + "time_per_iteration": 2.675198793411255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080796, + "balance_loss_mlp": 1.05546427, + "epoch": 0.6800692574066949, + "flos": 522936456192.0, + "grad_norm": 0.05760567747955486, + "language_loss": 0.8703866, + "learning_rate": 0.0002452487131761014, + "loss": 0.88119459, + "num_input_tokens_seen": 293320656, + "router_z_loss_mlp": 0.25354004, + "step": 3535, + "time_per_iteration": 2.7560551166534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080116, + "balance_loss_mlp": 1.0540328, + "epoch": 0.6802616390919585, + "flos": 574023158784.0, + "grad_norm": 0.06295067828117173, + "language_loss": 0.80308378, + "learning_rate": 0.00024498069010397093, + "loss": 0.81388497, + "num_input_tokens_seen": 293388592, + "router_z_loss_mlp": 0.26123047, + "step": 3536, + "time_per_iteration": 2.7834858894348145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081714, + "balance_loss_mlp": 1.05659688, + "epoch": 0.6804540207772221, + "flos": 488157207552.0, + "grad_norm": 0.05311413665555526, + "language_loss": 0.85467112, + "learning_rate": 0.00024471276603911697, + "loss": 0.86548829, + "num_input_tokens_seen": 293453936, + "router_z_loss_mlp": 0.2512207, + "step": 3537, + "time_per_iteration": 2.644085645675659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086182, + "balance_loss_mlp": 1.06095743, + "epoch": 0.6806464024624855, + "flos": 578594119680.0, + "grad_norm": 0.0668547033198753, + "language_loss": 0.79341853, + "learning_rate": 0.0002444449410855572, + "loss": 0.80428034, + "num_input_tokens_seen": 293527664, + "router_z_loss_mlp": 0.25231934, + "step": 3538, + "time_per_iteration": 2.790034532546997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083109, + "balance_loss_mlp": 1.0583849, + "epoch": 0.6808387841477491, + "flos": 553722905088.0, + "grad_norm": 0.056899188287429556, + "language_loss": 0.84389639, + "learning_rate": 0.00024417721534727033, + "loss": 0.85472751, + "num_input_tokens_seen": 293599344, + "router_z_loss_mlp": 0.24731445, + "step": 3539, + "time_per_iteration": 2.703143358230591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081155, + "balance_loss_mlp": 1.0562042, + "epoch": 0.6810311658330127, + "flos": 426841270272.0, + "grad_norm": 0.06562679569248508, + "language_loss": 0.83345222, + "learning_rate": 0.00024390958892819687, + "loss": 0.84426379, + "num_input_tokens_seen": 293663088, + "router_z_loss_mlp": 0.24938965, + "step": 3540, + "time_per_iteration": 2.5123190879821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083792, + "balance_loss_mlp": 1.0574708, + "epoch": 0.6812235475182763, + "flos": 572256368640.0, + "grad_norm": 0.0704351751694786, + "language_loss": 0.80845803, + "learning_rate": 0.0002436420619322381, + "loss": 0.81929594, + "num_input_tokens_seen": 293741296, + "router_z_loss_mlp": 0.26367188, + "step": 3541, + "time_per_iteration": 2.8810999393463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080954, + "balance_loss_mlp": 1.05532384, + "epoch": 0.6814159292035398, + "flos": 501917078016.0, + "grad_norm": 0.05740970422005706, + "language_loss": 0.82921457, + "learning_rate": 0.0002433746344632577, + "loss": 0.84002411, + "num_input_tokens_seen": 293815840, + "router_z_loss_mlp": 0.25634766, + "step": 3542, + "time_per_iteration": 2.7135009765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085031, + "balance_loss_mlp": 1.0591507, + "epoch": 0.6816083108888034, + "flos": 765531482112.0, + "grad_norm": 0.09305819117462581, + "language_loss": 0.80352092, + "learning_rate": 0.00024310730662508006, + "loss": 0.81437123, + "num_input_tokens_seen": 293896368, + "router_z_loss_mlp": 0.25891113, + "step": 3543, + "time_per_iteration": 3.061795949935913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080318, + "balance_loss_mlp": 1.05509281, + "epoch": 0.681800692574067, + "flos": 479459824128.0, + "grad_norm": 0.05668741815102704, + "language_loss": 0.87538439, + "learning_rate": 0.0002428400785214911, + "loss": 0.88618755, + "num_input_tokens_seen": 293963344, + "router_z_loss_mlp": 0.25231934, + "step": 3544, + "time_per_iteration": 2.600311279296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077375, + "balance_loss_mlp": 1.05138755, + "epoch": 0.6819930742593305, + "flos": 691604656128.0, + "grad_norm": 0.05461889595804736, + "language_loss": 0.8282584, + "learning_rate": 0.00024257295025623794, + "loss": 0.83903217, + "num_input_tokens_seen": 294035440, + "router_z_loss_mlp": 0.26025391, + "step": 3545, + "time_per_iteration": 2.9303810596466064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076909, + "balance_loss_mlp": 1.05181503, + "epoch": 0.6821854559445941, + "flos": 678096603648.0, + "grad_norm": 0.05463357395047058, + "language_loss": 0.80816013, + "learning_rate": 0.00024230592193302892, + "loss": 0.8189292, + "num_input_tokens_seen": 294116944, + "router_z_loss_mlp": 0.25085449, + "step": 3546, + "time_per_iteration": 3.0259780883789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108338, + "balance_loss_mlp": 1.05730915, + "epoch": 0.6823778376298576, + "flos": 462191339520.0, + "grad_norm": 0.061332624341889866, + "language_loss": 0.84813237, + "learning_rate": 0.00024203899365553372, + "loss": 0.85896623, + "num_input_tokens_seen": 294178976, + "router_z_loss_mlp": 0.2611084, + "step": 3547, + "time_per_iteration": 2.5990257263183594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101826, + "balance_loss_mlp": 1.01120329, + "epoch": 0.6825702193151212, + "flos": 1475298842112.0, + "grad_norm": 0.024302183931920462, + "language_loss": 0.76734358, + "learning_rate": 0.00024177216552738302, + "loss": 0.7775262, + "num_input_tokens_seen": 294384960, + "router_z_loss_mlp": 0.07080078, + "step": 3548, + "time_per_iteration": 4.529210090637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082512, + "balance_loss_mlp": 1.05796695, + "epoch": 0.6827626010003848, + "flos": 723114998784.0, + "grad_norm": 0.06743291659407481, + "language_loss": 0.83211255, + "learning_rate": 0.00024150543765216848, + "loss": 0.84293771, + "num_input_tokens_seen": 294461408, + "router_z_loss_mlp": 0.2454834, + "step": 3549, + "time_per_iteration": 2.9848315715789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079547, + "balance_loss_mlp": 1.05444109, + "epoch": 0.6829549826856484, + "flos": 558864686592.0, + "grad_norm": 0.06339760092568236, + "language_loss": 0.83246768, + "learning_rate": 0.00024123881013344352, + "loss": 0.84326315, + "num_input_tokens_seen": 294530624, + "router_z_loss_mlp": 0.25109863, + "step": 3550, + "time_per_iteration": 2.757277011871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078185, + "balance_loss_mlp": 1.05236471, + "epoch": 0.6831473643709118, + "flos": 624934393344.0, + "grad_norm": 0.05786884638385198, + "language_loss": 0.79739892, + "learning_rate": 0.00024097228307472202, + "loss": 0.80818081, + "num_input_tokens_seen": 294606784, + "router_z_loss_mlp": 0.25854492, + "step": 3551, + "time_per_iteration": 2.8328561782836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078138, + "balance_loss_mlp": 1.0525794, + "epoch": 0.6833397460561754, + "flos": 713861849088.0, + "grad_norm": 0.06566140613628157, + "language_loss": 0.81969666, + "learning_rate": 0.00024070585657947846, + "loss": 0.83047807, + "num_input_tokens_seen": 294686960, + "router_z_loss_mlp": 0.25585938, + "step": 3552, + "time_per_iteration": 2.962819814682007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081884, + "balance_loss_mlp": 1.05676627, + "epoch": 0.683532127741439, + "flos": 464704045056.0, + "grad_norm": 0.0534920389978937, + "language_loss": 0.8565321, + "learning_rate": 0.00024043953075114934, + "loss": 0.86735094, + "num_input_tokens_seen": 294759712, + "router_z_loss_mlp": 0.2512207, + "step": 3553, + "time_per_iteration": 2.638843059539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075297, + "balance_loss_mlp": 1.04947591, + "epoch": 0.6837245094267026, + "flos": 582251037696.0, + "grad_norm": 0.05485764052076591, + "language_loss": 0.88990396, + "learning_rate": 0.00024017330569313128, + "loss": 0.90065694, + "num_input_tokens_seen": 294830592, + "router_z_loss_mlp": 0.25842285, + "step": 3554, + "time_per_iteration": 2.7616748809814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078527, + "balance_loss_mlp": 1.05215812, + "epoch": 0.6839168911119662, + "flos": 794173413888.0, + "grad_norm": 0.07669249148194994, + "language_loss": 0.75058365, + "learning_rate": 0.0002399071815087821, + "loss": 0.76136887, + "num_input_tokens_seen": 294907504, + "router_z_loss_mlp": 0.26391602, + "step": 3555, + "time_per_iteration": 3.047292470932007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080363, + "balance_loss_mlp": 1.05511451, + "epoch": 0.6841092727972297, + "flos": 580009973760.0, + "grad_norm": 0.0595534971161133, + "language_loss": 0.84028351, + "learning_rate": 0.00023964115830142025, + "loss": 0.85108721, + "num_input_tokens_seen": 294977600, + "router_z_loss_mlp": 0.25256348, + "step": 3556, + "time_per_iteration": 2.708983898162842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074295, + "balance_loss_mlp": 1.05035782, + "epoch": 0.6843016544824932, + "flos": 383742738432.0, + "grad_norm": 0.0757977451950182, + "language_loss": 0.88028133, + "learning_rate": 0.00023937523617432522, + "loss": 0.89102429, + "num_input_tokens_seen": 295039408, + "router_z_loss_mlp": 0.23950195, + "step": 3557, + "time_per_iteration": 2.454397201538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077509, + "balance_loss_mlp": 1.05258226, + "epoch": 0.6844940361677568, + "flos": 1439035476480.0, + "grad_norm": 0.08760866739293877, + "language_loss": 0.87423909, + "learning_rate": 0.00023910941523073705, + "loss": 0.88501424, + "num_input_tokens_seen": 295142928, + "router_z_loss_mlp": 0.24938965, + "step": 3558, + "time_per_iteration": 3.9113569259643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083945, + "balance_loss_mlp": 1.05796981, + "epoch": 0.6846864178530204, + "flos": 520870860288.0, + "grad_norm": 0.053991545736228864, + "language_loss": 0.86934376, + "learning_rate": 0.0002388436955738566, + "loss": 0.88018322, + "num_input_tokens_seen": 295215504, + "router_z_loss_mlp": 0.2598877, + "step": 3559, + "time_per_iteration": 2.837038040161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080181, + "balance_loss_mlp": 1.05512345, + "epoch": 0.6848787995382839, + "flos": 717946053120.0, + "grad_norm": 0.06078167941241102, + "language_loss": 0.81248534, + "learning_rate": 0.00023857807730684523, + "loss": 0.82328713, + "num_input_tokens_seen": 295291024, + "router_z_loss_mlp": 0.25061035, + "step": 3560, + "time_per_iteration": 2.892477035522461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084991, + "balance_loss_mlp": 1.05795407, + "epoch": 0.6850711812235475, + "flos": 511061571072.0, + "grad_norm": 0.06645470458229728, + "language_loss": 0.82908154, + "learning_rate": 0.00023831256053282547, + "loss": 0.83993149, + "num_input_tokens_seen": 295363248, + "router_z_loss_mlp": 0.27050781, + "step": 3561, + "time_per_iteration": 2.724573850631714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081036, + "balance_loss_mlp": 1.05547762, + "epoch": 0.6852635629088111, + "flos": 668151493632.0, + "grad_norm": 0.06597218498580906, + "language_loss": 0.78399622, + "learning_rate": 0.00023804714535488003, + "loss": 0.7948066, + "num_input_tokens_seen": 295442032, + "router_z_loss_mlp": 0.25561523, + "step": 3562, + "time_per_iteration": 2.95060133934021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019571, + "balance_loss_mlp": 1.01251411, + "epoch": 0.6854559445940747, + "flos": 1522980071424.0, + "grad_norm": 0.015166594487017694, + "language_loss": 0.7980963, + "learning_rate": 0.0002377818318760519, + "loss": 0.80829203, + "num_input_tokens_seen": 295680560, + "router_z_loss_mlp": 0.07080078, + "step": 3563, + "time_per_iteration": 4.933622360229492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087969, + "balance_loss_mlp": 1.0631851, + "epoch": 0.6856483262793382, + "flos": 454203168768.0, + "grad_norm": 0.058645114783078524, + "language_loss": 0.81150877, + "learning_rate": 0.00023751662019934488, + "loss": 0.82238841, + "num_input_tokens_seen": 295745712, + "router_z_loss_mlp": 0.2479248, + "step": 3564, + "time_per_iteration": 2.551375150680542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080841, + "balance_loss_mlp": 1.05612862, + "epoch": 0.6858407079646017, + "flos": 615552763392.0, + "grad_norm": 0.05683958550718021, + "language_loss": 0.79323113, + "learning_rate": 0.00023725151042772364, + "loss": 0.80403948, + "num_input_tokens_seen": 295815104, + "router_z_loss_mlp": 0.24719238, + "step": 3565, + "time_per_iteration": 2.8488030433654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081595, + "balance_loss_mlp": 1.05563116, + "epoch": 0.6860330896498653, + "flos": 466053087744.0, + "grad_norm": 0.06643768922422526, + "language_loss": 0.83425218, + "learning_rate": 0.00023698650266411276, + "loss": 0.84506816, + "num_input_tokens_seen": 295882928, + "router_z_loss_mlp": 0.26000977, + "step": 3566, + "time_per_iteration": 2.704754590988159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079538, + "balance_loss_mlp": 1.05554175, + "epoch": 0.6862254713351289, + "flos": 864270425088.0, + "grad_norm": 0.06089372321072988, + "language_loss": 0.83402336, + "learning_rate": 0.00023672159701139755, + "loss": 0.84481871, + "num_input_tokens_seen": 295970960, + "router_z_loss_mlp": 0.23986816, + "step": 3567, + "time_per_iteration": 3.2112581729888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084436, + "balance_loss_mlp": 1.05952144, + "epoch": 0.6864178530203925, + "flos": 447141523968.0, + "grad_norm": 0.06475688467901158, + "language_loss": 0.86233699, + "learning_rate": 0.00023645679357242296, + "loss": 0.87318128, + "num_input_tokens_seen": 296036128, + "router_z_loss_mlp": 0.24890137, + "step": 3568, + "time_per_iteration": 2.618299961090088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077978, + "balance_loss_mlp": 1.05325365, + "epoch": 0.6866102347056561, + "flos": 424269093888.0, + "grad_norm": 0.06930985258360142, + "language_loss": 0.84079957, + "learning_rate": 0.00023619209244999534, + "loss": 0.85157931, + "num_input_tokens_seen": 296101440, + "router_z_loss_mlp": 0.24694824, + "step": 3569, + "time_per_iteration": 2.5762784481048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082729, + "balance_loss_mlp": 1.05775487, + "epoch": 0.6868026163909196, + "flos": 472373586432.0, + "grad_norm": 0.07239946064246126, + "language_loss": 0.84962302, + "learning_rate": 0.0002359274937468806, + "loss": 0.86045027, + "num_input_tokens_seen": 296165504, + "router_z_loss_mlp": 0.24975586, + "step": 3570, + "time_per_iteration": 2.507097005844116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080008, + "balance_loss_mlp": 1.0555582, + "epoch": 0.6869949980761831, + "flos": 464190124032.0, + "grad_norm": 0.052246818326421945, + "language_loss": 0.78233075, + "learning_rate": 0.00023566299756580512, + "loss": 0.79313087, + "num_input_tokens_seen": 296236880, + "router_z_loss_mlp": 0.2442627, + "step": 3571, + "time_per_iteration": 2.6490540504455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081651, + "balance_loss_mlp": 1.05523372, + "epoch": 0.6871873797614467, + "flos": 426235944960.0, + "grad_norm": 0.06589873086425142, + "language_loss": 0.78497767, + "learning_rate": 0.0002353986040094551, + "loss": 0.79579425, + "num_input_tokens_seen": 296299776, + "router_z_loss_mlp": 0.2644043, + "step": 3572, + "time_per_iteration": 2.525590419769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079016, + "balance_loss_mlp": 1.05405378, + "epoch": 0.6873797614467103, + "flos": 443625569280.0, + "grad_norm": 0.058453848630334905, + "language_loss": 0.79833031, + "learning_rate": 0.00023513431318047796, + "loss": 0.80912042, + "num_input_tokens_seen": 296365408, + "router_z_loss_mlp": 0.24975586, + "step": 3573, + "time_per_iteration": 2.5652148723602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081007, + "balance_loss_mlp": 1.0563786, + "epoch": 0.6875721431319738, + "flos": 992323436544.0, + "grad_norm": 0.12934714491167457, + "language_loss": 0.77343333, + "learning_rate": 0.00023487012518147977, + "loss": 0.78424335, + "num_input_tokens_seen": 296445488, + "router_z_loss_mlp": 0.24621582, + "step": 3574, + "time_per_iteration": 3.2728779315948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082155, + "balance_loss_mlp": 1.05660903, + "epoch": 0.6877645248172374, + "flos": 1285513638912.0, + "grad_norm": 0.06788347581923994, + "language_loss": 0.8458752, + "learning_rate": 0.00023460604011502772, + "loss": 0.85669678, + "num_input_tokens_seen": 296529936, + "router_z_loss_mlp": 0.25549316, + "step": 3575, + "time_per_iteration": 3.650050163269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071019, + "balance_loss_mlp": 1.04640222, + "epoch": 0.687956906502501, + "flos": 876733383168.0, + "grad_norm": 0.06594699265094836, + "language_loss": 0.85666633, + "learning_rate": 0.00023434205808364845, + "loss": 0.86737645, + "num_input_tokens_seen": 296607488, + "router_z_loss_mlp": 0.24621582, + "step": 3576, + "time_per_iteration": 3.2174363136291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081926, + "balance_loss_mlp": 1.05646336, + "epoch": 0.6881492881877646, + "flos": 563324419584.0, + "grad_norm": 0.073624827285274, + "language_loss": 0.85645866, + "learning_rate": 0.00023407817918982932, + "loss": 0.86727792, + "num_input_tokens_seen": 296678672, + "router_z_loss_mlp": 0.25488281, + "step": 3577, + "time_per_iteration": 2.8009090423583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088022, + "balance_loss_mlp": 1.06271362, + "epoch": 0.6883416698730281, + "flos": 795127104000.0, + "grad_norm": 0.06549349473125507, + "language_loss": 0.79113662, + "learning_rate": 0.00023381440353601718, + "loss": 0.80201685, + "num_input_tokens_seen": 296758896, + "router_z_loss_mlp": 0.2532959, + "step": 3578, + "time_per_iteration": 3.023149251937866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080761, + "balance_loss_mlp": 1.05627584, + "epoch": 0.6885340515582916, + "flos": 723621579264.0, + "grad_norm": 0.05959315999492073, + "language_loss": 0.86070436, + "learning_rate": 0.00023355073122461822, + "loss": 0.87151194, + "num_input_tokens_seen": 296830736, + "router_z_loss_mlp": 0.24487305, + "step": 3579, + "time_per_iteration": 2.9520890712738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084384, + "balance_loss_mlp": 1.05880141, + "epoch": 0.6887264332435552, + "flos": 1010926282752.0, + "grad_norm": 0.06355756593191678, + "language_loss": 0.82827502, + "learning_rate": 0.00023328716235799973, + "loss": 0.83911884, + "num_input_tokens_seen": 296911504, + "router_z_loss_mlp": 0.25598145, + "step": 3580, + "time_per_iteration": 3.351285219192505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080812, + "balance_loss_mlp": 1.05680299, + "epoch": 0.6889188149288188, + "flos": 585262983168.0, + "grad_norm": 0.05871142943590934, + "language_loss": 0.84103072, + "learning_rate": 0.00023302369703848803, + "loss": 0.85183883, + "num_input_tokens_seen": 296981488, + "router_z_loss_mlp": 0.24023438, + "step": 3581, + "time_per_iteration": 2.7034530639648438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088103, + "balance_loss_mlp": 1.06281841, + "epoch": 0.6891111966140824, + "flos": 636119889408.0, + "grad_norm": 0.05872811421519248, + "language_loss": 0.80432433, + "learning_rate": 0.00023276033536836937, + "loss": 0.81520534, + "num_input_tokens_seen": 297054896, + "router_z_loss_mlp": 0.25305176, + "step": 3582, + "time_per_iteration": 2.933551073074341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077987, + "balance_loss_mlp": 1.05369234, + "epoch": 0.6893035782993459, + "flos": 495270609408.0, + "grad_norm": 0.06546577273757126, + "language_loss": 0.84750611, + "learning_rate": 0.00023249707744988984, + "loss": 0.85828596, + "num_input_tokens_seen": 297128224, + "router_z_loss_mlp": 0.24279785, + "step": 3583, + "time_per_iteration": 2.694974184036255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083842, + "balance_loss_mlp": 1.05804539, + "epoch": 0.6894959599846094, + "flos": 458215792128.0, + "grad_norm": 0.07473355522869814, + "language_loss": 0.82210362, + "learning_rate": 0.00023223392338525529, + "loss": 0.83294201, + "num_input_tokens_seen": 297191312, + "router_z_loss_mlp": 0.25830078, + "step": 3584, + "time_per_iteration": 2.5522758960723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078352, + "balance_loss_mlp": 1.05215001, + "epoch": 0.689688341669873, + "flos": 505003175424.0, + "grad_norm": 0.05831544334966422, + "language_loss": 0.78814328, + "learning_rate": 0.00023197087327663107, + "loss": 0.79892683, + "num_input_tokens_seen": 297261904, + "router_z_loss_mlp": 0.26208496, + "step": 3585, + "time_per_iteration": 2.6880340576171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083733, + "balance_loss_mlp": 1.05843663, + "epoch": 0.6898807233551366, + "flos": 763910797824.0, + "grad_norm": 0.6312762348239643, + "language_loss": 0.81380439, + "learning_rate": 0.00023170792722614243, + "loss": 0.8246417, + "num_input_tokens_seen": 297338352, + "router_z_loss_mlp": 0.25317383, + "step": 3586, + "time_per_iteration": 3.0318641662597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079757, + "balance_loss_mlp": 1.05460346, + "epoch": 0.6900731050404002, + "flos": 583337977344.0, + "grad_norm": 0.05006567848129158, + "language_loss": 0.83709162, + "learning_rate": 0.00023144508533587377, + "loss": 0.84788913, + "num_input_tokens_seen": 297416688, + "router_z_loss_mlp": 0.25170898, + "step": 3587, + "time_per_iteration": 2.8474464416503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090758, + "balance_loss_mlp": 1.06386399, + "epoch": 0.6902654867256637, + "flos": 711865262592.0, + "grad_norm": 0.06762785817059219, + "language_loss": 0.79246032, + "learning_rate": 0.0002311823477078698, + "loss": 0.80336785, + "num_input_tokens_seen": 297499968, + "router_z_loss_mlp": 0.26928711, + "step": 3588, + "time_per_iteration": 2.9889235496520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097034, + "balance_loss_mlp": 1.0714879, + "epoch": 0.6904578684109273, + "flos": 597112902144.0, + "grad_norm": 0.09415937130110832, + "language_loss": 0.85614562, + "learning_rate": 0.00023091971444413428, + "loss": 0.86711591, + "num_input_tokens_seen": 297574480, + "router_z_loss_mlp": 0.2557373, + "step": 3589, + "time_per_iteration": 2.809373378753662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101509, + "balance_loss_mlp": 1.07424605, + "epoch": 0.6906502500961909, + "flos": 585040527360.0, + "grad_norm": 0.05794959755729282, + "language_loss": 0.82868153, + "learning_rate": 0.00023065718564663012, + "loss": 0.83969659, + "num_input_tokens_seen": 297645360, + "router_z_loss_mlp": 0.27307129, + "step": 3590, + "time_per_iteration": 2.7731661796569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074688, + "balance_loss_mlp": 1.06705844, + "epoch": 0.6908426317814544, + "flos": 1587827017728.0, + "grad_norm": 0.02655452112357536, + "language_loss": 0.73911589, + "learning_rate": 0.00023039476141728011, + "loss": 0.74986279, + "num_input_tokens_seen": 297879472, + "router_z_loss_mlp": 0.07617188, + "step": 3591, + "time_per_iteration": 4.988200664520264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110259, + "balance_loss_mlp": 1.07704329, + "epoch": 0.6910350134667179, + "flos": 500780579328.0, + "grad_norm": 0.05972599436202674, + "language_loss": 0.81043237, + "learning_rate": 0.0002301324418579666, + "loss": 0.82145822, + "num_input_tokens_seen": 297950672, + "router_z_loss_mlp": 0.2557373, + "step": 3592, + "time_per_iteration": 2.6833419799804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010908, + "balance_loss_mlp": 1.0828371, + "epoch": 0.6912273951519815, + "flos": 1409194257408.0, + "grad_norm": 0.028191154698104088, + "language_loss": 0.78688473, + "learning_rate": 0.00022987022707053107, + "loss": 0.79779273, + "num_input_tokens_seen": 298171728, + "router_z_loss_mlp": 0.07958984, + "step": 3593, + "time_per_iteration": 4.760195732116699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108367, + "balance_loss_mlp": 1.08173525, + "epoch": 0.6914197768372451, + "flos": 635279625216.0, + "grad_norm": 0.065561015733832, + "language_loss": 0.809973, + "learning_rate": 0.00022960811715677415, + "loss": 0.82105672, + "num_input_tokens_seen": 298250304, + "router_z_loss_mlp": 0.26660156, + "step": 3594, + "time_per_iteration": 2.897792339324951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117, + "balance_loss_mlp": 1.08976054, + "epoch": 0.6916121585225087, + "flos": 558044246016.0, + "grad_norm": 0.0669935961338165, + "language_loss": 0.81794119, + "learning_rate": 0.00022934611221845608, + "loss": 0.82911116, + "num_input_tokens_seen": 298328000, + "router_z_loss_mlp": 0.27258301, + "step": 3595, + "time_per_iteration": 2.8457274436950684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112139, + "balance_loss_mlp": 1.08326638, + "epoch": 0.6918045402077723, + "flos": 529167748608.0, + "grad_norm": 0.05592882614281094, + "language_loss": 0.78289419, + "learning_rate": 0.00022908421235729609, + "loss": 0.79401559, + "num_input_tokens_seen": 298406832, + "router_z_loss_mlp": 0.28881836, + "step": 3596, + "time_per_iteration": 2.7383065223693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108605, + "balance_loss_mlp": 1.08028126, + "epoch": 0.6919969218930357, + "flos": 570351559680.0, + "grad_norm": 0.10609288335258749, + "language_loss": 0.85567772, + "learning_rate": 0.0002288224176749728, + "loss": 0.86676377, + "num_input_tokens_seen": 298477584, + "router_z_loss_mlp": 0.28320312, + "step": 3597, + "time_per_iteration": 2.716928720474243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102474, + "balance_loss_mlp": 1.07592607, + "epoch": 0.6921893035782993, + "flos": 683305196544.0, + "grad_norm": 0.07082611334178894, + "language_loss": 0.78666878, + "learning_rate": 0.00022856072827312385, + "loss": 0.79769349, + "num_input_tokens_seen": 298551872, + "router_z_loss_mlp": 0.26525879, + "step": 3598, + "time_per_iteration": 2.9266068935394287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102937, + "balance_loss_mlp": 1.07671118, + "epoch": 0.6923816852635629, + "flos": 546745324032.0, + "grad_norm": 0.06087087584265889, + "language_loss": 0.77196717, + "learning_rate": 0.00022829914425334598, + "loss": 0.78299654, + "num_input_tokens_seen": 298619680, + "router_z_loss_mlp": 0.26269531, + "step": 3599, + "time_per_iteration": 2.654209852218628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099143, + "balance_loss_mlp": 1.07294059, + "epoch": 0.6925740669488265, + "flos": 510036300288.0, + "grad_norm": 0.0619495663998332, + "language_loss": 0.80632389, + "learning_rate": 0.0002280376657171956, + "loss": 0.81731534, + "num_input_tokens_seen": 298690080, + "router_z_loss_mlp": 0.26245117, + "step": 3600, + "time_per_iteration": 2.699690818786621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110091, + "balance_loss_mlp": 1.07408822, + "epoch": 0.69276644863409, + "flos": 869424689664.0, + "grad_norm": 0.061197826149154644, + "language_loss": 0.76475906, + "learning_rate": 0.00022777629276618706, + "loss": 0.77576816, + "num_input_tokens_seen": 298777712, + "router_z_loss_mlp": 0.26855469, + "step": 3601, + "time_per_iteration": 3.2044432163238525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105999, + "balance_loss_mlp": 1.07805634, + "epoch": 0.6929588303193536, + "flos": 625772086272.0, + "grad_norm": 0.05964780177227117, + "language_loss": 0.77385223, + "learning_rate": 0.0002275150255017947, + "loss": 0.78491223, + "num_input_tokens_seen": 298854368, + "router_z_loss_mlp": 0.2800293, + "step": 3602, + "time_per_iteration": 2.7982289791107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073503, + "balance_loss_mlp": 1.06525421, + "epoch": 0.6931512120046172, + "flos": 1545382996992.0, + "grad_norm": 0.02252774148051873, + "language_loss": 0.75732672, + "learning_rate": 0.0002272538640254511, + "loss": 0.76806176, + "num_input_tokens_seen": 299091664, + "router_z_loss_mlp": 0.08251953, + "step": 3603, + "time_per_iteration": 5.054601192474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072901, + "balance_loss_mlp": 1.06465173, + "epoch": 0.6933435936898807, + "flos": 1448230606848.0, + "grad_norm": 0.023702106563631756, + "language_loss": 0.75127101, + "learning_rate": 0.0002269928084385487, + "loss": 0.76200008, + "num_input_tokens_seen": 299312656, + "router_z_loss_mlp": 0.08251953, + "step": 3604, + "time_per_iteration": 4.7977614402771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116592, + "balance_loss_mlp": 1.08946013, + "epoch": 0.6935359753751443, + "flos": 540896901120.0, + "grad_norm": 0.06388542496956687, + "language_loss": 0.85052603, + "learning_rate": 0.0002267318588424379, + "loss": 0.86169201, + "num_input_tokens_seen": 299381136, + "router_z_loss_mlp": 0.2713623, + "step": 3605, + "time_per_iteration": 2.654792308807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110494, + "balance_loss_mlp": 1.08425605, + "epoch": 0.6937283570604078, + "flos": 719396411904.0, + "grad_norm": 0.06333584007687255, + "language_loss": 0.87824345, + "learning_rate": 0.00022647101533842845, + "loss": 0.88934839, + "num_input_tokens_seen": 299455216, + "router_z_loss_mlp": 0.26257324, + "step": 3606, + "time_per_iteration": 2.8975396156311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109552, + "balance_loss_mlp": 1.08295608, + "epoch": 0.6939207387456714, + "flos": 522165574656.0, + "grad_norm": 0.1091990827020025, + "language_loss": 0.76831424, + "learning_rate": 0.00022621027802778872, + "loss": 0.77940977, + "num_input_tokens_seen": 299524352, + "router_z_loss_mlp": 0.26623535, + "step": 3607, + "time_per_iteration": 2.63248348236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108534, + "balance_loss_mlp": 1.08149719, + "epoch": 0.694113120430935, + "flos": 535359767040.0, + "grad_norm": 0.059104440190076296, + "language_loss": 0.78716248, + "learning_rate": 0.00022594964701174586, + "loss": 0.79824781, + "num_input_tokens_seen": 299594960, + "router_z_loss_mlp": 0.27075195, + "step": 3608, + "time_per_iteration": 2.681976079940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111794, + "balance_loss_mlp": 1.08559155, + "epoch": 0.6943055021161986, + "flos": 523358972928.0, + "grad_norm": 0.07590462116844392, + "language_loss": 0.84867048, + "learning_rate": 0.00022568912239148586, + "loss": 0.85978842, + "num_input_tokens_seen": 299662560, + "router_z_loss_mlp": 0.26245117, + "step": 3609, + "time_per_iteration": 2.6417384147644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101276, + "balance_loss_mlp": 1.07528806, + "epoch": 0.694497883801462, + "flos": 484902982656.0, + "grad_norm": 0.058005826874071686, + "language_loss": 0.81464773, + "learning_rate": 0.00022542870426815344, + "loss": 0.82566053, + "num_input_tokens_seen": 299734896, + "router_z_loss_mlp": 0.26000977, + "step": 3610, + "time_per_iteration": 2.7006101608276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109994, + "balance_loss_mlp": 1.08157444, + "epoch": 0.6946902654867256, + "flos": 461474786304.0, + "grad_norm": 0.056828094701861585, + "language_loss": 0.86496603, + "learning_rate": 0.00022516839274285173, + "loss": 0.87606597, + "num_input_tokens_seen": 299799424, + "router_z_loss_mlp": 0.28442383, + "step": 3611, + "time_per_iteration": 2.5740535259246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094552, + "balance_loss_mlp": 1.06758666, + "epoch": 0.6948826471719892, + "flos": 512855525376.0, + "grad_norm": 0.08027595543675893, + "language_loss": 0.74892008, + "learning_rate": 0.00022490818791664265, + "loss": 0.75986564, + "num_input_tokens_seen": 299868272, + "router_z_loss_mlp": 0.26977539, + "step": 3612, + "time_per_iteration": 2.608222007751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098839, + "balance_loss_mlp": 1.072685, + "epoch": 0.6950750288572528, + "flos": 557184531456.0, + "grad_norm": 0.059039400605863955, + "language_loss": 0.85845947, + "learning_rate": 0.00022464808989054676, + "loss": 0.86944789, + "num_input_tokens_seen": 299939136, + "router_z_loss_mlp": 0.26171875, + "step": 3613, + "time_per_iteration": 2.676614999771118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108263, + "balance_loss_mlp": 1.08036768, + "epoch": 0.6952674105425164, + "flos": 542475740160.0, + "grad_norm": 0.062091067173502004, + "language_loss": 0.76033241, + "learning_rate": 0.00022438809876554284, + "loss": 0.77141511, + "num_input_tokens_seen": 300009472, + "router_z_loss_mlp": 0.27905273, + "step": 3614, + "time_per_iteration": 2.6178860664367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104393, + "balance_loss_mlp": 1.07635498, + "epoch": 0.6954597922277799, + "flos": 546742752768.0, + "grad_norm": 0.07239671718654846, + "language_loss": 0.80618018, + "learning_rate": 0.00022412821464256873, + "loss": 0.81722414, + "num_input_tokens_seen": 300081008, + "router_z_loss_mlp": 0.28051758, + "step": 3615, + "time_per_iteration": 2.690284252166748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094366, + "balance_loss_mlp": 1.06802058, + "epoch": 0.6956521739130435, + "flos": 519511905792.0, + "grad_norm": 0.05319621307951733, + "language_loss": 0.82896477, + "learning_rate": 0.00022386843762252023, + "loss": 0.83990836, + "num_input_tokens_seen": 300149856, + "router_z_loss_mlp": 0.2635498, + "step": 3616, + "time_per_iteration": 2.600942611694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102453, + "balance_loss_mlp": 1.07486832, + "epoch": 0.695844555598307, + "flos": 466275543552.0, + "grad_norm": 0.06580678033513349, + "language_loss": 0.79979908, + "learning_rate": 0.00022360876780625193, + "loss": 0.81082356, + "num_input_tokens_seen": 300217344, + "router_z_loss_mlp": 0.27587891, + "step": 3617, + "time_per_iteration": 2.645925998687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095024, + "balance_loss_mlp": 1.06762934, + "epoch": 0.6960369372835706, + "flos": 600663361536.0, + "grad_norm": 0.0499393728898112, + "language_loss": 0.8003695, + "learning_rate": 0.00022334920529457604, + "loss": 0.81131971, + "num_input_tokens_seen": 300305584, + "router_z_loss_mlp": 0.27441406, + "step": 3618, + "time_per_iteration": 2.899454116821289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089532, + "balance_loss_mlp": 1.06254315, + "epoch": 0.6962293189688342, + "flos": 644233969152.0, + "grad_norm": 0.05309035379190974, + "language_loss": 0.87337005, + "learning_rate": 0.00022308975018826423, + "loss": 0.88426542, + "num_input_tokens_seen": 300386480, + "router_z_loss_mlp": 0.27026367, + "step": 3619, + "time_per_iteration": 2.912917375564575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095664, + "balance_loss_mlp": 1.06719649, + "epoch": 0.6964217006540977, + "flos": 638810634240.0, + "grad_norm": 0.06796578820751965, + "language_loss": 0.84640574, + "learning_rate": 0.00022283040258804564, + "loss": 0.85736233, + "num_input_tokens_seen": 300461840, + "router_z_loss_mlp": 0.28466797, + "step": 3620, + "time_per_iteration": 2.8118083477020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094233, + "balance_loss_mlp": 1.06671929, + "epoch": 0.6966140823393613, + "flos": 652167811584.0, + "grad_norm": 0.05989374057808202, + "language_loss": 0.8382861, + "learning_rate": 0.00022257116259460802, + "loss": 0.84922838, + "num_input_tokens_seen": 300540400, + "router_z_loss_mlp": 0.27539062, + "step": 3621, + "time_per_iteration": 2.8895604610443115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087446, + "balance_loss_mlp": 1.06112456, + "epoch": 0.6968064640246249, + "flos": 704492328960.0, + "grad_norm": 0.08406713908768157, + "language_loss": 0.81423789, + "learning_rate": 0.00022231203030859725, + "loss": 0.82511234, + "num_input_tokens_seen": 300624240, + "router_z_loss_mlp": 0.26367188, + "step": 3622, + "time_per_iteration": 2.971266269683838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094655, + "balance_loss_mlp": 1.06714153, + "epoch": 0.6969988457098885, + "flos": 492555271680.0, + "grad_norm": 0.06551084245575202, + "language_loss": 0.83678401, + "learning_rate": 0.00022205300583061737, + "loss": 0.84773052, + "num_input_tokens_seen": 300689728, + "router_z_loss_mlp": 0.27539062, + "step": 3623, + "time_per_iteration": 2.585472822189331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108403, + "balance_loss_mlp": 1.07649624, + "epoch": 0.6971912273951519, + "flos": 1352592442368.0, + "grad_norm": 0.033083333186048725, + "language_loss": 0.82838202, + "learning_rate": 0.00022179408926123063, + "loss": 0.83922231, + "num_input_tokens_seen": 300913152, + "router_z_loss_mlp": 0.07519531, + "step": 3624, + "time_per_iteration": 4.895202159881592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090164, + "balance_loss_mlp": 1.06300831, + "epoch": 0.6973836090804155, + "flos": 602459887104.0, + "grad_norm": 0.0660307641542608, + "language_loss": 0.77727789, + "learning_rate": 0.00022153528070095735, + "loss": 0.78817952, + "num_input_tokens_seen": 300985824, + "router_z_loss_mlp": 0.27197266, + "step": 3625, + "time_per_iteration": 2.701016902923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085494, + "balance_loss_mlp": 1.05917203, + "epoch": 0.6975759907656791, + "flos": 524065614336.0, + "grad_norm": 0.07343943993793525, + "language_loss": 0.88176632, + "learning_rate": 0.00022127658025027568, + "loss": 0.89262128, + "num_input_tokens_seen": 301058048, + "router_z_loss_mlp": 0.26330566, + "step": 3626, + "time_per_iteration": 2.66186261177063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087139, + "balance_loss_mlp": 1.0592438, + "epoch": 0.6977683724509427, + "flos": 480912754176.0, + "grad_norm": 0.05867128849985362, + "language_loss": 0.85380179, + "learning_rate": 0.00022101798800962258, + "loss": 0.86467314, + "num_input_tokens_seen": 301127472, + "router_z_loss_mlp": 0.27905273, + "step": 3627, + "time_per_iteration": 2.61289119720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088464, + "balance_loss_mlp": 1.06195211, + "epoch": 0.6979607541362063, + "flos": 522625167360.0, + "grad_norm": 0.06919874176804652, + "language_loss": 0.78915298, + "learning_rate": 0.00022075950407939227, + "loss": 0.80003762, + "num_input_tokens_seen": 301193920, + "router_z_loss_mlp": 0.26550293, + "step": 3628, + "time_per_iteration": 2.6066434383392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082826, + "balance_loss_mlp": 1.05665994, + "epoch": 0.6981531358214698, + "flos": 548077114368.0, + "grad_norm": 0.06455342757001964, + "language_loss": 0.83102697, + "learning_rate": 0.0002205011285599367, + "loss": 0.84185529, + "num_input_tokens_seen": 301264256, + "router_z_loss_mlp": 0.26208496, + "step": 3629, + "time_per_iteration": 2.627265691757202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084671, + "balance_loss_mlp": 1.05813527, + "epoch": 0.6983455175067333, + "flos": 700052419584.0, + "grad_norm": 0.05600849207785957, + "language_loss": 0.80451405, + "learning_rate": 0.00022024286155156658, + "loss": 0.81536078, + "num_input_tokens_seen": 301337696, + "router_z_loss_mlp": 0.26586914, + "step": 3630, + "time_per_iteration": 2.8945116996765137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080924, + "balance_loss_mlp": 1.05462611, + "epoch": 0.6985378991919969, + "flos": 485078450688.0, + "grad_norm": 0.05471727557268105, + "language_loss": 0.86118478, + "learning_rate": 0.00021998470315454994, + "loss": 0.87199402, + "num_input_tokens_seen": 301407776, + "router_z_loss_mlp": 0.26306152, + "step": 3631, + "time_per_iteration": 2.711768627166748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108459, + "balance_loss_mlp": 1.05755305, + "epoch": 0.6987302808772605, + "flos": 558780622848.0, + "grad_norm": 0.05720000052164256, + "language_loss": 0.8692646, + "learning_rate": 0.00021972665346911275, + "loss": 0.8801105, + "num_input_tokens_seen": 301475120, + "router_z_loss_mlp": 0.27050781, + "step": 3632, + "time_per_iteration": 2.7766430377960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086616, + "balance_loss_mlp": 1.0609858, + "epoch": 0.698922662562524, + "flos": 483593587200.0, + "grad_norm": 0.0722224306004379, + "language_loss": 0.79897952, + "learning_rate": 0.00021946871259543877, + "loss": 0.80984569, + "num_input_tokens_seen": 301542416, + "router_z_loss_mlp": 0.2565918, + "step": 3633, + "time_per_iteration": 2.600034713745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079015, + "balance_loss_mlp": 1.05383754, + "epoch": 0.6991150442477876, + "flos": 718909655040.0, + "grad_norm": 0.0639524243068684, + "language_loss": 0.83284152, + "learning_rate": 0.00021921088063366957, + "loss": 0.84363163, + "num_input_tokens_seen": 301620672, + "router_z_loss_mlp": 0.25183105, + "step": 3634, + "time_per_iteration": 2.956197738647461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085501, + "balance_loss_mlp": 1.0596205, + "epoch": 0.6993074259330512, + "flos": 489128150016.0, + "grad_norm": 0.058476095641480985, + "language_loss": 0.81960422, + "learning_rate": 0.00021895315768390435, + "loss": 0.83045918, + "num_input_tokens_seen": 301688016, + "router_z_loss_mlp": 0.2590332, + "step": 3635, + "time_per_iteration": 2.5913336277008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083019, + "balance_loss_mlp": 1.05759156, + "epoch": 0.6994998076183148, + "flos": 718089214464.0, + "grad_norm": 0.04531341451753373, + "language_loss": 0.87785435, + "learning_rate": 0.00021869554384619999, + "loss": 0.88868463, + "num_input_tokens_seen": 301771184, + "router_z_loss_mlp": 0.25415039, + "step": 3636, + "time_per_iteration": 2.9603588581085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089766, + "balance_loss_mlp": 1.06315875, + "epoch": 0.6996921893035783, + "flos": 579016636416.0, + "grad_norm": 0.21159082474566934, + "language_loss": 0.80919135, + "learning_rate": 0.00021843803922057115, + "loss": 0.82008898, + "num_input_tokens_seen": 301844528, + "router_z_loss_mlp": 0.26660156, + "step": 3637, + "time_per_iteration": 2.708937406539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087312, + "balance_loss_mlp": 1.0621587, + "epoch": 0.6998845709888418, + "flos": 518629796352.0, + "grad_norm": 0.060159968094543256, + "language_loss": 0.82011575, + "learning_rate": 0.00021818064390698977, + "loss": 0.83098888, + "num_input_tokens_seen": 301914960, + "router_z_loss_mlp": 0.25170898, + "step": 3638, + "time_per_iteration": 2.605764389038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086471, + "balance_loss_mlp": 1.06113935, + "epoch": 0.7000769526741054, + "flos": 620951505408.0, + "grad_norm": 0.06371626432210087, + "language_loss": 0.87017298, + "learning_rate": 0.0002179233580053861, + "loss": 0.88103765, + "num_input_tokens_seen": 301986352, + "router_z_loss_mlp": 0.25354004, + "step": 3639, + "time_per_iteration": 2.7112109661102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083368, + "balance_loss_mlp": 1.0573926, + "epoch": 0.700269334359369, + "flos": 559946856960.0, + "grad_norm": 0.058687026763644914, + "language_loss": 0.86069989, + "learning_rate": 0.00021766618161564688, + "loss": 0.87153351, + "num_input_tokens_seen": 302060544, + "router_z_loss_mlp": 0.26013184, + "step": 3640, + "time_per_iteration": 2.6974241733551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082795, + "balance_loss_mlp": 1.05666459, + "epoch": 0.7004617160446326, + "flos": 483343967232.0, + "grad_norm": 0.05259786469009478, + "language_loss": 0.87277496, + "learning_rate": 0.00021740911483761677, + "loss": 0.88360298, + "num_input_tokens_seen": 302127232, + "router_z_loss_mlp": 0.26123047, + "step": 3641, + "time_per_iteration": 2.5836639404296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089745, + "balance_loss_mlp": 1.06368566, + "epoch": 0.7006540977298961, + "flos": 696981003264.0, + "grad_norm": 0.04971665087061583, + "language_loss": 0.9236384, + "learning_rate": 0.00021715215777109837, + "loss": 0.93453586, + "num_input_tokens_seen": 302207056, + "router_z_loss_mlp": 0.26074219, + "step": 3642, + "time_per_iteration": 2.974407911300659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085504, + "balance_loss_mlp": 1.06024349, + "epoch": 0.7008464794151597, + "flos": 504775950336.0, + "grad_norm": 0.05973771415141703, + "language_loss": 0.84664541, + "learning_rate": 0.00021689531051585103, + "loss": 0.85750043, + "num_input_tokens_seen": 302275632, + "router_z_loss_mlp": 0.25280762, + "step": 3643, + "time_per_iteration": 2.577305316925049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089117, + "balance_loss_mlp": 1.06186557, + "epoch": 0.7010388611004232, + "flos": 537242554368.0, + "grad_norm": 0.062367103447564735, + "language_loss": 0.80804634, + "learning_rate": 0.00021663857317159196, + "loss": 0.81893754, + "num_input_tokens_seen": 302343600, + "router_z_loss_mlp": 0.27294922, + "step": 3644, + "time_per_iteration": 2.640782356262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085412, + "balance_loss_mlp": 1.05996037, + "epoch": 0.7012312427856868, + "flos": 547259245056.0, + "grad_norm": 0.10933947779444686, + "language_loss": 0.82007676, + "learning_rate": 0.00021638194583799487, + "loss": 0.83093089, + "num_input_tokens_seen": 302414656, + "router_z_loss_mlp": 0.25476074, + "step": 3645, + "time_per_iteration": 2.660571813583374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080249, + "balance_loss_mlp": 1.05334401, + "epoch": 0.7014236244709504, + "flos": 941409630720.0, + "grad_norm": 0.0653990594073395, + "language_loss": 0.82918119, + "learning_rate": 0.00021612542861469176, + "loss": 0.83998358, + "num_input_tokens_seen": 302495120, + "router_z_loss_mlp": 0.26916504, + "step": 3646, + "time_per_iteration": 3.1750996112823486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082853, + "balance_loss_mlp": 1.05595946, + "epoch": 0.7016160061562139, + "flos": 525167608320.0, + "grad_norm": 0.060469177257194674, + "language_loss": 0.82402915, + "learning_rate": 0.00021586902160127135, + "loss": 0.8348577, + "num_input_tokens_seen": 302563024, + "router_z_loss_mlp": 0.26928711, + "step": 3647, + "time_per_iteration": 2.60231614112854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083469, + "balance_loss_mlp": 1.05743361, + "epoch": 0.7018083878414775, + "flos": 373385023488.0, + "grad_norm": 0.10102975915851765, + "language_loss": 0.74238408, + "learning_rate": 0.00021561272489727974, + "loss": 0.75321877, + "num_input_tokens_seen": 302624544, + "router_z_loss_mlp": 0.26062012, + "step": 3648, + "time_per_iteration": 2.455183744430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083725, + "balance_loss_mlp": 1.0581665, + "epoch": 0.7020007695267411, + "flos": 527784201216.0, + "grad_norm": 0.05896874636911686, + "language_loss": 0.80454385, + "learning_rate": 0.0002153565386022199, + "loss": 0.81538105, + "num_input_tokens_seen": 302697856, + "router_z_loss_mlp": 0.25585938, + "step": 3649, + "time_per_iteration": 2.6365654468536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090223, + "balance_loss_mlp": 1.0643425, + "epoch": 0.7021931512120047, + "flos": 690154297344.0, + "grad_norm": 0.0684708856776036, + "language_loss": 0.82569027, + "learning_rate": 0.00021510046281555262, + "loss": 0.83659256, + "num_input_tokens_seen": 302771984, + "router_z_loss_mlp": 0.25915527, + "step": 3650, + "time_per_iteration": 2.8082711696624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088794, + "balance_loss_mlp": 1.06316423, + "epoch": 0.7023855328972681, + "flos": 639784147968.0, + "grad_norm": 0.06759336316034399, + "language_loss": 0.81458813, + "learning_rate": 0.0002148444976366949, + "loss": 0.82547605, + "num_input_tokens_seen": 302838832, + "router_z_loss_mlp": 0.2565918, + "step": 3651, + "time_per_iteration": 2.753706455230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086485, + "balance_loss_mlp": 1.06129622, + "epoch": 0.7025779145825317, + "flos": 560940194304.0, + "grad_norm": 0.05344717871766575, + "language_loss": 0.82698804, + "learning_rate": 0.00021458864316502136, + "loss": 0.8378529, + "num_input_tokens_seen": 302909952, + "router_z_loss_mlp": 0.25183105, + "step": 3652, + "time_per_iteration": 2.737903594970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086912, + "balance_loss_mlp": 1.06264138, + "epoch": 0.7027702962677953, + "flos": 447445472256.0, + "grad_norm": 0.05962835254673255, + "language_loss": 0.87223494, + "learning_rate": 0.0002143328994998634, + "loss": 0.88310409, + "num_input_tokens_seen": 302973056, + "router_z_loss_mlp": 0.24267578, + "step": 3653, + "time_per_iteration": 2.504406213760376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089816, + "balance_loss_mlp": 1.06336296, + "epoch": 0.7029626779530589, + "flos": 622500609024.0, + "grad_norm": 0.060478723540627326, + "language_loss": 0.78619695, + "learning_rate": 0.00021407726674050982, + "loss": 0.79709506, + "num_input_tokens_seen": 303054656, + "router_z_loss_mlp": 0.26477051, + "step": 3654, + "time_per_iteration": 2.8486123085021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094096, + "balance_loss_mlp": 1.06856155, + "epoch": 0.7031550596383225, + "flos": 629591989248.0, + "grad_norm": 0.050916885962277426, + "language_loss": 0.87187326, + "learning_rate": 0.0002138217449862061, + "loss": 0.88281423, + "num_input_tokens_seen": 303124256, + "router_z_loss_mlp": 0.25549316, + "step": 3655, + "time_per_iteration": 2.7588388919830322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108901, + "balance_loss_mlp": 1.06326032, + "epoch": 0.703347441323586, + "flos": 530843134464.0, + "grad_norm": 0.05276360585412431, + "language_loss": 0.78396368, + "learning_rate": 0.00021356633433615403, + "loss": 0.79485381, + "num_input_tokens_seen": 303192720, + "router_z_loss_mlp": 0.25744629, + "step": 3656, + "time_per_iteration": 2.6218318939208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079447, + "balance_loss_mlp": 1.05436552, + "epoch": 0.7035398230088495, + "flos": 693593528832.0, + "grad_norm": 0.048722851637787626, + "language_loss": 0.83386952, + "learning_rate": 0.0002133110348895133, + "loss": 0.84466398, + "num_input_tokens_seen": 303275968, + "router_z_loss_mlp": 0.25061035, + "step": 3657, + "time_per_iteration": 2.9466397762298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086119, + "balance_loss_mlp": 1.06054902, + "epoch": 0.7037322046941131, + "flos": 968035152384.0, + "grad_norm": 0.10765454833188913, + "language_loss": 0.85102618, + "learning_rate": 0.0002130558467453999, + "loss": 0.86188745, + "num_input_tokens_seen": 303367296, + "router_z_loss_mlp": 0.25585938, + "step": 3658, + "time_per_iteration": 3.3578195571899414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085624, + "balance_loss_mlp": 1.05956531, + "epoch": 0.7039245863793767, + "flos": 502863427584.0, + "grad_norm": 0.06250625204972131, + "language_loss": 0.84476495, + "learning_rate": 0.0002128007700028865, + "loss": 0.85562122, + "num_input_tokens_seen": 303442768, + "router_z_loss_mlp": 0.26086426, + "step": 3659, + "time_per_iteration": 2.716048002243042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079667, + "balance_loss_mlp": 1.05420375, + "epoch": 0.7041169680646402, + "flos": 465954342912.0, + "grad_norm": 0.07665519307089459, + "language_loss": 0.845348, + "learning_rate": 0.00021254580476100276, + "loss": 0.85614467, + "num_input_tokens_seen": 303508304, + "router_z_loss_mlp": 0.25476074, + "step": 3660, + "time_per_iteration": 2.5458219051361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082749, + "balance_loss_mlp": 1.05685711, + "epoch": 0.7043093497499038, + "flos": 632181417984.0, + "grad_norm": 0.058748946938806695, + "language_loss": 0.7943238, + "learning_rate": 0.00021229095111873497, + "loss": 0.80515134, + "num_input_tokens_seen": 303579312, + "router_z_loss_mlp": 0.25927734, + "step": 3661, + "time_per_iteration": 2.775683641433716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078944, + "balance_loss_mlp": 1.05252695, + "epoch": 0.7045017314351674, + "flos": 542930190336.0, + "grad_norm": 0.051479556836423725, + "language_loss": 0.86013281, + "learning_rate": 0.0002120362091750261, + "loss": 0.87092221, + "num_input_tokens_seen": 303658384, + "router_z_loss_mlp": 0.26452637, + "step": 3662, + "time_per_iteration": 2.835092782974243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076634, + "balance_loss_mlp": 1.04978824, + "epoch": 0.704694113120431, + "flos": 428237300736.0, + "grad_norm": 0.060876931500520017, + "language_loss": 0.86844277, + "learning_rate": 0.00021178157902877566, + "loss": 0.87920904, + "num_input_tokens_seen": 303721136, + "router_z_loss_mlp": 0.26879883, + "step": 3663, + "time_per_iteration": 2.440558910369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081373, + "balance_loss_mlp": 1.0555284, + "epoch": 0.7048864948056945, + "flos": 650544556032.0, + "grad_norm": 0.061135120384029226, + "language_loss": 0.87179941, + "learning_rate": 0.0002115270607788397, + "loss": 0.88261312, + "num_input_tokens_seen": 303792368, + "router_z_loss_mlp": 0.25866699, + "step": 3664, + "time_per_iteration": 2.7565457820892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107708, + "balance_loss_mlp": 1.05143833, + "epoch": 0.705078876490958, + "flos": 412562336256.0, + "grad_norm": 0.0582225162514945, + "language_loss": 0.85968196, + "learning_rate": 0.00021127265452403133, + "loss": 0.87045276, + "num_input_tokens_seen": 303856336, + "router_z_loss_mlp": 0.25671387, + "step": 3665, + "time_per_iteration": 2.545664072036743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032205, + "balance_loss_mlp": 1.02552938, + "epoch": 0.7052712581762216, + "flos": 1420040927232.0, + "grad_norm": 0.013425187729100906, + "language_loss": 0.84091628, + "learning_rate": 0.0002110183603631199, + "loss": 0.85123837, + "num_input_tokens_seen": 304089856, + "router_z_loss_mlp": 0.06689453, + "step": 3666, + "time_per_iteration": 4.894615888595581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076922, + "balance_loss_mlp": 1.04990888, + "epoch": 0.7054636398614852, + "flos": 493049369088.0, + "grad_norm": 0.05971260757424555, + "language_loss": 0.82980728, + "learning_rate": 0.00021076417839483065, + "loss": 0.84057647, + "num_input_tokens_seen": 304164752, + "router_z_loss_mlp": 0.27026367, + "step": 3667, + "time_per_iteration": 2.776766300201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107347, + "balance_loss_mlp": 1.04667187, + "epoch": 0.7056560215467488, + "flos": 450457417728.0, + "grad_norm": 0.06375812283048922, + "language_loss": 0.8522588, + "learning_rate": 0.00021051010871784589, + "loss": 0.86299354, + "num_input_tokens_seen": 304229568, + "router_z_loss_mlp": 0.26855469, + "step": 3668, + "time_per_iteration": 2.5415139198303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069942, + "balance_loss_mlp": 1.04501557, + "epoch": 0.7058484032320124, + "flos": 565703875584.0, + "grad_norm": 0.055214127492262476, + "language_loss": 0.79052877, + "learning_rate": 0.0002102561514308045, + "loss": 0.80122823, + "num_input_tokens_seen": 304299408, + "router_z_loss_mlp": 0.24926758, + "step": 3669, + "time_per_iteration": 2.716742753982544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072356, + "balance_loss_mlp": 1.04667854, + "epoch": 0.7060407849172758, + "flos": 567008501760.0, + "grad_norm": 0.07306534316954115, + "language_loss": 0.82677996, + "learning_rate": 0.00021000230663230135, + "loss": 0.83750349, + "num_input_tokens_seen": 304367936, + "router_z_loss_mlp": 0.25708008, + "step": 3670, + "time_per_iteration": 2.6818981170654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074835, + "balance_loss_mlp": 1.04937172, + "epoch": 0.7062331666025394, + "flos": 468746403840.0, + "grad_norm": 0.06539460490463701, + "language_loss": 0.83441806, + "learning_rate": 0.00020974857442088762, + "loss": 0.84516644, + "num_input_tokens_seen": 304438368, + "router_z_loss_mlp": 0.25476074, + "step": 3671, + "time_per_iteration": 2.608067512512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075045, + "balance_loss_mlp": 1.04928422, + "epoch": 0.706425548287803, + "flos": 595316749824.0, + "grad_norm": 0.05848649704443167, + "language_loss": 0.88856924, + "learning_rate": 0.00020949495489507104, + "loss": 0.89931971, + "num_input_tokens_seen": 304508720, + "router_z_loss_mlp": 0.25769043, + "step": 3672, + "time_per_iteration": 2.6813790798187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076712, + "balance_loss_mlp": 1.050367, + "epoch": 0.7066179299730666, + "flos": 475815389184.0, + "grad_norm": 0.06054837689365347, + "language_loss": 0.84767634, + "learning_rate": 0.00020924144815331525, + "loss": 0.8584435, + "num_input_tokens_seen": 304576128, + "router_z_loss_mlp": 0.26367188, + "step": 3673, + "time_per_iteration": 2.542840003967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076685, + "balance_loss_mlp": 1.05078053, + "epoch": 0.7068103116583301, + "flos": 506409117696.0, + "grad_norm": 0.05390499311408587, + "language_loss": 0.83514738, + "learning_rate": 0.00020898805429404044, + "loss": 0.84591424, + "num_input_tokens_seen": 304642416, + "router_z_loss_mlp": 0.25927734, + "step": 3674, + "time_per_iteration": 2.6225385665893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079471, + "balance_loss_mlp": 1.05372167, + "epoch": 0.7070026933435937, + "flos": 679336989696.0, + "grad_norm": 0.06276037819785552, + "language_loss": 0.78933322, + "learning_rate": 0.0002087347734156228, + "loss": 0.80012792, + "num_input_tokens_seen": 304719312, + "router_z_loss_mlp": 0.2578125, + "step": 3675, + "time_per_iteration": 2.855715751647949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078668, + "balance_loss_mlp": 1.05318117, + "epoch": 0.7071950750288573, + "flos": 472217942016.0, + "grad_norm": 0.06320503796682253, + "language_loss": 0.79648715, + "learning_rate": 0.00020848160561639452, + "loss": 0.80727386, + "num_input_tokens_seen": 304789296, + "router_z_loss_mlp": 0.25512695, + "step": 3676, + "time_per_iteration": 2.647651433944702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079385, + "balance_loss_mlp": 1.05396986, + "epoch": 0.7073874567141208, + "flos": 473742452736.0, + "grad_norm": 0.05839132735303564, + "language_loss": 0.86102867, + "learning_rate": 0.0002082285509946445, + "loss": 0.8718226, + "num_input_tokens_seen": 304854320, + "router_z_loss_mlp": 0.25415039, + "step": 3677, + "time_per_iteration": 2.5633320808410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081369, + "balance_loss_mlp": 1.05606055, + "epoch": 0.7075798383993844, + "flos": 545877895680.0, + "grad_norm": 0.05152517094969974, + "language_loss": 0.8344785, + "learning_rate": 0.00020797560964861683, + "loss": 0.84529221, + "num_input_tokens_seen": 304932784, + "router_z_loss_mlp": 0.25341797, + "step": 3678, + "time_per_iteration": 2.7661099433898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074676, + "balance_loss_mlp": 1.05028617, + "epoch": 0.7077722200846479, + "flos": 662090526720.0, + "grad_norm": 0.06274913334452144, + "language_loss": 0.80699748, + "learning_rate": 0.0002077227816765122, + "loss": 0.81774426, + "num_input_tokens_seen": 305018080, + "router_z_loss_mlp": 0.24401855, + "step": 3679, + "time_per_iteration": 3.065239191055298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024318, + "balance_loss_mlp": 1.01730835, + "epoch": 0.7079646017699115, + "flos": 1529960223744.0, + "grad_norm": 0.014391592464441782, + "language_loss": 0.76447725, + "learning_rate": 0.0002074700671764869, + "loss": 0.77472043, + "num_input_tokens_seen": 305241216, + "router_z_loss_mlp": 0.0703125, + "step": 3680, + "time_per_iteration": 4.8172595500946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073441, + "balance_loss_mlp": 1.04958761, + "epoch": 0.7081569834551751, + "flos": 621502502400.0, + "grad_norm": 0.05034113841233223, + "language_loss": 0.79209405, + "learning_rate": 0.00020721746624665383, + "loss": 0.80282843, + "num_input_tokens_seen": 305311376, + "router_z_loss_mlp": 0.23852539, + "step": 3681, + "time_per_iteration": 2.7298145294189453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083466, + "balance_loss_mlp": 1.05822945, + "epoch": 0.7083493651404387, + "flos": 794630435328.0, + "grad_norm": 0.059799820942850454, + "language_loss": 0.80445623, + "learning_rate": 0.00020696497898508114, + "loss": 0.81529093, + "num_input_tokens_seen": 305392736, + "router_z_loss_mlp": 0.25268555, + "step": 3682, + "time_per_iteration": 2.9937915802001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075641, + "balance_loss_mlp": 1.05092919, + "epoch": 0.7085417468257021, + "flos": 813747202560.0, + "grad_norm": 0.06191150286406427, + "language_loss": 0.77959311, + "learning_rate": 0.00020671260548979316, + "loss": 0.79034948, + "num_input_tokens_seen": 305470896, + "router_z_loss_mlp": 0.24719238, + "step": 3683, + "time_per_iteration": 3.0161404609680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081595, + "balance_loss_mlp": 1.05558372, + "epoch": 0.7087341285109657, + "flos": 700566340608.0, + "grad_norm": 0.05521829943560005, + "language_loss": 0.85212427, + "learning_rate": 0.00020646034585876982, + "loss": 0.86294019, + "num_input_tokens_seen": 305547072, + "router_z_loss_mlp": 0.26037598, + "step": 3684, + "time_per_iteration": 2.8698270320892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073657, + "balance_loss_mlp": 1.04850388, + "epoch": 0.7089265101962293, + "flos": 596514917376.0, + "grad_norm": 0.04944753850163826, + "language_loss": 0.84324521, + "learning_rate": 0.00020620820018994718, + "loss": 0.85398173, + "num_input_tokens_seen": 305624512, + "router_z_loss_mlp": 0.25170898, + "step": 3685, + "time_per_iteration": 2.801947832107544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079387, + "balance_loss_mlp": 1.0536145, + "epoch": 0.7091188918814929, + "flos": 487106970624.0, + "grad_norm": 0.07519073749771547, + "language_loss": 0.83086288, + "learning_rate": 0.00020595616858121675, + "loss": 0.84165674, + "num_input_tokens_seen": 305695088, + "router_z_loss_mlp": 0.2578125, + "step": 3686, + "time_per_iteration": 2.7280051708221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070479, + "balance_loss_mlp": 1.04551697, + "epoch": 0.7093112735667565, + "flos": 600117507072.0, + "grad_norm": 0.05447903108557543, + "language_loss": 0.80602473, + "learning_rate": 0.00020570425113042586, + "loss": 0.81672955, + "num_input_tokens_seen": 305763680, + "router_z_loss_mlp": 0.24963379, + "step": 3687, + "time_per_iteration": 2.8146443367004395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080641, + "balance_loss_mlp": 1.05596519, + "epoch": 0.70950365525202, + "flos": 505830956544.0, + "grad_norm": 0.06579545138102952, + "language_loss": 0.85866553, + "learning_rate": 0.0002054524479353776, + "loss": 0.86947191, + "num_input_tokens_seen": 305835008, + "router_z_loss_mlp": 0.24682617, + "step": 3688, + "time_per_iteration": 2.6602835655212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073952, + "balance_loss_mlp": 1.04767823, + "epoch": 0.7096960369372836, + "flos": 732160747008.0, + "grad_norm": 0.07679676176766496, + "language_loss": 0.81976587, + "learning_rate": 0.00020520075909383063, + "loss": 0.83050537, + "num_input_tokens_seen": 305909072, + "router_z_loss_mlp": 0.26306152, + "step": 3689, + "time_per_iteration": 2.866727590560913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074803, + "balance_loss_mlp": 1.04981625, + "epoch": 0.7098884186225471, + "flos": 972077511168.0, + "grad_norm": 0.05660248987472117, + "language_loss": 0.81022668, + "learning_rate": 0.00020494918470349916, + "loss": 0.82097471, + "num_input_tokens_seen": 305994752, + "router_z_loss_mlp": 0.25, + "step": 3690, + "time_per_iteration": 3.272037982940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107649, + "balance_loss_mlp": 1.04971516, + "epoch": 0.7100808003078107, + "flos": 504252117504.0, + "grad_norm": 0.08247583019648676, + "language_loss": 0.85683942, + "learning_rate": 0.00020469772486205297, + "loss": 0.86760426, + "num_input_tokens_seen": 306062960, + "router_z_loss_mlp": 0.26794434, + "step": 3691, + "time_per_iteration": 2.677762269973755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079524, + "balance_loss_mlp": 1.05322635, + "epoch": 0.7102731819930742, + "flos": 540335992320.0, + "grad_norm": 0.06411942158990899, + "language_loss": 0.81443423, + "learning_rate": 0.0002044463796671177, + "loss": 0.82522947, + "num_input_tokens_seen": 306134224, + "router_z_loss_mlp": 0.26330566, + "step": 3692, + "time_per_iteration": 2.6739578247070312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077922, + "balance_loss_mlp": 1.0519464, + "epoch": 0.7104655636783378, + "flos": 620378113536.0, + "grad_norm": 0.06149610751956677, + "language_loss": 0.80325758, + "learning_rate": 0.00020419514921627408, + "loss": 0.81403679, + "num_input_tokens_seen": 306214512, + "router_z_loss_mlp": 0.2598877, + "step": 3693, + "time_per_iteration": 2.8510119915008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076383, + "balance_loss_mlp": 1.05039525, + "epoch": 0.7106579453636014, + "flos": 557322923520.0, + "grad_norm": 0.05808850805852677, + "language_loss": 0.77474564, + "learning_rate": 0.00020394403360705855, + "loss": 0.78550947, + "num_input_tokens_seen": 306283232, + "router_z_loss_mlp": 0.26025391, + "step": 3694, + "time_per_iteration": 2.719911813735962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086948, + "balance_loss_mlp": 1.06034029, + "epoch": 0.710850327048865, + "flos": 513048245760.0, + "grad_norm": 0.059410233197540796, + "language_loss": 0.87816525, + "learning_rate": 0.00020369303293696228, + "loss": 0.88903475, + "num_input_tokens_seen": 306351536, + "router_z_loss_mlp": 0.26635742, + "step": 3695, + "time_per_iteration": 2.657715082168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079685, + "balance_loss_mlp": 1.05571198, + "epoch": 0.7110427087341286, + "flos": 423619352064.0, + "grad_norm": 0.06517545508220793, + "language_loss": 0.7842719, + "learning_rate": 0.00020344214730343304, + "loss": 0.79506874, + "num_input_tokens_seen": 306419040, + "router_z_loss_mlp": 0.23962402, + "step": 3696, + "time_per_iteration": 2.6142332553863525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078178, + "balance_loss_mlp": 1.05308461, + "epoch": 0.711235090419392, + "flos": 577415402496.0, + "grad_norm": 0.05470571931894002, + "language_loss": 0.79182768, + "learning_rate": 0.00020319137680387296, + "loss": 0.80260944, + "num_input_tokens_seen": 306503248, + "router_z_loss_mlp": 0.25109863, + "step": 3697, + "time_per_iteration": 2.915419578552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107666, + "balance_loss_mlp": 1.05055368, + "epoch": 0.7114274721046556, + "flos": 448060709376.0, + "grad_norm": 0.06661588329403122, + "language_loss": 0.80553949, + "learning_rate": 0.0002029407215356398, + "loss": 0.81630599, + "num_input_tokens_seen": 306566288, + "router_z_loss_mlp": 0.26123047, + "step": 3698, + "time_per_iteration": 2.5700740814208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108156, + "balance_loss_mlp": 1.05670524, + "epoch": 0.7116198537899192, + "flos": 621962095104.0, + "grad_norm": 0.06665507382105876, + "language_loss": 0.83601737, + "learning_rate": 0.00020269018159604663, + "loss": 0.84683299, + "num_input_tokens_seen": 306633344, + "router_z_loss_mlp": 0.24841309, + "step": 3699, + "time_per_iteration": 2.7208173274993896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077091, + "balance_loss_mlp": 1.05197358, + "epoch": 0.7118122354751828, + "flos": 498724895232.0, + "grad_norm": 0.05024967484992462, + "language_loss": 0.82184601, + "learning_rate": 0.00020243975708236162, + "loss": 0.83261693, + "num_input_tokens_seen": 306701328, + "router_z_loss_mlp": 0.25146484, + "step": 3700, + "time_per_iteration": 2.6433067321777344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108194, + "balance_loss_mlp": 1.05664349, + "epoch": 0.7120046171604463, + "flos": 572718532608.0, + "grad_norm": 0.07883365908247705, + "language_loss": 0.86320221, + "learning_rate": 0.00020218944809180818, + "loss": 0.87402165, + "num_input_tokens_seen": 306773168, + "router_z_loss_mlp": 0.25305176, + "step": 3701, + "time_per_iteration": 2.705932855606079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080401, + "balance_loss_mlp": 1.05541444, + "epoch": 0.7121969988457099, + "flos": 572664204288.0, + "grad_norm": 0.048190263761871716, + "language_loss": 0.84987295, + "learning_rate": 0.00020193925472156493, + "loss": 0.86067688, + "num_input_tokens_seen": 306845312, + "router_z_loss_mlp": 0.25, + "step": 3702, + "time_per_iteration": 2.6893904209136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040363, + "balance_loss_mlp": 1.03368771, + "epoch": 0.7123893805309734, + "flos": 1523429752320.0, + "grad_norm": 0.023975764530948636, + "language_loss": 0.74289167, + "learning_rate": 0.00020168917706876537, + "loss": 0.7532953, + "num_input_tokens_seen": 307079216, + "router_z_loss_mlp": 0.06689453, + "step": 3703, + "time_per_iteration": 4.881204843521118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078432, + "balance_loss_mlp": 1.05408931, + "epoch": 0.712581762216237, + "flos": 615105280512.0, + "grad_norm": 0.04896517905072385, + "language_loss": 0.83809257, + "learning_rate": 0.00020143921523049863, + "loss": 0.84887689, + "num_input_tokens_seen": 307163568, + "router_z_loss_mlp": 0.24316406, + "step": 3704, + "time_per_iteration": 2.9580681324005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075128, + "balance_loss_mlp": 1.04962897, + "epoch": 0.7127741439015006, + "flos": 597777698304.0, + "grad_norm": 0.05872916530123236, + "language_loss": 0.84084362, + "learning_rate": 0.00020118936930380837, + "loss": 0.85159492, + "num_input_tokens_seen": 307232800, + "router_z_loss_mlp": 0.25512695, + "step": 3705, + "time_per_iteration": 2.76068377494812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078368, + "balance_loss_mlp": 1.05290496, + "epoch": 0.7129665255867641, + "flos": 537398198784.0, + "grad_norm": 0.05789936228630773, + "language_loss": 0.81465518, + "learning_rate": 0.0002009396393856932, + "loss": 0.82543886, + "num_input_tokens_seen": 307307216, + "router_z_loss_mlp": 0.25463867, + "step": 3706, + "time_per_iteration": 2.664915084838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074744, + "balance_loss_mlp": 1.04923296, + "epoch": 0.7131589072720277, + "flos": 526442499072.0, + "grad_norm": 0.06297371189153962, + "language_loss": 0.8270002, + "learning_rate": 0.00020069002557310673, + "loss": 0.83774769, + "num_input_tokens_seen": 307377472, + "router_z_loss_mlp": 0.25512695, + "step": 3707, + "time_per_iteration": 2.658581495285034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075214, + "balance_loss_mlp": 1.04976273, + "epoch": 0.7133512889572913, + "flos": 530919484416.0, + "grad_norm": 0.06876092007107866, + "language_loss": 0.77463377, + "learning_rate": 0.00020044052796295807, + "loss": 0.78538585, + "num_input_tokens_seen": 307456880, + "router_z_loss_mlp": 0.25476074, + "step": 3708, + "time_per_iteration": 2.7701447010040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073382, + "balance_loss_mlp": 1.04729891, + "epoch": 0.7135436706425549, + "flos": 503535564288.0, + "grad_norm": 0.058576923733569305, + "language_loss": 0.82293993, + "learning_rate": 0.00020019114665211063, + "loss": 0.83367372, + "num_input_tokens_seen": 307524784, + "router_z_loss_mlp": 0.2611084, + "step": 3709, + "time_per_iteration": 2.584200143814087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071543, + "balance_loss_mlp": 1.04671192, + "epoch": 0.7137360523278183, + "flos": 515968786944.0, + "grad_norm": 0.05922999044905372, + "language_loss": 0.81765306, + "learning_rate": 0.00019994188173738276, + "loss": 0.82836854, + "num_input_tokens_seen": 307591408, + "router_z_loss_mlp": 0.24829102, + "step": 3710, + "time_per_iteration": 2.551407814025879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072134, + "balance_loss_mlp": 1.04628921, + "epoch": 0.7139284340130819, + "flos": 510389434368.0, + "grad_norm": 0.06343816758833129, + "language_loss": 0.80772817, + "learning_rate": 0.0001996927333155477, + "loss": 0.8184495, + "num_input_tokens_seen": 307662912, + "router_z_loss_mlp": 0.25878906, + "step": 3711, + "time_per_iteration": 2.748868227005005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075658, + "balance_loss_mlp": 1.04955149, + "epoch": 0.7141208156983455, + "flos": 890275940352.0, + "grad_norm": 0.06552359252627656, + "language_loss": 0.8595196, + "learning_rate": 0.00019944370148333346, + "loss": 0.87027609, + "num_input_tokens_seen": 307752256, + "router_z_loss_mlp": 0.26123047, + "step": 3712, + "time_per_iteration": 3.166109800338745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072351, + "balance_loss_mlp": 1.04660141, + "epoch": 0.7143131973836091, + "flos": 535779712512.0, + "grad_norm": 0.05387618778038521, + "language_loss": 0.80135339, + "learning_rate": 0.00019919478633742278, + "loss": 0.81207693, + "num_input_tokens_seen": 307821504, + "router_z_loss_mlp": 0.2578125, + "step": 3713, + "time_per_iteration": 2.683401107788086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075332, + "balance_loss_mlp": 1.04877234, + "epoch": 0.7145055790688727, + "flos": 473668300800.0, + "grad_norm": 0.058133564140499, + "language_loss": 0.85435075, + "learning_rate": 0.00019894598797445302, + "loss": 0.86510408, + "num_input_tokens_seen": 307886464, + "router_z_loss_mlp": 0.265625, + "step": 3714, + "time_per_iteration": 2.570040225982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074334, + "balance_loss_mlp": 1.04846525, + "epoch": 0.7146979607541362, + "flos": 570521885184.0, + "grad_norm": 0.050277092127782926, + "language_loss": 0.81853724, + "learning_rate": 0.00019869730649101615, + "loss": 0.82928061, + "num_input_tokens_seen": 307962736, + "router_z_loss_mlp": 0.25878906, + "step": 3715, + "time_per_iteration": 2.811513662338257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071824, + "balance_loss_mlp": 1.04564583, + "epoch": 0.7148903424393998, + "flos": 839666082816.0, + "grad_norm": 0.06869941272731987, + "language_loss": 0.72641587, + "learning_rate": 0.00019844874198365943, + "loss": 0.73713416, + "num_input_tokens_seen": 308046592, + "router_z_loss_mlp": 0.26220703, + "step": 3716, + "time_per_iteration": 3.1328516006469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068844, + "balance_loss_mlp": 1.04348803, + "epoch": 0.7150827241246633, + "flos": 541823427072.0, + "grad_norm": 0.061640340400288096, + "language_loss": 0.84182858, + "learning_rate": 0.00019820029454888362, + "loss": 0.85251707, + "num_input_tokens_seen": 308119920, + "router_z_loss_mlp": 0.25378418, + "step": 3717, + "time_per_iteration": 2.7154488563537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014076, + "balance_loss_mlp": 1.00725687, + "epoch": 0.7152751058099269, + "flos": 1583678200320.0, + "grad_norm": 0.019699659470436708, + "language_loss": 0.74521267, + "learning_rate": 0.00019795196428314455, + "loss": 0.75535345, + "num_input_tokens_seen": 308361024, + "router_z_loss_mlp": 0.06835938, + "step": 3718, + "time_per_iteration": 5.046099424362183 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072793, + "balance_loss_mlp": 1.04632878, + "epoch": 0.7154674874951905, + "flos": 517419145728.0, + "grad_norm": 0.06720182925313008, + "language_loss": 0.80157018, + "learning_rate": 0.0001977037512828529, + "loss": 0.81229812, + "num_input_tokens_seen": 308429808, + "router_z_loss_mlp": 0.26489258, + "step": 3719, + "time_per_iteration": 2.5823724269866943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067552, + "balance_loss_mlp": 1.04183865, + "epoch": 0.715659869180454, + "flos": 602524127232.0, + "grad_norm": 0.06101106638891309, + "language_loss": 0.86410248, + "learning_rate": 0.0001974556556443734, + "loss": 0.87477803, + "num_input_tokens_seen": 308501888, + "router_z_loss_mlp": 0.25708008, + "step": 3720, + "time_per_iteration": 2.6981611251831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069959, + "balance_loss_mlp": 1.04529428, + "epoch": 0.7158522508657176, + "flos": 531675684864.0, + "grad_norm": 0.05855660874159423, + "language_loss": 0.88533628, + "learning_rate": 0.00019720767746402547, + "loss": 0.89603585, + "num_input_tokens_seen": 308576368, + "router_z_loss_mlp": 0.24658203, + "step": 3721, + "time_per_iteration": 2.7615206241607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071162, + "balance_loss_mlp": 1.04597294, + "epoch": 0.7160446325509812, + "flos": 557569972224.0, + "grad_norm": 0.062366353751096386, + "language_loss": 0.8018384, + "learning_rate": 0.00019695981683808222, + "loss": 0.81254995, + "num_input_tokens_seen": 308651936, + "router_z_loss_mlp": 0.2520752, + "step": 3722, + "time_per_iteration": 2.7723004817962646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079066, + "balance_loss_mlp": 1.05452061, + "epoch": 0.7162370142362448, + "flos": 690986847744.0, + "grad_norm": 0.061040751408566865, + "language_loss": 0.85407031, + "learning_rate": 0.00019671207386277225, + "loss": 0.86486095, + "num_input_tokens_seen": 308737264, + "router_z_loss_mlp": 0.24536133, + "step": 3723, + "time_per_iteration": 2.929828643798828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074994, + "balance_loss_mlp": 1.0494113, + "epoch": 0.7164293959215082, + "flos": 794109173760.0, + "grad_norm": 0.060904147533300125, + "language_loss": 0.78436089, + "learning_rate": 0.0001964644486342777, + "loss": 0.79511088, + "num_input_tokens_seen": 308811776, + "router_z_loss_mlp": 0.25610352, + "step": 3724, + "time_per_iteration": 2.945258617401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072735, + "balance_loss_mlp": 1.04702103, + "epoch": 0.7166217776067718, + "flos": 494178527232.0, + "grad_norm": 0.06414027483355057, + "language_loss": 0.87113518, + "learning_rate": 0.00019621694124873524, + "loss": 0.88186252, + "num_input_tokens_seen": 308886704, + "router_z_loss_mlp": 0.25732422, + "step": 3725, + "time_per_iteration": 2.6636407375335693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010246, + "balance_loss_mlp": 1.00323606, + "epoch": 0.7168141592920354, + "flos": 1401060354048.0, + "grad_norm": 0.005035081365633862, + "language_loss": 0.76540077, + "learning_rate": 0.00019596955180223557, + "loss": 0.77550328, + "num_input_tokens_seen": 309113456, + "router_z_loss_mlp": 0.0703125, + "step": 3726, + "time_per_iteration": 4.901204347610474 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074333, + "balance_loss_mlp": 1.04913247, + "epoch": 0.717006540977299, + "flos": 793150341120.0, + "grad_norm": 0.05913508438980992, + "language_loss": 0.77430266, + "learning_rate": 0.00019572228039082428, + "loss": 0.78504598, + "num_input_tokens_seen": 309198768, + "router_z_loss_mlp": 0.2520752, + "step": 3727, + "time_per_iteration": 3.088613986968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078544, + "balance_loss_mlp": 1.05268764, + "epoch": 0.7171989226625626, + "flos": 554812416000.0, + "grad_norm": 0.05372970057922247, + "language_loss": 0.83879149, + "learning_rate": 0.0001954751271105002, + "loss": 0.84957701, + "num_input_tokens_seen": 309279680, + "router_z_loss_mlp": 0.25866699, + "step": 3728, + "time_per_iteration": 2.8328897953033447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079816, + "balance_loss_mlp": 1.05423403, + "epoch": 0.717391304347826, + "flos": 555914409984.0, + "grad_norm": 0.054514017613719934, + "language_loss": 0.80957007, + "learning_rate": 0.00019522809205721687, + "loss": 0.82036829, + "num_input_tokens_seen": 309359152, + "router_z_loss_mlp": 0.25598145, + "step": 3729, + "time_per_iteration": 2.763596534729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076432, + "balance_loss_mlp": 1.05167198, + "epoch": 0.7175836860330896, + "flos": 538855898112.0, + "grad_norm": 0.06077062039876485, + "language_loss": 0.82796627, + "learning_rate": 0.0001949811753268816, + "loss": 0.83873057, + "num_input_tokens_seen": 309432800, + "router_z_loss_mlp": 0.24768066, + "step": 3730, + "time_per_iteration": 2.6999707221984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107353, + "balance_loss_mlp": 1.04911554, + "epoch": 0.7177760677183532, + "flos": 515637674496.0, + "grad_norm": 0.06199825755801458, + "language_loss": 0.82858533, + "learning_rate": 0.00019473437701535634, + "loss": 0.83932066, + "num_input_tokens_seen": 309499456, + "router_z_loss_mlp": 0.2442627, + "step": 3731, + "time_per_iteration": 2.6672961711883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073343, + "balance_loss_mlp": 1.04839206, + "epoch": 0.7179684494036168, + "flos": 674719041024.0, + "grad_norm": 0.05911673909192475, + "language_loss": 0.89378715, + "learning_rate": 0.00019448769721845677, + "loss": 0.90452051, + "num_input_tokens_seen": 309571056, + "router_z_loss_mlp": 0.24975586, + "step": 3732, + "time_per_iteration": 2.8097128868103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077958, + "balance_loss_mlp": 1.0521369, + "epoch": 0.7181608310888803, + "flos": 469912637952.0, + "grad_norm": 0.0968125790866447, + "language_loss": 0.85677779, + "learning_rate": 0.00019424113603195203, + "loss": 0.86755735, + "num_input_tokens_seen": 309635040, + "router_z_loss_mlp": 0.25854492, + "step": 3733, + "time_per_iteration": 2.5098788738250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076731, + "balance_loss_mlp": 1.05124426, + "epoch": 0.7183532127741439, + "flos": 593952652800.0, + "grad_norm": 0.06289800168130656, + "language_loss": 0.80150187, + "learning_rate": 0.0001939946935515657, + "loss": 0.81226921, + "num_input_tokens_seen": 309713696, + "router_z_loss_mlp": 0.25512695, + "step": 3734, + "time_per_iteration": 2.8232650756835938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075815, + "balance_loss_mlp": 1.05049455, + "epoch": 0.7185455944594075, + "flos": 498917615616.0, + "grad_norm": 0.06894576786718996, + "language_loss": 0.80948031, + "learning_rate": 0.0001937483698729755, + "loss": 0.82023847, + "num_input_tokens_seen": 309782864, + "router_z_loss_mlp": 0.25341797, + "step": 3735, + "time_per_iteration": 2.583744525909424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086547, + "balance_loss_mlp": 1.06058323, + "epoch": 0.718737976144671, + "flos": 814933260288.0, + "grad_norm": 0.05171464240859849, + "language_loss": 0.82055521, + "learning_rate": 0.0001935021650918128, + "loss": 0.83142066, + "num_input_tokens_seen": 309867056, + "router_z_loss_mlp": 0.25976562, + "step": 3736, + "time_per_iteration": 3.018035411834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075054, + "balance_loss_mlp": 1.05029404, + "epoch": 0.7189303578299346, + "flos": 438328143360.0, + "grad_norm": 0.06470560317481229, + "language_loss": 0.87265974, + "learning_rate": 0.0001932560793036625, + "loss": 0.88341027, + "num_input_tokens_seen": 309929744, + "router_z_loss_mlp": 0.24755859, + "step": 3737, + "time_per_iteration": 2.5036935806274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080158, + "balance_loss_mlp": 1.05338335, + "epoch": 0.7191227395151981, + "flos": 549398992896.0, + "grad_norm": 0.06672658192386556, + "language_loss": 0.8673166, + "learning_rate": 0.00019301011260406382, + "loss": 0.87811816, + "num_input_tokens_seen": 309998128, + "router_z_loss_mlp": 0.26794434, + "step": 3738, + "time_per_iteration": 2.651357412338257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075221, + "balance_loss_mlp": 1.050843, + "epoch": 0.7193151212004617, + "flos": 626938320384.0, + "grad_norm": 0.054290518405139924, + "language_loss": 0.80049711, + "learning_rate": 0.00019276426508850936, + "loss": 0.81124938, + "num_input_tokens_seen": 310065472, + "router_z_loss_mlp": 0.24377441, + "step": 3739, + "time_per_iteration": 2.7231712341308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070443, + "balance_loss_mlp": 1.04517078, + "epoch": 0.7195075028857253, + "flos": 741062960640.0, + "grad_norm": 0.061140917990422254, + "language_loss": 0.80563027, + "learning_rate": 0.00019251853685244564, + "loss": 0.81633466, + "num_input_tokens_seen": 310152960, + "router_z_loss_mlp": 0.25292969, + "step": 3740, + "time_per_iteration": 3.0039608478546143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071133, + "balance_loss_mlp": 1.0455265, + "epoch": 0.7196998845709889, + "flos": 802875566592.0, + "grad_norm": 0.05993968121683736, + "language_loss": 0.80916333, + "learning_rate": 0.00019227292799127283, + "loss": 0.81987464, + "num_input_tokens_seen": 310234080, + "router_z_loss_mlp": 0.25622559, + "step": 3741, + "time_per_iteration": 3.011082172393799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073929, + "balance_loss_mlp": 1.04817998, + "epoch": 0.7198922662562524, + "flos": 925183669248.0, + "grad_norm": 0.062033255796259436, + "language_loss": 0.79226792, + "learning_rate": 0.00019202743860034454, + "loss": 0.80300719, + "num_input_tokens_seen": 310330208, + "router_z_loss_mlp": 0.25744629, + "step": 3742, + "time_per_iteration": 3.2250611782073975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071976, + "balance_loss_mlp": 1.04692984, + "epoch": 0.7200846479415159, + "flos": 580111289856.0, + "grad_norm": 0.06270566779319728, + "language_loss": 0.83965755, + "learning_rate": 0.00019178206877496873, + "loss": 0.85037732, + "num_input_tokens_seen": 310402960, + "router_z_loss_mlp": 0.25061035, + "step": 3743, + "time_per_iteration": 2.702446222305298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068486, + "balance_loss_mlp": 1.04463267, + "epoch": 0.7202770296267795, + "flos": 557695881216.0, + "grad_norm": 0.05142738510326197, + "language_loss": 0.85388875, + "learning_rate": 0.0001915368186104059, + "loss": 0.8645736, + "num_input_tokens_seen": 310479776, + "router_z_loss_mlp": 0.23840332, + "step": 3744, + "time_per_iteration": 2.737600326538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072746, + "balance_loss_mlp": 1.04818881, + "epoch": 0.7204694113120431, + "flos": 672552129024.0, + "grad_norm": 0.07812429294813375, + "language_loss": 0.80877572, + "learning_rate": 0.0001912916882018706, + "loss": 0.81950319, + "num_input_tokens_seen": 310555952, + "router_z_loss_mlp": 0.2454834, + "step": 3745, + "time_per_iteration": 2.7886669635772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072955, + "balance_loss_mlp": 1.04774189, + "epoch": 0.7206617929973067, + "flos": 799194055680.0, + "grad_norm": 0.10461054453296469, + "language_loss": 0.79336673, + "learning_rate": 0.00019104667764453125, + "loss": 0.80409628, + "num_input_tokens_seen": 310634784, + "router_z_loss_mlp": 0.2520752, + "step": 3746, + "time_per_iteration": 3.01520037651062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068478, + "balance_loss_mlp": 1.04382503, + "epoch": 0.7208541746825702, + "flos": 531898140672.0, + "grad_norm": 0.05271540811251211, + "language_loss": 0.80517203, + "learning_rate": 0.00019080178703350926, + "loss": 0.81585681, + "num_input_tokens_seen": 310703216, + "router_z_loss_mlp": 0.24658203, + "step": 3747, + "time_per_iteration": 2.6013572216033936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067775, + "balance_loss_mlp": 1.04224086, + "epoch": 0.7210465563678338, + "flos": 535139882496.0, + "grad_norm": 0.06037415597287081, + "language_loss": 0.83132112, + "learning_rate": 0.00019055701646387952, + "loss": 0.84199888, + "num_input_tokens_seen": 310776816, + "router_z_loss_mlp": 0.25549316, + "step": 3748, + "time_per_iteration": 2.641214609146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012229, + "balance_loss_mlp": 1.00474262, + "epoch": 0.7212389380530974, + "flos": 1533908606976.0, + "grad_norm": 0.010630398353693617, + "language_loss": 0.80472684, + "learning_rate": 0.00019031236603067042, + "loss": 0.81484914, + "num_input_tokens_seen": 310987056, + "router_z_loss_mlp": 0.07470703, + "step": 3749, + "time_per_iteration": 4.815402507781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066327, + "balance_loss_mlp": 1.04136407, + "epoch": 0.7214313197383609, + "flos": 461511862272.0, + "grad_norm": 0.06404376467324384, + "language_loss": 0.86850023, + "learning_rate": 0.00019006783582886368, + "loss": 0.8791635, + "num_input_tokens_seen": 311051648, + "router_z_loss_mlp": 0.24975586, + "step": 3750, + "time_per_iteration": 2.5772666931152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068921, + "balance_loss_mlp": 1.04362464, + "epoch": 0.7216237014236244, + "flos": 1037134056960.0, + "grad_norm": 0.05743356486239607, + "language_loss": 0.83082181, + "learning_rate": 0.00018982342595339437, + "loss": 0.84151101, + "num_input_tokens_seen": 311146272, + "router_z_loss_mlp": 0.25292969, + "step": 3751, + "time_per_iteration": 3.5001184940338135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074387, + "balance_loss_mlp": 1.04874492, + "epoch": 0.721816083108888, + "flos": 895951466496.0, + "grad_norm": 0.12990726200021083, + "language_loss": 0.82180882, + "learning_rate": 0.00018957913649915076, + "loss": 0.83255273, + "num_input_tokens_seen": 311223760, + "router_z_loss_mlp": 0.25646973, + "step": 3752, + "time_per_iteration": 3.160003900527954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069675, + "balance_loss_mlp": 1.0439254, + "epoch": 0.7220084647941516, + "flos": 523314556416.0, + "grad_norm": 0.06468827882865268, + "language_loss": 0.80619174, + "learning_rate": 0.00018933496756097428, + "loss": 0.81688845, + "num_input_tokens_seen": 311290336, + "router_z_loss_mlp": 0.2578125, + "step": 3753, + "time_per_iteration": 2.5997426509857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066394, + "balance_loss_mlp": 1.04083598, + "epoch": 0.7222008464794152, + "flos": 816099494400.0, + "grad_norm": 0.06037343169471402, + "language_loss": 0.81664622, + "learning_rate": 0.0001890909192336603, + "loss": 0.8273102, + "num_input_tokens_seen": 311366240, + "router_z_loss_mlp": 0.2557373, + "step": 3754, + "time_per_iteration": 3.018083095550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069023, + "balance_loss_mlp": 1.04364371, + "epoch": 0.7223932281646788, + "flos": 749053702656.0, + "grad_norm": 0.056170219084609056, + "language_loss": 0.70541704, + "learning_rate": 0.00018884699161195623, + "loss": 0.71610725, + "num_input_tokens_seen": 311445184, + "router_z_loss_mlp": 0.25390625, + "step": 3755, + "time_per_iteration": 2.947492837905884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068503, + "balance_loss_mlp": 1.04259872, + "epoch": 0.7225856098499422, + "flos": 745502870016.0, + "grad_norm": 0.08930664907788496, + "language_loss": 0.77445567, + "learning_rate": 0.00018860318479056327, + "loss": 0.78514069, + "num_input_tokens_seen": 311527280, + "router_z_loss_mlp": 0.25939941, + "step": 3756, + "time_per_iteration": 3.133481740951538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075638, + "balance_loss_mlp": 1.05084264, + "epoch": 0.7227779915352058, + "flos": 547330825728.0, + "grad_norm": 0.05236327296273719, + "language_loss": 0.83486211, + "learning_rate": 0.00018835949886413555, + "loss": 0.84561849, + "num_input_tokens_seen": 311601552, + "router_z_loss_mlp": 0.24804688, + "step": 3757, + "time_per_iteration": 2.7377569675445557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071399, + "balance_loss_mlp": 1.04592407, + "epoch": 0.7229703732204694, + "flos": 530484857856.0, + "grad_norm": 0.06766060207164688, + "language_loss": 0.79256356, + "learning_rate": 0.0001881159339272806, + "loss": 0.80327755, + "num_input_tokens_seen": 311670736, + "router_z_loss_mlp": 0.25476074, + "step": 3758, + "time_per_iteration": 2.6691012382507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106811, + "balance_loss_mlp": 1.04336238, + "epoch": 0.723162754905733, + "flos": 528355021824.0, + "grad_norm": 0.06062364795368716, + "language_loss": 0.78869492, + "learning_rate": 0.00018787249007455858, + "loss": 0.79937607, + "num_input_tokens_seen": 311736800, + "router_z_loss_mlp": 0.24731445, + "step": 3759, + "time_per_iteration": 2.628452777862549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072784, + "balance_loss_mlp": 1.04866767, + "epoch": 0.7233551365909965, + "flos": 654868468224.0, + "grad_norm": 0.05921726316721053, + "language_loss": 0.71849477, + "learning_rate": 0.00018762916740048302, + "loss": 0.7292226, + "num_input_tokens_seen": 311806064, + "router_z_loss_mlp": 0.24108887, + "step": 3760, + "time_per_iteration": 4.164097547531128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074503, + "balance_loss_mlp": 1.04969609, + "epoch": 0.7235475182762601, + "flos": 522365635584.0, + "grad_norm": 0.05859039427854228, + "language_loss": 0.85892487, + "learning_rate": 0.0001873859659995195, + "loss": 0.86966991, + "num_input_tokens_seen": 311881280, + "router_z_loss_mlp": 0.24816895, + "step": 3761, + "time_per_iteration": 2.7077507972717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076187, + "balance_loss_mlp": 1.05047345, + "epoch": 0.7237398999615237, + "flos": 609170595840.0, + "grad_norm": 0.05612829292688987, + "language_loss": 0.8333689, + "learning_rate": 0.0001871428859660878, + "loss": 0.84413075, + "num_input_tokens_seen": 311953696, + "router_z_loss_mlp": 0.25744629, + "step": 3762, + "time_per_iteration": 2.7349491119384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070635, + "balance_loss_mlp": 1.04679286, + "epoch": 0.7239322816467872, + "flos": 658987176960.0, + "grad_norm": 0.05320593884566549, + "language_loss": 0.82095098, + "learning_rate": 0.00018689992739455975, + "loss": 0.83165729, + "num_input_tokens_seen": 312032752, + "router_z_loss_mlp": 0.23828125, + "step": 3763, + "time_per_iteration": 2.9456627368927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073777, + "balance_loss_mlp": 1.04832602, + "epoch": 0.7241246633320508, + "flos": 969282878976.0, + "grad_norm": 0.05110197345931534, + "language_loss": 0.86203957, + "learning_rate": 0.00018665709037926027, + "loss": 0.87277734, + "num_input_tokens_seen": 312120800, + "router_z_loss_mlp": 0.25476074, + "step": 3764, + "time_per_iteration": 3.318403959274292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068875, + "balance_loss_mlp": 1.04516387, + "epoch": 0.7243170450173143, + "flos": 514995273216.0, + "grad_norm": 0.06311256302273614, + "language_loss": 0.85269356, + "learning_rate": 0.00018641437501446694, + "loss": 0.86338234, + "num_input_tokens_seen": 312188416, + "router_z_loss_mlp": 0.23693848, + "step": 3765, + "time_per_iteration": 2.6275501251220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077113, + "balance_loss_mlp": 1.05141139, + "epoch": 0.7245094267025779, + "flos": 559746796032.0, + "grad_norm": 0.06293710681021243, + "language_loss": 0.82769656, + "learning_rate": 0.0001861717813944104, + "loss": 0.83846772, + "num_input_tokens_seen": 312257792, + "router_z_loss_mlp": 0.25744629, + "step": 3766, + "time_per_iteration": 2.6608469486236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074023, + "balance_loss_mlp": 1.04876232, + "epoch": 0.7247018083878415, + "flos": 612642134016.0, + "grad_norm": 0.06015775700699107, + "language_loss": 0.79908741, + "learning_rate": 0.00018592930961327365, + "loss": 0.80982769, + "num_input_tokens_seen": 312328544, + "router_z_loss_mlp": 0.25280762, + "step": 3767, + "time_per_iteration": 2.7321486473083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107651, + "balance_loss_mlp": 1.05160677, + "epoch": 0.7248941900731051, + "flos": 634676871168.0, + "grad_norm": 0.056564551709211236, + "language_loss": 0.88070989, + "learning_rate": 0.00018568695976519273, + "loss": 0.89147508, + "num_input_tokens_seen": 312405888, + "router_z_loss_mlp": 0.24890137, + "step": 3768, + "time_per_iteration": 2.7732081413269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073175, + "balance_loss_mlp": 1.04744947, + "epoch": 0.7250865717583687, + "flos": 424941230592.0, + "grad_norm": 0.06484399949200302, + "language_loss": 0.80721432, + "learning_rate": 0.00018544473194425593, + "loss": 0.81794608, + "num_input_tokens_seen": 312469552, + "router_z_loss_mlp": 0.25744629, + "step": 3769, + "time_per_iteration": 2.489635467529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069994, + "balance_loss_mlp": 1.04453063, + "epoch": 0.7252789534436321, + "flos": 635114068992.0, + "grad_norm": 0.06360093923079267, + "language_loss": 0.78936434, + "learning_rate": 0.00018520262624450485, + "loss": 0.80006427, + "num_input_tokens_seen": 312548848, + "router_z_loss_mlp": 0.25488281, + "step": 3770, + "time_per_iteration": 2.874816417694092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070988, + "balance_loss_mlp": 1.04658556, + "epoch": 0.7254713351288957, + "flos": 617185930752.0, + "grad_norm": 0.05111495515347452, + "language_loss": 0.87226415, + "learning_rate": 0.00018496064275993324, + "loss": 0.88297403, + "num_input_tokens_seen": 312622016, + "router_z_loss_mlp": 0.24377441, + "step": 3771, + "time_per_iteration": 2.7426414489746094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070976, + "balance_loss_mlp": 1.04448795, + "epoch": 0.7256637168141593, + "flos": 766986983424.0, + "grad_norm": 0.06635315591168078, + "language_loss": 0.82333881, + "learning_rate": 0.00018471878158448686, + "loss": 0.83404857, + "num_input_tokens_seen": 312696960, + "router_z_loss_mlp": 0.26538086, + "step": 3772, + "time_per_iteration": 2.927983283996582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073436, + "balance_loss_mlp": 1.04748392, + "epoch": 0.7258560984994229, + "flos": 495559503360.0, + "grad_norm": 0.0478676363130983, + "language_loss": 0.84174544, + "learning_rate": 0.00018447704281206512, + "loss": 0.85247982, + "num_input_tokens_seen": 312774352, + "router_z_loss_mlp": 0.25964355, + "step": 3773, + "time_per_iteration": 2.863914966583252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068748, + "balance_loss_mlp": 1.04243803, + "epoch": 0.7260484801846864, + "flos": 530069681664.0, + "grad_norm": 0.056210264368279125, + "language_loss": 0.83150065, + "learning_rate": 0.0001842354265365191, + "loss": 0.84218812, + "num_input_tokens_seen": 312849600, + "router_z_loss_mlp": 0.26330566, + "step": 3774, + "time_per_iteration": 2.6950740814208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107373, + "balance_loss_mlp": 1.04815984, + "epoch": 0.72624086186995, + "flos": 624964128768.0, + "grad_norm": 0.08533819626854355, + "language_loss": 0.81115055, + "learning_rate": 0.0001839939328516526, + "loss": 0.82188785, + "num_input_tokens_seen": 312922688, + "router_z_loss_mlp": 0.25598145, + "step": 3775, + "time_per_iteration": 2.7223706245422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075372, + "balance_loss_mlp": 1.04981351, + "epoch": 0.7264332435552135, + "flos": 716522858496.0, + "grad_norm": 0.08605287501334834, + "language_loss": 0.81360769, + "learning_rate": 0.0001837525618512218, + "loss": 0.82436144, + "num_input_tokens_seen": 312997728, + "router_z_loss_mlp": 0.2557373, + "step": 3776, + "time_per_iteration": 2.874652624130249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067677, + "balance_loss_mlp": 1.04284596, + "epoch": 0.7266256252404771, + "flos": 681036968448.0, + "grad_norm": 0.060733174286640615, + "language_loss": 0.83042395, + "learning_rate": 0.00018351131362893519, + "loss": 0.84110069, + "num_input_tokens_seen": 313067168, + "router_z_loss_mlp": 0.24841309, + "step": 3777, + "time_per_iteration": 2.801011323928833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070149, + "balance_loss_mlp": 1.04434013, + "epoch": 0.7268180069257407, + "flos": 518906580480.0, + "grad_norm": 0.06246763883136397, + "language_loss": 0.80644751, + "learning_rate": 0.00018327018827845364, + "loss": 0.81714904, + "num_input_tokens_seen": 313134688, + "router_z_loss_mlp": 0.25842285, + "step": 3778, + "time_per_iteration": 2.5989460945129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107324, + "balance_loss_mlp": 1.04869461, + "epoch": 0.7270103886110042, + "flos": 512662804992.0, + "grad_norm": 0.05228982381822259, + "language_loss": 0.87562966, + "learning_rate": 0.00018302918589339036, + "loss": 0.88636208, + "num_input_tokens_seen": 313204816, + "router_z_loss_mlp": 0.2454834, + "step": 3779, + "time_per_iteration": 2.6237618923187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073848, + "balance_loss_mlp": 1.04871857, + "epoch": 0.7272027702962678, + "flos": 546653919744.0, + "grad_norm": 0.06453049409533262, + "language_loss": 0.90400481, + "learning_rate": 0.00018278830656731054, + "loss": 0.9147433, + "num_input_tokens_seen": 313274288, + "router_z_loss_mlp": 0.25158691, + "step": 3780, + "time_per_iteration": 2.6516594886779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069473, + "balance_loss_mlp": 1.04534531, + "epoch": 0.7273951519815314, + "flos": 593048521728.0, + "grad_norm": 0.050403453356215815, + "language_loss": 0.86580253, + "learning_rate": 0.00018254755039373222, + "loss": 0.87649727, + "num_input_tokens_seen": 313344800, + "router_z_loss_mlp": 0.24121094, + "step": 3781, + "time_per_iteration": 2.7791805267333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078472, + "balance_loss_mlp": 1.05350983, + "epoch": 0.727587533666795, + "flos": 606012917760.0, + "grad_norm": 0.06136859684084447, + "language_loss": 0.83780336, + "learning_rate": 0.0001823069174661252, + "loss": 0.84858811, + "num_input_tokens_seen": 313417840, + "router_z_loss_mlp": 0.24963379, + "step": 3782, + "time_per_iteration": 2.8298797607421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069254, + "balance_loss_mlp": 1.0445894, + "epoch": 0.7277799153520584, + "flos": 513021081600.0, + "grad_norm": 0.05448040343195996, + "language_loss": 0.78343076, + "learning_rate": 0.00018206640787791112, + "loss": 0.79412329, + "num_input_tokens_seen": 313485936, + "router_z_loss_mlp": 0.2467041, + "step": 3783, + "time_per_iteration": 2.609013795852661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071668, + "balance_loss_mlp": 1.0477066, + "epoch": 0.727972297037322, + "flos": 537756475392.0, + "grad_norm": 0.057564515393037245, + "language_loss": 0.85957235, + "learning_rate": 0.00018182602172246416, + "loss": 0.87028909, + "num_input_tokens_seen": 313553136, + "router_z_loss_mlp": 0.23974609, + "step": 3784, + "time_per_iteration": 2.6400623321533203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072013, + "balance_loss_mlp": 1.04888618, + "epoch": 0.7281646787225856, + "flos": 535038566400.0, + "grad_norm": 0.060673398412002894, + "language_loss": 0.76418436, + "learning_rate": 0.00018158575909311075, + "loss": 0.77490449, + "num_input_tokens_seen": 313620128, + "router_z_loss_mlp": 0.23132324, + "step": 3785, + "time_per_iteration": 2.64180850982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079039, + "balance_loss_mlp": 1.05404127, + "epoch": 0.7283570604078492, + "flos": 625055533056.0, + "grad_norm": 0.06019097733888483, + "language_loss": 0.8038618, + "learning_rate": 0.000181345620083129, + "loss": 0.8146522, + "num_input_tokens_seen": 313696432, + "router_z_loss_mlp": 0.24987793, + "step": 3786, + "time_per_iteration": 2.8254077434539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075124, + "balance_loss_mlp": 1.05057859, + "epoch": 0.7285494420931128, + "flos": 534173709312.0, + "grad_norm": 0.056512794901340806, + "language_loss": 0.86981964, + "learning_rate": 0.00018110560478574927, + "loss": 0.88057089, + "num_input_tokens_seen": 313768416, + "router_z_loss_mlp": 0.2454834, + "step": 3787, + "time_per_iteration": 2.6989898681640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107191, + "balance_loss_mlp": 1.04607677, + "epoch": 0.7287418237783763, + "flos": 666548061696.0, + "grad_norm": 0.0653462875447768, + "language_loss": 0.80641389, + "learning_rate": 0.0001808657132941533, + "loss": 0.81713301, + "num_input_tokens_seen": 313839888, + "router_z_loss_mlp": 0.25830078, + "step": 3788, + "time_per_iteration": 2.7848241329193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076986, + "balance_loss_mlp": 1.05143917, + "epoch": 0.7289342054636399, + "flos": 550602302976.0, + "grad_norm": 0.06505823149164586, + "language_loss": 0.8307749, + "learning_rate": 0.00018062594570147572, + "loss": 0.84154475, + "num_input_tokens_seen": 313908832, + "router_z_loss_mlp": 0.25549316, + "step": 3789, + "time_per_iteration": 2.633287191390991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070828, + "balance_loss_mlp": 1.046152, + "epoch": 0.7291265871489034, + "flos": 687923145216.0, + "grad_norm": 0.05002031972924792, + "language_loss": 0.85413891, + "learning_rate": 0.00018038630210080243, + "loss": 0.86484718, + "num_input_tokens_seen": 313982672, + "router_z_loss_mlp": 0.24658203, + "step": 3790, + "time_per_iteration": 2.866363286972046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072918, + "balance_loss_mlp": 1.04853952, + "epoch": 0.729318968834167, + "flos": 572664204288.0, + "grad_norm": 0.05805310793954541, + "language_loss": 0.85253292, + "learning_rate": 0.0001801467825851712, + "loss": 0.86326218, + "num_input_tokens_seen": 314057184, + "router_z_loss_mlp": 0.24401855, + "step": 3791, + "time_per_iteration": 2.728860378265381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071802, + "balance_loss_mlp": 1.04638696, + "epoch": 0.7295113505194305, + "flos": 586061028864.0, + "grad_norm": 0.14519807994310208, + "language_loss": 0.78306311, + "learning_rate": 0.00017990738724757172, + "loss": 0.79378116, + "num_input_tokens_seen": 314137344, + "router_z_loss_mlp": 0.25427246, + "step": 3792, + "time_per_iteration": 2.8468916416168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068198, + "balance_loss_mlp": 1.04319978, + "epoch": 0.7297037322046941, + "flos": 707185645056.0, + "grad_norm": 0.05959978185176886, + "language_loss": 0.8250258, + "learning_rate": 0.00017966811618094598, + "loss": 0.83570778, + "num_input_tokens_seen": 314214464, + "router_z_loss_mlp": 0.24987793, + "step": 3793, + "time_per_iteration": 2.909195899963379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077382, + "balance_loss_mlp": 1.05095315, + "epoch": 0.7298961138899577, + "flos": 487292350464.0, + "grad_norm": 0.06013443658312294, + "language_loss": 0.8499018, + "learning_rate": 0.00017942896947818664, + "loss": 0.86067569, + "num_input_tokens_seen": 314280432, + "router_z_loss_mlp": 0.26452637, + "step": 3794, + "time_per_iteration": 2.555316209793091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008896, + "balance_loss_mlp": 1.00121939, + "epoch": 0.7300884955752213, + "flos": 1365804260352.0, + "grad_norm": 0.014415453052224393, + "language_loss": 0.74825054, + "learning_rate": 0.000179189947232139, + "loss": 0.75833952, + "num_input_tokens_seen": 314497152, + "router_z_loss_mlp": 0.07666016, + "step": 3795, + "time_per_iteration": 4.844003200531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067336, + "balance_loss_mlp": 1.04246938, + "epoch": 0.7302808772604849, + "flos": 531806736384.0, + "grad_norm": 0.07521259733742676, + "language_loss": 0.8533113, + "learning_rate": 0.00017895104953559947, + "loss": 0.86398464, + "num_input_tokens_seen": 314565488, + "router_z_loss_mlp": 0.24865723, + "step": 3796, + "time_per_iteration": 2.5970304012298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074283, + "balance_loss_mlp": 1.04881954, + "epoch": 0.7304732589457483, + "flos": 436171143168.0, + "grad_norm": 0.082255252193866, + "language_loss": 0.8954308, + "learning_rate": 0.00017871227648131672, + "loss": 0.90617365, + "num_input_tokens_seen": 314627392, + "router_z_loss_mlp": 0.25476074, + "step": 3797, + "time_per_iteration": 2.5412604808807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066925, + "balance_loss_mlp": 1.0418191, + "epoch": 0.7306656406310119, + "flos": 451621080576.0, + "grad_norm": 0.050248722250274616, + "language_loss": 0.8297137, + "learning_rate": 0.0001784736281619907, + "loss": 0.84038293, + "num_input_tokens_seen": 314695440, + "router_z_loss_mlp": 0.25134277, + "step": 3798, + "time_per_iteration": 2.5844838619232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068914, + "balance_loss_mlp": 1.04355788, + "epoch": 0.7308580223162755, + "flos": 512010491904.0, + "grad_norm": 0.07691325106249959, + "language_loss": 0.7466501, + "learning_rate": 0.00017823510467027232, + "loss": 0.75733924, + "num_input_tokens_seen": 314772592, + "router_z_loss_mlp": 0.25341797, + "step": 3799, + "time_per_iteration": 2.777209520339966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071842, + "balance_loss_mlp": 1.04620039, + "epoch": 0.7310504040015391, + "flos": 375423455232.0, + "grad_norm": 0.08066489228669042, + "language_loss": 0.7834214, + "learning_rate": 0.00017799670609876516, + "loss": 0.79413986, + "num_input_tokens_seen": 314836192, + "router_z_loss_mlp": 0.25671387, + "step": 3800, + "time_per_iteration": 2.5069777965545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107102, + "balance_loss_mlp": 1.04590285, + "epoch": 0.7312427856868026, + "flos": 549334752768.0, + "grad_norm": 0.05293495483873373, + "language_loss": 0.88974595, + "learning_rate": 0.00017775843254002366, + "loss": 0.90045619, + "num_input_tokens_seen": 314908400, + "router_z_loss_mlp": 0.2512207, + "step": 3801, + "time_per_iteration": 2.725081443786621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077058, + "balance_loss_mlp": 1.0519762, + "epoch": 0.7314351673720662, + "flos": 767238801408.0, + "grad_norm": 0.05473119278948026, + "language_loss": 0.84161508, + "learning_rate": 0.00017752028408655367, + "loss": 0.85238564, + "num_input_tokens_seen": 314995280, + "router_z_loss_mlp": 0.25097656, + "step": 3802, + "time_per_iteration": 3.025043249130249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075279, + "balance_loss_mlp": 1.04994678, + "epoch": 0.7316275490573297, + "flos": 486734012928.0, + "grad_norm": 0.05406841313546952, + "language_loss": 0.85023701, + "learning_rate": 0.00017728226083081272, + "loss": 0.86098975, + "num_input_tokens_seen": 315063056, + "router_z_loss_mlp": 0.25354004, + "step": 3803, + "time_per_iteration": 2.556396245956421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078247, + "balance_loss_mlp": 1.05355895, + "epoch": 0.7318199307425933, + "flos": 473428592640.0, + "grad_norm": 0.06231590720725376, + "language_loss": 0.81697959, + "learning_rate": 0.00017704436286520965, + "loss": 0.82776201, + "num_input_tokens_seen": 315128896, + "router_z_loss_mlp": 0.24682617, + "step": 3804, + "time_per_iteration": 2.5290911197662354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078799, + "balance_loss_mlp": 1.05242968, + "epoch": 0.7320123124278569, + "flos": 549463233024.0, + "grad_norm": 0.06772838198197546, + "language_loss": 0.84615296, + "learning_rate": 0.0001768065902821046, + "loss": 0.85694098, + "num_input_tokens_seen": 315198464, + "router_z_loss_mlp": 0.26379395, + "step": 3805, + "time_per_iteration": 2.6657214164733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072852, + "balance_loss_mlp": 1.04691195, + "epoch": 0.7322046941131204, + "flos": 570781416960.0, + "grad_norm": 0.06439046141851584, + "language_loss": 0.82463551, + "learning_rate": 0.00017656894317380907, + "loss": 0.83536404, + "num_input_tokens_seen": 315270240, + "router_z_loss_mlp": 0.25976562, + "step": 3806, + "time_per_iteration": 2.7381749153137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008677, + "balance_loss_mlp": 1.00085652, + "epoch": 0.732397075798384, + "flos": 1469165548032.0, + "grad_norm": 0.011036115367728498, + "language_loss": 0.76031268, + "learning_rate": 0.00017633142163258565, + "loss": 0.77039945, + "num_input_tokens_seen": 315502448, + "router_z_loss_mlp": 0.078125, + "step": 3807, + "time_per_iteration": 5.021719217300415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074035, + "balance_loss_mlp": 1.04846501, + "epoch": 0.7325894574836476, + "flos": 464862260736.0, + "grad_norm": 0.059101144317775495, + "language_loss": 0.84063375, + "learning_rate": 0.00017609402575064875, + "loss": 0.85137415, + "num_input_tokens_seen": 315569472, + "router_z_loss_mlp": 0.25585938, + "step": 3808, + "time_per_iteration": 2.601905345916748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070564, + "balance_loss_mlp": 1.04569697, + "epoch": 0.7327818391689112, + "flos": 495493065216.0, + "grad_norm": 0.06287307202427123, + "language_loss": 0.81348085, + "learning_rate": 0.00017585675562016367, + "loss": 0.8241865, + "num_input_tokens_seen": 315637632, + "router_z_loss_mlp": 0.2487793, + "step": 3809, + "time_per_iteration": 2.5671656131744385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101117, + "balance_loss_mlp": 1.0035404, + "epoch": 0.7329742208541746, + "flos": 1433489508864.0, + "grad_norm": 0.009961164092808575, + "language_loss": 0.77212846, + "learning_rate": 0.0001756196113332465, + "loss": 0.78224015, + "num_input_tokens_seen": 315863648, + "router_z_loss_mlp": 0.07617188, + "step": 3810, + "time_per_iteration": 4.85601019859314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067566, + "balance_loss_mlp": 1.04141164, + "epoch": 0.7331666025394382, + "flos": 496889095680.0, + "grad_norm": 0.0717218178349286, + "language_loss": 0.85344338, + "learning_rate": 0.00017538259298196474, + "loss": 0.86411905, + "num_input_tokens_seen": 315930752, + "router_z_loss_mlp": 0.26171875, + "step": 3811, + "time_per_iteration": 2.5674660205841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066538, + "balance_loss_mlp": 1.0418613, + "epoch": 0.7333589842247018, + "flos": 538524785664.0, + "grad_norm": 0.06191722538005279, + "language_loss": 0.8221786, + "learning_rate": 0.00017514570065833745, + "loss": 0.83284396, + "num_input_tokens_seen": 316006400, + "router_z_loss_mlp": 0.24658203, + "step": 3812, + "time_per_iteration": 2.7502520084381104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065481, + "balance_loss_mlp": 1.04084063, + "epoch": 0.7335513659099654, + "flos": 491067836928.0, + "grad_norm": 0.09654235990380512, + "language_loss": 0.80427462, + "learning_rate": 0.00017490893445433426, + "loss": 0.81492949, + "num_input_tokens_seen": 316075824, + "router_z_loss_mlp": 0.24633789, + "step": 3813, + "time_per_iteration": 2.644380569458008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067064, + "balance_loss_mlp": 1.04200649, + "epoch": 0.733743747595229, + "flos": 562150844928.0, + "grad_norm": 0.05501039024116298, + "language_loss": 0.81422758, + "learning_rate": 0.00017467229446187587, + "loss": 0.82489812, + "num_input_tokens_seen": 316148336, + "router_z_loss_mlp": 0.25061035, + "step": 3814, + "time_per_iteration": 2.6799376010894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072542, + "balance_loss_mlp": 1.04665017, + "epoch": 0.7339361292804925, + "flos": 538581685248.0, + "grad_norm": 0.054283563918009155, + "language_loss": 0.81726635, + "learning_rate": 0.00017443578077283424, + "loss": 0.82799172, + "num_input_tokens_seen": 316220960, + "router_z_loss_mlp": 0.2590332, + "step": 3815, + "time_per_iteration": 2.6411497592926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072187, + "balance_loss_mlp": 1.04731965, + "epoch": 0.734128510965756, + "flos": 548469895680.0, + "grad_norm": 0.06697852947124575, + "language_loss": 0.85358864, + "learning_rate": 0.0001741993934790319, + "loss": 0.8643105, + "num_input_tokens_seen": 316295824, + "router_z_loss_mlp": 0.24853516, + "step": 3816, + "time_per_iteration": 2.813728094100952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106954, + "balance_loss_mlp": 1.04457784, + "epoch": 0.7343208926510196, + "flos": 540066548736.0, + "grad_norm": 0.07301575323096621, + "language_loss": 0.83966112, + "learning_rate": 0.00017396313267224273, + "loss": 0.85035658, + "num_input_tokens_seen": 316368064, + "router_z_loss_mlp": 0.24963379, + "step": 3817, + "time_per_iteration": 2.7044739723205566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074705, + "balance_loss_mlp": 1.04939699, + "epoch": 0.7345132743362832, + "flos": 571095277056.0, + "grad_norm": 0.05834260982052782, + "language_loss": 0.88725907, + "learning_rate": 0.0001737269984441912, + "loss": 0.89800614, + "num_input_tokens_seen": 316437440, + "router_z_loss_mlp": 0.2532959, + "step": 3818, + "time_per_iteration": 2.6479249000549316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068069, + "balance_loss_mlp": 1.04333293, + "epoch": 0.7347056560215467, + "flos": 545403621888.0, + "grad_norm": 0.04867070462384417, + "language_loss": 0.85300821, + "learning_rate": 0.00017349099088655263, + "loss": 0.86368895, + "num_input_tokens_seen": 316511936, + "router_z_loss_mlp": 0.24743652, + "step": 3819, + "time_per_iteration": 2.687084197998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068808, + "balance_loss_mlp": 1.04391694, + "epoch": 0.7348980377068103, + "flos": 595949239296.0, + "grad_norm": 0.05808726713133537, + "language_loss": 0.81269497, + "learning_rate": 0.00017325511009095375, + "loss": 0.82338297, + "num_input_tokens_seen": 316584304, + "router_z_loss_mlp": 0.24902344, + "step": 3820, + "time_per_iteration": 2.7207248210906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068942, + "balance_loss_mlp": 1.04350281, + "epoch": 0.7350904193920739, + "flos": 538554521088.0, + "grad_norm": 0.05934097649534438, + "language_loss": 0.83911049, + "learning_rate": 0.00017301935614897113, + "loss": 0.84979987, + "num_input_tokens_seen": 316659024, + "router_z_loss_mlp": 0.2545166, + "step": 3821, + "time_per_iteration": 2.6836743354797363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073907, + "balance_loss_mlp": 1.04855156, + "epoch": 0.7352828010773375, + "flos": 512981434368.0, + "grad_norm": 0.0494453398159371, + "language_loss": 0.81605434, + "learning_rate": 0.00017278372915213274, + "loss": 0.82679343, + "num_input_tokens_seen": 316732544, + "router_z_loss_mlp": 0.25378418, + "step": 3822, + "time_per_iteration": 2.651975393295288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008179, + "balance_loss_mlp": 1.00121737, + "epoch": 0.735475182762601, + "flos": 1553820848640.0, + "grad_norm": 0.007266533432635982, + "language_loss": 0.79893845, + "learning_rate": 0.00017254822919191693, + "loss": 0.80902022, + "num_input_tokens_seen": 316967104, + "router_z_loss_mlp": 0.06982422, + "step": 3823, + "time_per_iteration": 4.976882457733154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075947, + "balance_loss_mlp": 1.05096054, + "epoch": 0.7356675644478645, + "flos": 681308610048.0, + "grad_norm": 0.0894957625662193, + "language_loss": 0.80647838, + "learning_rate": 0.00017231285635975314, + "loss": 0.81723785, + "num_input_tokens_seen": 317048304, + "router_z_loss_mlp": 0.25, + "step": 3824, + "time_per_iteration": 2.8835809230804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074173, + "balance_loss_mlp": 1.04871035, + "epoch": 0.7358599461331281, + "flos": 515215157760.0, + "grad_norm": 0.0659132638478438, + "language_loss": 0.83565962, + "learning_rate": 0.00017207761074702115, + "loss": 0.84640133, + "num_input_tokens_seen": 317115968, + "router_z_loss_mlp": 0.25488281, + "step": 3825, + "time_per_iteration": 2.5829551219940186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107598, + "balance_loss_mlp": 1.05089879, + "epoch": 0.7360523278183917, + "flos": 443973934080.0, + "grad_norm": 0.05423674228427361, + "language_loss": 0.83801639, + "learning_rate": 0.0001718424924450514, + "loss": 0.84877622, + "num_input_tokens_seen": 317185680, + "router_z_loss_mlp": 0.25085449, + "step": 3826, + "time_per_iteration": 2.6215810775756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072026, + "balance_loss_mlp": 1.0464201, + "epoch": 0.7362447095036553, + "flos": 603423489024.0, + "grad_norm": 0.047662784770319516, + "language_loss": 0.86247635, + "learning_rate": 0.00017160750154512482, + "loss": 0.8731966, + "num_input_tokens_seen": 317258800, + "router_z_loss_mlp": 0.25610352, + "step": 3827, + "time_per_iteration": 2.7316274642944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072121, + "balance_loss_mlp": 1.04726601, + "epoch": 0.7364370911889189, + "flos": 553095184896.0, + "grad_norm": 0.05425230647323069, + "language_loss": 0.83439684, + "learning_rate": 0.0001713726381384731, + "loss": 0.84511811, + "num_input_tokens_seen": 317334608, + "router_z_loss_mlp": 0.24841309, + "step": 3828, + "time_per_iteration": 2.7767257690429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107353, + "balance_loss_mlp": 1.04826927, + "epoch": 0.7366294728741823, + "flos": 449061387264.0, + "grad_norm": 0.06782192310346803, + "language_loss": 0.81600618, + "learning_rate": 0.00017113790231627812, + "loss": 0.8267414, + "num_input_tokens_seen": 317397504, + "router_z_loss_mlp": 0.25280762, + "step": 3829, + "time_per_iteration": 2.4791929721832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100591, + "balance_loss_mlp": 0.99885303, + "epoch": 0.7368218545594459, + "flos": 1535502500352.0, + "grad_norm": 0.00950707875200575, + "language_loss": 0.79258227, + "learning_rate": 0.0001709032941696726, + "loss": 0.80264139, + "num_input_tokens_seen": 317611472, + "router_z_loss_mlp": 0.07080078, + "step": 3830, + "time_per_iteration": 6.233624696731567 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075447, + "balance_loss_mlp": 1.05000758, + "epoch": 0.7370142362447095, + "flos": 515425130496.0, + "grad_norm": 0.0605697653719091, + "language_loss": 0.82126367, + "learning_rate": 0.00017066881378973936, + "loss": 0.83201814, + "num_input_tokens_seen": 317681328, + "router_z_loss_mlp": 0.25463867, + "step": 3831, + "time_per_iteration": 2.6804988384246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107347, + "balance_loss_mlp": 1.0483644, + "epoch": 0.7372066179299731, + "flos": 500805172224.0, + "grad_norm": 0.051765900336182155, + "language_loss": 0.83060026, + "learning_rate": 0.00017043446126751189, + "loss": 0.84133494, + "num_input_tokens_seen": 317752336, + "router_z_loss_mlp": 0.2512207, + "step": 3832, + "time_per_iteration": 2.677116870880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078507, + "balance_loss_mlp": 1.05299592, + "epoch": 0.7373989996152366, + "flos": 558083893248.0, + "grad_norm": 0.06293756083772555, + "language_loss": 0.76538479, + "learning_rate": 0.00017020023669397376, + "loss": 0.7761699, + "num_input_tokens_seen": 317824112, + "router_z_loss_mlp": 0.25524902, + "step": 3833, + "time_per_iteration": 2.6688897609710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107645, + "balance_loss_mlp": 1.04953265, + "epoch": 0.7375913813005002, + "flos": 506777306112.0, + "grad_norm": 0.06089571273560201, + "language_loss": 0.81964701, + "learning_rate": 0.0001699661401600589, + "loss": 0.83041155, + "num_input_tokens_seen": 317889120, + "router_z_loss_mlp": 0.26953125, + "step": 3834, + "time_per_iteration": 2.6013684272766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072143, + "balance_loss_mlp": 1.04688239, + "epoch": 0.7377837629857638, + "flos": 486183015936.0, + "grad_norm": 0.05707021957695399, + "language_loss": 0.78780484, + "learning_rate": 0.00016973217175665205, + "loss": 0.79852629, + "num_input_tokens_seen": 317953792, + "router_z_loss_mlp": 0.25268555, + "step": 3835, + "time_per_iteration": 2.5545742511749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004747, + "balance_loss_mlp": 0.99759406, + "epoch": 0.7379761446710273, + "flos": 1414693942272.0, + "grad_norm": 0.011573205656029463, + "language_loss": 0.8116616, + "learning_rate": 0.00016949833157458755, + "loss": 0.82170916, + "num_input_tokens_seen": 318184848, + "router_z_loss_mlp": 0.07128906, + "step": 3836, + "time_per_iteration": 4.935137748718262 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073573, + "balance_loss_mlp": 1.04770422, + "epoch": 0.7381685263562909, + "flos": 629737721856.0, + "grad_norm": 0.05911051824592706, + "language_loss": 0.8443321, + "learning_rate": 0.00016926461970465047, + "loss": 0.85506785, + "num_input_tokens_seen": 318259296, + "router_z_loss_mlp": 0.25878906, + "step": 3837, + "time_per_iteration": 2.753530979156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070473, + "balance_loss_mlp": 1.04638028, + "epoch": 0.7383609080415544, + "flos": 739224589824.0, + "grad_norm": 0.055427222427827466, + "language_loss": 0.84596455, + "learning_rate": 0.00016903103623757516, + "loss": 0.85666919, + "num_input_tokens_seen": 318344704, + "router_z_loss_mlp": 0.2409668, + "step": 3838, + "time_per_iteration": 3.0433106422424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070361, + "balance_loss_mlp": 1.04518437, + "epoch": 0.738553289726818, + "flos": 550206950400.0, + "grad_norm": 0.06096616849216926, + "language_loss": 0.80038297, + "learning_rate": 0.00016879758126404738, + "loss": 0.81108665, + "num_input_tokens_seen": 318416128, + "router_z_loss_mlp": 0.25183105, + "step": 3839, + "time_per_iteration": 2.726783037185669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071404, + "balance_loss_mlp": 1.04598832, + "epoch": 0.7387456714120816, + "flos": 910294640640.0, + "grad_norm": 0.0748668456042948, + "language_loss": 0.80022889, + "learning_rate": 0.00016856425487470216, + "loss": 0.81094301, + "num_input_tokens_seen": 318498128, + "router_z_loss_mlp": 0.25439453, + "step": 3840, + "time_per_iteration": 3.0780324935913086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067415, + "balance_loss_mlp": 1.04228592, + "epoch": 0.7389380530973452, + "flos": 852684807168.0, + "grad_norm": 0.06187629511856373, + "language_loss": 0.79238671, + "learning_rate": 0.00016833105716012486, + "loss": 0.80306083, + "num_input_tokens_seen": 318578048, + "router_z_loss_mlp": 0.25146484, + "step": 3841, + "time_per_iteration": 3.1636850833892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070126, + "balance_loss_mlp": 1.04452026, + "epoch": 0.7391304347826086, + "flos": 817026020352.0, + "grad_norm": 0.05887802150454755, + "language_loss": 0.85242188, + "learning_rate": 0.00016809798821085088, + "loss": 0.86312318, + "num_input_tokens_seen": 318654784, + "router_z_loss_mlp": 0.25622559, + "step": 3842, + "time_per_iteration": 2.990478515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069406, + "balance_loss_mlp": 1.04390705, + "epoch": 0.7393228164678722, + "flos": 572819848704.0, + "grad_norm": 0.051928079352218694, + "language_loss": 0.8929773, + "learning_rate": 0.00016786504811736565, + "loss": 0.90367138, + "num_input_tokens_seen": 318727680, + "router_z_loss_mlp": 0.25524902, + "step": 3843, + "time_per_iteration": 2.6872341632843018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066886, + "balance_loss_mlp": 1.04195881, + "epoch": 0.7395151981531358, + "flos": 685237169664.0, + "grad_norm": 0.06625408386492132, + "language_loss": 0.82992953, + "learning_rate": 0.00016763223697010442, + "loss": 0.84059834, + "num_input_tokens_seen": 318807568, + "router_z_loss_mlp": 0.24938965, + "step": 3844, + "time_per_iteration": 2.9391865730285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068457, + "balance_loss_mlp": 1.04412675, + "epoch": 0.7397075798383994, + "flos": 556366662144.0, + "grad_norm": 0.05828893088019289, + "language_loss": 0.84686291, + "learning_rate": 0.00016739955485945256, + "loss": 0.85754752, + "num_input_tokens_seen": 318881792, + "router_z_loss_mlp": 0.24304199, + "step": 3845, + "time_per_iteration": 2.7142622470855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072077, + "balance_loss_mlp": 1.04656637, + "epoch": 0.739899961523663, + "flos": 546782400000.0, + "grad_norm": 0.07100000886785215, + "language_loss": 0.85870165, + "learning_rate": 0.00016716700187574513, + "loss": 0.86942244, + "num_input_tokens_seen": 318951552, + "router_z_loss_mlp": 0.25537109, + "step": 3846, + "time_per_iteration": 2.6977670192718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067392, + "balance_loss_mlp": 1.04300213, + "epoch": 0.7400923432089265, + "flos": 609190419456.0, + "grad_norm": 0.054057188356913304, + "language_loss": 0.84146428, + "learning_rate": 0.0001669345781092675, + "loss": 0.85213816, + "num_input_tokens_seen": 319022304, + "router_z_loss_mlp": 0.24377441, + "step": 3847, + "time_per_iteration": 2.7265117168426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074624, + "balance_loss_mlp": 1.05013824, + "epoch": 0.7402847248941901, + "flos": 591007518720.0, + "grad_norm": 0.06355688718688712, + "language_loss": 0.87326193, + "learning_rate": 0.0001667022836502546, + "loss": 0.88400817, + "num_input_tokens_seen": 319093200, + "router_z_loss_mlp": 0.24499512, + "step": 3848, + "time_per_iteration": 2.7551324367523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073869, + "balance_loss_mlp": 1.04852557, + "epoch": 0.7404771065794536, + "flos": 477369635328.0, + "grad_norm": 0.08017271540920272, + "language_loss": 0.828776, + "learning_rate": 0.00016647011858889077, + "loss": 0.83951473, + "num_input_tokens_seen": 319159712, + "router_z_loss_mlp": 0.25378418, + "step": 3849, + "time_per_iteration": 2.5299232006073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073654, + "balance_loss_mlp": 1.04783297, + "epoch": 0.7406694882647172, + "flos": 496446755328.0, + "grad_norm": 0.06268234066304752, + "language_loss": 0.85992008, + "learning_rate": 0.00016623808301531056, + "loss": 0.87065661, + "num_input_tokens_seen": 319230544, + "router_z_loss_mlp": 0.25842285, + "step": 3850, + "time_per_iteration": 2.6404004096984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077251, + "balance_loss_mlp": 1.05166864, + "epoch": 0.7408618699499807, + "flos": 562205173248.0, + "grad_norm": 0.07684631062218569, + "language_loss": 0.79265726, + "learning_rate": 0.00016600617701959842, + "loss": 0.80342978, + "num_input_tokens_seen": 319305440, + "router_z_loss_mlp": 0.25610352, + "step": 3851, + "time_per_iteration": 2.719182014465332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007219, + "balance_loss_mlp": 1.00025725, + "epoch": 0.7410542516352443, + "flos": 1388228834304.0, + "grad_norm": 0.009023170879128087, + "language_loss": 0.78843814, + "learning_rate": 0.00016577440069178811, + "loss": 0.79851031, + "num_input_tokens_seen": 319534384, + "router_z_loss_mlp": 0.06982422, + "step": 3852, + "time_per_iteration": 4.949675798416138 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073682, + "balance_loss_mlp": 1.04883838, + "epoch": 0.7412466333205079, + "flos": 669999776256.0, + "grad_norm": 0.05701919948873552, + "language_loss": 0.81264549, + "learning_rate": 0.00016554275412186315, + "loss": 0.82338226, + "num_input_tokens_seen": 319610960, + "router_z_loss_mlp": 0.24853516, + "step": 3853, + "time_per_iteration": 2.843740701675415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074744, + "balance_loss_mlp": 1.04884005, + "epoch": 0.7414390150057715, + "flos": 489293706240.0, + "grad_norm": 0.06536701062861092, + "language_loss": 0.80980605, + "learning_rate": 0.0001653112373997568, + "loss": 0.82055348, + "num_input_tokens_seen": 319683872, + "router_z_loss_mlp": 0.25927734, + "step": 3854, + "time_per_iteration": 2.65200138092041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073741, + "balance_loss_mlp": 1.04929078, + "epoch": 0.7416313966910351, + "flos": 599393613312.0, + "grad_norm": 0.06830067718858168, + "language_loss": 0.74823475, + "learning_rate": 0.0001650798506153517, + "loss": 0.75897211, + "num_input_tokens_seen": 319750032, + "router_z_loss_mlp": 0.24450684, + "step": 3855, + "time_per_iteration": 2.687006950378418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070882, + "balance_loss_mlp": 1.04607463, + "epoch": 0.7418237783762985, + "flos": 542539980288.0, + "grad_norm": 0.07905469083436836, + "language_loss": 0.84182036, + "learning_rate": 0.00016484859385848023, + "loss": 0.85252917, + "num_input_tokens_seen": 319818864, + "router_z_loss_mlp": 0.24816895, + "step": 3856, + "time_per_iteration": 2.6188693046569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072107, + "balance_loss_mlp": 1.0480032, + "epoch": 0.7420161600615621, + "flos": 544136071680.0, + "grad_norm": 0.061726371250172385, + "language_loss": 0.77338076, + "learning_rate": 0.0001646174672189243, + "loss": 0.7841019, + "num_input_tokens_seen": 319888816, + "router_z_loss_mlp": 0.24108887, + "step": 3857, + "time_per_iteration": 2.649557590484619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067204, + "balance_loss_mlp": 1.0426352, + "epoch": 0.7422085417468257, + "flos": 527178875904.0, + "grad_norm": 0.0578395137702567, + "language_loss": 0.80607724, + "learning_rate": 0.00016438647078641488, + "loss": 0.81674922, + "num_input_tokens_seen": 319956176, + "router_z_loss_mlp": 0.24572754, + "step": 3858, + "time_per_iteration": 2.619621515274048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072571, + "balance_loss_mlp": 1.04788327, + "epoch": 0.7424009234320893, + "flos": 508674774528.0, + "grad_norm": 0.06183948781118283, + "language_loss": 0.83172727, + "learning_rate": 0.00016415560465063344, + "loss": 0.842453, + "num_input_tokens_seen": 320028560, + "router_z_loss_mlp": 0.24694824, + "step": 3859, + "time_per_iteration": 2.7068328857421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067453, + "balance_loss_mlp": 1.04234803, + "epoch": 0.7425933051173528, + "flos": 512598564864.0, + "grad_norm": 0.07149126280637065, + "language_loss": 0.79273307, + "learning_rate": 0.0001639248689012095, + "loss": 0.80340761, + "num_input_tokens_seen": 320096112, + "router_z_loss_mlp": 0.2512207, + "step": 3860, + "time_per_iteration": 2.559715986251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069308, + "balance_loss_mlp": 1.04384458, + "epoch": 0.7427856868026164, + "flos": 458302053888.0, + "grad_norm": 0.06474025834236737, + "language_loss": 0.87225401, + "learning_rate": 0.00016369426362772271, + "loss": 0.88294709, + "num_input_tokens_seen": 320168992, + "router_z_loss_mlp": 0.25463867, + "step": 3861, + "time_per_iteration": 2.768488883972168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071358, + "balance_loss_mlp": 1.0464673, + "epoch": 0.74297806848788, + "flos": 605019580416.0, + "grad_norm": 0.05729012917412524, + "language_loss": 0.80612242, + "learning_rate": 0.00016346378891970233, + "loss": 0.816836, + "num_input_tokens_seen": 320247264, + "router_z_loss_mlp": 0.24890137, + "step": 3862, + "time_per_iteration": 2.805666923522949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107111, + "balance_loss_mlp": 1.04621959, + "epoch": 0.7431704501731435, + "flos": 891390044160.0, + "grad_norm": 0.054983080042834975, + "language_loss": 0.81883794, + "learning_rate": 0.00016323344486662633, + "loss": 0.82954907, + "num_input_tokens_seen": 320338992, + "router_z_loss_mlp": 0.24902344, + "step": 3863, + "time_per_iteration": 3.301302671432495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072159, + "balance_loss_mlp": 1.04673147, + "epoch": 0.7433628318584071, + "flos": 592163841024.0, + "grad_norm": 0.05456395021125743, + "language_loss": 0.78892124, + "learning_rate": 0.00016300323155792247, + "loss": 0.7996428, + "num_input_tokens_seen": 320422096, + "router_z_loss_mlp": 0.2545166, + "step": 3864, + "time_per_iteration": 2.8931703567504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066338, + "balance_loss_mlp": 1.0422101, + "epoch": 0.7435552135436706, + "flos": 477154520064.0, + "grad_norm": 0.05569760658066131, + "language_loss": 0.88605452, + "learning_rate": 0.00016277314908296687, + "loss": 0.89671785, + "num_input_tokens_seen": 320492640, + "router_z_loss_mlp": 0.24121094, + "step": 3865, + "time_per_iteration": 2.684453248977661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071833, + "balance_loss_mlp": 1.04628646, + "epoch": 0.7437475952289342, + "flos": 673184618496.0, + "grad_norm": 0.09698057624651829, + "language_loss": 0.75883031, + "learning_rate": 0.00016254319753108604, + "loss": 0.76954859, + "num_input_tokens_seen": 320565264, + "router_z_loss_mlp": 0.25561523, + "step": 3866, + "time_per_iteration": 2.8249847888946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072201, + "balance_loss_mlp": 1.04626155, + "epoch": 0.7439399769141978, + "flos": 770428786176.0, + "grad_norm": 0.06903603879321982, + "language_loss": 0.76659936, + "learning_rate": 0.00016231337699155492, + "loss": 0.7773214, + "num_input_tokens_seen": 320647584, + "router_z_loss_mlp": 0.25964355, + "step": 3867, + "time_per_iteration": 2.954054594039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074419, + "balance_loss_mlp": 1.04905081, + "epoch": 0.7441323585994614, + "flos": 647777088000.0, + "grad_norm": 0.052812057289516566, + "language_loss": 0.78941596, + "learning_rate": 0.0001620836875535977, + "loss": 0.80016011, + "num_input_tokens_seen": 320722752, + "router_z_loss_mlp": 0.25378418, + "step": 3868, + "time_per_iteration": 2.868677854537964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065856, + "balance_loss_mlp": 1.04120398, + "epoch": 0.7443247402847248, + "flos": 565372763136.0, + "grad_norm": 0.06361911402361287, + "language_loss": 0.806584, + "learning_rate": 0.00016185412930638766, + "loss": 0.81724262, + "num_input_tokens_seen": 320802496, + "router_z_loss_mlp": 0.24658203, + "step": 3869, + "time_per_iteration": 2.8323211669921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071749, + "balance_loss_mlp": 1.04708433, + "epoch": 0.7445171219699884, + "flos": 578529879552.0, + "grad_norm": 0.05653769935152868, + "language_loss": 0.82733011, + "learning_rate": 0.00016162470233904765, + "loss": 0.83804756, + "num_input_tokens_seen": 320872496, + "router_z_loss_mlp": 0.24658203, + "step": 3870, + "time_per_iteration": 2.7211382389068604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073935, + "balance_loss_mlp": 1.04862642, + "epoch": 0.744709503655252, + "flos": 618875997696.0, + "grad_norm": 0.06316774486708195, + "language_loss": 0.82555729, + "learning_rate": 0.00016139540674064856, + "loss": 0.83629668, + "num_input_tokens_seen": 320944992, + "router_z_loss_mlp": 0.25280762, + "step": 3871, + "time_per_iteration": 2.739121675491333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070986, + "balance_loss_mlp": 1.04655969, + "epoch": 0.7449018853405156, + "flos": 528619322880.0, + "grad_norm": 0.05640449284487911, + "language_loss": 0.78114176, + "learning_rate": 0.00016116624260021113, + "loss": 0.79185158, + "num_input_tokens_seen": 321020208, + "router_z_loss_mlp": 0.24414062, + "step": 3872, + "time_per_iteration": 2.7855870723724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071272, + "balance_loss_mlp": 1.04650021, + "epoch": 0.7450942670257792, + "flos": 433314842112.0, + "grad_norm": 0.05661952400288272, + "language_loss": 0.84321451, + "learning_rate": 0.0001609372100067046, + "loss": 0.85392725, + "num_input_tokens_seen": 321085984, + "router_z_loss_mlp": 0.24768066, + "step": 3873, + "time_per_iteration": 2.5051002502441406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076949, + "balance_loss_mlp": 1.05139041, + "epoch": 0.7452866487110427, + "flos": 696882258432.0, + "grad_norm": 0.07051271048779074, + "language_loss": 0.85103834, + "learning_rate": 0.0001607083090490475, + "loss": 0.86180782, + "num_input_tokens_seen": 321163200, + "router_z_loss_mlp": 0.25585938, + "step": 3874, + "time_per_iteration": 2.865432024002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073052, + "balance_loss_mlp": 1.04762459, + "epoch": 0.7454790303963063, + "flos": 512210552832.0, + "grad_norm": 0.0748811600341369, + "language_loss": 0.80497265, + "learning_rate": 0.00016047953981610714, + "loss": 0.81570315, + "num_input_tokens_seen": 321237328, + "router_z_loss_mlp": 0.25439453, + "step": 3875, + "time_per_iteration": 2.7216734886169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007311, + "balance_loss_mlp": 1.00044441, + "epoch": 0.7456714120815698, + "flos": 1325949668352.0, + "grad_norm": 0.007625795803468779, + "language_loss": 0.7972964, + "learning_rate": 0.00016025090239669916, + "loss": 0.80736953, + "num_input_tokens_seen": 321456192, + "router_z_loss_mlp": 0.06884766, + "step": 3876, + "time_per_iteration": 5.382456064224243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075501, + "balance_loss_mlp": 1.05013371, + "epoch": 0.7458637937668334, + "flos": 721711627776.0, + "grad_norm": 0.05488514319290027, + "language_loss": 0.81060588, + "learning_rate": 0.0001600223968795889, + "loss": 0.82136083, + "num_input_tokens_seen": 321530560, + "router_z_loss_mlp": 0.25378418, + "step": 3877, + "time_per_iteration": 2.9120445251464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100724, + "balance_loss_mlp": 1.0003736, + "epoch": 0.746056175452097, + "flos": 1501580395008.0, + "grad_norm": 0.007629360710496433, + "language_loss": 0.75696075, + "learning_rate": 0.00015979402335349004, + "loss": 0.7670331, + "num_input_tokens_seen": 321760928, + "router_z_loss_mlp": 0.06884766, + "step": 3878, + "time_per_iteration": 4.901887893676758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072632, + "balance_loss_mlp": 1.04855156, + "epoch": 0.7462485571373605, + "flos": 520245711360.0, + "grad_norm": 0.07646083771091663, + "language_loss": 0.82140052, + "learning_rate": 0.00015956578190706483, + "loss": 0.83212686, + "num_input_tokens_seen": 321833248, + "router_z_loss_mlp": 0.24072266, + "step": 3879, + "time_per_iteration": 2.665292978286743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106672, + "balance_loss_mlp": 1.04198372, + "epoch": 0.7464409388226241, + "flos": 481206790656.0, + "grad_norm": 0.05773895513703621, + "language_loss": 0.75869894, + "learning_rate": 0.00015933767262892468, + "loss": 0.76936615, + "num_input_tokens_seen": 321905904, + "router_z_loss_mlp": 0.24743652, + "step": 3880, + "time_per_iteration": 2.7083511352539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068645, + "balance_loss_mlp": 1.04439831, + "epoch": 0.7466333205078877, + "flos": 486761177088.0, + "grad_norm": 0.07814319262934219, + "language_loss": 0.82429087, + "learning_rate": 0.00015910969560762927, + "loss": 0.83497727, + "num_input_tokens_seen": 321971920, + "router_z_loss_mlp": 0.2421875, + "step": 3881, + "time_per_iteration": 2.556643009185791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072001, + "balance_loss_mlp": 1.04824293, + "epoch": 0.7468257021931513, + "flos": 611293091328.0, + "grad_norm": 0.05526796797761112, + "language_loss": 0.8303771, + "learning_rate": 0.00015888185093168727, + "loss": 0.84109712, + "num_input_tokens_seen": 322041904, + "router_z_loss_mlp": 0.2376709, + "step": 3882, + "time_per_iteration": 2.7359204292297363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074755, + "balance_loss_mlp": 1.0493511, + "epoch": 0.7470180838784147, + "flos": 533459727360.0, + "grad_norm": 0.05340233113956033, + "language_loss": 0.81238657, + "learning_rate": 0.00015865413868955581, + "loss": 0.82313412, + "num_input_tokens_seen": 322110816, + "router_z_loss_mlp": 0.25439453, + "step": 3883, + "time_per_iteration": 2.658531665802002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066435, + "balance_loss_mlp": 1.04343939, + "epoch": 0.7472104655636783, + "flos": 739338388992.0, + "grad_norm": 0.051714571371053245, + "language_loss": 0.82935232, + "learning_rate": 0.00015842655896964054, + "loss": 0.8400166, + "num_input_tokens_seen": 322192704, + "router_z_loss_mlp": 0.22973633, + "step": 3884, + "time_per_iteration": 3.018538475036621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077883, + "balance_loss_mlp": 1.05318248, + "epoch": 0.7474028472489419, + "flos": 640305409536.0, + "grad_norm": 0.06900594182420934, + "language_loss": 0.74108642, + "learning_rate": 0.00015819911186029567, + "loss": 0.75186527, + "num_input_tokens_seen": 322263888, + "router_z_loss_mlp": 0.24719238, + "step": 3885, + "time_per_iteration": 2.767460823059082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074459, + "balance_loss_mlp": 1.04935396, + "epoch": 0.7475952289342055, + "flos": 590249120256.0, + "grad_norm": 0.05191869003121536, + "language_loss": 0.8641215, + "learning_rate": 0.00015797179744982443, + "loss": 0.87486613, + "num_input_tokens_seen": 322331936, + "router_z_loss_mlp": 0.25073242, + "step": 3886, + "time_per_iteration": 2.722130060195923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074122, + "balance_loss_mlp": 1.04973185, + "epoch": 0.7477876106194691, + "flos": 488191712256.0, + "grad_norm": 0.0600854170312897, + "language_loss": 0.79131281, + "learning_rate": 0.00015774461582647765, + "loss": 0.80205405, + "num_input_tokens_seen": 322402032, + "router_z_loss_mlp": 0.24389648, + "step": 3887, + "time_per_iteration": 2.6940510272979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072368, + "balance_loss_mlp": 1.04781055, + "epoch": 0.7479799923047326, + "flos": 554733494784.0, + "grad_norm": 0.07732553341953252, + "language_loss": 0.8101362, + "learning_rate": 0.00015751756707845505, + "loss": 0.82085991, + "num_input_tokens_seen": 322472512, + "router_z_loss_mlp": 0.24560547, + "step": 3888, + "time_per_iteration": 2.6013286113739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072242, + "balance_loss_mlp": 1.04748178, + "epoch": 0.7481723739899961, + "flos": 767387105280.0, + "grad_norm": 0.05831839301711609, + "language_loss": 0.88772756, + "learning_rate": 0.00015729065129390502, + "loss": 0.89844996, + "num_input_tokens_seen": 322555104, + "router_z_loss_mlp": 0.24768066, + "step": 3889, + "time_per_iteration": 3.000511884689331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107686, + "balance_loss_mlp": 1.05071712, + "epoch": 0.7483647556752597, + "flos": 496172542464.0, + "grad_norm": 0.09513844178064898, + "language_loss": 0.82148743, + "learning_rate": 0.0001570638685609241, + "loss": 0.83225602, + "num_input_tokens_seen": 322621904, + "router_z_loss_mlp": 0.26159668, + "step": 3890, + "time_per_iteration": 2.5567352771759033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072051, + "balance_loss_mlp": 1.04621816, + "epoch": 0.7485571373605233, + "flos": 472850431488.0, + "grad_norm": 0.06496446825599186, + "language_loss": 0.80583847, + "learning_rate": 0.00015683721896755693, + "loss": 0.81655896, + "num_input_tokens_seen": 322688928, + "router_z_loss_mlp": 0.25866699, + "step": 3891, + "time_per_iteration": 2.5300092697143555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017758, + "balance_loss_mlp": 1.01103473, + "epoch": 0.7487495190457868, + "flos": 1554473161728.0, + "grad_norm": 0.007988812932881569, + "language_loss": 0.82210493, + "learning_rate": 0.00015661070260179682, + "loss": 0.83228242, + "num_input_tokens_seen": 322928464, + "router_z_loss_mlp": 0.06738281, + "step": 3892, + "time_per_iteration": 4.90599799156189 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072454, + "balance_loss_mlp": 1.04705048, + "epoch": 0.7489419007310504, + "flos": 581845773312.0, + "grad_norm": 0.06234068524332242, + "language_loss": 0.85285282, + "learning_rate": 0.00015638431955158528, + "loss": 0.86357737, + "num_input_tokens_seen": 323002672, + "router_z_loss_mlp": 0.25402832, + "step": 3893, + "time_per_iteration": 2.674448251724243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077206, + "balance_loss_mlp": 1.05186236, + "epoch": 0.749134282416314, + "flos": 567576751104.0, + "grad_norm": 0.051873425900431515, + "language_loss": 0.8129698, + "learning_rate": 0.00015615806990481186, + "loss": 0.82374185, + "num_input_tokens_seen": 323076480, + "router_z_loss_mlp": 0.25366211, + "step": 3894, + "time_per_iteration": 2.749011754989624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075566, + "balance_loss_mlp": 1.05075812, + "epoch": 0.7493266641015776, + "flos": 533061803520.0, + "grad_norm": 0.04941596722004592, + "language_loss": 0.84629339, + "learning_rate": 0.00015593195374931452, + "loss": 0.85704899, + "num_input_tokens_seen": 323151840, + "router_z_loss_mlp": 0.24804688, + "step": 3895, + "time_per_iteration": 2.7212753295898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077894, + "balance_loss_mlp": 1.05278873, + "epoch": 0.7495190457868411, + "flos": 523613362176.0, + "grad_norm": 0.06116342211722219, + "language_loss": 0.80278218, + "learning_rate": 0.00015570597117287922, + "loss": 0.8135612, + "num_input_tokens_seen": 323223376, + "router_z_loss_mlp": 0.25109863, + "step": 3896, + "time_per_iteration": 2.7101802825927734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070615, + "balance_loss_mlp": 1.04633236, + "epoch": 0.7497114274721046, + "flos": 514187315712.0, + "grad_norm": 0.069447374717696, + "language_loss": 0.77728438, + "learning_rate": 0.0001554801222632406, + "loss": 0.78799057, + "num_input_tokens_seen": 323290288, + "router_z_loss_mlp": 0.24291992, + "step": 3897, + "time_per_iteration": 2.6742734909057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107305, + "balance_loss_mlp": 1.04788542, + "epoch": 0.7499038091573682, + "flos": 495006308352.0, + "grad_norm": 0.06164453931329584, + "language_loss": 0.85245335, + "learning_rate": 0.00015525440710808052, + "loss": 0.86318392, + "num_input_tokens_seen": 323359568, + "router_z_loss_mlp": 0.25170898, + "step": 3898, + "time_per_iteration": 2.653172016143799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107387, + "balance_loss_mlp": 1.04789472, + "epoch": 0.7500961908426318, + "flos": 737658233856.0, + "grad_norm": 0.06163823743918883, + "language_loss": 0.77877641, + "learning_rate": 0.00015502882579502953, + "loss": 0.78951514, + "num_input_tokens_seen": 323436688, + "router_z_loss_mlp": 0.2598877, + "step": 3899, + "time_per_iteration": 2.949995517730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074982, + "balance_loss_mlp": 1.04994845, + "epoch": 0.7502885725278954, + "flos": 533400256512.0, + "grad_norm": 0.062464860099035104, + "language_loss": 0.85077929, + "learning_rate": 0.00015480337841166592, + "loss": 0.86152911, + "num_input_tokens_seen": 323510032, + "router_z_loss_mlp": 0.25012207, + "step": 3900, + "time_per_iteration": 2.7779133319854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078827, + "balance_loss_mlp": 1.05378067, + "epoch": 0.7504809542131589, + "flos": 589324792320.0, + "grad_norm": 0.06586633865886998, + "language_loss": 0.82996714, + "learning_rate": 0.00015457806504551647, + "loss": 0.8407554, + "num_input_tokens_seen": 323588896, + "router_z_loss_mlp": 0.25061035, + "step": 3901, + "time_per_iteration": 2.8566529750823975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074374, + "balance_loss_mlp": 1.04922056, + "epoch": 0.7506733358984224, + "flos": 511550899200.0, + "grad_norm": 0.053967524095388235, + "language_loss": 0.78072977, + "learning_rate": 0.0001543528857840554, + "loss": 0.79147345, + "num_input_tokens_seen": 323661280, + "router_z_loss_mlp": 0.25158691, + "step": 3902, + "time_per_iteration": 2.6760079860687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107594, + "balance_loss_mlp": 1.05152607, + "epoch": 0.750865717583686, + "flos": 539268503040.0, + "grad_norm": 0.0598852080475998, + "language_loss": 0.80620217, + "learning_rate": 0.000154127840714705, + "loss": 0.81696159, + "num_input_tokens_seen": 323739200, + "router_z_loss_mlp": 0.24401855, + "step": 3903, + "time_per_iteration": 2.788379430770874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072506, + "balance_loss_mlp": 1.04635119, + "epoch": 0.7510580992689496, + "flos": 476578930176.0, + "grad_norm": 0.0690597284577383, + "language_loss": 0.82208622, + "learning_rate": 0.00015390292992483557, + "loss": 0.8328113, + "num_input_tokens_seen": 323802816, + "router_z_loss_mlp": 0.26184082, + "step": 3904, + "time_per_iteration": 2.507995128631592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010781, + "balance_loss_mlp": 1.05372167, + "epoch": 0.7512504809542132, + "flos": 579043800576.0, + "grad_norm": 0.057063892472999186, + "language_loss": 0.8453331, + "learning_rate": 0.00015367815350176523, + "loss": 0.85611403, + "num_input_tokens_seen": 323879488, + "router_z_loss_mlp": 0.24389648, + "step": 3905, + "time_per_iteration": 2.733604907989502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077145, + "balance_loss_mlp": 1.05211139, + "epoch": 0.7514428626394767, + "flos": 418660379136.0, + "grad_norm": 0.056222194479754704, + "language_loss": 0.82852668, + "learning_rate": 0.00015345351153275987, + "loss": 0.83929813, + "num_input_tokens_seen": 323944512, + "router_z_loss_mlp": 0.25048828, + "step": 3906, + "time_per_iteration": 2.5045523643493652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068307, + "balance_loss_mlp": 1.04364252, + "epoch": 0.7516352443247403, + "flos": 641039215104.0, + "grad_norm": 0.0670025336867701, + "language_loss": 0.80755925, + "learning_rate": 0.00015322900410503332, + "loss": 0.81824237, + "num_input_tokens_seen": 324020688, + "router_z_loss_mlp": 0.24645996, + "step": 3907, + "time_per_iteration": 2.7994320392608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072066, + "balance_loss_mlp": 1.04688847, + "epoch": 0.7518276260100039, + "flos": 580998168576.0, + "grad_norm": 0.05833722566179846, + "language_loss": 0.77270997, + "learning_rate": 0.00015300463130574703, + "loss": 0.78343064, + "num_input_tokens_seen": 324098080, + "router_z_loss_mlp": 0.2520752, + "step": 3908, + "time_per_iteration": 2.8524723052978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074857, + "balance_loss_mlp": 1.05007386, + "epoch": 0.7520200076952674, + "flos": 687342412800.0, + "grad_norm": 0.06750112030828431, + "language_loss": 0.8202616, + "learning_rate": 0.00015278039322201033, + "loss": 0.83101016, + "num_input_tokens_seen": 324183968, + "router_z_loss_mlp": 0.24780273, + "step": 3909, + "time_per_iteration": 2.9736523628234863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107108, + "balance_loss_mlp": 1.04605806, + "epoch": 0.7522123893805309, + "flos": 486439976448.0, + "grad_norm": 0.06973049488885559, + "language_loss": 0.79777265, + "learning_rate": 0.00015255628994088004, + "loss": 0.80848348, + "num_input_tokens_seen": 324249568, + "router_z_loss_mlp": 0.25012207, + "step": 3910, + "time_per_iteration": 2.5302295684814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071428, + "balance_loss_mlp": 1.04685879, + "epoch": 0.7524047710657945, + "flos": 818982586368.0, + "grad_norm": 0.06491426565594356, + "language_loss": 0.75382125, + "learning_rate": 0.00015233232154936082, + "loss": 0.76453555, + "num_input_tokens_seen": 324345312, + "router_z_loss_mlp": 0.24572754, + "step": 3911, + "time_per_iteration": 3.251619815826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078269, + "balance_loss_mlp": 1.05206633, + "epoch": 0.7525971527510581, + "flos": 699508763136.0, + "grad_norm": 0.0623404961346465, + "language_loss": 0.76721239, + "learning_rate": 0.0001521084881344048, + "loss": 0.77799511, + "num_input_tokens_seen": 324419056, + "router_z_loss_mlp": 0.26220703, + "step": 3912, + "time_per_iteration": 2.850635051727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075285, + "balance_loss_mlp": 1.05071616, + "epoch": 0.7527895344363217, + "flos": 633787421184.0, + "grad_norm": 0.05187339069994817, + "language_loss": 0.86498892, + "learning_rate": 0.00015188478978291208, + "loss": 0.87574184, + "num_input_tokens_seen": 324490848, + "router_z_loss_mlp": 0.24572754, + "step": 3913, + "time_per_iteration": 2.765442371368408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072293, + "balance_loss_mlp": 1.04759288, + "epoch": 0.7529819161215853, + "flos": 562830322176.0, + "grad_norm": 0.06241775193338078, + "language_loss": 0.86580771, + "learning_rate": 0.00015166122658173014, + "loss": 0.87653065, + "num_input_tokens_seen": 324565648, + "router_z_loss_mlp": 0.24682617, + "step": 3914, + "time_per_iteration": 2.7562687397003174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076326, + "balance_loss_mlp": 1.05007637, + "epoch": 0.7531742978068487, + "flos": 690665647104.0, + "grad_norm": 0.05387803011271429, + "language_loss": 0.88860059, + "learning_rate": 0.00015143779861765332, + "loss": 0.89936382, + "num_input_tokens_seen": 324642832, + "router_z_loss_mlp": 0.26257324, + "step": 3915, + "time_per_iteration": 2.932776927947998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107374, + "balance_loss_mlp": 1.04894459, + "epoch": 0.7533666794921123, + "flos": 681101208576.0, + "grad_norm": 0.057566889010823, + "language_loss": 0.81424505, + "learning_rate": 0.00015121450597742458, + "loss": 0.82498246, + "num_input_tokens_seen": 324718336, + "router_z_loss_mlp": 0.2479248, + "step": 3916, + "time_per_iteration": 2.854919672012329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078465, + "balance_loss_mlp": 1.05316877, + "epoch": 0.7535590611773759, + "flos": 623669414400.0, + "grad_norm": 0.07096809285669192, + "language_loss": 0.7879523, + "learning_rate": 0.00015099134874773369, + "loss": 0.79873693, + "num_input_tokens_seen": 324787744, + "router_z_loss_mlp": 0.25317383, + "step": 3917, + "time_per_iteration": 2.717822313308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072702, + "balance_loss_mlp": 1.04738212, + "epoch": 0.7537514428626395, + "flos": 519427842048.0, + "grad_norm": 0.05614014037376785, + "language_loss": 0.80481035, + "learning_rate": 0.00015076832701521793, + "loss": 0.81553745, + "num_input_tokens_seen": 324863280, + "router_z_loss_mlp": 0.25341797, + "step": 3918, + "time_per_iteration": 2.7440896034240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077078, + "balance_loss_mlp": 1.05145979, + "epoch": 0.753943824547903, + "flos": 723653512704.0, + "grad_norm": 0.07007735924828153, + "language_loss": 0.81983852, + "learning_rate": 0.000150545440866462, + "loss": 0.83060932, + "num_input_tokens_seen": 324949600, + "router_z_loss_mlp": 0.25646973, + "step": 3919, + "time_per_iteration": 3.0307159423828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080089, + "balance_loss_mlp": 1.05534124, + "epoch": 0.7541362062331666, + "flos": 437547350016.0, + "grad_norm": 0.06360208867996311, + "language_loss": 0.78682411, + "learning_rate": 0.000150322690387998, + "loss": 0.79762495, + "num_input_tokens_seen": 325013808, + "router_z_loss_mlp": 0.24755859, + "step": 3920, + "time_per_iteration": 2.4933719635009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075825, + "balance_loss_mlp": 1.05169666, + "epoch": 0.7543285879184302, + "flos": 565274018304.0, + "grad_norm": 0.07326690987324283, + "language_loss": 0.75561839, + "learning_rate": 0.00015010007566630535, + "loss": 0.76637661, + "num_input_tokens_seen": 325084832, + "router_z_loss_mlp": 0.24121094, + "step": 3921, + "time_per_iteration": 2.7614030838012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107956, + "balance_loss_mlp": 1.05487168, + "epoch": 0.7545209696036937, + "flos": 521036416512.0, + "grad_norm": 0.09124669942400691, + "language_loss": 0.81765956, + "learning_rate": 0.00014987759678781077, + "loss": 0.82845515, + "num_input_tokens_seen": 325155120, + "router_z_loss_mlp": 0.24707031, + "step": 3922, + "time_per_iteration": 2.6194660663604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107743, + "balance_loss_mlp": 1.0523603, + "epoch": 0.7547133512889573, + "flos": 616066684416.0, + "grad_norm": 0.07360679346061566, + "language_loss": 0.82340884, + "learning_rate": 0.00014965525383888795, + "loss": 0.83418316, + "num_input_tokens_seen": 325235632, + "router_z_loss_mlp": 0.25085449, + "step": 3923, + "time_per_iteration": 2.8085968494415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073002, + "balance_loss_mlp": 1.04881442, + "epoch": 0.7549057329742208, + "flos": 750845085696.0, + "grad_norm": 0.05494147017339954, + "language_loss": 0.72481954, + "learning_rate": 0.00014943304690585851, + "loss": 0.73554957, + "num_input_tokens_seen": 325309696, + "router_z_loss_mlp": 0.24182129, + "step": 3924, + "time_per_iteration": 2.9154560565948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079076, + "balance_loss_mlp": 1.0540781, + "epoch": 0.7550981146594844, + "flos": 514444276224.0, + "grad_norm": 0.07583618548945481, + "language_loss": 0.79405016, + "learning_rate": 0.0001492109760749908, + "loss": 0.80484092, + "num_input_tokens_seen": 325375744, + "router_z_loss_mlp": 0.25012207, + "step": 3925, + "time_per_iteration": 2.5836076736450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076807, + "balance_loss_mlp": 1.0515945, + "epoch": 0.755290496344748, + "flos": 522009930240.0, + "grad_norm": 0.05355436965428176, + "language_loss": 0.80110025, + "learning_rate": 0.00014898904143250002, + "loss": 0.81186831, + "num_input_tokens_seen": 325448384, + "router_z_loss_mlp": 0.25231934, + "step": 3926, + "time_per_iteration": 2.6505353450775146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024147, + "balance_loss_mlp": 1.01732779, + "epoch": 0.7554828780300116, + "flos": 1414615021056.0, + "grad_norm": 0.0157174231445921, + "language_loss": 0.75755203, + "learning_rate": 0.00014876724306454886, + "loss": 0.76779342, + "num_input_tokens_seen": 325678672, + "router_z_loss_mlp": 0.06835938, + "step": 3927, + "time_per_iteration": 4.953717470169067 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077692, + "balance_loss_mlp": 1.05381441, + "epoch": 0.7556752597152752, + "flos": 556937482752.0, + "grad_norm": 0.05489454471207153, + "language_loss": 0.80429578, + "learning_rate": 0.0001485455810572474, + "loss": 0.81507266, + "num_input_tokens_seen": 325746656, + "router_z_loss_mlp": 0.23864746, + "step": 3928, + "time_per_iteration": 2.637946844100952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077738, + "balance_loss_mlp": 1.05282295, + "epoch": 0.7558676414005386, + "flos": 563638279680.0, + "grad_norm": 0.058181996359435495, + "language_loss": 0.84069693, + "learning_rate": 0.00014832405549665236, + "loss": 0.85147429, + "num_input_tokens_seen": 325820304, + "router_z_loss_mlp": 0.24902344, + "step": 3929, + "time_per_iteration": 2.6932008266448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074105, + "balance_loss_mlp": 1.05033493, + "epoch": 0.7560600230858022, + "flos": 561377392128.0, + "grad_norm": 0.06320192227376603, + "language_loss": 0.78577268, + "learning_rate": 0.00014810266646876746, + "loss": 0.79651374, + "num_input_tokens_seen": 325895584, + "router_z_loss_mlp": 0.2376709, + "step": 3930, + "time_per_iteration": 2.7697536945343018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071439, + "balance_loss_mlp": 1.04683375, + "epoch": 0.7562524047710658, + "flos": 719576649216.0, + "grad_norm": 0.06814480820805115, + "language_loss": 0.77612817, + "learning_rate": 0.00014788141405954364, + "loss": 0.78684253, + "num_input_tokens_seen": 325976752, + "router_z_loss_mlp": 0.24633789, + "step": 3931, + "time_per_iteration": 2.979769468307495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073336, + "balance_loss_mlp": 1.04886281, + "epoch": 0.7564447864563294, + "flos": 543347937792.0, + "grad_norm": 0.059820147392813335, + "language_loss": 0.84867471, + "learning_rate": 0.00014766029835487865, + "loss": 0.85940808, + "num_input_tokens_seen": 326047152, + "router_z_loss_mlp": 0.24475098, + "step": 3932, + "time_per_iteration": 2.7333834171295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076892, + "balance_loss_mlp": 1.05246568, + "epoch": 0.7566371681415929, + "flos": 725805743616.0, + "grad_norm": 0.06432503649028948, + "language_loss": 0.79687858, + "learning_rate": 0.0001474393194406173, + "loss": 0.80764747, + "num_input_tokens_seen": 326119056, + "router_z_loss_mlp": 0.24438477, + "step": 3933, + "time_per_iteration": 2.896916627883911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072761, + "balance_loss_mlp": 1.04866862, + "epoch": 0.7568295498268565, + "flos": 576580280832.0, + "grad_norm": 0.05519381243728572, + "language_loss": 0.79627228, + "learning_rate": 0.00014721847740255112, + "loss": 0.80699992, + "num_input_tokens_seen": 326196736, + "router_z_loss_mlp": 0.24084473, + "step": 3934, + "time_per_iteration": 2.8888845443725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011104, + "balance_loss_mlp": 1.00419021, + "epoch": 0.75702193151212, + "flos": 1520059903488.0, + "grad_norm": 0.009067101269619127, + "language_loss": 0.73911923, + "learning_rate": 0.00014699777232641853, + "loss": 0.74923027, + "num_input_tokens_seen": 326404752, + "router_z_loss_mlp": 0.06933594, + "step": 3935, + "time_per_iteration": 4.645391941070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106868, + "balance_loss_mlp": 1.0436697, + "epoch": 0.7572143131973836, + "flos": 525471556608.0, + "grad_norm": 0.07237572770548766, + "language_loss": 0.78754729, + "learning_rate": 0.00014677720429790526, + "loss": 0.79823411, + "num_input_tokens_seen": 326472832, + "router_z_loss_mlp": 0.25, + "step": 3936, + "time_per_iteration": 2.588223457336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063945, + "balance_loss_mlp": 1.03916097, + "epoch": 0.7574066948826472, + "flos": 550738123776.0, + "grad_norm": 0.047485127857512396, + "language_loss": 0.84842449, + "learning_rate": 0.0001465567734026429, + "loss": 0.85906392, + "num_input_tokens_seen": 326546976, + "router_z_loss_mlp": 0.24804688, + "step": 3937, + "time_per_iteration": 2.733915090560913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066983, + "balance_loss_mlp": 1.04105449, + "epoch": 0.7575990765679107, + "flos": 395899176960.0, + "grad_norm": 0.08009981712231565, + "language_loss": 0.82548285, + "learning_rate": 0.00014633647972621034, + "loss": 0.83615267, + "num_input_tokens_seen": 326609296, + "router_z_loss_mlp": 0.25964355, + "step": 3938, + "time_per_iteration": 2.4831509590148926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066047, + "balance_loss_mlp": 1.04131114, + "epoch": 0.7577914582531743, + "flos": 585030615552.0, + "grad_norm": 0.049323859420558516, + "language_loss": 0.8679713, + "learning_rate": 0.00014611632335413354, + "loss": 0.87863177, + "num_input_tokens_seen": 326687168, + "router_z_loss_mlp": 0.24743652, + "step": 3939, + "time_per_iteration": 2.817972421646118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066166, + "balance_loss_mlp": 1.04209805, + "epoch": 0.7579838399384379, + "flos": 820979172864.0, + "grad_norm": 0.05378533672074644, + "language_loss": 0.82628143, + "learning_rate": 0.00014589630437188456, + "loss": 0.83694315, + "num_input_tokens_seen": 326777760, + "router_z_loss_mlp": 0.24047852, + "step": 3940, + "time_per_iteration": 3.1869349479675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069178, + "balance_loss_mlp": 1.04451323, + "epoch": 0.7581762216237015, + "flos": 443892441600.0, + "grad_norm": 0.0625352255464929, + "language_loss": 0.78564709, + "learning_rate": 0.00014567642286488253, + "loss": 0.79633886, + "num_input_tokens_seen": 326843952, + "router_z_loss_mlp": 0.2467041, + "step": 3941, + "time_per_iteration": 2.5982542037963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067881, + "balance_loss_mlp": 1.04189372, + "epoch": 0.7583686033089649, + "flos": 540886989312.0, + "grad_norm": 0.07448102100024, + "language_loss": 0.79396963, + "learning_rate": 0.00014545667891849258, + "loss": 0.8046484, + "num_input_tokens_seen": 326911296, + "router_z_loss_mlp": 0.26013184, + "step": 3942, + "time_per_iteration": 2.6813278198242188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066053, + "balance_loss_mlp": 1.04107857, + "epoch": 0.7585609849942285, + "flos": 522588091392.0, + "grad_norm": 0.05620268870521042, + "language_loss": 0.82649952, + "learning_rate": 0.00014523707261802733, + "loss": 0.83716011, + "num_input_tokens_seen": 326977776, + "router_z_loss_mlp": 0.24987793, + "step": 3943, + "time_per_iteration": 2.6405162811279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068268, + "balance_loss_mlp": 1.04263783, + "epoch": 0.7587533666794921, + "flos": 541860503040.0, + "grad_norm": 0.05791403818328359, + "language_loss": 0.8163532, + "learning_rate": 0.00014501760404874527, + "loss": 0.8270359, + "num_input_tokens_seen": 327050240, + "router_z_loss_mlp": 0.25634766, + "step": 3944, + "time_per_iteration": 2.722963809967041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106607, + "balance_loss_mlp": 1.04108405, + "epoch": 0.7589457483647557, + "flos": 606408270336.0, + "grad_norm": 0.06238439989518053, + "language_loss": 0.86068374, + "learning_rate": 0.00014479827329585176, + "loss": 0.87134445, + "num_input_tokens_seen": 327119952, + "router_z_loss_mlp": 0.24963379, + "step": 3945, + "time_per_iteration": 2.7014224529266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068448, + "balance_loss_mlp": 1.04362893, + "epoch": 0.7591381300500193, + "flos": 555106452480.0, + "grad_norm": 0.04867252918796388, + "language_loss": 0.8493138, + "learning_rate": 0.00014457908044449846, + "loss": 0.85999829, + "num_input_tokens_seen": 327192640, + "router_z_loss_mlp": 0.24816895, + "step": 3946, + "time_per_iteration": 2.7054529190063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106665, + "balance_loss_mlp": 1.04214025, + "epoch": 0.7593305117352828, + "flos": 529681669632.0, + "grad_norm": 0.06710425705469547, + "language_loss": 0.83130479, + "learning_rate": 0.00014436002557978371, + "loss": 0.84197128, + "num_input_tokens_seen": 327271008, + "router_z_loss_mlp": 0.24511719, + "step": 3947, + "time_per_iteration": 2.788025379180908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004209, + "balance_loss_mlp": 0.99724722, + "epoch": 0.7595228934205464, + "flos": 1502798759424.0, + "grad_norm": 0.014479305235322698, + "language_loss": 0.76643145, + "learning_rate": 0.00014414110878675201, + "loss": 0.77647352, + "num_input_tokens_seen": 327505392, + "router_z_loss_mlp": 0.06982422, + "step": 3948, + "time_per_iteration": 4.901083946228027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067183, + "balance_loss_mlp": 1.0420419, + "epoch": 0.7597152751058099, + "flos": 455525047296.0, + "grad_norm": 0.05243549460506514, + "language_loss": 0.79874659, + "learning_rate": 0.0001439223301503945, + "loss": 0.80941838, + "num_input_tokens_seen": 327569392, + "router_z_loss_mlp": 0.25146484, + "step": 3949, + "time_per_iteration": 2.538907527923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072288, + "balance_loss_mlp": 1.04793382, + "epoch": 0.7599076567910735, + "flos": 685466966016.0, + "grad_norm": 0.10938231230046584, + "language_loss": 0.76564628, + "learning_rate": 0.00014370368975564834, + "loss": 0.77636915, + "num_input_tokens_seen": 327648304, + "router_z_loss_mlp": 0.24353027, + "step": 3950, + "time_per_iteration": 2.9170916080474854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072617, + "balance_loss_mlp": 1.04752314, + "epoch": 0.760100038476337, + "flos": 532372414464.0, + "grad_norm": 0.06203753703543282, + "language_loss": 0.83670735, + "learning_rate": 0.00014348518768739766, + "loss": 0.84743357, + "num_input_tokens_seen": 327725600, + "router_z_loss_mlp": 0.25109863, + "step": 3951, + "time_per_iteration": 2.730717897415161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01002273, + "balance_loss_mlp": 0.99526346, + "epoch": 0.7602924201616006, + "flos": 1471742866944.0, + "grad_norm": 0.013476999765998546, + "language_loss": 0.7672804, + "learning_rate": 0.00014326682403047243, + "loss": 0.7773031, + "num_input_tokens_seen": 327954048, + "router_z_loss_mlp": 0.0703125, + "step": 3952, + "time_per_iteration": 4.813526391983032 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067232, + "balance_loss_mlp": 1.04273486, + "epoch": 0.7604848018468642, + "flos": 774631558656.0, + "grad_norm": 0.09216142296418942, + "language_loss": 0.86414772, + "learning_rate": 0.00014304859886964867, + "loss": 0.87482005, + "num_input_tokens_seen": 328034656, + "router_z_loss_mlp": 0.24487305, + "step": 3953, + "time_per_iteration": 2.9926557540893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068844, + "balance_loss_mlp": 1.0442636, + "epoch": 0.7606771835321278, + "flos": 558185209344.0, + "grad_norm": 0.05133749222292773, + "language_loss": 0.83801866, + "learning_rate": 0.00014283051228964878, + "loss": 0.84870708, + "num_input_tokens_seen": 328107264, + "router_z_loss_mlp": 0.24572754, + "step": 3954, + "time_per_iteration": 2.68623423576355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068293, + "balance_loss_mlp": 1.0438199, + "epoch": 0.7608695652173914, + "flos": 525397404672.0, + "grad_norm": 0.07227710596047977, + "language_loss": 0.82760596, + "learning_rate": 0.00014261256437514197, + "loss": 0.8382889, + "num_input_tokens_seen": 328177168, + "router_z_loss_mlp": 0.24462891, + "step": 3955, + "time_per_iteration": 2.664646625518799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067041, + "balance_loss_mlp": 1.04321122, + "epoch": 0.7610619469026548, + "flos": 615038842368.0, + "grad_norm": 0.06297577079801352, + "language_loss": 0.82699019, + "learning_rate": 0.0001423947552107428, + "loss": 0.83766061, + "num_input_tokens_seen": 328245360, + "router_z_loss_mlp": 0.23815918, + "step": 3956, + "time_per_iteration": 2.705461263656616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106862, + "balance_loss_mlp": 1.04420578, + "epoch": 0.7612543285879184, + "flos": 863356382208.0, + "grad_norm": 0.06155196103872169, + "language_loss": 0.77009457, + "learning_rate": 0.00014217708488101243, + "loss": 0.78078079, + "num_input_tokens_seen": 328326560, + "router_z_loss_mlp": 0.24389648, + "step": 3957, + "time_per_iteration": 3.0419583320617676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069471, + "balance_loss_mlp": 1.04394794, + "epoch": 0.761446710273182, + "flos": 553658664960.0, + "grad_norm": 0.08526954862375616, + "language_loss": 0.77756625, + "learning_rate": 0.0001419595534704579, + "loss": 0.78826094, + "num_input_tokens_seen": 328395760, + "router_z_loss_mlp": 0.25537109, + "step": 3958, + "time_per_iteration": 2.6491963863372803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068304, + "balance_loss_mlp": 1.04386628, + "epoch": 0.7616390919584456, + "flos": 467350373376.0, + "grad_norm": 0.05412065824000071, + "language_loss": 0.81526375, + "learning_rate": 0.00014174216106353237, + "loss": 0.82594681, + "num_input_tokens_seen": 328464560, + "router_z_loss_mlp": 0.24438477, + "step": 3959, + "time_per_iteration": 2.6313867568969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067389, + "balance_loss_mlp": 1.04217625, + "epoch": 0.7618314736437091, + "flos": 498430858752.0, + "grad_norm": 0.06465573226743382, + "language_loss": 0.76379991, + "learning_rate": 0.00014152490774463512, + "loss": 0.77447385, + "num_input_tokens_seen": 328532640, + "router_z_loss_mlp": 0.25231934, + "step": 3960, + "time_per_iteration": 2.5999507904052734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067184, + "balance_loss_mlp": 1.04220998, + "epoch": 0.7620238553289727, + "flos": 434545316352.0, + "grad_norm": 0.07999326850245969, + "language_loss": 0.87070781, + "learning_rate": 0.00014130779359811135, + "loss": 0.8813796, + "num_input_tokens_seen": 328595392, + "router_z_loss_mlp": 0.24963379, + "step": 3961, + "time_per_iteration": 2.470813512802124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067267, + "balance_loss_mlp": 1.04288888, + "epoch": 0.7622162370142362, + "flos": 664277262336.0, + "grad_norm": 0.05710380569291167, + "language_loss": 0.8618769, + "learning_rate": 0.0001410908187082521, + "loss": 0.87254959, + "num_input_tokens_seen": 328676368, + "router_z_loss_mlp": 0.24365234, + "step": 3962, + "time_per_iteration": 2.8664379119873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068922, + "balance_loss_mlp": 1.04251671, + "epoch": 0.7624086186994998, + "flos": 557965324800.0, + "grad_norm": 0.06132823926479849, + "language_loss": 0.83309317, + "learning_rate": 0.0001408739831592949, + "loss": 0.84378237, + "num_input_tokens_seen": 328745136, + "router_z_loss_mlp": 0.26416016, + "step": 3963, + "time_per_iteration": 2.6825127601623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066723, + "balance_loss_mlp": 1.04126, + "epoch": 0.7626010003847634, + "flos": 629132396544.0, + "grad_norm": 0.061542532553496905, + "language_loss": 0.7740978, + "learning_rate": 0.0001406572870354224, + "loss": 0.78476501, + "num_input_tokens_seen": 328820384, + "router_z_loss_mlp": 0.25488281, + "step": 3964, + "time_per_iteration": 2.8066251277923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068698, + "balance_loss_mlp": 1.0439024, + "epoch": 0.7627933820700269, + "flos": 437942702592.0, + "grad_norm": 0.0743228593837952, + "language_loss": 0.8726244, + "learning_rate": 0.00014044073042076337, + "loss": 0.88331133, + "num_input_tokens_seen": 328884976, + "router_z_loss_mlp": 0.24804688, + "step": 3965, + "time_per_iteration": 2.56150484085083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063933, + "balance_loss_mlp": 1.04018617, + "epoch": 0.7629857637552905, + "flos": 532723350528.0, + "grad_norm": 0.05421398369924913, + "language_loss": 0.89094937, + "learning_rate": 0.00014022431339939302, + "loss": 0.90158874, + "num_input_tokens_seen": 328957792, + "router_z_loss_mlp": 0.23730469, + "step": 3966, + "time_per_iteration": 2.655383586883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062991, + "balance_loss_mlp": 1.03745639, + "epoch": 0.7631781454405541, + "flos": 680036290560.0, + "grad_norm": 0.06770239365131947, + "language_loss": 0.78292239, + "learning_rate": 0.00014000803605533163, + "loss": 0.79355228, + "num_input_tokens_seen": 329034960, + "router_z_loss_mlp": 0.25537109, + "step": 3967, + "time_per_iteration": 2.7987117767333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064596, + "balance_loss_mlp": 1.04112363, + "epoch": 0.7633705271258177, + "flos": 507493859328.0, + "grad_norm": 0.07669794307495341, + "language_loss": 0.83868659, + "learning_rate": 0.00013979189847254553, + "loss": 0.84933251, + "num_input_tokens_seen": 329100848, + "router_z_loss_mlp": 0.23474121, + "step": 3968, + "time_per_iteration": 2.5726282596588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068744, + "balance_loss_mlp": 1.04398394, + "epoch": 0.7635629088110811, + "flos": 618866085888.0, + "grad_norm": 0.06422002731628916, + "language_loss": 0.80682731, + "learning_rate": 0.00013957590073494674, + "loss": 0.81751466, + "num_input_tokens_seen": 329181120, + "router_z_loss_mlp": 0.24780273, + "step": 3969, + "time_per_iteration": 2.7832956314086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063668, + "balance_loss_mlp": 1.03938496, + "epoch": 0.7637552904963447, + "flos": 638425193472.0, + "grad_norm": 0.0668838144354532, + "language_loss": 0.79043007, + "learning_rate": 0.0001393600429263931, + "loss": 0.80106676, + "num_input_tokens_seen": 329249888, + "router_z_loss_mlp": 0.24267578, + "step": 3970, + "time_per_iteration": 2.7605960369110107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01025346, + "balance_loss_mlp": 1.01881361, + "epoch": 0.7639476721816083, + "flos": 1563222302208.0, + "grad_norm": 0.01539224673333176, + "language_loss": 0.74744886, + "learning_rate": 0.00013914432513068792, + "loss": 0.75770235, + "num_input_tokens_seen": 329483824, + "router_z_loss_mlp": 0.06542969, + "step": 3971, + "time_per_iteration": 4.931908369064331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062427, + "balance_loss_mlp": 1.03795385, + "epoch": 0.7641400538668719, + "flos": 495987162624.0, + "grad_norm": 0.05820803040195091, + "language_loss": 0.81583369, + "learning_rate": 0.0001389287474315804, + "loss": 0.82645798, + "num_input_tokens_seen": 329553536, + "router_z_loss_mlp": 0.24450684, + "step": 3972, + "time_per_iteration": 2.6463029384613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106303, + "balance_loss_mlp": 1.03905725, + "epoch": 0.7643324355521355, + "flos": 578441046528.0, + "grad_norm": 0.06630733211536807, + "language_loss": 0.8009305, + "learning_rate": 0.00013871330991276505, + "loss": 0.81156087, + "num_input_tokens_seen": 329621856, + "router_z_loss_mlp": 0.23950195, + "step": 3973, + "time_per_iteration": 2.706376791000366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066044, + "balance_loss_mlp": 1.04109335, + "epoch": 0.764524817237399, + "flos": 784823717376.0, + "grad_norm": 0.09714932843218141, + "language_loss": 0.80939794, + "learning_rate": 0.00013849801265788247, + "loss": 0.82005835, + "num_input_tokens_seen": 329708192, + "router_z_loss_mlp": 0.24938965, + "step": 3974, + "time_per_iteration": 2.9903647899627686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068526, + "balance_loss_mlp": 1.04320633, + "epoch": 0.7647171989226625, + "flos": 526279514112.0, + "grad_norm": 0.0619121183931823, + "language_loss": 0.83072186, + "learning_rate": 0.00013828285575051818, + "loss": 0.84140712, + "num_input_tokens_seen": 329774704, + "router_z_loss_mlp": 0.25354004, + "step": 3975, + "time_per_iteration": 2.6031386852264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067956, + "balance_loss_mlp": 1.04345858, + "epoch": 0.7649095806079261, + "flos": 554876656128.0, + "grad_norm": 0.057910960280581285, + "language_loss": 0.84483737, + "learning_rate": 0.0001380678392742035, + "loss": 0.85551691, + "num_input_tokens_seen": 329846432, + "router_z_loss_mlp": 0.24499512, + "step": 3976, + "time_per_iteration": 2.7115118503570557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064756, + "balance_loss_mlp": 1.03948343, + "epoch": 0.7651019622931897, + "flos": 649145954304.0, + "grad_norm": 0.05528375504555246, + "language_loss": 0.8460198, + "learning_rate": 0.00013785296331241526, + "loss": 0.85666734, + "num_input_tokens_seen": 329926336, + "router_z_loss_mlp": 0.25292969, + "step": 3977, + "time_per_iteration": 2.907803535461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065245, + "balance_loss_mlp": 1.04070044, + "epoch": 0.7652943439784533, + "flos": 1046449248768.0, + "grad_norm": 0.06111847190326819, + "language_loss": 0.87681317, + "learning_rate": 0.00013763822794857583, + "loss": 0.8874656, + "num_input_tokens_seen": 330009536, + "router_z_loss_mlp": 0.24560547, + "step": 3978, + "time_per_iteration": 3.321633815765381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068675, + "balance_loss_mlp": 1.04451132, + "epoch": 0.7654867256637168, + "flos": 504350862336.0, + "grad_norm": 0.06052146133272544, + "language_loss": 0.89868426, + "learning_rate": 0.00013742363326605278, + "loss": 0.90937102, + "num_input_tokens_seen": 330083264, + "router_z_loss_mlp": 0.24145508, + "step": 3979, + "time_per_iteration": 2.6986289024353027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069543, + "balance_loss_mlp": 1.04461646, + "epoch": 0.7656791073489804, + "flos": 574709976576.0, + "grad_norm": 0.05317029267567304, + "language_loss": 0.78481579, + "learning_rate": 0.00013720917934815935, + "loss": 0.79551125, + "num_input_tokens_seen": 330157120, + "router_z_loss_mlp": 0.24938965, + "step": 3980, + "time_per_iteration": 2.7501296997070312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067407, + "balance_loss_mlp": 1.0423131, + "epoch": 0.765871489034244, + "flos": 492812232192.0, + "grad_norm": 0.07789477911070539, + "language_loss": 0.83389938, + "learning_rate": 0.00013699486627815344, + "loss": 0.84457338, + "num_input_tokens_seen": 330224560, + "router_z_loss_mlp": 0.25109863, + "step": 3981, + "time_per_iteration": 2.6420035362243652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071024, + "balance_loss_mlp": 1.04684854, + "epoch": 0.7660638707195075, + "flos": 486024800256.0, + "grad_norm": 0.05908161503025523, + "language_loss": 0.82459986, + "learning_rate": 0.00013678069413923928, + "loss": 0.8353101, + "num_input_tokens_seen": 330292000, + "router_z_loss_mlp": 0.24169922, + "step": 3982, + "time_per_iteration": 2.6923530101776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069033, + "balance_loss_mlp": 1.04438031, + "epoch": 0.766256252404771, + "flos": 444295134720.0, + "grad_norm": 0.05955764603352247, + "language_loss": 0.82175618, + "learning_rate": 0.00013656666301456555, + "loss": 0.83244646, + "num_input_tokens_seen": 330357472, + "router_z_loss_mlp": 0.24658203, + "step": 3983, + "time_per_iteration": 2.507713794708252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070412, + "balance_loss_mlp": 1.04577184, + "epoch": 0.7664486340900346, + "flos": 485179766784.0, + "grad_norm": 0.05383529418711491, + "language_loss": 0.84338224, + "learning_rate": 0.0001363527729872267, + "loss": 0.8540864, + "num_input_tokens_seen": 330427792, + "router_z_loss_mlp": 0.24633789, + "step": 3984, + "time_per_iteration": 2.6385269165039062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074302, + "balance_loss_mlp": 1.04956603, + "epoch": 0.7666410157752982, + "flos": 646200820224.0, + "grad_norm": 0.06494738707995135, + "language_loss": 0.76830447, + "learning_rate": 0.00013613902414026207, + "loss": 0.77904749, + "num_input_tokens_seen": 330500320, + "router_z_loss_mlp": 0.24755859, + "step": 3985, + "time_per_iteration": 2.782332420349121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071114, + "balance_loss_mlp": 1.04673553, + "epoch": 0.7668333974605618, + "flos": 774303017472.0, + "grad_norm": 0.062182975481338824, + "language_loss": 0.82354724, + "learning_rate": 0.00013592541655665642, + "loss": 0.83425832, + "num_input_tokens_seen": 330581696, + "router_z_loss_mlp": 0.24389648, + "step": 3986, + "time_per_iteration": 2.9702792167663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072457, + "balance_loss_mlp": 1.04731619, + "epoch": 0.7670257791458254, + "flos": 613462574592.0, + "grad_norm": 0.06854938145673521, + "language_loss": 0.85176432, + "learning_rate": 0.00013571195031933947, + "loss": 0.86248893, + "num_input_tokens_seen": 330648000, + "router_z_loss_mlp": 0.25134277, + "step": 3987, + "time_per_iteration": 2.7414095401763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017121, + "balance_loss_mlp": 1.01035035, + "epoch": 0.7672181608310888, + "flos": 1485357378048.0, + "grad_norm": 0.007346318890350203, + "language_loss": 0.80481339, + "learning_rate": 0.00013549862551118626, + "loss": 0.81498468, + "num_input_tokens_seen": 330873872, + "router_z_loss_mlp": 0.06787109, + "step": 3988, + "time_per_iteration": 4.668884515762329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070415, + "balance_loss_mlp": 1.04586959, + "epoch": 0.7674105425163524, + "flos": 610732182528.0, + "grad_norm": 0.05768495800947346, + "language_loss": 0.85781401, + "learning_rate": 0.00013528544221501655, + "loss": 0.86851817, + "num_input_tokens_seen": 330945760, + "router_z_loss_mlp": 0.2454834, + "step": 3989, + "time_per_iteration": 2.7101733684539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082587, + "balance_loss_mlp": 1.05741, + "epoch": 0.767602924201616, + "flos": 845205788160.0, + "grad_norm": 0.055358014362887016, + "language_loss": 0.81702012, + "learning_rate": 0.00013507240051359586, + "loss": 0.82784599, + "num_input_tokens_seen": 331025584, + "router_z_loss_mlp": 0.25195312, + "step": 3990, + "time_per_iteration": 3.045398235321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076149, + "balance_loss_mlp": 1.05154467, + "epoch": 0.7677953058868796, + "flos": 527114635776.0, + "grad_norm": 0.06163508204720818, + "language_loss": 0.86476028, + "learning_rate": 0.00013485950048963425, + "loss": 0.87552178, + "num_input_tokens_seen": 331093008, + "router_z_loss_mlp": 0.24597168, + "step": 3991, + "time_per_iteration": 2.6160435676574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070646, + "balance_loss_mlp": 1.04651809, + "epoch": 0.7679876875721431, + "flos": 923550501888.0, + "grad_norm": 0.06405846615426129, + "language_loss": 0.83226049, + "learning_rate": 0.00013464674222578643, + "loss": 0.84296697, + "num_input_tokens_seen": 331177120, + "router_z_loss_mlp": 0.24133301, + "step": 3992, + "time_per_iteration": 3.241417407989502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075856, + "balance_loss_mlp": 1.05133498, + "epoch": 0.7681800692574067, + "flos": 458087311872.0, + "grad_norm": 0.0861576715386882, + "language_loss": 0.83366966, + "learning_rate": 0.00013443412580465292, + "loss": 0.84442818, + "num_input_tokens_seen": 331245424, + "router_z_loss_mlp": 0.24523926, + "step": 3993, + "time_per_iteration": 4.034927606582642 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077486, + "balance_loss_mlp": 1.05234468, + "epoch": 0.7683724509426703, + "flos": 658436179968.0, + "grad_norm": 0.06539433729122356, + "language_loss": 0.84409565, + "learning_rate": 0.00013422165130877857, + "loss": 0.8548705, + "num_input_tokens_seen": 331327504, + "router_z_loss_mlp": 0.25146484, + "step": 3994, + "time_per_iteration": 2.899876356124878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076399, + "balance_loss_mlp": 1.05162692, + "epoch": 0.7685648326279338, + "flos": 555284491776.0, + "grad_norm": 0.06486497816335288, + "language_loss": 0.80478024, + "learning_rate": 0.00013400931882065327, + "loss": 0.81554425, + "num_input_tokens_seen": 331398464, + "router_z_loss_mlp": 0.24755859, + "step": 3995, + "time_per_iteration": 2.637848138809204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075008, + "balance_loss_mlp": 1.04965222, + "epoch": 0.7687572143131974, + "flos": 687404081664.0, + "grad_norm": 0.05862406149444633, + "language_loss": 0.80990946, + "learning_rate": 0.0001337971284227118, + "loss": 0.82065952, + "num_input_tokens_seen": 331484592, + "router_z_loss_mlp": 0.25378418, + "step": 3996, + "time_per_iteration": 2.996047258377075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014574, + "balance_loss_mlp": 1.00766027, + "epoch": 0.7689495959984609, + "flos": 1489453691904.0, + "grad_norm": 0.0056434488295698, + "language_loss": 0.76118422, + "learning_rate": 0.00013358508019733388, + "loss": 0.77132994, + "num_input_tokens_seen": 331721360, + "router_z_loss_mlp": 0.06933594, + "step": 3997, + "time_per_iteration": 4.898136854171753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079533, + "balance_loss_mlp": 1.05454707, + "epoch": 0.7691419776837245, + "flos": 570405888000.0, + "grad_norm": 0.05325095394191187, + "language_loss": 0.8044312, + "learning_rate": 0.0001333731742268438, + "loss": 0.81522655, + "num_input_tokens_seen": 331794240, + "router_z_loss_mlp": 0.24963379, + "step": 3998, + "time_per_iteration": 2.682598352432251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074014, + "balance_loss_mlp": 1.0484314, + "epoch": 0.7693343593689881, + "flos": 520087495680.0, + "grad_norm": 0.05950995381117831, + "language_loss": 0.85969436, + "learning_rate": 0.0001331614105935109, + "loss": 0.87043446, + "num_input_tokens_seen": 331866496, + "router_z_loss_mlp": 0.25598145, + "step": 3999, + "time_per_iteration": 2.6892640590667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074883, + "balance_loss_mlp": 1.04993236, + "epoch": 0.7695267410542517, + "flos": 660378438144.0, + "grad_norm": 0.05504712261606648, + "language_loss": 0.84570682, + "learning_rate": 0.00013294978937954883, + "loss": 0.85645556, + "num_input_tokens_seen": 331936592, + "router_z_loss_mlp": 0.24963379, + "step": 4000, + "time_per_iteration": 2.7923429012298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069985, + "balance_loss_mlp": 1.04480815, + "epoch": 0.7697191227395151, + "flos": 546809564160.0, + "grad_norm": 0.06369425747465635, + "language_loss": 0.85450065, + "learning_rate": 0.00013273831066711655, + "loss": 0.86520052, + "num_input_tokens_seen": 332003536, + "router_z_loss_mlp": 0.25195312, + "step": 4001, + "time_per_iteration": 2.604282855987549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078533, + "balance_loss_mlp": 1.05357099, + "epoch": 0.7699115044247787, + "flos": 540610205184.0, + "grad_norm": 0.05227266936279928, + "language_loss": 0.80363703, + "learning_rate": 0.00013252697453831747, + "loss": 0.81442237, + "num_input_tokens_seen": 332075248, + "router_z_loss_mlp": 0.24975586, + "step": 4002, + "time_per_iteration": 2.7329213619232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075879, + "balance_loss_mlp": 1.04984355, + "epoch": 0.7701038861100423, + "flos": 562936407552.0, + "grad_norm": 0.05410166650315036, + "language_loss": 0.82619524, + "learning_rate": 0.00013231578107519916, + "loss": 0.836954, + "num_input_tokens_seen": 332158944, + "router_z_loss_mlp": 0.26037598, + "step": 4003, + "time_per_iteration": 2.8749208450317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070306, + "balance_loss_mlp": 1.04640484, + "epoch": 0.7702962677953059, + "flos": 481737964032.0, + "grad_norm": 0.06212967389396702, + "language_loss": 0.82997644, + "learning_rate": 0.00013210473035975422, + "loss": 0.84067953, + "num_input_tokens_seen": 332226368, + "router_z_loss_mlp": 0.23901367, + "step": 4004, + "time_per_iteration": 2.632235288619995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073797, + "balance_loss_mlp": 1.04878688, + "epoch": 0.7704886494805695, + "flos": 770389138944.0, + "grad_norm": 0.08127476835150631, + "language_loss": 0.85770059, + "learning_rate": 0.0001318938224739201, + "loss": 0.8684386, + "num_input_tokens_seen": 332314784, + "router_z_loss_mlp": 0.25012207, + "step": 4005, + "time_per_iteration": 3.1193981170654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069411, + "balance_loss_mlp": 1.04423416, + "epoch": 0.770681031165833, + "flos": 601192336896.0, + "grad_norm": 0.06450494228742032, + "language_loss": 0.84133303, + "learning_rate": 0.00013168305749957843, + "loss": 0.85202718, + "num_input_tokens_seen": 332387952, + "router_z_loss_mlp": 0.25183105, + "step": 4006, + "time_per_iteration": 2.7679009437561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066176, + "balance_loss_mlp": 1.04140389, + "epoch": 0.7708734128510966, + "flos": 496108302336.0, + "grad_norm": 0.05098549472423512, + "language_loss": 0.82851386, + "learning_rate": 0.00013147243551855532, + "loss": 0.83917558, + "num_input_tokens_seen": 332456352, + "router_z_loss_mlp": 0.24768066, + "step": 4007, + "time_per_iteration": 2.56661057472229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106967, + "balance_loss_mlp": 1.04553032, + "epoch": 0.7710657945363601, + "flos": 567299966976.0, + "grad_norm": 0.048065096858605744, + "language_loss": 0.80720365, + "learning_rate": 0.00013126195661262148, + "loss": 0.81790042, + "num_input_tokens_seen": 332534288, + "router_z_loss_mlp": 0.24133301, + "step": 4008, + "time_per_iteration": 2.748946189880371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073116, + "balance_loss_mlp": 1.04871428, + "epoch": 0.7712581762216237, + "flos": 604550075904.0, + "grad_norm": 0.05225893259169981, + "language_loss": 0.86849338, + "learning_rate": 0.00013105162086349216, + "loss": 0.87922454, + "num_input_tokens_seen": 332615440, + "router_z_loss_mlp": 0.24389648, + "step": 4009, + "time_per_iteration": 2.8441760540008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075731, + "balance_loss_mlp": 1.05036318, + "epoch": 0.7714505579068872, + "flos": 530894891520.0, + "grad_norm": 0.05592364071179798, + "language_loss": 0.86026949, + "learning_rate": 0.00013084142835282687, + "loss": 0.87102675, + "num_input_tokens_seen": 332687360, + "router_z_loss_mlp": 0.25390625, + "step": 4010, + "time_per_iteration": 2.6637930870056152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018937, + "balance_loss_mlp": 1.01207089, + "epoch": 0.7716429395921508, + "flos": 1422205267968.0, + "grad_norm": 0.009139726311940191, + "language_loss": 0.79884362, + "learning_rate": 0.00013063137916222956, + "loss": 0.80903304, + "num_input_tokens_seen": 332919936, + "router_z_loss_mlp": 0.06884766, + "step": 4011, + "time_per_iteration": 4.862056255340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071968, + "balance_loss_mlp": 1.04698229, + "epoch": 0.7718353212774144, + "flos": 578428563456.0, + "grad_norm": 0.059120136224188345, + "language_loss": 0.89225733, + "learning_rate": 0.0001304214733732485, + "loss": 0.90297705, + "num_input_tokens_seen": 332990096, + "router_z_loss_mlp": 0.24987793, + "step": 4012, + "time_per_iteration": 2.759030818939209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073689, + "balance_loss_mlp": 1.04848814, + "epoch": 0.772027702962678, + "flos": 510742941696.0, + "grad_norm": 0.06471041730678638, + "language_loss": 0.82730675, + "learning_rate": 0.00013021171106737672, + "loss": 0.83804369, + "num_input_tokens_seen": 333063616, + "router_z_loss_mlp": 0.25219727, + "step": 4013, + "time_per_iteration": 2.6606547832489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076087, + "balance_loss_mlp": 1.05184019, + "epoch": 0.7722200846479416, + "flos": 525661705728.0, + "grad_norm": 0.04900066226128831, + "language_loss": 0.79908812, + "learning_rate": 0.00013000209232605071, + "loss": 0.80984896, + "num_input_tokens_seen": 333136368, + "router_z_loss_mlp": 0.24230957, + "step": 4014, + "time_per_iteration": 2.665905237197876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078315, + "balance_loss_mlp": 1.05282855, + "epoch": 0.772412466333205, + "flos": 479598216192.0, + "grad_norm": 0.06112285511526787, + "language_loss": 0.79969144, + "learning_rate": 0.0001297926172306519, + "loss": 0.81047451, + "num_input_tokens_seen": 333207136, + "router_z_loss_mlp": 0.25500488, + "step": 4015, + "time_per_iteration": 2.657850503921509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070931, + "balance_loss_mlp": 1.04582596, + "epoch": 0.7726048480184686, + "flos": 905688801792.0, + "grad_norm": 0.05237005996318212, + "language_loss": 0.79076529, + "learning_rate": 0.0001295832858625055, + "loss": 0.80147457, + "num_input_tokens_seen": 333291920, + "router_z_loss_mlp": 0.25097656, + "step": 4016, + "time_per_iteration": 3.2558414936065674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073374, + "balance_loss_mlp": 1.04854274, + "epoch": 0.7727972297037322, + "flos": 631380801024.0, + "grad_norm": 0.05315298112926871, + "language_loss": 0.70125198, + "learning_rate": 0.00012937409830288154, + "loss": 0.71198577, + "num_input_tokens_seen": 333369824, + "router_z_loss_mlp": 0.24841309, + "step": 4017, + "time_per_iteration": 2.8071141242980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075792, + "balance_loss_mlp": 1.05123496, + "epoch": 0.7729896113889958, + "flos": 414786147840.0, + "grad_norm": 0.08174629048590813, + "language_loss": 0.85300404, + "learning_rate": 0.00012916505463299362, + "loss": 0.86376196, + "num_input_tokens_seen": 333434192, + "router_z_loss_mlp": 0.24572754, + "step": 4018, + "time_per_iteration": 2.521777868270874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107567, + "balance_loss_mlp": 1.05085087, + "epoch": 0.7731819930742593, + "flos": 668907694080.0, + "grad_norm": 0.06122283434294953, + "language_loss": 0.78065503, + "learning_rate": 0.00012895615493399972, + "loss": 0.7914117, + "num_input_tokens_seen": 333509696, + "router_z_loss_mlp": 0.24816895, + "step": 4019, + "time_per_iteration": 2.843815326690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077372, + "balance_loss_mlp": 1.05187333, + "epoch": 0.7733743747595229, + "flos": 489854615040.0, + "grad_norm": 0.06855885620407597, + "language_loss": 0.82437384, + "learning_rate": 0.00012874739928700192, + "loss": 0.83514762, + "num_input_tokens_seen": 333575184, + "router_z_loss_mlp": 0.25512695, + "step": 4020, + "time_per_iteration": 2.625136613845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074703, + "balance_loss_mlp": 1.049371, + "epoch": 0.7735667564447865, + "flos": 659612325888.0, + "grad_norm": 0.06838381213825387, + "language_loss": 0.79939824, + "learning_rate": 0.00012853878777304624, + "loss": 0.81014526, + "num_input_tokens_seen": 333651568, + "router_z_loss_mlp": 0.25341797, + "step": 4021, + "time_per_iteration": 2.870577335357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078589, + "balance_loss_mlp": 1.05390024, + "epoch": 0.77375913813005, + "flos": 533383004160.0, + "grad_norm": 0.05110075490537154, + "language_loss": 0.84490621, + "learning_rate": 0.000128330320473123, + "loss": 0.85569209, + "num_input_tokens_seen": 333726400, + "router_z_loss_mlp": 0.24694824, + "step": 4022, + "time_per_iteration": 2.6740787029266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016568, + "balance_loss_mlp": 1.0095588, + "epoch": 0.7739515198153136, + "flos": 1520081925120.0, + "grad_norm": 0.007922822581460296, + "language_loss": 0.783319, + "learning_rate": 0.00012812199746816628, + "loss": 0.79348469, + "num_input_tokens_seen": 333960224, + "router_z_loss_mlp": 0.0703125, + "step": 4023, + "time_per_iteration": 4.92633318901062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079759, + "balance_loss_mlp": 1.05466557, + "epoch": 0.7741439015005771, + "flos": 640105348608.0, + "grad_norm": 0.0679194814270071, + "language_loss": 0.81913227, + "learning_rate": 0.0001279138188390543, + "loss": 0.82992983, + "num_input_tokens_seen": 334033904, + "router_z_loss_mlp": 0.25109863, + "step": 4024, + "time_per_iteration": 2.822410821914673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074667, + "balance_loss_mlp": 1.05036056, + "epoch": 0.7743362831858407, + "flos": 665841420288.0, + "grad_norm": 0.05859090584356498, + "language_loss": 0.86860305, + "learning_rate": 0.00012770578466660915, + "loss": 0.87934977, + "num_input_tokens_seen": 334107904, + "router_z_loss_mlp": 0.24291992, + "step": 4025, + "time_per_iteration": 2.9427406787872314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081782, + "balance_loss_mlp": 1.05593777, + "epoch": 0.7745286648711043, + "flos": 562760939520.0, + "grad_norm": 0.062055121147232294, + "language_loss": 0.82006752, + "learning_rate": 0.0001274978950315968, + "loss": 0.83088535, + "num_input_tokens_seen": 334184048, + "router_z_loss_mlp": 0.25878906, + "step": 4026, + "time_per_iteration": 2.795128583908081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075185, + "balance_loss_mlp": 1.05006814, + "epoch": 0.7747210465563679, + "flos": 516912565248.0, + "grad_norm": 0.08117867948419291, + "language_loss": 0.83182287, + "learning_rate": 0.00012729015001472716, + "loss": 0.84257472, + "num_input_tokens_seen": 334257152, + "router_z_loss_mlp": 0.2512207, + "step": 4027, + "time_per_iteration": 2.6325039863586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074916, + "balance_loss_mlp": 1.04923844, + "epoch": 0.7749134282416313, + "flos": 634209937920.0, + "grad_norm": 0.05921270053212527, + "language_loss": 0.82036096, + "learning_rate": 0.00012708254969665418, + "loss": 0.83111012, + "num_input_tokens_seen": 334331312, + "router_z_loss_mlp": 0.25695801, + "step": 4028, + "time_per_iteration": 2.7775604724884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079508, + "balance_loss_mlp": 1.05439043, + "epoch": 0.7751058099268949, + "flos": 495364584960.0, + "grad_norm": 0.06748938029736128, + "language_loss": 0.83602798, + "learning_rate": 0.00012687509415797526, + "loss": 0.8468231, + "num_input_tokens_seen": 334397344, + "router_z_loss_mlp": 0.2512207, + "step": 4029, + "time_per_iteration": 2.550536632537842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077, + "balance_loss_mlp": 1.05216861, + "epoch": 0.7752981916121585, + "flos": 510310513152.0, + "grad_norm": 0.05440055390110852, + "language_loss": 0.81075221, + "learning_rate": 0.00012666778347923208, + "loss": 0.82152218, + "num_input_tokens_seen": 334467872, + "router_z_loss_mlp": 0.24829102, + "step": 4030, + "time_per_iteration": 2.627509593963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071803, + "balance_loss_mlp": 1.04774618, + "epoch": 0.7754905732974221, + "flos": 497548749312.0, + "grad_norm": 0.05010388479437805, + "language_loss": 0.84007275, + "learning_rate": 0.0001264606177409092, + "loss": 0.85079074, + "num_input_tokens_seen": 334539088, + "router_z_loss_mlp": 0.24047852, + "step": 4031, + "time_per_iteration": 2.6272945404052734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066927, + "balance_loss_mlp": 1.04220271, + "epoch": 0.7756829549826857, + "flos": 480744626688.0, + "grad_norm": 0.05768180331763808, + "language_loss": 0.86024618, + "learning_rate": 0.00012625359702343609, + "loss": 0.87091547, + "num_input_tokens_seen": 334612576, + "router_z_loss_mlp": 0.24707031, + "step": 4032, + "time_per_iteration": 2.7090940475463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066743, + "balance_loss_mlp": 1.04268646, + "epoch": 0.7758753366679492, + "flos": 552630822912.0, + "grad_norm": 0.05938615660994814, + "language_loss": 0.84870774, + "learning_rate": 0.00012604672140718504, + "loss": 0.85937512, + "num_input_tokens_seen": 334677824, + "router_z_loss_mlp": 0.24047852, + "step": 4033, + "time_per_iteration": 2.6182591915130615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069686, + "balance_loss_mlp": 1.04444957, + "epoch": 0.7760677183532128, + "flos": 703835246592.0, + "grad_norm": 0.06535534521633303, + "language_loss": 0.77696943, + "learning_rate": 0.00012583999097247233, + "loss": 0.78766632, + "num_input_tokens_seen": 334751456, + "router_z_loss_mlp": 0.25256348, + "step": 4034, + "time_per_iteration": 2.8029134273529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064158, + "balance_loss_mlp": 1.03980374, + "epoch": 0.7762601000384763, + "flos": 523470200832.0, + "grad_norm": 0.06345201912035602, + "language_loss": 0.79936808, + "learning_rate": 0.0001256334057995578, + "loss": 0.81000972, + "num_input_tokens_seen": 334823008, + "router_z_loss_mlp": 0.24365234, + "step": 4035, + "time_per_iteration": 2.7016162872314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063824, + "balance_loss_mlp": 1.03927922, + "epoch": 0.7764524817237399, + "flos": 557532896256.0, + "grad_norm": 0.056783072410337934, + "language_loss": 0.85302633, + "learning_rate": 0.000125426965968645, + "loss": 0.86366457, + "num_input_tokens_seen": 334896048, + "router_z_loss_mlp": 0.24536133, + "step": 4036, + "time_per_iteration": 2.699063301086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064554, + "balance_loss_mlp": 1.04023552, + "epoch": 0.7766448634090035, + "flos": 579725849088.0, + "grad_norm": 0.07280358929704372, + "language_loss": 0.82468921, + "learning_rate": 0.00012522067155988092, + "loss": 0.83533478, + "num_input_tokens_seen": 334964416, + "router_z_loss_mlp": 0.24304199, + "step": 4037, + "time_per_iteration": 2.6608784198760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065667, + "balance_loss_mlp": 1.04110956, + "epoch": 0.776837245094267, + "flos": 635603397120.0, + "grad_norm": 0.05908438933946337, + "language_loss": 0.75255591, + "learning_rate": 0.00012501452265335617, + "loss": 0.76321256, + "num_input_tokens_seen": 335043360, + "router_z_loss_mlp": 0.24560547, + "step": 4038, + "time_per_iteration": 2.8470029830932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068825, + "balance_loss_mlp": 1.04455364, + "epoch": 0.7770296267795306, + "flos": 614680565760.0, + "grad_norm": 0.059889567588302765, + "language_loss": 0.83042991, + "learning_rate": 0.0001248085193291047, + "loss": 0.84111816, + "num_input_tokens_seen": 335113216, + "router_z_loss_mlp": 0.24255371, + "step": 4039, + "time_per_iteration": 2.730807304382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068961, + "balance_loss_mlp": 1.04434443, + "epoch": 0.7772220084647942, + "flos": 878808890880.0, + "grad_norm": 0.05468138841735705, + "language_loss": 0.82561696, + "learning_rate": 0.00012460266166710443, + "loss": 0.83630657, + "num_input_tokens_seen": 335195824, + "router_z_loss_mlp": 0.24609375, + "step": 4040, + "time_per_iteration": 3.2041711807250977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064376, + "balance_loss_mlp": 1.04067707, + "epoch": 0.7774143901500578, + "flos": 839641489920.0, + "grad_norm": 0.06748219458401046, + "language_loss": 0.77782911, + "learning_rate": 0.00012439694974727633, + "loss": 0.78847289, + "num_input_tokens_seen": 335269712, + "router_z_loss_mlp": 0.23706055, + "step": 4041, + "time_per_iteration": 3.0317485332489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066638, + "balance_loss_mlp": 1.04206872, + "epoch": 0.7776067718353212, + "flos": 568147571712.0, + "grad_norm": 0.05414460584794901, + "language_loss": 0.80244517, + "learning_rate": 0.00012419138364948458, + "loss": 0.81311154, + "num_input_tokens_seen": 335343408, + "router_z_loss_mlp": 0.24560547, + "step": 4042, + "time_per_iteration": 2.7031445503234863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063112, + "balance_loss_mlp": 1.03903186, + "epoch": 0.7777991535205848, + "flos": 745943012352.0, + "grad_norm": 0.05641629712994933, + "language_loss": 0.8246541, + "learning_rate": 0.00012398596345353702, + "loss": 0.83528519, + "num_input_tokens_seen": 335415360, + "router_z_loss_mlp": 0.24072266, + "step": 4043, + "time_per_iteration": 2.9440853595733643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068598, + "balance_loss_mlp": 1.04396939, + "epoch": 0.7779915352058484, + "flos": 538075104768.0, + "grad_norm": 0.06391086688710987, + "language_loss": 0.83438706, + "learning_rate": 0.0001237806892391851, + "loss": 0.84507304, + "num_input_tokens_seen": 335491568, + "router_z_loss_mlp": 0.24621582, + "step": 4044, + "time_per_iteration": 2.699157476425171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070628, + "balance_loss_mlp": 1.04549861, + "epoch": 0.778183916891112, + "flos": 634788099072.0, + "grad_norm": 0.06994716374064003, + "language_loss": 0.81070113, + "learning_rate": 0.0001235755610861233, + "loss": 0.82140744, + "num_input_tokens_seen": 335567200, + "router_z_loss_mlp": 0.25134277, + "step": 4045, + "time_per_iteration": 2.772141933441162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063646, + "balance_loss_mlp": 1.03941059, + "epoch": 0.7783762985763756, + "flos": 588677621760.0, + "grad_norm": 0.05993243352080289, + "language_loss": 0.8561902, + "learning_rate": 0.0001233705790739893, + "loss": 0.86682665, + "num_input_tokens_seen": 335640512, + "router_z_loss_mlp": 0.2421875, + "step": 4046, + "time_per_iteration": 2.715252637863159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063047, + "balance_loss_mlp": 1.03909791, + "epoch": 0.7785686802616391, + "flos": 930656563200.0, + "grad_norm": 0.05972072663503075, + "language_loss": 0.74957597, + "learning_rate": 0.0001231657432823643, + "loss": 0.7602064, + "num_input_tokens_seen": 335726016, + "router_z_loss_mlp": 0.23937988, + "step": 4047, + "time_per_iteration": 3.2537522315979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068015, + "balance_loss_mlp": 1.04351759, + "epoch": 0.7787610619469026, + "flos": 497934190080.0, + "grad_norm": 0.08653070667167902, + "language_loss": 0.79029131, + "learning_rate": 0.0001229610537907725, + "loss": 0.80097145, + "num_input_tokens_seen": 335794864, + "router_z_loss_mlp": 0.24511719, + "step": 4048, + "time_per_iteration": 2.6116526126861572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068205, + "balance_loss_mlp": 1.04393375, + "epoch": 0.7789534436321662, + "flos": 515637674496.0, + "grad_norm": 0.07563630490624115, + "language_loss": 0.9076525, + "learning_rate": 0.00012275651067868143, + "loss": 0.91833448, + "num_input_tokens_seen": 335860928, + "router_z_loss_mlp": 0.24255371, + "step": 4049, + "time_per_iteration": 2.6179893016815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067923, + "balance_loss_mlp": 1.04335427, + "epoch": 0.7791458253174298, + "flos": 988476369408.0, + "grad_norm": 0.05280851265506485, + "language_loss": 0.80811793, + "learning_rate": 0.00012255211402550182, + "loss": 0.81879717, + "num_input_tokens_seen": 335945728, + "router_z_loss_mlp": 0.24572754, + "step": 4050, + "time_per_iteration": 3.24564266204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106679, + "balance_loss_mlp": 1.0426023, + "epoch": 0.7793382070026933, + "flos": 629040992256.0, + "grad_norm": 0.06858136893192632, + "language_loss": 0.76661634, + "learning_rate": 0.00012234786391058727, + "loss": 0.77728426, + "num_input_tokens_seen": 336014848, + "router_z_loss_mlp": 0.24194336, + "step": 4051, + "time_per_iteration": 2.7757604122161865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072241, + "balance_loss_mlp": 1.04733872, + "epoch": 0.7795305886879569, + "flos": 531752408064.0, + "grad_norm": 0.06727738365211818, + "language_loss": 0.85258687, + "learning_rate": 0.0001221437604132352, + "loss": 0.86330926, + "num_input_tokens_seen": 336080096, + "router_z_loss_mlp": 0.24914551, + "step": 4052, + "time_per_iteration": 2.6063547134399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068764, + "balance_loss_mlp": 1.04383731, + "epoch": 0.7797229703732205, + "flos": 611979909120.0, + "grad_norm": 0.07458964549292091, + "language_loss": 0.81427658, + "learning_rate": 0.0001219398036126852, + "loss": 0.82496417, + "num_input_tokens_seen": 336154640, + "router_z_loss_mlp": 0.24926758, + "step": 4053, + "time_per_iteration": 2.7283682823181152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069627, + "balance_loss_mlp": 1.04429483, + "epoch": 0.7799153520584841, + "flos": 872164620288.0, + "grad_norm": 0.05694657807222903, + "language_loss": 0.78109223, + "learning_rate": 0.00012173599358812027, + "loss": 0.79178852, + "num_input_tokens_seen": 336244160, + "router_z_loss_mlp": 0.25341797, + "step": 4054, + "time_per_iteration": 3.234590768814087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071917, + "balance_loss_mlp": 1.04676414, + "epoch": 0.7801077337437476, + "flos": 583627244544.0, + "grad_norm": 0.07419180869397772, + "language_loss": 0.82674354, + "learning_rate": 0.0001215323304186668, + "loss": 0.83746266, + "num_input_tokens_seen": 336317936, + "router_z_loss_mlp": 0.25170898, + "step": 4055, + "time_per_iteration": 2.747619152069092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068312, + "balance_loss_mlp": 1.04483986, + "epoch": 0.7803001154290111, + "flos": 601165172736.0, + "grad_norm": 0.05875387608266639, + "language_loss": 0.88048428, + "learning_rate": 0.00012132881418339364, + "loss": 0.8911674, + "num_input_tokens_seen": 336389504, + "router_z_loss_mlp": 0.23449707, + "step": 4056, + "time_per_iteration": 2.711988687515259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020033, + "balance_loss_mlp": 1.01268935, + "epoch": 0.7804924971142747, + "flos": 1479577591296.0, + "grad_norm": 0.018856968303163506, + "language_loss": 0.77517563, + "learning_rate": 0.00012112544496131306, + "loss": 0.78537595, + "num_input_tokens_seen": 336615536, + "router_z_loss_mlp": 0.07324219, + "step": 4057, + "time_per_iteration": 4.845250129699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068664, + "balance_loss_mlp": 1.04403543, + "epoch": 0.7806848787995383, + "flos": 630362870784.0, + "grad_norm": 0.07123577938940119, + "language_loss": 0.76748145, + "learning_rate": 0.00012092222283137944, + "loss": 0.77816808, + "num_input_tokens_seen": 336686400, + "router_z_loss_mlp": 0.24633789, + "step": 4058, + "time_per_iteration": 2.7359137535095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015158, + "balance_loss_mlp": 1.00781476, + "epoch": 0.7808772604848019, + "flos": 1417587319296.0, + "grad_norm": 0.015685369403867444, + "language_loss": 0.7890631, + "learning_rate": 0.00012071914787249111, + "loss": 0.79921466, + "num_input_tokens_seen": 336912704, + "router_z_loss_mlp": 0.07324219, + "step": 4059, + "time_per_iteration": 4.798109292984009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068881, + "balance_loss_mlp": 1.0447526, + "epoch": 0.7810696421700654, + "flos": 731696011776.0, + "grad_norm": 0.05299878085313822, + "language_loss": 0.83917785, + "learning_rate": 0.00012051622016348856, + "loss": 0.84986663, + "num_input_tokens_seen": 336997040, + "router_z_loss_mlp": 0.24121094, + "step": 4060, + "time_per_iteration": 3.049039125442505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068121, + "balance_loss_mlp": 1.04395747, + "epoch": 0.781262023855329, + "flos": 424941230592.0, + "grad_norm": 0.06624192950559629, + "language_loss": 0.84473646, + "learning_rate": 0.00012031343978315539, + "loss": 0.85541761, + "num_input_tokens_seen": 337059760, + "router_z_loss_mlp": 0.24169922, + "step": 4061, + "time_per_iteration": 2.5507752895355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065012, + "balance_loss_mlp": 1.03988302, + "epoch": 0.7814544055405925, + "flos": 501027628032.0, + "grad_norm": 0.06423319540710895, + "language_loss": 0.82729137, + "learning_rate": 0.00012011080681021774, + "loss": 0.83794153, + "num_input_tokens_seen": 337128528, + "router_z_loss_mlp": 0.2512207, + "step": 4062, + "time_per_iteration": 2.611497640609741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067455, + "balance_loss_mlp": 1.04255247, + "epoch": 0.7816467872258561, + "flos": 462448300032.0, + "grad_norm": 0.05897582780878463, + "language_loss": 0.86540973, + "learning_rate": 0.00011990832132334512, + "loss": 0.87608433, + "num_input_tokens_seen": 337194112, + "router_z_loss_mlp": 0.24902344, + "step": 4063, + "time_per_iteration": 2.538100481033325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069255, + "balance_loss_mlp": 1.04350615, + "epoch": 0.7818391689111197, + "flos": 740818483200.0, + "grad_norm": 0.06886794650581697, + "language_loss": 0.82671422, + "learning_rate": 0.00011970598340114897, + "loss": 0.83740675, + "num_input_tokens_seen": 337270416, + "router_z_loss_mlp": 0.25769043, + "step": 4064, + "time_per_iteration": 2.9499306678771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067844, + "balance_loss_mlp": 1.04288173, + "epoch": 0.7820315505963832, + "flos": 547669278720.0, + "grad_norm": 0.06050124300613184, + "language_loss": 0.83932686, + "learning_rate": 0.00011950379312218396, + "loss": 0.85000533, + "num_input_tokens_seen": 337343024, + "router_z_loss_mlp": 0.24975586, + "step": 4065, + "time_per_iteration": 2.6981561183929443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066746, + "balance_loss_mlp": 1.04218912, + "epoch": 0.7822239322816468, + "flos": 728983245312.0, + "grad_norm": 0.0632552129471463, + "language_loss": 0.86015439, + "learning_rate": 0.00011930175056494719, + "loss": 0.87082189, + "num_input_tokens_seen": 337417232, + "router_z_loss_mlp": 0.24560547, + "step": 4066, + "time_per_iteration": 2.8711161613464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072113, + "balance_loss_mlp": 1.04645956, + "epoch": 0.7824163139669104, + "flos": 452016433152.0, + "grad_norm": 0.04894683237800459, + "language_loss": 0.76267391, + "learning_rate": 0.00011909985580787885, + "loss": 0.77339506, + "num_input_tokens_seen": 337488224, + "router_z_loss_mlp": 0.25683594, + "step": 4067, + "time_per_iteration": 2.623110771179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067479, + "balance_loss_mlp": 1.04259992, + "epoch": 0.782608695652174, + "flos": 540489065472.0, + "grad_norm": 0.051509930807311505, + "language_loss": 0.81230474, + "learning_rate": 0.00011889810892936137, + "loss": 0.82297957, + "num_input_tokens_seen": 337564928, + "router_z_loss_mlp": 0.24865723, + "step": 4068, + "time_per_iteration": 2.7267825603485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072363, + "balance_loss_mlp": 1.04626799, + "epoch": 0.7828010773374374, + "flos": 500308503552.0, + "grad_norm": 0.06989547292093733, + "language_loss": 0.7721833, + "learning_rate": 0.00011869651000771959, + "loss": 0.78290695, + "num_input_tokens_seen": 337641632, + "router_z_loss_mlp": 0.26086426, + "step": 4069, + "time_per_iteration": 2.8386263847351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066766, + "balance_loss_mlp": 1.04239988, + "epoch": 0.782993459022701, + "flos": 600816807936.0, + "grad_norm": 0.05590330027486899, + "language_loss": 0.82998097, + "learning_rate": 0.00011849505912122117, + "loss": 0.84064865, + "num_input_tokens_seen": 337711968, + "router_z_loss_mlp": 0.24353027, + "step": 4070, + "time_per_iteration": 2.7070353031158447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071318, + "balance_loss_mlp": 1.04591501, + "epoch": 0.7831858407079646, + "flos": 810055779840.0, + "grad_norm": 0.07526003762503049, + "language_loss": 0.77768147, + "learning_rate": 0.00011829375634807654, + "loss": 0.78839469, + "num_input_tokens_seen": 337795792, + "router_z_loss_mlp": 0.25415039, + "step": 4071, + "time_per_iteration": 3.020780324935913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070393, + "balance_loss_mlp": 1.04478693, + "epoch": 0.7833782223932282, + "flos": 806594153472.0, + "grad_norm": 0.07034512653521276, + "language_loss": 0.81144375, + "learning_rate": 0.00011809260176643821, + "loss": 0.82214773, + "num_input_tokens_seen": 337875584, + "router_z_loss_mlp": 0.25634766, + "step": 4072, + "time_per_iteration": 3.048811674118042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070982, + "balance_loss_mlp": 1.0463053, + "epoch": 0.7835706040784918, + "flos": 520870860288.0, + "grad_norm": 0.059023675544883115, + "language_loss": 0.83701313, + "learning_rate": 0.00011789159545440131, + "loss": 0.84772301, + "num_input_tokens_seen": 337942304, + "router_z_loss_mlp": 0.2467041, + "step": 4073, + "time_per_iteration": 2.576185703277588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107004, + "balance_loss_mlp": 1.04587591, + "epoch": 0.7837629857637552, + "flos": 505605929472.0, + "grad_norm": 0.05002266794770535, + "language_loss": 0.82456756, + "learning_rate": 0.00011769073749000348, + "loss": 0.83526802, + "num_input_tokens_seen": 338020864, + "router_z_loss_mlp": 0.24169922, + "step": 4074, + "time_per_iteration": 2.8168578147888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079683, + "balance_loss_mlp": 1.05485141, + "epoch": 0.7839553674490188, + "flos": 516124431360.0, + "grad_norm": 0.06200789049799382, + "language_loss": 0.76390898, + "learning_rate": 0.0001174900279512246, + "loss": 0.77470577, + "num_input_tokens_seen": 338089584, + "router_z_loss_mlp": 0.24829102, + "step": 4075, + "time_per_iteration": 2.5813939571380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073387, + "balance_loss_mlp": 1.04935431, + "epoch": 0.7841477491342824, + "flos": 506648825856.0, + "grad_norm": 0.0619484344784815, + "language_loss": 0.81889015, + "learning_rate": 0.00011728946691598707, + "loss": 0.82962406, + "num_input_tokens_seen": 338159568, + "router_z_loss_mlp": 0.2401123, + "step": 4076, + "time_per_iteration": 2.604560136795044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072391, + "balance_loss_mlp": 1.04755974, + "epoch": 0.784340130819546, + "flos": 719636120064.0, + "grad_norm": 0.06799226151214174, + "language_loss": 0.76469254, + "learning_rate": 0.00011708905446215561, + "loss": 0.77541649, + "num_input_tokens_seen": 338233952, + "router_z_loss_mlp": 0.24841309, + "step": 4077, + "time_per_iteration": 2.8604230880737305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076266, + "balance_loss_mlp": 1.05137515, + "epoch": 0.7845325125048095, + "flos": 514441704960.0, + "grad_norm": 0.05961897466734042, + "language_loss": 0.80370939, + "learning_rate": 0.00011688879066753711, + "loss": 0.81447208, + "num_input_tokens_seen": 338309568, + "router_z_loss_mlp": 0.24914551, + "step": 4078, + "time_per_iteration": 2.676522970199585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076486, + "balance_loss_mlp": 1.05258489, + "epoch": 0.7847248941900731, + "flos": 466102646784.0, + "grad_norm": 0.06622191956248384, + "language_loss": 0.87440282, + "learning_rate": 0.00011668867560988122, + "loss": 0.88516766, + "num_input_tokens_seen": 338375920, + "router_z_loss_mlp": 0.23876953, + "step": 4079, + "time_per_iteration": 2.5650012493133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073753, + "balance_loss_mlp": 1.04890943, + "epoch": 0.7849172758753367, + "flos": 503028983808.0, + "grad_norm": 0.05649669086947294, + "language_loss": 0.84200358, + "learning_rate": 0.00011648870936687916, + "loss": 0.85274112, + "num_input_tokens_seen": 338452208, + "router_z_loss_mlp": 0.24853516, + "step": 4080, + "time_per_iteration": 2.746581554412842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077673, + "balance_loss_mlp": 1.05274642, + "epoch": 0.7851096575606002, + "flos": 531999456768.0, + "grad_norm": 0.06352176738045938, + "language_loss": 0.78612041, + "learning_rate": 0.00011628889201616461, + "loss": 0.79689717, + "num_input_tokens_seen": 338522864, + "router_z_loss_mlp": 0.24926758, + "step": 4081, + "time_per_iteration": 2.6527559757232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107817, + "balance_loss_mlp": 1.05413723, + "epoch": 0.7853020392458638, + "flos": 569956207104.0, + "grad_norm": 0.059333709179443264, + "language_loss": 0.81988198, + "learning_rate": 0.00011608922363531393, + "loss": 0.83066362, + "num_input_tokens_seen": 338591024, + "router_z_loss_mlp": 0.24023438, + "step": 4082, + "time_per_iteration": 2.6462624073028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107947, + "balance_loss_mlp": 1.05538988, + "epoch": 0.7854944209311273, + "flos": 832579845120.0, + "grad_norm": 0.07948062508408354, + "language_loss": 0.83596992, + "learning_rate": 0.00011588970430184504, + "loss": 0.84676462, + "num_input_tokens_seen": 338669616, + "router_z_loss_mlp": 0.24072266, + "step": 4083, + "time_per_iteration": 3.0286946296691895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076264, + "balance_loss_mlp": 1.05245781, + "epoch": 0.7856868026163909, + "flos": 559929604608.0, + "grad_norm": 0.051566265858254114, + "language_loss": 0.81763256, + "learning_rate": 0.00011569033409321822, + "loss": 0.82839513, + "num_input_tokens_seen": 338740416, + "router_z_loss_mlp": 0.23803711, + "step": 4084, + "time_per_iteration": 2.677654981613159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074536, + "balance_loss_mlp": 1.051386, + "epoch": 0.7858791843016545, + "flos": 545230725120.0, + "grad_norm": 0.08091390991612231, + "language_loss": 0.73483807, + "learning_rate": 0.00011549111308683591, + "loss": 0.74558342, + "num_input_tokens_seen": 338807664, + "router_z_loss_mlp": 0.23144531, + "step": 4085, + "time_per_iteration": 2.658348560333252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107372, + "balance_loss_mlp": 1.04996181, + "epoch": 0.7860715659869181, + "flos": 380997665280.0, + "grad_norm": 0.08162659995684916, + "language_loss": 0.80996692, + "learning_rate": 0.00011529204136004251, + "loss": 0.8207041, + "num_input_tokens_seen": 338869472, + "router_z_loss_mlp": 0.23754883, + "step": 4086, + "time_per_iteration": 2.4145894050598145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075099, + "balance_loss_mlp": 1.05042303, + "epoch": 0.7862639476721817, + "flos": 567440930304.0, + "grad_norm": 0.05377876358731873, + "language_loss": 0.84456086, + "learning_rate": 0.00011509311899012459, + "loss": 0.85531187, + "num_input_tokens_seen": 338941312, + "router_z_loss_mlp": 0.2467041, + "step": 4087, + "time_per_iteration": 2.6459927558898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107412, + "balance_loss_mlp": 1.05020642, + "epoch": 0.7864563293574451, + "flos": 545238065664.0, + "grad_norm": 0.06320948891726123, + "language_loss": 0.78042746, + "learning_rate": 0.00011489434605431053, + "loss": 0.79116869, + "num_input_tokens_seen": 339010208, + "router_z_loss_mlp": 0.23901367, + "step": 4088, + "time_per_iteration": 2.6297101974487305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071348, + "balance_loss_mlp": 1.04667187, + "epoch": 0.7866487110427087, + "flos": 563536963584.0, + "grad_norm": 0.0677420392553831, + "language_loss": 0.81525648, + "learning_rate": 0.0001146957226297708, + "loss": 0.82596999, + "num_input_tokens_seen": 339081232, + "router_z_loss_mlp": 0.24682617, + "step": 4089, + "time_per_iteration": 2.686326026916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078205, + "balance_loss_mlp": 1.05306411, + "epoch": 0.7868410927279723, + "flos": 728189968896.0, + "grad_norm": 0.0625994839156693, + "language_loss": 0.76622844, + "learning_rate": 0.00011449724879361827, + "loss": 0.77701044, + "num_input_tokens_seen": 339161040, + "router_z_loss_mlp": 0.25146484, + "step": 4090, + "time_per_iteration": 2.944439649581909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072693, + "balance_loss_mlp": 1.04931605, + "epoch": 0.7870334744132359, + "flos": 521355045888.0, + "grad_norm": 0.07495100545355479, + "language_loss": 0.73970717, + "learning_rate": 0.00011429892462290687, + "loss": 0.7504341, + "num_input_tokens_seen": 339233984, + "router_z_loss_mlp": 0.23376465, + "step": 4091, + "time_per_iteration": 2.666902542114258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076674, + "balance_loss_mlp": 1.05143714, + "epoch": 0.7872258560984994, + "flos": 451411107840.0, + "grad_norm": 0.08113379583083238, + "language_loss": 0.83441567, + "learning_rate": 0.00011410075019463295, + "loss": 0.84518242, + "num_input_tokens_seen": 339303168, + "router_z_loss_mlp": 0.25256348, + "step": 4092, + "time_per_iteration": 2.6106560230255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077622, + "balance_loss_mlp": 1.05387568, + "epoch": 0.787418237783763, + "flos": 515195334144.0, + "grad_norm": 0.06484532402040227, + "language_loss": 0.80351543, + "learning_rate": 0.00011390272558573461, + "loss": 0.81429172, + "num_input_tokens_seen": 339374512, + "router_z_loss_mlp": 0.23730469, + "step": 4093, + "time_per_iteration": 2.6585676670074463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068179, + "balance_loss_mlp": 1.04527855, + "epoch": 0.7876106194690266, + "flos": 485081021952.0, + "grad_norm": 0.06429182980195784, + "language_loss": 0.79664457, + "learning_rate": 0.00011370485087309202, + "loss": 0.80732632, + "num_input_tokens_seen": 339442720, + "router_z_loss_mlp": 0.22888184, + "step": 4094, + "time_per_iteration": 2.6336958408355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077617, + "balance_loss_mlp": 1.05357265, + "epoch": 0.7878030011542901, + "flos": 542841357312.0, + "grad_norm": 0.05995704636978539, + "language_loss": 0.79134881, + "learning_rate": 0.00011350712613352688, + "loss": 0.80212498, + "num_input_tokens_seen": 339508800, + "router_z_loss_mlp": 0.24023438, + "step": 4095, + "time_per_iteration": 2.7010250091552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075966, + "balance_loss_mlp": 1.05183816, + "epoch": 0.7879953828395537, + "flos": 516739668480.0, + "grad_norm": 0.08641736652738899, + "language_loss": 0.79282218, + "learning_rate": 0.00011330955144380283, + "loss": 0.80358183, + "num_input_tokens_seen": 339578048, + "router_z_loss_mlp": 0.24121094, + "step": 4096, + "time_per_iteration": 2.608861207962036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075916, + "balance_loss_mlp": 1.0511322, + "epoch": 0.7881877645248172, + "flos": 582278201856.0, + "grad_norm": 0.05952811942716876, + "language_loss": 0.8624779, + "learning_rate": 0.00011311212688062483, + "loss": 0.87323707, + "num_input_tokens_seen": 339650176, + "router_z_loss_mlp": 0.24780273, + "step": 4097, + "time_per_iteration": 2.779981851577759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071321, + "balance_loss_mlp": 1.04694271, + "epoch": 0.7883801462100808, + "flos": 589171719168.0, + "grad_norm": 0.07195937030471744, + "language_loss": 0.77942967, + "learning_rate": 0.0001129148525206402, + "loss": 0.79014289, + "num_input_tokens_seen": 339727312, + "router_z_loss_mlp": 0.24365234, + "step": 4098, + "time_per_iteration": 2.821665048599243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073378, + "balance_loss_mlp": 1.04944086, + "epoch": 0.7885725278953444, + "flos": 481728052224.0, + "grad_norm": 0.06162490932245696, + "language_loss": 0.86554086, + "learning_rate": 0.00011271772844043759, + "loss": 0.87627459, + "num_input_tokens_seen": 339801344, + "router_z_loss_mlp": 0.23913574, + "step": 4099, + "time_per_iteration": 2.6586296558380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071216, + "balance_loss_mlp": 1.04692149, + "epoch": 0.788764909580608, + "flos": 756794824704.0, + "grad_norm": 0.0681580072696929, + "language_loss": 0.75896525, + "learning_rate": 0.00011252075471654727, + "loss": 0.7696774, + "num_input_tokens_seen": 339877840, + "router_z_loss_mlp": 0.24279785, + "step": 4100, + "time_per_iteration": 2.9564926624298096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078491, + "balance_loss_mlp": 1.05411243, + "epoch": 0.7889572912658714, + "flos": 702555213312.0, + "grad_norm": 0.05782591826916926, + "language_loss": 0.78057027, + "learning_rate": 0.00011232393142544133, + "loss": 0.79135513, + "num_input_tokens_seen": 339959568, + "router_z_loss_mlp": 0.24365234, + "step": 4101, + "time_per_iteration": 2.9211971759796143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071535, + "balance_loss_mlp": 1.04766965, + "epoch": 0.789149672951135, + "flos": 736405364736.0, + "grad_norm": 0.05727153385577965, + "language_loss": 0.83000249, + "learning_rate": 0.00011212725864353323, + "loss": 0.84071785, + "num_input_tokens_seen": 340043600, + "router_z_loss_mlp": 0.23864746, + "step": 4102, + "time_per_iteration": 3.0621325969696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010163, + "balance_loss_mlp": 1.00324917, + "epoch": 0.7893420546363986, + "flos": 1481396511744.0, + "grad_norm": 0.011162541851203939, + "language_loss": 0.76335925, + "learning_rate": 0.00011193073644717822, + "loss": 0.77346092, + "num_input_tokens_seen": 340270608, + "router_z_loss_mlp": 0.06933594, + "step": 4103, + "time_per_iteration": 4.843589782714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079314, + "balance_loss_mlp": 1.05412483, + "epoch": 0.7895344363216622, + "flos": 509072698368.0, + "grad_norm": 0.06819602416212077, + "language_loss": 0.7587899, + "learning_rate": 0.00011173436491267291, + "loss": 0.76958299, + "num_input_tokens_seen": 340338784, + "router_z_loss_mlp": 0.25195312, + "step": 4104, + "time_per_iteration": 2.5900800228118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071242, + "balance_loss_mlp": 1.04709017, + "epoch": 0.7897268180069258, + "flos": 541988983296.0, + "grad_norm": 0.058920781794481306, + "language_loss": 0.82051444, + "learning_rate": 0.0001115381441162554, + "loss": 0.83122683, + "num_input_tokens_seen": 340407744, + "router_z_loss_mlp": 0.24145508, + "step": 4105, + "time_per_iteration": 2.615739345550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010138, + "balance_loss_mlp": 1.00322342, + "epoch": 0.7899191996921893, + "flos": 1412687817216.0, + "grad_norm": 0.008527676451490414, + "language_loss": 0.73583722, + "learning_rate": 0.00011134207413410557, + "loss": 0.74593866, + "num_input_tokens_seen": 340635824, + "router_z_loss_mlp": 0.06933594, + "step": 4106, + "time_per_iteration": 4.8757970333099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073097, + "balance_loss_mlp": 1.04834938, + "epoch": 0.7901115813774529, + "flos": 622841633280.0, + "grad_norm": 0.06585069884795663, + "language_loss": 0.85103577, + "learning_rate": 0.00011114615504234465, + "loss": 0.8617667, + "num_input_tokens_seen": 340710928, + "router_z_loss_mlp": 0.24743652, + "step": 4107, + "time_per_iteration": 2.7732784748077393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069208, + "balance_loss_mlp": 1.04528236, + "epoch": 0.7903039630627164, + "flos": 645545935872.0, + "grad_norm": 0.058284414227118955, + "language_loss": 0.80976272, + "learning_rate": 0.00011095038691703468, + "loss": 0.82045484, + "num_input_tokens_seen": 340786128, + "router_z_loss_mlp": 0.23913574, + "step": 4108, + "time_per_iteration": 2.8352532386779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068205, + "balance_loss_mlp": 1.04486418, + "epoch": 0.79049634474798, + "flos": 594365257728.0, + "grad_norm": 0.05943698489540686, + "language_loss": 0.82732534, + "learning_rate": 0.00011075476983417998, + "loss": 0.83800745, + "num_input_tokens_seen": 340861616, + "router_z_loss_mlp": 0.2331543, + "step": 4109, + "time_per_iteration": 2.8430795669555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074219, + "balance_loss_mlp": 1.04974484, + "epoch": 0.7906887264332435, + "flos": 716093001216.0, + "grad_norm": 0.060761374813596516, + "language_loss": 0.77827907, + "learning_rate": 0.00011055930386972579, + "loss": 0.78902125, + "num_input_tokens_seen": 340934480, + "router_z_loss_mlp": 0.24487305, + "step": 4110, + "time_per_iteration": 2.8313000202178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075057, + "balance_loss_mlp": 1.04976118, + "epoch": 0.7908811081185071, + "flos": 789893918208.0, + "grad_norm": 0.06263812917256945, + "language_loss": 0.78423452, + "learning_rate": 0.00011036398909955863, + "loss": 0.79498512, + "num_input_tokens_seen": 341014912, + "router_z_loss_mlp": 0.2532959, + "step": 4111, + "time_per_iteration": 2.952772378921509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070116, + "balance_loss_mlp": 1.04626179, + "epoch": 0.7910734898037707, + "flos": 641904072192.0, + "grad_norm": 0.053341385482647614, + "language_loss": 0.8184247, + "learning_rate": 0.00011016882559950648, + "loss": 0.82912588, + "num_input_tokens_seen": 341090608, + "router_z_loss_mlp": 0.23852539, + "step": 4112, + "time_per_iteration": 2.8120028972625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071944, + "balance_loss_mlp": 1.0478282, + "epoch": 0.7912658714890343, + "flos": 669357374976.0, + "grad_norm": 0.05974230031891692, + "language_loss": 0.81143081, + "learning_rate": 0.00010997381344533853, + "loss": 0.82215035, + "num_input_tokens_seen": 341160992, + "router_z_loss_mlp": 0.2409668, + "step": 4113, + "time_per_iteration": 2.7792022228240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072154, + "balance_loss_mlp": 1.04729867, + "epoch": 0.7914582531742979, + "flos": 557779944960.0, + "grad_norm": 0.1069423386655018, + "language_loss": 0.80604601, + "learning_rate": 0.00010977895271276517, + "loss": 0.81676757, + "num_input_tokens_seen": 341232032, + "router_z_loss_mlp": 0.24853516, + "step": 4114, + "time_per_iteration": 2.663350820541382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076688, + "balance_loss_mlp": 1.05166614, + "epoch": 0.7916506348595613, + "flos": 570064863744.0, + "grad_norm": 0.054772292847761084, + "language_loss": 0.80259216, + "learning_rate": 0.00010958424347743807, + "loss": 0.81335902, + "num_input_tokens_seen": 341303888, + "router_z_loss_mlp": 0.25036621, + "step": 4115, + "time_per_iteration": 2.7451772689819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068946, + "balance_loss_mlp": 1.04523504, + "epoch": 0.7918430165448249, + "flos": 718301758464.0, + "grad_norm": 0.06136237031197372, + "language_loss": 0.80247879, + "learning_rate": 0.00010938968581494991, + "loss": 0.81316829, + "num_input_tokens_seen": 341385616, + "router_z_loss_mlp": 0.23718262, + "step": 4116, + "time_per_iteration": 2.953747034072876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076632, + "balance_loss_mlp": 1.05221832, + "epoch": 0.7920353982300885, + "flos": 553648753152.0, + "grad_norm": 0.35095947289915363, + "language_loss": 0.79212964, + "learning_rate": 0.000109195279800835, + "loss": 0.80289596, + "num_input_tokens_seen": 341460976, + "router_z_loss_mlp": 0.24414062, + "step": 4117, + "time_per_iteration": 2.715400218963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107219, + "balance_loss_mlp": 1.04746628, + "epoch": 0.7922277799153521, + "flos": 810120019968.0, + "grad_norm": 0.07732725032472638, + "language_loss": 0.76730645, + "learning_rate": 0.00010900102551056834, + "loss": 0.77802831, + "num_input_tokens_seen": 341537328, + "router_z_loss_mlp": 0.24719238, + "step": 4118, + "time_per_iteration": 3.0449063777923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074277, + "balance_loss_mlp": 1.05074465, + "epoch": 0.7924201616006156, + "flos": 421351123968.0, + "grad_norm": 0.056769240767155345, + "language_loss": 0.84711397, + "learning_rate": 0.00010880692301956601, + "loss": 0.85785675, + "num_input_tokens_seen": 341600272, + "router_z_loss_mlp": 0.23535156, + "step": 4119, + "time_per_iteration": 2.4458513259887695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072784, + "balance_loss_mlp": 1.0488348, + "epoch": 0.7926125432858792, + "flos": 617852924928.0, + "grad_norm": 0.050731815554774906, + "language_loss": 0.86393607, + "learning_rate": 0.00010861297240318518, + "loss": 0.87466389, + "num_input_tokens_seen": 341682096, + "router_z_loss_mlp": 0.23937988, + "step": 4120, + "time_per_iteration": 2.8899548053741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071017, + "balance_loss_mlp": 1.04761648, + "epoch": 0.7928049249711427, + "flos": 602487051264.0, + "grad_norm": 0.05101998246826083, + "language_loss": 0.8699168, + "learning_rate": 0.00010841917373672444, + "loss": 0.88062692, + "num_input_tokens_seen": 341754912, + "router_z_loss_mlp": 0.23388672, + "step": 4121, + "time_per_iteration": 2.735093593597412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072991, + "balance_loss_mlp": 1.04925656, + "epoch": 0.7929973066564063, + "flos": 656024790528.0, + "grad_norm": 0.06441744003390583, + "language_loss": 0.78287446, + "learning_rate": 0.00010822552709542293, + "loss": 0.79360437, + "num_input_tokens_seen": 341831152, + "router_z_loss_mlp": 0.23730469, + "step": 4122, + "time_per_iteration": 2.807694911956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107165, + "balance_loss_mlp": 1.04821384, + "epoch": 0.7931896883416699, + "flos": 536397520896.0, + "grad_norm": 0.05111544046549841, + "language_loss": 0.86263895, + "learning_rate": 0.0001080320325544612, + "loss": 0.87335551, + "num_input_tokens_seen": 341903552, + "router_z_loss_mlp": 0.23425293, + "step": 4123, + "time_per_iteration": 2.7574799060821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073909, + "balance_loss_mlp": 1.04917264, + "epoch": 0.7933820700269334, + "flos": 498082493952.0, + "grad_norm": 0.061782912255848775, + "language_loss": 0.82952595, + "learning_rate": 0.00010783869018895997, + "loss": 0.84026504, + "num_input_tokens_seen": 341972256, + "router_z_loss_mlp": 0.24731445, + "step": 4124, + "time_per_iteration": 2.5689857006073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070849, + "balance_loss_mlp": 1.04611349, + "epoch": 0.793574451712197, + "flos": 537472350720.0, + "grad_norm": 0.05779110209300624, + "language_loss": 0.84302074, + "learning_rate": 0.00010764550007398189, + "loss": 0.85372925, + "num_input_tokens_seen": 342040496, + "router_z_loss_mlp": 0.24755859, + "step": 4125, + "time_per_iteration": 2.624270439147949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072775, + "balance_loss_mlp": 1.04818177, + "epoch": 0.7937668333974606, + "flos": 488285687808.0, + "grad_norm": 0.06779162634736255, + "language_loss": 0.81683314, + "learning_rate": 0.00010745246228452982, + "loss": 0.8275609, + "num_input_tokens_seen": 342108512, + "router_z_loss_mlp": 0.24597168, + "step": 4126, + "time_per_iteration": 2.6135480403900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073416, + "balance_loss_mlp": 1.04958653, + "epoch": 0.7939592150827242, + "flos": 527425924608.0, + "grad_norm": 0.08048379704700387, + "language_loss": 0.81824982, + "learning_rate": 0.00010725957689554771, + "loss": 0.82898396, + "num_input_tokens_seen": 342183568, + "router_z_loss_mlp": 0.23791504, + "step": 4127, + "time_per_iteration": 2.7506372928619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072246, + "balance_loss_mlp": 1.0476892, + "epoch": 0.7941515967679876, + "flos": 541702287360.0, + "grad_norm": 0.05152467214863494, + "language_loss": 0.84890383, + "learning_rate": 0.00010706684398192013, + "loss": 0.85962629, + "num_input_tokens_seen": 342259920, + "router_z_loss_mlp": 0.2454834, + "step": 4128, + "time_per_iteration": 2.707517385482788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070232, + "balance_loss_mlp": 1.04580581, + "epoch": 0.7943439784532512, + "flos": 518387516928.0, + "grad_norm": 0.06480144593438951, + "language_loss": 0.82284296, + "learning_rate": 0.00010687426361847313, + "loss": 0.83354527, + "num_input_tokens_seen": 342330192, + "router_z_loss_mlp": 0.24414062, + "step": 4129, + "time_per_iteration": 2.7257397174835205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076314, + "balance_loss_mlp": 1.05154264, + "epoch": 0.7945363601385148, + "flos": 509025710592.0, + "grad_norm": 0.058525612459034176, + "language_loss": 0.85989046, + "learning_rate": 0.00010668183587997254, + "loss": 0.87065363, + "num_input_tokens_seen": 342398944, + "router_z_loss_mlp": 0.24780273, + "step": 4130, + "time_per_iteration": 2.6166372299194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107138, + "balance_loss_mlp": 1.04734755, + "epoch": 0.7947287418237784, + "flos": 651214121472.0, + "grad_norm": 0.05131686661427292, + "language_loss": 0.77755392, + "learning_rate": 0.0001064895608411256, + "loss": 0.78826773, + "num_input_tokens_seen": 342474000, + "router_z_loss_mlp": 0.24047852, + "step": 4131, + "time_per_iteration": 2.806696653366089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074078, + "balance_loss_mlp": 1.04922318, + "epoch": 0.794921123509042, + "flos": 696054477312.0, + "grad_norm": 0.06416883380663327, + "language_loss": 0.80383301, + "learning_rate": 0.00010629743857657998, + "loss": 0.81457376, + "num_input_tokens_seen": 342549184, + "router_z_loss_mlp": 0.24853516, + "step": 4132, + "time_per_iteration": 2.9960997104644775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033011, + "balance_loss_mlp": 1.02619219, + "epoch": 0.7951135051943055, + "flos": 1402942768128.0, + "grad_norm": 0.021894290253507743, + "language_loss": 0.70598668, + "learning_rate": 0.0001061054691609244, + "loss": 0.71631676, + "num_input_tokens_seen": 342767376, + "router_z_loss_mlp": 0.06835938, + "step": 4133, + "time_per_iteration": 4.623692512512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076148, + "balance_loss_mlp": 1.05289078, + "epoch": 0.795305886879569, + "flos": 810085515264.0, + "grad_norm": 0.06117584542977267, + "language_loss": 0.82268703, + "learning_rate": 0.00010591365266868802, + "loss": 0.83344853, + "num_input_tokens_seen": 342845024, + "router_z_loss_mlp": 0.23254395, + "step": 4134, + "time_per_iteration": 2.9868671894073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01030988, + "balance_loss_mlp": 1.02421665, + "epoch": 0.7954982685648326, + "flos": 1426005347328.0, + "grad_norm": 0.02172018156045361, + "language_loss": 0.75511783, + "learning_rate": 0.00010572198917434018, + "loss": 0.76542771, + "num_input_tokens_seen": 343072496, + "router_z_loss_mlp": 0.06787109, + "step": 4135, + "time_per_iteration": 4.976134300231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071076, + "balance_loss_mlp": 1.04654241, + "epoch": 0.7956906502500962, + "flos": 389885197824.0, + "grad_norm": 0.06221307729096171, + "language_loss": 0.79485571, + "learning_rate": 0.00010553047875229166, + "loss": 0.80556649, + "num_input_tokens_seen": 343136928, + "router_z_loss_mlp": 0.24536133, + "step": 4136, + "time_per_iteration": 2.5057613849639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077488, + "balance_loss_mlp": 1.05284762, + "epoch": 0.7958830319353598, + "flos": 515573434368.0, + "grad_norm": 0.058263200481796444, + "language_loss": 0.83615577, + "learning_rate": 0.00010533912147689328, + "loss": 0.84693062, + "num_input_tokens_seen": 343207440, + "router_z_loss_mlp": 0.24645996, + "step": 4137, + "time_per_iteration": 2.601483106613159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073106, + "balance_loss_mlp": 1.04894197, + "epoch": 0.7960754136206233, + "flos": 493941390336.0, + "grad_norm": 0.05046290164389679, + "language_loss": 0.82613522, + "learning_rate": 0.00010514791742243656, + "loss": 0.83686626, + "num_input_tokens_seen": 343273744, + "router_z_loss_mlp": 0.24157715, + "step": 4138, + "time_per_iteration": 2.5787172317504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069273, + "balance_loss_mlp": 1.04491878, + "epoch": 0.7962677953058869, + "flos": 655728182784.0, + "grad_norm": 0.06370293603956936, + "language_loss": 0.82636452, + "learning_rate": 0.00010495686666315341, + "loss": 0.83705723, + "num_input_tokens_seen": 343357648, + "router_z_loss_mlp": 0.2434082, + "step": 4139, + "time_per_iteration": 2.9137964248657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074776, + "balance_loss_mlp": 1.05144691, + "epoch": 0.7964601769911505, + "flos": 542384335872.0, + "grad_norm": 0.06334360078787715, + "language_loss": 0.77300745, + "learning_rate": 0.00010476596927321635, + "loss": 0.78375518, + "num_input_tokens_seen": 343425344, + "router_z_loss_mlp": 0.23327637, + "step": 4140, + "time_per_iteration": 2.635559558868408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071011, + "balance_loss_mlp": 1.04807556, + "epoch": 0.796652558676414, + "flos": 537650016768.0, + "grad_norm": 0.04689989949815107, + "language_loss": 0.80298364, + "learning_rate": 0.00010457522532673835, + "loss": 0.81369376, + "num_input_tokens_seen": 343504960, + "router_z_loss_mlp": 0.22924805, + "step": 4141, + "time_per_iteration": 2.804485321044922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074886, + "balance_loss_mlp": 1.05092525, + "epoch": 0.7968449403616775, + "flos": 475091495424.0, + "grad_norm": 0.0598189384756839, + "language_loss": 0.83671814, + "learning_rate": 0.00010438463489777272, + "loss": 0.84746695, + "num_input_tokens_seen": 343570832, + "router_z_loss_mlp": 0.23937988, + "step": 4142, + "time_per_iteration": 2.593764305114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071767, + "balance_loss_mlp": 1.047472, + "epoch": 0.7970373220469411, + "flos": 567613827072.0, + "grad_norm": 0.06370121218595463, + "language_loss": 0.77954531, + "learning_rate": 0.00010419419806031316, + "loss": 0.79026294, + "num_input_tokens_seen": 343639808, + "router_z_loss_mlp": 0.24291992, + "step": 4143, + "time_per_iteration": 2.6709976196289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073716, + "balance_loss_mlp": 1.05002928, + "epoch": 0.7972297037322047, + "flos": 556208446464.0, + "grad_norm": 0.05491242983846696, + "language_loss": 0.84102464, + "learning_rate": 0.00010400391488829403, + "loss": 0.85176182, + "num_input_tokens_seen": 343715232, + "router_z_loss_mlp": 0.23681641, + "step": 4144, + "time_per_iteration": 2.767470121383667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074277, + "balance_loss_mlp": 1.04962492, + "epoch": 0.7974220854174683, + "flos": 576180158976.0, + "grad_norm": 0.0554387027549828, + "language_loss": 0.86743295, + "learning_rate": 0.00010381378545558984, + "loss": 0.87817574, + "num_input_tokens_seen": 343787168, + "router_z_loss_mlp": 0.24658203, + "step": 4145, + "time_per_iteration": 2.7449891567230225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067536, + "balance_loss_mlp": 1.04415917, + "epoch": 0.7976144671027319, + "flos": 483069754368.0, + "grad_norm": 0.05491436381940487, + "language_loss": 0.8494069, + "learning_rate": 0.00010362380983601505, + "loss": 0.86008221, + "num_input_tokens_seen": 343853600, + "router_z_loss_mlp": 0.23352051, + "step": 4146, + "time_per_iteration": 2.540022134780884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071068, + "balance_loss_mlp": 1.04748869, + "epoch": 0.7978068487879953, + "flos": 1077865615872.0, + "grad_norm": 0.05412435738245621, + "language_loss": 0.79004198, + "learning_rate": 0.00010343398810332477, + "loss": 0.80075264, + "num_input_tokens_seen": 343942816, + "router_z_loss_mlp": 0.2355957, + "step": 4147, + "time_per_iteration": 3.457382917404175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064363, + "balance_loss_mlp": 1.04003215, + "epoch": 0.7979992304732589, + "flos": 733739586048.0, + "grad_norm": 0.057902148991934105, + "language_loss": 0.84552854, + "learning_rate": 0.00010324432033121467, + "loss": 0.8561722, + "num_input_tokens_seen": 344021232, + "router_z_loss_mlp": 0.2434082, + "step": 4148, + "time_per_iteration": 2.923807382583618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068154, + "balance_loss_mlp": 1.04443097, + "epoch": 0.7981916121585225, + "flos": 415774342656.0, + "grad_norm": 0.05631253152540408, + "language_loss": 0.83762592, + "learning_rate": 0.00010305480659332005, + "loss": 0.84830743, + "num_input_tokens_seen": 344089616, + "router_z_loss_mlp": 0.23718262, + "step": 4149, + "time_per_iteration": 2.58620285987854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069961, + "balance_loss_mlp": 1.04592896, + "epoch": 0.7983839938437861, + "flos": 465257613312.0, + "grad_norm": 0.0573271525329392, + "language_loss": 0.83780408, + "learning_rate": 0.00010286544696321682, + "loss": 0.84850371, + "num_input_tokens_seen": 344154992, + "router_z_loss_mlp": 0.24023438, + "step": 4150, + "time_per_iteration": 2.5883052349090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066534, + "balance_loss_mlp": 1.04267991, + "epoch": 0.7985763755290496, + "flos": 510567473664.0, + "grad_norm": 0.06896512478110718, + "language_loss": 0.79666734, + "learning_rate": 0.00010267624151442073, + "loss": 0.80733263, + "num_input_tokens_seen": 344225232, + "router_z_loss_mlp": 0.23864746, + "step": 4151, + "time_per_iteration": 2.660498857498169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069408, + "balance_loss_mlp": 1.04581642, + "epoch": 0.7987687572143132, + "flos": 1010649498624.0, + "grad_norm": 0.08734927072671815, + "language_loss": 0.81357807, + "learning_rate": 0.000102487190320388, + "loss": 0.82427216, + "num_input_tokens_seen": 344309120, + "router_z_loss_mlp": 0.23596191, + "step": 4152, + "time_per_iteration": 3.378927230834961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066042, + "balance_loss_mlp": 1.04099584, + "epoch": 0.7989611388995768, + "flos": 1021078794240.0, + "grad_norm": 0.06453133942638287, + "language_loss": 0.79819167, + "learning_rate": 0.00010229829345451475, + "loss": 0.80885208, + "num_input_tokens_seen": 344394112, + "router_z_loss_mlp": 0.25061035, + "step": 4153, + "time_per_iteration": 3.319824457168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068485, + "balance_loss_mlp": 1.04403472, + "epoch": 0.7991535205848403, + "flos": 1101338601984.0, + "grad_norm": 0.06331967067065977, + "language_loss": 0.79648089, + "learning_rate": 0.00010210955099013724, + "loss": 0.8071658, + "num_input_tokens_seen": 344476512, + "router_z_loss_mlp": 0.24438477, + "step": 4154, + "time_per_iteration": 3.4123902320861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066775, + "balance_loss_mlp": 1.04323101, + "epoch": 0.7993459022701039, + "flos": 834818337792.0, + "grad_norm": 0.07636085006530072, + "language_loss": 0.76853955, + "learning_rate": 0.00010192096300053167, + "loss": 0.77920735, + "num_input_tokens_seen": 344561088, + "router_z_loss_mlp": 0.23547363, + "step": 4155, + "time_per_iteration": 3.0885937213897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062142, + "balance_loss_mlp": 1.03831244, + "epoch": 0.7995382839553674, + "flos": 522686836224.0, + "grad_norm": 0.054659422757200274, + "language_loss": 0.85324514, + "learning_rate": 0.00010173252955891477, + "loss": 0.86386657, + "num_input_tokens_seen": 344639424, + "router_z_loss_mlp": 0.23828125, + "step": 4156, + "time_per_iteration": 2.7677974700927734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106614, + "balance_loss_mlp": 1.04228568, + "epoch": 0.799730665640631, + "flos": 537820715520.0, + "grad_norm": 0.07109273842515242, + "language_loss": 0.7358321, + "learning_rate": 0.00010154425073844253, + "loss": 0.74649352, + "num_input_tokens_seen": 344710048, + "router_z_loss_mlp": 0.23840332, + "step": 4157, + "time_per_iteration": 2.6927075386047363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068264, + "balance_loss_mlp": 1.0444932, + "epoch": 0.7999230473258946, + "flos": 505060075008.0, + "grad_norm": 0.045745633219446115, + "language_loss": 0.82380056, + "learning_rate": 0.00010135612661221138, + "loss": 0.83448321, + "num_input_tokens_seen": 344776832, + "router_z_loss_mlp": 0.23742676, + "step": 4158, + "time_per_iteration": 2.5854756832122803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069104, + "balance_loss_mlp": 1.04447556, + "epoch": 0.8001154290111582, + "flos": 1027342393344.0, + "grad_norm": 0.11567647192687848, + "language_loss": 0.82128304, + "learning_rate": 0.00010116815725325751, + "loss": 0.83197409, + "num_input_tokens_seen": 344864928, + "router_z_loss_mlp": 0.24633789, + "step": 4159, + "time_per_iteration": 3.2739415168762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069034, + "balance_loss_mlp": 1.04451275, + "epoch": 0.8003078106964217, + "flos": 750906754560.0, + "grad_norm": 0.051597263691987395, + "language_loss": 0.8072108, + "learning_rate": 0.00010098034273455725, + "loss": 0.81790113, + "num_input_tokens_seen": 344944048, + "router_z_loss_mlp": 0.24523926, + "step": 4160, + "time_per_iteration": 2.9544758796691895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062344, + "balance_loss_mlp": 1.03870511, + "epoch": 0.8005001923816852, + "flos": 488465925120.0, + "grad_norm": 0.05783262033731221, + "language_loss": 0.8016758, + "learning_rate": 0.00010079268312902662, + "loss": 0.81229925, + "num_input_tokens_seen": 345015392, + "router_z_loss_mlp": 0.23632812, + "step": 4161, + "time_per_iteration": 2.6659867763519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068128, + "balance_loss_mlp": 1.0439285, + "epoch": 0.8006925740669488, + "flos": 513248306688.0, + "grad_norm": 0.061865386123552114, + "language_loss": 0.81856108, + "learning_rate": 0.0001006051785095215, + "loss": 0.82924247, + "num_input_tokens_seen": 345086640, + "router_z_loss_mlp": 0.24206543, + "step": 4162, + "time_per_iteration": 2.6431145668029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064608, + "balance_loss_mlp": 1.03975272, + "epoch": 0.8008849557522124, + "flos": 578529879552.0, + "grad_norm": 0.07843686612614051, + "language_loss": 0.79364276, + "learning_rate": 0.0001004178289488376, + "loss": 0.8042888, + "num_input_tokens_seen": 345159616, + "router_z_loss_mlp": 0.24841309, + "step": 4163, + "time_per_iteration": 2.7118594646453857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065979, + "balance_loss_mlp": 1.04216099, + "epoch": 0.801077337437476, + "flos": 478708766208.0, + "grad_norm": 0.053856509771467526, + "language_loss": 0.83958352, + "learning_rate": 0.0001002306345197106, + "loss": 0.85024333, + "num_input_tokens_seen": 345225536, + "router_z_loss_mlp": 0.23815918, + "step": 4164, + "time_per_iteration": 2.578308343887329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068826, + "balance_loss_mlp": 1.04498386, + "epoch": 0.8012697191227395, + "flos": 676700573184.0, + "grad_norm": 0.06071324675869247, + "language_loss": 0.80072832, + "learning_rate": 0.00010004359529481571, + "loss": 0.81141657, + "num_input_tokens_seen": 345302960, + "router_z_loss_mlp": 0.23828125, + "step": 4165, + "time_per_iteration": 2.905269145965576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069548, + "balance_loss_mlp": 1.04625463, + "epoch": 0.8014621008080031, + "flos": 1295132405760.0, + "grad_norm": 0.06586004492951407, + "language_loss": 0.82562852, + "learning_rate": 9.985671134676804e-05, + "loss": 0.83632404, + "num_input_tokens_seen": 345397792, + "router_z_loss_mlp": 0.23303223, + "step": 4166, + "time_per_iteration": 3.7128326892852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076456, + "balance_loss_mlp": 1.0523994, + "epoch": 0.8016544824932667, + "flos": 511827683328.0, + "grad_norm": 0.07654195541387675, + "language_loss": 0.83752513, + "learning_rate": 9.966998274812234e-05, + "loss": 0.84828973, + "num_input_tokens_seen": 345465440, + "router_z_loss_mlp": 0.24035645, + "step": 4167, + "time_per_iteration": 2.6038944721221924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073049, + "balance_loss_mlp": 1.04911208, + "epoch": 0.8018468641785302, + "flos": 535690879488.0, + "grad_norm": 0.07186822354063922, + "language_loss": 0.81419587, + "learning_rate": 9.948340957137308e-05, + "loss": 0.82492638, + "num_input_tokens_seen": 345533072, + "router_z_loss_mlp": 0.23925781, + "step": 4168, + "time_per_iteration": 2.6602823734283447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065096, + "balance_loss_mlp": 1.04145718, + "epoch": 0.8020392458637937, + "flos": 1023431086080.0, + "grad_norm": 0.07423668989948455, + "language_loss": 0.79874551, + "learning_rate": 9.929699188895447e-05, + "loss": 0.80939651, + "num_input_tokens_seen": 345622208, + "router_z_loss_mlp": 0.23620605, + "step": 4169, + "time_per_iteration": 3.2493438720703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018878, + "balance_loss_mlp": 1.01201177, + "epoch": 0.8022316275490573, + "flos": 1561806821376.0, + "grad_norm": 0.012589088151617522, + "language_loss": 0.78054404, + "learning_rate": 9.911072977324009e-05, + "loss": 0.7907328, + "num_input_tokens_seen": 345852544, + "router_z_loss_mlp": 0.06884766, + "step": 4170, + "time_per_iteration": 4.9680397510528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074777, + "balance_loss_mlp": 1.05058968, + "epoch": 0.8024240092343209, + "flos": 420698810880.0, + "grad_norm": 0.059456208257663117, + "language_loss": 0.83677548, + "learning_rate": 9.89246232965435e-05, + "loss": 0.84752321, + "num_input_tokens_seen": 345917328, + "router_z_loss_mlp": 0.24194336, + "step": 4171, + "time_per_iteration": 2.5254433155059814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072144, + "balance_loss_mlp": 1.04793262, + "epoch": 0.8026163909195845, + "flos": 763836645888.0, + "grad_norm": 0.06548527724702696, + "language_loss": 0.79033399, + "learning_rate": 9.873867253111762e-05, + "loss": 0.80105543, + "num_input_tokens_seen": 345995936, + "router_z_loss_mlp": 0.24206543, + "step": 4172, + "time_per_iteration": 3.027867317199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016172, + "balance_loss_mlp": 1.00930583, + "epoch": 0.8028087726048481, + "flos": 1518861362688.0, + "grad_norm": 0.010389425904671548, + "language_loss": 0.80264562, + "learning_rate": 9.855287754915503e-05, + "loss": 0.81280732, + "num_input_tokens_seen": 346232720, + "router_z_loss_mlp": 0.06884766, + "step": 4173, + "time_per_iteration": 4.953596353530884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068112, + "balance_loss_mlp": 1.04440093, + "epoch": 0.8030011542901115, + "flos": 517861486080.0, + "grad_norm": 0.05784474933725161, + "language_loss": 0.88619745, + "learning_rate": 9.836723842278733e-05, + "loss": 0.8968786, + "num_input_tokens_seen": 346298208, + "router_z_loss_mlp": 0.23706055, + "step": 4174, + "time_per_iteration": 2.5819122791290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073726, + "balance_loss_mlp": 1.0496335, + "epoch": 0.8031935359753751, + "flos": 545616165888.0, + "grad_norm": 0.06458070365487185, + "language_loss": 0.77987558, + "learning_rate": 9.818175522408646e-05, + "loss": 0.79061282, + "num_input_tokens_seen": 346370080, + "router_z_loss_mlp": 0.2409668, + "step": 4175, + "time_per_iteration": 2.6588666439056396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066808, + "balance_loss_mlp": 1.04332376, + "epoch": 0.8033859176606387, + "flos": 603559309824.0, + "grad_norm": 0.05329489111459666, + "language_loss": 0.84854198, + "learning_rate": 9.79964280250632e-05, + "loss": 0.85921007, + "num_input_tokens_seen": 346442432, + "router_z_loss_mlp": 0.23474121, + "step": 4176, + "time_per_iteration": 2.793970823287964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073083, + "balance_loss_mlp": 1.04912198, + "epoch": 0.8035782993459023, + "flos": 565859520000.0, + "grad_norm": 0.06573423775320317, + "language_loss": 0.81603885, + "learning_rate": 9.781125689766795e-05, + "loss": 0.82676965, + "num_input_tokens_seen": 346513088, + "router_z_loss_mlp": 0.23937988, + "step": 4177, + "time_per_iteration": 2.6889266967773438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069602, + "balance_loss_mlp": 1.04500914, + "epoch": 0.8037706810311658, + "flos": 538435952640.0, + "grad_norm": 0.06472425874294328, + "language_loss": 0.84585536, + "learning_rate": 9.762624191379054e-05, + "loss": 0.85655141, + "num_input_tokens_seen": 346581376, + "router_z_loss_mlp": 0.24584961, + "step": 4178, + "time_per_iteration": 2.6785759925842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069008, + "balance_loss_mlp": 1.04635787, + "epoch": 0.8039630627164294, + "flos": 515187993600.0, + "grad_norm": 0.06080511762192285, + "language_loss": 0.79866344, + "learning_rate": 9.744138314526014e-05, + "loss": 0.80935353, + "num_input_tokens_seen": 346653328, + "router_z_loss_mlp": 0.22644043, + "step": 4179, + "time_per_iteration": 2.638092517852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006529, + "balance_loss_mlp": 0.99971008, + "epoch": 0.804155444401693, + "flos": 1478834247168.0, + "grad_norm": 0.007721922864740662, + "language_loss": 0.74733561, + "learning_rate": 9.725668066384535e-05, + "loss": 0.75740093, + "num_input_tokens_seen": 346873264, + "router_z_loss_mlp": 0.06835938, + "step": 4180, + "time_per_iteration": 4.915950775146484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070552, + "balance_loss_mlp": 1.04663897, + "epoch": 0.8043478260869565, + "flos": 521164896768.0, + "grad_norm": 0.07383497316313108, + "language_loss": 0.77627772, + "learning_rate": 9.707213454125396e-05, + "loss": 0.78698325, + "num_input_tokens_seen": 346946272, + "router_z_loss_mlp": 0.23901367, + "step": 4181, + "time_per_iteration": 2.6206631660461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068366, + "balance_loss_mlp": 1.04459548, + "epoch": 0.8045402077722201, + "flos": 545448038400.0, + "grad_norm": 0.05588102000641447, + "language_loss": 0.80511087, + "learning_rate": 9.688774484913298e-05, + "loss": 0.81579459, + "num_input_tokens_seen": 347024048, + "router_z_loss_mlp": 0.23754883, + "step": 4182, + "time_per_iteration": 2.762315511703491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071285, + "balance_loss_mlp": 1.0475862, + "epoch": 0.8047325894574836, + "flos": 678388068864.0, + "grad_norm": 0.06106500726873185, + "language_loss": 0.73959017, + "learning_rate": 9.670351165906921e-05, + "loss": 0.75030297, + "num_input_tokens_seen": 347108736, + "router_z_loss_mlp": 0.23681641, + "step": 4183, + "time_per_iteration": 2.914637327194214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067779, + "balance_loss_mlp": 1.04349661, + "epoch": 0.8049249711427472, + "flos": 587227262976.0, + "grad_norm": 0.06176389383154452, + "language_loss": 0.79018617, + "learning_rate": 9.65194350425882e-05, + "loss": 0.80086398, + "num_input_tokens_seen": 347184192, + "router_z_loss_mlp": 0.24291992, + "step": 4184, + "time_per_iteration": 2.7324373722076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067661, + "balance_loss_mlp": 1.04418921, + "epoch": 0.8051173528280108, + "flos": 814194312192.0, + "grad_norm": 0.0704849570966166, + "language_loss": 0.78184354, + "learning_rate": 9.633551507115452e-05, + "loss": 0.79252017, + "num_input_tokens_seen": 347282336, + "router_z_loss_mlp": 0.23486328, + "step": 4185, + "time_per_iteration": 3.108370542526245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072015, + "balance_loss_mlp": 1.04811406, + "epoch": 0.8053097345132744, + "flos": 725687175168.0, + "grad_norm": 0.06261111839901821, + "language_loss": 0.77779377, + "learning_rate": 9.615175181617259e-05, + "loss": 0.78851396, + "num_input_tokens_seen": 347364800, + "router_z_loss_mlp": 0.23876953, + "step": 4186, + "time_per_iteration": 2.9714622497558594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071694, + "balance_loss_mlp": 1.04800677, + "epoch": 0.805502116198538, + "flos": 748050453504.0, + "grad_norm": 0.0639304252406746, + "language_loss": 0.81475556, + "learning_rate": 9.596814534898552e-05, + "loss": 0.82547259, + "num_input_tokens_seen": 347443328, + "router_z_loss_mlp": 0.23669434, + "step": 4187, + "time_per_iteration": 2.9930331707000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073306, + "balance_loss_mlp": 1.04930937, + "epoch": 0.8056944978838014, + "flos": 640258421760.0, + "grad_norm": 0.051419524714537625, + "language_loss": 0.87329555, + "learning_rate": 9.578469574087561e-05, + "loss": 0.88402855, + "num_input_tokens_seen": 347522064, + "router_z_loss_mlp": 0.23999023, + "step": 4188, + "time_per_iteration": 2.9604146480560303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069277, + "balance_loss_mlp": 1.04494643, + "epoch": 0.805886879569065, + "flos": 644631892992.0, + "grad_norm": 0.08161833889252136, + "language_loss": 0.78437293, + "learning_rate": 9.560140306306436e-05, + "loss": 0.7950657, + "num_input_tokens_seen": 347597200, + "router_z_loss_mlp": 0.2434082, + "step": 4189, + "time_per_iteration": 2.7467589378356934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067948, + "balance_loss_mlp": 1.04483366, + "epoch": 0.8060792612543286, + "flos": 661230812160.0, + "grad_norm": 0.06429171281300161, + "language_loss": 0.81920123, + "learning_rate": 9.541826738671233e-05, + "loss": 0.82988071, + "num_input_tokens_seen": 347676928, + "router_z_loss_mlp": 0.2310791, + "step": 4190, + "time_per_iteration": 2.8135294914245605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072966, + "balance_loss_mlp": 1.04954159, + "epoch": 0.8062716429395922, + "flos": 455075366400.0, + "grad_norm": 0.06490382146104284, + "language_loss": 0.8240993, + "learning_rate": 9.523528878291904e-05, + "loss": 0.83482891, + "num_input_tokens_seen": 347741552, + "router_z_loss_mlp": 0.23425293, + "step": 4191, + "time_per_iteration": 2.5631208419799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106835, + "balance_loss_mlp": 1.04356611, + "epoch": 0.8064640246248557, + "flos": 526407994368.0, + "grad_norm": 0.07814550212543987, + "language_loss": 0.85738069, + "learning_rate": 9.50524673227231e-05, + "loss": 0.86806417, + "num_input_tokens_seen": 347807008, + "router_z_loss_mlp": 0.2479248, + "step": 4192, + "time_per_iteration": 2.6771442890167236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075525, + "balance_loss_mlp": 1.05173111, + "epoch": 0.8066564063101193, + "flos": 865115458560.0, + "grad_norm": 0.05122743172221322, + "language_loss": 0.82006085, + "learning_rate": 9.486980307710208e-05, + "loss": 0.83081615, + "num_input_tokens_seen": 347895728, + "router_z_loss_mlp": 0.23803711, + "step": 4193, + "time_per_iteration": 3.1859543323516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106691, + "balance_loss_mlp": 1.04287791, + "epoch": 0.8068487879953828, + "flos": 530536614912.0, + "grad_norm": 0.05914433880976634, + "language_loss": 0.8218236, + "learning_rate": 9.468729611697246e-05, + "loss": 0.83249271, + "num_input_tokens_seen": 347970368, + "router_z_loss_mlp": 0.24035645, + "step": 4194, + "time_per_iteration": 2.6743924617767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069484, + "balance_loss_mlp": 1.04611874, + "epoch": 0.8070411696806464, + "flos": 566183291904.0, + "grad_norm": 0.053953829426493176, + "language_loss": 0.82204551, + "learning_rate": 9.450494651319003e-05, + "loss": 0.83274031, + "num_input_tokens_seen": 348039040, + "router_z_loss_mlp": 0.23364258, + "step": 4195, + "time_per_iteration": 2.651991605758667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068847, + "balance_loss_mlp": 1.04413462, + "epoch": 0.80723355136591, + "flos": 986591010816.0, + "grad_norm": 0.059924308887848395, + "language_loss": 0.7934891, + "learning_rate": 9.432275433654885e-05, + "loss": 0.80417752, + "num_input_tokens_seen": 348126064, + "router_z_loss_mlp": 0.24719238, + "step": 4196, + "time_per_iteration": 3.319042921066284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106911, + "balance_loss_mlp": 1.04500604, + "epoch": 0.8074259330511735, + "flos": 566961513984.0, + "grad_norm": 0.05558320714369874, + "language_loss": 0.83282971, + "learning_rate": 9.414071965778221e-05, + "loss": 0.84352082, + "num_input_tokens_seen": 348205888, + "router_z_loss_mlp": 0.24108887, + "step": 4197, + "time_per_iteration": 2.787029504776001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069448, + "balance_loss_mlp": 1.04553497, + "epoch": 0.8076183147364371, + "flos": 494662712832.0, + "grad_norm": 0.06138750227857287, + "language_loss": 0.79915768, + "learning_rate": 9.395884254756242e-05, + "loss": 0.80985212, + "num_input_tokens_seen": 348278608, + "router_z_loss_mlp": 0.23901367, + "step": 4198, + "time_per_iteration": 2.716717481613159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069564, + "balance_loss_mlp": 1.04573464, + "epoch": 0.8078106964217007, + "flos": 420011993088.0, + "grad_norm": 0.0692709388607304, + "language_loss": 0.80226934, + "learning_rate": 9.377712307650044e-05, + "loss": 0.81296504, + "num_input_tokens_seen": 348341312, + "router_z_loss_mlp": 0.23840332, + "step": 4199, + "time_per_iteration": 2.4748783111572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062956, + "balance_loss_mlp": 1.03986561, + "epoch": 0.8080030781069643, + "flos": 527537152512.0, + "grad_norm": 0.06697499154438591, + "language_loss": 0.83209741, + "learning_rate": 9.359556131514602e-05, + "loss": 0.84272695, + "num_input_tokens_seen": 348409184, + "router_z_loss_mlp": 0.23083496, + "step": 4200, + "time_per_iteration": 2.65059757232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069346, + "balance_loss_mlp": 1.04487228, + "epoch": 0.8081954597922277, + "flos": 544148554752.0, + "grad_norm": 0.07319165127998006, + "language_loss": 0.81825453, + "learning_rate": 9.341415733398733e-05, + "loss": 0.82894802, + "num_input_tokens_seen": 348480832, + "router_z_loss_mlp": 0.24462891, + "step": 4201, + "time_per_iteration": 2.6146020889282227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074978, + "balance_loss_mlp": 1.04963422, + "epoch": 0.8083878414774913, + "flos": 640900823040.0, + "grad_norm": 0.06857691771567244, + "language_loss": 0.75549376, + "learning_rate": 9.323291120345207e-05, + "loss": 0.76624352, + "num_input_tokens_seen": 348559232, + "router_z_loss_mlp": 0.25378418, + "step": 4202, + "time_per_iteration": 2.8163068294525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068191, + "balance_loss_mlp": 1.04393232, + "epoch": 0.8085802231627549, + "flos": 705614146560.0, + "grad_norm": 0.05917117219948211, + "language_loss": 0.72779202, + "learning_rate": 9.305182299390614e-05, + "loss": 0.73847389, + "num_input_tokens_seen": 348638960, + "router_z_loss_mlp": 0.24243164, + "step": 4203, + "time_per_iteration": 2.926943063735962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062675, + "balance_loss_mlp": 1.03952456, + "epoch": 0.8087726048480185, + "flos": 419762373120.0, + "grad_norm": 0.06288185583259268, + "language_loss": 0.88665909, + "learning_rate": 9.287089277565409e-05, + "loss": 0.89728582, + "num_input_tokens_seen": 348704816, + "router_z_loss_mlp": 0.23156738, + "step": 4204, + "time_per_iteration": 2.533940076828003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107244, + "balance_loss_mlp": 1.04898, + "epoch": 0.8089649865332821, + "flos": 508766178816.0, + "grad_norm": 0.04485916890924309, + "language_loss": 0.8704623, + "learning_rate": 9.269012061893922e-05, + "loss": 0.88118672, + "num_input_tokens_seen": 348783504, + "router_z_loss_mlp": 0.23461914, + "step": 4205, + "time_per_iteration": 2.8577754497528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074426, + "balance_loss_mlp": 1.05071497, + "epoch": 0.8091573682185456, + "flos": 457219883520.0, + "grad_norm": 0.06131410050585944, + "language_loss": 0.85401237, + "learning_rate": 9.250950659394386e-05, + "loss": 0.86475658, + "num_input_tokens_seen": 348858272, + "router_z_loss_mlp": 0.23706055, + "step": 4206, + "time_per_iteration": 2.739069700241089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067889, + "balance_loss_mlp": 1.04442883, + "epoch": 0.8093497499038091, + "flos": 525256441344.0, + "grad_norm": 0.058670433467628735, + "language_loss": 0.77132225, + "learning_rate": 9.232905077078824e-05, + "loss": 0.78200108, + "num_input_tokens_seen": 348934432, + "router_z_loss_mlp": 0.23449707, + "step": 4207, + "time_per_iteration": 2.7943315505981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107533, + "balance_loss_mlp": 1.05148828, + "epoch": 0.8095421315890727, + "flos": 489617478144.0, + "grad_norm": 0.060722658828799125, + "language_loss": 0.76975334, + "learning_rate": 9.214875321953164e-05, + "loss": 0.78050661, + "num_input_tokens_seen": 349003856, + "router_z_loss_mlp": 0.23840332, + "step": 4208, + "time_per_iteration": 2.628981351852417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072763, + "balance_loss_mlp": 1.04902852, + "epoch": 0.8097345132743363, + "flos": 625109861376.0, + "grad_norm": 0.06220580039777883, + "language_loss": 0.80972207, + "learning_rate": 9.196861401017164e-05, + "loss": 0.82044971, + "num_input_tokens_seen": 349080544, + "router_z_loss_mlp": 0.23706055, + "step": 4209, + "time_per_iteration": 2.7958450317382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074238, + "balance_loss_mlp": 1.05026519, + "epoch": 0.8099268949595998, + "flos": 615688584192.0, + "grad_norm": 0.06273370335614084, + "language_loss": 0.79599589, + "learning_rate": 9.178863321264475e-05, + "loss": 0.80673826, + "num_input_tokens_seen": 349159072, + "router_z_loss_mlp": 0.23962402, + "step": 4210, + "time_per_iteration": 2.7682175636291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077533, + "balance_loss_mlp": 1.0541079, + "epoch": 0.8101192766448634, + "flos": 479642632704.0, + "grad_norm": 0.06627219906647694, + "language_loss": 0.80003035, + "learning_rate": 9.160881089682566e-05, + "loss": 0.81080568, + "num_input_tokens_seen": 349230176, + "router_z_loss_mlp": 0.23400879, + "step": 4211, + "time_per_iteration": 2.631140947341919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074787, + "balance_loss_mlp": 1.05109978, + "epoch": 0.810311658330127, + "flos": 517327741440.0, + "grad_norm": 0.05849303544720674, + "language_loss": 0.86804986, + "learning_rate": 9.142914713252725e-05, + "loss": 0.87879777, + "num_input_tokens_seen": 349299760, + "router_z_loss_mlp": 0.23681641, + "step": 4212, + "time_per_iteration": 2.5948104858398438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071765, + "balance_loss_mlp": 1.04823303, + "epoch": 0.8105040400153906, + "flos": 575782235136.0, + "grad_norm": 0.06296476398611156, + "language_loss": 0.841757, + "learning_rate": 9.124964198950159e-05, + "loss": 0.85247463, + "num_input_tokens_seen": 349379712, + "router_z_loss_mlp": 0.23547363, + "step": 4213, + "time_per_iteration": 2.7866246700286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072973, + "balance_loss_mlp": 1.04913127, + "epoch": 0.8106964217006541, + "flos": 638963707392.0, + "grad_norm": 0.057390560702911333, + "language_loss": 0.85144347, + "learning_rate": 9.107029553743862e-05, + "loss": 0.8621732, + "num_input_tokens_seen": 349460320, + "router_z_loss_mlp": 0.23840332, + "step": 4214, + "time_per_iteration": 2.842522144317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079487, + "balance_loss_mlp": 1.05565715, + "epoch": 0.8108888033859176, + "flos": 579505964544.0, + "grad_norm": 0.06211235803289476, + "language_loss": 0.81593323, + "learning_rate": 9.089110784596672e-05, + "loss": 0.82672811, + "num_input_tokens_seen": 349527648, + "router_z_loss_mlp": 0.23815918, + "step": 4215, + "time_per_iteration": 2.700958251953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077635, + "balance_loss_mlp": 1.05348349, + "epoch": 0.8110811850711812, + "flos": 559907209728.0, + "grad_norm": 0.05316807102824332, + "language_loss": 0.8362149, + "learning_rate": 9.071207898465284e-05, + "loss": 0.8469913, + "num_input_tokens_seen": 349606912, + "router_z_loss_mlp": 0.24157715, + "step": 4216, + "time_per_iteration": 2.8000106811523438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003326, + "balance_loss_mlp": 0.996364, + "epoch": 0.8112735667564448, + "flos": 1517939979264.0, + "grad_norm": 0.009729060219912019, + "language_loss": 0.77260417, + "learning_rate": 9.053320902300205e-05, + "loss": 0.78263742, + "num_input_tokens_seen": 349827040, + "router_z_loss_mlp": 0.06982422, + "step": 4217, + "time_per_iteration": 5.381849765777588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078499, + "balance_loss_mlp": 1.05508685, + "epoch": 0.8114659484417084, + "flos": 616340897280.0, + "grad_norm": 0.07154443867922712, + "language_loss": 0.85639107, + "learning_rate": 9.035449803045792e-05, + "loss": 0.86717606, + "num_input_tokens_seen": 349900080, + "router_z_loss_mlp": 0.23400879, + "step": 4218, + "time_per_iteration": 2.835637092590332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070157, + "balance_loss_mlp": 1.04660106, + "epoch": 0.8116583301269719, + "flos": 649951340544.0, + "grad_norm": 0.056455139203309426, + "language_loss": 0.79380965, + "learning_rate": 9.017594607640211e-05, + "loss": 0.80451119, + "num_input_tokens_seen": 349983568, + "router_z_loss_mlp": 0.23535156, + "step": 4219, + "time_per_iteration": 2.942309617996216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074419, + "balance_loss_mlp": 1.05086303, + "epoch": 0.8118507118122354, + "flos": 553087844352.0, + "grad_norm": 0.06424342270235255, + "language_loss": 0.80477631, + "learning_rate": 8.999755323015463e-05, + "loss": 0.81552052, + "num_input_tokens_seen": 350054928, + "router_z_loss_mlp": 0.23535156, + "step": 4220, + "time_per_iteration": 2.708371877670288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069972, + "balance_loss_mlp": 1.04653561, + "epoch": 0.812043093497499, + "flos": 544118819328.0, + "grad_norm": 0.07378899764991645, + "language_loss": 0.87550426, + "learning_rate": 8.981931956097384e-05, + "loss": 0.88620389, + "num_input_tokens_seen": 350127872, + "router_z_loss_mlp": 0.234375, + "step": 4221, + "time_per_iteration": 2.6693153381347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069628, + "balance_loss_mlp": 1.04652536, + "epoch": 0.8122354751827626, + "flos": 583404788736.0, + "grad_norm": 0.054351261228181624, + "language_loss": 0.83540028, + "learning_rate": 8.964124513805628e-05, + "loss": 0.84609658, + "num_input_tokens_seen": 350206592, + "router_z_loss_mlp": 0.23095703, + "step": 4222, + "time_per_iteration": 2.7870144844055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003984, + "balance_loss_mlp": 0.99706995, + "epoch": 0.8124278568680262, + "flos": 1530568120320.0, + "grad_norm": 0.009675250582902717, + "language_loss": 0.78250074, + "learning_rate": 8.94633300305363e-05, + "loss": 0.79254061, + "num_input_tokens_seen": 350436048, + "router_z_loss_mlp": 0.06933594, + "step": 4223, + "time_per_iteration": 4.929150581359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073586, + "balance_loss_mlp": 1.04953003, + "epoch": 0.8126202385532897, + "flos": 432865161216.0, + "grad_norm": 0.06048023651851461, + "language_loss": 0.80246794, + "learning_rate": 8.928557430748668e-05, + "loss": 0.81320387, + "num_input_tokens_seen": 350501376, + "router_z_loss_mlp": 0.24047852, + "step": 4224, + "time_per_iteration": 2.558927059173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005286, + "balance_loss_mlp": 0.99841946, + "epoch": 0.8128126202385533, + "flos": 1547905987584.0, + "grad_norm": 0.007942256322590725, + "language_loss": 0.76495624, + "learning_rate": 8.910797803791854e-05, + "loss": 0.77500916, + "num_input_tokens_seen": 350735232, + "router_z_loss_mlp": 0.06884766, + "step": 4225, + "time_per_iteration": 4.829640865325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107197, + "balance_loss_mlp": 1.04875958, + "epoch": 0.8130050019238169, + "flos": 528317945856.0, + "grad_norm": 0.06071065403013227, + "language_loss": 0.89078009, + "learning_rate": 8.893054129078077e-05, + "loss": 0.90149975, + "num_input_tokens_seen": 350805088, + "router_z_loss_mlp": 0.23217773, + "step": 4226, + "time_per_iteration": 2.6663520336151123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074451, + "balance_loss_mlp": 1.04990602, + "epoch": 0.8131973836090804, + "flos": 543125481984.0, + "grad_norm": 0.05854730036453429, + "language_loss": 0.80283505, + "learning_rate": 8.875326413496037e-05, + "loss": 0.81357956, + "num_input_tokens_seen": 350876896, + "router_z_loss_mlp": 0.24523926, + "step": 4227, + "time_per_iteration": 2.737178087234497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010742, + "balance_loss_mlp": 1.0496552, + "epoch": 0.8133897652943439, + "flos": 576494019072.0, + "grad_norm": 0.07250995282732464, + "language_loss": 0.82311916, + "learning_rate": 8.857614663928249e-05, + "loss": 0.83386123, + "num_input_tokens_seen": 350948400, + "router_z_loss_mlp": 0.2454834, + "step": 4228, + "time_per_iteration": 2.7085983753204346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072354, + "balance_loss_mlp": 1.0487864, + "epoch": 0.8135821469796075, + "flos": 579219268608.0, + "grad_norm": 0.06181457109601203, + "language_loss": 0.79171389, + "learning_rate": 8.839918887251025e-05, + "loss": 0.80243742, + "num_input_tokens_seen": 351023328, + "router_z_loss_mlp": 0.23571777, + "step": 4229, + "time_per_iteration": 2.752716541290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107323, + "balance_loss_mlp": 1.04960322, + "epoch": 0.8137745286648711, + "flos": 650346693120.0, + "grad_norm": 0.05392017949015088, + "language_loss": 0.84132361, + "learning_rate": 8.822239090334472e-05, + "loss": 0.85205585, + "num_input_tokens_seen": 351108672, + "router_z_loss_mlp": 0.23620605, + "step": 4230, + "time_per_iteration": 2.934464931488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107431, + "balance_loss_mlp": 1.04952633, + "epoch": 0.8139669103501347, + "flos": 701888219136.0, + "grad_norm": 0.059650707054353234, + "language_loss": 0.75657654, + "learning_rate": 8.804575280042493e-05, + "loss": 0.76731968, + "num_input_tokens_seen": 351185056, + "router_z_loss_mlp": 0.2479248, + "step": 4231, + "time_per_iteration": 2.8997764587402344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107496, + "balance_loss_mlp": 1.05128467, + "epoch": 0.8141592920353983, + "flos": 650223355392.0, + "grad_norm": 0.08031341432177187, + "language_loss": 0.83293855, + "learning_rate": 8.786927463232774e-05, + "loss": 0.84368813, + "num_input_tokens_seen": 351255856, + "router_z_loss_mlp": 0.23657227, + "step": 4232, + "time_per_iteration": 2.823725938796997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071255, + "balance_loss_mlp": 1.04657876, + "epoch": 0.8143516737206618, + "flos": 536829949440.0, + "grad_norm": 0.06617341429663869, + "language_loss": 0.81372333, + "learning_rate": 8.769295646756853e-05, + "loss": 0.82443595, + "num_input_tokens_seen": 351322336, + "router_z_loss_mlp": 0.24694824, + "step": 4233, + "time_per_iteration": 2.596062660217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076841, + "balance_loss_mlp": 1.05249834, + "epoch": 0.8145440554059253, + "flos": 508366056960.0, + "grad_norm": 0.06250257700047851, + "language_loss": 0.82593143, + "learning_rate": 8.751679837459963e-05, + "loss": 0.83669984, + "num_input_tokens_seen": 351387440, + "router_z_loss_mlp": 0.2434082, + "step": 4234, + "time_per_iteration": 2.573758602142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072995, + "balance_loss_mlp": 1.04948664, + "epoch": 0.8147364370911889, + "flos": 635032576512.0, + "grad_norm": 0.05248303737167748, + "language_loss": 0.86687446, + "learning_rate": 8.734080042181181e-05, + "loss": 0.87760437, + "num_input_tokens_seen": 351464192, + "router_z_loss_mlp": 0.23498535, + "step": 4235, + "time_per_iteration": 2.8768227100372314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071616, + "balance_loss_mlp": 1.04670143, + "epoch": 0.8149288187764525, + "flos": 422801482752.0, + "grad_norm": 0.0632046895747122, + "language_loss": 0.78667194, + "learning_rate": 8.716496267753343e-05, + "loss": 0.79738808, + "num_input_tokens_seen": 351528016, + "router_z_loss_mlp": 0.24926758, + "step": 4236, + "time_per_iteration": 2.479666233062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072748, + "balance_loss_mlp": 1.05016923, + "epoch": 0.8151212004617161, + "flos": 597444014592.0, + "grad_norm": 0.05718912918333018, + "language_loss": 0.81758833, + "learning_rate": 8.698928521003097e-05, + "loss": 0.82831585, + "num_input_tokens_seen": 351601648, + "router_z_loss_mlp": 0.22583008, + "step": 4237, + "time_per_iteration": 2.7750775814056396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010102, + "balance_loss_mlp": 1.00333071, + "epoch": 0.8153135821469796, + "flos": 1479330915840.0, + "grad_norm": 0.007872761056194484, + "language_loss": 0.77852845, + "learning_rate": 8.681376808750835e-05, + "loss": 0.78862947, + "num_input_tokens_seen": 351826720, + "router_z_loss_mlp": 0.06787109, + "step": 4238, + "time_per_iteration": 4.9892895221710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070513, + "balance_loss_mlp": 1.04645634, + "epoch": 0.8155059638322432, + "flos": 437097669120.0, + "grad_norm": 0.058923617839845364, + "language_loss": 0.82525653, + "learning_rate": 8.663841137810741e-05, + "loss": 0.83596164, + "num_input_tokens_seen": 351891760, + "router_z_loss_mlp": 0.24035645, + "step": 4239, + "time_per_iteration": 2.5066046714782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070695, + "balance_loss_mlp": 1.04635239, + "epoch": 0.8156983455175068, + "flos": 794390727168.0, + "grad_norm": 0.05678086613450876, + "language_loss": 0.85605669, + "learning_rate": 8.646321514990763e-05, + "loss": 0.86676371, + "num_input_tokens_seen": 351977504, + "router_z_loss_mlp": 0.24328613, + "step": 4240, + "time_per_iteration": 3.0692131519317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073498, + "balance_loss_mlp": 1.0495373, + "epoch": 0.8158907272027703, + "flos": 685986029568.0, + "grad_norm": 0.059688242669239784, + "language_loss": 0.81930083, + "learning_rate": 8.628817947092616e-05, + "loss": 0.83003575, + "num_input_tokens_seen": 352050176, + "router_z_loss_mlp": 0.23974609, + "step": 4241, + "time_per_iteration": 2.845006227493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074554, + "balance_loss_mlp": 1.05058062, + "epoch": 0.8160831088880338, + "flos": 487055213568.0, + "grad_norm": 0.07205757206043806, + "language_loss": 0.8466413, + "learning_rate": 8.611330440911797e-05, + "loss": 0.85738689, + "num_input_tokens_seen": 352116848, + "router_z_loss_mlp": 0.23974609, + "step": 4242, + "time_per_iteration": 2.540684461593628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076416, + "balance_loss_mlp": 1.0526818, + "epoch": 0.8162754905732974, + "flos": 464872172544.0, + "grad_norm": 0.060485918814619816, + "language_loss": 0.80289745, + "learning_rate": 8.593859003237558e-05, + "loss": 0.81366158, + "num_input_tokens_seen": 352185056, + "router_z_loss_mlp": 0.23730469, + "step": 4243, + "time_per_iteration": 2.5800933837890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010569, + "balance_loss_mlp": 1.00384557, + "epoch": 0.816467872258561, + "flos": 1239530522112.0, + "grad_norm": 0.005783108588514147, + "language_loss": 0.75285125, + "learning_rate": 8.576403640852904e-05, + "loss": 0.76295686, + "num_input_tokens_seen": 352397648, + "router_z_loss_mlp": 0.06738281, + "step": 4244, + "time_per_iteration": 4.735422611236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073017, + "balance_loss_mlp": 1.04923451, + "epoch": 0.8166602539438246, + "flos": 687169516032.0, + "grad_norm": 0.058029924379859654, + "language_loss": 0.86948967, + "learning_rate": 8.558964360534615e-05, + "loss": 0.88021982, + "num_input_tokens_seen": 352478272, + "router_z_loss_mlp": 0.23779297, + "step": 4245, + "time_per_iteration": 2.920170783996582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011667, + "balance_loss_mlp": 1.00494385, + "epoch": 0.8168526356290882, + "flos": 1490520807936.0, + "grad_norm": 0.004946966844072032, + "language_loss": 0.72974741, + "learning_rate": 8.541541169053219e-05, + "loss": 0.73986411, + "num_input_tokens_seen": 352707104, + "router_z_loss_mlp": 0.06738281, + "step": 4246, + "time_per_iteration": 4.9647252559661865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073957, + "balance_loss_mlp": 1.0499959, + "epoch": 0.8170450173143516, + "flos": 578201338368.0, + "grad_norm": 0.05414239297152735, + "language_loss": 0.84796524, + "learning_rate": 8.524134073172984e-05, + "loss": 0.85870481, + "num_input_tokens_seen": 352779248, + "router_z_loss_mlp": 0.23950195, + "step": 4247, + "time_per_iteration": 2.730175256729126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073063, + "balance_loss_mlp": 1.04956651, + "epoch": 0.8172373989996152, + "flos": 571275514368.0, + "grad_norm": 0.0616901984083448, + "language_loss": 0.84620667, + "learning_rate": 8.506743079651974e-05, + "loss": 0.85693735, + "num_input_tokens_seen": 352856784, + "router_z_loss_mlp": 0.23486328, + "step": 4248, + "time_per_iteration": 2.741957664489746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076046, + "balance_loss_mlp": 1.05173934, + "epoch": 0.8174297806848788, + "flos": 528831866880.0, + "grad_norm": 0.06067464626133788, + "language_loss": 0.81253791, + "learning_rate": 8.489368195241948e-05, + "loss": 0.82329834, + "num_input_tokens_seen": 352926496, + "router_z_loss_mlp": 0.24304199, + "step": 4249, + "time_per_iteration": 2.7099785804748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069837, + "balance_loss_mlp": 1.04637706, + "epoch": 0.8176221623701424, + "flos": 569108602368.0, + "grad_norm": 0.057171818885021125, + "language_loss": 0.7918399, + "learning_rate": 8.47200942668846e-05, + "loss": 0.80253828, + "num_input_tokens_seen": 353005312, + "router_z_loss_mlp": 0.23461914, + "step": 4250, + "time_per_iteration": 2.7882819175720215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074681, + "balance_loss_mlp": 1.05128062, + "epoch": 0.8178145440554059, + "flos": 656521459200.0, + "grad_norm": 0.07586143278279464, + "language_loss": 0.80699354, + "learning_rate": 8.454666780730735e-05, + "loss": 0.81774032, + "num_input_tokens_seen": 353085120, + "router_z_loss_mlp": 0.23388672, + "step": 4251, + "time_per_iteration": 2.8482766151428223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069339, + "balance_loss_mlp": 1.04569983, + "epoch": 0.8180069257406695, + "flos": 545924883456.0, + "grad_norm": 0.0891506562399526, + "language_loss": 0.87943554, + "learning_rate": 8.437340264101828e-05, + "loss": 0.89012891, + "num_input_tokens_seen": 353160992, + "router_z_loss_mlp": 0.23657227, + "step": 4252, + "time_per_iteration": 2.690966844558716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070632, + "balance_loss_mlp": 1.04671824, + "epoch": 0.818199307425933, + "flos": 619271350272.0, + "grad_norm": 0.06457353202356947, + "language_loss": 0.84974635, + "learning_rate": 8.420029883528474e-05, + "loss": 0.86045271, + "num_input_tokens_seen": 353233328, + "router_z_loss_mlp": 0.23876953, + "step": 4253, + "time_per_iteration": 2.7346584796905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107695, + "balance_loss_mlp": 1.05236959, + "epoch": 0.8183916891111966, + "flos": 647618872320.0, + "grad_norm": 0.058517110393729366, + "language_loss": 0.7747196, + "learning_rate": 8.402735645731157e-05, + "loss": 0.78548908, + "num_input_tokens_seen": 353310592, + "router_z_loss_mlp": 0.24560547, + "step": 4254, + "time_per_iteration": 2.902817964553833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073698, + "balance_loss_mlp": 1.05016649, + "epoch": 0.8185840707964602, + "flos": 499120247808.0, + "grad_norm": 0.0672149584819886, + "language_loss": 0.78553545, + "learning_rate": 8.385457557424098e-05, + "loss": 0.79627252, + "num_input_tokens_seen": 353376544, + "router_z_loss_mlp": 0.23510742, + "step": 4255, + "time_per_iteration": 2.5662145614624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068342, + "balance_loss_mlp": 1.04424965, + "epoch": 0.8187764524817237, + "flos": 786229659648.0, + "grad_norm": 0.04856216619330027, + "language_loss": 0.79861438, + "learning_rate": 8.368195625315251e-05, + "loss": 0.8092978, + "num_input_tokens_seen": 353461200, + "router_z_loss_mlp": 0.24084473, + "step": 4256, + "time_per_iteration": 3.066568374633789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072675, + "balance_loss_mlp": 1.04816532, + "epoch": 0.8189688341669873, + "flos": 550710959616.0, + "grad_norm": 0.0482875627424869, + "language_loss": 0.8077091, + "learning_rate": 8.350949856106283e-05, + "loss": 0.81843579, + "num_input_tokens_seen": 353538608, + "router_z_loss_mlp": 0.24487305, + "step": 4257, + "time_per_iteration": 2.8208279609680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008421, + "balance_loss_mlp": 1.00174499, + "epoch": 0.8191612158522509, + "flos": 1351972435968.0, + "grad_norm": 0.0038578999028146157, + "language_loss": 0.71149343, + "learning_rate": 8.333720256492599e-05, + "loss": 0.7215777, + "num_input_tokens_seen": 353766960, + "router_z_loss_mlp": 0.06689453, + "step": 4258, + "time_per_iteration": 4.855503082275391 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069037, + "balance_loss_mlp": 1.04530239, + "epoch": 0.8193535975375145, + "flos": 544257211392.0, + "grad_norm": 0.06050178824041954, + "language_loss": 0.84065247, + "learning_rate": 8.316506833163318e-05, + "loss": 0.85134286, + "num_input_tokens_seen": 353833552, + "router_z_loss_mlp": 0.23718262, + "step": 4259, + "time_per_iteration": 2.6480743885040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106949, + "balance_loss_mlp": 1.04551733, + "epoch": 0.8195459792227779, + "flos": 865733266944.0, + "grad_norm": 0.053169941526036414, + "language_loss": 0.85682011, + "learning_rate": 8.299309592801297e-05, + "loss": 0.86751509, + "num_input_tokens_seen": 353915520, + "router_z_loss_mlp": 0.23986816, + "step": 4260, + "time_per_iteration": 3.0866482257843018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073976, + "balance_loss_mlp": 1.05007422, + "epoch": 0.8197383609080415, + "flos": 569293982208.0, + "grad_norm": 0.061667842904939214, + "language_loss": 0.81912506, + "learning_rate": 8.282128542083101e-05, + "loss": 0.8298648, + "num_input_tokens_seen": 353992048, + "router_z_loss_mlp": 0.2388916, + "step": 4261, + "time_per_iteration": 2.675989866256714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106688, + "balance_loss_mlp": 1.04430175, + "epoch": 0.8199307425933051, + "flos": 530813399040.0, + "grad_norm": 0.053936985731850594, + "language_loss": 0.84986472, + "learning_rate": 8.264963687678978e-05, + "loss": 0.86053348, + "num_input_tokens_seen": 354064848, + "router_z_loss_mlp": 0.22583008, + "step": 4262, + "time_per_iteration": 2.6208925247192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073879, + "balance_loss_mlp": 1.04916656, + "epoch": 0.8201231242785687, + "flos": 567070170624.0, + "grad_norm": 0.053552526024574415, + "language_loss": 0.85210896, + "learning_rate": 8.247815036252921e-05, + "loss": 0.86284775, + "num_input_tokens_seen": 354138848, + "router_z_loss_mlp": 0.24719238, + "step": 4263, + "time_per_iteration": 2.716259717941284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070202, + "balance_loss_mlp": 1.04670596, + "epoch": 0.8203155059638323, + "flos": 1230505717248.0, + "grad_norm": 0.061528756897057495, + "language_loss": 0.82971454, + "learning_rate": 8.230682594462652e-05, + "loss": 0.84041655, + "num_input_tokens_seen": 354227696, + "router_z_loss_mlp": 0.23474121, + "step": 4264, + "time_per_iteration": 3.53371000289917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070087, + "balance_loss_mlp": 1.04562545, + "epoch": 0.8205078876490958, + "flos": 574198626816.0, + "grad_norm": 0.05437009195264598, + "language_loss": 0.80134302, + "learning_rate": 8.213566368959558e-05, + "loss": 0.81204391, + "num_input_tokens_seen": 354298400, + "router_z_loss_mlp": 0.24462891, + "step": 4265, + "time_per_iteration": 2.678452730178833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068039, + "balance_loss_mlp": 1.04403043, + "epoch": 0.8207002693343594, + "flos": 931400280576.0, + "grad_norm": 0.05700879545907034, + "language_loss": 0.78525734, + "learning_rate": 8.196466366388744e-05, + "loss": 0.79593778, + "num_input_tokens_seen": 354385024, + "router_z_loss_mlp": 0.2401123, + "step": 4266, + "time_per_iteration": 3.185957670211792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106677, + "balance_loss_mlp": 1.04407215, + "epoch": 0.8208926510196229, + "flos": 549571889664.0, + "grad_norm": 0.05502076151782196, + "language_loss": 0.80635643, + "learning_rate": 8.179382593389029e-05, + "loss": 0.81702411, + "num_input_tokens_seen": 354456384, + "router_z_loss_mlp": 0.22692871, + "step": 4267, + "time_per_iteration": 2.6516170501708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070354, + "balance_loss_mlp": 1.04677415, + "epoch": 0.8210850327048865, + "flos": 648182352384.0, + "grad_norm": 0.05163863039989425, + "language_loss": 0.82111454, + "learning_rate": 8.162315056592918e-05, + "loss": 0.8318181, + "num_input_tokens_seen": 354531296, + "router_z_loss_mlp": 0.23596191, + "step": 4268, + "time_per_iteration": 2.8418874740600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068732, + "balance_loss_mlp": 1.04558122, + "epoch": 0.82127741439015, + "flos": 601520878080.0, + "grad_norm": 0.0584034837454372, + "language_loss": 0.81278813, + "learning_rate": 8.145263762626615e-05, + "loss": 0.82347548, + "num_input_tokens_seen": 354605680, + "router_z_loss_mlp": 0.23144531, + "step": 4269, + "time_per_iteration": 2.7613987922668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070732, + "balance_loss_mlp": 1.04777241, + "epoch": 0.8214697960754136, + "flos": 474831963648.0, + "grad_norm": 0.09670213264130083, + "language_loss": 0.8378197, + "learning_rate": 8.128228718110015e-05, + "loss": 0.84852707, + "num_input_tokens_seen": 354678160, + "router_z_loss_mlp": 0.22973633, + "step": 4270, + "time_per_iteration": 2.718817949295044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066128, + "balance_loss_mlp": 1.04242873, + "epoch": 0.8216621777606772, + "flos": 903648172032.0, + "grad_norm": 0.06875355685078723, + "language_loss": 0.85044003, + "learning_rate": 8.11120992965671e-05, + "loss": 0.86110133, + "num_input_tokens_seen": 354751024, + "router_z_loss_mlp": 0.23693848, + "step": 4271, + "time_per_iteration": 3.1280128955841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068814, + "balance_loss_mlp": 1.04395938, + "epoch": 0.8218545594459408, + "flos": 514461528576.0, + "grad_norm": 0.08013013312339903, + "language_loss": 0.82012117, + "learning_rate": 8.094207403873998e-05, + "loss": 0.83080924, + "num_input_tokens_seen": 354819408, + "router_z_loss_mlp": 0.2487793, + "step": 4272, + "time_per_iteration": 2.6690683364868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070725, + "balance_loss_mlp": 1.04672813, + "epoch": 0.8220469411312044, + "flos": 494536803840.0, + "grad_norm": 0.05213060875832122, + "language_loss": 0.863258, + "learning_rate": 8.077221147362829e-05, + "loss": 0.87396526, + "num_input_tokens_seen": 354887376, + "router_z_loss_mlp": 0.23986816, + "step": 4273, + "time_per_iteration": 2.6432783603668213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107137, + "balance_loss_mlp": 1.04725409, + "epoch": 0.8222393228164678, + "flos": 386433483264.0, + "grad_norm": 0.07702362963996895, + "language_loss": 0.89937019, + "learning_rate": 8.060251166717835e-05, + "loss": 0.91008389, + "num_input_tokens_seen": 354948288, + "router_z_loss_mlp": 0.24108887, + "step": 4274, + "time_per_iteration": 2.4411094188690186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075339, + "balance_loss_mlp": 1.0512464, + "epoch": 0.8224317045017314, + "flos": 536590241280.0, + "grad_norm": 0.05073133205853468, + "language_loss": 0.87501049, + "learning_rate": 8.043297468527383e-05, + "loss": 0.88576388, + "num_input_tokens_seen": 355016912, + "router_z_loss_mlp": 0.2409668, + "step": 4275, + "time_per_iteration": 2.6580936908721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072046, + "balance_loss_mlp": 1.04795384, + "epoch": 0.822624086186995, + "flos": 554899051008.0, + "grad_norm": 0.061816323283698855, + "language_loss": 0.82731915, + "learning_rate": 8.02636005937346e-05, + "loss": 0.83803964, + "num_input_tokens_seen": 355085936, + "router_z_loss_mlp": 0.24084473, + "step": 4276, + "time_per_iteration": 2.639394760131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071877, + "balance_loss_mlp": 1.04751015, + "epoch": 0.8228164678722586, + "flos": 539579791872.0, + "grad_norm": 0.05838521013995571, + "language_loss": 0.80136567, + "learning_rate": 8.009438945831771e-05, + "loss": 0.81208444, + "num_input_tokens_seen": 355161984, + "router_z_loss_mlp": 0.24377441, + "step": 4277, + "time_per_iteration": 2.7059576511383057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074492, + "balance_loss_mlp": 1.0514127, + "epoch": 0.8230088495575221, + "flos": 473253124608.0, + "grad_norm": 0.060736883959351555, + "language_loss": 0.79328179, + "learning_rate": 7.992534134471641e-05, + "loss": 0.80402672, + "num_input_tokens_seen": 355234544, + "router_z_loss_mlp": 0.23083496, + "step": 4278, + "time_per_iteration": 2.6723952293395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074832, + "balance_loss_mlp": 1.05057299, + "epoch": 0.8232012312427857, + "flos": 591672314880.0, + "grad_norm": 0.07690493497404575, + "language_loss": 0.8312307, + "learning_rate": 7.975645631856127e-05, + "loss": 0.84197903, + "num_input_tokens_seen": 355302896, + "router_z_loss_mlp": 0.24255371, + "step": 4279, + "time_per_iteration": 2.6807098388671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070086, + "balance_loss_mlp": 1.04629207, + "epoch": 0.8233936129280492, + "flos": 572644380672.0, + "grad_norm": 0.06508062719684189, + "language_loss": 0.74714917, + "learning_rate": 7.958773444541916e-05, + "loss": 0.75785005, + "num_input_tokens_seen": 355377040, + "router_z_loss_mlp": 0.23791504, + "step": 4280, + "time_per_iteration": 2.741769790649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074321, + "balance_loss_mlp": 1.05055034, + "epoch": 0.8235859946133128, + "flos": 731337735168.0, + "grad_norm": 0.0503633953001768, + "language_loss": 0.78454876, + "learning_rate": 7.941917579079383e-05, + "loss": 0.7952919, + "num_input_tokens_seen": 355461616, + "router_z_loss_mlp": 0.23779297, + "step": 4281, + "time_per_iteration": 3.003262758255005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074412, + "balance_loss_mlp": 1.05114222, + "epoch": 0.8237783762985764, + "flos": 570314483712.0, + "grad_norm": 0.06906172818982871, + "language_loss": 0.81415063, + "learning_rate": 7.92507804201253e-05, + "loss": 0.82489479, + "num_input_tokens_seen": 355532480, + "router_z_loss_mlp": 0.23278809, + "step": 4282, + "time_per_iteration": 2.717515230178833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006714, + "balance_loss_mlp": 0.99979985, + "epoch": 0.8239707579838399, + "flos": 1466232897024.0, + "grad_norm": 0.008081480434335962, + "language_loss": 0.75297678, + "learning_rate": 7.908254839879092e-05, + "loss": 0.76304388, + "num_input_tokens_seen": 355768752, + "router_z_loss_mlp": 0.06933594, + "step": 4283, + "time_per_iteration": 5.022932767868042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070622, + "balance_loss_mlp": 1.046947, + "epoch": 0.8241631396691035, + "flos": 467313297408.0, + "grad_norm": 0.07325194572667328, + "language_loss": 0.80727637, + "learning_rate": 7.89144797921037e-05, + "loss": 0.81798261, + "num_input_tokens_seen": 355838800, + "router_z_loss_mlp": 0.23657227, + "step": 4284, + "time_per_iteration": 2.6633455753326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007071, + "balance_loss_mlp": 1.0002048, + "epoch": 0.8243555213543671, + "flos": 1539426290688.0, + "grad_norm": 0.007371028264575339, + "language_loss": 0.77934271, + "learning_rate": 7.874657466531388e-05, + "loss": 0.78941345, + "num_input_tokens_seen": 356069280, + "router_z_loss_mlp": 0.06884766, + "step": 4285, + "time_per_iteration": 4.923422336578369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070494, + "balance_loss_mlp": 1.04681873, + "epoch": 0.8245479030396307, + "flos": 797429836800.0, + "grad_norm": 0.052917023765971946, + "language_loss": 0.82609844, + "learning_rate": 7.85788330836078e-05, + "loss": 0.83680344, + "num_input_tokens_seen": 356164528, + "router_z_loss_mlp": 0.23657227, + "step": 4286, + "time_per_iteration": 3.1522793769836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070568, + "balance_loss_mlp": 1.0472033, + "epoch": 0.8247402847248941, + "flos": 646114185216.0, + "grad_norm": 0.05594284535828706, + "language_loss": 0.76698542, + "learning_rate": 7.841125511210878e-05, + "loss": 0.77769113, + "num_input_tokens_seen": 356243600, + "router_z_loss_mlp": 0.23352051, + "step": 4287, + "time_per_iteration": 2.8717780113220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070808, + "balance_loss_mlp": 1.04741931, + "epoch": 0.8249326664101577, + "flos": 604421595648.0, + "grad_norm": 0.056506252147876544, + "language_loss": 0.79746181, + "learning_rate": 7.824384081587637e-05, + "loss": 0.80816984, + "num_input_tokens_seen": 356320320, + "router_z_loss_mlp": 0.23388672, + "step": 4288, + "time_per_iteration": 2.810127019882202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073495, + "balance_loss_mlp": 1.05003452, + "epoch": 0.8251250480954213, + "flos": 824369218560.0, + "grad_norm": 0.12017691240559532, + "language_loss": 0.86732876, + "learning_rate": 7.807659025990637e-05, + "loss": 0.87806374, + "num_input_tokens_seen": 356406928, + "router_z_loss_mlp": 0.23449707, + "step": 4289, + "time_per_iteration": 3.084991693496704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069032, + "balance_loss_mlp": 1.04540503, + "epoch": 0.8253174297806849, + "flos": 757382897664.0, + "grad_norm": 0.06647761450900506, + "language_loss": 0.78061903, + "learning_rate": 7.790950350913112e-05, + "loss": 0.79130936, + "num_input_tokens_seen": 356481456, + "router_z_loss_mlp": 0.23620605, + "step": 4290, + "time_per_iteration": 2.902520179748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070801, + "balance_loss_mlp": 1.047961, + "epoch": 0.8255098114659485, + "flos": 794469648384.0, + "grad_norm": 0.05336857596764246, + "language_loss": 0.87808669, + "learning_rate": 7.774258062841971e-05, + "loss": 0.88879466, + "num_input_tokens_seen": 356568736, + "router_z_loss_mlp": 0.22851562, + "step": 4291, + "time_per_iteration": 3.148972749710083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069085, + "balance_loss_mlp": 1.04600561, + "epoch": 0.825702193151212, + "flos": 710417475072.0, + "grad_norm": 0.062415193374422175, + "language_loss": 0.77379918, + "learning_rate": 7.757582168257731e-05, + "loss": 0.78448999, + "num_input_tokens_seen": 356643328, + "router_z_loss_mlp": 0.23071289, + "step": 4292, + "time_per_iteration": 2.851947784423828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069176, + "balance_loss_mlp": 1.04618037, + "epoch": 0.8258945748364755, + "flos": 683394029568.0, + "grad_norm": 0.06038634003775471, + "language_loss": 0.80922878, + "learning_rate": 7.740922673634537e-05, + "loss": 0.81992054, + "num_input_tokens_seen": 356723824, + "router_z_loss_mlp": 0.2298584, + "step": 4293, + "time_per_iteration": 2.892784357070923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073009, + "balance_loss_mlp": 1.04934597, + "epoch": 0.8260869565217391, + "flos": 594563120640.0, + "grad_norm": 0.05548696520695019, + "language_loss": 0.78991163, + "learning_rate": 7.724279585440186e-05, + "loss": 0.80064166, + "num_input_tokens_seen": 356796512, + "router_z_loss_mlp": 0.23669434, + "step": 4294, + "time_per_iteration": 2.72245192527771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072099, + "balance_loss_mlp": 1.04767334, + "epoch": 0.8262793382070027, + "flos": 651480993792.0, + "grad_norm": 0.06062374979520041, + "language_loss": 0.8531217, + "learning_rate": 7.707652910136098e-05, + "loss": 0.86384273, + "num_input_tokens_seen": 356868624, + "router_z_loss_mlp": 0.2442627, + "step": 4295, + "time_per_iteration": 2.771437406539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068167, + "balance_loss_mlp": 1.04420578, + "epoch": 0.8264717198922663, + "flos": 538922709504.0, + "grad_norm": 0.08624043107385752, + "language_loss": 0.84494382, + "learning_rate": 7.691042654177315e-05, + "loss": 0.85562551, + "num_input_tokens_seen": 356934368, + "router_z_loss_mlp": 0.23937988, + "step": 4296, + "time_per_iteration": 2.6370511054992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070653, + "balance_loss_mlp": 1.04687035, + "epoch": 0.8266641015775298, + "flos": 538949873664.0, + "grad_norm": 0.06654656411960891, + "language_loss": 0.75516403, + "learning_rate": 7.674448824012514e-05, + "loss": 0.76587057, + "num_input_tokens_seen": 357005536, + "router_z_loss_mlp": 0.23742676, + "step": 4297, + "time_per_iteration": 2.6292786598205566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074873, + "balance_loss_mlp": 1.05018449, + "epoch": 0.8268564832627934, + "flos": 585361728000.0, + "grad_norm": 0.053180571919213125, + "language_loss": 0.84044874, + "learning_rate": 7.657871426083979e-05, + "loss": 0.85119742, + "num_input_tokens_seen": 357082160, + "router_z_loss_mlp": 0.24682617, + "step": 4298, + "time_per_iteration": 2.7611520290374756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069074, + "balance_loss_mlp": 1.04500568, + "epoch": 0.827048864948057, + "flos": 430661173248.0, + "grad_norm": 0.0881519445405178, + "language_loss": 0.84719664, + "learning_rate": 7.641310466827667e-05, + "loss": 0.85788739, + "num_input_tokens_seen": 357146928, + "router_z_loss_mlp": 0.24035645, + "step": 4299, + "time_per_iteration": 2.4467363357543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106492, + "balance_loss_mlp": 1.04194832, + "epoch": 0.8272412466333205, + "flos": 1388430761472.0, + "grad_norm": 0.07161846854718491, + "language_loss": 0.85220098, + "learning_rate": 7.624765952673069e-05, + "loss": 0.86285019, + "num_input_tokens_seen": 357236768, + "router_z_loss_mlp": 0.22961426, + "step": 4300, + "time_per_iteration": 3.737535238265991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070983, + "balance_loss_mlp": 1.04735565, + "epoch": 0.827433628318584, + "flos": 538230749184.0, + "grad_norm": 0.057611680805270904, + "language_loss": 0.82816952, + "learning_rate": 7.608237890043335e-05, + "loss": 0.83887935, + "num_input_tokens_seen": 357307568, + "router_z_loss_mlp": 0.23608398, + "step": 4301, + "time_per_iteration": 2.67743182182312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064109, + "balance_loss_mlp": 1.03986228, + "epoch": 0.8276260100038476, + "flos": 730734981120.0, + "grad_norm": 0.06820448267452858, + "language_loss": 0.77710879, + "learning_rate": 7.59172628535526e-05, + "loss": 0.78774989, + "num_input_tokens_seen": 357387712, + "router_z_loss_mlp": 0.24230957, + "step": 4302, + "time_per_iteration": 2.923125743865967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071553, + "balance_loss_mlp": 1.04758024, + "epoch": 0.8278183916891112, + "flos": 871102273536.0, + "grad_norm": 0.050152131715166706, + "language_loss": 0.82855344, + "learning_rate": 7.575231145019196e-05, + "loss": 0.83926898, + "num_input_tokens_seen": 357473360, + "router_z_loss_mlp": 0.23962402, + "step": 4303, + "time_per_iteration": 3.1750757694244385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068661, + "balance_loss_mlp": 1.04548657, + "epoch": 0.8280107733743748, + "flos": 594543297024.0, + "grad_norm": 0.054286253624127895, + "language_loss": 0.78169298, + "learning_rate": 7.558752475439134e-05, + "loss": 0.79237956, + "num_input_tokens_seen": 357548432, + "router_z_loss_mlp": 0.23168945, + "step": 4304, + "time_per_iteration": 2.7362046241760254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066893, + "balance_loss_mlp": 1.04376626, + "epoch": 0.8282031550596384, + "flos": 768607667712.0, + "grad_norm": 0.057065663401577975, + "language_loss": 0.84445703, + "learning_rate": 7.542290283012653e-05, + "loss": 0.85512602, + "num_input_tokens_seen": 357625968, + "router_z_loss_mlp": 0.23120117, + "step": 4305, + "time_per_iteration": 3.0129735469818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064903, + "balance_loss_mlp": 1.04113245, + "epoch": 0.8283955367449019, + "flos": 696108805632.0, + "grad_norm": 0.05641540030356317, + "language_loss": 0.78055018, + "learning_rate": 7.525844574130947e-05, + "loss": 0.79119921, + "num_input_tokens_seen": 357705824, + "router_z_loss_mlp": 0.2376709, + "step": 4306, + "time_per_iteration": 2.9275615215301514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064861, + "balance_loss_mlp": 1.04099536, + "epoch": 0.8285879184301654, + "flos": 660630256128.0, + "grad_norm": 0.060113634614107715, + "language_loss": 0.83076233, + "learning_rate": 7.509415355178806e-05, + "loss": 0.84141093, + "num_input_tokens_seen": 357787040, + "router_z_loss_mlp": 0.23864746, + "step": 4307, + "time_per_iteration": 2.9101569652557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067435, + "balance_loss_mlp": 1.04316437, + "epoch": 0.828780300115429, + "flos": 558709042176.0, + "grad_norm": 0.06849376731379381, + "language_loss": 0.78221494, + "learning_rate": 7.493002632534618e-05, + "loss": 0.79288924, + "num_input_tokens_seen": 357856960, + "router_z_loss_mlp": 0.24255371, + "step": 4308, + "time_per_iteration": 2.653721570968628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066743, + "balance_loss_mlp": 1.04313982, + "epoch": 0.8289726818006926, + "flos": 830963930112.0, + "grad_norm": 0.06152111373530928, + "language_loss": 0.81996018, + "learning_rate": 7.476606412570352e-05, + "loss": 0.83062756, + "num_input_tokens_seen": 357937760, + "router_z_loss_mlp": 0.23596191, + "step": 4309, + "time_per_iteration": 3.0518198013305664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066895, + "balance_loss_mlp": 1.04339886, + "epoch": 0.8291650634859561, + "flos": 732289227264.0, + "grad_norm": 0.0622755107347832, + "language_loss": 0.80474293, + "learning_rate": 7.460226701651624e-05, + "loss": 0.81541193, + "num_input_tokens_seen": 358012480, + "router_z_loss_mlp": 0.23486328, + "step": 4310, + "time_per_iteration": 2.912027597427368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076228, + "balance_loss_mlp": 1.05124211, + "epoch": 0.8293574451712197, + "flos": 860910114816.0, + "grad_norm": 0.09220252331582286, + "language_loss": 0.81365955, + "learning_rate": 7.443863506137566e-05, + "loss": 0.82442182, + "num_input_tokens_seen": 358100720, + "router_z_loss_mlp": 0.24975586, + "step": 4311, + "time_per_iteration": 3.195178270339966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072175, + "balance_loss_mlp": 1.04859567, + "epoch": 0.8295498268564833, + "flos": 495156810240.0, + "grad_norm": 0.05370166855349878, + "language_loss": 0.82026577, + "learning_rate": 7.427516832380948e-05, + "loss": 0.83098745, + "num_input_tokens_seen": 358180496, + "router_z_loss_mlp": 0.2355957, + "step": 4312, + "time_per_iteration": 2.8094701766967773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071559, + "balance_loss_mlp": 1.04825366, + "epoch": 0.8297422085417469, + "flos": 554471391744.0, + "grad_norm": 0.056720068302814244, + "language_loss": 0.77900606, + "learning_rate": 7.4111866867281e-05, + "loss": 0.78972161, + "num_input_tokens_seen": 358261104, + "router_z_loss_mlp": 0.23266602, + "step": 4313, + "time_per_iteration": 2.8235886096954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070784, + "balance_loss_mlp": 1.04769325, + "epoch": 0.8299345902270104, + "flos": 1247497417728.0, + "grad_norm": 0.07777163044205856, + "language_loss": 0.77210194, + "learning_rate": 7.39487307551896e-05, + "loss": 0.78280979, + "num_input_tokens_seen": 358356368, + "router_z_loss_mlp": 0.23095703, + "step": 4314, + "time_per_iteration": 3.678664207458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073602, + "balance_loss_mlp": 1.04967725, + "epoch": 0.8301269719122739, + "flos": 585260411904.0, + "grad_norm": 0.058448053996127784, + "language_loss": 0.83041668, + "learning_rate": 7.378576005087034e-05, + "loss": 0.84115267, + "num_input_tokens_seen": 358429104, + "router_z_loss_mlp": 0.23925781, + "step": 4315, + "time_per_iteration": 2.7324447631835938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070451, + "balance_loss_mlp": 1.04641795, + "epoch": 0.8303193535975375, + "flos": 509732352000.0, + "grad_norm": 0.05225421465036167, + "language_loss": 0.85065174, + "learning_rate": 7.362295481759412e-05, + "loss": 0.86135626, + "num_input_tokens_seen": 358501344, + "router_z_loss_mlp": 0.24035645, + "step": 4316, + "time_per_iteration": 2.6747384071350098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069014, + "balance_loss_mlp": 1.04514837, + "epoch": 0.8305117352828011, + "flos": 580652375040.0, + "grad_norm": 0.06799760077646158, + "language_loss": 0.839338, + "learning_rate": 7.346031511856722e-05, + "loss": 0.85002816, + "num_input_tokens_seen": 358575584, + "router_z_loss_mlp": 0.23840332, + "step": 4317, + "time_per_iteration": 2.6837451457977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065832, + "balance_loss_mlp": 1.04169214, + "epoch": 0.8307041169680647, + "flos": 481626736128.0, + "grad_norm": 0.06243921298560797, + "language_loss": 0.79024559, + "learning_rate": 7.329784101693232e-05, + "loss": 0.80090392, + "num_input_tokens_seen": 358644304, + "router_z_loss_mlp": 0.24108887, + "step": 4318, + "time_per_iteration": 2.647980213165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069382, + "balance_loss_mlp": 1.04579091, + "epoch": 0.8308964986533282, + "flos": 624605852160.0, + "grad_norm": 0.06930026355697931, + "language_loss": 0.83169067, + "learning_rate": 7.313553257576727e-05, + "loss": 0.84238452, + "num_input_tokens_seen": 358712384, + "router_z_loss_mlp": 0.23596191, + "step": 4319, + "time_per_iteration": 2.7219338417053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066419, + "balance_loss_mlp": 1.04255295, + "epoch": 0.8310888803385917, + "flos": 827319495168.0, + "grad_norm": 0.062344981143790404, + "language_loss": 0.79345471, + "learning_rate": 7.297338985808589e-05, + "loss": 0.80411887, + "num_input_tokens_seen": 358789264, + "router_z_loss_mlp": 0.23864746, + "step": 4320, + "time_per_iteration": 3.0479512214660645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072012, + "balance_loss_mlp": 1.04771757, + "epoch": 0.8312812620238553, + "flos": 583743241728.0, + "grad_norm": 0.0524694898745761, + "language_loss": 0.82171959, + "learning_rate": 7.281141292683746e-05, + "loss": 0.83243972, + "num_input_tokens_seen": 358868976, + "router_z_loss_mlp": 0.24304199, + "step": 4321, + "time_per_iteration": 2.7896528244018555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064765, + "balance_loss_mlp": 1.04058981, + "epoch": 0.8314736437091189, + "flos": 1115605052928.0, + "grad_norm": 0.07871576581536682, + "language_loss": 0.74626267, + "learning_rate": 7.26496018449071e-05, + "loss": 0.75691032, + "num_input_tokens_seen": 358953600, + "router_z_loss_mlp": 0.24169922, + "step": 4322, + "time_per_iteration": 3.396692991256714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069483, + "balance_loss_mlp": 1.04443729, + "epoch": 0.8316660253943825, + "flos": 517547625984.0, + "grad_norm": 0.07516139933736624, + "language_loss": 0.8218689, + "learning_rate": 7.248795667511543e-05, + "loss": 0.83256376, + "num_input_tokens_seen": 359028768, + "router_z_loss_mlp": 0.25061035, + "step": 4323, + "time_per_iteration": 2.781719207763672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106835, + "balance_loss_mlp": 1.04479408, + "epoch": 0.831858407079646, + "flos": 795329736192.0, + "grad_norm": 0.060463118214837686, + "language_loss": 0.78359312, + "learning_rate": 7.232647748021864e-05, + "loss": 0.79427665, + "num_input_tokens_seen": 359116208, + "router_z_loss_mlp": 0.23547363, + "step": 4324, + "time_per_iteration": 2.991947650909424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064888, + "balance_loss_mlp": 1.04118955, + "epoch": 0.8320507887649096, + "flos": 549967242240.0, + "grad_norm": 0.06082533886363455, + "language_loss": 0.83160555, + "learning_rate": 7.216516432290843e-05, + "loss": 0.8422544, + "num_input_tokens_seen": 359189552, + "router_z_loss_mlp": 0.23693848, + "step": 4325, + "time_per_iteration": 2.6645991802215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066046, + "balance_loss_mlp": 1.04284835, + "epoch": 0.8322431704501732, + "flos": 479398155264.0, + "grad_norm": 0.060833888285275674, + "language_loss": 0.82192004, + "learning_rate": 7.20040172658123e-05, + "loss": 0.83258057, + "num_input_tokens_seen": 359253008, + "router_z_loss_mlp": 0.23181152, + "step": 4326, + "time_per_iteration": 2.5123751163482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064937, + "balance_loss_mlp": 1.04145336, + "epoch": 0.8324355521354367, + "flos": 572434407936.0, + "grad_norm": 0.06156259329877496, + "language_loss": 0.85313761, + "learning_rate": 7.184303637149308e-05, + "loss": 0.86378694, + "num_input_tokens_seen": 359326368, + "router_z_loss_mlp": 0.23486328, + "step": 4327, + "time_per_iteration": 2.6744894981384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069478, + "balance_loss_mlp": 1.04586303, + "epoch": 0.8326279338207002, + "flos": 503454071808.0, + "grad_norm": 0.05527860499037242, + "language_loss": 0.82345808, + "learning_rate": 7.168222170244888e-05, + "loss": 0.83415288, + "num_input_tokens_seen": 359394192, + "router_z_loss_mlp": 0.23596191, + "step": 4328, + "time_per_iteration": 2.5938544273376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064462, + "balance_loss_mlp": 1.04112041, + "epoch": 0.8328203155059638, + "flos": 605743474176.0, + "grad_norm": 0.054755319996718364, + "language_loss": 0.81181324, + "learning_rate": 7.152157332111364e-05, + "loss": 0.82245785, + "num_input_tokens_seen": 359476016, + "router_z_loss_mlp": 0.23339844, + "step": 4329, + "time_per_iteration": 2.899338960647583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066654, + "balance_loss_mlp": 1.04346812, + "epoch": 0.8330126971912274, + "flos": 697798872576.0, + "grad_norm": 0.05583188678549314, + "language_loss": 0.85963029, + "learning_rate": 7.136109128985663e-05, + "loss": 0.87029678, + "num_input_tokens_seen": 359554048, + "router_z_loss_mlp": 0.23168945, + "step": 4330, + "time_per_iteration": 2.9158973693847656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069345, + "balance_loss_mlp": 1.04619408, + "epoch": 0.833205078876491, + "flos": 494042706432.0, + "grad_norm": 0.058167926322513455, + "language_loss": 0.86570764, + "learning_rate": 7.120077567098249e-05, + "loss": 0.87640113, + "num_input_tokens_seen": 359621440, + "router_z_loss_mlp": 0.23168945, + "step": 4331, + "time_per_iteration": 2.552724838256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067021, + "balance_loss_mlp": 1.04359591, + "epoch": 0.8333974605617546, + "flos": 482812793856.0, + "grad_norm": 0.05466728002891162, + "language_loss": 0.82944643, + "learning_rate": 7.104062652673115e-05, + "loss": 0.84011662, + "num_input_tokens_seen": 359690320, + "router_z_loss_mlp": 0.23425293, + "step": 4332, + "time_per_iteration": 2.590282440185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060788, + "balance_loss_mlp": 1.03719616, + "epoch": 0.833589842247018, + "flos": 686821151232.0, + "grad_norm": 0.07782119871046231, + "language_loss": 0.83271265, + "learning_rate": 7.088064391927818e-05, + "loss": 0.84332061, + "num_input_tokens_seen": 359759888, + "router_z_loss_mlp": 0.23571777, + "step": 4333, + "time_per_iteration": 2.815814733505249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106529, + "balance_loss_mlp": 1.04137695, + "epoch": 0.8337822239322816, + "flos": 881739343872.0, + "grad_norm": 0.054030606180818834, + "language_loss": 0.82675385, + "learning_rate": 7.072082791073419e-05, + "loss": 0.83740675, + "num_input_tokens_seen": 359836544, + "router_z_loss_mlp": 0.23901367, + "step": 4334, + "time_per_iteration": 3.115084171295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068989, + "balance_loss_mlp": 1.04486084, + "epoch": 0.8339746056175452, + "flos": 497183132160.0, + "grad_norm": 0.059930468970718645, + "language_loss": 0.82674849, + "learning_rate": 7.056117856314531e-05, + "loss": 0.83743834, + "num_input_tokens_seen": 359903024, + "router_z_loss_mlp": 0.2409668, + "step": 4335, + "time_per_iteration": 2.587022066116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064499, + "balance_loss_mlp": 1.04075205, + "epoch": 0.8341669873028088, + "flos": 510495892992.0, + "grad_norm": 0.061364026256354405, + "language_loss": 0.86592519, + "learning_rate": 7.040169593849289e-05, + "loss": 0.87657017, + "num_input_tokens_seen": 359971200, + "router_z_loss_mlp": 0.23730469, + "step": 4336, + "time_per_iteration": 2.6579511165618896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065906, + "balance_loss_mlp": 1.04231429, + "epoch": 0.8343593689880723, + "flos": 692321209344.0, + "grad_norm": 0.058708202155102585, + "language_loss": 0.84223795, + "learning_rate": 7.024238009869366e-05, + "loss": 0.85289705, + "num_input_tokens_seen": 360042560, + "router_z_loss_mlp": 0.23583984, + "step": 4337, + "time_per_iteration": 2.7891974449157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064349, + "balance_loss_mlp": 1.04047179, + "epoch": 0.8345517506733359, + "flos": 552408367104.0, + "grad_norm": 0.06224088295195056, + "language_loss": 0.7808466, + "learning_rate": 7.008323110559956e-05, + "loss": 0.79149008, + "num_input_tokens_seen": 360118048, + "router_z_loss_mlp": 0.23864746, + "step": 4338, + "time_per_iteration": 2.7306392192840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065096, + "balance_loss_mlp": 1.04151607, + "epoch": 0.8347441323585995, + "flos": 592052613120.0, + "grad_norm": 0.06265232087954678, + "language_loss": 0.76197761, + "learning_rate": 6.992424902099754e-05, + "loss": 0.77262855, + "num_input_tokens_seen": 360192528, + "router_z_loss_mlp": 0.23608398, + "step": 4339, + "time_per_iteration": 2.866647243499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066981, + "balance_loss_mlp": 1.04350924, + "epoch": 0.834936514043863, + "flos": 614917702656.0, + "grad_norm": 0.05669391690274369, + "language_loss": 0.84682417, + "learning_rate": 6.976543390660983e-05, + "loss": 0.85749394, + "num_input_tokens_seen": 360266880, + "router_z_loss_mlp": 0.234375, + "step": 4340, + "time_per_iteration": 2.7687973976135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059648, + "balance_loss_mlp": 1.03600907, + "epoch": 0.8351288957291266, + "flos": 467844470784.0, + "grad_norm": 0.057030478490448615, + "language_loss": 0.79855502, + "learning_rate": 6.960678582409424e-05, + "loss": 0.80915147, + "num_input_tokens_seen": 360336336, + "router_z_loss_mlp": 0.23608398, + "step": 4341, + "time_per_iteration": 2.600409984588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064852, + "balance_loss_mlp": 1.04139209, + "epoch": 0.8353212774143901, + "flos": 509319747072.0, + "grad_norm": 0.05124905327439443, + "language_loss": 0.79103041, + "learning_rate": 6.944830483504328e-05, + "loss": 0.8016789, + "num_input_tokens_seen": 360409776, + "router_z_loss_mlp": 0.23474121, + "step": 4342, + "time_per_iteration": 2.6188149452209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059803, + "balance_loss_mlp": 1.03578269, + "epoch": 0.8355136590996537, + "flos": 687784753152.0, + "grad_norm": 0.0559253848941325, + "language_loss": 0.80927384, + "learning_rate": 6.928999100098483e-05, + "loss": 0.8198719, + "num_input_tokens_seen": 360486800, + "router_z_loss_mlp": 0.2401123, + "step": 4343, + "time_per_iteration": 2.90815806388855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061481, + "balance_loss_mlp": 1.0387243, + "epoch": 0.8357060407849173, + "flos": 984409417728.0, + "grad_norm": 0.07183527883163628, + "language_loss": 0.8404789, + "learning_rate": 6.913184438338138e-05, + "loss": 0.85109377, + "num_input_tokens_seen": 360568624, + "router_z_loss_mlp": 0.22753906, + "step": 4344, + "time_per_iteration": 3.2577321529388428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063929, + "balance_loss_mlp": 1.0402298, + "epoch": 0.8358984224701809, + "flos": 843026393088.0, + "grad_norm": 0.05241203479209445, + "language_loss": 0.85357249, + "learning_rate": 6.89738650436313e-05, + "loss": 0.86421174, + "num_input_tokens_seen": 360652384, + "router_z_loss_mlp": 0.23669434, + "step": 4345, + "time_per_iteration": 3.163863182067871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061855, + "balance_loss_mlp": 1.03823972, + "epoch": 0.8360908041554445, + "flos": 626239019520.0, + "grad_norm": 0.06325665273743743, + "language_loss": 0.82319045, + "learning_rate": 6.881605304306748e-05, + "loss": 0.83380902, + "num_input_tokens_seen": 360723200, + "router_z_loss_mlp": 0.23620605, + "step": 4346, + "time_per_iteration": 2.764380693435669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061388, + "balance_loss_mlp": 1.03774893, + "epoch": 0.8362831858407079, + "flos": 576068931072.0, + "grad_norm": 0.05825131988616556, + "language_loss": 0.84927475, + "learning_rate": 6.865840844295796e-05, + "loss": 0.85988867, + "num_input_tokens_seen": 360798240, + "router_z_loss_mlp": 0.2364502, + "step": 4347, + "time_per_iteration": 2.7342264652252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061671, + "balance_loss_mlp": 1.03713763, + "epoch": 0.8364755675259715, + "flos": 833783155200.0, + "grad_norm": 0.0627395482486057, + "language_loss": 0.80987006, + "learning_rate": 6.850093130450569e-05, + "loss": 0.82048672, + "num_input_tokens_seen": 360873552, + "router_z_loss_mlp": 0.24523926, + "step": 4348, + "time_per_iteration": 3.092926263809204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063399, + "balance_loss_mlp": 1.04023659, + "epoch": 0.8366679492112351, + "flos": 582480834048.0, + "grad_norm": 0.07064717826743147, + "language_loss": 0.8640992, + "learning_rate": 6.834362168884912e-05, + "loss": 0.87473315, + "num_input_tokens_seen": 360940800, + "router_z_loss_mlp": 0.23168945, + "step": 4349, + "time_per_iteration": 2.7598204612731934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065652, + "balance_loss_mlp": 1.04170275, + "epoch": 0.8368603308964987, + "flos": 611722948608.0, + "grad_norm": 0.07054756370503083, + "language_loss": 0.87762499, + "learning_rate": 6.818647965706076e-05, + "loss": 0.88828146, + "num_input_tokens_seen": 361014368, + "router_z_loss_mlp": 0.23950195, + "step": 4350, + "time_per_iteration": 2.7956182956695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106363, + "balance_loss_mlp": 1.04061055, + "epoch": 0.8370527125817622, + "flos": 507264062976.0, + "grad_norm": 0.05272263799194958, + "language_loss": 0.85641217, + "learning_rate": 6.802950527014884e-05, + "loss": 0.8670485, + "num_input_tokens_seen": 361087184, + "router_z_loss_mlp": 0.22998047, + "step": 4351, + "time_per_iteration": 2.759132146835327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057966, + "balance_loss_mlp": 1.03429115, + "epoch": 0.8372450942670258, + "flos": 770952619008.0, + "grad_norm": 0.06709118366747789, + "language_loss": 0.82653809, + "learning_rate": 6.787269858905603e-05, + "loss": 0.83711779, + "num_input_tokens_seen": 361160720, + "router_z_loss_mlp": 0.23657227, + "step": 4352, + "time_per_iteration": 2.9604341983795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069422, + "balance_loss_mlp": 1.04628313, + "epoch": 0.8374374759522893, + "flos": 579276168192.0, + "grad_norm": 0.062462890278505795, + "language_loss": 0.84993398, + "learning_rate": 6.771605967466033e-05, + "loss": 0.86062813, + "num_input_tokens_seen": 361234432, + "router_z_loss_mlp": 0.23144531, + "step": 4353, + "time_per_iteration": 2.7336621284484863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060703, + "balance_loss_mlp": 1.03720701, + "epoch": 0.8376298576375529, + "flos": 788129699328.0, + "grad_norm": 0.06847497017559315, + "language_loss": 0.82433045, + "learning_rate": 6.755958858777434e-05, + "loss": 0.83493751, + "num_input_tokens_seen": 361309376, + "router_z_loss_mlp": 0.23498535, + "step": 4354, + "time_per_iteration": 3.0050246715545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062824, + "balance_loss_mlp": 1.03801692, + "epoch": 0.8378222393228165, + "flos": 577613265408.0, + "grad_norm": 0.058961273709874806, + "language_loss": 0.80724108, + "learning_rate": 6.74032853891452e-05, + "loss": 0.81786942, + "num_input_tokens_seen": 361386768, + "router_z_loss_mlp": 0.24804688, + "step": 4355, + "time_per_iteration": 2.796504497528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063872, + "balance_loss_mlp": 1.03998232, + "epoch": 0.83801462100808, + "flos": 480865766400.0, + "grad_norm": 0.06959860060167573, + "language_loss": 0.82262826, + "learning_rate": 6.724715013945548e-05, + "loss": 0.83326697, + "num_input_tokens_seen": 361456704, + "router_z_loss_mlp": 0.23864746, + "step": 4356, + "time_per_iteration": 2.6413910388946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063736, + "balance_loss_mlp": 1.04075241, + "epoch": 0.8382070026933436, + "flos": 550817044992.0, + "grad_norm": 0.05613301024813433, + "language_loss": 0.89176285, + "learning_rate": 6.709118289932226e-05, + "loss": 0.9024002, + "num_input_tokens_seen": 361533648, + "router_z_loss_mlp": 0.2298584, + "step": 4357, + "time_per_iteration": 2.7601664066314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065704, + "balance_loss_mlp": 1.04187393, + "epoch": 0.8383993843786072, + "flos": 624968898048.0, + "grad_norm": 0.06393233563848542, + "language_loss": 0.82275569, + "learning_rate": 6.693538372929725e-05, + "loss": 0.83341277, + "num_input_tokens_seen": 361614256, + "router_z_loss_mlp": 0.23815918, + "step": 4358, + "time_per_iteration": 2.8808021545410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067548, + "balance_loss_mlp": 1.04333663, + "epoch": 0.8385917660638708, + "flos": 491169153024.0, + "grad_norm": 0.06064509036579976, + "language_loss": 0.8677128, + "learning_rate": 6.677975268986719e-05, + "loss": 0.87838829, + "num_input_tokens_seen": 361679008, + "router_z_loss_mlp": 0.24206543, + "step": 4359, + "time_per_iteration": 2.5335707664489746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066755, + "balance_loss_mlp": 1.04286551, + "epoch": 0.8387841477491342, + "flos": 466900692480.0, + "grad_norm": 0.06112190056703723, + "language_loss": 0.86871219, + "learning_rate": 6.662428984145336e-05, + "loss": 0.87937975, + "num_input_tokens_seen": 361747600, + "router_z_loss_mlp": 0.23913574, + "step": 4360, + "time_per_iteration": 2.5734517574310303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01002992, + "balance_loss_mlp": 0.99631584, + "epoch": 0.8389765294343978, + "flos": 1564188475392.0, + "grad_norm": 0.006766850503245408, + "language_loss": 0.71780187, + "learning_rate": 6.646899524441175e-05, + "loss": 0.72783178, + "num_input_tokens_seen": 361983104, + "router_z_loss_mlp": 0.06689453, + "step": 4361, + "time_per_iteration": 5.083199977874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068212, + "balance_loss_mlp": 1.04532373, + "epoch": 0.8391689111196614, + "flos": 602160708096.0, + "grad_norm": 0.050669789123315594, + "language_loss": 0.83081377, + "learning_rate": 6.631386895903308e-05, + "loss": 0.84149587, + "num_input_tokens_seen": 362065824, + "router_z_loss_mlp": 0.22888184, + "step": 4362, + "time_per_iteration": 2.8357574939727783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066938, + "balance_loss_mlp": 1.04351294, + "epoch": 0.839361292804925, + "flos": 443047408128.0, + "grad_norm": 0.06770273603538836, + "language_loss": 0.80559838, + "learning_rate": 6.615891104554261e-05, + "loss": 0.81626773, + "num_input_tokens_seen": 362128240, + "router_z_loss_mlp": 0.23400879, + "step": 4363, + "time_per_iteration": 2.536062479019165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065386, + "balance_loss_mlp": 1.04129338, + "epoch": 0.8395536744901886, + "flos": 594167768064.0, + "grad_norm": 0.0638325900212512, + "language_loss": 0.83115453, + "learning_rate": 6.600412156410057e-05, + "loss": 0.84180838, + "num_input_tokens_seen": 362198256, + "router_z_loss_mlp": 0.24060059, + "step": 4364, + "time_per_iteration": 2.71862530708313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063359, + "balance_loss_mlp": 1.03946948, + "epoch": 0.8397460561754521, + "flos": 889836171264.0, + "grad_norm": 0.05544062619942032, + "language_loss": 0.85130864, + "learning_rate": 6.58495005748016e-05, + "loss": 0.86194223, + "num_input_tokens_seen": 362279792, + "router_z_loss_mlp": 0.23864746, + "step": 4365, + "time_per_iteration": 3.1578779220581055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068008, + "balance_loss_mlp": 1.04511952, + "epoch": 0.8399384378607156, + "flos": 553503020544.0, + "grad_norm": 0.05406052024104549, + "language_loss": 0.89236188, + "learning_rate": 6.569504813767463e-05, + "loss": 0.90304196, + "num_input_tokens_seen": 362351712, + "router_z_loss_mlp": 0.22900391, + "step": 4366, + "time_per_iteration": 2.6701974868774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064854, + "balance_loss_mlp": 1.0414772, + "epoch": 0.8401308195459792, + "flos": 518923832832.0, + "grad_norm": 0.048370646551648334, + "language_loss": 0.83713633, + "learning_rate": 6.554076431268341e-05, + "loss": 0.84778494, + "num_input_tokens_seen": 362423424, + "router_z_loss_mlp": 0.23364258, + "step": 4367, + "time_per_iteration": 2.6388163566589355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069308, + "balance_loss_mlp": 1.04606247, + "epoch": 0.8403232012312428, + "flos": 684933221376.0, + "grad_norm": 0.07156350657988877, + "language_loss": 0.80791634, + "learning_rate": 6.538664915972648e-05, + "loss": 0.81860942, + "num_input_tokens_seen": 362514704, + "router_z_loss_mlp": 0.23242188, + "step": 4368, + "time_per_iteration": 3.0422544479370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066173, + "balance_loss_mlp": 1.04270053, + "epoch": 0.8405155829165063, + "flos": 577672736256.0, + "grad_norm": 0.09577954267651939, + "language_loss": 0.77953506, + "learning_rate": 6.523270273863652e-05, + "loss": 0.79019678, + "num_input_tokens_seen": 362581296, + "router_z_loss_mlp": 0.23449707, + "step": 4369, + "time_per_iteration": 2.7166330814361572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066047, + "balance_loss_mlp": 1.04277718, + "epoch": 0.8407079646017699, + "flos": 456627041280.0, + "grad_norm": 0.06624048319732295, + "language_loss": 0.88075888, + "learning_rate": 6.507892510918079e-05, + "loss": 0.89141929, + "num_input_tokens_seen": 362648304, + "router_z_loss_mlp": 0.23278809, + "step": 4370, + "time_per_iteration": 2.5422723293304443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070974, + "balance_loss_mlp": 1.04669082, + "epoch": 0.8409003462870335, + "flos": 534917426688.0, + "grad_norm": 0.07352168072762383, + "language_loss": 0.81707442, + "learning_rate": 6.492531633106114e-05, + "loss": 0.82778412, + "num_input_tokens_seen": 362721264, + "router_z_loss_mlp": 0.24291992, + "step": 4371, + "time_per_iteration": 2.7493014335632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067886, + "balance_loss_mlp": 1.04424715, + "epoch": 0.8410927279722971, + "flos": 556759443456.0, + "grad_norm": 0.07206783249977043, + "language_loss": 0.77901572, + "learning_rate": 6.477187646391374e-05, + "loss": 0.78969461, + "num_input_tokens_seen": 362795312, + "router_z_loss_mlp": 0.2364502, + "step": 4372, + "time_per_iteration": 2.724076986312866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009208, + "balance_loss_mlp": 1.00253248, + "epoch": 0.8412851096575606, + "flos": 1549754270208.0, + "grad_norm": 0.007754945157954517, + "language_loss": 0.77679121, + "learning_rate": 6.461860556730925e-05, + "loss": 0.78688329, + "num_input_tokens_seen": 363026272, + "router_z_loss_mlp": 0.06689453, + "step": 4373, + "time_per_iteration": 4.920220851898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069157, + "balance_loss_mlp": 1.04606605, + "epoch": 0.8414774913428241, + "flos": 552042749952.0, + "grad_norm": 0.05517644936985899, + "language_loss": 0.79005659, + "learning_rate": 6.446550370075271e-05, + "loss": 0.80074823, + "num_input_tokens_seen": 363098384, + "router_z_loss_mlp": 0.2310791, + "step": 4374, + "time_per_iteration": 2.721017599105835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070602, + "balance_loss_mlp": 1.04738045, + "epoch": 0.8416698730280877, + "flos": 573015140352.0, + "grad_norm": 0.06478850683255671, + "language_loss": 0.77759355, + "learning_rate": 6.431257092368336e-05, + "loss": 0.78829956, + "num_input_tokens_seen": 363170960, + "router_z_loss_mlp": 0.2322998, + "step": 4375, + "time_per_iteration": 2.6687657833099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069077, + "balance_loss_mlp": 1.04429305, + "epoch": 0.8418622547133513, + "flos": 758731940352.0, + "grad_norm": 0.06415120451411167, + "language_loss": 0.8027761, + "learning_rate": 6.415980729547543e-05, + "loss": 0.81346691, + "num_input_tokens_seen": 363242000, + "router_z_loss_mlp": 0.24768066, + "step": 4376, + "time_per_iteration": 2.8901031017303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066046, + "balance_loss_mlp": 1.04222751, + "epoch": 0.8420546363986149, + "flos": 1074156940800.0, + "grad_norm": 0.0640476481214236, + "language_loss": 0.72992682, + "learning_rate": 6.40072128754366e-05, + "loss": 0.74058723, + "num_input_tokens_seen": 363340288, + "router_z_loss_mlp": 0.23803711, + "step": 4377, + "time_per_iteration": 3.387580156326294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066874, + "balance_loss_mlp": 1.04272175, + "epoch": 0.8422470180838784, + "flos": 525908754432.0, + "grad_norm": 0.05787479724748053, + "language_loss": 0.82898366, + "learning_rate": 6.385478772280933e-05, + "loss": 0.8396523, + "num_input_tokens_seen": 363416208, + "router_z_loss_mlp": 0.24133301, + "step": 4378, + "time_per_iteration": 2.709947109222412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067691, + "balance_loss_mlp": 1.04393244, + "epoch": 0.842439399769142, + "flos": 600834060288.0, + "grad_norm": 0.0566504887729659, + "language_loss": 0.82355988, + "learning_rate": 6.370253189677038e-05, + "loss": 0.83423686, + "num_input_tokens_seen": 363492864, + "router_z_loss_mlp": 0.23754883, + "step": 4379, + "time_per_iteration": 2.740873098373413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066145, + "balance_loss_mlp": 1.04244626, + "epoch": 0.8426317814544055, + "flos": 552222987264.0, + "grad_norm": 0.07635289805941703, + "language_loss": 0.86640501, + "learning_rate": 6.355044545643073e-05, + "loss": 0.87706643, + "num_input_tokens_seen": 363572000, + "router_z_loss_mlp": 0.23669434, + "step": 4380, + "time_per_iteration": 2.8072140216827393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067646, + "balance_loss_mlp": 1.04449534, + "epoch": 0.8428241631396691, + "flos": 678832980480.0, + "grad_norm": 0.05882632867978376, + "language_loss": 0.78088343, + "learning_rate": 6.33985284608356e-05, + "loss": 0.79155988, + "num_input_tokens_seen": 363646480, + "router_z_loss_mlp": 0.23120117, + "step": 4381, + "time_per_iteration": 2.8028671741485596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068573, + "balance_loss_mlp": 1.04549408, + "epoch": 0.8430165448249327, + "flos": 753730748928.0, + "grad_norm": 0.0434369089484086, + "language_loss": 0.79954326, + "learning_rate": 6.324678096896435e-05, + "loss": 0.81022894, + "num_input_tokens_seen": 363737552, + "router_z_loss_mlp": 0.23046875, + "step": 4382, + "time_per_iteration": 3.0995900630950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070257, + "balance_loss_mlp": 1.0466063, + "epoch": 0.8432089265101962, + "flos": 699140574720.0, + "grad_norm": 0.05667250688060797, + "language_loss": 0.81071454, + "learning_rate": 6.30952030397306e-05, + "loss": 0.82141709, + "num_input_tokens_seen": 363816016, + "router_z_loss_mlp": 0.23657227, + "step": 4383, + "time_per_iteration": 2.8948311805725098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062746, + "balance_loss_mlp": 1.0390358, + "epoch": 0.8434013081954598, + "flos": 485767839744.0, + "grad_norm": 0.05498684151789612, + "language_loss": 0.84662998, + "learning_rate": 6.294379473198208e-05, + "loss": 0.85725743, + "num_input_tokens_seen": 363888192, + "router_z_loss_mlp": 0.23706055, + "step": 4384, + "time_per_iteration": 2.6783289909362793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069983, + "balance_loss_mlp": 1.04623687, + "epoch": 0.8435936898807234, + "flos": 520623811584.0, + "grad_norm": 0.060123898153637105, + "language_loss": 0.85518533, + "learning_rate": 6.279255610450068e-05, + "loss": 0.86588514, + "num_input_tokens_seen": 363953904, + "router_z_loss_mlp": 0.23718262, + "step": 4385, + "time_per_iteration": 2.6172537803649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070212, + "balance_loss_mlp": 1.04614377, + "epoch": 0.843786071565987, + "flos": 785945534976.0, + "grad_norm": 0.07135730466560408, + "language_loss": 0.80699778, + "learning_rate": 6.264148721600254e-05, + "loss": 0.81769991, + "num_input_tokens_seen": 364031552, + "router_z_loss_mlp": 0.24060059, + "step": 4386, + "time_per_iteration": 2.981780529022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011968, + "balance_loss_mlp": 1.00543571, + "epoch": 0.8439784532512504, + "flos": 1446278436864.0, + "grad_norm": 0.00922581639131475, + "language_loss": 0.75836509, + "learning_rate": 6.24905881251378e-05, + "loss": 0.76848477, + "num_input_tokens_seen": 364256480, + "router_z_loss_mlp": 0.06542969, + "step": 4387, + "time_per_iteration": 4.87928318977356 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067407, + "balance_loss_mlp": 1.04326701, + "epoch": 0.844170834936514, + "flos": 708700243968.0, + "grad_norm": 0.061285809051873204, + "language_loss": 0.82608801, + "learning_rate": 6.23398588904906e-05, + "loss": 0.83676207, + "num_input_tokens_seen": 364329696, + "router_z_loss_mlp": 0.24133301, + "step": 4388, + "time_per_iteration": 2.879666566848755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067269, + "balance_loss_mlp": 1.0429976, + "epoch": 0.8443632166217776, + "flos": 483428030976.0, + "grad_norm": 0.06596849430486998, + "language_loss": 0.79634511, + "learning_rate": 6.218929957057922e-05, + "loss": 0.8070178, + "num_input_tokens_seen": 364400944, + "router_z_loss_mlp": 0.24255371, + "step": 4389, + "time_per_iteration": 2.6863036155700684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070398, + "balance_loss_mlp": 1.04712808, + "epoch": 0.8445555983070412, + "flos": 678694588416.0, + "grad_norm": 0.06649132591646567, + "language_loss": 0.80325556, + "learning_rate": 6.2038910223856e-05, + "loss": 0.81395948, + "num_input_tokens_seen": 364475744, + "router_z_loss_mlp": 0.23266602, + "step": 4390, + "time_per_iteration": 2.848747968673706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068419, + "balance_loss_mlp": 1.04500639, + "epoch": 0.8447479799923048, + "flos": 741485477376.0, + "grad_norm": 0.05601705691022872, + "language_loss": 0.74468088, + "learning_rate": 6.18886909087073e-05, + "loss": 0.75536507, + "num_input_tokens_seen": 364557248, + "router_z_loss_mlp": 0.23388672, + "step": 4391, + "time_per_iteration": 2.977398633956909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064584, + "balance_loss_mlp": 1.04123068, + "epoch": 0.8449403616775683, + "flos": 953306537472.0, + "grad_norm": 0.05451644109037178, + "language_loss": 0.8061679, + "learning_rate": 6.173864168345344e-05, + "loss": 0.81681371, + "num_input_tokens_seen": 364647856, + "router_z_loss_mlp": 0.23339844, + "step": 4392, + "time_per_iteration": 3.253659725189209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068446, + "balance_loss_mlp": 1.04412758, + "epoch": 0.8451327433628318, + "flos": 657363921408.0, + "grad_norm": 0.10297975661606834, + "language_loss": 0.72206116, + "learning_rate": 6.158876260634871e-05, + "loss": 0.73274559, + "num_input_tokens_seen": 364728848, + "router_z_loss_mlp": 0.24328613, + "step": 4393, + "time_per_iteration": 2.8959717750549316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106765, + "balance_loss_mlp": 1.04457116, + "epoch": 0.8453251250480954, + "flos": 446113681920.0, + "grad_norm": 0.07045905469561214, + "language_loss": 0.83664286, + "learning_rate": 6.143905373558112e-05, + "loss": 0.84731936, + "num_input_tokens_seen": 364794032, + "router_z_loss_mlp": 0.23083496, + "step": 4394, + "time_per_iteration": 2.5246009826660156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068234, + "balance_loss_mlp": 1.04410589, + "epoch": 0.845517506733359, + "flos": 542767205376.0, + "grad_norm": 0.07052005068233351, + "language_loss": 0.71200818, + "learning_rate": 6.128951512927305e-05, + "loss": 0.72269052, + "num_input_tokens_seen": 364868624, + "router_z_loss_mlp": 0.24108887, + "step": 4395, + "time_per_iteration": 2.672424077987671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066143, + "balance_loss_mlp": 1.04233706, + "epoch": 0.8457098884186226, + "flos": 502440910848.0, + "grad_norm": 0.05507620442760256, + "language_loss": 0.84375072, + "learning_rate": 6.114014684548046e-05, + "loss": 0.85441208, + "num_input_tokens_seen": 364938208, + "router_z_loss_mlp": 0.23791504, + "step": 4396, + "time_per_iteration": 2.6126549243927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062805, + "balance_loss_mlp": 1.04011917, + "epoch": 0.8459022701038861, + "flos": 448893259776.0, + "grad_norm": 0.0721067921466401, + "language_loss": 0.79833293, + "learning_rate": 6.099094894219326e-05, + "loss": 0.80896091, + "num_input_tokens_seen": 365009440, + "router_z_loss_mlp": 0.22692871, + "step": 4397, + "time_per_iteration": 2.6915853023529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066096, + "balance_loss_mlp": 1.04310095, + "epoch": 0.8460946517891497, + "flos": 743178115584.0, + "grad_norm": 0.06092470430813476, + "language_loss": 0.75112307, + "learning_rate": 6.0841921477335194e-05, + "loss": 0.76178408, + "num_input_tokens_seen": 365085904, + "router_z_loss_mlp": 0.23010254, + "step": 4398, + "time_per_iteration": 2.9150443077087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066373, + "balance_loss_mlp": 1.0431869, + "epoch": 0.8462870334744133, + "flos": 553216324608.0, + "grad_norm": 0.05338597842317466, + "language_loss": 0.8020795, + "learning_rate": 6.069306450876389e-05, + "loss": 0.81274319, + "num_input_tokens_seen": 365163600, + "router_z_loss_mlp": 0.23193359, + "step": 4399, + "time_per_iteration": 2.750760316848755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007946, + "balance_loss_mlp": 1.00146127, + "epoch": 0.8464794151596768, + "flos": 1564877864448.0, + "grad_norm": 0.006476140912531384, + "language_loss": 0.81708568, + "learning_rate": 6.054437809427071e-05, + "loss": 0.82716513, + "num_input_tokens_seen": 365384528, + "router_z_loss_mlp": 0.06494141, + "step": 4400, + "time_per_iteration": 4.870280742645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064721, + "balance_loss_mlp": 1.04194021, + "epoch": 0.8466717968449403, + "flos": 550197038592.0, + "grad_norm": 0.05700520466186635, + "language_loss": 0.8022759, + "learning_rate": 6.039586229158084e-05, + "loss": 0.81292307, + "num_input_tokens_seen": 365453760, + "router_z_loss_mlp": 0.2277832, + "step": 4401, + "time_per_iteration": 2.6463844776153564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069345, + "balance_loss_mlp": 1.04520464, + "epoch": 0.8468641785302039, + "flos": 551919038976.0, + "grad_norm": 0.07274796217082238, + "language_loss": 0.85117292, + "learning_rate": 6.024751715835314e-05, + "loss": 0.86186635, + "num_input_tokens_seen": 365532416, + "router_z_loss_mlp": 0.24145508, + "step": 4402, + "time_per_iteration": 2.7529311180114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066722, + "balance_loss_mlp": 1.04289234, + "epoch": 0.8470565602154675, + "flos": 572671544832.0, + "grad_norm": 0.056307872604652406, + "language_loss": 0.87157613, + "learning_rate": 6.009934275218049e-05, + "loss": 0.88224334, + "num_input_tokens_seen": 365603776, + "router_z_loss_mlp": 0.23803711, + "step": 4403, + "time_per_iteration": 2.751302480697632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066248, + "balance_loss_mlp": 1.04265642, + "epoch": 0.8472489419007311, + "flos": 472833179136.0, + "grad_norm": 0.06396477003256777, + "language_loss": 0.83970821, + "learning_rate": 5.995133913058936e-05, + "loss": 0.85037065, + "num_input_tokens_seen": 365670432, + "router_z_loss_mlp": 0.23571777, + "step": 4404, + "time_per_iteration": 2.5374879837036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065348, + "balance_loss_mlp": 1.04250705, + "epoch": 0.8474413235859947, + "flos": 798020481024.0, + "grad_norm": 0.05899694759203786, + "language_loss": 0.80021083, + "learning_rate": 5.980350635103954e-05, + "loss": 0.81086433, + "num_input_tokens_seen": 365741584, + "router_z_loss_mlp": 0.22839355, + "step": 4405, + "time_per_iteration": 2.9600727558135986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106951, + "balance_loss_mlp": 1.04639554, + "epoch": 0.8476337052712581, + "flos": 502379241984.0, + "grad_norm": 0.06477364721529628, + "language_loss": 0.80659878, + "learning_rate": 5.9655844470924866e-05, + "loss": 0.81729388, + "num_input_tokens_seen": 365805344, + "router_z_loss_mlp": 0.23095703, + "step": 4406, + "time_per_iteration": 2.5580904483795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065117, + "balance_loss_mlp": 1.04226446, + "epoch": 0.8478260869565217, + "flos": 931971101184.0, + "grad_norm": 0.045408059542421414, + "language_loss": 0.83343709, + "learning_rate": 5.9508353547573e-05, + "loss": 0.84408826, + "num_input_tokens_seen": 365890976, + "router_z_loss_mlp": 0.22839355, + "step": 4407, + "time_per_iteration": 3.1972086429595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066424, + "balance_loss_mlp": 1.04317832, + "epoch": 0.8480184686417853, + "flos": 708811471872.0, + "grad_norm": 0.06839169803815855, + "language_loss": 0.80941093, + "learning_rate": 5.9361033638244855e-05, + "loss": 0.82007527, + "num_input_tokens_seen": 365968912, + "router_z_loss_mlp": 0.23242188, + "step": 4408, + "time_per_iteration": 2.857830762863159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063267, + "balance_loss_mlp": 1.04056954, + "epoch": 0.8482108503270489, + "flos": 614440857600.0, + "grad_norm": 0.048590366986801296, + "language_loss": 0.82484585, + "learning_rate": 5.9213884800135066e-05, + "loss": 0.83547854, + "num_input_tokens_seen": 366047680, + "router_z_loss_mlp": 0.22680664, + "step": 4409, + "time_per_iteration": 2.814972162246704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065801, + "balance_loss_mlp": 1.04220927, + "epoch": 0.8484032320123124, + "flos": 531016031232.0, + "grad_norm": 0.07117279722087601, + "language_loss": 0.82206416, + "learning_rate": 5.906690709037194e-05, + "loss": 0.83272225, + "num_input_tokens_seen": 366118720, + "router_z_loss_mlp": 0.23583984, + "step": 4410, + "time_per_iteration": 2.67344069480896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100989, + "balance_loss_mlp": 1.00340486, + "epoch": 0.848595613697576, + "flos": 1542776315904.0, + "grad_norm": 0.006407685796369742, + "language_loss": 0.76296914, + "learning_rate": 5.892010056601726e-05, + "loss": 0.77306801, + "num_input_tokens_seen": 366346928, + "router_z_loss_mlp": 0.06494141, + "step": 4411, + "time_per_iteration": 4.883302927017212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067859, + "balance_loss_mlp": 1.0438621, + "epoch": 0.8487879953828396, + "flos": 677342974464.0, + "grad_norm": 0.05787110114111616, + "language_loss": 0.74100566, + "learning_rate": 5.877346528406635e-05, + "loss": 0.75168431, + "num_input_tokens_seen": 366422848, + "router_z_loss_mlp": 0.23986816, + "step": 4412, + "time_per_iteration": 2.839329481124878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066525, + "balance_loss_mlp": 1.04281461, + "epoch": 0.8489803770681031, + "flos": 503673956352.0, + "grad_norm": 0.0786964917797223, + "language_loss": 0.79841238, + "learning_rate": 5.8627001301448105e-05, + "loss": 0.80907762, + "num_input_tokens_seen": 366492016, + "router_z_loss_mlp": 0.23693848, + "step": 4413, + "time_per_iteration": 2.590726137161255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106631, + "balance_loss_mlp": 1.04268241, + "epoch": 0.8491727587533667, + "flos": 563186027520.0, + "grad_norm": 0.06106365604230901, + "language_loss": 0.77250433, + "learning_rate": 5.84807086750247e-05, + "loss": 0.78316742, + "num_input_tokens_seen": 366566400, + "router_z_loss_mlp": 0.23620605, + "step": 4414, + "time_per_iteration": 2.7339437007904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062739, + "balance_loss_mlp": 1.03908801, + "epoch": 0.8493651404386302, + "flos": 459784719360.0, + "grad_norm": 0.061516297138754804, + "language_loss": 0.78073561, + "learning_rate": 5.833458746159243e-05, + "loss": 0.791363, + "num_input_tokens_seen": 366634016, + "router_z_loss_mlp": 0.2364502, + "step": 4415, + "time_per_iteration": 2.587347984313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066023, + "balance_loss_mlp": 1.04259825, + "epoch": 0.8495575221238938, + "flos": 461170838016.0, + "grad_norm": 0.06256364296979572, + "language_loss": 0.8204776, + "learning_rate": 5.818863771788013e-05, + "loss": 0.8311379, + "num_input_tokens_seen": 366704384, + "router_z_loss_mlp": 0.23413086, + "step": 4416, + "time_per_iteration": 2.6552000045776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065844, + "balance_loss_mlp": 1.04268169, + "epoch": 0.8497499038091574, + "flos": 870712063488.0, + "grad_norm": 0.05735024034400219, + "language_loss": 0.81449145, + "learning_rate": 5.8042859500550604e-05, + "loss": 0.82514989, + "num_input_tokens_seen": 366785456, + "router_z_loss_mlp": 0.23156738, + "step": 4417, + "time_per_iteration": 3.103299617767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106067, + "balance_loss_mlp": 1.03838944, + "epoch": 0.849942285494421, + "flos": 779600443392.0, + "grad_norm": 0.06271004199707587, + "language_loss": 0.78202587, + "learning_rate": 5.789725286620018e-05, + "loss": 0.79263258, + "num_input_tokens_seen": 366862848, + "router_z_loss_mlp": 0.22290039, + "step": 4418, + "time_per_iteration": 2.988527536392212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065221, + "balance_loss_mlp": 1.04155779, + "epoch": 0.8501346671796844, + "flos": 513816556032.0, + "grad_norm": 0.06478557665826518, + "language_loss": 0.84965581, + "learning_rate": 5.775181787135819e-05, + "loss": 0.86030805, + "num_input_tokens_seen": 366934800, + "router_z_loss_mlp": 0.2364502, + "step": 4419, + "time_per_iteration": 2.660832643508911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067372, + "balance_loss_mlp": 1.04471087, + "epoch": 0.850327048864948, + "flos": 621445602816.0, + "grad_norm": 0.05805875265489693, + "language_loss": 0.84014624, + "learning_rate": 5.76065545724877e-05, + "loss": 0.85081995, + "num_input_tokens_seen": 367015152, + "router_z_loss_mlp": 0.2265625, + "step": 4420, + "time_per_iteration": 2.8057973384857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062456, + "balance_loss_mlp": 1.03862631, + "epoch": 0.8505194305502116, + "flos": 774221524992.0, + "grad_norm": 0.07039622216102548, + "language_loss": 0.80085033, + "learning_rate": 5.746146302598454e-05, + "loss": 0.81147492, + "num_input_tokens_seen": 367092192, + "router_z_loss_mlp": 0.23815918, + "step": 4421, + "time_per_iteration": 2.996438980102539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065422, + "balance_loss_mlp": 1.04161644, + "epoch": 0.8507118122354752, + "flos": 465257613312.0, + "grad_norm": 0.05957352392179271, + "language_loss": 0.86582476, + "learning_rate": 5.731654328817859e-05, + "loss": 0.87647903, + "num_input_tokens_seen": 367159744, + "router_z_loss_mlp": 0.23803711, + "step": 4422, + "time_per_iteration": 2.5918169021606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067094, + "balance_loss_mlp": 1.04385972, + "epoch": 0.8509041939207388, + "flos": 534413417472.0, + "grad_norm": 0.06352427691563187, + "language_loss": 0.84973729, + "learning_rate": 5.717179541533257e-05, + "loss": 0.86040825, + "num_input_tokens_seen": 367226384, + "router_z_loss_mlp": 0.23217773, + "step": 4423, + "time_per_iteration": 2.619405508041382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106657, + "balance_loss_mlp": 1.04306161, + "epoch": 0.8510965756060023, + "flos": 583738472448.0, + "grad_norm": 0.06302067631207116, + "language_loss": 0.84753847, + "learning_rate": 5.702721946364264e-05, + "loss": 0.85820413, + "num_input_tokens_seen": 367294768, + "router_z_loss_mlp": 0.23474121, + "step": 4424, + "time_per_iteration": 2.722527027130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066143, + "balance_loss_mlp": 1.04253995, + "epoch": 0.8512889572912659, + "flos": 600841400832.0, + "grad_norm": 0.06546706844239303, + "language_loss": 0.77509212, + "learning_rate": 5.688281548923796e-05, + "loss": 0.78575361, + "num_input_tokens_seen": 367372368, + "router_z_loss_mlp": 0.23608398, + "step": 4425, + "time_per_iteration": 2.7466671466827393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064936, + "balance_loss_mlp": 1.04149961, + "epoch": 0.8514813389765294, + "flos": 654791745024.0, + "grad_norm": 0.06580778450289504, + "language_loss": 0.78853273, + "learning_rate": 5.673858354818151e-05, + "loss": 0.79918212, + "num_input_tokens_seen": 367452656, + "router_z_loss_mlp": 0.23461914, + "step": 4426, + "time_per_iteration": 2.895609140396118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061073, + "balance_loss_mlp": 1.03810143, + "epoch": 0.851673720661793, + "flos": 429761811456.0, + "grad_norm": 0.07272347775054663, + "language_loss": 0.7861743, + "learning_rate": 5.6594523696468726e-05, + "loss": 0.796785, + "num_input_tokens_seen": 367517808, + "router_z_loss_mlp": 0.22973633, + "step": 4427, + "time_per_iteration": 2.5801033973693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068835, + "balance_loss_mlp": 1.04494596, + "epoch": 0.8518661023470565, + "flos": 641572959744.0, + "grad_norm": 0.07439328255951466, + "language_loss": 0.79870641, + "learning_rate": 5.645063599002875e-05, + "loss": 0.80939472, + "num_input_tokens_seen": 367591728, + "router_z_loss_mlp": 0.2388916, + "step": 4428, + "time_per_iteration": 2.8004066944122314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067225, + "balance_loss_mlp": 1.04265594, + "epoch": 0.8520584840323201, + "flos": 562143504384.0, + "grad_norm": 0.06621520947267241, + "language_loss": 0.80070293, + "learning_rate": 5.630692048472363e-05, + "loss": 0.81137514, + "num_input_tokens_seen": 367664496, + "router_z_loss_mlp": 0.24572754, + "step": 4429, + "time_per_iteration": 2.6723525524139404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065488, + "balance_loss_mlp": 1.04238546, + "epoch": 0.8522508657175837, + "flos": 527050395648.0, + "grad_norm": 0.0643114416219169, + "language_loss": 0.78664088, + "learning_rate": 5.61633772363489e-05, + "loss": 0.79729569, + "num_input_tokens_seen": 367735584, + "router_z_loss_mlp": 0.23095703, + "step": 4430, + "time_per_iteration": 2.5892906188964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067303, + "balance_loss_mlp": 1.04400945, + "epoch": 0.8524432474028473, + "flos": 499120247808.0, + "grad_norm": 0.052370046731540595, + "language_loss": 0.80731219, + "learning_rate": 5.602000630063298e-05, + "loss": 0.81798524, + "num_input_tokens_seen": 367801136, + "router_z_loss_mlp": 0.23303223, + "step": 4431, + "time_per_iteration": 2.5757946968078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067125, + "balance_loss_mlp": 1.04282999, + "epoch": 0.8526356290881109, + "flos": 421314048000.0, + "grad_norm": 0.075289368546831, + "language_loss": 0.79962564, + "learning_rate": 5.587680773323706e-05, + "loss": 0.81029695, + "num_input_tokens_seen": 367865312, + "router_z_loss_mlp": 0.24291992, + "step": 4432, + "time_per_iteration": 2.5692286491394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065059, + "balance_loss_mlp": 1.04178977, + "epoch": 0.8528280107733743, + "flos": 507328303104.0, + "grad_norm": 0.06440459040466175, + "language_loss": 0.80769801, + "learning_rate": 5.5733781589756115e-05, + "loss": 0.81834859, + "num_input_tokens_seen": 367931104, + "router_z_loss_mlp": 0.23278809, + "step": 4433, + "time_per_iteration": 2.5663363933563232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068841, + "balance_loss_mlp": 1.04665649, + "epoch": 0.8530203924586379, + "flos": 445893797376.0, + "grad_norm": 0.07593628310052838, + "language_loss": 0.83613151, + "learning_rate": 5.5590927925717684e-05, + "loss": 0.84682, + "num_input_tokens_seen": 367995520, + "router_z_loss_mlp": 0.22167969, + "step": 4434, + "time_per_iteration": 2.5368459224700928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066621, + "balance_loss_mlp": 1.04304099, + "epoch": 0.8532127741439015, + "flos": 657759273984.0, + "grad_norm": 0.06368291451038398, + "language_loss": 0.83650082, + "learning_rate": 5.54482467965825e-05, + "loss": 0.84716707, + "num_input_tokens_seen": 368073664, + "router_z_loss_mlp": 0.23571777, + "step": 4435, + "time_per_iteration": 2.8207342624664307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065551, + "balance_loss_mlp": 1.04325891, + "epoch": 0.8534051558291651, + "flos": 536019420672.0, + "grad_norm": 0.059557208389798, + "language_loss": 0.83486569, + "learning_rate": 5.5305738257744264e-05, + "loss": 0.84552121, + "num_input_tokens_seen": 368147536, + "router_z_loss_mlp": 0.22302246, + "step": 4436, + "time_per_iteration": 2.703518867492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070809, + "balance_loss_mlp": 1.0480994, + "epoch": 0.8535975375144286, + "flos": 533000134656.0, + "grad_norm": 0.07770512635670898, + "language_loss": 0.79491788, + "learning_rate": 5.5163402364529655e-05, + "loss": 0.80562592, + "num_input_tokens_seen": 368218672, + "router_z_loss_mlp": 0.22705078, + "step": 4437, + "time_per_iteration": 2.640202283859253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066415, + "balance_loss_mlp": 1.04366946, + "epoch": 0.8537899191996922, + "flos": 574141727232.0, + "grad_norm": 0.07771991144134452, + "language_loss": 0.82565296, + "learning_rate": 5.502123917219848e-05, + "loss": 0.83631706, + "num_input_tokens_seen": 368287056, + "router_z_loss_mlp": 0.22729492, + "step": 4438, + "time_per_iteration": 2.6925954818725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067393, + "balance_loss_mlp": 1.0444448, + "epoch": 0.8539823008849557, + "flos": 465007993344.0, + "grad_norm": 0.05861374859344016, + "language_loss": 0.8332969, + "learning_rate": 5.48792487359433e-05, + "loss": 0.84397078, + "num_input_tokens_seen": 368358400, + "router_z_loss_mlp": 0.22924805, + "step": 4439, + "time_per_iteration": 2.65185809135437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067106, + "balance_loss_mlp": 1.04427707, + "epoch": 0.8541746825702193, + "flos": 554713671168.0, + "grad_norm": 0.05951750277824149, + "language_loss": 0.81889856, + "learning_rate": 5.4737431110889745e-05, + "loss": 0.8295697, + "num_input_tokens_seen": 368427168, + "router_z_loss_mlp": 0.22839355, + "step": 4440, + "time_per_iteration": 2.674394130706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067771, + "balance_loss_mlp": 1.04465663, + "epoch": 0.8543670642554829, + "flos": 546391816704.0, + "grad_norm": 0.06592019340949207, + "language_loss": 0.77447701, + "learning_rate": 5.4595786352096165e-05, + "loss": 0.7851547, + "num_input_tokens_seen": 368503584, + "router_z_loss_mlp": 0.2310791, + "step": 4441, + "time_per_iteration": 2.7579543590545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066142, + "balance_loss_mlp": 1.04333735, + "epoch": 0.8545594459407464, + "flos": 512027744256.0, + "grad_norm": 0.055550756440976644, + "language_loss": 0.82326615, + "learning_rate": 5.4454314514554236e-05, + "loss": 0.83392757, + "num_input_tokens_seen": 368576976, + "router_z_loss_mlp": 0.22802734, + "step": 4442, + "time_per_iteration": 2.6374173164367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069636, + "balance_loss_mlp": 1.04568636, + "epoch": 0.85475182762601, + "flos": 421185567744.0, + "grad_norm": 0.062356608880068685, + "language_loss": 0.81928778, + "learning_rate": 5.431301565318786e-05, + "loss": 0.82998419, + "num_input_tokens_seen": 368641664, + "router_z_loss_mlp": 0.23950195, + "step": 4443, + "time_per_iteration": 2.493833065032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072435, + "balance_loss_mlp": 1.04957128, + "epoch": 0.8549442093112736, + "flos": 389435516928.0, + "grad_norm": 0.08995749678251017, + "language_loss": 0.7806946, + "learning_rate": 5.41718898228542e-05, + "loss": 0.79141891, + "num_input_tokens_seen": 368705616, + "router_z_loss_mlp": 0.22851562, + "step": 4444, + "time_per_iteration": 2.4674642086029053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064962, + "balance_loss_mlp": 1.04145384, + "epoch": 0.8551365909965372, + "flos": 605926282752.0, + "grad_norm": 0.06096774879214106, + "language_loss": 0.79480731, + "learning_rate": 5.403093707834334e-05, + "loss": 0.805457, + "num_input_tokens_seen": 368779664, + "router_z_loss_mlp": 0.23510742, + "step": 4445, + "time_per_iteration": 2.797175884246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066081, + "balance_loss_mlp": 1.0426091, + "epoch": 0.8553289726818007, + "flos": 504160713216.0, + "grad_norm": 0.054681566669178006, + "language_loss": 0.79001647, + "learning_rate": 5.3890157474377865e-05, + "loss": 0.8006773, + "num_input_tokens_seen": 368846656, + "router_z_loss_mlp": 0.23449707, + "step": 4446, + "time_per_iteration": 2.5630698204040527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070243, + "balance_loss_mlp": 1.04688966, + "epoch": 0.8555213543670642, + "flos": 557009063424.0, + "grad_norm": 0.05883770467526881, + "language_loss": 0.76065564, + "learning_rate": 5.374955106561324e-05, + "loss": 0.77135801, + "num_input_tokens_seen": 368923712, + "router_z_loss_mlp": 0.23339844, + "step": 4447, + "time_per_iteration": 2.7396435737609863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072768, + "balance_loss_mlp": 1.04790044, + "epoch": 0.8557137360523278, + "flos": 548104278528.0, + "grad_norm": 0.05625744975285459, + "language_loss": 0.74883693, + "learning_rate": 5.360911790663775e-05, + "loss": 0.75956464, + "num_input_tokens_seen": 368994496, + "router_z_loss_mlp": 0.24865723, + "step": 4448, + "time_per_iteration": 2.6279850006103516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069234, + "balance_loss_mlp": 1.0455358, + "epoch": 0.8559061177375914, + "flos": 728182628352.0, + "grad_norm": 0.06704266405669905, + "language_loss": 0.78576261, + "learning_rate": 5.346885805197238e-05, + "loss": 0.79645491, + "num_input_tokens_seen": 369077088, + "router_z_loss_mlp": 0.23693848, + "step": 4449, + "time_per_iteration": 2.98538875579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067263, + "balance_loss_mlp": 1.04454207, + "epoch": 0.856098499422855, + "flos": 535881028608.0, + "grad_norm": 0.07982199666480248, + "language_loss": 0.83324075, + "learning_rate": 5.332877155607085e-05, + "loss": 0.84391338, + "num_input_tokens_seen": 369147680, + "router_z_loss_mlp": 0.22705078, + "step": 4450, + "time_per_iteration": 2.7068963050842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072598, + "balance_loss_mlp": 1.0485177, + "epoch": 0.8562908811081185, + "flos": 573664882176.0, + "grad_norm": 0.05521375375553453, + "language_loss": 0.83632553, + "learning_rate": 5.3188858473319504e-05, + "loss": 0.8470515, + "num_input_tokens_seen": 369224320, + "router_z_loss_mlp": 0.24072266, + "step": 4451, + "time_per_iteration": 2.6884765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066185, + "balance_loss_mlp": 1.0432018, + "epoch": 0.856483262793382, + "flos": 781754872320.0, + "grad_norm": 0.06327105945437797, + "language_loss": 0.80799401, + "learning_rate": 5.3049118858037426e-05, + "loss": 0.81865585, + "num_input_tokens_seen": 369315744, + "router_z_loss_mlp": 0.22973633, + "step": 4452, + "time_per_iteration": 3.093397617340088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065123, + "balance_loss_mlp": 1.04212737, + "epoch": 0.8566756444786456, + "flos": 455819083776.0, + "grad_norm": 0.04933064645771812, + "language_loss": 0.8479045, + "learning_rate": 5.290955276447651e-05, + "loss": 0.85855579, + "num_input_tokens_seen": 369382800, + "router_z_loss_mlp": 0.22998047, + "step": 4453, + "time_per_iteration": 2.57102370262146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064523, + "balance_loss_mlp": 1.04082465, + "epoch": 0.8568680261639092, + "flos": 449382587904.0, + "grad_norm": 0.06346100260124406, + "language_loss": 0.84574735, + "learning_rate": 5.277016024682091e-05, + "loss": 0.85639256, + "num_input_tokens_seen": 369447312, + "router_z_loss_mlp": 0.23681641, + "step": 4454, + "time_per_iteration": 2.5162353515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066613, + "balance_loss_mlp": 1.04390395, + "epoch": 0.8570604078491728, + "flos": 479976316416.0, + "grad_norm": 0.065712336297566, + "language_loss": 0.8286593, + "learning_rate": 5.2630941359187665e-05, + "loss": 0.83932549, + "num_input_tokens_seen": 369512800, + "router_z_loss_mlp": 0.22705078, + "step": 4455, + "time_per_iteration": 2.533682346343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069011, + "balance_loss_mlp": 1.04520488, + "epoch": 0.8572527895344363, + "flos": 505942184448.0, + "grad_norm": 0.06083249202811438, + "language_loss": 0.85139161, + "learning_rate": 5.249189615562627e-05, + "loss": 0.86208177, + "num_input_tokens_seen": 369580720, + "router_z_loss_mlp": 0.23815918, + "step": 4456, + "time_per_iteration": 2.5975639820098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067778, + "balance_loss_mlp": 1.04509306, + "epoch": 0.8574451712196999, + "flos": 787044957696.0, + "grad_norm": 0.05429388196694402, + "language_loss": 0.83300918, + "learning_rate": 5.235302469011905e-05, + "loss": 0.84368694, + "num_input_tokens_seen": 369672544, + "router_z_loss_mlp": 0.22692871, + "step": 4457, + "time_per_iteration": 3.022726535797119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061605, + "balance_loss_mlp": 1.03793049, + "epoch": 0.8576375529049635, + "flos": 509252935680.0, + "grad_norm": 0.05416309124653123, + "language_loss": 0.7547797, + "learning_rate": 5.2214327016580575e-05, + "loss": 0.76539564, + "num_input_tokens_seen": 369745776, + "router_z_loss_mlp": 0.23657227, + "step": 4458, + "time_per_iteration": 2.6720643043518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101799, + "balance_loss_mlp": 1.01150465, + "epoch": 0.857829934590227, + "flos": 1460772486144.0, + "grad_norm": 0.012361752759178701, + "language_loss": 0.84767288, + "learning_rate": 5.207580318885802e-05, + "loss": 0.85785276, + "num_input_tokens_seen": 369975200, + "router_z_loss_mlp": 0.06494141, + "step": 4459, + "time_per_iteration": 4.948476314544678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067215, + "balance_loss_mlp": 1.04449415, + "epoch": 0.8580223162754905, + "flos": 479296839168.0, + "grad_norm": 0.06039783526010156, + "language_loss": 0.89551371, + "learning_rate": 5.193745326073118e-05, + "loss": 0.90618587, + "num_input_tokens_seen": 370043296, + "router_z_loss_mlp": 0.22741699, + "step": 4460, + "time_per_iteration": 2.6287126541137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068846, + "balance_loss_mlp": 1.04555237, + "epoch": 0.8582146979607541, + "flos": 706231954944.0, + "grad_norm": 0.06717737486032459, + "language_loss": 0.7925657, + "learning_rate": 5.179927728591227e-05, + "loss": 0.80325413, + "num_input_tokens_seen": 370111152, + "router_z_loss_mlp": 0.23291016, + "step": 4461, + "time_per_iteration": 2.8171298503875732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106575, + "balance_loss_mlp": 1.04228938, + "epoch": 0.8584070796460177, + "flos": 765158524416.0, + "grad_norm": 0.05415728496526175, + "language_loss": 0.8288449, + "learning_rate": 5.1661275318045874e-05, + "loss": 0.83950245, + "num_input_tokens_seen": 370190272, + "router_z_loss_mlp": 0.23449707, + "step": 4462, + "time_per_iteration": 2.9669747352600098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072466, + "balance_loss_mlp": 1.04925561, + "epoch": 0.8585994613312813, + "flos": 586829339136.0, + "grad_norm": 0.05497147450516782, + "language_loss": 0.85761333, + "learning_rate": 5.152344741070919e-05, + "loss": 0.86833793, + "num_input_tokens_seen": 370267056, + "router_z_loss_mlp": 0.23193359, + "step": 4463, + "time_per_iteration": 2.7663209438323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064609, + "balance_loss_mlp": 1.04201901, + "epoch": 0.8587918430165449, + "flos": 608295826944.0, + "grad_norm": 0.05208267140887148, + "language_loss": 0.78707975, + "learning_rate": 5.138579361741169e-05, + "loss": 0.79772592, + "num_input_tokens_seen": 370344176, + "router_z_loss_mlp": 0.22583008, + "step": 4464, + "time_per_iteration": 2.7761847972869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065329, + "balance_loss_mlp": 1.0422976, + "epoch": 0.8589842247018084, + "flos": 588981570048.0, + "grad_norm": 0.06400063819482826, + "language_loss": 0.81218016, + "learning_rate": 5.124831399159535e-05, + "loss": 0.82283336, + "num_input_tokens_seen": 370414224, + "router_z_loss_mlp": 0.23034668, + "step": 4465, + "time_per_iteration": 2.7178432941436768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069409, + "balance_loss_mlp": 1.04617512, + "epoch": 0.8591766063870719, + "flos": 543879111168.0, + "grad_norm": 0.07725520700778, + "language_loss": 0.78933436, + "learning_rate": 5.1111008586634475e-05, + "loss": 0.80002844, + "num_input_tokens_seen": 370484736, + "router_z_loss_mlp": 0.2322998, + "step": 4466, + "time_per_iteration": 2.6696112155914307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070192, + "balance_loss_mlp": 1.04685104, + "epoch": 0.8593689880723355, + "flos": 493756010496.0, + "grad_norm": 0.059887817655499394, + "language_loss": 0.80918819, + "learning_rate": 5.0973877455835816e-05, + "loss": 0.81989014, + "num_input_tokens_seen": 370556512, + "router_z_loss_mlp": 0.2331543, + "step": 4467, + "time_per_iteration": 2.6542410850524902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067848, + "balance_loss_mlp": 1.04422045, + "epoch": 0.8595613697575991, + "flos": 533909408256.0, + "grad_norm": 0.058922665395114725, + "language_loss": 0.83588117, + "learning_rate": 5.083692065243822e-05, + "loss": 0.84655964, + "num_input_tokens_seen": 370622880, + "router_z_loss_mlp": 0.23608398, + "step": 4468, + "time_per_iteration": 2.603156805038452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070297, + "balance_loss_mlp": 1.04666996, + "epoch": 0.8597537514428626, + "flos": 617628271104.0, + "grad_norm": 0.06453385462922813, + "language_loss": 0.75768328, + "learning_rate": 5.070013822961328e-05, + "loss": 0.76838624, + "num_input_tokens_seen": 370691632, + "router_z_loss_mlp": 0.23620605, + "step": 4469, + "time_per_iteration": 2.7231431007385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065548, + "balance_loss_mlp": 1.04199243, + "epoch": 0.8599461331281262, + "flos": 608730826752.0, + "grad_norm": 0.07204075811726149, + "language_loss": 0.83518052, + "learning_rate": 5.056353024046462e-05, + "loss": 0.84583598, + "num_input_tokens_seen": 370764848, + "router_z_loss_mlp": 0.23547363, + "step": 4470, + "time_per_iteration": 2.717693328857422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068389, + "balance_loss_mlp": 1.04436803, + "epoch": 0.8601385148133898, + "flos": 551252044800.0, + "grad_norm": 0.06516194685738681, + "language_loss": 0.83412385, + "learning_rate": 5.042709673802786e-05, + "loss": 0.84480774, + "num_input_tokens_seen": 370832496, + "router_z_loss_mlp": 0.2401123, + "step": 4471, + "time_per_iteration": 2.649050712585449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065346, + "balance_loss_mlp": 1.04213572, + "epoch": 0.8603308964986534, + "flos": 581200800768.0, + "grad_norm": 0.045536307661976354, + "language_loss": 0.81605756, + "learning_rate": 5.0290837775271494e-05, + "loss": 0.82671106, + "num_input_tokens_seen": 370917104, + "router_z_loss_mlp": 0.23205566, + "step": 4472, + "time_per_iteration": 2.82643985748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106475, + "balance_loss_mlp": 1.04182613, + "epoch": 0.8605232781839169, + "flos": 629013828096.0, + "grad_norm": 0.06591962589053263, + "language_loss": 0.75219733, + "learning_rate": 5.0154753405095846e-05, + "loss": 0.7628448, + "num_input_tokens_seen": 370984512, + "router_z_loss_mlp": 0.22912598, + "step": 4473, + "time_per_iteration": 2.810159206390381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064778, + "balance_loss_mlp": 1.04146099, + "epoch": 0.8607156598691804, + "flos": 468141078528.0, + "grad_norm": 0.06408761909970127, + "language_loss": 0.77287406, + "learning_rate": 5.0018843680333604e-05, + "loss": 0.78352183, + "num_input_tokens_seen": 371049664, + "router_z_loss_mlp": 0.23291016, + "step": 4474, + "time_per_iteration": 2.5086567401885986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065868, + "balance_loss_mlp": 1.04340887, + "epoch": 0.860908041554444, + "flos": 488394344448.0, + "grad_norm": 0.05383413396861342, + "language_loss": 0.82797289, + "learning_rate": 4.988310865374945e-05, + "loss": 0.83863151, + "num_input_tokens_seen": 371120704, + "router_z_loss_mlp": 0.22436523, + "step": 4475, + "time_per_iteration": 2.624570608139038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107092, + "balance_loss_mlp": 1.04825842, + "epoch": 0.8611004232397076, + "flos": 592094831616.0, + "grad_norm": 0.06546574950256999, + "language_loss": 0.80683541, + "learning_rate": 4.974754837804057e-05, + "loss": 0.81754458, + "num_input_tokens_seen": 371189376, + "router_z_loss_mlp": 0.2265625, + "step": 4476, + "time_per_iteration": 2.663764476776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070396, + "balance_loss_mlp": 1.04612505, + "epoch": 0.8612928049249712, + "flos": 774209041920.0, + "grad_norm": 0.07263779863645746, + "language_loss": 0.86455739, + "learning_rate": 4.9612162905836036e-05, + "loss": 0.87526137, + "num_input_tokens_seen": 371275184, + "router_z_loss_mlp": 0.24255371, + "step": 4477, + "time_per_iteration": 3.027892589569092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073125, + "balance_loss_mlp": 1.04973626, + "epoch": 0.8614851866102347, + "flos": 537553843200.0, + "grad_norm": 0.061884066626690507, + "language_loss": 0.82510734, + "learning_rate": 4.947695228969718e-05, + "loss": 0.83583856, + "num_input_tokens_seen": 371347920, + "router_z_loss_mlp": 0.23388672, + "step": 4478, + "time_per_iteration": 2.6411685943603516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066901, + "balance_loss_mlp": 1.04391742, + "epoch": 0.8616775682954982, + "flos": 565916419584.0, + "grad_norm": 0.06082112959712949, + "language_loss": 0.79257238, + "learning_rate": 4.934191658211729e-05, + "loss": 0.80324137, + "num_input_tokens_seen": 371419728, + "router_z_loss_mlp": 0.2298584, + "step": 4479, + "time_per_iteration": 2.668093681335449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070465, + "balance_loss_mlp": 1.04654002, + "epoch": 0.8618699499807618, + "flos": 481592231424.0, + "grad_norm": 0.06361737630648884, + "language_loss": 0.81400502, + "learning_rate": 4.92070558355221e-05, + "loss": 0.82470965, + "num_input_tokens_seen": 371488768, + "router_z_loss_mlp": 0.23937988, + "step": 4480, + "time_per_iteration": 2.5984365940093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069896, + "balance_loss_mlp": 1.04482687, + "epoch": 0.8620623316660254, + "flos": 649506802176.0, + "grad_norm": 0.07470953092140498, + "language_loss": 0.74230057, + "learning_rate": 4.9072370102269226e-05, + "loss": 0.75299954, + "num_input_tokens_seen": 371560144, + "router_z_loss_mlp": 0.25073242, + "step": 4481, + "time_per_iteration": 2.8063738346099854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067634, + "balance_loss_mlp": 1.0446502, + "epoch": 0.862254713351289, + "flos": 751781523456.0, + "grad_norm": 0.07440108509254313, + "language_loss": 0.85886705, + "learning_rate": 4.893785943464801e-05, + "loss": 0.86954337, + "num_input_tokens_seen": 371635920, + "router_z_loss_mlp": 0.2298584, + "step": 4482, + "time_per_iteration": 2.9675238132476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069198, + "balance_loss_mlp": 1.04586911, + "epoch": 0.8624470950365525, + "flos": 841543727616.0, + "grad_norm": 0.0531227577741629, + "language_loss": 0.78093559, + "learning_rate": 4.880352388488024e-05, + "loss": 0.79162753, + "num_input_tokens_seen": 371727664, + "router_z_loss_mlp": 0.23303223, + "step": 4483, + "time_per_iteration": 3.2197213172912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070235, + "balance_loss_mlp": 1.04753709, + "epoch": 0.8626394767218161, + "flos": 754793468928.0, + "grad_norm": 0.06843329806115567, + "language_loss": 0.83035022, + "learning_rate": 4.866936350511969e-05, + "loss": 0.84105253, + "num_input_tokens_seen": 371800832, + "router_z_loss_mlp": 0.22705078, + "step": 4484, + "time_per_iteration": 2.9174022674560547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065694, + "balance_loss_mlp": 1.04180419, + "epoch": 0.8628318584070797, + "flos": 703585626624.0, + "grad_norm": 0.05765697174756403, + "language_loss": 0.82464355, + "learning_rate": 4.853537834745203e-05, + "loss": 0.83530051, + "num_input_tokens_seen": 371871472, + "router_z_loss_mlp": 0.23876953, + "step": 4485, + "time_per_iteration": 2.8672118186950684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072261, + "balance_loss_mlp": 1.04927754, + "epoch": 0.8630242400923432, + "flos": 471244428288.0, + "grad_norm": 0.061061631924157964, + "language_loss": 0.78096372, + "learning_rate": 4.840156846389487e-05, + "loss": 0.7916863, + "num_input_tokens_seen": 371936512, + "router_z_loss_mlp": 0.22973633, + "step": 4486, + "time_per_iteration": 2.559305429458618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068405, + "balance_loss_mlp": 1.04474235, + "epoch": 0.8632166217776067, + "flos": 964363553280.0, + "grad_norm": 0.06474745761042122, + "language_loss": 0.77492851, + "learning_rate": 4.826793390639783e-05, + "loss": 0.78561258, + "num_input_tokens_seen": 372018032, + "router_z_loss_mlp": 0.23620605, + "step": 4487, + "time_per_iteration": 3.1866891384124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067313, + "balance_loss_mlp": 1.04441333, + "epoch": 0.8634090034628703, + "flos": 767913509376.0, + "grad_norm": 0.07806786642802178, + "language_loss": 0.7887038, + "learning_rate": 4.813447472684246e-05, + "loss": 0.79937685, + "num_input_tokens_seen": 372092176, + "router_z_loss_mlp": 0.22900391, + "step": 4488, + "time_per_iteration": 2.9155101776123047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070862, + "balance_loss_mlp": 1.04682934, + "epoch": 0.8636013851481339, + "flos": 520591504896.0, + "grad_norm": 0.06399781254677851, + "language_loss": 0.8339777, + "learning_rate": 4.800119097704214e-05, + "loss": 0.84468627, + "num_input_tokens_seen": 372166880, + "router_z_loss_mlp": 0.24035645, + "step": 4489, + "time_per_iteration": 2.7343907356262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073021, + "balance_loss_mlp": 1.04947686, + "epoch": 0.8637937668333975, + "flos": 632144342016.0, + "grad_norm": 0.06306322296515987, + "language_loss": 0.80910051, + "learning_rate": 4.7868082708742324e-05, + "loss": 0.81983078, + "num_input_tokens_seen": 372234608, + "router_z_loss_mlp": 0.23547363, + "step": 4490, + "time_per_iteration": 2.7213432788848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063802, + "balance_loss_mlp": 1.04212999, + "epoch": 0.8639861485186611, + "flos": 856094676480.0, + "grad_norm": 0.0662889748017123, + "language_loss": 0.76684427, + "learning_rate": 4.773514997362e-05, + "loss": 0.77748227, + "num_input_tokens_seen": 372314704, + "router_z_loss_mlp": 0.21679688, + "step": 4491, + "time_per_iteration": 3.1134567260742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107113, + "balance_loss_mlp": 1.04876614, + "epoch": 0.8641785302039245, + "flos": 481261118976.0, + "grad_norm": 0.061534938742705755, + "language_loss": 0.77915752, + "learning_rate": 4.7602392823284605e-05, + "loss": 0.78986883, + "num_input_tokens_seen": 372374848, + "router_z_loss_mlp": 0.22375488, + "step": 4492, + "time_per_iteration": 2.5515682697296143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076479, + "balance_loss_mlp": 1.05334055, + "epoch": 0.8643709118891881, + "flos": 504637558272.0, + "grad_norm": 0.056633924458127455, + "language_loss": 0.80623692, + "learning_rate": 4.746981130927675e-05, + "loss": 0.8170017, + "num_input_tokens_seen": 372442432, + "router_z_loss_mlp": 0.23120117, + "step": 4493, + "time_per_iteration": 2.587090015411377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066718, + "balance_loss_mlp": 1.04368663, + "epoch": 0.8645632935744517, + "flos": 552368719872.0, + "grad_norm": 0.061248751366218074, + "language_loss": 0.82094586, + "learning_rate": 4.733740548306908e-05, + "loss": 0.831613, + "num_input_tokens_seen": 372520048, + "router_z_loss_mlp": 0.23034668, + "step": 4494, + "time_per_iteration": 2.8737690448760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070355, + "balance_loss_mlp": 1.04671621, + "epoch": 0.8647556752597153, + "flos": 524737751040.0, + "grad_norm": 0.05950385437466058, + "language_loss": 0.84496897, + "learning_rate": 4.7205175396066336e-05, + "loss": 0.85567254, + "num_input_tokens_seen": 372587968, + "router_z_loss_mlp": 0.23632812, + "step": 4495, + "time_per_iteration": 2.56300687789917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067913, + "balance_loss_mlp": 1.04403555, + "epoch": 0.8649480569449788, + "flos": 787768851456.0, + "grad_norm": 0.05732436588564149, + "language_loss": 0.82358348, + "learning_rate": 4.707312109960471e-05, + "loss": 0.83426261, + "num_input_tokens_seen": 372672544, + "router_z_loss_mlp": 0.23876953, + "step": 4496, + "time_per_iteration": 3.0807862281799316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067, + "balance_loss_mlp": 1.04376626, + "epoch": 0.8651404386302424, + "flos": 763863810048.0, + "grad_norm": 0.05411692865877634, + "language_loss": 0.76783019, + "learning_rate": 4.694124264495225e-05, + "loss": 0.7785002, + "num_input_tokens_seen": 372751296, + "router_z_loss_mlp": 0.23254395, + "step": 4497, + "time_per_iteration": 3.0178675651550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069027, + "balance_loss_mlp": 1.04600811, + "epoch": 0.865332820315506, + "flos": 539893651968.0, + "grad_norm": 0.06388786819462051, + "language_loss": 0.82005155, + "learning_rate": 4.680954008330851e-05, + "loss": 0.83074188, + "num_input_tokens_seen": 372825264, + "router_z_loss_mlp": 0.23010254, + "step": 4498, + "time_per_iteration": 2.7146639823913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018384, + "balance_loss_mlp": 1.01185119, + "epoch": 0.8655252020007695, + "flos": 1476632830464.0, + "grad_norm": 0.008099303478645539, + "language_loss": 0.79174447, + "learning_rate": 4.667801346580519e-05, + "loss": 0.80192828, + "num_input_tokens_seen": 373052000, + "router_z_loss_mlp": 0.06542969, + "step": 4499, + "time_per_iteration": 4.753086090087891 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107094, + "balance_loss_mlp": 1.04775345, + "epoch": 0.8657175836860331, + "flos": 517369586688.0, + "grad_norm": 0.053545746789140966, + "language_loss": 0.82837707, + "learning_rate": 4.6546662843505396e-05, + "loss": 0.83908641, + "num_input_tokens_seen": 373124128, + "router_z_loss_mlp": 0.23168945, + "step": 4500, + "time_per_iteration": 2.6927261352539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070408, + "balance_loss_mlp": 1.04632783, + "epoch": 0.8659099653712966, + "flos": 590523333120.0, + "grad_norm": 0.0655677470147481, + "language_loss": 0.80042905, + "learning_rate": 4.641548826740394e-05, + "loss": 0.81113315, + "num_input_tokens_seen": 373195472, + "router_z_loss_mlp": 0.24072266, + "step": 4501, + "time_per_iteration": 2.672914505004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071603, + "balance_loss_mlp": 1.04906011, + "epoch": 0.8661023470565602, + "flos": 590449181184.0, + "grad_norm": 0.06647391146116884, + "language_loss": 0.88215441, + "learning_rate": 4.628448978842731e-05, + "loss": 0.89287043, + "num_input_tokens_seen": 373273504, + "router_z_loss_mlp": 0.22546387, + "step": 4502, + "time_per_iteration": 2.8460209369659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068468, + "balance_loss_mlp": 1.04535317, + "epoch": 0.8662947287418238, + "flos": 567670726656.0, + "grad_norm": 0.06408588375541317, + "language_loss": 0.79417884, + "learning_rate": 4.61536674574336e-05, + "loss": 0.80486345, + "num_input_tokens_seen": 373346032, + "router_z_loss_mlp": 0.2310791, + "step": 4503, + "time_per_iteration": 2.7376766204833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066927, + "balance_loss_mlp": 1.04433703, + "epoch": 0.8664871104270874, + "flos": 515929139712.0, + "grad_norm": 0.04956343701508282, + "language_loss": 0.82059669, + "learning_rate": 4.6023021325212636e-05, + "loss": 0.83126605, + "num_input_tokens_seen": 373419968, + "router_z_loss_mlp": 0.22595215, + "step": 4504, + "time_per_iteration": 2.7517826557159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074981, + "balance_loss_mlp": 1.05140173, + "epoch": 0.866679492112351, + "flos": 557263452672.0, + "grad_norm": 0.0593924195885579, + "language_loss": 0.78475362, + "learning_rate": 4.589255144248561e-05, + "loss": 0.79550344, + "num_input_tokens_seen": 373502448, + "router_z_loss_mlp": 0.2355957, + "step": 4505, + "time_per_iteration": 2.7980902194976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068297, + "balance_loss_mlp": 1.04519379, + "epoch": 0.8668718737976144, + "flos": 722448004608.0, + "grad_norm": 0.07401296676865261, + "language_loss": 0.82072198, + "learning_rate": 4.57622578599054e-05, + "loss": 0.83140492, + "num_input_tokens_seen": 373581184, + "router_z_loss_mlp": 0.2310791, + "step": 4506, + "time_per_iteration": 2.901148796081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070825, + "balance_loss_mlp": 1.046983, + "epoch": 0.867064255482878, + "flos": 600705580032.0, + "grad_norm": 0.06824743782303751, + "language_loss": 0.84235609, + "learning_rate": 4.5632140628056705e-05, + "loss": 0.8530643, + "num_input_tokens_seen": 373652272, + "router_z_loss_mlp": 0.23840332, + "step": 4507, + "time_per_iteration": 2.712239980697632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067376, + "balance_loss_mlp": 1.04275918, + "epoch": 0.8672566371681416, + "flos": 803527879680.0, + "grad_norm": 0.05761881771366499, + "language_loss": 0.76407146, + "learning_rate": 4.550219979745529e-05, + "loss": 0.77474517, + "num_input_tokens_seen": 373734896, + "router_z_loss_mlp": 0.24621582, + "step": 4508, + "time_per_iteration": 3.0358455181121826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070674, + "balance_loss_mlp": 1.04858494, + "epoch": 0.8674490188534052, + "flos": 627368177664.0, + "grad_norm": 0.05601064601352948, + "language_loss": 0.83846825, + "learning_rate": 4.5372435418548905e-05, + "loss": 0.84917504, + "num_input_tokens_seen": 373806960, + "router_z_loss_mlp": 0.22094727, + "step": 4509, + "time_per_iteration": 2.733057737350464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073461, + "balance_loss_mlp": 1.05034614, + "epoch": 0.8676414005386687, + "flos": 727831692288.0, + "grad_norm": 0.05491630490316898, + "language_loss": 0.86686462, + "learning_rate": 4.524284754171615e-05, + "loss": 0.87759924, + "num_input_tokens_seen": 373888352, + "router_z_loss_mlp": 0.2310791, + "step": 4510, + "time_per_iteration": 2.9853105545043945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067246, + "balance_loss_mlp": 1.0446918, + "epoch": 0.8678337822239323, + "flos": 539972573184.0, + "grad_norm": 0.06374912494206549, + "language_loss": 0.80658293, + "learning_rate": 4.5113436217267765e-05, + "loss": 0.81725538, + "num_input_tokens_seen": 373962112, + "router_z_loss_mlp": 0.2253418, + "step": 4511, + "time_per_iteration": 2.75742244720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073846, + "balance_loss_mlp": 1.05014682, + "epoch": 0.8680261639091958, + "flos": 507521023488.0, + "grad_norm": 0.07370272998017972, + "language_loss": 0.79475594, + "learning_rate": 4.4984201495445744e-05, + "loss": 0.80549443, + "num_input_tokens_seen": 374028256, + "router_z_loss_mlp": 0.23706055, + "step": 4512, + "time_per_iteration": 2.5551512241363525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073024, + "balance_loss_mlp": 1.05068421, + "epoch": 0.8682185455944594, + "flos": 487126794240.0, + "grad_norm": 0.05959366195168138, + "language_loss": 0.81097651, + "learning_rate": 4.4855143426423275e-05, + "loss": 0.82170677, + "num_input_tokens_seen": 374100080, + "router_z_loss_mlp": 0.22338867, + "step": 4513, + "time_per_iteration": 2.6309256553649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064992, + "balance_loss_mlp": 1.0413053, + "epoch": 0.868410927279723, + "flos": 603690361344.0, + "grad_norm": 0.08611526780858492, + "language_loss": 0.81241572, + "learning_rate": 4.472626206030528e-05, + "loss": 0.82306564, + "num_input_tokens_seen": 374174368, + "router_z_loss_mlp": 0.23669434, + "step": 4514, + "time_per_iteration": 2.753772258758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065128, + "balance_loss_mlp": 1.04249024, + "epoch": 0.8686033089649865, + "flos": 1118985186816.0, + "grad_norm": 0.06244049051454504, + "language_loss": 0.85057306, + "learning_rate": 4.4597557447127846e-05, + "loss": 0.86122435, + "num_input_tokens_seen": 374257328, + "router_z_loss_mlp": 0.22631836, + "step": 4515, + "time_per_iteration": 3.37809157371521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071275, + "balance_loss_mlp": 1.04789805, + "epoch": 0.8687956906502501, + "flos": 568019091456.0, + "grad_norm": 0.07169275987782167, + "language_loss": 0.84147042, + "learning_rate": 4.446902963685862e-05, + "loss": 0.85218316, + "num_input_tokens_seen": 374327936, + "router_z_loss_mlp": 0.23364258, + "step": 4516, + "time_per_iteration": 2.7013230323791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072826, + "balance_loss_mlp": 1.04969954, + "epoch": 0.8689880723355137, + "flos": 544338703872.0, + "grad_norm": 0.05707773988768033, + "language_loss": 0.84542006, + "learning_rate": 4.4340678679396454e-05, + "loss": 0.8561483, + "num_input_tokens_seen": 374400496, + "router_z_loss_mlp": 0.23132324, + "step": 4517, + "time_per_iteration": 2.680288553237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064511, + "balance_loss_mlp": 1.04248095, + "epoch": 0.8691804540207773, + "flos": 457425086976.0, + "grad_norm": 0.054403419993017434, + "language_loss": 0.86638057, + "learning_rate": 4.4212504624571495e-05, + "loss": 0.8770256, + "num_input_tokens_seen": 374470528, + "router_z_loss_mlp": 0.22045898, + "step": 4518, + "time_per_iteration": 2.600592851638794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072363, + "balance_loss_mlp": 1.04929602, + "epoch": 0.8693728357060407, + "flos": 591872375808.0, + "grad_norm": 0.10977029070372525, + "language_loss": 0.80449891, + "learning_rate": 4.40845075221456e-05, + "loss": 0.81522256, + "num_input_tokens_seen": 374542656, + "router_z_loss_mlp": 0.23046875, + "step": 4519, + "time_per_iteration": 2.6959917545318604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068235, + "balance_loss_mlp": 1.04550183, + "epoch": 0.8695652173913043, + "flos": 680263515648.0, + "grad_norm": 0.07063102668534314, + "language_loss": 0.79929483, + "learning_rate": 4.395668742181164e-05, + "loss": 0.80997729, + "num_input_tokens_seen": 374617232, + "router_z_loss_mlp": 0.22741699, + "step": 4520, + "time_per_iteration": 2.875837564468384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072228, + "balance_loss_mlp": 1.04933965, + "epoch": 0.8697575990765679, + "flos": 492362551296.0, + "grad_norm": 0.05846160693861355, + "language_loss": 0.78477466, + "learning_rate": 4.38290443731934e-05, + "loss": 0.79549694, + "num_input_tokens_seen": 374681888, + "router_z_loss_mlp": 0.22875977, + "step": 4521, + "time_per_iteration": 2.58884334564209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071876, + "balance_loss_mlp": 1.04863048, + "epoch": 0.8699499807618315, + "flos": 526949079552.0, + "grad_norm": 0.06228511759625967, + "language_loss": 0.81880158, + "learning_rate": 4.370157842584671e-05, + "loss": 0.82952034, + "num_input_tokens_seen": 374750464, + "router_z_loss_mlp": 0.2322998, + "step": 4522, + "time_per_iteration": 2.6632633209228516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070463, + "balance_loss_mlp": 1.04691923, + "epoch": 0.8701423624470951, + "flos": 814342616064.0, + "grad_norm": 0.061067995662179124, + "language_loss": 0.80669498, + "learning_rate": 4.357428962925808e-05, + "loss": 0.81739956, + "num_input_tokens_seen": 374836064, + "router_z_loss_mlp": 0.23547363, + "step": 4523, + "time_per_iteration": 3.109477996826172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069602, + "balance_loss_mlp": 1.04596305, + "epoch": 0.8703347441323586, + "flos": 556789178880.0, + "grad_norm": 0.05389586568616872, + "language_loss": 0.88408816, + "learning_rate": 4.344717803284542e-05, + "loss": 0.89478421, + "num_input_tokens_seen": 374903392, + "router_z_loss_mlp": 0.2364502, + "step": 4524, + "time_per_iteration": 2.649951934814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067785, + "balance_loss_mlp": 1.04521906, + "epoch": 0.8705271258176221, + "flos": 585443220480.0, + "grad_norm": 0.050832171139039956, + "language_loss": 0.84775817, + "learning_rate": 4.3320243685957825e-05, + "loss": 0.85843599, + "num_input_tokens_seen": 374985904, + "router_z_loss_mlp": 0.22558594, + "step": 4525, + "time_per_iteration": 2.793938398361206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074171, + "balance_loss_mlp": 1.05107999, + "epoch": 0.8707195075028857, + "flos": 669216411648.0, + "grad_norm": 0.05409510643219343, + "language_loss": 0.85498273, + "learning_rate": 4.3193486637875536e-05, + "loss": 0.86572444, + "num_input_tokens_seen": 375062992, + "router_z_loss_mlp": 0.23071289, + "step": 4526, + "time_per_iteration": 2.8929967880249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072832, + "balance_loss_mlp": 1.04955089, + "epoch": 0.8709118891881493, + "flos": 520391443968.0, + "grad_norm": 0.05925337750259882, + "language_loss": 0.83831525, + "learning_rate": 4.306690693781007e-05, + "loss": 0.84904361, + "num_input_tokens_seen": 375139296, + "router_z_loss_mlp": 0.23242188, + "step": 4527, + "time_per_iteration": 2.7890868186950684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073321, + "balance_loss_mlp": 1.04992008, + "epoch": 0.8711042708734128, + "flos": 553208984064.0, + "grad_norm": 0.0613716870727963, + "language_loss": 0.81508851, + "learning_rate": 4.294050463490401e-05, + "loss": 0.8258217, + "num_input_tokens_seen": 375206576, + "router_z_loss_mlp": 0.23376465, + "step": 4528, + "time_per_iteration": 2.6644904613494873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071211, + "balance_loss_mlp": 1.04809642, + "epoch": 0.8712966525586764, + "flos": 502193862144.0, + "grad_norm": 0.06354318579498781, + "language_loss": 0.82647032, + "learning_rate": 4.281427977823094e-05, + "loss": 0.8371824, + "num_input_tokens_seen": 375279008, + "router_z_loss_mlp": 0.23095703, + "step": 4529, + "time_per_iteration": 2.7259349822998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072084, + "balance_loss_mlp": 1.04900551, + "epoch": 0.87148903424394, + "flos": 804096129024.0, + "grad_norm": 0.07176062030792234, + "language_loss": 0.73956883, + "learning_rate": 4.268823241679593e-05, + "loss": 0.75028968, + "num_input_tokens_seen": 375368512, + "router_z_loss_mlp": 0.23071289, + "step": 4530, + "time_per_iteration": 3.0482122898101807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066396, + "balance_loss_mlp": 1.04405594, + "epoch": 0.8716814159292036, + "flos": 773438160384.0, + "grad_norm": 0.05324576047171836, + "language_loss": 0.86157966, + "learning_rate": 4.256236259953489e-05, + "loss": 0.87224358, + "num_input_tokens_seen": 375450528, + "router_z_loss_mlp": 0.2232666, + "step": 4531, + "time_per_iteration": 3.0000054836273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073767, + "balance_loss_mlp": 1.05127263, + "epoch": 0.8718737976144671, + "flos": 486835329024.0, + "grad_norm": 0.0693772936852786, + "language_loss": 0.85297459, + "learning_rate": 4.243667037531468e-05, + "loss": 0.86371231, + "num_input_tokens_seen": 375518256, + "router_z_loss_mlp": 0.22509766, + "step": 4532, + "time_per_iteration": 2.5742595195770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066232, + "balance_loss_mlp": 1.04420245, + "epoch": 0.8720661792997306, + "flos": 584123913216.0, + "grad_norm": 0.05401834066976159, + "language_loss": 0.78774154, + "learning_rate": 4.2311155792933264e-05, + "loss": 0.79840392, + "num_input_tokens_seen": 375588112, + "router_z_loss_mlp": 0.22021484, + "step": 4533, + "time_per_iteration": 2.7111940383911133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012669, + "balance_loss_mlp": 1.00599337, + "epoch": 0.8722585609849942, + "flos": 1495942318080.0, + "grad_norm": 0.007267099236894943, + "language_loss": 0.80966806, + "learning_rate": 4.2185818901119946e-05, + "loss": 0.81979477, + "num_input_tokens_seen": 375814496, + "router_z_loss_mlp": 0.06689453, + "step": 4534, + "time_per_iteration": 4.824821472167969 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066979, + "balance_loss_mlp": 1.04374468, + "epoch": 0.8724509426702578, + "flos": 596169123840.0, + "grad_norm": 0.08694763070174955, + "language_loss": 0.87074769, + "learning_rate": 4.206065974853479e-05, + "loss": 0.88141751, + "num_input_tokens_seen": 375885440, + "router_z_loss_mlp": 0.23242188, + "step": 4535, + "time_per_iteration": 2.7852017879486084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073329, + "balance_loss_mlp": 1.04880726, + "epoch": 0.8726433243555214, + "flos": 443635481088.0, + "grad_norm": 0.05794421174501006, + "language_loss": 0.81177545, + "learning_rate": 4.193567838376888e-05, + "loss": 0.82250875, + "num_input_tokens_seen": 375952640, + "router_z_loss_mlp": 0.24499512, + "step": 4536, + "time_per_iteration": 2.5551156997680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065938, + "balance_loss_mlp": 1.04312158, + "epoch": 0.8728357060407849, + "flos": 553181819904.0, + "grad_norm": 0.06646899623951459, + "language_loss": 0.82300961, + "learning_rate": 4.181087485534402e-05, + "loss": 0.83366895, + "num_input_tokens_seen": 376021648, + "router_z_loss_mlp": 0.22827148, + "step": 4537, + "time_per_iteration": 2.639230728149414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066126, + "balance_loss_mlp": 1.04342866, + "epoch": 0.8730280877260485, + "flos": 627807946752.0, + "grad_norm": 0.06707141042951141, + "language_loss": 0.79122996, + "learning_rate": 4.16862492117136e-05, + "loss": 0.80189127, + "num_input_tokens_seen": 376102304, + "router_z_loss_mlp": 0.22692871, + "step": 4538, + "time_per_iteration": 2.806157350540161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064927, + "balance_loss_mlp": 1.04149008, + "epoch": 0.873220469411312, + "flos": 535384359936.0, + "grad_norm": 0.06758172406923339, + "language_loss": 0.80222809, + "learning_rate": 4.156180150126143e-05, + "loss": 0.81287742, + "num_input_tokens_seen": 376177072, + "router_z_loss_mlp": 0.23425293, + "step": 4539, + "time_per_iteration": 2.721412420272827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069845, + "balance_loss_mlp": 1.04654002, + "epoch": 0.8734128510965756, + "flos": 561883972608.0, + "grad_norm": 0.057519719081089375, + "language_loss": 0.84056413, + "learning_rate": 4.143753177230242e-05, + "loss": 0.85126257, + "num_input_tokens_seen": 376251376, + "router_z_loss_mlp": 0.2331543, + "step": 4540, + "time_per_iteration": 2.706616163253784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067834, + "balance_loss_mlp": 1.04486275, + "epoch": 0.8736052327818392, + "flos": 686467643904.0, + "grad_norm": 0.06653511461765922, + "language_loss": 0.79649061, + "learning_rate": 4.131344007308224e-05, + "loss": 0.80716896, + "num_input_tokens_seen": 376337104, + "router_z_loss_mlp": 0.22973633, + "step": 4541, + "time_per_iteration": 2.9499471187591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066457, + "balance_loss_mlp": 1.04360437, + "epoch": 0.8737976144671027, + "flos": 531673113600.0, + "grad_norm": 0.06068280649961363, + "language_loss": 0.81656998, + "learning_rate": 4.1189526451777816e-05, + "loss": 0.82723451, + "num_input_tokens_seen": 376415456, + "router_z_loss_mlp": 0.22875977, + "step": 4542, + "time_per_iteration": 2.791944980621338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070959, + "balance_loss_mlp": 1.04743886, + "epoch": 0.8739899961523663, + "flos": 575592086016.0, + "grad_norm": 0.05207946464598878, + "language_loss": 0.82126415, + "learning_rate": 4.106579095649649e-05, + "loss": 0.83197367, + "num_input_tokens_seen": 376494880, + "router_z_loss_mlp": 0.23510742, + "step": 4543, + "time_per_iteration": 2.881683826446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069601, + "balance_loss_mlp": 1.0456636, + "epoch": 0.8741823778376299, + "flos": 731332965888.0, + "grad_norm": 0.06114890089008731, + "language_loss": 0.76864976, + "learning_rate": 4.094223363527666e-05, + "loss": 0.77934569, + "num_input_tokens_seen": 376571760, + "router_z_loss_mlp": 0.23937988, + "step": 4544, + "time_per_iteration": 2.8996529579162598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067897, + "balance_loss_mlp": 1.04386425, + "epoch": 0.8743747595228935, + "flos": 567080082432.0, + "grad_norm": 0.05969360232616191, + "language_loss": 0.8386988, + "learning_rate": 4.081885453608747e-05, + "loss": 0.84937775, + "num_input_tokens_seen": 376644464, + "router_z_loss_mlp": 0.2401123, + "step": 4545, + "time_per_iteration": 2.7387166023254395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067094, + "balance_loss_mlp": 1.04331172, + "epoch": 0.8745671412081569, + "flos": 493370569728.0, + "grad_norm": 0.056432657681556246, + "language_loss": 0.82131648, + "learning_rate": 4.0695653706829095e-05, + "loss": 0.83198744, + "num_input_tokens_seen": 376709584, + "router_z_loss_mlp": 0.23779297, + "step": 4546, + "time_per_iteration": 2.575007915496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106455, + "balance_loss_mlp": 1.04134023, + "epoch": 0.8747595228934205, + "flos": 524139766272.0, + "grad_norm": 0.05353066559861675, + "language_loss": 0.83669919, + "learning_rate": 4.057263119533233e-05, + "loss": 0.8473447, + "num_input_tokens_seen": 376779472, + "router_z_loss_mlp": 0.23193359, + "step": 4547, + "time_per_iteration": 2.627749443054199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065846, + "balance_loss_mlp": 1.04327965, + "epoch": 0.8749519045786841, + "flos": 744349118976.0, + "grad_norm": 0.0643040936183351, + "language_loss": 0.80280411, + "learning_rate": 4.044978704935853e-05, + "loss": 0.81346262, + "num_input_tokens_seen": 376863408, + "router_z_loss_mlp": 0.22570801, + "step": 4548, + "time_per_iteration": 3.0364036560058594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075075, + "balance_loss_mlp": 1.05203199, + "epoch": 0.8751442862639477, + "flos": 594278995968.0, + "grad_norm": 0.06542874520616514, + "language_loss": 0.80029333, + "learning_rate": 4.032712131660027e-05, + "loss": 0.81104398, + "num_input_tokens_seen": 376942080, + "router_z_loss_mlp": 0.23034668, + "step": 4549, + "time_per_iteration": 2.8610854148864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067604, + "balance_loss_mlp": 1.04379821, + "epoch": 0.8753366679492113, + "flos": 496530819072.0, + "grad_norm": 0.0491879081167887, + "language_loss": 0.78780919, + "learning_rate": 4.020463404468055e-05, + "loss": 0.79848522, + "num_input_tokens_seen": 377015696, + "router_z_loss_mlp": 0.23791504, + "step": 4550, + "time_per_iteration": 2.724942684173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067747, + "balance_loss_mlp": 1.04468012, + "epoch": 0.8755290496344748, + "flos": 489864526848.0, + "grad_norm": 0.07218852967777867, + "language_loss": 0.82172048, + "learning_rate": 4.0082325281153074e-05, + "loss": 0.832398, + "num_input_tokens_seen": 377081424, + "router_z_loss_mlp": 0.23059082, + "step": 4551, + "time_per_iteration": 2.6174588203430176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069001, + "balance_loss_mlp": 1.04489684, + "epoch": 0.8757214313197383, + "flos": 591859892736.0, + "grad_norm": 0.06050960233679645, + "language_loss": 0.81907594, + "learning_rate": 3.9960195073502345e-05, + "loss": 0.82976592, + "num_input_tokens_seen": 377159360, + "router_z_loss_mlp": 0.24108887, + "step": 4552, + "time_per_iteration": 2.8340346813201904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065165, + "balance_loss_mlp": 1.04184747, + "epoch": 0.8759138130050019, + "flos": 976843763712.0, + "grad_norm": 0.06903868945856485, + "language_loss": 0.78443825, + "learning_rate": 3.9838243469143555e-05, + "loss": 0.7950899, + "num_input_tokens_seen": 377240704, + "router_z_loss_mlp": 0.2331543, + "step": 4553, + "time_per_iteration": 3.2752091884613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066678, + "balance_loss_mlp": 1.04353917, + "epoch": 0.8761061946902655, + "flos": 802764338688.0, + "grad_norm": 0.05543328693118016, + "language_loss": 0.78072476, + "learning_rate": 3.971647051542243e-05, + "loss": 0.79139149, + "num_input_tokens_seen": 377324176, + "router_z_loss_mlp": 0.23132324, + "step": 4554, + "time_per_iteration": 3.1207494735717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106636, + "balance_loss_mlp": 1.04254174, + "epoch": 0.8762985763755291, + "flos": 698495602176.0, + "grad_norm": 0.07396612314810898, + "language_loss": 0.74972981, + "learning_rate": 3.95948762596155e-05, + "loss": 0.76039344, + "num_input_tokens_seen": 377403440, + "router_z_loss_mlp": 0.23815918, + "step": 4555, + "time_per_iteration": 2.987738847732544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066573, + "balance_loss_mlp": 1.04325533, + "epoch": 0.8764909580607926, + "flos": 629717898240.0, + "grad_norm": 0.06305038969435892, + "language_loss": 0.80471361, + "learning_rate": 3.9473460748929765e-05, + "loss": 0.81537932, + "num_input_tokens_seen": 377483440, + "router_z_loss_mlp": 0.2331543, + "step": 4556, + "time_per_iteration": 2.846748113632202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071271, + "balance_loss_mlp": 1.04776287, + "epoch": 0.8766833397460562, + "flos": 481545243648.0, + "grad_norm": 0.054215636219797275, + "language_loss": 0.80461884, + "learning_rate": 3.935222403050304e-05, + "loss": 0.81533158, + "num_input_tokens_seen": 377554688, + "router_z_loss_mlp": 0.23498535, + "step": 4557, + "time_per_iteration": 2.6457924842834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070243, + "balance_loss_mlp": 1.04678226, + "epoch": 0.8768757214313198, + "flos": 407734414848.0, + "grad_norm": 0.0647505040176177, + "language_loss": 0.78280514, + "learning_rate": 3.923116615140354e-05, + "loss": 0.79350758, + "num_input_tokens_seen": 377617616, + "router_z_loss_mlp": 0.23461914, + "step": 4558, + "time_per_iteration": 2.488900899887085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069964, + "balance_loss_mlp": 1.04682493, + "epoch": 0.8770681031165833, + "flos": 582582150144.0, + "grad_norm": 0.09809770329517786, + "language_loss": 0.82181263, + "learning_rate": 3.9110287158630076e-05, + "loss": 0.8325122, + "num_input_tokens_seen": 377685888, + "router_z_loss_mlp": 0.23132324, + "step": 4559, + "time_per_iteration": 2.706878185272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069373, + "balance_loss_mlp": 1.04493523, + "epoch": 0.8772604848018468, + "flos": 508687257600.0, + "grad_norm": 0.06124524384081237, + "language_loss": 0.80961919, + "learning_rate": 3.8989587099111875e-05, + "loss": 0.82031298, + "num_input_tokens_seen": 377755744, + "router_z_loss_mlp": 0.2442627, + "step": 4560, + "time_per_iteration": 2.6287667751312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010685, + "balance_loss_mlp": 1.04520679, + "epoch": 0.8774528664871104, + "flos": 408836408832.0, + "grad_norm": 0.06993098532938485, + "language_loss": 0.85462463, + "learning_rate": 3.886906601970913e-05, + "loss": 0.8653096, + "num_input_tokens_seen": 377818880, + "router_z_loss_mlp": 0.23254395, + "step": 4561, + "time_per_iteration": 2.4868052005767822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064436, + "balance_loss_mlp": 1.04136872, + "epoch": 0.877645248172374, + "flos": 500844819456.0, + "grad_norm": 0.047628659589740864, + "language_loss": 0.8361448, + "learning_rate": 3.8748723967212184e-05, + "loss": 0.84678912, + "num_input_tokens_seen": 377893280, + "router_z_loss_mlp": 0.23046875, + "step": 4562, + "time_per_iteration": 2.6149418354034424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068204, + "balance_loss_mlp": 1.04525661, + "epoch": 0.8778376298576376, + "flos": 633145019904.0, + "grad_norm": 0.05889169926682073, + "language_loss": 0.78304517, + "learning_rate": 3.862856098834189e-05, + "loss": 0.79372722, + "num_input_tokens_seen": 377972912, + "router_z_loss_mlp": 0.22912598, + "step": 4563, + "time_per_iteration": 2.857564687728882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068144, + "balance_loss_mlp": 1.04482722, + "epoch": 0.8780300115429012, + "flos": 533988329472.0, + "grad_norm": 0.062171034291002715, + "language_loss": 0.8044073, + "learning_rate": 3.850857712974976e-05, + "loss": 0.81508875, + "num_input_tokens_seen": 378054000, + "router_z_loss_mlp": 0.2331543, + "step": 4564, + "time_per_iteration": 2.798351526260376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064203, + "balance_loss_mlp": 1.04216146, + "epoch": 0.8782223932281646, + "flos": 511662127104.0, + "grad_norm": 0.052130218938398365, + "language_loss": 0.77531612, + "learning_rate": 3.838877243801758e-05, + "loss": 0.78595817, + "num_input_tokens_seen": 378120336, + "router_z_loss_mlp": 0.22058105, + "step": 4565, + "time_per_iteration": 2.5758793354034424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064441, + "balance_loss_mlp": 1.04098022, + "epoch": 0.8784147749134282, + "flos": 780714547200.0, + "grad_norm": 0.06308413249676079, + "language_loss": 0.70176268, + "learning_rate": 3.826914695965766e-05, + "loss": 0.71240711, + "num_input_tokens_seen": 378216672, + "router_z_loss_mlp": 0.23449707, + "step": 4566, + "time_per_iteration": 3.148693084716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072907, + "balance_loss_mlp": 1.04900551, + "epoch": 0.8786071565986918, + "flos": 561004434432.0, + "grad_norm": 0.06612384532340969, + "language_loss": 0.75962007, + "learning_rate": 3.814970074111279e-05, + "loss": 0.77034914, + "num_input_tokens_seen": 378287536, + "router_z_loss_mlp": 0.2388916, + "step": 4567, + "time_per_iteration": 2.6695103645324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066088, + "balance_loss_mlp": 1.0432117, + "epoch": 0.8787995382839554, + "flos": 603448081920.0, + "grad_norm": 0.05080752532897636, + "language_loss": 0.77430034, + "learning_rate": 3.8030433828755926e-05, + "loss": 0.78496122, + "num_input_tokens_seen": 378362128, + "router_z_loss_mlp": 0.2286377, + "step": 4568, + "time_per_iteration": 2.8336360454559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010713, + "balance_loss_mlp": 1.04786336, + "epoch": 0.8789919199692189, + "flos": 560233552896.0, + "grad_norm": 0.04760794267274833, + "language_loss": 0.85342216, + "learning_rate": 3.7911346268890924e-05, + "loss": 0.86413515, + "num_input_tokens_seen": 378435696, + "router_z_loss_mlp": 0.23425293, + "step": 4569, + "time_per_iteration": 2.661607027053833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068074, + "balance_loss_mlp": 1.04515028, + "epoch": 0.8791843016544825, + "flos": 539115429888.0, + "grad_norm": 0.07652247848761115, + "language_loss": 0.82203466, + "learning_rate": 3.7792438107751405e-05, + "loss": 0.83271539, + "num_input_tokens_seen": 378505664, + "router_z_loss_mlp": 0.22912598, + "step": 4570, + "time_per_iteration": 2.638333797454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066012, + "balance_loss_mlp": 1.04247975, + "epoch": 0.8793766833397461, + "flos": 1008699899904.0, + "grad_norm": 0.060157796139736874, + "language_loss": 0.79238844, + "learning_rate": 3.767370939150167e-05, + "loss": 0.80304855, + "num_input_tokens_seen": 378598016, + "router_z_loss_mlp": 0.23522949, + "step": 4571, + "time_per_iteration": 3.3326447010040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068902, + "balance_loss_mlp": 1.04619253, + "epoch": 0.8795690650250096, + "flos": 678637688832.0, + "grad_norm": 0.06297064041998576, + "language_loss": 0.81085575, + "learning_rate": 3.755516016623628e-05, + "loss": 0.82154483, + "num_input_tokens_seen": 378676176, + "router_z_loss_mlp": 0.22705078, + "step": 4572, + "time_per_iteration": 2.864213466644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064824, + "balance_loss_mlp": 1.04248405, + "epoch": 0.8797614467102732, + "flos": 453432287232.0, + "grad_norm": 0.07526696415959916, + "language_loss": 0.88917291, + "learning_rate": 3.7436790477980157e-05, + "loss": 0.8998211, + "num_input_tokens_seen": 378737952, + "router_z_loss_mlp": 0.22338867, + "step": 4573, + "time_per_iteration": 2.5274643898010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061759, + "balance_loss_mlp": 1.03906155, + "epoch": 0.8799538283955367, + "flos": 550913591808.0, + "grad_norm": 0.05173001406565122, + "language_loss": 0.84684122, + "learning_rate": 3.7318600372688526e-05, + "loss": 0.85745883, + "num_input_tokens_seen": 378806704, + "router_z_loss_mlp": 0.22692871, + "step": 4574, + "time_per_iteration": 2.658022880554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067289, + "balance_loss_mlp": 1.04462719, + "epoch": 0.8801462100808003, + "flos": 807429275136.0, + "grad_norm": 0.06273590895888136, + "language_loss": 0.84730029, + "learning_rate": 3.720058989624681e-05, + "loss": 0.85797316, + "num_input_tokens_seen": 378887616, + "router_z_loss_mlp": 0.2265625, + "step": 4575, + "time_per_iteration": 3.0476410388946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066559, + "balance_loss_mlp": 1.04321766, + "epoch": 0.8803385917660639, + "flos": 768694302720.0, + "grad_norm": 0.06194945709094393, + "language_loss": 0.84745121, + "learning_rate": 3.708275909447079e-05, + "loss": 0.85811675, + "num_input_tokens_seen": 378964656, + "router_z_loss_mlp": 0.23327637, + "step": 4576, + "time_per_iteration": 2.9632747173309326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106577, + "balance_loss_mlp": 1.04261923, + "epoch": 0.8805309734513275, + "flos": 567339614208.0, + "grad_norm": 0.05151265510881933, + "language_loss": 0.81348932, + "learning_rate": 3.696510801310632e-05, + "loss": 0.82414699, + "num_input_tokens_seen": 379036752, + "router_z_loss_mlp": 0.23156738, + "step": 4577, + "time_per_iteration": 2.8133304119110107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063951, + "balance_loss_mlp": 1.04054976, + "epoch": 0.880723355136591, + "flos": 679779330048.0, + "grad_norm": 0.05836612347560367, + "language_loss": 0.81520814, + "learning_rate": 3.6847636697829755e-05, + "loss": 0.82584763, + "num_input_tokens_seen": 379106480, + "router_z_loss_mlp": 0.23388672, + "step": 4578, + "time_per_iteration": 2.8322298526763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065318, + "balance_loss_mlp": 1.04195356, + "epoch": 0.8809157368218545, + "flos": 565629723648.0, + "grad_norm": 0.05785232001619173, + "language_loss": 0.79189932, + "learning_rate": 3.673034519424734e-05, + "loss": 0.80255246, + "num_input_tokens_seen": 379182544, + "router_z_loss_mlp": 0.23339844, + "step": 4579, + "time_per_iteration": 2.752981424331665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065349, + "balance_loss_mlp": 1.04259169, + "epoch": 0.8811081185071181, + "flos": 515407878144.0, + "grad_norm": 0.05306189241878955, + "language_loss": 0.7616868, + "learning_rate": 3.661323354789586e-05, + "loss": 0.7723403, + "num_input_tokens_seen": 379255856, + "router_z_loss_mlp": 0.22741699, + "step": 4580, + "time_per_iteration": 2.6905887126922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071852, + "balance_loss_mlp": 1.04929709, + "epoch": 0.8813005001923817, + "flos": 594343236096.0, + "grad_norm": 0.08666583481538859, + "language_loss": 0.81318676, + "learning_rate": 3.649630180424191e-05, + "loss": 0.82390535, + "num_input_tokens_seen": 379322704, + "router_z_loss_mlp": 0.22558594, + "step": 4581, + "time_per_iteration": 2.717012405395508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063192, + "balance_loss_mlp": 1.04038763, + "epoch": 0.8814928818776453, + "flos": 666940843008.0, + "grad_norm": 0.065573570347403, + "language_loss": 0.79248452, + "learning_rate": 3.637955000868254e-05, + "loss": 0.80311644, + "num_input_tokens_seen": 379395008, + "router_z_loss_mlp": 0.22802734, + "step": 4582, + "time_per_iteration": 2.8327713012695312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061504, + "balance_loss_mlp": 1.03867507, + "epoch": 0.8816852635629088, + "flos": 609153343488.0, + "grad_norm": 0.06392391603853627, + "language_loss": 0.85894233, + "learning_rate": 3.626297820654467e-05, + "loss": 0.86955738, + "num_input_tokens_seen": 379465824, + "router_z_loss_mlp": 0.22814941, + "step": 4583, + "time_per_iteration": 2.738196611404419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062285, + "balance_loss_mlp": 1.03962302, + "epoch": 0.8818776452481724, + "flos": 480379009536.0, + "grad_norm": 0.05949601361971884, + "language_loss": 0.82285428, + "learning_rate": 3.614658644308572e-05, + "loss": 0.83347714, + "num_input_tokens_seen": 379534960, + "router_z_loss_mlp": 0.2265625, + "step": 4584, + "time_per_iteration": 2.578489303588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072275, + "balance_loss_mlp": 1.04852891, + "epoch": 0.882070026933436, + "flos": 1045394242560.0, + "grad_norm": 0.062197672953105604, + "language_loss": 0.73814642, + "learning_rate": 3.60303747634928e-05, + "loss": 0.74886918, + "num_input_tokens_seen": 379617456, + "router_z_loss_mlp": 0.23742676, + "step": 4585, + "time_per_iteration": 3.293839931488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063064, + "balance_loss_mlp": 1.03912687, + "epoch": 0.8822624086186995, + "flos": 474409446912.0, + "grad_norm": 0.05599443647615175, + "language_loss": 0.79877216, + "learning_rate": 3.591434321288345e-05, + "loss": 0.80940282, + "num_input_tokens_seen": 379687792, + "router_z_loss_mlp": 0.23925781, + "step": 4586, + "time_per_iteration": 2.6354963779449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066824, + "balance_loss_mlp": 1.04409122, + "epoch": 0.882454790303963, + "flos": 654023434752.0, + "grad_norm": 0.187385514178466, + "language_loss": 0.82061517, + "learning_rate": 3.579849183630485e-05, + "loss": 0.83128339, + "num_input_tokens_seen": 379761120, + "router_z_loss_mlp": 0.22717285, + "step": 4587, + "time_per_iteration": 2.8057334423065186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065861, + "balance_loss_mlp": 1.04243684, + "epoch": 0.8826471719892266, + "flos": 470325242880.0, + "grad_norm": 0.0652059168593454, + "language_loss": 0.7890746, + "learning_rate": 3.568282067873468e-05, + "loss": 0.79973322, + "num_input_tokens_seen": 379829008, + "router_z_loss_mlp": 0.23425293, + "step": 4588, + "time_per_iteration": 2.6349856853485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106342, + "balance_loss_mlp": 1.04065084, + "epoch": 0.8828395536744902, + "flos": 468753744384.0, + "grad_norm": 0.05333811526852745, + "language_loss": 0.83772522, + "learning_rate": 3.556732978508048e-05, + "loss": 0.84835941, + "num_input_tokens_seen": 379899584, + "router_z_loss_mlp": 0.22766113, + "step": 4589, + "time_per_iteration": 2.6871464252471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060113, + "balance_loss_mlp": 1.03739214, + "epoch": 0.8830319353597538, + "flos": 721377944064.0, + "grad_norm": 0.05489948946864513, + "language_loss": 0.81401742, + "learning_rate": 3.545201920017971e-05, + "loss": 0.82461852, + "num_input_tokens_seen": 379979440, + "router_z_loss_mlp": 0.22705078, + "step": 4590, + "time_per_iteration": 2.9603042602539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064404, + "balance_loss_mlp": 1.04144478, + "epoch": 0.8832243170450174, + "flos": 443277204480.0, + "grad_norm": 0.06729262169769755, + "language_loss": 0.81461012, + "learning_rate": 3.5336888968799996e-05, + "loss": 0.8252542, + "num_input_tokens_seen": 380046944, + "router_z_loss_mlp": 0.22949219, + "step": 4591, + "time_per_iteration": 2.5702083110809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106648, + "balance_loss_mlp": 1.04310322, + "epoch": 0.8834166987302808, + "flos": 566583413760.0, + "grad_norm": 0.07104609696133038, + "language_loss": 0.82291472, + "learning_rate": 3.5221939135638756e-05, + "loss": 0.83357948, + "num_input_tokens_seen": 380118048, + "router_z_loss_mlp": 0.23364258, + "step": 4592, + "time_per_iteration": 2.7355880737304688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064561, + "balance_loss_mlp": 1.04203045, + "epoch": 0.8836090804155444, + "flos": 609316328448.0, + "grad_norm": 0.07360243173205684, + "language_loss": 0.82330287, + "learning_rate": 3.510716974532352e-05, + "loss": 0.83394849, + "num_input_tokens_seen": 380192416, + "router_z_loss_mlp": 0.2253418, + "step": 4593, + "time_per_iteration": 2.7873008251190186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068103, + "balance_loss_mlp": 1.04457128, + "epoch": 0.883801462100808, + "flos": 557065963008.0, + "grad_norm": 0.06221990511497487, + "language_loss": 0.80560136, + "learning_rate": 3.4992580842411745e-05, + "loss": 0.81628239, + "num_input_tokens_seen": 380264432, + "router_z_loss_mlp": 0.23510742, + "step": 4594, + "time_per_iteration": 2.729846477508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067102, + "balance_loss_mlp": 1.04311657, + "epoch": 0.8839938437860716, + "flos": 516188671488.0, + "grad_norm": 0.07560096457235571, + "language_loss": 0.77495778, + "learning_rate": 3.487817247139064e-05, + "loss": 0.7856288, + "num_input_tokens_seen": 380334192, + "router_z_loss_mlp": 0.23974609, + "step": 4595, + "time_per_iteration": 2.6014037132263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065125, + "balance_loss_mlp": 1.04190314, + "epoch": 0.8841862254713351, + "flos": 713696292864.0, + "grad_norm": 0.08393319436175375, + "language_loss": 0.79193687, + "learning_rate": 3.47639446766777e-05, + "loss": 0.80258811, + "num_input_tokens_seen": 380407504, + "router_z_loss_mlp": 0.23193359, + "step": 4596, + "time_per_iteration": 2.880234479904175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062087, + "balance_loss_mlp": 1.03944921, + "epoch": 0.8843786071565987, + "flos": 833975875584.0, + "grad_norm": 0.05899737308739052, + "language_loss": 0.82822627, + "learning_rate": 3.4649897502620095e-05, + "loss": 0.8388471, + "num_input_tokens_seen": 380486272, + "router_z_loss_mlp": 0.22607422, + "step": 4597, + "time_per_iteration": 3.0193350315093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065501, + "balance_loss_mlp": 1.04293513, + "epoch": 0.8845709888418622, + "flos": 656884505088.0, + "grad_norm": 0.051134227194142116, + "language_loss": 0.83159703, + "learning_rate": 3.453603099349462e-05, + "loss": 0.84225208, + "num_input_tokens_seen": 380568480, + "router_z_loss_mlp": 0.22570801, + "step": 4598, + "time_per_iteration": 2.904973030090332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065085, + "balance_loss_mlp": 1.04247046, + "epoch": 0.8847633705271258, + "flos": 523326666240.0, + "grad_norm": 0.0641863299399463, + "language_loss": 0.81212854, + "learning_rate": 3.442234519350823e-05, + "loss": 0.82277942, + "num_input_tokens_seen": 380643088, + "router_z_loss_mlp": 0.22607422, + "step": 4599, + "time_per_iteration": 2.7764077186584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065009, + "balance_loss_mlp": 1.0417037, + "epoch": 0.8849557522123894, + "flos": 548591035392.0, + "grad_norm": 0.05892360485500338, + "language_loss": 0.84439909, + "learning_rate": 3.430884014679786e-05, + "loss": 0.85504919, + "num_input_tokens_seen": 380714512, + "router_z_loss_mlp": 0.23303223, + "step": 4600, + "time_per_iteration": 2.655726671218872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069456, + "balance_loss_mlp": 1.04689002, + "epoch": 0.8851481338976529, + "flos": 622372128768.0, + "grad_norm": 0.05693067043210366, + "language_loss": 0.839571, + "learning_rate": 3.4195515897429974e-05, + "loss": 0.85026556, + "num_input_tokens_seen": 380789168, + "router_z_loss_mlp": 0.22558594, + "step": 4601, + "time_per_iteration": 2.778089761734009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067376, + "balance_loss_mlp": 1.04427338, + "epoch": 0.8853405155829165, + "flos": 444359374848.0, + "grad_norm": 0.07032793668671747, + "language_loss": 0.80884451, + "learning_rate": 3.408237248940088e-05, + "loss": 0.81951827, + "num_input_tokens_seen": 380856992, + "router_z_loss_mlp": 0.23083496, + "step": 4602, + "time_per_iteration": 2.5838189125061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061588, + "balance_loss_mlp": 1.0376389, + "epoch": 0.8855328972681801, + "flos": 730470680064.0, + "grad_norm": 0.05264310047515772, + "language_loss": 0.78328097, + "learning_rate": 3.396940996663683e-05, + "loss": 0.79389679, + "num_input_tokens_seen": 380930480, + "router_z_loss_mlp": 0.23950195, + "step": 4603, + "time_per_iteration": 2.917466163635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064302, + "balance_loss_mlp": 1.04041266, + "epoch": 0.8857252789534437, + "flos": 487376414208.0, + "grad_norm": 0.06259539503163569, + "language_loss": 0.79090303, + "learning_rate": 3.385662837299375e-05, + "loss": 0.8015461, + "num_input_tokens_seen": 380994192, + "router_z_loss_mlp": 0.23901367, + "step": 4604, + "time_per_iteration": 2.5593533515930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066302, + "balance_loss_mlp": 1.04194784, + "epoch": 0.8859176606387072, + "flos": 508556206080.0, + "grad_norm": 0.06681364763275595, + "language_loss": 0.817626, + "learning_rate": 3.374402775225727e-05, + "loss": 0.82828903, + "num_input_tokens_seen": 381066848, + "router_z_loss_mlp": 0.2434082, + "step": 4605, + "time_per_iteration": 2.7382400035858154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068054, + "balance_loss_mlp": 1.04427195, + "epoch": 0.8861100423239707, + "flos": 516628440576.0, + "grad_norm": 0.1197682640175093, + "language_loss": 0.85860205, + "learning_rate": 3.3631608148142925e-05, + "loss": 0.86928248, + "num_input_tokens_seen": 381138816, + "router_z_loss_mlp": 0.23791504, + "step": 4606, + "time_per_iteration": 2.738442897796631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066001, + "balance_loss_mlp": 1.04325604, + "epoch": 0.8863024240092343, + "flos": 626975396352.0, + "grad_norm": 0.0522441462901882, + "language_loss": 0.79691124, + "learning_rate": 3.3519369604295746e-05, + "loss": 0.80757129, + "num_input_tokens_seen": 381208448, + "router_z_loss_mlp": 0.22753906, + "step": 4607, + "time_per_iteration": 2.7272424697875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063318, + "balance_loss_mlp": 1.03960764, + "epoch": 0.8864948056944979, + "flos": 766910260224.0, + "grad_norm": 0.055573824499614524, + "language_loss": 0.83738887, + "learning_rate": 3.340731216429083e-05, + "loss": 0.8480221, + "num_input_tokens_seen": 381289712, + "router_z_loss_mlp": 0.23681641, + "step": 4608, + "time_per_iteration": 2.9728434085845947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01002821, + "balance_loss_mlp": 0.99604982, + "epoch": 0.8866871873797615, + "flos": 1502331452928.0, + "grad_norm": 0.0055616018334323225, + "language_loss": 0.78830957, + "learning_rate": 3.329543587163253e-05, + "loss": 0.79833776, + "num_input_tokens_seen": 381520848, + "router_z_loss_mlp": 0.06787109, + "step": 4609, + "time_per_iteration": 4.811368942260742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063655, + "balance_loss_mlp": 1.04073143, + "epoch": 0.886879569065025, + "flos": 811516050432.0, + "grad_norm": 0.05668171590793676, + "language_loss": 0.81769073, + "learning_rate": 3.3183740769755e-05, + "loss": 0.8283273, + "num_input_tokens_seen": 381603008, + "router_z_loss_mlp": 0.22937012, + "step": 4610, + "time_per_iteration": 3.0735855102539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01002743, + "balance_loss_mlp": 0.99597168, + "epoch": 0.8870719507502886, + "flos": 1582838309376.0, + "grad_norm": 0.005567731821103193, + "language_loss": 0.7691083, + "learning_rate": 3.307222690202238e-05, + "loss": 0.7791357, + "num_input_tokens_seen": 381844336, + "router_z_loss_mlp": 0.06787109, + "step": 4611, + "time_per_iteration": 4.918694734573364 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064613, + "balance_loss_mlp": 1.0412122, + "epoch": 0.8872643324355521, + "flos": 634027129344.0, + "grad_norm": 0.06376085793205072, + "language_loss": 0.75125146, + "learning_rate": 3.296089431172811e-05, + "loss": 0.76189762, + "num_input_tokens_seen": 381918576, + "router_z_loss_mlp": 0.23400879, + "step": 4612, + "time_per_iteration": 2.755192995071411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067352, + "balance_loss_mlp": 1.04434419, + "epoch": 0.8874567141208157, + "flos": 535755119616.0, + "grad_norm": 0.06524181347271532, + "language_loss": 0.82796997, + "learning_rate": 3.284974304209532e-05, + "loss": 0.83864343, + "num_input_tokens_seen": 381987296, + "router_z_loss_mlp": 0.2298584, + "step": 4613, + "time_per_iteration": 2.6614327430725098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107072, + "balance_loss_mlp": 1.04793918, + "epoch": 0.8876490958060793, + "flos": 1566302552064.0, + "grad_norm": 0.05648931496157644, + "language_loss": 0.79739761, + "learning_rate": 3.27387731362766e-05, + "loss": 0.80810487, + "num_input_tokens_seen": 382091744, + "router_z_loss_mlp": 0.22766113, + "step": 4614, + "time_per_iteration": 3.8716471195220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106887, + "balance_loss_mlp": 1.04496849, + "epoch": 0.8878414774913428, + "flos": 636633810432.0, + "grad_norm": 0.057359413666851676, + "language_loss": 0.85207993, + "learning_rate": 3.2627984637354444e-05, + "loss": 0.86276865, + "num_input_tokens_seen": 382169600, + "router_z_loss_mlp": 0.2388916, + "step": 4615, + "time_per_iteration": 2.82585072517395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071116, + "balance_loss_mlp": 1.04854965, + "epoch": 0.8880338591766064, + "flos": 496429502976.0, + "grad_norm": 0.06508067995275697, + "language_loss": 0.81528175, + "learning_rate": 3.251737758834084e-05, + "loss": 0.82599294, + "num_input_tokens_seen": 382238336, + "router_z_loss_mlp": 0.22558594, + "step": 4616, + "time_per_iteration": 2.6278040409088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010645, + "balance_loss_mlp": 1.04114652, + "epoch": 0.88822624086187, + "flos": 542861180928.0, + "grad_norm": 0.05861112431241466, + "language_loss": 0.7983259, + "learning_rate": 3.2406952032177086e-05, + "loss": 0.80897093, + "num_input_tokens_seen": 382308560, + "router_z_loss_mlp": 0.23352051, + "step": 4617, + "time_per_iteration": 2.6568470001220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067803, + "balance_loss_mlp": 1.0444026, + "epoch": 0.8884186225471336, + "flos": 551822865408.0, + "grad_norm": 0.07539491877952685, + "language_loss": 0.84058613, + "learning_rate": 3.229670801173418e-05, + "loss": 0.85126418, + "num_input_tokens_seen": 382377504, + "router_z_loss_mlp": 0.23413086, + "step": 4618, + "time_per_iteration": 2.6135919094085693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003042, + "balance_loss_mlp": 0.99631864, + "epoch": 0.888611004232397, + "flos": 1565263305216.0, + "grad_norm": 0.004685969787491016, + "language_loss": 0.78512192, + "learning_rate": 3.218664556981288e-05, + "loss": 0.79515243, + "num_input_tokens_seen": 382615728, + "router_z_loss_mlp": 0.06738281, + "step": 4619, + "time_per_iteration": 4.9886579513549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070198, + "balance_loss_mlp": 1.04795337, + "epoch": 0.8888033859176606, + "flos": 767028828672.0, + "grad_norm": 0.052062483272785565, + "language_loss": 0.82744682, + "learning_rate": 3.207676474914301e-05, + "loss": 0.83814883, + "num_input_tokens_seen": 382695552, + "router_z_loss_mlp": 0.22241211, + "step": 4620, + "time_per_iteration": 2.994854211807251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062487, + "balance_loss_mlp": 1.03959835, + "epoch": 0.8889957676029242, + "flos": 934110849024.0, + "grad_norm": 0.05371563368015874, + "language_loss": 0.84353495, + "learning_rate": 3.1967065592384105e-05, + "loss": 0.85415977, + "num_input_tokens_seen": 382775824, + "router_z_loss_mlp": 0.22875977, + "step": 4621, + "time_per_iteration": 3.1302571296691895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066087, + "balance_loss_mlp": 1.04323435, + "epoch": 0.8891881492881878, + "flos": 589611488256.0, + "grad_norm": 0.059931764934333366, + "language_loss": 0.81823874, + "learning_rate": 3.1857548142125104e-05, + "loss": 0.82889962, + "num_input_tokens_seen": 382854464, + "router_z_loss_mlp": 0.22851562, + "step": 4622, + "time_per_iteration": 2.8172430992126465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065673, + "balance_loss_mlp": 1.04206991, + "epoch": 0.8893805309734514, + "flos": 540718861824.0, + "grad_norm": 0.06080004343800978, + "language_loss": 0.82491469, + "learning_rate": 3.174821244088466e-05, + "loss": 0.83557141, + "num_input_tokens_seen": 382925088, + "router_z_loss_mlp": 0.23608398, + "step": 4623, + "time_per_iteration": 2.7272207736968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066977, + "balance_loss_mlp": 1.04382658, + "epoch": 0.8895729126587149, + "flos": 560095160832.0, + "grad_norm": 0.0915201090359006, + "language_loss": 0.81910133, + "learning_rate": 3.163905853111054e-05, + "loss": 0.82977104, + "num_input_tokens_seen": 382998640, + "router_z_loss_mlp": 0.23144531, + "step": 4624, + "time_per_iteration": 2.680405855178833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065926, + "balance_loss_mlp": 1.042799, + "epoch": 0.8897652943439784, + "flos": 610154021376.0, + "grad_norm": 0.05519265472563439, + "language_loss": 0.81301922, + "learning_rate": 3.153008645517996e-05, + "loss": 0.82367849, + "num_input_tokens_seen": 383076000, + "router_z_loss_mlp": 0.23132324, + "step": 4625, + "time_per_iteration": 2.7328474521636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061825, + "balance_loss_mlp": 1.0387218, + "epoch": 0.889957676029242, + "flos": 917847811584.0, + "grad_norm": 0.06343567074213599, + "language_loss": 0.77644289, + "learning_rate": 3.142129625539969e-05, + "loss": 0.7870611, + "num_input_tokens_seen": 383166640, + "router_z_loss_mlp": 0.23095703, + "step": 4626, + "time_per_iteration": 3.2025320529937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061503, + "balance_loss_mlp": 1.0387342, + "epoch": 0.8901500577145056, + "flos": 488698292736.0, + "grad_norm": 0.05768505937454957, + "language_loss": 0.8051129, + "learning_rate": 3.131268797400588e-05, + "loss": 0.81572795, + "num_input_tokens_seen": 383232928, + "router_z_loss_mlp": 0.22753906, + "step": 4627, + "time_per_iteration": 2.544037342071533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066103, + "balance_loss_mlp": 1.04235685, + "epoch": 0.8903424393997691, + "flos": 733648181760.0, + "grad_norm": 0.058375027413353715, + "language_loss": 0.81123602, + "learning_rate": 3.120426165316398e-05, + "loss": 0.82189703, + "num_input_tokens_seen": 383314352, + "router_z_loss_mlp": 0.23754883, + "step": 4628, + "time_per_iteration": 2.983144760131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063743, + "balance_loss_mlp": 1.04085517, + "epoch": 0.8905348210850327, + "flos": 519813282816.0, + "grad_norm": 0.052566264576619524, + "language_loss": 0.82051194, + "learning_rate": 3.109601733496881e-05, + "loss": 0.83114934, + "num_input_tokens_seen": 383384848, + "router_z_loss_mlp": 0.22875977, + "step": 4629, + "time_per_iteration": 2.6456775665283203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064735, + "balance_loss_mlp": 1.04144096, + "epoch": 0.8907272027702963, + "flos": 578976989184.0, + "grad_norm": 0.05590754047584808, + "language_loss": 0.7958827, + "learning_rate": 3.098795506144458e-05, + "loss": 0.80653006, + "num_input_tokens_seen": 383463360, + "router_z_loss_mlp": 0.23278809, + "step": 4630, + "time_per_iteration": 2.833200454711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106493, + "balance_loss_mlp": 1.04224443, + "epoch": 0.8909195844555599, + "flos": 893628910080.0, + "grad_norm": 0.07302437045654908, + "language_loss": 0.79246753, + "learning_rate": 3.088007487454475e-05, + "loss": 0.80311686, + "num_input_tokens_seen": 383542080, + "router_z_loss_mlp": 0.22668457, + "step": 4631, + "time_per_iteration": 3.1019132137298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065408, + "balance_loss_mlp": 1.0416975, + "epoch": 0.8911119661408234, + "flos": 549865926144.0, + "grad_norm": 0.05634010359130886, + "language_loss": 0.84410584, + "learning_rate": 3.077237681615208e-05, + "loss": 0.85475999, + "num_input_tokens_seen": 383613056, + "router_z_loss_mlp": 0.23681641, + "step": 4632, + "time_per_iteration": 2.674358367919922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069142, + "balance_loss_mlp": 1.04496682, + "epoch": 0.8913043478260869, + "flos": 481139979264.0, + "grad_norm": 0.06730907628269114, + "language_loss": 0.83857995, + "learning_rate": 3.066486092807874e-05, + "loss": 0.84927142, + "num_input_tokens_seen": 383683280, + "router_z_loss_mlp": 0.24182129, + "step": 4633, + "time_per_iteration": 2.632319927215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106984, + "balance_loss_mlp": 1.04641593, + "epoch": 0.8914967295113505, + "flos": 484581782016.0, + "grad_norm": 0.052289564951872355, + "language_loss": 0.85386705, + "learning_rate": 3.055752725206601e-05, + "loss": 0.86456549, + "num_input_tokens_seen": 383754624, + "router_z_loss_mlp": 0.23425293, + "step": 4634, + "time_per_iteration": 2.6371240615844727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065464, + "balance_loss_mlp": 1.04192066, + "epoch": 0.8916891111966141, + "flos": 445664001024.0, + "grad_norm": 0.0695716528121872, + "language_loss": 0.8163662, + "learning_rate": 3.0450375829784714e-05, + "loss": 0.82702088, + "num_input_tokens_seen": 383821984, + "router_z_loss_mlp": 0.23535156, + "step": 4635, + "time_per_iteration": 2.5570900440216064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062296, + "balance_loss_mlp": 1.03908598, + "epoch": 0.8918814928818777, + "flos": 564016379904.0, + "grad_norm": 0.05391232226326787, + "language_loss": 0.78488344, + "learning_rate": 3.034340670283453e-05, + "loss": 0.79550636, + "num_input_tokens_seen": 383890880, + "router_z_loss_mlp": 0.23217773, + "step": 4636, + "time_per_iteration": 2.738752603530884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069592, + "balance_loss_mlp": 1.04635811, + "epoch": 0.8920738745671412, + "flos": 575943022080.0, + "grad_norm": 0.06251637060913222, + "language_loss": 0.81325352, + "learning_rate": 3.0236619912744513e-05, + "loss": 0.82394946, + "num_input_tokens_seen": 383962480, + "router_z_loss_mlp": 0.23242188, + "step": 4637, + "time_per_iteration": 2.644911050796509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066751, + "balance_loss_mlp": 1.04454279, + "epoch": 0.8922662562524047, + "flos": 620180623872.0, + "grad_norm": 0.0534185422238087, + "language_loss": 0.83658934, + "learning_rate": 3.0130015500973163e-05, + "loss": 0.8472569, + "num_input_tokens_seen": 384033616, + "router_z_loss_mlp": 0.2220459, + "step": 4638, + "time_per_iteration": 2.768505573272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107185, + "balance_loss_mlp": 1.04829395, + "epoch": 0.8924586379376683, + "flos": 583624673280.0, + "grad_norm": 0.05706830171658043, + "language_loss": 0.79804647, + "learning_rate": 3.0023593508907877e-05, + "loss": 0.80876493, + "num_input_tokens_seen": 384108848, + "router_z_loss_mlp": 0.23522949, + "step": 4639, + "time_per_iteration": 2.731898546218872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067488, + "balance_loss_mlp": 1.04464698, + "epoch": 0.8926510196229319, + "flos": 525177520128.0, + "grad_norm": 0.05117641357239232, + "language_loss": 0.81824827, + "learning_rate": 2.991735397786538e-05, + "loss": 0.82892317, + "num_input_tokens_seen": 384185728, + "router_z_loss_mlp": 0.22839355, + "step": 4640, + "time_per_iteration": 2.7508621215820312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067855, + "balance_loss_mlp": 1.04493129, + "epoch": 0.8928434013081955, + "flos": 486669772800.0, + "grad_norm": 0.063972320379488, + "language_loss": 0.80840433, + "learning_rate": 2.981129694909146e-05, + "loss": 0.81908286, + "num_input_tokens_seen": 384251552, + "router_z_loss_mlp": 0.22912598, + "step": 4641, + "time_per_iteration": 2.5344278812408447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005658, + "balance_loss_mlp": 0.99893486, + "epoch": 0.893035782993459, + "flos": 1448302560768.0, + "grad_norm": 0.003483753852343544, + "language_loss": 0.80330861, + "learning_rate": 2.970542246376118e-05, + "loss": 0.8133651, + "num_input_tokens_seen": 384472176, + "router_z_loss_mlp": 0.06738281, + "step": 4642, + "time_per_iteration": 4.7115797996521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066278, + "balance_loss_mlp": 1.04297304, + "epoch": 0.8932281646787226, + "flos": 611320255488.0, + "grad_norm": 0.06490125129473394, + "language_loss": 0.80942553, + "learning_rate": 2.95997305629786e-05, + "loss": 0.82008833, + "num_input_tokens_seen": 384544224, + "router_z_loss_mlp": 0.23278809, + "step": 4643, + "time_per_iteration": 2.776724100112915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064375, + "balance_loss_mlp": 1.04103351, + "epoch": 0.8934205463639862, + "flos": 565760775168.0, + "grad_norm": 0.057617911154696326, + "language_loss": 0.84784567, + "learning_rate": 2.9494221287776957e-05, + "loss": 0.85848939, + "num_input_tokens_seen": 384611728, + "router_z_loss_mlp": 0.23352051, + "step": 4644, + "time_per_iteration": 2.630427122116089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068617, + "balance_loss_mlp": 1.04492998, + "epoch": 0.8936129280492497, + "flos": 488431420416.0, + "grad_norm": 0.06553563989756882, + "language_loss": 0.78306258, + "learning_rate": 2.9388894679118484e-05, + "loss": 0.79374874, + "num_input_tokens_seen": 384678048, + "router_z_loss_mlp": 0.23681641, + "step": 4645, + "time_per_iteration": 2.6144680976867676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066216, + "balance_loss_mlp": 1.04387641, + "epoch": 0.8938053097345132, + "flos": 886490542080.0, + "grad_norm": 0.0528405024741521, + "language_loss": 0.80833423, + "learning_rate": 2.9283750777894912e-05, + "loss": 0.81899637, + "num_input_tokens_seen": 384766768, + "router_z_loss_mlp": 0.22338867, + "step": 4646, + "time_per_iteration": 3.2429933547973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068666, + "balance_loss_mlp": 1.04599285, + "epoch": 0.8939976914197768, + "flos": 593285658624.0, + "grad_norm": 0.05949361617339142, + "language_loss": 0.84359968, + "learning_rate": 2.9178789624926427e-05, + "loss": 0.85428637, + "num_input_tokens_seen": 384842352, + "router_z_loss_mlp": 0.22680664, + "step": 4647, + "time_per_iteration": 2.710167407989502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068246, + "balance_loss_mlp": 1.04401064, + "epoch": 0.8941900731050404, + "flos": 523247745024.0, + "grad_norm": 0.0659464848891027, + "language_loss": 0.81186658, + "learning_rate": 2.9074011260962706e-05, + "loss": 0.82254899, + "num_input_tokens_seen": 384912048, + "router_z_loss_mlp": 0.2421875, + "step": 4648, + "time_per_iteration": 2.651339292526245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064898, + "balance_loss_mlp": 1.04204559, + "epoch": 0.894382454790304, + "flos": 800582745600.0, + "grad_norm": 0.05875794324616096, + "language_loss": 0.81119418, + "learning_rate": 2.8969415726682158e-05, + "loss": 0.82184315, + "num_input_tokens_seen": 384986560, + "router_z_loss_mlp": 0.2286377, + "step": 4649, + "time_per_iteration": 3.044119358062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069765, + "balance_loss_mlp": 1.04760432, + "epoch": 0.8945748364755676, + "flos": 479037307392.0, + "grad_norm": 0.05520315549667398, + "language_loss": 0.85046721, + "learning_rate": 2.8865003062692517e-05, + "loss": 0.86116481, + "num_input_tokens_seen": 385057376, + "router_z_loss_mlp": 0.22180176, + "step": 4650, + "time_per_iteration": 2.6633942127227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070963, + "balance_loss_mlp": 1.04739583, + "epoch": 0.894767218160831, + "flos": 508776090624.0, + "grad_norm": 0.06916128271442132, + "language_loss": 0.83380258, + "learning_rate": 2.876077330953042e-05, + "loss": 0.84451222, + "num_input_tokens_seen": 385130880, + "router_z_loss_mlp": 0.23547363, + "step": 4651, + "time_per_iteration": 2.6715307235717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064391, + "balance_loss_mlp": 1.04138315, + "epoch": 0.8949595998460946, + "flos": 685857549312.0, + "grad_norm": 0.0591722650352258, + "language_loss": 0.82184136, + "learning_rate": 2.8656726507661378e-05, + "loss": 0.8324852, + "num_input_tokens_seen": 385205808, + "router_z_loss_mlp": 0.23010254, + "step": 4652, + "time_per_iteration": 2.8870623111724854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064747, + "balance_loss_mlp": 1.04148972, + "epoch": 0.8951519815313582, + "flos": 799920520704.0, + "grad_norm": 0.05600884450087048, + "language_loss": 0.77608907, + "learning_rate": 2.855286269747981e-05, + "loss": 0.78673655, + "num_input_tokens_seen": 385283616, + "router_z_loss_mlp": 0.23242188, + "step": 4653, + "time_per_iteration": 2.9963629245758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066501, + "balance_loss_mlp": 1.04324377, + "epoch": 0.8953443632166218, + "flos": 666740782080.0, + "grad_norm": 0.06357991469427139, + "language_loss": 0.86607301, + "learning_rate": 2.8449181919309398e-05, + "loss": 0.87673807, + "num_input_tokens_seen": 385357488, + "router_z_loss_mlp": 0.23254395, + "step": 4654, + "time_per_iteration": 2.7694337368011475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065408, + "balance_loss_mlp": 1.04244781, + "epoch": 0.8955367449018854, + "flos": 644977686528.0, + "grad_norm": 0.055922367337583156, + "language_loss": 0.83340573, + "learning_rate": 2.8345684213402556e-05, + "loss": 0.84405977, + "num_input_tokens_seen": 385431280, + "router_z_loss_mlp": 0.22961426, + "step": 4655, + "time_per_iteration": 2.8329148292541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065185, + "balance_loss_mlp": 1.04099762, + "epoch": 0.8957291265871489, + "flos": 808714077696.0, + "grad_norm": 0.05938034328868387, + "language_loss": 0.78094238, + "learning_rate": 2.8242369619940644e-05, + "loss": 0.79159427, + "num_input_tokens_seen": 385509840, + "router_z_loss_mlp": 0.24194336, + "step": 4656, + "time_per_iteration": 3.0359058380126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070668, + "balance_loss_mlp": 1.04611111, + "epoch": 0.8959215082724125, + "flos": 518923832832.0, + "grad_norm": 0.05192477429276855, + "language_loss": 0.77056348, + "learning_rate": 2.813923817903391e-05, + "loss": 0.78127015, + "num_input_tokens_seen": 385580384, + "router_z_loss_mlp": 0.24572754, + "step": 4657, + "time_per_iteration": 2.618851661682129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068155, + "balance_loss_mlp": 1.04416966, + "epoch": 0.896113889957676, + "flos": 476917383168.0, + "grad_norm": 0.058088302733293115, + "language_loss": 0.77451009, + "learning_rate": 2.8036289930721603e-05, + "loss": 0.78519166, + "num_input_tokens_seen": 385649184, + "router_z_loss_mlp": 0.23950195, + "step": 4658, + "time_per_iteration": 2.6347408294677734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065334, + "balance_loss_mlp": 1.04195762, + "epoch": 0.8963062716429396, + "flos": 518162863104.0, + "grad_norm": 0.05167614714836421, + "language_loss": 0.83363634, + "learning_rate": 2.7933524914971697e-05, + "loss": 0.84428966, + "num_input_tokens_seen": 385717072, + "router_z_loss_mlp": 0.23352051, + "step": 4659, + "time_per_iteration": 2.6101901531219482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069072, + "balance_loss_mlp": 1.04588568, + "epoch": 0.8964986533282031, + "flos": 508484625408.0, + "grad_norm": 0.061239712200733556, + "language_loss": 0.81603789, + "learning_rate": 2.7830943171681113e-05, + "loss": 0.82672858, + "num_input_tokens_seen": 385788880, + "router_z_loss_mlp": 0.23181152, + "step": 4660, + "time_per_iteration": 2.6643130779266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067848, + "balance_loss_mlp": 1.0449475, + "epoch": 0.8966910350134667, + "flos": 536076320256.0, + "grad_norm": 0.07910275074584235, + "language_loss": 0.81423545, + "learning_rate": 2.77285447406756e-05, + "loss": 0.82491398, + "num_input_tokens_seen": 385854240, + "router_z_loss_mlp": 0.22900391, + "step": 4661, + "time_per_iteration": 2.621864080429077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106623, + "balance_loss_mlp": 1.04335344, + "epoch": 0.8968834166987303, + "flos": 723226226688.0, + "grad_norm": 0.058143333452195634, + "language_loss": 0.84462941, + "learning_rate": 2.7626329661709914e-05, + "loss": 0.85529172, + "num_input_tokens_seen": 385926080, + "router_z_loss_mlp": 0.2286377, + "step": 4662, + "time_per_iteration": 2.8383162021636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106546, + "balance_loss_mlp": 1.04339433, + "epoch": 0.8970757983839939, + "flos": 681686710272.0, + "grad_norm": 0.05513580414518432, + "language_loss": 0.84041762, + "learning_rate": 2.7524297974467372e-05, + "loss": 0.85107225, + "num_input_tokens_seen": 386005696, + "router_z_loss_mlp": 0.22070312, + "step": 4663, + "time_per_iteration": 2.9471535682678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067793, + "balance_loss_mlp": 1.04407024, + "epoch": 0.8972681800692575, + "flos": 613037486592.0, + "grad_norm": 0.08773189302167883, + "language_loss": 0.76343608, + "learning_rate": 2.742244971856006e-05, + "loss": 0.77411401, + "num_input_tokens_seen": 386073248, + "router_z_loss_mlp": 0.23706055, + "step": 4664, + "time_per_iteration": 2.7563586235046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064091, + "balance_loss_mlp": 1.04091692, + "epoch": 0.8974605617545209, + "flos": 572350344192.0, + "grad_norm": 0.05238524787206957, + "language_loss": 0.83254212, + "learning_rate": 2.732078493352913e-05, + "loss": 0.84318304, + "num_input_tokens_seen": 386148528, + "router_z_loss_mlp": 0.23168945, + "step": 4665, + "time_per_iteration": 2.716057062149048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066457, + "balance_loss_mlp": 1.04294896, + "epoch": 0.8976529434397845, + "flos": 520418608128.0, + "grad_norm": 0.06037902088521838, + "language_loss": 0.8723467, + "learning_rate": 2.721930365884434e-05, + "loss": 0.88301122, + "num_input_tokens_seen": 386218528, + "router_z_loss_mlp": 0.23486328, + "step": 4666, + "time_per_iteration": 2.645430564880371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068457, + "balance_loss_mlp": 1.04494941, + "epoch": 0.8978453251250481, + "flos": 471355656192.0, + "grad_norm": 0.05147791161429302, + "language_loss": 0.82619303, + "learning_rate": 2.7118005933904176e-05, + "loss": 0.83687758, + "num_input_tokens_seen": 386284704, + "router_z_loss_mlp": 0.23498535, + "step": 4667, + "time_per_iteration": 2.6113970279693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068602, + "balance_loss_mlp": 1.04610741, + "epoch": 0.8980377068103117, + "flos": 591659831808.0, + "grad_norm": 0.06407487894288708, + "language_loss": 0.82130563, + "learning_rate": 2.7016891798035904e-05, + "loss": 0.83199167, + "num_input_tokens_seen": 386356128, + "router_z_loss_mlp": 0.22509766, + "step": 4668, + "time_per_iteration": 2.775261640548706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010737, + "balance_loss_mlp": 1.05051422, + "epoch": 0.8982300884955752, + "flos": 767619472896.0, + "grad_norm": 0.06672049349716365, + "language_loss": 0.82899266, + "learning_rate": 2.691596129049556e-05, + "loss": 0.83972967, + "num_input_tokens_seen": 386434048, + "router_z_loss_mlp": 0.23181152, + "step": 4669, + "time_per_iteration": 2.9366719722747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065808, + "balance_loss_mlp": 1.04299164, + "epoch": 0.8984224701808388, + "flos": 844575496704.0, + "grad_norm": 0.05547032389255281, + "language_loss": 0.77663457, + "learning_rate": 2.681521445046775e-05, + "loss": 0.78729266, + "num_input_tokens_seen": 386532384, + "router_z_loss_mlp": 0.22814941, + "step": 4670, + "time_per_iteration": 3.2214982509613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070125, + "balance_loss_mlp": 1.04733181, + "epoch": 0.8986148518661023, + "flos": 757661879808.0, + "grad_norm": 0.08084451627969469, + "language_loss": 0.76220548, + "learning_rate": 2.6714651317065963e-05, + "loss": 0.77290672, + "num_input_tokens_seen": 386627120, + "router_z_loss_mlp": 0.2277832, + "step": 4671, + "time_per_iteration": 3.152304172515869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066997, + "balance_loss_mlp": 1.04413223, + "epoch": 0.8988072335513659, + "flos": 563070030336.0, + "grad_norm": 0.08463920204686945, + "language_loss": 0.77184796, + "learning_rate": 2.6614271929332133e-05, + "loss": 0.78251791, + "num_input_tokens_seen": 386700192, + "router_z_loss_mlp": 0.2286377, + "step": 4672, + "time_per_iteration": 2.7046709060668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067256, + "balance_loss_mlp": 1.04339075, + "epoch": 0.8989996152366295, + "flos": 492683751936.0, + "grad_norm": 0.06216654678509643, + "language_loss": 0.86926317, + "learning_rate": 2.6514076326237147e-05, + "loss": 0.87993574, + "num_input_tokens_seen": 386764256, + "router_z_loss_mlp": 0.23852539, + "step": 4673, + "time_per_iteration": 2.5472865104675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067016, + "balance_loss_mlp": 1.04331684, + "epoch": 0.899191996921893, + "flos": 542567144448.0, + "grad_norm": 0.07568061598411091, + "language_loss": 0.76223564, + "learning_rate": 2.6414064546680438e-05, + "loss": 0.77290577, + "num_input_tokens_seen": 386835792, + "router_z_loss_mlp": 0.23681641, + "step": 4674, + "time_per_iteration": 2.648810863494873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066639, + "balance_loss_mlp": 1.04321444, + "epoch": 0.8993843786071566, + "flos": 471325920768.0, + "grad_norm": 0.07030334543630777, + "language_loss": 0.80331755, + "learning_rate": 2.631423662948984e-05, + "loss": 0.81398392, + "num_input_tokens_seen": 386904368, + "router_z_loss_mlp": 0.23425293, + "step": 4675, + "time_per_iteration": 2.601430892944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107006, + "balance_loss_mlp": 1.04677892, + "epoch": 0.8995767602924202, + "flos": 526726623744.0, + "grad_norm": 0.0574995640777522, + "language_loss": 0.82629097, + "learning_rate": 2.621459261342196e-05, + "loss": 0.83699161, + "num_input_tokens_seen": 386977872, + "router_z_loss_mlp": 0.23278809, + "step": 4676, + "time_per_iteration": 2.780759811401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066328, + "balance_loss_mlp": 1.04313004, + "epoch": 0.8997691419776838, + "flos": 557634212352.0, + "grad_norm": 0.0657953043961064, + "language_loss": 0.84578764, + "learning_rate": 2.6115132537162245e-05, + "loss": 0.85645092, + "num_input_tokens_seen": 387052080, + "router_z_loss_mlp": 0.23181152, + "step": 4677, + "time_per_iteration": 2.6718125343322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072385, + "balance_loss_mlp": 1.04937708, + "epoch": 0.8999615236629472, + "flos": 639027947520.0, + "grad_norm": 0.05825009519251717, + "language_loss": 0.80745816, + "learning_rate": 2.601585643932436e-05, + "loss": 0.81818199, + "num_input_tokens_seen": 387129712, + "router_z_loss_mlp": 0.22998047, + "step": 4678, + "time_per_iteration": 2.8580267429351807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003846, + "balance_loss_mlp": 0.99717021, + "epoch": 0.9001539053482108, + "flos": 1431510547968.0, + "grad_norm": 0.007707112448869433, + "language_loss": 0.85784018, + "learning_rate": 2.5916764358450862e-05, + "loss": 0.86787868, + "num_input_tokens_seen": 387356560, + "router_z_loss_mlp": 0.06689453, + "step": 4679, + "time_per_iteration": 4.804095029830933 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106824, + "balance_loss_mlp": 1.0444932, + "epoch": 0.9003462870334744, + "flos": 566877450240.0, + "grad_norm": 0.06250432124299872, + "language_loss": 0.80040658, + "learning_rate": 2.5817856333012425e-05, + "loss": 0.81108892, + "num_input_tokens_seen": 387438640, + "router_z_loss_mlp": 0.23730469, + "step": 4680, + "time_per_iteration": 2.858875274658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065394, + "balance_loss_mlp": 1.04134905, + "epoch": 0.900538668718738, + "flos": 538655837184.0, + "grad_norm": 0.05470600652394018, + "language_loss": 0.78754449, + "learning_rate": 2.5719132401408883e-05, + "loss": 0.79819846, + "num_input_tokens_seen": 387507088, + "router_z_loss_mlp": 0.2401123, + "step": 4681, + "time_per_iteration": 2.6454641819000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066228, + "balance_loss_mlp": 1.04298246, + "epoch": 0.9007310504040016, + "flos": 488387003904.0, + "grad_norm": 0.0857407737002147, + "language_loss": 0.86202192, + "learning_rate": 2.5620592601968028e-05, + "loss": 0.87268418, + "num_input_tokens_seen": 387574160, + "router_z_loss_mlp": 0.2322998, + "step": 4682, + "time_per_iteration": 2.5687954425811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064287, + "balance_loss_mlp": 1.04095745, + "epoch": 0.9009234320892651, + "flos": 652901617152.0, + "grad_norm": 0.06514084212539079, + "language_loss": 0.78763062, + "learning_rate": 2.5522236972946532e-05, + "loss": 0.79827344, + "num_input_tokens_seen": 387652528, + "router_z_loss_mlp": 0.2331543, + "step": 4683, + "time_per_iteration": 2.872545003890991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064518, + "balance_loss_mlp": 1.04105759, + "epoch": 0.9011158137745287, + "flos": 545569178112.0, + "grad_norm": 0.13335470747137296, + "language_loss": 0.85694379, + "learning_rate": 2.5424065552529295e-05, + "loss": 0.867589, + "num_input_tokens_seen": 387723520, + "router_z_loss_mlp": 0.234375, + "step": 4684, + "time_per_iteration": 2.694607734680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062379, + "balance_loss_mlp": 1.03976488, + "epoch": 0.9013081954597922, + "flos": 559699808256.0, + "grad_norm": 0.061629195170929164, + "language_loss": 0.82466996, + "learning_rate": 2.532607837883011e-05, + "loss": 0.83529371, + "num_input_tokens_seen": 387793664, + "router_z_loss_mlp": 0.22607422, + "step": 4685, + "time_per_iteration": 2.664644956588745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106963, + "balance_loss_mlp": 1.04603863, + "epoch": 0.9015005771450558, + "flos": 728652132864.0, + "grad_norm": 0.05255805412732331, + "language_loss": 0.80887723, + "learning_rate": 2.5228275489890706e-05, + "loss": 0.81957352, + "num_input_tokens_seen": 387871008, + "router_z_loss_mlp": 0.23583984, + "step": 4686, + "time_per_iteration": 2.9304230213165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067801, + "balance_loss_mlp": 1.04424536, + "epoch": 0.9016929588303193, + "flos": 517416574464.0, + "grad_norm": 0.061119646274193154, + "language_loss": 0.81590062, + "learning_rate": 2.5130656923681605e-05, + "loss": 0.82657862, + "num_input_tokens_seen": 387950832, + "router_z_loss_mlp": 0.23535156, + "step": 4687, + "time_per_iteration": 2.7988228797912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065289, + "balance_loss_mlp": 1.04235315, + "epoch": 0.9018853405155829, + "flos": 622335052800.0, + "grad_norm": 0.05484090421753951, + "language_loss": 0.86004657, + "learning_rate": 2.503322271810171e-05, + "loss": 0.87069947, + "num_input_tokens_seen": 388029792, + "router_z_loss_mlp": 0.22937012, + "step": 4688, + "time_per_iteration": 2.86004638671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066048, + "balance_loss_mlp": 1.04178929, + "epoch": 0.9020777222008465, + "flos": 523284820992.0, + "grad_norm": 0.06490737682713338, + "language_loss": 0.78058898, + "learning_rate": 2.4935972910978378e-05, + "loss": 0.79124951, + "num_input_tokens_seen": 388095872, + "router_z_loss_mlp": 0.24255371, + "step": 4689, + "time_per_iteration": 2.6526527404785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061292, + "balance_loss_mlp": 1.03895235, + "epoch": 0.9022701038861101, + "flos": 633713269248.0, + "grad_norm": 0.049701215658093066, + "language_loss": 0.82043481, + "learning_rate": 2.4838907540067346e-05, + "loss": 0.83104765, + "num_input_tokens_seen": 388171632, + "router_z_loss_mlp": 0.22351074, + "step": 4690, + "time_per_iteration": 2.844237804412842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061716, + "balance_loss_mlp": 1.03930461, + "epoch": 0.9024624855713737, + "flos": 513295294464.0, + "grad_norm": 0.056724167517837494, + "language_loss": 0.84453869, + "learning_rate": 2.474202664305253e-05, + "loss": 0.85515589, + "num_input_tokens_seen": 388242240, + "router_z_loss_mlp": 0.22412109, + "step": 4691, + "time_per_iteration": 2.643986940383911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061795, + "balance_loss_mlp": 1.03883481, + "epoch": 0.9026548672566371, + "flos": 477411480576.0, + "grad_norm": 0.05847332764386979, + "language_loss": 0.86467588, + "learning_rate": 2.464533025754673e-05, + "loss": 0.87529379, + "num_input_tokens_seen": 388310960, + "router_z_loss_mlp": 0.22961426, + "step": 4692, + "time_per_iteration": 2.6611626148223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064053, + "balance_loss_mlp": 1.04028225, + "epoch": 0.9028472489419007, + "flos": 661994353152.0, + "grad_norm": 0.05749458079736531, + "language_loss": 0.73733985, + "learning_rate": 2.454881842109058e-05, + "loss": 0.74798036, + "num_input_tokens_seen": 388387280, + "router_z_loss_mlp": 0.23779297, + "step": 4693, + "time_per_iteration": 2.8610541820526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062773, + "balance_loss_mlp": 1.03946733, + "epoch": 0.9030396306271643, + "flos": 534588885504.0, + "grad_norm": 0.05856701308084892, + "language_loss": 0.82619125, + "learning_rate": 2.4452491171153445e-05, + "loss": 0.83681893, + "num_input_tokens_seen": 388456992, + "router_z_loss_mlp": 0.23291016, + "step": 4694, + "time_per_iteration": 2.6105546951293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063992, + "balance_loss_mlp": 1.04143786, + "epoch": 0.9032320123124279, + "flos": 801032426496.0, + "grad_norm": 0.054990789226307006, + "language_loss": 0.82393968, + "learning_rate": 2.43563485451328e-05, + "loss": 0.83457965, + "num_input_tokens_seen": 388534896, + "router_z_loss_mlp": 0.22546387, + "step": 4695, + "time_per_iteration": 2.9821743965148926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064335, + "balance_loss_mlp": 1.04110062, + "epoch": 0.9034243939976914, + "flos": 553942789632.0, + "grad_norm": 0.07759884648218378, + "language_loss": 0.76665241, + "learning_rate": 2.426039058035451e-05, + "loss": 0.77729577, + "num_input_tokens_seen": 388606640, + "router_z_loss_mlp": 0.23205566, + "step": 4696, + "time_per_iteration": 2.6665611267089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061692, + "balance_loss_mlp": 1.03874457, + "epoch": 0.903616775682955, + "flos": 503903752704.0, + "grad_norm": 0.10162314375080486, + "language_loss": 0.82691509, + "learning_rate": 2.4164617314072823e-05, + "loss": 0.83753198, + "num_input_tokens_seen": 388675920, + "router_z_loss_mlp": 0.22949219, + "step": 4697, + "time_per_iteration": 2.584268093109131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063613, + "balance_loss_mlp": 1.04124892, + "epoch": 0.9038091573682185, + "flos": 436297052160.0, + "grad_norm": 0.0664097832888091, + "language_loss": 0.78661013, + "learning_rate": 2.406902878347017e-05, + "loss": 0.79724634, + "num_input_tokens_seen": 388743968, + "router_z_loss_mlp": 0.22387695, + "step": 4698, + "time_per_iteration": 2.604700803756714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063555, + "balance_loss_mlp": 1.03962958, + "epoch": 0.9040015390534821, + "flos": 532916070912.0, + "grad_norm": 0.07742814353956143, + "language_loss": 0.81602848, + "learning_rate": 2.3973625025657253e-05, + "loss": 0.82666409, + "num_input_tokens_seen": 388810784, + "router_z_loss_mlp": 0.23901367, + "step": 4699, + "time_per_iteration": 2.691525936126709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064709, + "balance_loss_mlp": 1.04179668, + "epoch": 0.9041939207387457, + "flos": 564307845120.0, + "grad_norm": 0.0614656758033835, + "language_loss": 0.80404103, + "learning_rate": 2.3878406077673275e-05, + "loss": 0.81468809, + "num_input_tokens_seen": 388885072, + "router_z_loss_mlp": 0.22900391, + "step": 4700, + "time_per_iteration": 2.757578134536743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061234, + "balance_loss_mlp": 1.03814268, + "epoch": 0.9043863024240092, + "flos": 515509194240.0, + "grad_norm": 0.07278260404802209, + "language_loss": 0.77661544, + "learning_rate": 2.3783371976485447e-05, + "loss": 0.78722775, + "num_input_tokens_seen": 388951184, + "router_z_loss_mlp": 0.23071289, + "step": 4701, + "time_per_iteration": 2.5736379623413086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01002038, + "balance_loss_mlp": 0.99531454, + "epoch": 0.9045786841092728, + "flos": 1277949063168.0, + "grad_norm": 0.011886728368769089, + "language_loss": 0.72929788, + "learning_rate": 2.368852275898914e-05, + "loss": 0.73931825, + "num_input_tokens_seen": 389170752, + "router_z_loss_mlp": 0.06738281, + "step": 4702, + "time_per_iteration": 4.96384072303772 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066552, + "balance_loss_mlp": 1.04281723, + "epoch": 0.9047710657945364, + "flos": 585841144320.0, + "grad_norm": 0.05895236346667512, + "language_loss": 0.83045083, + "learning_rate": 2.3593858462008178e-05, + "loss": 0.84111637, + "num_input_tokens_seen": 389239600, + "router_z_loss_mlp": 0.23742676, + "step": 4703, + "time_per_iteration": 2.681264877319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061673, + "balance_loss_mlp": 1.0387013, + "epoch": 0.9049634474798, + "flos": 571937739264.0, + "grad_norm": 0.07313912581124954, + "language_loss": 0.79802144, + "learning_rate": 2.3499379122294495e-05, + "loss": 0.80863822, + "num_input_tokens_seen": 389316032, + "router_z_loss_mlp": 0.22961426, + "step": 4704, + "time_per_iteration": 2.7477691173553467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064397, + "balance_loss_mlp": 1.0415324, + "epoch": 0.9051558291650635, + "flos": 572619787776.0, + "grad_norm": 0.08577028625924395, + "language_loss": 0.7446878, + "learning_rate": 2.3405084776528307e-05, + "loss": 0.75533175, + "num_input_tokens_seen": 389383504, + "router_z_loss_mlp": 0.22851562, + "step": 4705, + "time_per_iteration": 2.7316272258758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063321, + "balance_loss_mlp": 1.03981328, + "epoch": 0.905348210850327, + "flos": 540538624512.0, + "grad_norm": 0.07345375588943581, + "language_loss": 0.79496109, + "learning_rate": 2.331097546131783e-05, + "loss": 0.80559433, + "num_input_tokens_seen": 389454592, + "router_z_loss_mlp": 0.23486328, + "step": 4706, + "time_per_iteration": 2.670381784439087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061785, + "balance_loss_mlp": 1.03881335, + "epoch": 0.9055405925355906, + "flos": 516381391872.0, + "grad_norm": 0.07600321724106415, + "language_loss": 0.81767797, + "learning_rate": 2.321705121319956e-05, + "loss": 0.82829583, + "num_input_tokens_seen": 389519696, + "router_z_loss_mlp": 0.22961426, + "step": 4707, + "time_per_iteration": 2.613689661026001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059398, + "balance_loss_mlp": 1.03645039, + "epoch": 0.9057329742208542, + "flos": 914643145728.0, + "grad_norm": 0.05847502897319672, + "language_loss": 0.850618, + "learning_rate": 2.3123312068638104e-05, + "loss": 0.86121196, + "num_input_tokens_seen": 389603568, + "router_z_loss_mlp": 0.22949219, + "step": 4708, + "time_per_iteration": 3.24080228805542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062169, + "balance_loss_mlp": 1.03837466, + "epoch": 0.9059253559061178, + "flos": 905261515776.0, + "grad_norm": 0.08783492473540015, + "language_loss": 0.83353186, + "learning_rate": 2.3029758064026295e-05, + "loss": 0.84415358, + "num_input_tokens_seen": 389687504, + "router_z_loss_mlp": 0.23791504, + "step": 4709, + "time_per_iteration": 3.145754337310791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060875, + "balance_loss_mlp": 1.03717613, + "epoch": 0.9061177375913813, + "flos": 664534222848.0, + "grad_norm": 0.06837810531041678, + "language_loss": 0.77734303, + "learning_rate": 2.2936389235684918e-05, + "loss": 0.78795183, + "num_input_tokens_seen": 389764880, + "router_z_loss_mlp": 0.23706055, + "step": 4710, + "time_per_iteration": 2.8780710697174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065307, + "balance_loss_mlp": 1.04175103, + "epoch": 0.9063101192766448, + "flos": 565609900032.0, + "grad_norm": 0.07543882900367092, + "language_loss": 0.82841074, + "learning_rate": 2.2843205619862972e-05, + "loss": 0.83906382, + "num_input_tokens_seen": 389838304, + "router_z_loss_mlp": 0.23547363, + "step": 4711, + "time_per_iteration": 2.748779773712158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062112, + "balance_loss_mlp": 1.03939068, + "epoch": 0.9065025009619084, + "flos": 727377242112.0, + "grad_norm": 0.0645612601116662, + "language_loss": 0.79195869, + "learning_rate": 2.2750207252737742e-05, + "loss": 0.80257982, + "num_input_tokens_seen": 389908592, + "router_z_loss_mlp": 0.22729492, + "step": 4712, + "time_per_iteration": 2.8739047050476074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060893, + "balance_loss_mlp": 1.03833866, + "epoch": 0.906694882647172, + "flos": 531512699904.0, + "grad_norm": 0.06746017534167525, + "language_loss": 0.79950291, + "learning_rate": 2.265739417041418e-05, + "loss": 0.81011188, + "num_input_tokens_seen": 389979040, + "router_z_loss_mlp": 0.22558594, + "step": 4713, + "time_per_iteration": 2.659532308578491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060379, + "balance_loss_mlp": 1.03682303, + "epoch": 0.9068872643324356, + "flos": 429788975616.0, + "grad_norm": 0.08111291405332487, + "language_loss": 0.85047996, + "learning_rate": 2.2564766408925574e-05, + "loss": 0.86108375, + "num_input_tokens_seen": 390046080, + "router_z_loss_mlp": 0.23535156, + "step": 4714, + "time_per_iteration": 2.607717990875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066044, + "balance_loss_mlp": 1.04185677, + "epoch": 0.9070796460176991, + "flos": 588366332928.0, + "grad_norm": 0.06399071140262953, + "language_loss": 0.79739857, + "learning_rate": 2.2472324004233214e-05, + "loss": 0.80805904, + "num_input_tokens_seen": 390122176, + "router_z_loss_mlp": 0.24182129, + "step": 4715, + "time_per_iteration": 2.844972610473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062855, + "balance_loss_mlp": 1.03901279, + "epoch": 0.9072720277029627, + "flos": 571582033920.0, + "grad_norm": 0.06112565203304271, + "language_loss": 0.75674605, + "learning_rate": 2.2380066992226446e-05, + "loss": 0.76737463, + "num_input_tokens_seen": 390195216, + "router_z_loss_mlp": 0.23828125, + "step": 4716, + "time_per_iteration": 2.7184197902679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062655, + "balance_loss_mlp": 1.03995764, + "epoch": 0.9074644093882263, + "flos": 555798412800.0, + "grad_norm": 0.05611379446254042, + "language_loss": 0.8870452, + "learning_rate": 2.2287995408722617e-05, + "loss": 0.89767182, + "num_input_tokens_seen": 390263216, + "router_z_loss_mlp": 0.22705078, + "step": 4717, + "time_per_iteration": 2.645702838897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064312, + "balance_loss_mlp": 1.04135203, + "epoch": 0.9076567910734898, + "flos": 640994798592.0, + "grad_norm": 0.06604974299876817, + "language_loss": 0.82842344, + "learning_rate": 2.2196109289467083e-05, + "loss": 0.83906651, + "num_input_tokens_seen": 390337360, + "router_z_loss_mlp": 0.22961426, + "step": 4718, + "time_per_iteration": 2.7762739658355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062525, + "balance_loss_mlp": 1.03921926, + "epoch": 0.9078491727587533, + "flos": 733998744576.0, + "grad_norm": 0.05329973155355116, + "language_loss": 0.81630248, + "learning_rate": 2.2104408670133193e-05, + "loss": 0.82692772, + "num_input_tokens_seen": 390427728, + "router_z_loss_mlp": 0.23303223, + "step": 4719, + "time_per_iteration": 3.11417555809021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106129, + "balance_loss_mlp": 1.03837776, + "epoch": 0.9080415544440169, + "flos": 654774492672.0, + "grad_norm": 0.054605753235731906, + "language_loss": 0.86558378, + "learning_rate": 2.2012893586322245e-05, + "loss": 0.87619674, + "num_input_tokens_seen": 390504736, + "router_z_loss_mlp": 0.22924805, + "step": 4720, + "time_per_iteration": 2.8445792198181152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060559, + "balance_loss_mlp": 1.03724146, + "epoch": 0.9082339361292805, + "flos": 597463838208.0, + "grad_norm": 0.055554833776014695, + "language_loss": 0.79743761, + "learning_rate": 2.1921564073563604e-05, + "loss": 0.80804324, + "num_input_tokens_seen": 390582048, + "router_z_loss_mlp": 0.23303223, + "step": 4721, + "time_per_iteration": 2.743527412414551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106695, + "balance_loss_mlp": 1.04333496, + "epoch": 0.9084263178145441, + "flos": 504407761920.0, + "grad_norm": 0.05842313627094751, + "language_loss": 0.84558177, + "learning_rate": 2.183042016731457e-05, + "loss": 0.85625124, + "num_input_tokens_seen": 390652976, + "router_z_loss_mlp": 0.23608398, + "step": 4722, + "time_per_iteration": 2.6498191356658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065647, + "balance_loss_mlp": 1.04142332, + "epoch": 0.9086186994998077, + "flos": 550031482368.0, + "grad_norm": 0.0722517857317367, + "language_loss": 0.80876398, + "learning_rate": 2.1739461902960223e-05, + "loss": 0.81942052, + "num_input_tokens_seen": 390726832, + "router_z_loss_mlp": 0.2421875, + "step": 4723, + "time_per_iteration": 2.6922495365142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061199, + "balance_loss_mlp": 1.03808427, + "epoch": 0.9088110811850711, + "flos": 1134076847616.0, + "grad_norm": 0.07453846323964182, + "language_loss": 0.75279224, + "learning_rate": 2.1648689315813763e-05, + "loss": 0.76340425, + "num_input_tokens_seen": 390824480, + "router_z_loss_mlp": 0.23120117, + "step": 4724, + "time_per_iteration": 3.5442843437194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068171, + "balance_loss_mlp": 1.04486537, + "epoch": 0.9090034628703347, + "flos": 556991811072.0, + "grad_norm": 0.052486424573562215, + "language_loss": 0.7726813, + "learning_rate": 2.155810244111628e-05, + "loss": 0.78336298, + "num_input_tokens_seen": 390897552, + "router_z_loss_mlp": 0.23291016, + "step": 4725, + "time_per_iteration": 2.712711811065674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064643, + "balance_loss_mlp": 1.04276848, + "epoch": 0.9091958445555983, + "flos": 543970515456.0, + "grad_norm": 0.05499523521606461, + "language_loss": 0.84340453, + "learning_rate": 2.146770131403658e-05, + "loss": 0.85405099, + "num_input_tokens_seen": 390969008, + "router_z_loss_mlp": 0.21887207, + "step": 4726, + "time_per_iteration": 2.6923320293426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061768, + "balance_loss_mlp": 1.03845048, + "epoch": 0.9093882262408619, + "flos": 526113957888.0, + "grad_norm": 0.06073661706231448, + "language_loss": 0.81389105, + "learning_rate": 2.1377485969671594e-05, + "loss": 0.82450879, + "num_input_tokens_seen": 391038880, + "router_z_loss_mlp": 0.23291016, + "step": 4727, + "time_per_iteration": 2.64753794670105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105825, + "balance_loss_mlp": 1.03569603, + "epoch": 0.9095806079261254, + "flos": 548526795264.0, + "grad_norm": 0.06249947312490014, + "language_loss": 0.82006919, + "learning_rate": 2.1287456443046084e-05, + "loss": 0.83065176, + "num_input_tokens_seen": 391106720, + "router_z_loss_mlp": 0.22558594, + "step": 4728, + "time_per_iteration": 2.6654014587402344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061076, + "balance_loss_mlp": 1.03806877, + "epoch": 0.909772989611389, + "flos": 572535724032.0, + "grad_norm": 0.06570062173363597, + "language_loss": 0.84565747, + "learning_rate": 2.1197612769112528e-05, + "loss": 0.85626823, + "num_input_tokens_seen": 391178128, + "router_z_loss_mlp": 0.23010254, + "step": 4729, + "time_per_iteration": 2.7289013862609863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063857, + "balance_loss_mlp": 1.04019403, + "epoch": 0.9099653712966526, + "flos": 561812391936.0, + "grad_norm": 0.09449980732831745, + "language_loss": 0.79957283, + "learning_rate": 2.1107954982751254e-05, + "loss": 0.81021142, + "num_input_tokens_seen": 391248848, + "router_z_loss_mlp": 0.23657227, + "step": 4730, + "time_per_iteration": 2.6817986965179443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059413, + "balance_loss_mlp": 1.03622675, + "epoch": 0.9101577529819161, + "flos": 1093800112128.0, + "grad_norm": 0.06465270600581106, + "language_loss": 0.80294889, + "learning_rate": 2.101848311877069e-05, + "loss": 0.81354308, + "num_input_tokens_seen": 391328000, + "router_z_loss_mlp": 0.23181152, + "step": 4731, + "time_per_iteration": 3.3533620834350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062594, + "balance_loss_mlp": 1.03854942, + "epoch": 0.9103501346671797, + "flos": 445444116480.0, + "grad_norm": 0.07092257188337395, + "language_loss": 0.82299185, + "learning_rate": 2.092919721190678e-05, + "loss": 0.83361781, + "num_input_tokens_seen": 391391616, + "router_z_loss_mlp": 0.24047852, + "step": 4732, + "time_per_iteration": 2.5148427486419678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062948, + "balance_loss_mlp": 1.0397495, + "epoch": 0.9105425163524432, + "flos": 500770667520.0, + "grad_norm": 0.06939969815016095, + "language_loss": 0.77697742, + "learning_rate": 2.0840097296823346e-05, + "loss": 0.78760689, + "num_input_tokens_seen": 391461312, + "router_z_loss_mlp": 0.23205566, + "step": 4733, + "time_per_iteration": 2.642012596130371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065924, + "balance_loss_mlp": 1.04153371, + "epoch": 0.9107348980377068, + "flos": 657519565824.0, + "grad_norm": 0.05133792278830391, + "language_loss": 0.84146416, + "learning_rate": 2.0751183408112162e-05, + "loss": 0.85212338, + "num_input_tokens_seen": 391542192, + "router_z_loss_mlp": 0.24377441, + "step": 4734, + "time_per_iteration": 2.835914134979248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106081, + "balance_loss_mlp": 1.03760004, + "epoch": 0.9109272797229704, + "flos": 553668576768.0, + "grad_norm": 0.06017445615271462, + "language_loss": 0.84846371, + "learning_rate": 2.066245558029256e-05, + "loss": 0.85907179, + "num_input_tokens_seen": 391609968, + "router_z_loss_mlp": 0.23205566, + "step": 4735, + "time_per_iteration": 2.6190714836120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066064, + "balance_loss_mlp": 1.04261589, + "epoch": 0.911119661408234, + "flos": 519007896576.0, + "grad_norm": 0.06548686029452257, + "language_loss": 0.83979481, + "learning_rate": 2.057391384781182e-05, + "loss": 0.85045546, + "num_input_tokens_seen": 391681264, + "router_z_loss_mlp": 0.23449707, + "step": 4736, + "time_per_iteration": 2.619947910308838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064319, + "balance_loss_mlp": 1.04119277, + "epoch": 0.9113120430934974, + "flos": 554375218176.0, + "grad_norm": 0.06468079710264198, + "language_loss": 0.8364414, + "learning_rate": 2.0485558245044834e-05, + "loss": 0.84708458, + "num_input_tokens_seen": 391751392, + "router_z_loss_mlp": 0.23144531, + "step": 4737, + "time_per_iteration": 2.673065662384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106403, + "balance_loss_mlp": 1.04109359, + "epoch": 0.911504424778761, + "flos": 501889913856.0, + "grad_norm": 0.06603092013772342, + "language_loss": 0.81557083, + "learning_rate": 2.0397388806294216e-05, + "loss": 0.82621109, + "num_input_tokens_seen": 391823952, + "router_z_loss_mlp": 0.22937012, + "step": 4738, + "time_per_iteration": 2.6410467624664307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068098, + "balance_loss_mlp": 1.04519773, + "epoch": 0.9116968064640246, + "flos": 611100370944.0, + "grad_norm": 0.0574544512840337, + "language_loss": 0.82125366, + "learning_rate": 2.0309405565790527e-05, + "loss": 0.83193469, + "num_input_tokens_seen": 391895264, + "router_z_loss_mlp": 0.22888184, + "step": 4739, + "time_per_iteration": 2.7088229656219482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061118, + "balance_loss_mlp": 1.03840876, + "epoch": 0.9118891881492882, + "flos": 572918593536.0, + "grad_norm": 0.06015748588292996, + "language_loss": 0.82474881, + "learning_rate": 2.0221608557691895e-05, + "loss": 0.83536005, + "num_input_tokens_seen": 391973040, + "router_z_loss_mlp": 0.22717285, + "step": 4740, + "time_per_iteration": 2.7565977573394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106216, + "balance_loss_mlp": 1.03874695, + "epoch": 0.9120815698345518, + "flos": 635961673728.0, + "grad_norm": 0.0683916463523955, + "language_loss": 0.78226697, + "learning_rate": 2.0133997816083992e-05, + "loss": 0.79288852, + "num_input_tokens_seen": 392048160, + "router_z_loss_mlp": 0.23400879, + "step": 4741, + "time_per_iteration": 2.827404499053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059165, + "balance_loss_mlp": 1.0367775, + "epoch": 0.9122739515198153, + "flos": 702300824064.0, + "grad_norm": 0.07740625754984397, + "language_loss": 0.86271739, + "learning_rate": 2.0046573374980447e-05, + "loss": 0.87330902, + "num_input_tokens_seen": 392128960, + "router_z_loss_mlp": 0.22387695, + "step": 4742, + "time_per_iteration": 2.874316692352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067284, + "balance_loss_mlp": 1.04440832, + "epoch": 0.9124663332050789, + "flos": 524690763264.0, + "grad_norm": 0.07197983695129517, + "language_loss": 0.87550807, + "learning_rate": 1.995933526832239e-05, + "loss": 0.88618088, + "num_input_tokens_seen": 392195008, + "router_z_loss_mlp": 0.2286377, + "step": 4743, + "time_per_iteration": 2.7014012336730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064508, + "balance_loss_mlp": 1.04181051, + "epoch": 0.9126587148903424, + "flos": 563299826688.0, + "grad_norm": 0.056151781575863396, + "language_loss": 0.82793915, + "learning_rate": 1.9872283529978662e-05, + "loss": 0.83858418, + "num_input_tokens_seen": 392265168, + "router_z_loss_mlp": 0.22705078, + "step": 4744, + "time_per_iteration": 2.688795566558838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062409, + "balance_loss_mlp": 1.03963971, + "epoch": 0.912851096575606, + "flos": 505942184448.0, + "grad_norm": 0.06468850239318244, + "language_loss": 0.79841912, + "learning_rate": 1.978541819374574e-05, + "loss": 0.80904323, + "num_input_tokens_seen": 392329456, + "router_z_loss_mlp": 0.22766113, + "step": 4745, + "time_per_iteration": 2.5829083919525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066216, + "balance_loss_mlp": 1.04246938, + "epoch": 0.9130434782608695, + "flos": 550730783232.0, + "grad_norm": 0.06526804508187888, + "language_loss": 0.82502508, + "learning_rate": 1.9698739293347755e-05, + "loss": 0.83568728, + "num_input_tokens_seen": 392397792, + "router_z_loss_mlp": 0.23718262, + "step": 4746, + "time_per_iteration": 2.6205508708953857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106124, + "balance_loss_mlp": 1.03875685, + "epoch": 0.9132358599461331, + "flos": 468976200192.0, + "grad_norm": 0.072585776540836, + "language_loss": 0.83583778, + "learning_rate": 1.9612246862436456e-05, + "loss": 0.84645015, + "num_input_tokens_seen": 392462928, + "router_z_loss_mlp": 0.22473145, + "step": 4747, + "time_per_iteration": 2.5356056690216064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062146, + "balance_loss_mlp": 1.03998518, + "epoch": 0.9134282416313967, + "flos": 506097828864.0, + "grad_norm": 0.06194777952555315, + "language_loss": 0.80193496, + "learning_rate": 1.9525940934591148e-05, + "loss": 0.81255639, + "num_input_tokens_seen": 392531840, + "router_z_loss_mlp": 0.22167969, + "step": 4748, + "time_per_iteration": 2.6417837142944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062825, + "balance_loss_mlp": 1.03936434, + "epoch": 0.9136206233166603, + "flos": 604819519488.0, + "grad_norm": 0.06274709516347185, + "language_loss": 0.84114301, + "learning_rate": 1.9439821543318748e-05, + "loss": 0.85177124, + "num_input_tokens_seen": 392602464, + "router_z_loss_mlp": 0.23461914, + "step": 4749, + "time_per_iteration": 2.7290050983428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065104, + "balance_loss_mlp": 1.04191816, + "epoch": 0.9138130050019239, + "flos": 561738240000.0, + "grad_norm": 0.05949272956621823, + "language_loss": 0.83238935, + "learning_rate": 1.9353888722053793e-05, + "loss": 0.84304041, + "num_input_tokens_seen": 392669872, + "router_z_loss_mlp": 0.23181152, + "step": 4750, + "time_per_iteration": 2.6649882793426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065722, + "balance_loss_mlp": 1.04198802, + "epoch": 0.9140053866871873, + "flos": 690117221376.0, + "grad_norm": 0.051828523329950985, + "language_loss": 0.90372276, + "learning_rate": 1.9268142504158426e-05, + "loss": 0.91437995, + "num_input_tokens_seen": 392744256, + "router_z_loss_mlp": 0.23742676, + "step": 4751, + "time_per_iteration": 2.8122756481170654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063116, + "balance_loss_mlp": 1.04025221, + "epoch": 0.9141977683724509, + "flos": 551012336640.0, + "grad_norm": 0.05513697084655646, + "language_loss": 0.84049332, + "learning_rate": 1.9182582922922186e-05, + "loss": 0.85112453, + "num_input_tokens_seen": 392816832, + "router_z_loss_mlp": 0.22851562, + "step": 4752, + "time_per_iteration": 2.690352439880371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065089, + "balance_loss_mlp": 1.04072297, + "epoch": 0.9143901500577145, + "flos": 540088943616.0, + "grad_norm": 0.059079736183717625, + "language_loss": 0.75681716, + "learning_rate": 1.9097210011562228e-05, + "loss": 0.76746798, + "num_input_tokens_seen": 392886304, + "router_z_loss_mlp": 0.24353027, + "step": 4753, + "time_per_iteration": 2.6606411933898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062336, + "balance_loss_mlp": 1.03876805, + "epoch": 0.9145825317429781, + "flos": 528767626752.0, + "grad_norm": 0.06000540619598026, + "language_loss": 0.81170249, + "learning_rate": 1.9012023803223366e-05, + "loss": 0.82232583, + "num_input_tokens_seen": 392955872, + "router_z_loss_mlp": 0.23547363, + "step": 4754, + "time_per_iteration": 2.6054999828338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069817, + "balance_loss_mlp": 1.04665422, + "epoch": 0.9147749134282416, + "flos": 514792641024.0, + "grad_norm": 0.060337107580878784, + "language_loss": 0.79316139, + "learning_rate": 1.892702433097776e-05, + "loss": 0.80385947, + "num_input_tokens_seen": 393025776, + "router_z_loss_mlp": 0.23156738, + "step": 4755, + "time_per_iteration": 2.69866943359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106264, + "balance_loss_mlp": 1.04012132, + "epoch": 0.9149672951135052, + "flos": 514441704960.0, + "grad_norm": 0.057247927056212296, + "language_loss": 0.85906756, + "learning_rate": 1.8842211627825233e-05, + "loss": 0.86969399, + "num_input_tokens_seen": 393095936, + "router_z_loss_mlp": 0.22521973, + "step": 4756, + "time_per_iteration": 2.657858371734619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062369, + "balance_loss_mlp": 1.03919435, + "epoch": 0.9151596767987688, + "flos": 577069608960.0, + "grad_norm": 0.05578816097399521, + "language_loss": 0.82000184, + "learning_rate": 1.8757585726692727e-05, + "loss": 0.83062547, + "num_input_tokens_seen": 393166816, + "router_z_loss_mlp": 0.23144531, + "step": 4757, + "time_per_iteration": 2.7571098804473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106101, + "balance_loss_mlp": 1.03912318, + "epoch": 0.9153520584840323, + "flos": 619335590400.0, + "grad_norm": 0.055386707706292025, + "language_loss": 0.82802898, + "learning_rate": 1.8673146660435182e-05, + "loss": 0.83863914, + "num_input_tokens_seen": 393242176, + "router_z_loss_mlp": 0.21911621, + "step": 4758, + "time_per_iteration": 2.746701240539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066355, + "balance_loss_mlp": 1.04191732, + "epoch": 0.9155444401692959, + "flos": 468921871872.0, + "grad_norm": 0.055003918587598934, + "language_loss": 0.82849371, + "learning_rate": 1.8588894461834704e-05, + "loss": 0.83915728, + "num_input_tokens_seen": 393311792, + "router_z_loss_mlp": 0.2442627, + "step": 4759, + "time_per_iteration": 2.599597454071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01000937, + "balance_loss_mlp": 0.99421328, + "epoch": 0.9157368218545594, + "flos": 1410711054336.0, + "grad_norm": 0.006985610958737596, + "language_loss": 0.7481907, + "learning_rate": 1.8504829163600855e-05, + "loss": 0.75820005, + "num_input_tokens_seen": 393535648, + "router_z_loss_mlp": 0.06738281, + "step": 4760, + "time_per_iteration": 4.837120294570923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01000943, + "balance_loss_mlp": 0.99421966, + "epoch": 0.915929203539823, + "flos": 1522019040768.0, + "grad_norm": 0.006987645393502265, + "language_loss": 0.79576051, + "learning_rate": 1.8420950798370584e-05, + "loss": 0.80576992, + "num_input_tokens_seen": 393767040, + "router_z_loss_mlp": 0.06738281, + "step": 4761, + "time_per_iteration": 4.932307720184326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062464, + "balance_loss_mlp": 1.03865767, + "epoch": 0.9161215852250866, + "flos": 535752548352.0, + "grad_norm": 0.06103904821656309, + "language_loss": 0.80316961, + "learning_rate": 1.8337259398708616e-05, + "loss": 0.81379426, + "num_input_tokens_seen": 393841232, + "router_z_loss_mlp": 0.23803711, + "step": 4762, + "time_per_iteration": 2.7238500118255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066667, + "balance_loss_mlp": 1.04348063, + "epoch": 0.9163139669103502, + "flos": 590624649216.0, + "grad_norm": 0.057935792953032535, + "language_loss": 0.80923164, + "learning_rate": 1.8253754997106632e-05, + "loss": 0.81989825, + "num_input_tokens_seen": 393910512, + "router_z_loss_mlp": 0.23181152, + "step": 4763, + "time_per_iteration": 2.7679901123046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063834, + "balance_loss_mlp": 1.04057622, + "epoch": 0.9165063485956138, + "flos": 821975081472.0, + "grad_norm": 0.051758643461764085, + "language_loss": 0.84645653, + "learning_rate": 1.817043762598397e-05, + "loss": 0.85709482, + "num_input_tokens_seen": 393988624, + "router_z_loss_mlp": 0.23254395, + "step": 4764, + "time_per_iteration": 3.0768322944641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105787, + "balance_loss_mlp": 1.03519642, + "epoch": 0.9166987302808772, + "flos": 525194772480.0, + "grad_norm": 0.05556449671285437, + "language_loss": 0.8234725, + "learning_rate": 1.8087307317687264e-05, + "loss": 0.83405113, + "num_input_tokens_seen": 394059184, + "router_z_loss_mlp": 0.22668457, + "step": 4765, + "time_per_iteration": 2.660769462585449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062896, + "balance_loss_mlp": 1.03945971, + "epoch": 0.9168911119661408, + "flos": 655095693312.0, + "grad_norm": 0.049564714523440044, + "language_loss": 0.84340852, + "learning_rate": 1.800436410449058e-05, + "loss": 0.85403752, + "num_input_tokens_seen": 394142160, + "router_z_loss_mlp": 0.23425293, + "step": 4766, + "time_per_iteration": 2.9556195735931396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063242, + "balance_loss_mlp": 1.03998423, + "epoch": 0.9170834936514044, + "flos": 491747314176.0, + "grad_norm": 0.07059908264734456, + "language_loss": 0.852781, + "learning_rate": 1.7921608018595436e-05, + "loss": 0.86341345, + "num_input_tokens_seen": 394207056, + "router_z_loss_mlp": 0.2322998, + "step": 4767, + "time_per_iteration": 2.568305730819702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065741, + "balance_loss_mlp": 1.04228079, + "epoch": 0.917275875336668, + "flos": 628040314368.0, + "grad_norm": 0.0724753532377933, + "language_loss": 0.80676091, + "learning_rate": 1.7839039092130415e-05, + "loss": 0.81741834, + "num_input_tokens_seen": 394275456, + "router_z_loss_mlp": 0.234375, + "step": 4768, + "time_per_iteration": 2.8312792778015137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01001118, + "balance_loss_mlp": 0.99439502, + "epoch": 0.9174682570219315, + "flos": 1517981824512.0, + "grad_norm": 0.00699799346191008, + "language_loss": 0.78180236, + "learning_rate": 1.7756657357151762e-05, + "loss": 0.79181355, + "num_input_tokens_seen": 394503808, + "router_z_loss_mlp": 0.06738281, + "step": 4769, + "time_per_iteration": 4.91847825050354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106499, + "balance_loss_mlp": 1.04138637, + "epoch": 0.917660638707195, + "flos": 560021008896.0, + "grad_norm": 0.06313513734114481, + "language_loss": 0.85138547, + "learning_rate": 1.7674462845642835e-05, + "loss": 0.86203539, + "num_input_tokens_seen": 394573776, + "router_z_loss_mlp": 0.23608398, + "step": 4770, + "time_per_iteration": 2.701441764831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065065, + "balance_loss_mlp": 1.04179573, + "epoch": 0.9178530203924586, + "flos": 447252751872.0, + "grad_norm": 0.05958541802971449, + "language_loss": 0.83878446, + "learning_rate": 1.7592455589514387e-05, + "loss": 0.84943509, + "num_input_tokens_seen": 394637600, + "router_z_loss_mlp": 0.23242188, + "step": 4771, + "time_per_iteration": 2.461113691329956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061371, + "balance_loss_mlp": 1.03884006, + "epoch": 0.9180454020777222, + "flos": 465981507072.0, + "grad_norm": 0.055111772688767, + "language_loss": 0.8086372, + "learning_rate": 1.7510635620604453e-05, + "loss": 0.81925088, + "num_input_tokens_seen": 394707344, + "router_z_loss_mlp": 0.2253418, + "step": 4772, + "time_per_iteration": 2.603609561920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063308, + "balance_loss_mlp": 1.04161239, + "epoch": 0.9182377837629858, + "flos": 596314856448.0, + "grad_norm": 0.05471349144669564, + "language_loss": 0.87369776, + "learning_rate": 1.74290029706784e-05, + "loss": 0.88433087, + "num_input_tokens_seen": 394786368, + "router_z_loss_mlp": 0.21728516, + "step": 4773, + "time_per_iteration": 2.780123233795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065111, + "balance_loss_mlp": 1.04175746, + "epoch": 0.9184301654482493, + "flos": 996671941632.0, + "grad_norm": 0.04652666616854213, + "language_loss": 0.82479548, + "learning_rate": 1.734755767142876e-05, + "loss": 0.8354466, + "num_input_tokens_seen": 394876976, + "router_z_loss_mlp": 0.23352051, + "step": 4774, + "time_per_iteration": 3.391968011856079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065291, + "balance_loss_mlp": 1.04208064, + "epoch": 0.9186225471335129, + "flos": 508860154368.0, + "grad_norm": 0.04816843723354433, + "language_loss": 0.8497929, + "learning_rate": 1.7266299754475467e-05, + "loss": 0.8604458, + "num_input_tokens_seen": 394949024, + "router_z_loss_mlp": 0.23242188, + "step": 4775, + "time_per_iteration": 2.6323952674865723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065856, + "balance_loss_mlp": 1.04232407, + "epoch": 0.9188149288187765, + "flos": 940423633920.0, + "grad_norm": 0.05644006303487493, + "language_loss": 0.79136419, + "learning_rate": 1.718522925136551e-05, + "loss": 0.8020227, + "num_input_tokens_seen": 395044352, + "router_z_loss_mlp": 0.23535156, + "step": 4776, + "time_per_iteration": 3.336750030517578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057658, + "balance_loss_mlp": 1.03536582, + "epoch": 0.91900731050404, + "flos": 583674232320.0, + "grad_norm": 0.05731866001862462, + "language_loss": 0.84241879, + "learning_rate": 1.7104346193573484e-05, + "loss": 0.8529954, + "num_input_tokens_seen": 395113824, + "router_z_loss_mlp": 0.22302246, + "step": 4777, + "time_per_iteration": 2.6611621379852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063108, + "balance_loss_mlp": 1.03984988, + "epoch": 0.9191996921893035, + "flos": 581213283840.0, + "grad_norm": 0.07522007256153451, + "language_loss": 0.7975679, + "learning_rate": 1.7023650612500828e-05, + "loss": 0.80819893, + "num_input_tokens_seen": 395184496, + "router_z_loss_mlp": 0.23242188, + "step": 4778, + "time_per_iteration": 2.712597370147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063497, + "balance_loss_mlp": 1.04007208, + "epoch": 0.9193920738745671, + "flos": 908935686144.0, + "grad_norm": 0.08774646197038768, + "language_loss": 0.80187458, + "learning_rate": 1.6943142539476374e-05, + "loss": 0.81250954, + "num_input_tokens_seen": 395263760, + "router_z_loss_mlp": 0.23413086, + "step": 4779, + "time_per_iteration": 3.1365673542022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01000653, + "balance_loss_mlp": 0.99392956, + "epoch": 0.9195844555598307, + "flos": 1558372359168.0, + "grad_norm": 0.007681036007533729, + "language_loss": 0.79795396, + "learning_rate": 1.686282200575606e-05, + "loss": 0.80796051, + "num_input_tokens_seen": 395482384, + "router_z_loss_mlp": 0.06738281, + "step": 4780, + "time_per_iteration": 4.7075746059417725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064217, + "balance_loss_mlp": 1.04043531, + "epoch": 0.9197768372450943, + "flos": 474053741568.0, + "grad_norm": 0.06999993636245827, + "language_loss": 0.78756964, + "learning_rate": 1.678268904252317e-05, + "loss": 0.79821181, + "num_input_tokens_seen": 395550384, + "router_z_loss_mlp": 0.23791504, + "step": 4781, + "time_per_iteration": 2.5254733562469482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065513, + "balance_loss_mlp": 1.0416832, + "epoch": 0.9199692189303579, + "flos": 857016059904.0, + "grad_norm": 0.06498458366260695, + "language_loss": 0.84505671, + "learning_rate": 1.6702743680888088e-05, + "loss": 0.85571182, + "num_input_tokens_seen": 395632320, + "router_z_loss_mlp": 0.23840332, + "step": 4782, + "time_per_iteration": 3.2329697608947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064654, + "balance_loss_mlp": 1.04155171, + "epoch": 0.9201616006156214, + "flos": 504390509568.0, + "grad_norm": 0.060298368895520246, + "language_loss": 0.77588093, + "learning_rate": 1.6622985951888327e-05, + "loss": 0.78652751, + "num_input_tokens_seen": 395703856, + "router_z_loss_mlp": 0.23071289, + "step": 4783, + "time_per_iteration": 2.6278817653656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106489, + "balance_loss_mlp": 1.04122746, + "epoch": 0.9203539823008849, + "flos": 548781184512.0, + "grad_norm": 0.12392133111659505, + "language_loss": 0.84999621, + "learning_rate": 1.6543415886488554e-05, + "loss": 0.86064506, + "num_input_tokens_seen": 395779456, + "router_z_loss_mlp": 0.2364502, + "step": 4784, + "time_per_iteration": 2.7395739555358887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065218, + "balance_loss_mlp": 1.04228246, + "epoch": 0.9205463639861485, + "flos": 540004879872.0, + "grad_norm": 0.07902513425092665, + "language_loss": 0.82629323, + "learning_rate": 1.6464033515580624e-05, + "loss": 0.83694541, + "num_input_tokens_seen": 395849584, + "router_z_loss_mlp": 0.22912598, + "step": 4785, + "time_per_iteration": 2.6370961666107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064652, + "balance_loss_mlp": 1.04096556, + "epoch": 0.9207387456714121, + "flos": 799725229056.0, + "grad_norm": 0.05792536374091078, + "language_loss": 0.78340071, + "learning_rate": 1.6384838869983488e-05, + "loss": 0.79404724, + "num_input_tokens_seen": 395943712, + "router_z_loss_mlp": 0.23681641, + "step": 4786, + "time_per_iteration": 3.0584778785705566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065903, + "balance_loss_mlp": 1.04258537, + "epoch": 0.9209311273566756, + "flos": 502848746496.0, + "grad_norm": 0.058744664162000707, + "language_loss": 0.78801596, + "learning_rate": 1.630583198044333e-05, + "loss": 0.798675, + "num_input_tokens_seen": 396013168, + "router_z_loss_mlp": 0.23303223, + "step": 4787, + "time_per_iteration": 2.7021265029907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063265, + "balance_loss_mlp": 1.04010296, + "epoch": 0.9211235090419392, + "flos": 569323717632.0, + "grad_norm": 0.056492166982787126, + "language_loss": 0.82674456, + "learning_rate": 1.6227012877633173e-05, + "loss": 0.83737719, + "num_input_tokens_seen": 396082032, + "router_z_loss_mlp": 0.23168945, + "step": 4788, + "time_per_iteration": 2.7052977085113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064131, + "balance_loss_mlp": 1.04143345, + "epoch": 0.9213158907272028, + "flos": 806549736960.0, + "grad_norm": 0.07226120806501657, + "language_loss": 0.82702368, + "learning_rate": 1.6148381592153538e-05, + "loss": 0.83766496, + "num_input_tokens_seen": 396157984, + "router_z_loss_mlp": 0.22717285, + "step": 4789, + "time_per_iteration": 2.9990592002868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065389, + "balance_loss_mlp": 1.04177368, + "epoch": 0.9215082724124664, + "flos": 490682396160.0, + "grad_norm": 0.05541559131448945, + "language_loss": 0.76416653, + "learning_rate": 1.6069938154531618e-05, + "loss": 0.77482045, + "num_input_tokens_seen": 396223840, + "router_z_loss_mlp": 0.23632812, + "step": 4790, + "time_per_iteration": 2.5554823875427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01002062, + "balance_loss_mlp": 0.99533838, + "epoch": 0.9217006540977299, + "flos": 1514495232000.0, + "grad_norm": 0.006305010355530082, + "language_loss": 0.77070266, + "learning_rate": 1.599168259522188e-05, + "loss": 0.78072327, + "num_input_tokens_seen": 396458288, + "router_z_loss_mlp": 0.06738281, + "step": 4791, + "time_per_iteration": 5.046639442443848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106511, + "balance_loss_mlp": 1.04191208, + "epoch": 0.9218930357829934, + "flos": 743793352704.0, + "grad_norm": 0.05641676981244117, + "language_loss": 0.76365674, + "learning_rate": 1.5913614944605804e-05, + "loss": 0.77430785, + "num_input_tokens_seen": 396536208, + "router_z_loss_mlp": 0.23181152, + "step": 4792, + "time_per_iteration": 2.9390647411346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061659, + "balance_loss_mlp": 1.03788924, + "epoch": 0.922085417468257, + "flos": 453036934656.0, + "grad_norm": 0.06341496172003151, + "language_loss": 0.80764413, + "learning_rate": 1.5835735232992032e-05, + "loss": 0.81826079, + "num_input_tokens_seen": 396599984, + "router_z_loss_mlp": 0.23742676, + "step": 4793, + "time_per_iteration": 2.519322633743286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059745, + "balance_loss_mlp": 1.03674984, + "epoch": 0.9222777991535206, + "flos": 500249405952.0, + "grad_norm": 0.062344579720582986, + "language_loss": 0.85091174, + "learning_rate": 1.575804349061616e-05, + "loss": 0.8615092, + "num_input_tokens_seen": 396664592, + "router_z_loss_mlp": 0.2298584, + "step": 4794, + "time_per_iteration": 2.595057249069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064993, + "balance_loss_mlp": 1.04115093, + "epoch": 0.9224701808387842, + "flos": 527959669248.0, + "grad_norm": 0.06403867116354088, + "language_loss": 0.78786629, + "learning_rate": 1.5680539747640722e-05, + "loss": 0.79851627, + "num_input_tokens_seen": 396729472, + "router_z_loss_mlp": 0.23864746, + "step": 4795, + "time_per_iteration": 2.566840887069702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066799, + "balance_loss_mlp": 1.04436338, + "epoch": 0.9226625625240477, + "flos": 874640623104.0, + "grad_norm": 0.06236913833267484, + "language_loss": 0.75503278, + "learning_rate": 1.5603224034155315e-05, + "loss": 0.76570076, + "num_input_tokens_seen": 396810384, + "router_z_loss_mlp": 0.2244873, + "step": 4796, + "time_per_iteration": 3.1768314838409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068962, + "balance_loss_mlp": 1.04589522, + "epoch": 0.9228549442093112, + "flos": 502774594560.0, + "grad_norm": 0.09209245774607971, + "language_loss": 0.88015127, + "learning_rate": 1.5526096380176657e-05, + "loss": 0.89084095, + "num_input_tokens_seen": 396875472, + "router_z_loss_mlp": 0.23059082, + "step": 4797, + "time_per_iteration": 2.5515801906585693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067427, + "balance_loss_mlp": 1.04427648, + "epoch": 0.9230473258945748, + "flos": 599989026816.0, + "grad_norm": 0.062149405212805944, + "language_loss": 0.85267746, + "learning_rate": 1.544915681564829e-05, + "loss": 0.8633517, + "num_input_tokens_seen": 396949888, + "router_z_loss_mlp": 0.23156738, + "step": 4798, + "time_per_iteration": 2.7861833572387695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060939, + "balance_loss_mlp": 1.03778911, + "epoch": 0.9232397075798384, + "flos": 822508826112.0, + "grad_norm": 0.06277805502935233, + "language_loss": 0.79576015, + "learning_rate": 1.5372405370440822e-05, + "loss": 0.80636954, + "num_input_tokens_seen": 397027504, + "router_z_loss_mlp": 0.23132324, + "step": 4799, + "time_per_iteration": 3.07875394821167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067703, + "balance_loss_mlp": 1.04437423, + "epoch": 0.923432089265102, + "flos": 707030000640.0, + "grad_norm": 0.06720277789755563, + "language_loss": 0.84717023, + "learning_rate": 1.5295842074351805e-05, + "loss": 0.85784721, + "num_input_tokens_seen": 397101600, + "router_z_loss_mlp": 0.2331543, + "step": 4800, + "time_per_iteration": 2.8867857456207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062759, + "balance_loss_mlp": 1.03977597, + "epoch": 0.9236244709503655, + "flos": 701861054976.0, + "grad_norm": 0.06232849747601707, + "language_loss": 0.76892114, + "learning_rate": 1.5219466957105798e-05, + "loss": 0.77954876, + "num_input_tokens_seen": 397170880, + "router_z_loss_mlp": 0.22973633, + "step": 4801, + "time_per_iteration": 2.8227624893188477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106451, + "balance_loss_mlp": 1.0407877, + "epoch": 0.9238168526356291, + "flos": 515039689728.0, + "grad_norm": 0.052686499610644186, + "language_loss": 0.84125519, + "learning_rate": 1.5143280048354136e-05, + "loss": 0.85190028, + "num_input_tokens_seen": 397242272, + "router_z_loss_mlp": 0.23706055, + "step": 4802, + "time_per_iteration": 2.60945725440979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066059, + "balance_loss_mlp": 1.04199064, + "epoch": 0.9240092343208927, + "flos": 492024098304.0, + "grad_norm": 0.07585539209620575, + "language_loss": 0.81734359, + "learning_rate": 1.5067281377675213e-05, + "loss": 0.82800424, + "num_input_tokens_seen": 397308032, + "router_z_loss_mlp": 0.24072266, + "step": 4803, + "time_per_iteration": 2.5775294303894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064809, + "balance_loss_mlp": 1.04138434, + "epoch": 0.9242016160061562, + "flos": 647218750464.0, + "grad_norm": 0.05855132746903116, + "language_loss": 0.74013615, + "learning_rate": 1.4991470974574484e-05, + "loss": 0.75078428, + "num_input_tokens_seen": 397390944, + "router_z_loss_mlp": 0.234375, + "step": 4804, + "time_per_iteration": 2.8631527423858643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064495, + "balance_loss_mlp": 1.0423224, + "epoch": 0.9243939976914197, + "flos": 729430354944.0, + "grad_norm": 0.0571546365091257, + "language_loss": 0.79579568, + "learning_rate": 1.4915848868484016e-05, + "loss": 0.80644059, + "num_input_tokens_seen": 397468128, + "router_z_loss_mlp": 0.22167969, + "step": 4805, + "time_per_iteration": 2.956089496612549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064454, + "balance_loss_mlp": 1.04110074, + "epoch": 0.9245863793766833, + "flos": 452246229504.0, + "grad_norm": 0.060184319008054675, + "language_loss": 0.90989661, + "learning_rate": 1.4840415088763048e-05, + "loss": 0.92054117, + "num_input_tokens_seen": 397538976, + "router_z_loss_mlp": 0.23339844, + "step": 4806, + "time_per_iteration": 2.59251070022583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066465, + "balance_loss_mlp": 1.04270649, + "epoch": 0.9247787610619469, + "flos": 755030605824.0, + "grad_norm": 0.056769763464111264, + "language_loss": 0.77370489, + "learning_rate": 1.476516966469732e-05, + "loss": 0.78436947, + "num_input_tokens_seen": 397612944, + "router_z_loss_mlp": 0.23754883, + "step": 4807, + "time_per_iteration": 2.9537925720214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067303, + "balance_loss_mlp": 1.04301965, + "epoch": 0.9249711427472105, + "flos": 561928389120.0, + "grad_norm": 0.058387380742388154, + "language_loss": 0.85072255, + "learning_rate": 1.4690112625499908e-05, + "loss": 0.8613956, + "num_input_tokens_seen": 397690848, + "router_z_loss_mlp": 0.24267578, + "step": 4808, + "time_per_iteration": 2.76755952835083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063513, + "balance_loss_mlp": 1.03921795, + "epoch": 0.9251635244324741, + "flos": 526699459584.0, + "grad_norm": 0.061698884078940947, + "language_loss": 0.85564888, + "learning_rate": 1.4615244000310501e-05, + "loss": 0.86628401, + "num_input_tokens_seen": 397761008, + "router_z_loss_mlp": 0.24291992, + "step": 4809, + "time_per_iteration": 2.674180507659912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061235, + "balance_loss_mlp": 1.03777432, + "epoch": 0.9253559061177375, + "flos": 611280608256.0, + "grad_norm": 0.06228045172593029, + "language_loss": 0.79177982, + "learning_rate": 1.4540563818195685e-05, + "loss": 0.80239218, + "num_input_tokens_seen": 397840640, + "router_z_loss_mlp": 0.234375, + "step": 4810, + "time_per_iteration": 2.8329577445983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003033, + "balance_loss_mlp": 0.9963572, + "epoch": 0.9255482878030011, + "flos": 1551258957312.0, + "grad_norm": 0.005180533105187962, + "language_loss": 0.76925391, + "learning_rate": 1.446607210814882e-05, + "loss": 0.77928424, + "num_input_tokens_seen": 398060096, + "router_z_loss_mlp": 0.06689453, + "step": 4811, + "time_per_iteration": 4.778438329696655 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070987, + "balance_loss_mlp": 1.04688334, + "epoch": 0.9257406694882647, + "flos": 766366603776.0, + "grad_norm": 0.06467551917975638, + "language_loss": 0.81310445, + "learning_rate": 1.4391768899090219e-05, + "loss": 0.82381433, + "num_input_tokens_seen": 398143680, + "router_z_loss_mlp": 0.24084473, + "step": 4812, + "time_per_iteration": 3.0328586101531982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065422, + "balance_loss_mlp": 1.04172337, + "epoch": 0.9259330511735283, + "flos": 497991089664.0, + "grad_norm": 0.06532975975063249, + "language_loss": 0.83375204, + "learning_rate": 1.431765421986686e-05, + "loss": 0.84440625, + "num_input_tokens_seen": 398207056, + "router_z_loss_mlp": 0.23669434, + "step": 4813, + "time_per_iteration": 2.560148239135742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063752, + "balance_loss_mlp": 1.04049397, + "epoch": 0.9261254328587919, + "flos": 626874080256.0, + "grad_norm": 0.07028520883418425, + "language_loss": 0.79285073, + "learning_rate": 1.424372809925273e-05, + "loss": 0.80348825, + "num_input_tokens_seen": 398277472, + "router_z_loss_mlp": 0.23254395, + "step": 4814, + "time_per_iteration": 2.7584128379821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063691, + "balance_loss_mlp": 1.04060006, + "epoch": 0.9263178145440554, + "flos": 597382345728.0, + "grad_norm": 0.055695870259143146, + "language_loss": 0.85310483, + "learning_rate": 1.416999056594831e-05, + "loss": 0.8637417, + "num_input_tokens_seen": 398346544, + "router_z_loss_mlp": 0.23095703, + "step": 4815, + "time_per_iteration": 2.713562250137329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065101, + "balance_loss_mlp": 1.04192686, + "epoch": 0.926510196229319, + "flos": 388563319296.0, + "grad_norm": 0.060924009285229154, + "language_loss": 0.83667755, + "learning_rate": 1.4096441648581259e-05, + "loss": 0.84732854, + "num_input_tokens_seen": 398409344, + "router_z_loss_mlp": 0.23168945, + "step": 4816, + "time_per_iteration": 2.450631856918335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067046, + "balance_loss_mlp": 1.04412222, + "epoch": 0.9267025779145825, + "flos": 545798974464.0, + "grad_norm": 0.06638467341927559, + "language_loss": 0.84540778, + "learning_rate": 1.4023081375705737e-05, + "loss": 0.85607827, + "num_input_tokens_seen": 398478816, + "router_z_loss_mlp": 0.22900391, + "step": 4817, + "time_per_iteration": 2.6362295150756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065467, + "balance_loss_mlp": 1.04217339, + "epoch": 0.9268949595998461, + "flos": 499789813248.0, + "grad_norm": 0.06886658822302974, + "language_loss": 0.81961125, + "learning_rate": 1.3949909775802682e-05, + "loss": 0.83026588, + "num_input_tokens_seen": 398550384, + "router_z_loss_mlp": 0.23266602, + "step": 4818, + "time_per_iteration": 2.6441164016723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062045, + "balance_loss_mlp": 1.03836977, + "epoch": 0.9270873412851096, + "flos": 432828085248.0, + "grad_norm": 0.05488858492503732, + "language_loss": 0.82954144, + "learning_rate": 1.3876926877279817e-05, + "loss": 0.84016186, + "num_input_tokens_seen": 398620832, + "router_z_loss_mlp": 0.23669434, + "step": 4819, + "time_per_iteration": 2.62613844871521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062926, + "balance_loss_mlp": 1.03994298, + "epoch": 0.9272797229703732, + "flos": 466769640960.0, + "grad_norm": 0.05813716940805213, + "language_loss": 0.86372411, + "learning_rate": 1.380413270847164e-05, + "loss": 0.87435341, + "num_input_tokens_seen": 398689776, + "router_z_loss_mlp": 0.22973633, + "step": 4820, + "time_per_iteration": 2.588118553161621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062313, + "balance_loss_mlp": 1.03865016, + "epoch": 0.9274721046556368, + "flos": 704838122496.0, + "grad_norm": 0.08851698631196843, + "language_loss": 0.78835869, + "learning_rate": 1.373152729763938e-05, + "loss": 0.79898179, + "num_input_tokens_seen": 398775072, + "router_z_loss_mlp": 0.23632812, + "step": 4821, + "time_per_iteration": 3.013256788253784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003926, + "balance_loss_mlp": 0.99725056, + "epoch": 0.9276644863409004, + "flos": 1402255950336.0, + "grad_norm": 0.0043274702842664575, + "language_loss": 0.82380462, + "learning_rate": 1.3659110672970931e-05, + "loss": 0.83384389, + "num_input_tokens_seen": 399002016, + "router_z_loss_mlp": 0.06689453, + "step": 4822, + "time_per_iteration": 4.84527587890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063845, + "balance_loss_mlp": 1.04090953, + "epoch": 0.927856868026164, + "flos": 741722614272.0, + "grad_norm": 0.06082082219392593, + "language_loss": 0.80463547, + "learning_rate": 1.3586882862580917e-05, + "loss": 0.81527394, + "num_input_tokens_seen": 399085808, + "router_z_loss_mlp": 0.22937012, + "step": 4823, + "time_per_iteration": 3.0070438385009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067648, + "balance_loss_mlp": 1.04371095, + "epoch": 0.9280492497114274, + "flos": 412223883264.0, + "grad_norm": 0.07327384984258292, + "language_loss": 0.74385369, + "learning_rate": 1.3514843894510686e-05, + "loss": 0.75453013, + "num_input_tokens_seen": 399146768, + "router_z_loss_mlp": 0.23925781, + "step": 4824, + "time_per_iteration": 2.4685919284820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065774, + "balance_loss_mlp": 1.04250407, + "epoch": 0.928241631396691, + "flos": 646504768512.0, + "grad_norm": 0.0635930076131742, + "language_loss": 0.84229624, + "learning_rate": 1.3442993796728254e-05, + "loss": 0.85295397, + "num_input_tokens_seen": 399220192, + "router_z_loss_mlp": 0.23254395, + "step": 4825, + "time_per_iteration": 2.7666497230529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065352, + "balance_loss_mlp": 1.04148602, + "epoch": 0.9284340130819546, + "flos": 696855094272.0, + "grad_norm": 0.06956360391060491, + "language_loss": 0.80905682, + "learning_rate": 1.3371332597128249e-05, + "loss": 0.81971031, + "num_input_tokens_seen": 399300064, + "router_z_loss_mlp": 0.23852539, + "step": 4826, + "time_per_iteration": 2.9280033111572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067687, + "balance_loss_mlp": 1.0441432, + "epoch": 0.9286263947672182, + "flos": 759132062208.0, + "grad_norm": 0.0534141212700452, + "language_loss": 0.83933532, + "learning_rate": 1.3299860323532032e-05, + "loss": 0.85001218, + "num_input_tokens_seen": 399383200, + "router_z_loss_mlp": 0.23535156, + "step": 4827, + "time_per_iteration": 3.05427622795105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065524, + "balance_loss_mlp": 1.04190862, + "epoch": 0.9288187764524817, + "flos": 672823770624.0, + "grad_norm": 0.0517820493277977, + "language_loss": 0.8033545, + "learning_rate": 1.3228577003687681e-05, + "loss": 0.81400979, + "num_input_tokens_seen": 399466400, + "router_z_loss_mlp": 0.23632812, + "step": 4828, + "time_per_iteration": 2.95302152633667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065387, + "balance_loss_mlp": 1.0422368, + "epoch": 0.9290111581377453, + "flos": 500469290496.0, + "grad_norm": 0.06212228001767646, + "language_loss": 0.8410219, + "learning_rate": 1.3157482665269727e-05, + "loss": 0.85167575, + "num_input_tokens_seen": 399533504, + "router_z_loss_mlp": 0.23156738, + "step": 4829, + "time_per_iteration": 2.6186985969543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003756, + "balance_loss_mlp": 0.99707985, + "epoch": 0.9292035398230089, + "flos": 1563627566592.0, + "grad_norm": 0.004318231630907556, + "language_loss": 0.72122061, + "learning_rate": 1.3086577335879424e-05, + "loss": 0.73125815, + "num_input_tokens_seen": 399769872, + "router_z_loss_mlp": 0.06689453, + "step": 4830, + "time_per_iteration": 4.89445424079895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004264, + "balance_loss_mlp": 0.99758852, + "epoch": 0.9293959215082724, + "flos": 1518673411584.0, + "grad_norm": 0.00373636872499248, + "language_loss": 0.79511833, + "learning_rate": 1.3015861043044753e-05, + "loss": 0.805161, + "num_input_tokens_seen": 399997760, + "router_z_loss_mlp": 0.06689453, + "step": 4831, + "time_per_iteration": 4.8943281173706055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063576, + "balance_loss_mlp": 1.0404253, + "epoch": 0.929588303193536, + "flos": 557836844544.0, + "grad_norm": 0.07212229346213533, + "language_loss": 0.84529984, + "learning_rate": 1.2945333814220195e-05, + "loss": 0.85593563, + "num_input_tokens_seen": 400063872, + "router_z_loss_mlp": 0.23120117, + "step": 4832, + "time_per_iteration": 2.670252561569214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071068, + "balance_loss_mlp": 1.0462724, + "epoch": 0.9297806848787995, + "flos": 478580285952.0, + "grad_norm": 0.07862143445369829, + "language_loss": 0.80253655, + "learning_rate": 1.2874995676786905e-05, + "loss": 0.8132472, + "num_input_tokens_seen": 400126064, + "router_z_loss_mlp": 0.2479248, + "step": 4833, + "time_per_iteration": 2.5253655910491943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066312, + "balance_loss_mlp": 1.04317367, + "epoch": 0.9299730665640631, + "flos": 564537641472.0, + "grad_norm": 0.06499018101587725, + "language_loss": 0.80202889, + "learning_rate": 1.2804846658052372e-05, + "loss": 0.81269199, + "num_input_tokens_seen": 400201776, + "router_z_loss_mlp": 0.23144531, + "step": 4834, + "time_per_iteration": 2.754465341567993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067737, + "balance_loss_mlp": 1.04458618, + "epoch": 0.9301654482493267, + "flos": 560174082048.0, + "grad_norm": 0.059674157785584915, + "language_loss": 0.82694227, + "learning_rate": 1.2734886785251032e-05, + "loss": 0.83761966, + "num_input_tokens_seen": 400279504, + "router_z_loss_mlp": 0.23132324, + "step": 4835, + "time_per_iteration": 2.80916690826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003151, + "balance_loss_mlp": 0.9964751, + "epoch": 0.9303578299345903, + "flos": 1520096606208.0, + "grad_norm": 0.004416175426765907, + "language_loss": 0.76852441, + "learning_rate": 1.2665116085543715e-05, + "loss": 0.77855599, + "num_input_tokens_seen": 400514800, + "router_z_loss_mlp": 0.06689453, + "step": 4836, + "time_per_iteration": 5.02410364151001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063476, + "balance_loss_mlp": 1.04054022, + "epoch": 0.9305502116198537, + "flos": 530843134464.0, + "grad_norm": 0.0782967296120336, + "language_loss": 0.83038974, + "learning_rate": 1.2595534586017698e-05, + "loss": 0.84102452, + "num_input_tokens_seen": 400582640, + "router_z_loss_mlp": 0.22937012, + "step": 4837, + "time_per_iteration": 2.6249191761016846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062018, + "balance_loss_mlp": 1.03849769, + "epoch": 0.9307425933051173, + "flos": 474898775040.0, + "grad_norm": 0.06822826243671351, + "language_loss": 0.81786454, + "learning_rate": 1.2526142313686983e-05, + "loss": 0.82848471, + "num_input_tokens_seen": 400646912, + "router_z_loss_mlp": 0.23522949, + "step": 4838, + "time_per_iteration": 2.540736675262451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068071, + "balance_loss_mlp": 1.04443145, + "epoch": 0.9309349749903809, + "flos": 584892223488.0, + "grad_norm": 0.05350792740286597, + "language_loss": 0.86948222, + "learning_rate": 1.245693929549213e-05, + "loss": 0.88016295, + "num_input_tokens_seen": 400722128, + "router_z_loss_mlp": 0.23620605, + "step": 4839, + "time_per_iteration": 2.7612762451171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063218, + "balance_loss_mlp": 1.04080653, + "epoch": 0.9311273566756445, + "flos": 861666315264.0, + "grad_norm": 0.0537936621956177, + "language_loss": 0.76869768, + "learning_rate": 1.2387925558299984e-05, + "loss": 0.77932984, + "num_input_tokens_seen": 400801440, + "router_z_loss_mlp": 0.22412109, + "step": 4840, + "time_per_iteration": 3.152477502822876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070778, + "balance_loss_mlp": 1.0476867, + "epoch": 0.9313197383609081, + "flos": 548094366720.0, + "grad_norm": 0.06050694450577352, + "language_loss": 0.82516444, + "learning_rate": 1.231910112890411e-05, + "loss": 0.83587223, + "num_input_tokens_seen": 400873008, + "router_z_loss_mlp": 0.23083496, + "step": 4841, + "time_per_iteration": 2.6213271617889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069132, + "balance_loss_mlp": 1.04567182, + "epoch": 0.9315121200461716, + "flos": 468756315648.0, + "grad_norm": 0.07421356756847869, + "language_loss": 0.8145088, + "learning_rate": 1.2250466034024522e-05, + "loss": 0.82520008, + "num_input_tokens_seen": 400935328, + "router_z_loss_mlp": 0.234375, + "step": 4842, + "time_per_iteration": 2.519787549972534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066338, + "balance_loss_mlp": 1.04315162, + "epoch": 0.9317045017314352, + "flos": 417659701248.0, + "grad_norm": 0.06920394282735456, + "language_loss": 0.77784127, + "learning_rate": 1.2182020300307684e-05, + "loss": 0.78850466, + "num_input_tokens_seen": 401000720, + "router_z_loss_mlp": 0.23181152, + "step": 4843, + "time_per_iteration": 2.499638080596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065399, + "balance_loss_mlp": 1.04216504, + "epoch": 0.9318968834166987, + "flos": 540489065472.0, + "grad_norm": 0.05744852961978849, + "language_loss": 0.77309132, + "learning_rate": 1.2113763954326729e-05, + "loss": 0.78374529, + "num_input_tokens_seen": 401079664, + "router_z_loss_mlp": 0.23217773, + "step": 4844, + "time_per_iteration": 2.7709178924560547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066931, + "balance_loss_mlp": 1.04338753, + "epoch": 0.9320892651019623, + "flos": 521330452992.0, + "grad_norm": 0.05796617525488239, + "language_loss": 0.80614036, + "learning_rate": 1.2045697022581015e-05, + "loss": 0.81680971, + "num_input_tokens_seen": 401146160, + "router_z_loss_mlp": 0.23522949, + "step": 4845, + "time_per_iteration": 2.6631603240966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060552, + "balance_loss_mlp": 1.03811717, + "epoch": 0.9322816467872258, + "flos": 582072998400.0, + "grad_norm": 0.05607447986709382, + "language_loss": 0.80583751, + "learning_rate": 1.1977819531496348e-05, + "loss": 0.81644303, + "num_input_tokens_seen": 401223264, + "router_z_loss_mlp": 0.22424316, + "step": 4846, + "time_per_iteration": 2.787560224533081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062074, + "balance_loss_mlp": 1.03873289, + "epoch": 0.9324740284724894, + "flos": 484747338240.0, + "grad_norm": 0.06368063166693694, + "language_loss": 0.82036614, + "learning_rate": 1.191013150742537e-05, + "loss": 0.83098686, + "num_input_tokens_seen": 401296368, + "router_z_loss_mlp": 0.2331543, + "step": 4847, + "time_per_iteration": 2.7033135890960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067437, + "balance_loss_mlp": 1.04408431, + "epoch": 0.932666410157753, + "flos": 732585461760.0, + "grad_norm": 0.061109195979706835, + "language_loss": 0.82817626, + "learning_rate": 1.1842632976646672e-05, + "loss": 0.83885062, + "num_input_tokens_seen": 401383936, + "router_z_loss_mlp": 0.23352051, + "step": 4848, + "time_per_iteration": 3.0654428005218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065268, + "balance_loss_mlp": 1.04224896, + "epoch": 0.9328587918430166, + "flos": 965537127936.0, + "grad_norm": 0.05439499186899724, + "language_loss": 0.78829134, + "learning_rate": 1.1775323965365681e-05, + "loss": 0.798944, + "num_input_tokens_seen": 401468784, + "router_z_loss_mlp": 0.23010254, + "step": 4849, + "time_per_iteration": 3.2687015533447266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068837, + "balance_loss_mlp": 1.04509044, + "epoch": 0.9330511735282802, + "flos": 614552085504.0, + "grad_norm": 0.06187219780448243, + "language_loss": 0.80620849, + "learning_rate": 1.1708204499713936e-05, + "loss": 0.81689686, + "num_input_tokens_seen": 401539712, + "router_z_loss_mlp": 0.23706055, + "step": 4850, + "time_per_iteration": 2.7616846561431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064216, + "balance_loss_mlp": 1.04126775, + "epoch": 0.9332435552135436, + "flos": 559101823488.0, + "grad_norm": 0.057342153109796054, + "language_loss": 0.8581, + "learning_rate": 1.1641274605749653e-05, + "loss": 0.86874211, + "num_input_tokens_seen": 401610432, + "router_z_loss_mlp": 0.22937012, + "step": 4851, + "time_per_iteration": 2.715174913406372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069652, + "balance_loss_mlp": 1.0457387, + "epoch": 0.9334359368988072, + "flos": 515536358400.0, + "grad_norm": 0.057032353569238206, + "language_loss": 0.8192578, + "learning_rate": 1.1574534309457208e-05, + "loss": 0.82995433, + "num_input_tokens_seen": 401677344, + "router_z_loss_mlp": 0.23901367, + "step": 4852, + "time_per_iteration": 2.608262538909912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064317, + "balance_loss_mlp": 1.0412972, + "epoch": 0.9336283185840708, + "flos": 539809588224.0, + "grad_norm": 0.07695737048357706, + "language_loss": 0.82685083, + "learning_rate": 1.1507983636747488e-05, + "loss": 0.83749396, + "num_input_tokens_seen": 401756864, + "router_z_loss_mlp": 0.23022461, + "step": 4853, + "time_per_iteration": 2.755443811416626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01002947, + "balance_loss_mlp": 0.99627131, + "epoch": 0.9338207002693344, + "flos": 1562824751616.0, + "grad_norm": 0.004836287294760912, + "language_loss": 0.78455019, + "learning_rate": 1.1441622613457824e-05, + "loss": 0.79457963, + "num_input_tokens_seen": 401983664, + "router_z_loss_mlp": 0.06689453, + "step": 4854, + "time_per_iteration": 4.906665325164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064754, + "balance_loss_mlp": 1.04140043, + "epoch": 0.9340130819545979, + "flos": 645261811200.0, + "grad_norm": 0.05327327951901491, + "language_loss": 0.81519377, + "learning_rate": 1.1375451265351833e-05, + "loss": 0.82584137, + "num_input_tokens_seen": 402065744, + "router_z_loss_mlp": 0.23352051, + "step": 4855, + "time_per_iteration": 2.9034512042999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064262, + "balance_loss_mlp": 1.04152846, + "epoch": 0.9342054636398615, + "flos": 503441588736.0, + "grad_norm": 0.059211036123640204, + "language_loss": 0.77279824, + "learning_rate": 1.1309469618119516e-05, + "loss": 0.78344083, + "num_input_tokens_seen": 402137728, + "router_z_loss_mlp": 0.22729492, + "step": 4856, + "time_per_iteration": 2.6446688175201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065745, + "balance_loss_mlp": 1.04301167, + "epoch": 0.934397845325125, + "flos": 593026126848.0, + "grad_norm": 0.0688579272968578, + "language_loss": 0.84582496, + "learning_rate": 1.1243677697377109e-05, + "loss": 0.85648245, + "num_input_tokens_seen": 402220160, + "router_z_loss_mlp": 0.22741699, + "step": 4857, + "time_per_iteration": 2.8205456733703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063182, + "balance_loss_mlp": 1.04081821, + "epoch": 0.9345902270103886, + "flos": 499891129344.0, + "grad_norm": 0.05512577442694981, + "language_loss": 0.80538815, + "learning_rate": 1.1178075528667453e-05, + "loss": 0.81601995, + "num_input_tokens_seen": 402285168, + "router_z_loss_mlp": 0.22363281, + "step": 4858, + "time_per_iteration": 2.559678554534912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100331, + "balance_loss_mlp": 0.99663389, + "epoch": 0.9347826086956522, + "flos": 1520329347072.0, + "grad_norm": 0.003912108369513215, + "language_loss": 0.7598772, + "learning_rate": 1.1112663137459566e-05, + "loss": 0.76991028, + "num_input_tokens_seen": 402504912, + "router_z_loss_mlp": 0.06689453, + "step": 4859, + "time_per_iteration": 4.678871393203735 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064544, + "balance_loss_mlp": 1.04120278, + "epoch": 0.9349749903809157, + "flos": 504550923264.0, + "grad_norm": 0.07717239642081329, + "language_loss": 0.81449348, + "learning_rate": 1.1047440549148636e-05, + "loss": 0.82513893, + "num_input_tokens_seen": 402582032, + "router_z_loss_mlp": 0.23327637, + "step": 4860, + "time_per_iteration": 2.7969114780426025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106448, + "balance_loss_mlp": 1.04100811, + "epoch": 0.9351673720661793, + "flos": 568901200896.0, + "grad_norm": 0.07866007536371793, + "language_loss": 0.78877968, + "learning_rate": 1.0982407789056514e-05, + "loss": 0.79942441, + "num_input_tokens_seen": 402650144, + "router_z_loss_mlp": 0.23474121, + "step": 4861, + "time_per_iteration": 2.647122383117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061655, + "balance_loss_mlp": 1.03809905, + "epoch": 0.9353597537514429, + "flos": 544605576192.0, + "grad_norm": 0.06625680401164971, + "language_loss": 0.86626673, + "learning_rate": 1.0917564882430952e-05, + "loss": 0.87688333, + "num_input_tokens_seen": 402720368, + "router_z_loss_mlp": 0.23547363, + "step": 4862, + "time_per_iteration": 2.6402182579040527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061832, + "balance_loss_mlp": 1.0390749, + "epoch": 0.9355521354367065, + "flos": 518997984768.0, + "grad_norm": 0.045603108717518104, + "language_loss": 0.84642792, + "learning_rate": 1.0852911854446368e-05, + "loss": 0.85704625, + "num_input_tokens_seen": 402795568, + "router_z_loss_mlp": 0.22753906, + "step": 4863, + "time_per_iteration": 2.7369048595428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062849, + "balance_loss_mlp": 1.04003191, + "epoch": 0.93574451712197, + "flos": 446316314112.0, + "grad_norm": 0.06738020369816806, + "language_loss": 0.78858817, + "learning_rate": 1.0788448730203237e-05, + "loss": 0.79921663, + "num_input_tokens_seen": 402858784, + "router_z_loss_mlp": 0.22790527, + "step": 4864, + "time_per_iteration": 2.5434770584106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062216, + "balance_loss_mlp": 1.03907764, + "epoch": 0.9359368988072335, + "flos": 480517401600.0, + "grad_norm": 0.06755819858546042, + "language_loss": 0.77495611, + "learning_rate": 1.072417553472832e-05, + "loss": 0.78557825, + "num_input_tokens_seen": 402924144, + "router_z_loss_mlp": 0.23132324, + "step": 4865, + "time_per_iteration": 2.51901912689209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063392, + "balance_loss_mlp": 1.04114723, + "epoch": 0.9361292804924971, + "flos": 497118892032.0, + "grad_norm": 0.06722920807040486, + "language_loss": 0.85406494, + "learning_rate": 1.0660092292974766e-05, + "loss": 0.86469889, + "num_input_tokens_seen": 402987488, + "router_z_loss_mlp": 0.22265625, + "step": 4866, + "time_per_iteration": 2.626563787460327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062707, + "balance_loss_mlp": 1.04009354, + "epoch": 0.9363216621777607, + "flos": 618122368512.0, + "grad_norm": 0.058043061809792344, + "language_loss": 0.84187984, + "learning_rate": 1.059619902982184e-05, + "loss": 0.852507, + "num_input_tokens_seen": 403058224, + "router_z_loss_mlp": 0.22595215, + "step": 4867, + "time_per_iteration": 2.7498879432678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01002771, + "balance_loss_mlp": 0.99609506, + "epoch": 0.9365140438630243, + "flos": 1415929559040.0, + "grad_norm": 0.0036492832338999074, + "language_loss": 0.79203337, + "learning_rate": 1.053249577007509e-05, + "loss": 0.80206108, + "num_input_tokens_seen": 403289072, + "router_z_loss_mlp": 0.06689453, + "step": 4868, + "time_per_iteration": 4.8916120529174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064081, + "balance_loss_mlp": 1.04079998, + "epoch": 0.9367064255482878, + "flos": 590503509504.0, + "grad_norm": 0.05992901808998059, + "language_loss": 0.81489742, + "learning_rate": 1.0468982538466287e-05, + "loss": 0.82553828, + "num_input_tokens_seen": 403361728, + "router_z_loss_mlp": 0.23291016, + "step": 4869, + "time_per_iteration": 2.6934800148010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063, + "balance_loss_mlp": 1.04038572, + "epoch": 0.9368988072335513, + "flos": 526637790720.0, + "grad_norm": 0.05806012703031958, + "language_loss": 0.82367408, + "learning_rate": 1.0405659359653597e-05, + "loss": 0.83430409, + "num_input_tokens_seen": 403431536, + "router_z_loss_mlp": 0.22631836, + "step": 4870, + "time_per_iteration": 2.6466493606567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069124, + "balance_loss_mlp": 1.04546094, + "epoch": 0.9370911889188149, + "flos": 743205279744.0, + "grad_norm": 0.05796983188837931, + "language_loss": 0.79065406, + "learning_rate": 1.034252625822113e-05, + "loss": 0.80134535, + "num_input_tokens_seen": 403504768, + "router_z_loss_mlp": 0.2364502, + "step": 4871, + "time_per_iteration": 2.895653009414673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010612, + "balance_loss_mlp": 1.03949165, + "epoch": 0.9372835706040785, + "flos": 546038682624.0, + "grad_norm": 0.05945674857621565, + "language_loss": 0.79093826, + "learning_rate": 1.0279583258679448e-05, + "loss": 0.80155027, + "num_input_tokens_seen": 403575584, + "router_z_loss_mlp": 0.21716309, + "step": 4872, + "time_per_iteration": 2.643869638442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065754, + "balance_loss_mlp": 1.04218626, + "epoch": 0.9374759522893421, + "flos": 491633515008.0, + "grad_norm": 0.05500205513131922, + "language_loss": 0.81993419, + "learning_rate": 1.0216830385465003e-05, + "loss": 0.83059168, + "num_input_tokens_seen": 403648720, + "router_z_loss_mlp": 0.23571777, + "step": 4873, + "time_per_iteration": 2.6483852863311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068462, + "balance_loss_mlp": 1.04451275, + "epoch": 0.9376683339746056, + "flos": 578421222912.0, + "grad_norm": 0.058395935150287376, + "language_loss": 0.82688534, + "learning_rate": 1.0154267662940809e-05, + "loss": 0.83756995, + "num_input_tokens_seen": 403721392, + "router_z_loss_mlp": 0.23925781, + "step": 4874, + "time_per_iteration": 2.6801791191101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065584, + "balance_loss_mlp": 1.04139686, + "epoch": 0.9378607156598692, + "flos": 506290549248.0, + "grad_norm": 0.07310615221307289, + "language_loss": 0.80254924, + "learning_rate": 1.0091895115395766e-05, + "loss": 0.81320512, + "num_input_tokens_seen": 403792112, + "router_z_loss_mlp": 0.24182129, + "step": 4875, + "time_per_iteration": 2.646641492843628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062329, + "balance_loss_mlp": 1.03869009, + "epoch": 0.9380530973451328, + "flos": 520015915008.0, + "grad_norm": 0.07308635184773467, + "language_loss": 0.77842414, + "learning_rate": 1.0029712767045062e-05, + "loss": 0.78904748, + "num_input_tokens_seen": 403860928, + "router_z_loss_mlp": 0.23632812, + "step": 4876, + "time_per_iteration": 2.6062300205230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064255, + "balance_loss_mlp": 1.04104531, + "epoch": 0.9382454790303963, + "flos": 557799768576.0, + "grad_norm": 0.06375694667062341, + "language_loss": 0.84655243, + "learning_rate": 9.967720642029999e-06, + "loss": 0.85719502, + "num_input_tokens_seen": 403928240, + "router_z_loss_mlp": 0.23193359, + "step": 4877, + "time_per_iteration": 2.645962953567505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064251, + "balance_loss_mlp": 1.04075491, + "epoch": 0.9384378607156598, + "flos": 695476316160.0, + "grad_norm": 0.06248438585170171, + "language_loss": 0.81895542, + "learning_rate": 9.905918764418153e-06, + "loss": 0.82959789, + "num_input_tokens_seen": 404004320, + "router_z_loss_mlp": 0.23486328, + "step": 4878, + "time_per_iteration": 2.89534330368042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065421, + "balance_loss_mlp": 1.04216325, + "epoch": 0.9386302424009234, + "flos": 554750747136.0, + "grad_norm": 0.06499784582552266, + "language_loss": 0.81068319, + "learning_rate": 9.844307158203058e-06, + "loss": 0.82133734, + "num_input_tokens_seen": 404077040, + "router_z_loss_mlp": 0.23254395, + "step": 4879, + "time_per_iteration": 2.637808084487915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106776, + "balance_loss_mlp": 1.04457378, + "epoch": 0.938822624086187, + "flos": 566981337600.0, + "grad_norm": 0.06495588202273726, + "language_loss": 0.79970872, + "learning_rate": 9.782885847304469e-06, + "loss": 0.8103863, + "num_input_tokens_seen": 404145248, + "router_z_loss_mlp": 0.23181152, + "step": 4880, + "time_per_iteration": 2.6463756561279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068424, + "balance_loss_mlp": 1.04586959, + "epoch": 0.9390150057714506, + "flos": 417602801664.0, + "grad_norm": 0.055703733749820204, + "language_loss": 0.80662251, + "learning_rate": 9.721654855568196e-06, + "loss": 0.81730676, + "num_input_tokens_seen": 404212000, + "router_z_loss_mlp": 0.22546387, + "step": 4881, + "time_per_iteration": 2.5764780044555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064129, + "balance_loss_mlp": 1.04130054, + "epoch": 0.9392073874567142, + "flos": 1553839967232.0, + "grad_norm": 0.06423619472618826, + "language_loss": 0.76624274, + "learning_rate": 9.660614206766394e-06, + "loss": 0.77688408, + "num_input_tokens_seen": 404305408, + "router_z_loss_mlp": 0.22802734, + "step": 4882, + "time_per_iteration": 3.687774181365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066846, + "balance_loss_mlp": 1.0433979, + "epoch": 0.9393997691419776, + "flos": 652536000000.0, + "grad_norm": 0.061865375214900514, + "language_loss": 0.78620249, + "learning_rate": 9.59976392459705e-06, + "loss": 0.79687095, + "num_input_tokens_seen": 404383248, + "router_z_loss_mlp": 0.23449707, + "step": 4883, + "time_per_iteration": 2.762404441833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01002747, + "balance_loss_mlp": 0.9960711, + "epoch": 0.9395921508272412, + "flos": 1553294817792.0, + "grad_norm": 0.003644027064254583, + "language_loss": 0.78170681, + "learning_rate": 9.539104032684209e-06, + "loss": 0.79173422, + "num_input_tokens_seen": 404615264, + "router_z_loss_mlp": 0.06689453, + "step": 4884, + "time_per_iteration": 4.8244874477386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072097, + "balance_loss_mlp": 1.04874396, + "epoch": 0.9397845325125048, + "flos": 498144162816.0, + "grad_norm": 0.05308961569497824, + "language_loss": 0.78810966, + "learning_rate": 9.478634554578314e-06, + "loss": 0.79883063, + "num_input_tokens_seen": 404684656, + "router_z_loss_mlp": 0.23339844, + "step": 4885, + "time_per_iteration": 2.616199254989624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064493, + "balance_loss_mlp": 1.04230857, + "epoch": 0.9399769141977684, + "flos": 498596414976.0, + "grad_norm": 0.06818307389884938, + "language_loss": 0.83659059, + "learning_rate": 9.418355513755638e-06, + "loss": 0.8472355, + "num_input_tokens_seen": 404752096, + "router_z_loss_mlp": 0.22180176, + "step": 4886, + "time_per_iteration": 2.601780891418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003192, + "balance_loss_mlp": 0.99651599, + "epoch": 0.9401692958830319, + "flos": 1402500427776.0, + "grad_norm": 0.0034647396608713126, + "language_loss": 0.79332191, + "learning_rate": 9.358266933618575e-06, + "loss": 0.80335385, + "num_input_tokens_seen": 404980944, + "router_z_loss_mlp": 0.06689453, + "step": 4887, + "time_per_iteration": 4.828904151916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063627, + "balance_loss_mlp": 1.04023838, + "epoch": 0.9403616775682955, + "flos": 540123448320.0, + "grad_norm": 0.04725299945235895, + "language_loss": 0.85205865, + "learning_rate": 9.298368837495575e-06, + "loss": 0.86269492, + "num_input_tokens_seen": 405056688, + "router_z_loss_mlp": 0.23400879, + "step": 4888, + "time_per_iteration": 2.7369189262390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003192, + "balance_loss_mlp": 0.99651647, + "epoch": 0.9405540592535591, + "flos": 1322058184704.0, + "grad_norm": 0.0034657140386961023, + "language_loss": 0.75169432, + "learning_rate": 9.238661248641089e-06, + "loss": 0.76172626, + "num_input_tokens_seen": 405284656, + "router_z_loss_mlp": 0.06689453, + "step": 4889, + "time_per_iteration": 4.934547662734985 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062139, + "balance_loss_mlp": 1.03936982, + "epoch": 0.9407464409388226, + "flos": 572362827264.0, + "grad_norm": 0.08003864930040928, + "language_loss": 0.8301568, + "learning_rate": 9.179144190235799e-06, + "loss": 0.84077823, + "num_input_tokens_seen": 405351584, + "router_z_loss_mlp": 0.22766113, + "step": 4890, + "time_per_iteration": 2.6717724800109863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065219, + "balance_loss_mlp": 1.04135287, + "epoch": 0.9409388226240862, + "flos": 511264203264.0, + "grad_norm": 0.06453478244721644, + "language_loss": 0.77116787, + "learning_rate": 9.119817685386112e-06, + "loss": 0.78182006, + "num_input_tokens_seen": 405425712, + "router_z_loss_mlp": 0.23840332, + "step": 4891, + "time_per_iteration": 2.7476096153259277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003182, + "balance_loss_mlp": 0.99650663, + "epoch": 0.9411312043093497, + "flos": 1569901077504.0, + "grad_norm": 0.003464956988218373, + "language_loss": 0.80241883, + "learning_rate": 9.06068175712471e-06, + "loss": 0.81245065, + "num_input_tokens_seen": 405655760, + "router_z_loss_mlp": 0.06689453, + "step": 4892, + "time_per_iteration": 4.850857496261597 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067071, + "balance_loss_mlp": 1.04444468, + "epoch": 0.9413235859946133, + "flos": 569469450240.0, + "grad_norm": 0.06650955489841599, + "language_loss": 0.78249395, + "learning_rate": 9.001736428410234e-06, + "loss": 0.79316461, + "num_input_tokens_seen": 405731664, + "router_z_loss_mlp": 0.22619629, + "step": 4893, + "time_per_iteration": 2.7343316078186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065223, + "balance_loss_mlp": 1.04241848, + "epoch": 0.9415159676798769, + "flos": 781905747456.0, + "grad_norm": 0.06610671863402162, + "language_loss": 0.80428064, + "learning_rate": 8.942981722127263e-06, + "loss": 0.81493282, + "num_input_tokens_seen": 405808128, + "router_z_loss_mlp": 0.22814941, + "step": 4894, + "time_per_iteration": 2.9975225925445557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063059, + "balance_loss_mlp": 1.03932428, + "epoch": 0.9417083493651405, + "flos": 849341749248.0, + "grad_norm": 0.05474998370074054, + "language_loss": 0.80078602, + "learning_rate": 8.884417661086331e-06, + "loss": 0.81141663, + "num_input_tokens_seen": 405892448, + "router_z_loss_mlp": 0.23730469, + "step": 4895, + "time_per_iteration": 3.1650521755218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065606, + "balance_loss_mlp": 1.04243147, + "epoch": 0.941900731050404, + "flos": 529333304832.0, + "grad_norm": 0.06924558330939024, + "language_loss": 0.85654449, + "learning_rate": 8.826044268024025e-06, + "loss": 0.86720055, + "num_input_tokens_seen": 405966736, + "router_z_loss_mlp": 0.23144531, + "step": 4896, + "time_per_iteration": 2.689373731613159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062226, + "balance_loss_mlp": 1.03861082, + "epoch": 0.9420931127356675, + "flos": 557073303552.0, + "grad_norm": 0.05304188510469451, + "language_loss": 0.79942775, + "learning_rate": 8.767861565602997e-06, + "loss": 0.81005001, + "num_input_tokens_seen": 406043264, + "router_z_loss_mlp": 0.23583984, + "step": 4897, + "time_per_iteration": 2.7777161598205566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064715, + "balance_loss_mlp": 1.04198134, + "epoch": 0.9422854944209311, + "flos": 652543340544.0, + "grad_norm": 0.06756755796170955, + "language_loss": 0.86650455, + "learning_rate": 8.709869576411733e-06, + "loss": 0.87715167, + "num_input_tokens_seen": 406119552, + "router_z_loss_mlp": 0.22717285, + "step": 4898, + "time_per_iteration": 2.8182201385498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065783, + "balance_loss_mlp": 1.04313302, + "epoch": 0.9424778761061947, + "flos": 553685829120.0, + "grad_norm": 0.057617902456089394, + "language_loss": 0.84157532, + "learning_rate": 8.65206832296478e-06, + "loss": 0.85223317, + "num_input_tokens_seen": 406192464, + "router_z_loss_mlp": 0.22644043, + "step": 4899, + "time_per_iteration": 2.6730167865753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061325, + "balance_loss_mlp": 1.03837681, + "epoch": 0.9426702577914583, + "flos": 588559053312.0, + "grad_norm": 0.06833282478625986, + "language_loss": 0.79837596, + "learning_rate": 8.594457827702406e-06, + "loss": 0.80898917, + "num_input_tokens_seen": 406262640, + "router_z_loss_mlp": 0.22937012, + "step": 4900, + "time_per_iteration": 2.716416358947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106177, + "balance_loss_mlp": 1.03855944, + "epoch": 0.9428626394767218, + "flos": 616625021952.0, + "grad_norm": 0.07344461443720736, + "language_loss": 0.78565979, + "learning_rate": 8.537038112991114e-06, + "loss": 0.79627746, + "num_input_tokens_seen": 406341328, + "router_z_loss_mlp": 0.2322998, + "step": 4901, + "time_per_iteration": 2.8687591552734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106181, + "balance_loss_mlp": 1.03870749, + "epoch": 0.9430550211619854, + "flos": 610410981888.0, + "grad_norm": 0.06554635995861575, + "language_loss": 0.81850672, + "learning_rate": 8.479809201123178e-06, + "loss": 0.82912481, + "num_input_tokens_seen": 406418864, + "router_z_loss_mlp": 0.2310791, + "step": 4902, + "time_per_iteration": 2.7098758220672607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062796, + "balance_loss_mlp": 1.04032564, + "epoch": 0.943247402847249, + "flos": 565990571520.0, + "grad_norm": 0.05937812811143957, + "language_loss": 0.78285533, + "learning_rate": 8.422771114316885e-06, + "loss": 0.79348326, + "num_input_tokens_seen": 406492320, + "router_z_loss_mlp": 0.22460938, + "step": 4903, + "time_per_iteration": 2.7645294666290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067307, + "balance_loss_mlp": 1.04396605, + "epoch": 0.9434397845325125, + "flos": 527040483840.0, + "grad_norm": 0.06344384420879408, + "language_loss": 0.81625634, + "learning_rate": 8.365923874716297e-06, + "loss": 0.82692945, + "num_input_tokens_seen": 406560448, + "router_z_loss_mlp": 0.23339844, + "step": 4904, + "time_per_iteration": 2.635781764984131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066045, + "balance_loss_mlp": 1.0433718, + "epoch": 0.943632166217776, + "flos": 593451214848.0, + "grad_norm": 0.05889111669093531, + "language_loss": 0.83035827, + "learning_rate": 8.309267504391593e-06, + "loss": 0.84101868, + "num_input_tokens_seen": 406631376, + "router_z_loss_mlp": 0.2265625, + "step": 4905, + "time_per_iteration": 2.713594436645508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063603, + "balance_loss_mlp": 1.04072678, + "epoch": 0.9438245479030396, + "flos": 572770289664.0, + "grad_norm": 0.050528675559877, + "language_loss": 0.85556394, + "learning_rate": 8.252802025338623e-06, + "loss": 0.86620003, + "num_input_tokens_seen": 406713728, + "router_z_loss_mlp": 0.22875977, + "step": 4906, + "time_per_iteration": 2.828354597091675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066076, + "balance_loss_mlp": 1.04285407, + "epoch": 0.9440169295883032, + "flos": 488258523648.0, + "grad_norm": 0.05995138028136088, + "language_loss": 0.81837797, + "learning_rate": 8.196527459479242e-06, + "loss": 0.82903874, + "num_input_tokens_seen": 406779168, + "router_z_loss_mlp": 0.23217773, + "step": 4907, + "time_per_iteration": 2.5686655044555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063058, + "balance_loss_mlp": 1.03981233, + "epoch": 0.9442093112735668, + "flos": 731742999552.0, + "grad_norm": 0.0676773058435964, + "language_loss": 0.73885441, + "learning_rate": 8.140443828661137e-06, + "loss": 0.74948502, + "num_input_tokens_seen": 406860816, + "router_z_loss_mlp": 0.23242188, + "step": 4908, + "time_per_iteration": 2.980459451675415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060915, + "balance_loss_mlp": 1.03826475, + "epoch": 0.9444016929588304, + "flos": 571031036928.0, + "grad_norm": 0.06922149829853487, + "language_loss": 0.81962436, + "learning_rate": 8.084551154658004e-06, + "loss": 0.83023351, + "num_input_tokens_seen": 406929888, + "router_z_loss_mlp": 0.22668457, + "step": 4909, + "time_per_iteration": 2.657578229904175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063607, + "balance_loss_mlp": 1.0400275, + "epoch": 0.9445940746440938, + "flos": 509292582912.0, + "grad_norm": 0.06711231181162121, + "language_loss": 0.86318266, + "learning_rate": 8.028849459169318e-06, + "loss": 0.87381876, + "num_input_tokens_seen": 406998224, + "router_z_loss_mlp": 0.23583984, + "step": 4910, + "time_per_iteration": 2.610914707183838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067641, + "balance_loss_mlp": 1.04381144, + "epoch": 0.9447864563293574, + "flos": 624556293120.0, + "grad_norm": 0.07115824086322278, + "language_loss": 0.80972213, + "learning_rate": 7.97333876382028e-06, + "loss": 0.82039851, + "num_input_tokens_seen": 407075088, + "router_z_loss_mlp": 0.23828125, + "step": 4911, + "time_per_iteration": 2.835008144378662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064397, + "balance_loss_mlp": 1.04086518, + "epoch": 0.944978838014621, + "flos": 505270047744.0, + "grad_norm": 0.06363073940641663, + "language_loss": 0.80821174, + "learning_rate": 7.918019090162098e-06, + "loss": 0.81885576, + "num_input_tokens_seen": 407147792, + "router_z_loss_mlp": 0.23498535, + "step": 4912, + "time_per_iteration": 2.706355571746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004085, + "balance_loss_mlp": 0.99740899, + "epoch": 0.9451712196998846, + "flos": 1484205451776.0, + "grad_norm": 0.0037181691886226226, + "language_loss": 0.78287339, + "learning_rate": 7.862890459671812e-06, + "loss": 0.79291421, + "num_input_tokens_seen": 407387216, + "router_z_loss_mlp": 0.06689453, + "step": 4913, + "time_per_iteration": 4.950761318206787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064844, + "balance_loss_mlp": 1.04196763, + "epoch": 0.9453636013851482, + "flos": 521137732608.0, + "grad_norm": 0.06260745945333845, + "language_loss": 0.90274841, + "learning_rate": 7.80795289375219e-06, + "loss": 0.91339684, + "num_input_tokens_seen": 407457664, + "router_z_loss_mlp": 0.2286377, + "step": 4914, + "time_per_iteration": 2.673654079437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004093, + "balance_loss_mlp": 0.99741739, + "epoch": 0.9455559830704117, + "flos": 1496902975488.0, + "grad_norm": 0.003715128651478327, + "language_loss": 0.8356235, + "learning_rate": 7.75320641373195e-06, + "loss": 0.84566444, + "num_input_tokens_seen": 407700256, + "router_z_loss_mlp": 0.06689453, + "step": 4915, + "time_per_iteration": 4.9783148765563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066243, + "balance_loss_mlp": 1.04273486, + "epoch": 0.9457483647556753, + "flos": 498126910464.0, + "grad_norm": 0.056832232509337464, + "language_loss": 0.81964576, + "learning_rate": 7.698651040865534e-06, + "loss": 0.8303082, + "num_input_tokens_seen": 407770080, + "router_z_loss_mlp": 0.23486328, + "step": 4916, + "time_per_iteration": 2.621641159057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066257, + "balance_loss_mlp": 1.04305935, + "epoch": 0.9459407464409388, + "flos": 1019405979648.0, + "grad_norm": 0.05378819222671923, + "language_loss": 0.82485378, + "learning_rate": 7.644286796333222e-06, + "loss": 0.83551633, + "num_input_tokens_seen": 407854640, + "router_z_loss_mlp": 0.23168945, + "step": 4917, + "time_per_iteration": 3.4095513820648193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066193, + "balance_loss_mlp": 1.04319763, + "epoch": 0.9461331281262024, + "flos": 513589330944.0, + "grad_norm": 0.060476805644650536, + "language_loss": 0.81080627, + "learning_rate": 7.590113701241075e-06, + "loss": 0.82146823, + "num_input_tokens_seen": 407922704, + "router_z_loss_mlp": 0.22998047, + "step": 4918, + "time_per_iteration": 2.597259283065796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067245, + "balance_loss_mlp": 1.04371297, + "epoch": 0.9463255098114659, + "flos": 528023909376.0, + "grad_norm": 0.06478433442615823, + "language_loss": 0.78112018, + "learning_rate": 7.536131776620936e-06, + "loss": 0.79179263, + "num_input_tokens_seen": 407991136, + "router_z_loss_mlp": 0.23522949, + "step": 4919, + "time_per_iteration": 2.6052768230438232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106469, + "balance_loss_mlp": 1.04148018, + "epoch": 0.9465178914967295, + "flos": 506043500544.0, + "grad_norm": 0.08274002373229264, + "language_loss": 0.83619392, + "learning_rate": 7.482341043430485e-06, + "loss": 0.84684086, + "num_input_tokens_seen": 408056576, + "router_z_loss_mlp": 0.23193359, + "step": 4920, + "time_per_iteration": 2.5524473190307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010621, + "balance_loss_mlp": 1.03874683, + "epoch": 0.9467102731819931, + "flos": 660254727168.0, + "grad_norm": 0.07960983863422562, + "language_loss": 0.85914946, + "learning_rate": 7.428741522553184e-06, + "loss": 0.86977041, + "num_input_tokens_seen": 408136960, + "router_z_loss_mlp": 0.23339844, + "step": 4921, + "time_per_iteration": 2.9114954471588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063495, + "balance_loss_mlp": 1.04008269, + "epoch": 0.9469026548672567, + "flos": 675183403008.0, + "grad_norm": 0.06504331603068483, + "language_loss": 0.89519143, + "learning_rate": 7.375333234798054e-06, + "loss": 0.90582639, + "num_input_tokens_seen": 408218304, + "router_z_loss_mlp": 0.23400879, + "step": 4922, + "time_per_iteration": 2.917511463165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064556, + "balance_loss_mlp": 1.04024911, + "epoch": 0.9470950365525203, + "flos": 513964859904.0, + "grad_norm": 0.06142738243292726, + "language_loss": 0.79989028, + "learning_rate": 7.32211620090012e-06, + "loss": 0.81053579, + "num_input_tokens_seen": 408287936, + "router_z_loss_mlp": 0.24291992, + "step": 4923, + "time_per_iteration": 2.594297170639038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065563, + "balance_loss_mlp": 1.04267466, + "epoch": 0.9472874182377837, + "flos": 550103063040.0, + "grad_norm": 0.11094317680560754, + "language_loss": 0.81383359, + "learning_rate": 7.269090441520132e-06, + "loss": 0.82448924, + "num_input_tokens_seen": 408365568, + "router_z_loss_mlp": 0.22888184, + "step": 4924, + "time_per_iteration": 2.7379791736602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062378, + "balance_loss_mlp": 1.03937054, + "epoch": 0.9474797999230473, + "flos": 542769776640.0, + "grad_norm": 0.0655870162966716, + "language_loss": 0.80459619, + "learning_rate": 7.216255977244457e-06, + "loss": 0.81522, + "num_input_tokens_seen": 408431248, + "router_z_loss_mlp": 0.22998047, + "step": 4925, + "time_per_iteration": 2.6647040843963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063317, + "balance_loss_mlp": 1.03977299, + "epoch": 0.9476721816083109, + "flos": 844644879360.0, + "grad_norm": 0.057403972523752636, + "language_loss": 0.85956061, + "learning_rate": 7.163612828585242e-06, + "loss": 0.87019372, + "num_input_tokens_seen": 408514112, + "router_z_loss_mlp": 0.23522949, + "step": 4926, + "time_per_iteration": 3.1154632568359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065094, + "balance_loss_mlp": 1.04253972, + "epoch": 0.9478645632935745, + "flos": 638002676736.0, + "grad_norm": 0.05945675651641381, + "language_loss": 0.79295301, + "learning_rate": 7.1111610159803605e-06, + "loss": 0.80360401, + "num_input_tokens_seen": 408585968, + "router_z_loss_mlp": 0.22546387, + "step": 4927, + "time_per_iteration": 2.7876851558685303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062569, + "balance_loss_mlp": 1.04003882, + "epoch": 0.948056944978838, + "flos": 656832748032.0, + "grad_norm": 0.06197261766309, + "language_loss": 0.76143181, + "learning_rate": 7.058900559793469e-06, + "loss": 0.77205747, + "num_input_tokens_seen": 408665456, + "router_z_loss_mlp": 0.22521973, + "step": 4928, + "time_per_iteration": 2.8076236248016357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061132, + "balance_loss_mlp": 1.03792191, + "epoch": 0.9482493266641016, + "flos": 440907660288.0, + "grad_norm": 0.06807992051239511, + "language_loss": 0.83396912, + "learning_rate": 7.00683148031378e-06, + "loss": 0.84458041, + "num_input_tokens_seen": 408730192, + "router_z_loss_mlp": 0.23193359, + "step": 4929, + "time_per_iteration": 3.9939382076263428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067537, + "balance_loss_mlp": 1.04376614, + "epoch": 0.9484417083493651, + "flos": 545989123584.0, + "grad_norm": 0.0624157967268652, + "language_loss": 0.78332889, + "learning_rate": 6.9549537977564024e-06, + "loss": 0.7940042, + "num_input_tokens_seen": 408807616, + "router_z_loss_mlp": 0.23742676, + "step": 4930, + "time_per_iteration": 2.833548069000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066671, + "balance_loss_mlp": 1.04350913, + "epoch": 0.9486340900346287, + "flos": 538598937600.0, + "grad_norm": 0.054248039463331794, + "language_loss": 0.79658103, + "learning_rate": 6.903267532262003e-06, + "loss": 0.80724776, + "num_input_tokens_seen": 408883552, + "router_z_loss_mlp": 0.23144531, + "step": 4931, + "time_per_iteration": 2.6925623416900635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065473, + "balance_loss_mlp": 1.04097533, + "epoch": 0.9488264717198923, + "flos": 681669457920.0, + "grad_norm": 0.05662742875275373, + "language_loss": 0.85853779, + "learning_rate": 6.851772703896975e-06, + "loss": 0.86919254, + "num_input_tokens_seen": 408956400, + "router_z_loss_mlp": 0.24487305, + "step": 4932, + "time_per_iteration": 2.824794054031372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059786, + "balance_loss_mlp": 1.03684974, + "epoch": 0.9490188534051558, + "flos": 462603944448.0, + "grad_norm": 0.06760036759869348, + "language_loss": 0.87858427, + "learning_rate": 6.8004693326533805e-06, + "loss": 0.88918209, + "num_input_tokens_seen": 409019904, + "router_z_loss_mlp": 0.22949219, + "step": 4933, + "time_per_iteration": 2.523742914199829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064105, + "balance_loss_mlp": 1.04112101, + "epoch": 0.9492112350904194, + "flos": 543135393792.0, + "grad_norm": 0.05780879872940495, + "language_loss": 0.82862514, + "learning_rate": 6.7493574384489e-06, + "loss": 0.83926618, + "num_input_tokens_seen": 409094288, + "router_z_loss_mlp": 0.22973633, + "step": 4934, + "time_per_iteration": 2.7232470512390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058669, + "balance_loss_mlp": 1.03604269, + "epoch": 0.949403616775683, + "flos": 550322947584.0, + "grad_norm": 0.05314252301196156, + "language_loss": 0.84149778, + "learning_rate": 6.698437041126992e-06, + "loss": 0.85208452, + "num_input_tokens_seen": 409169120, + "router_z_loss_mlp": 0.22607422, + "step": 4935, + "time_per_iteration": 2.6983773708343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065017, + "balance_loss_mlp": 1.04250991, + "epoch": 0.9495959984609466, + "flos": 598383023616.0, + "grad_norm": 0.05800504294431128, + "language_loss": 0.83177608, + "learning_rate": 6.647708160456678e-06, + "loss": 0.84242618, + "num_input_tokens_seen": 409243200, + "router_z_loss_mlp": 0.22485352, + "step": 4936, + "time_per_iteration": 2.698988437652588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063299, + "balance_loss_mlp": 1.04014874, + "epoch": 0.94978838014621, + "flos": 608409626112.0, + "grad_norm": 0.05755130237878741, + "language_loss": 0.82729852, + "learning_rate": 6.597170816132702e-06, + "loss": 0.83793151, + "num_input_tokens_seen": 409319264, + "router_z_loss_mlp": 0.23132324, + "step": 4937, + "time_per_iteration": 2.7943813800811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066129, + "balance_loss_mlp": 1.04247832, + "epoch": 0.9499807618314736, + "flos": 540832660992.0, + "grad_norm": 0.05836751006762688, + "language_loss": 0.86831605, + "learning_rate": 6.546825027775427e-06, + "loss": 0.8789773, + "num_input_tokens_seen": 409389840, + "router_z_loss_mlp": 0.23620605, + "step": 4938, + "time_per_iteration": 2.627950668334961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106465, + "balance_loss_mlp": 1.0412848, + "epoch": 0.9501731435167372, + "flos": 594600196608.0, + "grad_norm": 0.05400011970264735, + "language_loss": 0.82876295, + "learning_rate": 6.496670814930717e-06, + "loss": 0.83940947, + "num_input_tokens_seen": 409458752, + "router_z_loss_mlp": 0.23364258, + "step": 4939, + "time_per_iteration": 2.6900975704193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061908, + "balance_loss_mlp": 1.03847146, + "epoch": 0.9503655252020008, + "flos": 454138928640.0, + "grad_norm": 0.06191054950237094, + "language_loss": 0.80292475, + "learning_rate": 6.446708197070161e-06, + "loss": 0.8135438, + "num_input_tokens_seen": 409525008, + "router_z_loss_mlp": 0.234375, + "step": 4940, + "time_per_iteration": 2.5250589847564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062795, + "balance_loss_mlp": 1.03952527, + "epoch": 0.9505579068872644, + "flos": 667944092160.0, + "grad_norm": 0.05771055255851234, + "language_loss": 0.84632671, + "learning_rate": 6.396937193591079e-06, + "loss": 0.85695469, + "num_input_tokens_seen": 409603376, + "router_z_loss_mlp": 0.23266602, + "step": 4941, + "time_per_iteration": 2.778857946395874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066363, + "balance_loss_mlp": 1.04283106, + "epoch": 0.9507502885725279, + "flos": 402207192576.0, + "grad_norm": 0.062273521229545936, + "language_loss": 0.818012, + "learning_rate": 6.347357823816235e-06, + "loss": 0.82867563, + "num_input_tokens_seen": 409667168, + "router_z_loss_mlp": 0.23535156, + "step": 4942, + "time_per_iteration": 2.4828665256500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063047, + "balance_loss_mlp": 1.04011106, + "epoch": 0.9509426702577914, + "flos": 700358565888.0, + "grad_norm": 0.058771865769073964, + "language_loss": 0.79593182, + "learning_rate": 6.297970106994011e-06, + "loss": 0.8065623, + "num_input_tokens_seen": 409746832, + "router_z_loss_mlp": 0.22912598, + "step": 4943, + "time_per_iteration": 2.98268723487854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063645, + "balance_loss_mlp": 1.04156792, + "epoch": 0.951135051943055, + "flos": 501415640064.0, + "grad_norm": 0.062322209370641694, + "language_loss": 0.82714278, + "learning_rate": 6.2487740622985126e-06, + "loss": 0.83777922, + "num_input_tokens_seen": 409813792, + "router_z_loss_mlp": 0.22070312, + "step": 4944, + "time_per_iteration": 2.57051157951355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062558, + "balance_loss_mlp": 1.03986096, + "epoch": 0.9513274336283186, + "flos": 614621094912.0, + "grad_norm": 0.0547837860585051, + "language_loss": 0.81620574, + "learning_rate": 6.1997697088292395e-06, + "loss": 0.82683134, + "num_input_tokens_seen": 409898848, + "router_z_loss_mlp": 0.22692871, + "step": 4945, + "time_per_iteration": 2.9212303161621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062052, + "balance_loss_mlp": 1.03908038, + "epoch": 0.9515198153135821, + "flos": 519586057728.0, + "grad_norm": 0.06872755929487602, + "language_loss": 0.81921458, + "learning_rate": 6.150957065611363e-06, + "loss": 0.82983512, + "num_input_tokens_seen": 409966368, + "router_z_loss_mlp": 0.22949219, + "step": 4946, + "time_per_iteration": 2.5664422512054443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065752, + "balance_loss_mlp": 1.04230392, + "epoch": 0.9517121969988457, + "flos": 664954168320.0, + "grad_norm": 0.05309985428265593, + "language_loss": 0.7667439, + "learning_rate": 6.102336151595667e-06, + "loss": 0.77740133, + "num_input_tokens_seen": 410048496, + "router_z_loss_mlp": 0.23449707, + "step": 4947, + "time_per_iteration": 2.97707200050354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062851, + "balance_loss_mlp": 1.03965259, + "epoch": 0.9519045786841093, + "flos": 676409107968.0, + "grad_norm": 0.06883652339076757, + "language_loss": 0.76289392, + "learning_rate": 6.053906985658553e-06, + "loss": 0.7735225, + "num_input_tokens_seen": 410121840, + "router_z_loss_mlp": 0.23193359, + "step": 4948, + "time_per_iteration": 2.8374738693237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067243, + "balance_loss_mlp": 1.04439044, + "epoch": 0.9520969603693729, + "flos": 652901617152.0, + "grad_norm": 0.058068098639605055, + "language_loss": 0.8045215, + "learning_rate": 6.005669586601814e-06, + "loss": 0.81519395, + "num_input_tokens_seen": 410199152, + "router_z_loss_mlp": 0.22851562, + "step": 4949, + "time_per_iteration": 2.82055401802063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063696, + "balance_loss_mlp": 1.04118955, + "epoch": 0.9522893420546364, + "flos": 743284200960.0, + "grad_norm": 0.050157087967027066, + "language_loss": 0.83311605, + "learning_rate": 5.957623973152748e-06, + "loss": 0.84375304, + "num_input_tokens_seen": 410285392, + "router_z_loss_mlp": 0.22521973, + "step": 4950, + "time_per_iteration": 3.023888111114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063533, + "balance_loss_mlp": 1.03994191, + "epoch": 0.9524817237398999, + "flos": 761696898048.0, + "grad_norm": 0.0679983686289968, + "language_loss": 0.81021792, + "learning_rate": 5.909770163964545e-06, + "loss": 0.82085323, + "num_input_tokens_seen": 410359872, + "router_z_loss_mlp": 0.23571777, + "step": 4951, + "time_per_iteration": 2.947537660598755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059285, + "balance_loss_mlp": 1.03614664, + "epoch": 0.9526741054251635, + "flos": 529125903360.0, + "grad_norm": 0.0629324357168766, + "language_loss": 0.81972486, + "learning_rate": 5.8621081776155105e-06, + "loss": 0.83031774, + "num_input_tokens_seen": 410425728, + "router_z_loss_mlp": 0.23120117, + "step": 4952, + "time_per_iteration": 2.5630245208740234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064616, + "balance_loss_mlp": 1.04153693, + "epoch": 0.9528664871104271, + "flos": 488441332224.0, + "grad_norm": 0.06485983174912871, + "language_loss": 0.80928588, + "learning_rate": 5.814638032609787e-06, + "loss": 0.81993204, + "num_input_tokens_seen": 410496080, + "router_z_loss_mlp": 0.23083496, + "step": 4953, + "time_per_iteration": 2.5676066875457764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061965, + "balance_loss_mlp": 1.03951788, + "epoch": 0.9530588687956907, + "flos": 517745115648.0, + "grad_norm": 0.05287711517893565, + "language_loss": 0.85460746, + "learning_rate": 5.76735974737691e-06, + "loss": 0.86522716, + "num_input_tokens_seen": 410576448, + "router_z_loss_mlp": 0.2244873, + "step": 4954, + "time_per_iteration": 2.8287205696105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066092, + "balance_loss_mlp": 1.04302478, + "epoch": 0.9532512504809542, + "flos": 675148898304.0, + "grad_norm": 0.06332484587058154, + "language_loss": 0.8102749, + "learning_rate": 5.720273340271864e-06, + "loss": 0.82093585, + "num_input_tokens_seen": 410655792, + "router_z_loss_mlp": 0.23059082, + "step": 4955, + "time_per_iteration": 2.8266055583953857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064085, + "balance_loss_mlp": 1.03993309, + "epoch": 0.9534436321662177, + "flos": 489523502592.0, + "grad_norm": 0.06865327401481261, + "language_loss": 0.84251958, + "learning_rate": 5.673378829575249e-06, + "loss": 0.8531605, + "num_input_tokens_seen": 410725440, + "router_z_loss_mlp": 0.24121094, + "step": 4956, + "time_per_iteration": 2.5656938552856445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065867, + "balance_loss_mlp": 1.04219234, + "epoch": 0.9536360138514813, + "flos": 496585147392.0, + "grad_norm": 0.0615272303032046, + "language_loss": 0.8198781, + "learning_rate": 5.626676233493167e-06, + "loss": 0.83053672, + "num_input_tokens_seen": 410797552, + "router_z_loss_mlp": 0.23657227, + "step": 4957, + "time_per_iteration": 2.621752977371216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067276, + "balance_loss_mlp": 1.04463863, + "epoch": 0.9538283955367449, + "flos": 801462283776.0, + "grad_norm": 0.05741952715263904, + "language_loss": 0.84171546, + "learning_rate": 5.580165570157114e-06, + "loss": 0.8523882, + "num_input_tokens_seen": 410876736, + "router_z_loss_mlp": 0.22644043, + "step": 4958, + "time_per_iteration": 3.0494062900543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010663, + "balance_loss_mlp": 1.04333985, + "epoch": 0.9540207772220085, + "flos": 556668039168.0, + "grad_norm": 0.048223393326200514, + "language_loss": 0.80173922, + "learning_rate": 5.533846857624203e-06, + "loss": 0.81240225, + "num_input_tokens_seen": 410955632, + "router_z_loss_mlp": 0.22949219, + "step": 4959, + "time_per_iteration": 2.7545664310455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061401, + "balance_loss_mlp": 1.03826261, + "epoch": 0.954213158907272, + "flos": 684505935360.0, + "grad_norm": 0.055758007049536804, + "language_loss": 0.81805837, + "learning_rate": 5.487720113876882e-06, + "loss": 0.82867241, + "num_input_tokens_seen": 411038480, + "router_z_loss_mlp": 0.23156738, + "step": 4960, + "time_per_iteration": 2.9386799335479736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064507, + "balance_loss_mlp": 1.04121327, + "epoch": 0.9544055405925356, + "flos": 535752548352.0, + "grad_norm": 0.08819296696550104, + "language_loss": 0.8276701, + "learning_rate": 5.441785356823214e-06, + "loss": 0.83831513, + "num_input_tokens_seen": 411109744, + "router_z_loss_mlp": 0.23303223, + "step": 4961, + "time_per_iteration": 2.7551167011260986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066455, + "balance_loss_mlp": 1.04275656, + "epoch": 0.9545979222777992, + "flos": 825404401152.0, + "grad_norm": 0.06311984743506972, + "language_loss": 0.80238789, + "learning_rate": 5.3960426042965476e-06, + "loss": 0.81305242, + "num_input_tokens_seen": 411202192, + "router_z_loss_mlp": 0.23693848, + "step": 4962, + "time_per_iteration": 3.1598803997039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064254, + "balance_loss_mlp": 1.04111564, + "epoch": 0.9547903039630627, + "flos": 761691755520.0, + "grad_norm": 0.05994282205738248, + "language_loss": 0.77416027, + "learning_rate": 5.3504918740558405e-06, + "loss": 0.78480279, + "num_input_tokens_seen": 411289248, + "router_z_loss_mlp": 0.23144531, + "step": 4963, + "time_per_iteration": 3.1220405101776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067536, + "balance_loss_mlp": 1.04460001, + "epoch": 0.9549826856483262, + "flos": 515306562048.0, + "grad_norm": 0.060039013851122376, + "language_loss": 0.83022189, + "learning_rate": 5.3051331837855045e-06, + "loss": 0.84089726, + "num_input_tokens_seen": 411355232, + "router_z_loss_mlp": 0.22961426, + "step": 4964, + "time_per_iteration": 2.5976028442382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065198, + "balance_loss_mlp": 1.04267979, + "epoch": 0.9551750673335898, + "flos": 643107382272.0, + "grad_norm": 0.05890053188259814, + "language_loss": 0.82927918, + "learning_rate": 5.259966551095341e-06, + "loss": 0.83993113, + "num_input_tokens_seen": 411432288, + "router_z_loss_mlp": 0.22509766, + "step": 4965, + "time_per_iteration": 2.8856465816497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063869, + "balance_loss_mlp": 1.04065895, + "epoch": 0.9553674490188534, + "flos": 472208030208.0, + "grad_norm": 0.05765142290447974, + "language_loss": 0.83205676, + "learning_rate": 5.214991993520546e-06, + "loss": 0.84269542, + "num_input_tokens_seen": 411499376, + "router_z_loss_mlp": 0.23205566, + "step": 4966, + "time_per_iteration": 2.5833020210266113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068625, + "balance_loss_mlp": 1.04520023, + "epoch": 0.955559830704117, + "flos": 528317945856.0, + "grad_norm": 0.06557271954059918, + "language_loss": 0.82097799, + "learning_rate": 5.170209528521763e-06, + "loss": 0.8316642, + "num_input_tokens_seen": 411564976, + "router_z_loss_mlp": 0.23400879, + "step": 4967, + "time_per_iteration": 2.5960209369659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062797, + "balance_loss_mlp": 1.03951526, + "epoch": 0.9557522123893806, + "flos": 548168518656.0, + "grad_norm": 0.058067854328256556, + "language_loss": 0.84217876, + "learning_rate": 5.125619173485196e-06, + "loss": 0.85280675, + "num_input_tokens_seen": 411636464, + "router_z_loss_mlp": 0.23266602, + "step": 4968, + "time_per_iteration": 2.653374195098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062828, + "balance_loss_mlp": 1.03951049, + "epoch": 0.955944594074644, + "flos": 509465479680.0, + "grad_norm": 0.05965226251084018, + "language_loss": 0.82160389, + "learning_rate": 5.08122094572222e-06, + "loss": 0.83223224, + "num_input_tokens_seen": 411710672, + "router_z_loss_mlp": 0.2331543, + "step": 4969, + "time_per_iteration": 2.675502061843872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065215, + "balance_loss_mlp": 1.04239869, + "epoch": 0.9561369757599076, + "flos": 527578997760.0, + "grad_norm": 0.05789220147713751, + "language_loss": 0.79880643, + "learning_rate": 5.037014862469824e-06, + "loss": 0.80945861, + "num_input_tokens_seen": 411785616, + "router_z_loss_mlp": 0.22827148, + "step": 4970, + "time_per_iteration": 2.7603402137756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066302, + "balance_loss_mlp": 1.04406953, + "epoch": 0.9563293574451712, + "flos": 498201062400.0, + "grad_norm": 0.07045752519217358, + "language_loss": 0.80387723, + "learning_rate": 4.993000940890391e-06, + "loss": 0.81454021, + "num_input_tokens_seen": 411854832, + "router_z_loss_mlp": 0.22241211, + "step": 4971, + "time_per_iteration": 2.6467912197113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003461, + "balance_loss_mlp": 0.99678528, + "epoch": 0.9565217391304348, + "flos": 1408875628032.0, + "grad_norm": 0.0039371398806634095, + "language_loss": 0.81773561, + "learning_rate": 4.949179198071585e-06, + "loss": 0.82777023, + "num_input_tokens_seen": 412081856, + "router_z_loss_mlp": 0.06689453, + "step": 4972, + "time_per_iteration": 4.884695291519165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063136, + "balance_loss_mlp": 1.04022372, + "epoch": 0.9567141208156984, + "flos": 503846853120.0, + "grad_norm": 0.058137821309089344, + "language_loss": 0.78417355, + "learning_rate": 4.905549651026464e-06, + "loss": 0.79480487, + "num_input_tokens_seen": 412155600, + "router_z_loss_mlp": 0.22912598, + "step": 4973, + "time_per_iteration": 2.7980639934539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063517, + "balance_loss_mlp": 1.04048562, + "epoch": 0.9569065025009619, + "flos": 433213526016.0, + "grad_norm": 0.06846975795352252, + "language_loss": 0.79742157, + "learning_rate": 4.86211231669359e-06, + "loss": 0.80805671, + "num_input_tokens_seen": 412219584, + "router_z_loss_mlp": 0.23022461, + "step": 4974, + "time_per_iteration": 2.478752613067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064289, + "balance_loss_mlp": 1.04140127, + "epoch": 0.9570988841862255, + "flos": 589959853056.0, + "grad_norm": 0.06934685034456788, + "language_loss": 0.78280979, + "learning_rate": 4.818867211936806e-06, + "loss": 0.79345274, + "num_input_tokens_seen": 412295088, + "router_z_loss_mlp": 0.2286377, + "step": 4975, + "time_per_iteration": 2.7857437133789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064696, + "balance_loss_mlp": 1.04105639, + "epoch": 0.957291265871489, + "flos": 767278448640.0, + "grad_norm": 0.0946486602746724, + "language_loss": 0.78849113, + "learning_rate": 4.7758143535454045e-06, + "loss": 0.79913813, + "num_input_tokens_seen": 412376992, + "router_z_loss_mlp": 0.23632812, + "step": 4976, + "time_per_iteration": 2.942918062210083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063015, + "balance_loss_mlp": 1.04046106, + "epoch": 0.9574836475567526, + "flos": 639104670720.0, + "grad_norm": 0.07529166601163789, + "language_loss": 0.84990984, + "learning_rate": 4.732953758233849e-06, + "loss": 0.86054003, + "num_input_tokens_seen": 412450064, + "router_z_loss_mlp": 0.22546387, + "step": 4977, + "time_per_iteration": 2.7789368629455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003403, + "balance_loss_mlp": 0.99677485, + "epoch": 0.9576760292420161, + "flos": 1575939649536.0, + "grad_norm": 0.003929886062785851, + "language_loss": 0.78607261, + "learning_rate": 4.690285442642272e-06, + "loss": 0.79610658, + "num_input_tokens_seen": 412676896, + "router_z_loss_mlp": 0.06640625, + "step": 4978, + "time_per_iteration": 4.893427848815918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065644, + "balance_loss_mlp": 1.04185009, + "epoch": 0.9578684109272797, + "flos": 496345439232.0, + "grad_norm": 0.05080656218249429, + "language_loss": 0.87270832, + "learning_rate": 4.6478094233358695e-06, + "loss": 0.8833648, + "num_input_tokens_seen": 412746848, + "router_z_loss_mlp": 0.23779297, + "step": 4979, + "time_per_iteration": 2.6311700344085693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066355, + "balance_loss_mlp": 1.04242969, + "epoch": 0.9580607926125433, + "flos": 429954531840.0, + "grad_norm": 0.08390852397863828, + "language_loss": 0.85619473, + "learning_rate": 4.605525716805337e-06, + "loss": 0.8668583, + "num_input_tokens_seen": 412810144, + "router_z_loss_mlp": 0.23913574, + "step": 4980, + "time_per_iteration": 2.4578068256378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065714, + "balance_loss_mlp": 1.04258728, + "epoch": 0.9582531742978069, + "flos": 1127262251520.0, + "grad_norm": 0.05376140040349082, + "language_loss": 0.80332768, + "learning_rate": 4.563434339466599e-06, + "loss": 0.81398481, + "num_input_tokens_seen": 412904768, + "router_z_loss_mlp": 0.23144531, + "step": 4981, + "time_per_iteration": 3.5303521156311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066494, + "balance_loss_mlp": 1.04290295, + "epoch": 0.9584455559830705, + "flos": 524458395648.0, + "grad_norm": 0.06188839718179842, + "language_loss": 0.79525679, + "learning_rate": 4.521535307661085e-06, + "loss": 0.80592173, + "num_input_tokens_seen": 412974592, + "router_z_loss_mlp": 0.23596191, + "step": 4982, + "time_per_iteration": 2.6728785037994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066093, + "balance_loss_mlp": 1.04308546, + "epoch": 0.9586379376683339, + "flos": 634187543040.0, + "grad_norm": 0.05647162716885676, + "language_loss": 0.80907547, + "learning_rate": 4.479828637655392e-06, + "loss": 0.81973636, + "num_input_tokens_seen": 413052848, + "router_z_loss_mlp": 0.23010254, + "step": 4983, + "time_per_iteration": 2.8294382095336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067857, + "balance_loss_mlp": 1.04449201, + "epoch": 0.9588303193535975, + "flos": 416061038592.0, + "grad_norm": 0.06368713584012144, + "language_loss": 0.83991754, + "learning_rate": 4.438314345641459e-06, + "loss": 0.85059619, + "num_input_tokens_seen": 413118000, + "router_z_loss_mlp": 0.23352051, + "step": 4984, + "time_per_iteration": 2.4864554405212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058115, + "balance_loss_mlp": 1.03466618, + "epoch": 0.9590227010388611, + "flos": 481683635712.0, + "grad_norm": 0.06774888803703895, + "language_loss": 0.78104466, + "learning_rate": 4.3969924477365585e-06, + "loss": 0.79162586, + "num_input_tokens_seen": 413185616, + "router_z_loss_mlp": 0.23425293, + "step": 4985, + "time_per_iteration": 2.59116530418396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059271, + "balance_loss_mlp": 1.03722906, + "epoch": 0.9592150827241247, + "flos": 684540440064.0, + "grad_norm": 0.06301853068999348, + "language_loss": 0.80390298, + "learning_rate": 4.355862959983359e-06, + "loss": 0.81449568, + "num_input_tokens_seen": 413265616, + "router_z_loss_mlp": 0.22045898, + "step": 4986, + "time_per_iteration": 2.954206705093384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063557, + "balance_loss_mlp": 1.04100311, + "epoch": 0.9594074644093882, + "flos": 574490092032.0, + "grad_norm": 0.053195859640902336, + "language_loss": 0.70766115, + "learning_rate": 4.314925898349642e-06, + "loss": 0.71829671, + "num_input_tokens_seen": 413341248, + "router_z_loss_mlp": 0.2253418, + "step": 4987, + "time_per_iteration": 2.7368128299713135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067667, + "balance_loss_mlp": 1.04468322, + "epoch": 0.9595998460946518, + "flos": 546871233024.0, + "grad_norm": 0.06464800221271895, + "language_loss": 0.78132504, + "learning_rate": 4.2741812787286395e-06, + "loss": 0.79200172, + "num_input_tokens_seen": 413416080, + "router_z_loss_mlp": 0.22973633, + "step": 4988, + "time_per_iteration": 2.771296977996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106073, + "balance_loss_mlp": 1.03774643, + "epoch": 0.9597922277799154, + "flos": 474043829760.0, + "grad_norm": 0.0631980320315198, + "language_loss": 0.78296041, + "learning_rate": 4.233629116938809e-06, + "loss": 0.79356772, + "num_input_tokens_seen": 413482336, + "router_z_loss_mlp": 0.22998047, + "step": 4989, + "time_per_iteration": 2.5766162872314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064136, + "balance_loss_mlp": 1.04084253, + "epoch": 0.9599846094651789, + "flos": 514691324928.0, + "grad_norm": 0.05683022845710803, + "language_loss": 0.8579731, + "learning_rate": 4.193269428723889e-06, + "loss": 0.86861444, + "num_input_tokens_seen": 413553248, + "router_z_loss_mlp": 0.23278809, + "step": 4990, + "time_per_iteration": 2.5965628623962402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062236, + "balance_loss_mlp": 1.03919303, + "epoch": 0.9601769911504425, + "flos": 594983066112.0, + "grad_norm": 0.06110034995600321, + "language_loss": 0.78472888, + "learning_rate": 4.1531022297529035e-06, + "loss": 0.79535127, + "num_input_tokens_seen": 413625776, + "router_z_loss_mlp": 0.23034668, + "step": 4991, + "time_per_iteration": 2.760037660598755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066688, + "balance_loss_mlp": 1.04364491, + "epoch": 0.960369372835706, + "flos": 493012293120.0, + "grad_norm": 0.05123755509819416, + "language_loss": 0.79453242, + "learning_rate": 4.1131275356201536e-06, + "loss": 0.80519927, + "num_input_tokens_seen": 413693056, + "router_z_loss_mlp": 0.23022461, + "step": 4992, + "time_per_iteration": 2.6341288089752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065116, + "balance_loss_mlp": 1.04189432, + "epoch": 0.9605617545209696, + "flos": 579293420544.0, + "grad_norm": 0.04990578397455196, + "language_loss": 0.82886499, + "learning_rate": 4.073345361845171e-06, + "loss": 0.83951616, + "num_input_tokens_seen": 413765616, + "router_z_loss_mlp": 0.23193359, + "step": 4993, + "time_per_iteration": 2.673229217529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062918, + "balance_loss_mlp": 1.04065001, + "epoch": 0.9607541362062332, + "flos": 927708857856.0, + "grad_norm": 0.054118063874610094, + "language_loss": 0.86487305, + "learning_rate": 4.033755723872767e-06, + "loss": 0.87550223, + "num_input_tokens_seen": 413850976, + "router_z_loss_mlp": 0.22265625, + "step": 4994, + "time_per_iteration": 3.2317514419555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106405, + "balance_loss_mlp": 1.04008889, + "epoch": 0.9609465178914968, + "flos": 573121225728.0, + "grad_norm": 0.059331665161215484, + "language_loss": 0.75806749, + "learning_rate": 3.994358637073036e-06, + "loss": 0.76870805, + "num_input_tokens_seen": 413931648, + "router_z_loss_mlp": 0.23950195, + "step": 4995, + "time_per_iteration": 2.8493144512176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062782, + "balance_loss_mlp": 1.03969097, + "epoch": 0.9611388995767602, + "flos": 530850475008.0, + "grad_norm": 0.05160071442281887, + "language_loss": 0.85746717, + "learning_rate": 3.955154116741244e-06, + "loss": 0.86809498, + "num_input_tokens_seen": 414003216, + "router_z_loss_mlp": 0.23083496, + "step": 4996, + "time_per_iteration": 2.6498258113861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059755, + "balance_loss_mlp": 1.03623509, + "epoch": 0.9613312812620238, + "flos": 646247808000.0, + "grad_norm": 0.05978415787629307, + "language_loss": 0.82224667, + "learning_rate": 3.916142178097881e-06, + "loss": 0.8328442, + "num_input_tokens_seen": 414077072, + "router_z_loss_mlp": 0.23522949, + "step": 4997, + "time_per_iteration": 2.825035333633423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065595, + "balance_loss_mlp": 1.04313636, + "epoch": 0.9615236629472874, + "flos": 496152718848.0, + "grad_norm": 0.05181929898036546, + "language_loss": 0.78011382, + "learning_rate": 3.877322836288888e-06, + "loss": 0.79076982, + "num_input_tokens_seen": 414157600, + "router_z_loss_mlp": 0.2244873, + "step": 4998, + "time_per_iteration": 2.841170310974121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062329, + "balance_loss_mlp": 1.03926229, + "epoch": 0.961716044632551, + "flos": 512974093824.0, + "grad_norm": 0.05566555595708184, + "language_loss": 0.75968587, + "learning_rate": 3.838696106385153e-06, + "loss": 0.77030915, + "num_input_tokens_seen": 414224880, + "router_z_loss_mlp": 0.23059082, + "step": 4999, + "time_per_iteration": 2.6079280376434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068691, + "balance_loss_mlp": 1.04517114, + "epoch": 0.9619084263178146, + "flos": 501084527616.0, + "grad_norm": 0.0634969920505646, + "language_loss": 0.80885196, + "learning_rate": 3.800262003382904e-06, + "loss": 0.81953883, + "num_input_tokens_seen": 414291728, + "router_z_loss_mlp": 0.23498535, + "step": 5000, + "time_per_iteration": 2.5831449031829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106539, + "balance_loss_mlp": 1.04257321, + "epoch": 0.9621008080030781, + "flos": 595635379200.0, + "grad_norm": 0.0788330829504222, + "language_loss": 0.74819994, + "learning_rate": 3.7620205422035923e-06, + "loss": 0.75885379, + "num_input_tokens_seen": 414369568, + "router_z_loss_mlp": 0.22790527, + "step": 5001, + "time_per_iteration": 2.8088111877441406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066486, + "balance_loss_mlp": 1.0436573, + "epoch": 0.9622931896883417, + "flos": 502250761728.0, + "grad_norm": 0.06376625274094212, + "language_loss": 0.82308555, + "learning_rate": 3.723971737693899e-06, + "loss": 0.83375037, + "num_input_tokens_seen": 414441424, + "router_z_loss_mlp": 0.22827148, + "step": 5002, + "time_per_iteration": 2.645930767059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063524, + "balance_loss_mlp": 1.03970599, + "epoch": 0.9624855713736052, + "flos": 607287808512.0, + "grad_norm": 0.05456162093483777, + "language_loss": 0.80836141, + "learning_rate": 3.6861156046256728e-06, + "loss": 0.81899667, + "num_input_tokens_seen": 414512960, + "router_z_loss_mlp": 0.23815918, + "step": 5003, + "time_per_iteration": 2.7728164196014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065524, + "balance_loss_mlp": 1.0426712, + "epoch": 0.9626779530588688, + "flos": 510715777536.0, + "grad_norm": 0.07008234047327347, + "language_loss": 0.84970057, + "learning_rate": 3.648452157695936e-06, + "loss": 0.86035579, + "num_input_tokens_seen": 414577392, + "router_z_loss_mlp": 0.22827148, + "step": 5004, + "time_per_iteration": 2.5669493675231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066163, + "balance_loss_mlp": 1.04288161, + "epoch": 0.9628703347441323, + "flos": 627294025728.0, + "grad_norm": 0.06552748478093777, + "language_loss": 0.82791591, + "learning_rate": 3.610981411526937e-06, + "loss": 0.83857757, + "num_input_tokens_seen": 414655152, + "router_z_loss_mlp": 0.23266602, + "step": 5005, + "time_per_iteration": 2.8153672218322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068034, + "balance_loss_mlp": 1.04431152, + "epoch": 0.9630627164293959, + "flos": 630758223360.0, + "grad_norm": 0.05992229311372926, + "language_loss": 0.77596062, + "learning_rate": 3.573703380666149e-06, + "loss": 0.786641, + "num_input_tokens_seen": 414730432, + "router_z_loss_mlp": 0.23693848, + "step": 5006, + "time_per_iteration": 2.7764079570770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063766, + "balance_loss_mlp": 1.0408895, + "epoch": 0.9632550981146595, + "flos": 570558961152.0, + "grad_norm": 0.05008951632274407, + "language_loss": 0.78372812, + "learning_rate": 3.5366180795861622e-06, + "loss": 0.79436582, + "num_input_tokens_seen": 414810688, + "router_z_loss_mlp": 0.2286377, + "step": 5007, + "time_per_iteration": 2.8054816722869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062955, + "balance_loss_mlp": 1.03951883, + "epoch": 0.9634474797999231, + "flos": 466117327872.0, + "grad_norm": 0.06832318603954099, + "language_loss": 0.80910051, + "learning_rate": 3.4997255226847937e-06, + "loss": 0.81973004, + "num_input_tokens_seen": 414880544, + "router_z_loss_mlp": 0.234375, + "step": 5008, + "time_per_iteration": 2.6352286338806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064861, + "balance_loss_mlp": 1.04144847, + "epoch": 0.9636398614851867, + "flos": 526600714752.0, + "grad_norm": 0.07994370035866022, + "language_loss": 0.85493284, + "learning_rate": 3.463025724284974e-06, + "loss": 0.86558145, + "num_input_tokens_seen": 414949920, + "router_z_loss_mlp": 0.23413086, + "step": 5009, + "time_per_iteration": 2.603555917739868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060923, + "balance_loss_mlp": 1.03749835, + "epoch": 0.9638322431704501, + "flos": 564831677952.0, + "grad_norm": 0.058787878064300095, + "language_loss": 0.752451, + "learning_rate": 3.4265186986348618e-06, + "loss": 0.76306027, + "num_input_tokens_seen": 415024288, + "router_z_loss_mlp": 0.234375, + "step": 5010, + "time_per_iteration": 2.7728347778320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067099, + "balance_loss_mlp": 1.04449689, + "epoch": 0.9640246248557137, + "flos": 477772328448.0, + "grad_norm": 0.08181968777797018, + "language_loss": 0.84829634, + "learning_rate": 3.3902044599076754e-06, + "loss": 0.85896736, + "num_input_tokens_seen": 415092032, + "router_z_loss_mlp": 0.22619629, + "step": 5011, + "time_per_iteration": 2.6068520545959473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060574, + "balance_loss_mlp": 1.03792429, + "epoch": 0.9642170065409773, + "flos": 539318062080.0, + "grad_norm": 0.0552728686967885, + "language_loss": 0.88548124, + "learning_rate": 3.354083022201859e-06, + "loss": 0.89608705, + "num_input_tokens_seen": 415158544, + "router_z_loss_mlp": 0.22644043, + "step": 5012, + "time_per_iteration": 2.6157679557800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062074, + "balance_loss_mlp": 1.03952003, + "epoch": 0.9644093882262409, + "flos": 523754325504.0, + "grad_norm": 0.05861545033938659, + "language_loss": 0.83724725, + "learning_rate": 3.3181543995410843e-06, + "loss": 0.84786797, + "num_input_tokens_seen": 415225088, + "router_z_loss_mlp": 0.22546387, + "step": 5013, + "time_per_iteration": 2.5874183177948 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063855, + "balance_loss_mlp": 1.04178977, + "epoch": 0.9646017699115044, + "flos": 574290031104.0, + "grad_norm": 0.06290318681113366, + "language_loss": 0.79001546, + "learning_rate": 3.2824186058740268e-06, + "loss": 0.80065405, + "num_input_tokens_seen": 415300224, + "router_z_loss_mlp": 0.22070312, + "step": 5014, + "time_per_iteration": 2.697143077850342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067718, + "balance_loss_mlp": 1.04372108, + "epoch": 0.964794151596768, + "flos": 636799366656.0, + "grad_norm": 0.07022535288197708, + "language_loss": 0.84282309, + "learning_rate": 3.246875655074588e-06, + "loss": 0.85350025, + "num_input_tokens_seen": 415368784, + "router_z_loss_mlp": 0.23986816, + "step": 5015, + "time_per_iteration": 2.721156358718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067023, + "balance_loss_mlp": 1.04393208, + "epoch": 0.9649865332820315, + "flos": 617435550720.0, + "grad_norm": 0.06772054018077114, + "language_loss": 0.86324394, + "learning_rate": 3.211525560941675e-06, + "loss": 0.87391412, + "num_input_tokens_seen": 415440752, + "router_z_loss_mlp": 0.23095703, + "step": 5016, + "time_per_iteration": 2.7218070030212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066045, + "balance_loss_mlp": 1.04256094, + "epoch": 0.9651789149672951, + "flos": 516183528960.0, + "grad_norm": 0.054732773145454494, + "language_loss": 0.81263906, + "learning_rate": 3.1763683371994754e-06, + "loss": 0.82329947, + "num_input_tokens_seen": 415516128, + "router_z_loss_mlp": 0.23461914, + "step": 5017, + "time_per_iteration": 2.763035535812378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062971, + "balance_loss_mlp": 1.03928423, + "epoch": 0.9653712966525587, + "flos": 492940712448.0, + "grad_norm": 0.06149986612463951, + "language_loss": 0.80204356, + "learning_rate": 3.1414039974972385e-06, + "loss": 0.81267327, + "num_input_tokens_seen": 415583744, + "router_z_loss_mlp": 0.23693848, + "step": 5018, + "time_per_iteration": 2.6125636100769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064154, + "balance_loss_mlp": 1.04086018, + "epoch": 0.9655636783378222, + "flos": 536560505856.0, + "grad_norm": 0.06357882552992955, + "language_loss": 0.82340455, + "learning_rate": 3.106632555409328e-06, + "loss": 0.83404613, + "num_input_tokens_seen": 415659856, + "router_z_loss_mlp": 0.23278809, + "step": 5019, + "time_per_iteration": 2.7467122077941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064987, + "balance_loss_mlp": 1.04167008, + "epoch": 0.9657560600230858, + "flos": 459023749632.0, + "grad_norm": 0.05685331710974359, + "language_loss": 0.82253885, + "learning_rate": 3.072054024435167e-06, + "loss": 0.83318865, + "num_input_tokens_seen": 415731792, + "router_z_loss_mlp": 0.2331543, + "step": 5020, + "time_per_iteration": 2.670370101928711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066255, + "balance_loss_mlp": 1.04366493, + "epoch": 0.9659484417083494, + "flos": 686178749952.0, + "grad_norm": 0.06470708717020164, + "language_loss": 0.83449835, + "learning_rate": 3.0376684179994064e-06, + "loss": 0.84516084, + "num_input_tokens_seen": 415809536, + "router_z_loss_mlp": 0.22583008, + "step": 5021, + "time_per_iteration": 2.8630871772766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003402, + "balance_loss_mlp": 0.99677426, + "epoch": 0.966140823393613, + "flos": 1502292178944.0, + "grad_norm": 0.003927469064220584, + "language_loss": 0.80694246, + "learning_rate": 3.0034757494516453e-06, + "loss": 0.81697649, + "num_input_tokens_seen": 416027600, + "router_z_loss_mlp": 0.06640625, + "step": 5022, + "time_per_iteration": 4.691370487213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064655, + "balance_loss_mlp": 1.04190993, + "epoch": 0.9663332050788765, + "flos": 464899336704.0, + "grad_norm": 0.07461109320403748, + "language_loss": 0.81199974, + "learning_rate": 2.9694760320667093e-06, + "loss": 0.82264626, + "num_input_tokens_seen": 416096128, + "router_z_loss_mlp": 0.22741699, + "step": 5023, + "time_per_iteration": 2.5815436840057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067341, + "balance_loss_mlp": 1.04451227, + "epoch": 0.96652558676414, + "flos": 500834907648.0, + "grad_norm": 0.054429682823401944, + "language_loss": 0.85471153, + "learning_rate": 2.9356692790444283e-06, + "loss": 0.86538494, + "num_input_tokens_seen": 416164256, + "router_z_loss_mlp": 0.22827148, + "step": 5024, + "time_per_iteration": 2.696014165878296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065276, + "balance_loss_mlp": 1.04140997, + "epoch": 0.9667179684494036, + "flos": 424839914496.0, + "grad_norm": 0.07944496939889083, + "language_loss": 0.82808036, + "learning_rate": 2.9020555035097484e-06, + "loss": 0.83873314, + "num_input_tokens_seen": 416227296, + "router_z_loss_mlp": 0.23864746, + "step": 5025, + "time_per_iteration": 2.438119888305664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067473, + "balance_loss_mlp": 1.04450107, + "epoch": 0.9669103501346672, + "flos": 516996628992.0, + "grad_norm": 0.062276776313522096, + "language_loss": 0.86128414, + "learning_rate": 2.8686347185127305e-06, + "loss": 0.87195885, + "num_input_tokens_seen": 416297184, + "router_z_loss_mlp": 0.22961426, + "step": 5026, + "time_per_iteration": 2.644537925720215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063932, + "balance_loss_mlp": 1.04109097, + "epoch": 0.9671027318199308, + "flos": 456241600512.0, + "grad_norm": 0.0854616119175948, + "language_loss": 0.75941432, + "learning_rate": 2.8354069370284396e-06, + "loss": 0.77005363, + "num_input_tokens_seen": 416363056, + "router_z_loss_mlp": 0.22827148, + "step": 5027, + "time_per_iteration": 2.5741615295410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064978, + "balance_loss_mlp": 1.04238749, + "epoch": 0.9672951135051943, + "flos": 525058951680.0, + "grad_norm": 0.05741509378155617, + "language_loss": 0.80092812, + "learning_rate": 2.802372171957057e-06, + "loss": 0.81157786, + "num_input_tokens_seen": 416430688, + "router_z_loss_mlp": 0.22595215, + "step": 5028, + "time_per_iteration": 2.603698492050171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060972, + "balance_loss_mlp": 1.03829837, + "epoch": 0.9674874951904578, + "flos": 573986082816.0, + "grad_norm": 0.06194459113870459, + "language_loss": 0.80152857, + "learning_rate": 2.7695304361237682e-06, + "loss": 0.81213832, + "num_input_tokens_seen": 416505248, + "router_z_loss_mlp": 0.22680664, + "step": 5029, + "time_per_iteration": 2.7249526977539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064927, + "balance_loss_mlp": 1.04158545, + "epoch": 0.9676798768757214, + "flos": 629184153600.0, + "grad_norm": 0.04555589373001173, + "language_loss": 0.80289042, + "learning_rate": 2.7368817422789848e-06, + "loss": 0.81353974, + "num_input_tokens_seen": 416592640, + "router_z_loss_mlp": 0.23352051, + "step": 5030, + "time_per_iteration": 2.90915846824646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003379, + "balance_loss_mlp": 0.99675071, + "epoch": 0.967872258560985, + "flos": 1463880605184.0, + "grad_norm": 0.003928487667441442, + "language_loss": 0.75563359, + "learning_rate": 2.7044261030979566e-06, + "loss": 0.76566738, + "num_input_tokens_seen": 416808560, + "router_z_loss_mlp": 0.06640625, + "step": 5031, + "time_per_iteration": 4.642369747161865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065621, + "balance_loss_mlp": 1.04282784, + "epoch": 0.9680646402462486, + "flos": 565503814656.0, + "grad_norm": 0.06195217532413699, + "language_loss": 0.79083759, + "learning_rate": 2.672163531181049e-06, + "loss": 0.80149376, + "num_input_tokens_seen": 416878208, + "router_z_loss_mlp": 0.22790527, + "step": 5032, + "time_per_iteration": 2.6946024894714355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003375, + "balance_loss_mlp": 0.99674749, + "epoch": 0.9682570219315121, + "flos": 1434463022592.0, + "grad_norm": 0.0039263261842775645, + "language_loss": 0.78074801, + "learning_rate": 2.6400940390537976e-06, + "loss": 0.79078174, + "num_input_tokens_seen": 417105968, + "router_z_loss_mlp": 0.06640625, + "step": 5033, + "time_per_iteration": 4.833622455596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064933, + "balance_loss_mlp": 1.04119873, + "epoch": 0.9684494036167757, + "flos": 584610670080.0, + "grad_norm": 0.07322568294021568, + "language_loss": 0.81921077, + "learning_rate": 2.608217639166688e-06, + "loss": 0.82986003, + "num_input_tokens_seen": 417175168, + "router_z_loss_mlp": 0.23693848, + "step": 5034, + "time_per_iteration": 2.7054622173309326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064729, + "balance_loss_mlp": 1.0416739, + "epoch": 0.9686417853020393, + "flos": 559064747520.0, + "grad_norm": 0.05265422766774479, + "language_loss": 0.84284925, + "learning_rate": 2.5765343438950982e-06, + "loss": 0.85349661, + "num_input_tokens_seen": 417247760, + "router_z_loss_mlp": 0.23071289, + "step": 5035, + "time_per_iteration": 2.6912314891815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064116, + "balance_loss_mlp": 1.04046464, + "epoch": 0.9688341669873028, + "flos": 784927604736.0, + "grad_norm": 0.05870206875682227, + "language_loss": 0.83489573, + "learning_rate": 2.545044165539745e-06, + "loss": 0.84553695, + "num_input_tokens_seen": 417324080, + "router_z_loss_mlp": 0.23632812, + "step": 5036, + "time_per_iteration": 2.947582721710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062286, + "balance_loss_mlp": 1.03920674, + "epoch": 0.9690265486725663, + "flos": 395899176960.0, + "grad_norm": 0.06292583451816228, + "language_loss": 0.79677629, + "learning_rate": 2.513747116326126e-06, + "loss": 0.80739915, + "num_input_tokens_seen": 417386416, + "router_z_loss_mlp": 0.23071289, + "step": 5037, + "time_per_iteration": 2.4658944606781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065139, + "balance_loss_mlp": 1.04173779, + "epoch": 0.9692189303578299, + "flos": 476373726720.0, + "grad_norm": 0.07250551131476936, + "language_loss": 0.77617192, + "learning_rate": 2.4826432084048002e-06, + "loss": 0.78682327, + "num_input_tokens_seen": 417459648, + "router_z_loss_mlp": 0.23400879, + "step": 5038, + "time_per_iteration": 2.7228803634643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106111, + "balance_loss_mlp": 1.03956842, + "epoch": 0.9694113120430935, + "flos": 597575066112.0, + "grad_norm": 0.06788850088092563, + "language_loss": 0.79366446, + "learning_rate": 2.451732453851385e-06, + "loss": 0.80427551, + "num_input_tokens_seen": 417530512, + "router_z_loss_mlp": 0.21557617, + "step": 5039, + "time_per_iteration": 2.7196829319000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061205, + "balance_loss_mlp": 1.03825676, + "epoch": 0.9696036937283571, + "flos": 500881895424.0, + "grad_norm": 0.053804028005827634, + "language_loss": 0.82859206, + "learning_rate": 2.4210148646665598e-06, + "loss": 0.83920407, + "num_input_tokens_seen": 417597600, + "router_z_loss_mlp": 0.22949219, + "step": 5040, + "time_per_iteration": 2.603468179702759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061822, + "balance_loss_mlp": 1.03809977, + "epoch": 0.9697960754136207, + "flos": 432277088256.0, + "grad_norm": 0.06156976605988288, + "language_loss": 0.87425447, + "learning_rate": 2.3904904527758952e-06, + "loss": 0.88487267, + "num_input_tokens_seen": 417659616, + "router_z_loss_mlp": 0.23718262, + "step": 5041, + "time_per_iteration": 2.4655745029449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106621, + "balance_loss_mlp": 1.04332173, + "epoch": 0.9699884570988841, + "flos": 568540353024.0, + "grad_norm": 0.054950180555539796, + "language_loss": 0.85486805, + "learning_rate": 2.3601592300300235e-06, + "loss": 0.86553025, + "num_input_tokens_seen": 417730896, + "router_z_loss_mlp": 0.22900391, + "step": 5042, + "time_per_iteration": 2.7471725940704346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069068, + "balance_loss_mlp": 1.04596519, + "epoch": 0.9701808387841477, + "flos": 516215835648.0, + "grad_norm": 0.05677739964495203, + "language_loss": 0.81870991, + "learning_rate": 2.33002120820458e-06, + "loss": 0.82940054, + "num_input_tokens_seen": 417803296, + "router_z_loss_mlp": 0.2310791, + "step": 5043, + "time_per_iteration": 2.7095541954040527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060887, + "balance_loss_mlp": 1.03853512, + "epoch": 0.9703732204694113, + "flos": 491517517824.0, + "grad_norm": 0.06567489938132548, + "language_loss": 0.76418149, + "learning_rate": 2.300076399000206e-06, + "loss": 0.77479041, + "num_input_tokens_seen": 417870208, + "router_z_loss_mlp": 0.22351074, + "step": 5044, + "time_per_iteration": 2.5679051876068115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063055, + "balance_loss_mlp": 1.03968978, + "epoch": 0.9705656021546749, + "flos": 626120451072.0, + "grad_norm": 0.06280101868126062, + "language_loss": 0.80511069, + "learning_rate": 2.2703248140424348e-06, + "loss": 0.8157413, + "num_input_tokens_seen": 417944464, + "router_z_loss_mlp": 0.23352051, + "step": 5045, + "time_per_iteration": 2.754497766494751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066316, + "balance_loss_mlp": 1.04334414, + "epoch": 0.9707579838399384, + "flos": 471437148672.0, + "grad_norm": 0.0575037191710409, + "language_loss": 0.82947087, + "learning_rate": 2.2407664648819715e-06, + "loss": 0.84013402, + "num_input_tokens_seen": 418010480, + "router_z_loss_mlp": 0.22961426, + "step": 5046, + "time_per_iteration": 2.6000595092773438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066433, + "balance_loss_mlp": 1.04273403, + "epoch": 0.970950365525202, + "flos": 492103019520.0, + "grad_norm": 0.06395406690389444, + "language_loss": 0.80750513, + "learning_rate": 2.2114013629942475e-06, + "loss": 0.81816947, + "num_input_tokens_seen": 418083952, + "router_z_loss_mlp": 0.23718262, + "step": 5047, + "time_per_iteration": 2.6433827877044678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058641, + "balance_loss_mlp": 1.03622973, + "epoch": 0.9711427472104656, + "flos": 557322923520.0, + "grad_norm": 0.06536793289339991, + "language_loss": 0.80885148, + "learning_rate": 2.1822295197799213e-06, + "loss": 0.81943792, + "num_input_tokens_seen": 418156672, + "router_z_loss_mlp": 0.22424316, + "step": 5048, + "time_per_iteration": 2.7203571796417236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106497, + "balance_loss_mlp": 1.04177177, + "epoch": 0.9713351288957291, + "flos": 625841095680.0, + "grad_norm": 0.05247386429416099, + "language_loss": 0.83924532, + "learning_rate": 2.153250946564489e-06, + "loss": 0.849895, + "num_input_tokens_seen": 418242160, + "router_z_loss_mlp": 0.23193359, + "step": 5049, + "time_per_iteration": 2.915627956390381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068661, + "balance_loss_mlp": 1.04564154, + "epoch": 0.9715275105809927, + "flos": 499073260032.0, + "grad_norm": 0.06425293622373854, + "language_loss": 0.81309581, + "learning_rate": 2.1244656545983397e-06, + "loss": 0.82378244, + "num_input_tokens_seen": 418316960, + "router_z_loss_mlp": 0.23022461, + "step": 5050, + "time_per_iteration": 2.720860242843628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065169, + "balance_loss_mlp": 1.04197037, + "epoch": 0.9717198922662562, + "flos": 477515367936.0, + "grad_norm": 0.0640839658378964, + "language_loss": 0.77984011, + "learning_rate": 2.0958736550570345e-06, + "loss": 0.79049182, + "num_input_tokens_seen": 418383888, + "router_z_loss_mlp": 0.23205566, + "step": 5051, + "time_per_iteration": 2.544619560241699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059378, + "balance_loss_mlp": 1.03681207, + "epoch": 0.9719122739515198, + "flos": 553446120960.0, + "grad_norm": 0.09980134854797204, + "language_loss": 0.78635657, + "learning_rate": 2.067474959040916e-06, + "loss": 0.79695034, + "num_input_tokens_seen": 418453776, + "router_z_loss_mlp": 0.22546387, + "step": 5052, + "time_per_iteration": 2.6725540161132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069486, + "balance_loss_mlp": 1.0463829, + "epoch": 0.9721046556367834, + "flos": 565852179456.0, + "grad_norm": 0.07128218141456762, + "language_loss": 0.80016816, + "learning_rate": 2.0392695775753312e-06, + "loss": 0.81086302, + "num_input_tokens_seen": 418521984, + "router_z_loss_mlp": 0.23083496, + "step": 5053, + "time_per_iteration": 2.664026975631714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061141, + "balance_loss_mlp": 1.03906298, + "epoch": 0.972297037322047, + "flos": 560315045376.0, + "grad_norm": 0.059142247701586874, + "language_loss": 0.78148782, + "learning_rate": 2.0112575216105766e-06, + "loss": 0.79209924, + "num_input_tokens_seen": 418598768, + "router_z_loss_mlp": 0.22094727, + "step": 5054, + "time_per_iteration": 2.7956132888793945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064724, + "balance_loss_mlp": 1.04113269, + "epoch": 0.9724894190073105, + "flos": 512440349184.0, + "grad_norm": 0.06082537810675151, + "language_loss": 0.79474902, + "learning_rate": 1.9834388020218974e-06, + "loss": 0.8053962, + "num_input_tokens_seen": 418670064, + "router_z_loss_mlp": 0.2355957, + "step": 5055, + "time_per_iteration": 2.671006202697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067024, + "balance_loss_mlp": 1.04326606, + "epoch": 0.972681800692574, + "flos": 613832961024.0, + "grad_norm": 0.07513583056263037, + "language_loss": 0.80732191, + "learning_rate": 1.9558134296094875e-06, + "loss": 0.81799209, + "num_input_tokens_seen": 418745216, + "router_z_loss_mlp": 0.23754883, + "step": 5056, + "time_per_iteration": 2.779228925704956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065639, + "balance_loss_mlp": 1.04301322, + "epoch": 0.9728741823778376, + "flos": 833911635456.0, + "grad_norm": 0.06408364503125263, + "language_loss": 0.83977121, + "learning_rate": 1.92838141509849e-06, + "loss": 0.85042763, + "num_input_tokens_seen": 418824224, + "router_z_loss_mlp": 0.22619629, + "step": 5057, + "time_per_iteration": 3.0838735103607178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066592, + "balance_loss_mlp": 1.04308391, + "epoch": 0.9730665640631012, + "flos": 571450982400.0, + "grad_norm": 0.059458566299808, + "language_loss": 0.8458215, + "learning_rate": 1.9011427691389415e-06, + "loss": 0.85648739, + "num_input_tokens_seen": 418899712, + "router_z_loss_mlp": 0.23522949, + "step": 5058, + "time_per_iteration": 2.7313530445098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064891, + "balance_loss_mlp": 1.0414784, + "epoch": 0.9732589457483648, + "flos": 506520345600.0, + "grad_norm": 0.058993279317627906, + "language_loss": 0.77250987, + "learning_rate": 1.8740975023057715e-06, + "loss": 0.78315884, + "num_input_tokens_seen": 418964912, + "router_z_loss_mlp": 0.23388672, + "step": 5059, + "time_per_iteration": 2.568094491958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060711, + "balance_loss_mlp": 1.0382638, + "epoch": 0.9734513274336283, + "flos": 926977623552.0, + "grad_norm": 0.061554025846400975, + "language_loss": 0.81109071, + "learning_rate": 1.84724562509897e-06, + "loss": 0.82169777, + "num_input_tokens_seen": 419040032, + "router_z_loss_mlp": 0.22473145, + "step": 5060, + "time_per_iteration": 3.157397747039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062998, + "balance_loss_mlp": 1.03989542, + "epoch": 0.9736437091188919, + "flos": 491930122752.0, + "grad_norm": 0.058836820412195824, + "language_loss": 0.78267944, + "learning_rate": 1.8205871479433089e-06, + "loss": 0.79330945, + "num_input_tokens_seen": 419112672, + "router_z_loss_mlp": 0.23071289, + "step": 5061, + "time_per_iteration": 2.7482900619506836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068334, + "balance_loss_mlp": 1.04470706, + "epoch": 0.9738360908041555, + "flos": 613321611264.0, + "grad_norm": 0.0694756230310086, + "language_loss": 0.83708924, + "learning_rate": 1.7941220811885096e-06, + "loss": 0.8477726, + "num_input_tokens_seen": 419183408, + "router_z_loss_mlp": 0.23596191, + "step": 5062, + "time_per_iteration": 2.7663509845733643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003347, + "balance_loss_mlp": 0.99671924, + "epoch": 0.974028472489419, + "flos": 1549561549824.0, + "grad_norm": 0.003919403605797286, + "language_loss": 0.75992095, + "learning_rate": 1.7678504351092972e-06, + "loss": 0.76995444, + "num_input_tokens_seen": 419415472, + "router_z_loss_mlp": 0.06640625, + "step": 5063, + "time_per_iteration": 4.96152138710022 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100335, + "balance_loss_mlp": 0.99672168, + "epoch": 0.9742208541746825, + "flos": 1411155965952.0, + "grad_norm": 0.0039186810532365335, + "language_loss": 0.79677713, + "learning_rate": 1.7417722199051245e-06, + "loss": 0.80681062, + "num_input_tokens_seen": 419651840, + "router_z_loss_mlp": 0.06640625, + "step": 5064, + "time_per_iteration": 5.010188817977905 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067385, + "balance_loss_mlp": 1.04452014, + "epoch": 0.9744132358599461, + "flos": 674884597248.0, + "grad_norm": 0.06180371506572385, + "language_loss": 0.7679944, + "learning_rate": 1.7158874457005592e-06, + "loss": 0.77866822, + "num_input_tokens_seen": 419729424, + "router_z_loss_mlp": 0.2286377, + "step": 5065, + "time_per_iteration": 2.936842203140259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062455, + "balance_loss_mlp": 1.03975797, + "epoch": 0.9746056175452097, + "flos": 598407616512.0, + "grad_norm": 0.06002895188447741, + "language_loss": 0.78050154, + "learning_rate": 1.690196122544896e-06, + "loss": 0.79112613, + "num_input_tokens_seen": 419803616, + "router_z_loss_mlp": 0.22717285, + "step": 5066, + "time_per_iteration": 2.780294895172119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064534, + "balance_loss_mlp": 1.04207551, + "epoch": 0.9747979992304733, + "flos": 732175428096.0, + "grad_norm": 0.0638520746537111, + "language_loss": 0.82705855, + "learning_rate": 1.6646982604123784e-06, + "loss": 0.83770382, + "num_input_tokens_seen": 419883536, + "router_z_loss_mlp": 0.22485352, + "step": 5067, + "time_per_iteration": 4.443971395492554 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068424, + "balance_loss_mlp": 1.04508257, + "epoch": 0.9749903809157369, + "flos": 616499112960.0, + "grad_norm": 0.08421424574447112, + "language_loss": 0.76827443, + "learning_rate": 1.6393938692022548e-06, + "loss": 0.77895868, + "num_input_tokens_seen": 419956816, + "router_z_loss_mlp": 0.23327637, + "step": 5068, + "time_per_iteration": 2.6909236907958984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058974, + "balance_loss_mlp": 1.03682458, + "epoch": 0.9751827626010003, + "flos": 468398039040.0, + "grad_norm": 0.05243523587913791, + "language_loss": 0.83662784, + "learning_rate": 1.6142829587384443e-06, + "loss": 0.84721756, + "num_input_tokens_seen": 420022096, + "router_z_loss_mlp": 0.22167969, + "step": 5069, + "time_per_iteration": 2.554863929748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064762, + "balance_loss_mlp": 1.04154027, + "epoch": 0.9753751442862639, + "flos": 599215574016.0, + "grad_norm": 0.08363220915033362, + "language_loss": 0.84941483, + "learning_rate": 1.5893655387698713e-06, + "loss": 0.86006248, + "num_input_tokens_seen": 420097008, + "router_z_loss_mlp": 0.23217773, + "step": 5070, + "time_per_iteration": 2.772955894470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060273, + "balance_loss_mlp": 1.03805256, + "epoch": 0.9755675259715275, + "flos": 650806285824.0, + "grad_norm": 0.0635669703618464, + "language_loss": 0.82277942, + "learning_rate": 1.5646416189704637e-06, + "loss": 0.83338213, + "num_input_tokens_seen": 420174960, + "router_z_loss_mlp": 0.22216797, + "step": 5071, + "time_per_iteration": 2.922133445739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065735, + "balance_loss_mlp": 1.0425607, + "epoch": 0.9757599076567911, + "flos": 563658103296.0, + "grad_norm": 0.06500360890677431, + "language_loss": 0.79198158, + "learning_rate": 1.5401112089387659e-06, + "loss": 0.80263901, + "num_input_tokens_seen": 420245248, + "router_z_loss_mlp": 0.23181152, + "step": 5072, + "time_per_iteration": 2.725067615509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060385, + "balance_loss_mlp": 1.03761649, + "epoch": 0.9759522893420547, + "flos": 504637558272.0, + "grad_norm": 0.061072535254532934, + "language_loss": 0.8039192, + "learning_rate": 1.5157743181983819e-06, + "loss": 0.81452304, + "num_input_tokens_seen": 420310688, + "router_z_loss_mlp": 0.22753906, + "step": 5073, + "time_per_iteration": 2.5948638916015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061976, + "balance_loss_mlp": 1.03927875, + "epoch": 0.9761446710273182, + "flos": 583728560640.0, + "grad_norm": 0.07099648020053952, + "language_loss": 0.82122862, + "learning_rate": 1.4916309561976982e-06, + "loss": 0.83184838, + "num_input_tokens_seen": 420379008, + "router_z_loss_mlp": 0.22680664, + "step": 5074, + "time_per_iteration": 2.7220845222473145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106256, + "balance_loss_mlp": 1.03939795, + "epoch": 0.9763370527125818, + "flos": 482207468544.0, + "grad_norm": 0.07455408744394738, + "language_loss": 0.81895626, + "learning_rate": 1.4676811323099947e-06, + "loss": 0.82958186, + "num_input_tokens_seen": 420445504, + "router_z_loss_mlp": 0.23168945, + "step": 5075, + "time_per_iteration": 2.591758966445923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065565, + "balance_loss_mlp": 1.04274833, + "epoch": 0.9765294343978453, + "flos": 618987225600.0, + "grad_norm": 0.05817197872146041, + "language_loss": 0.78506422, + "learning_rate": 1.4439248558335561e-06, + "loss": 0.79571986, + "num_input_tokens_seen": 420520528, + "router_z_loss_mlp": 0.22814941, + "step": 5076, + "time_per_iteration": 2.7101328372955322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060528, + "balance_loss_mlp": 1.03771114, + "epoch": 0.9767218160831089, + "flos": 526573550592.0, + "grad_norm": 0.06386112095671416, + "language_loss": 0.85028458, + "learning_rate": 1.4203621359911712e-06, + "loss": 0.86088979, + "num_input_tokens_seen": 420586224, + "router_z_loss_mlp": 0.22814941, + "step": 5077, + "time_per_iteration": 2.6009128093719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062594, + "balance_loss_mlp": 1.04019439, + "epoch": 0.9769141977683724, + "flos": 525194772480.0, + "grad_norm": 0.058942301821321326, + "language_loss": 0.83781874, + "learning_rate": 1.3969929819308557e-06, + "loss": 0.8484447, + "num_input_tokens_seen": 420655456, + "router_z_loss_mlp": 0.22399902, + "step": 5078, + "time_per_iteration": 2.737980604171753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106952, + "balance_loss_mlp": 1.0459764, + "epoch": 0.977106579453636, + "flos": 457615236096.0, + "grad_norm": 0.06395353271069072, + "language_loss": 0.80623591, + "learning_rate": 1.3738174027252416e-06, + "loss": 0.81693113, + "num_input_tokens_seen": 420733216, + "router_z_loss_mlp": 0.23522949, + "step": 5079, + "time_per_iteration": 2.783198833465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063803, + "balance_loss_mlp": 1.04111767, + "epoch": 0.9772989611388996, + "flos": 532090861056.0, + "grad_norm": 0.06963280842478477, + "language_loss": 0.81529284, + "learning_rate": 1.3508354073719642e-06, + "loss": 0.82593083, + "num_input_tokens_seen": 420803376, + "router_z_loss_mlp": 0.22680664, + "step": 5080, + "time_per_iteration": 2.615165948867798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065016, + "balance_loss_mlp": 1.04231882, + "epoch": 0.9774913428241632, + "flos": 755349235200.0, + "grad_norm": 0.05783942863824456, + "language_loss": 0.85814047, + "learning_rate": 1.3280470047933313e-06, + "loss": 0.86879063, + "num_input_tokens_seen": 420886256, + "router_z_loss_mlp": 0.22680664, + "step": 5081, + "time_per_iteration": 3.005134344100952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003318, + "balance_loss_mlp": 0.99668992, + "epoch": 0.9776837245094268, + "flos": 1554320088576.0, + "grad_norm": 0.003915824492337212, + "language_loss": 0.78895497, + "learning_rate": 1.3054522038366544e-06, + "loss": 0.7989881, + "num_input_tokens_seen": 421123728, + "router_z_loss_mlp": 0.06640625, + "step": 5082, + "time_per_iteration": 5.032810211181641 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060089, + "balance_loss_mlp": 1.03662872, + "epoch": 0.9778761061946902, + "flos": 592534600704.0, + "grad_norm": 0.06772639363056118, + "language_loss": 0.84042907, + "learning_rate": 1.2830510132739725e-06, + "loss": 0.85102993, + "num_input_tokens_seen": 421192576, + "router_z_loss_mlp": 0.234375, + "step": 5083, + "time_per_iteration": 2.683605194091797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068234, + "balance_loss_mlp": 1.04470205, + "epoch": 0.9780684878799538, + "flos": 414951704064.0, + "grad_norm": 0.11848839079428937, + "language_loss": 0.81910384, + "learning_rate": 1.2608434418022175e-06, + "loss": 0.82978618, + "num_input_tokens_seen": 421256272, + "router_z_loss_mlp": 0.23535156, + "step": 5084, + "time_per_iteration": 2.510514974594116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106606, + "balance_loss_mlp": 1.04306436, + "epoch": 0.9782608695652174, + "flos": 568411872768.0, + "grad_norm": 0.06042021971117926, + "language_loss": 0.84641409, + "learning_rate": 1.2388294980431036e-06, + "loss": 0.85707462, + "num_input_tokens_seen": 421332880, + "router_z_loss_mlp": 0.2298584, + "step": 5085, + "time_per_iteration": 2.7490227222442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062587, + "balance_loss_mlp": 1.03903139, + "epoch": 0.978453251250481, + "flos": 690472926720.0, + "grad_norm": 0.06323661108562777, + "language_loss": 0.83238792, + "learning_rate": 1.217009190543239e-06, + "loss": 0.84301388, + "num_input_tokens_seen": 421406160, + "router_z_loss_mlp": 0.2355957, + "step": 5086, + "time_per_iteration": 2.8859827518463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064613, + "balance_loss_mlp": 1.04137921, + "epoch": 0.9786456329357445, + "flos": 502505150976.0, + "grad_norm": 0.06308015457512217, + "language_loss": 0.77418578, + "learning_rate": 1.1953825277740694e-06, + "loss": 0.78483194, + "num_input_tokens_seen": 421476208, + "router_z_loss_mlp": 0.23217773, + "step": 5087, + "time_per_iteration": 2.640166997909546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064813, + "balance_loss_mlp": 1.04157913, + "epoch": 0.9788380146210081, + "flos": 863183485440.0, + "grad_norm": 0.06605726461697956, + "language_loss": 0.80655509, + "learning_rate": 1.1739495181317117e-06, + "loss": 0.81720316, + "num_input_tokens_seen": 421549232, + "router_z_loss_mlp": 0.23254395, + "step": 5088, + "time_per_iteration": 3.0655131340026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062744, + "balance_loss_mlp": 1.04057157, + "epoch": 0.9790303963062716, + "flos": 512717133312.0, + "grad_norm": 0.06638727169087975, + "language_loss": 0.84402472, + "learning_rate": 1.1527101699371767e-06, + "loss": 0.85465217, + "num_input_tokens_seen": 421617056, + "router_z_loss_mlp": 0.22192383, + "step": 5089, + "time_per_iteration": 2.63153338432312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065094, + "balance_loss_mlp": 1.04174054, + "epoch": 0.9792227779915352, + "flos": 494428147200.0, + "grad_norm": 0.06762738713186066, + "language_loss": 0.86859965, + "learning_rate": 1.1316644914364237e-06, + "loss": 0.87925059, + "num_input_tokens_seen": 421683424, + "router_z_loss_mlp": 0.23352051, + "step": 5090, + "time_per_iteration": 2.627124309539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063588, + "balance_loss_mlp": 1.04065216, + "epoch": 0.9794151596767988, + "flos": 608325562368.0, + "grad_norm": 0.06322157277550289, + "language_loss": 0.81654972, + "learning_rate": 1.1108124908000838e-06, + "loss": 0.82718563, + "num_input_tokens_seen": 421761200, + "router_z_loss_mlp": 0.22924805, + "step": 5091, + "time_per_iteration": 2.776026964187622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068334, + "balance_loss_mlp": 1.04480207, + "epoch": 0.9796075413620623, + "flos": 478222009344.0, + "grad_norm": 0.10119893374448796, + "language_loss": 0.86482507, + "learning_rate": 1.09015417612357e-06, + "loss": 0.87550843, + "num_input_tokens_seen": 421829600, + "router_z_loss_mlp": 0.23510742, + "step": 5092, + "time_per_iteration": 2.5913615226745605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063991, + "balance_loss_mlp": 1.04055452, + "epoch": 0.9797999230473259, + "flos": 592220740608.0, + "grad_norm": 0.06249893375532654, + "language_loss": 0.84391701, + "learning_rate": 1.0696895554271335e-06, + "loss": 0.85455692, + "num_input_tokens_seen": 421904928, + "router_z_loss_mlp": 0.23425293, + "step": 5093, + "time_per_iteration": 2.7546114921569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064835, + "balance_loss_mlp": 1.04194725, + "epoch": 0.9799923047325895, + "flos": 556381343232.0, + "grad_norm": 0.06263423673512068, + "language_loss": 0.81759882, + "learning_rate": 1.049418636655919e-06, + "loss": 0.82824719, + "num_input_tokens_seen": 421989616, + "router_z_loss_mlp": 0.22875977, + "step": 5094, + "time_per_iteration": 2.9061594009399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064423, + "balance_loss_mlp": 1.04176164, + "epoch": 0.9801846864178531, + "flos": 579456405504.0, + "grad_norm": 0.049311129804513056, + "language_loss": 0.8460865, + "learning_rate": 1.0293414276797974e-06, + "loss": 0.8567307, + "num_input_tokens_seen": 422067088, + "router_z_loss_mlp": 0.2265625, + "step": 5095, + "time_per_iteration": 2.7628769874572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067247, + "balance_loss_mlp": 1.0450387, + "epoch": 0.9803770681031165, + "flos": 515101358592.0, + "grad_norm": 0.05430804653354444, + "language_loss": 0.79867494, + "learning_rate": 1.0094579362933677e-06, + "loss": 0.80934739, + "num_input_tokens_seen": 422141136, + "router_z_loss_mlp": 0.22216797, + "step": 5096, + "time_per_iteration": 2.704636335372925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063536, + "balance_loss_mlp": 1.04074311, + "epoch": 0.9805694497883801, + "flos": 566988678144.0, + "grad_norm": 0.061103858425466714, + "language_loss": 0.78479362, + "learning_rate": 9.897681702160654e-07, + "loss": 0.79542893, + "num_input_tokens_seen": 422216400, + "router_z_loss_mlp": 0.22790527, + "step": 5097, + "time_per_iteration": 2.761188507080078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061653, + "balance_loss_mlp": 1.03853846, + "epoch": 0.9807618314736437, + "flos": 479351167488.0, + "grad_norm": 0.057577727000740944, + "language_loss": 0.73809493, + "learning_rate": 9.702721370922208e-07, + "loss": 0.74871147, + "num_input_tokens_seen": 422287664, + "router_z_loss_mlp": 0.23132324, + "step": 5098, + "time_per_iteration": 2.6452760696411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066484, + "balance_loss_mlp": 1.04276156, + "epoch": 0.9809542131589073, + "flos": 545285053440.0, + "grad_norm": 0.10492983204691474, + "language_loss": 0.80005634, + "learning_rate": 9.509698444908344e-07, + "loss": 0.81072116, + "num_input_tokens_seen": 422357552, + "router_z_loss_mlp": 0.23718262, + "step": 5099, + "time_per_iteration": 2.62776255607605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063774, + "balance_loss_mlp": 1.04059947, + "epoch": 0.9811465948441709, + "flos": 520843696128.0, + "grad_norm": 0.06439125090315591, + "language_loss": 0.79966128, + "learning_rate": 9.318612999057452e-07, + "loss": 0.81029904, + "num_input_tokens_seen": 422425872, + "router_z_loss_mlp": 0.23144531, + "step": 5100, + "time_per_iteration": 2.590998649597168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063439, + "balance_loss_mlp": 1.04044354, + "epoch": 0.9813389765294344, + "flos": 541282341888.0, + "grad_norm": 0.06775361472467173, + "language_loss": 0.80269879, + "learning_rate": 9.129465107554635e-07, + "loss": 0.81333315, + "num_input_tokens_seen": 422495760, + "router_z_loss_mlp": 0.23010254, + "step": 5101, + "time_per_iteration": 2.677475690841675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062592, + "balance_loss_mlp": 1.03938198, + "epoch": 0.981531358214698, + "flos": 567356866560.0, + "grad_norm": 0.055868673578204395, + "language_loss": 0.8439014, + "learning_rate": 8.942254843834485e-07, + "loss": 0.85452735, + "num_input_tokens_seen": 422568112, + "router_z_loss_mlp": 0.23217773, + "step": 5102, + "time_per_iteration": 2.8073782920837402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062341, + "balance_loss_mlp": 1.03984594, + "epoch": 0.9817237398999615, + "flos": 577272241152.0, + "grad_norm": 0.0521173779260352, + "language_loss": 0.81328356, + "learning_rate": 8.756982280578307e-07, + "loss": 0.82390702, + "num_input_tokens_seen": 422641280, + "router_z_loss_mlp": 0.22497559, + "step": 5103, + "time_per_iteration": 2.7432682514190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106656, + "balance_loss_mlp": 1.04287314, + "epoch": 0.9819161215852251, + "flos": 701507547648.0, + "grad_norm": 0.05634160003679975, + "language_loss": 0.81942677, + "learning_rate": 8.573647489714676e-07, + "loss": 0.83009243, + "num_input_tokens_seen": 422720416, + "router_z_loss_mlp": 0.23657227, + "step": 5104, + "time_per_iteration": 2.944218873977661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064768, + "balance_loss_mlp": 1.04151022, + "epoch": 0.9821085032704886, + "flos": 624188104704.0, + "grad_norm": 0.0740075344049142, + "language_loss": 0.84234631, + "learning_rate": 8.392250542421653e-07, + "loss": 0.85299402, + "num_input_tokens_seen": 422800384, + "router_z_loss_mlp": 0.23266602, + "step": 5105, + "time_per_iteration": 2.960850238800049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105814, + "balance_loss_mlp": 1.03584814, + "epoch": 0.9823008849557522, + "flos": 499505688576.0, + "grad_norm": 0.059286633961085584, + "language_loss": 0.81240541, + "learning_rate": 8.212791509122353e-07, + "loss": 0.82298684, + "num_input_tokens_seen": 422870768, + "router_z_loss_mlp": 0.22302246, + "step": 5106, + "time_per_iteration": 2.668607473373413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106239, + "balance_loss_mlp": 1.03929949, + "epoch": 0.9824932666410158, + "flos": 523815994368.0, + "grad_norm": 0.06438064273190143, + "language_loss": 0.72763407, + "learning_rate": 8.035270459489929e-07, + "loss": 0.73825794, + "num_input_tokens_seen": 422942864, + "router_z_loss_mlp": 0.23095703, + "step": 5107, + "time_per_iteration": 2.6748061180114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064285, + "balance_loss_mlp": 1.04153991, + "epoch": 0.9826856483262794, + "flos": 502663366656.0, + "grad_norm": 0.058904571353806466, + "language_loss": 0.8261292, + "learning_rate": 7.859687462443698e-07, + "loss": 0.83677202, + "num_input_tokens_seen": 423013600, + "router_z_loss_mlp": 0.22753906, + "step": 5108, + "time_per_iteration": 2.630704402923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068042, + "balance_loss_mlp": 1.04453397, + "epoch": 0.982878030011543, + "flos": 562056869376.0, + "grad_norm": 0.054352316826208384, + "language_loss": 0.84295356, + "learning_rate": 7.686042586151354e-07, + "loss": 0.853634, + "num_input_tokens_seen": 423093680, + "router_z_loss_mlp": 0.23510742, + "step": 5109, + "time_per_iteration": 2.8963773250579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064593, + "balance_loss_mlp": 1.04148996, + "epoch": 0.9830704116968064, + "flos": 537101591040.0, + "grad_norm": 0.05702599379327277, + "language_loss": 0.83052921, + "learning_rate": 7.514335898027857e-07, + "loss": 0.84117514, + "num_input_tokens_seen": 423168608, + "router_z_loss_mlp": 0.23095703, + "step": 5110, + "time_per_iteration": 2.7798235416412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063272, + "balance_loss_mlp": 1.04049134, + "epoch": 0.98326279338207, + "flos": 458949597696.0, + "grad_norm": 0.07220377225871695, + "language_loss": 0.84532428, + "learning_rate": 7.344567464735441e-07, + "loss": 0.85595697, + "num_input_tokens_seen": 423233552, + "router_z_loss_mlp": 0.22766113, + "step": 5111, + "time_per_iteration": 2.510430335998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061353, + "balance_loss_mlp": 1.03872716, + "epoch": 0.9834551750673336, + "flos": 640974974976.0, + "grad_norm": 0.06064500141416257, + "language_loss": 0.79539931, + "learning_rate": 7.17673735218416e-07, + "loss": 0.80601287, + "num_input_tokens_seen": 423307440, + "router_z_loss_mlp": 0.22607422, + "step": 5112, + "time_per_iteration": 2.809610366821289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106535, + "balance_loss_mlp": 1.04302168, + "epoch": 0.9836475567525972, + "flos": 1071807220224.0, + "grad_norm": 0.05639844002899201, + "language_loss": 0.79515636, + "learning_rate": 7.010845625530782e-07, + "loss": 0.80580986, + "num_input_tokens_seen": 423394880, + "router_z_loss_mlp": 0.22338867, + "step": 5113, + "time_per_iteration": 3.4230575561523438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064426, + "balance_loss_mlp": 1.04124022, + "epoch": 0.9838399384378607, + "flos": 565209778176.0, + "grad_norm": 0.07537003994735685, + "language_loss": 0.75971985, + "learning_rate": 6.846892349181566e-07, + "loss": 0.77036417, + "num_input_tokens_seen": 423461792, + "router_z_loss_mlp": 0.23181152, + "step": 5114, + "time_per_iteration": 2.6420531272888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066558, + "balance_loss_mlp": 1.04363441, + "epoch": 0.9840323201231242, + "flos": 772805670912.0, + "grad_norm": 0.060824497499080046, + "language_loss": 0.79589713, + "learning_rate": 6.684877586787819e-07, + "loss": 0.80656278, + "num_input_tokens_seen": 423539952, + "router_z_loss_mlp": 0.22912598, + "step": 5115, + "time_per_iteration": 2.9657704830169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063418, + "balance_loss_mlp": 1.04027963, + "epoch": 0.9842247018083878, + "flos": 472262358528.0, + "grad_norm": 0.057744957305318596, + "language_loss": 0.85976076, + "learning_rate": 6.524801401249225e-07, + "loss": 0.87039495, + "num_input_tokens_seen": 423607184, + "router_z_loss_mlp": 0.23132324, + "step": 5116, + "time_per_iteration": 2.558858633041382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062689, + "balance_loss_mlp": 1.04007459, + "epoch": 0.9844170834936514, + "flos": 525259012608.0, + "grad_norm": 0.0686306369668635, + "language_loss": 0.85068059, + "learning_rate": 6.366663854713295e-07, + "loss": 0.8613075, + "num_input_tokens_seen": 423676528, + "router_z_loss_mlp": 0.22607422, + "step": 5117, + "time_per_iteration": 2.60532546043396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004337, + "balance_loss_mlp": 0.99770856, + "epoch": 0.984609465178915, + "flos": 1567247408640.0, + "grad_norm": 0.0029769817335575824, + "language_loss": 0.77162516, + "learning_rate": 6.210465008574251e-07, + "loss": 0.78166854, + "num_input_tokens_seen": 423905856, + "router_z_loss_mlp": 0.06640625, + "step": 5118, + "time_per_iteration": 4.922377586364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066915, + "balance_loss_mlp": 1.04326415, + "epoch": 0.9848018468641785, + "flos": 519548981760.0, + "grad_norm": 0.07012559788215562, + "language_loss": 0.82266271, + "learning_rate": 6.056204923473584e-07, + "loss": 0.83333188, + "num_input_tokens_seen": 423972496, + "router_z_loss_mlp": 0.23632812, + "step": 5119, + "time_per_iteration": 2.6348206996917725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063866, + "balance_loss_mlp": 1.04067993, + "epoch": 0.9849942285494421, + "flos": 493004952576.0, + "grad_norm": 0.05674795107066025, + "language_loss": 0.83400589, + "learning_rate": 5.903883659301167e-07, + "loss": 0.84464455, + "num_input_tokens_seen": 424039968, + "router_z_loss_mlp": 0.23168945, + "step": 5120, + "time_per_iteration": 2.563413619995117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067769, + "balance_loss_mlp": 1.04472589, + "epoch": 0.9851866102347057, + "flos": 546001606656.0, + "grad_norm": 0.06041502302174469, + "language_loss": 0.80996847, + "learning_rate": 5.753501275193029e-07, + "loss": 0.82064617, + "num_input_tokens_seen": 424108096, + "router_z_loss_mlp": 0.23046875, + "step": 5121, + "time_per_iteration": 2.650181293487549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061842, + "balance_loss_mlp": 1.03858471, + "epoch": 0.9853789919199692, + "flos": 476257729536.0, + "grad_norm": 0.06754220961608856, + "language_loss": 0.80218607, + "learning_rate": 5.605057829531912e-07, + "loss": 0.81280452, + "num_input_tokens_seen": 424172256, + "router_z_loss_mlp": 0.23254395, + "step": 5122, + "time_per_iteration": 2.593015193939209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066, + "balance_loss_mlp": 1.04334998, + "epoch": 0.9855713736052328, + "flos": 1032619995648.0, + "grad_norm": 0.06282848940738642, + "language_loss": 0.76010883, + "learning_rate": 5.458553379950049e-07, + "loss": 0.77076876, + "num_input_tokens_seen": 424261088, + "router_z_loss_mlp": 0.22644043, + "step": 5123, + "time_per_iteration": 3.397169828414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067794, + "balance_loss_mlp": 1.04490566, + "epoch": 0.9857637552904963, + "flos": 495050724864.0, + "grad_norm": 0.053030998912210484, + "language_loss": 0.82679278, + "learning_rate": 5.31398798332472e-07, + "loss": 0.83747071, + "num_input_tokens_seen": 424329168, + "router_z_loss_mlp": 0.22900391, + "step": 5124, + "time_per_iteration": 2.599963665008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064031, + "balance_loss_mlp": 1.04034376, + "epoch": 0.9859561369757599, + "flos": 592267728384.0, + "grad_norm": 0.06500711995807343, + "language_loss": 0.8369258, + "learning_rate": 5.17136169578103e-07, + "loss": 0.84756613, + "num_input_tokens_seen": 424399392, + "router_z_loss_mlp": 0.23657227, + "step": 5125, + "time_per_iteration": 2.771230936050415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061253, + "balance_loss_mlp": 1.03853178, + "epoch": 0.9861485186610235, + "flos": 486971149824.0, + "grad_norm": 0.06307068299560632, + "language_loss": 0.78874338, + "learning_rate": 5.030674572691907e-07, + "loss": 0.79935598, + "num_input_tokens_seen": 424470080, + "router_z_loss_mlp": 0.22705078, + "step": 5126, + "time_per_iteration": 2.6188127994537354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062656, + "balance_loss_mlp": 1.03972054, + "epoch": 0.9863409003462871, + "flos": 518795352576.0, + "grad_norm": 0.04863083160531793, + "language_loss": 0.82588637, + "learning_rate": 4.891926668676994e-07, + "loss": 0.83651292, + "num_input_tokens_seen": 424541824, + "router_z_loss_mlp": 0.22924805, + "step": 5127, + "time_per_iteration": 2.6533150672912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004341, + "balance_loss_mlp": 0.99771261, + "epoch": 0.9865332820315506, + "flos": 1486026570240.0, + "grad_norm": 0.0029781002545871183, + "language_loss": 0.79182732, + "learning_rate": 4.755118037602646e-07, + "loss": 0.8018707, + "num_input_tokens_seen": 424773408, + "router_z_loss_mlp": 0.06640625, + "step": 5128, + "time_per_iteration": 4.873580694198608 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065539, + "balance_loss_mlp": 1.0421623, + "epoch": 0.9867256637168141, + "flos": 582112645632.0, + "grad_norm": 0.06379657534480902, + "language_loss": 0.79116714, + "learning_rate": 4.620248732582488e-07, + "loss": 0.80182254, + "num_input_tokens_seen": 424840608, + "router_z_loss_mlp": 0.23376465, + "step": 5129, + "time_per_iteration": 2.6797971725463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063082, + "balance_loss_mlp": 1.0404557, + "epoch": 0.9869180454020777, + "flos": 959303264256.0, + "grad_norm": 0.053165441114429356, + "language_loss": 0.86395758, + "learning_rate": 4.487318805977969e-07, + "loss": 0.87458837, + "num_input_tokens_seen": 424926128, + "router_z_loss_mlp": 0.22631836, + "step": 5130, + "time_per_iteration": 3.2278151512145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065979, + "balance_loss_mlp": 1.04216099, + "epoch": 0.9871104270873413, + "flos": 770730163200.0, + "grad_norm": 0.08455721104882859, + "language_loss": 0.82508123, + "learning_rate": 4.3563283093966954e-07, + "loss": 0.83574104, + "num_input_tokens_seen": 425005744, + "router_z_loss_mlp": 0.23803711, + "step": 5131, + "time_per_iteration": 3.013922691345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106287, + "balance_loss_mlp": 1.03873003, + "epoch": 0.9873028087726049, + "flos": 446444794368.0, + "grad_norm": 0.06717519377101111, + "language_loss": 0.78595114, + "learning_rate": 4.2272772936940986e-07, + "loss": 0.79657984, + "num_input_tokens_seen": 425068112, + "router_z_loss_mlp": 0.24121094, + "step": 5132, + "time_per_iteration": 2.5394835472106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106368, + "balance_loss_mlp": 1.04094672, + "epoch": 0.9874951904578684, + "flos": 507612427776.0, + "grad_norm": 0.0838918478695267, + "language_loss": 0.86701912, + "learning_rate": 4.1001658089717676e-07, + "loss": 0.87765592, + "num_input_tokens_seen": 425137408, + "router_z_loss_mlp": 0.22741699, + "step": 5133, + "time_per_iteration": 2.5988612174987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067946, + "balance_loss_mlp": 1.04530835, + "epoch": 0.987687572143132, + "flos": 716742743040.0, + "grad_norm": 0.05777251050052068, + "language_loss": 0.8243646, + "learning_rate": 3.9749939045791164e-07, + "loss": 0.83504403, + "num_input_tokens_seen": 425213504, + "router_z_loss_mlp": 0.2265625, + "step": 5134, + "time_per_iteration": 2.9052164554595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004341, + "balance_loss_mlp": 0.99771327, + "epoch": 0.9878799538283956, + "flos": 1538647695360.0, + "grad_norm": 0.0029781392554615676, + "language_loss": 0.79817951, + "learning_rate": 3.851761629111716e-07, + "loss": 0.80822289, + "num_input_tokens_seen": 425451296, + "router_z_loss_mlp": 0.06640625, + "step": 5135, + "time_per_iteration": 4.864543676376343 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058561, + "balance_loss_mlp": 1.03648376, + "epoch": 0.9880723355136591, + "flos": 721424931840.0, + "grad_norm": 0.05497798109654085, + "language_loss": 0.81718028, + "learning_rate": 3.730469030412964e-07, + "loss": 0.82776594, + "num_input_tokens_seen": 425527536, + "router_z_loss_mlp": 0.22070312, + "step": 5136, + "time_per_iteration": 2.9499542713165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106326, + "balance_loss_mlp": 1.0411582, + "epoch": 0.9882647171989226, + "flos": 557350087680.0, + "grad_norm": 0.05211029783189689, + "language_loss": 0.84319884, + "learning_rate": 3.611116155572969e-07, + "loss": 0.85383141, + "num_input_tokens_seen": 425596608, + "router_z_loss_mlp": 0.22106934, + "step": 5137, + "time_per_iteration": 2.7629799842834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064001, + "balance_loss_mlp": 1.04021907, + "epoch": 0.9884570988841862, + "flos": 562820410368.0, + "grad_norm": 0.05978192539046426, + "language_loss": 0.80376399, + "learning_rate": 3.493703050927999e-07, + "loss": 0.81440401, + "num_input_tokens_seen": 425667280, + "router_z_loss_mlp": 0.2376709, + "step": 5138, + "time_per_iteration": 2.698518753051758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061053, + "balance_loss_mlp": 1.03812945, + "epoch": 0.9886494805694498, + "flos": 431763167232.0, + "grad_norm": 0.06509914319715783, + "language_loss": 0.8613711, + "learning_rate": 3.378229762062146e-07, + "loss": 0.87198162, + "num_input_tokens_seen": 425730736, + "router_z_loss_mlp": 0.22912598, + "step": 5139, + "time_per_iteration": 2.496880531311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064714, + "balance_loss_mlp": 1.04178977, + "epoch": 0.9888418622547134, + "flos": 592082348544.0, + "grad_norm": 0.05021855236929169, + "language_loss": 0.90418476, + "learning_rate": 3.264696333806771e-07, + "loss": 0.91483188, + "num_input_tokens_seen": 425807616, + "router_z_loss_mlp": 0.22912598, + "step": 5140, + "time_per_iteration": 2.810107469558716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064111, + "balance_loss_mlp": 1.04127049, + "epoch": 0.989034243939977, + "flos": 1134993461760.0, + "grad_norm": 0.054738913900339296, + "language_loss": 0.80376035, + "learning_rate": 3.1531028102388394e-07, + "loss": 0.81440145, + "num_input_tokens_seen": 425900880, + "router_z_loss_mlp": 0.22827148, + "step": 5141, + "time_per_iteration": 3.5171725749969482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061083, + "balance_loss_mlp": 1.03775382, + "epoch": 0.9892266256252404, + "flos": 566670048768.0, + "grad_norm": 0.05685524099004798, + "language_loss": 0.81969726, + "learning_rate": 3.0434492346825824e-07, + "loss": 0.83030808, + "num_input_tokens_seen": 425973632, + "router_z_loss_mlp": 0.2331543, + "step": 5142, + "time_per_iteration": 2.671959638595581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064239, + "balance_loss_mlp": 1.04062331, + "epoch": 0.989419007310504, + "flos": 640577051136.0, + "grad_norm": 0.0861584296867121, + "language_loss": 0.83990133, + "learning_rate": 2.9357356497095033e-07, + "loss": 0.85054374, + "num_input_tokens_seen": 426057088, + "router_z_loss_mlp": 0.23620605, + "step": 5143, + "time_per_iteration": 2.88116192817688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065538, + "balance_loss_mlp": 1.04206514, + "epoch": 0.9896113889957676, + "flos": 455478059520.0, + "grad_norm": 0.056260924480767305, + "language_loss": 0.81494832, + "learning_rate": 2.829962097138372e-07, + "loss": 0.82560366, + "num_input_tokens_seen": 426124336, + "router_z_loss_mlp": 0.23461914, + "step": 5144, + "time_per_iteration": 2.6181671619415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010632, + "balance_loss_mlp": 1.04035997, + "epoch": 0.9898037706810312, + "flos": 567339614208.0, + "grad_norm": 0.06359073729727067, + "language_loss": 0.80580842, + "learning_rate": 2.726128618033008e-07, + "loss": 0.81644046, + "num_input_tokens_seen": 426191888, + "router_z_loss_mlp": 0.22839355, + "step": 5145, + "time_per_iteration": 2.6478822231292725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004339, + "balance_loss_mlp": 0.99771106, + "epoch": 0.9899961523662947, + "flos": 1550268191232.0, + "grad_norm": 0.0029775104009917776, + "language_loss": 0.78146422, + "learning_rate": 2.624235252706164e-07, + "loss": 0.7915076, + "num_input_tokens_seen": 426425840, + "router_z_loss_mlp": 0.06640625, + "step": 5146, + "time_per_iteration": 4.94190239906311 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063906, + "balance_loss_mlp": 1.04033875, + "epoch": 0.9901885340515583, + "flos": 610709787648.0, + "grad_norm": 0.06714350569578352, + "language_loss": 0.85334808, + "learning_rate": 2.524282040715642e-07, + "loss": 0.86398715, + "num_input_tokens_seen": 426506080, + "router_z_loss_mlp": 0.23571777, + "step": 5147, + "time_per_iteration": 2.8776228427886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062575, + "balance_loss_mlp": 1.03987753, + "epoch": 0.9903809157368219, + "flos": 517483385856.0, + "grad_norm": 0.05973429515052892, + "language_loss": 0.83089972, + "learning_rate": 2.426269020866512e-07, + "loss": 0.8415255, + "num_input_tokens_seen": 426573936, + "router_z_loss_mlp": 0.22692871, + "step": 5148, + "time_per_iteration": 2.5988693237304688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065813, + "balance_loss_mlp": 1.04279327, + "epoch": 0.9905732974220854, + "flos": 1100426757120.0, + "grad_norm": 0.05728172246327606, + "language_loss": 0.80693364, + "learning_rate": 2.3301962312122226e-07, + "loss": 0.81759173, + "num_input_tokens_seen": 426657472, + "router_z_loss_mlp": 0.22998047, + "step": 5149, + "time_per_iteration": 3.4677181243896484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065933, + "balance_loss_mlp": 1.0427587, + "epoch": 0.990765679107349, + "flos": 858002056704.0, + "grad_norm": 0.07032896745482858, + "language_loss": 0.84725714, + "learning_rate": 2.2360637090496073e-07, + "loss": 0.85791647, + "num_input_tokens_seen": 426740560, + "router_z_loss_mlp": 0.23181152, + "step": 5150, + "time_per_iteration": 3.101963520050049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065032, + "balance_loss_mlp": 1.04247713, + "epoch": 0.9909580607926125, + "flos": 491287721472.0, + "grad_norm": 0.05971113737071383, + "language_loss": 0.80380929, + "learning_rate": 2.143871490925542e-07, + "loss": 0.81445956, + "num_input_tokens_seen": 426809296, + "router_z_loss_mlp": 0.22558594, + "step": 5151, + "time_per_iteration": 2.5920066833496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064633, + "balance_loss_mlp": 1.041852, + "epoch": 0.9911504424778761, + "flos": 585060350976.0, + "grad_norm": 0.06220607613020044, + "language_loss": 0.79609364, + "learning_rate": 2.0536196126319519e-07, + "loss": 0.80673993, + "num_input_tokens_seen": 426881056, + "router_z_loss_mlp": 0.2277832, + "step": 5152, + "time_per_iteration": 2.7163023948669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061577, + "balance_loss_mlp": 1.03867722, + "epoch": 0.9913428241631397, + "flos": 570030359040.0, + "grad_norm": 0.05987133143138268, + "language_loss": 0.81748044, + "learning_rate": 1.9653081092074753e-07, + "loss": 0.82809621, + "num_input_tokens_seen": 426949664, + "router_z_loss_mlp": 0.22900391, + "step": 5153, + "time_per_iteration": 2.6854746341705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066119, + "balance_loss_mlp": 1.04346955, + "epoch": 0.9915352058484033, + "flos": 489745958400.0, + "grad_norm": 0.061045684154491624, + "language_loss": 0.86315995, + "learning_rate": 1.8789370149374652e-07, + "loss": 0.87382114, + "num_input_tokens_seen": 427018816, + "router_z_loss_mlp": 0.22644043, + "step": 5154, + "time_per_iteration": 2.676959991455078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063029, + "balance_loss_mlp": 1.03990245, + "epoch": 0.9917275875336667, + "flos": 744047741952.0, + "grad_norm": 0.05686586567139158, + "language_loss": 0.82752365, + "learning_rate": 1.7945063633545423e-07, + "loss": 0.83815396, + "num_input_tokens_seen": 427097984, + "router_z_loss_mlp": 0.23132324, + "step": 5155, + "time_per_iteration": 2.9827871322631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065793, + "balance_loss_mlp": 1.04195142, + "epoch": 0.9919199692189303, + "flos": 508272081408.0, + "grad_norm": 0.05312703789662428, + "language_loss": 0.80161297, + "learning_rate": 1.7120161872380412e-07, + "loss": 0.81227094, + "num_input_tokens_seen": 427169280, + "router_z_loss_mlp": 0.23840332, + "step": 5156, + "time_per_iteration": 2.6331825256347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062015, + "balance_loss_mlp": 1.03901994, + "epoch": 0.9921123509041939, + "flos": 543963174912.0, + "grad_norm": 0.05853430222577831, + "language_loss": 0.84317166, + "learning_rate": 1.6314665186123457e-07, + "loss": 0.85379183, + "num_input_tokens_seen": 427237312, + "router_z_loss_mlp": 0.2298584, + "step": 5157, + "time_per_iteration": 2.6789844036102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064673, + "balance_loss_mlp": 1.04108155, + "epoch": 0.9923047325894575, + "flos": 671561362944.0, + "grad_norm": 0.05962975100622896, + "language_loss": 0.77451545, + "learning_rate": 1.5528573887507724e-07, + "loss": 0.78516221, + "num_input_tokens_seen": 427305008, + "router_z_loss_mlp": 0.23571777, + "step": 5158, + "time_per_iteration": 2.7843782901763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064845, + "balance_loss_mlp": 1.04091978, + "epoch": 0.9924971142747211, + "flos": 466557096960.0, + "grad_norm": 0.06079228956039329, + "language_loss": 0.80781394, + "learning_rate": 1.4761888281711322e-07, + "loss": 0.81846237, + "num_input_tokens_seen": 427377008, + "router_z_loss_mlp": 0.23913574, + "step": 5159, + "time_per_iteration": 2.699115753173828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010682, + "balance_loss_mlp": 1.04509735, + "epoch": 0.9926894959599846, + "flos": 491581757952.0, + "grad_norm": 0.058577196677344574, + "language_loss": 0.82705361, + "learning_rate": 1.4014608666390594e-07, + "loss": 0.83773553, + "num_input_tokens_seen": 427444528, + "router_z_loss_mlp": 0.2310791, + "step": 5160, + "time_per_iteration": 2.573960304260254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070258, + "balance_loss_mlp": 1.04675007, + "epoch": 0.9928818776452482, + "flos": 492389715456.0, + "grad_norm": 0.10155127334981055, + "language_loss": 0.81777894, + "learning_rate": 1.328673533166902e-07, + "loss": 0.82848155, + "num_input_tokens_seen": 427509808, + "router_z_loss_mlp": 0.23510742, + "step": 5161, + "time_per_iteration": 2.6028358936309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061966, + "balance_loss_mlp": 1.03905427, + "epoch": 0.9930742593305117, + "flos": 546357312000.0, + "grad_norm": 0.05598226563819152, + "language_loss": 0.84204501, + "learning_rate": 1.2578268560131666e-07, + "loss": 0.85266471, + "num_input_tokens_seen": 427587936, + "router_z_loss_mlp": 0.22900391, + "step": 5162, + "time_per_iteration": 2.7396605014801025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062986, + "balance_loss_mlp": 1.03965724, + "epoch": 0.9932666410157753, + "flos": 585510031872.0, + "grad_norm": 0.06298013807205607, + "language_loss": 0.86198676, + "learning_rate": 1.1889208626825188e-07, + "loss": 0.87261665, + "num_input_tokens_seen": 427662224, + "router_z_loss_mlp": 0.2331543, + "step": 5163, + "time_per_iteration": 2.754847288131714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064074, + "balance_loss_mlp": 1.04148424, + "epoch": 0.9934590227010388, + "flos": 537086909952.0, + "grad_norm": 0.1690844779753495, + "language_loss": 0.84142578, + "learning_rate": 1.1219555799268921e-07, + "loss": 0.85206652, + "num_input_tokens_seen": 427730544, + "router_z_loss_mlp": 0.22583008, + "step": 5164, + "time_per_iteration": 2.668947219848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060593, + "balance_loss_mlp": 1.03828871, + "epoch": 0.9936514043863024, + "flos": 518014559232.0, + "grad_norm": 0.06013738975249062, + "language_loss": 0.87179309, + "learning_rate": 1.0569310337443794e-07, + "loss": 0.88239902, + "num_input_tokens_seen": 427799760, + "router_z_loss_mlp": 0.22314453, + "step": 5165, + "time_per_iteration": 2.603496789932251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062895, + "balance_loss_mlp": 1.03958964, + "epoch": 0.993843786071566, + "flos": 744625903104.0, + "grad_norm": 0.059796641809083874, + "language_loss": 0.8027035, + "learning_rate": 9.938472493803419e-08, + "loss": 0.8133325, + "num_input_tokens_seen": 427881936, + "router_z_loss_mlp": 0.23303223, + "step": 5166, + "time_per_iteration": 3.0740067958831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064775, + "balance_loss_mlp": 1.04208994, + "epoch": 0.9940361677568296, + "flos": 525918666240.0, + "grad_norm": 0.08114562381518384, + "language_loss": 0.82492352, + "learning_rate": 9.327042513251893e-08, + "loss": 0.83557123, + "num_input_tokens_seen": 427951648, + "router_z_loss_mlp": 0.22680664, + "step": 5167, + "time_per_iteration": 2.67103910446167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061467, + "balance_loss_mlp": 1.03862691, + "epoch": 0.9942285494420932, + "flos": 555650108928.0, + "grad_norm": 0.06282919009589391, + "language_loss": 0.80018061, + "learning_rate": 8.735020633177104e-08, + "loss": 0.81079531, + "num_input_tokens_seen": 428031184, + "router_z_loss_mlp": 0.22851562, + "step": 5168, + "time_per_iteration": 2.746816873550415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063198, + "balance_loss_mlp": 1.04025054, + "epoch": 0.9944209311273566, + "flos": 585996788736.0, + "grad_norm": 0.0553184423711719, + "language_loss": 0.81770915, + "learning_rate": 8.162407083411872e-08, + "loss": 0.82834113, + "num_input_tokens_seen": 428107296, + "router_z_loss_mlp": 0.22937012, + "step": 5169, + "time_per_iteration": 2.698438882827759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065941, + "balance_loss_mlp": 1.04285061, + "epoch": 0.9946133128126202, + "flos": 735518486016.0, + "grad_norm": 0.05668738134113141, + "language_loss": 0.82296896, + "learning_rate": 7.609202086272804e-08, + "loss": 0.8336283, + "num_input_tokens_seen": 428187904, + "router_z_loss_mlp": 0.23071289, + "step": 5170, + "time_per_iteration": 3.0203161239624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010653, + "balance_loss_mlp": 1.04139829, + "epoch": 0.9948056944978838, + "flos": 646018011648.0, + "grad_norm": 0.0586239910556347, + "language_loss": 0.82445252, + "learning_rate": 7.075405856526995e-08, + "loss": 0.83510554, + "num_input_tokens_seen": 428255856, + "router_z_loss_mlp": 0.23876953, + "step": 5171, + "time_per_iteration": 2.733257293701172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062523, + "balance_loss_mlp": 1.03965902, + "epoch": 0.9949980761831474, + "flos": 445846809600.0, + "grad_norm": 0.073524028673894, + "language_loss": 0.86205852, + "learning_rate": 6.561018601414226e-08, + "loss": 0.87268376, + "num_input_tokens_seen": 428321872, + "router_z_loss_mlp": 0.2286377, + "step": 5172, + "time_per_iteration": 2.5351874828338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068489, + "balance_loss_mlp": 1.04420602, + "epoch": 0.995190457868411, + "flos": 435637398528.0, + "grad_norm": 0.05471779387138947, + "language_loss": 0.85440135, + "learning_rate": 6.066040520641414e-08, + "loss": 0.8650862, + "num_input_tokens_seen": 428389232, + "router_z_loss_mlp": 0.24279785, + "step": 5173, + "time_per_iteration": 2.545313596725464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063421, + "balance_loss_mlp": 1.04110503, + "epoch": 0.9953828395536745, + "flos": 514187315712.0, + "grad_norm": 0.06435296438429121, + "language_loss": 0.82080287, + "learning_rate": 5.590471806377062e-08, + "loss": 0.83143711, + "num_input_tokens_seen": 428456128, + "router_z_loss_mlp": 0.22302246, + "step": 5174, + "time_per_iteration": 2.5876967906951904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066975, + "balance_loss_mlp": 1.0443728, + "epoch": 0.995575221238938, + "flos": 479847836160.0, + "grad_norm": 0.0684169578482112, + "language_loss": 0.82180631, + "learning_rate": 5.134312643245709e-08, + "loss": 0.83247602, + "num_input_tokens_seen": 428523504, + "router_z_loss_mlp": 0.22595215, + "step": 5175, + "time_per_iteration": 2.56477689743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061988, + "balance_loss_mlp": 1.03824103, + "epoch": 0.9957676029242016, + "flos": 587785600512.0, + "grad_norm": 0.0622556493926411, + "language_loss": 0.76661915, + "learning_rate": 4.6975632083445793e-08, + "loss": 0.77723902, + "num_input_tokens_seen": 428596880, + "router_z_loss_mlp": 0.23730469, + "step": 5176, + "time_per_iteration": 2.727720260620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106837, + "balance_loss_mlp": 1.04471922, + "epoch": 0.9959599846094652, + "flos": 426465741312.0, + "grad_norm": 0.06993506205261471, + "language_loss": 0.80097485, + "learning_rate": 4.280223671243588e-08, + "loss": 0.81165856, + "num_input_tokens_seen": 428659472, + "router_z_loss_mlp": 0.2364502, + "step": 5177, + "time_per_iteration": 2.4686057567596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064385, + "balance_loss_mlp": 1.04079378, + "epoch": 0.9961523662947287, + "flos": 611619061248.0, + "grad_norm": 0.058518209299634485, + "language_loss": 0.80758059, + "learning_rate": 3.8822941939575804e-08, + "loss": 0.81822443, + "num_input_tokens_seen": 428736704, + "router_z_loss_mlp": 0.23608398, + "step": 5178, + "time_per_iteration": 2.807483434677124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064873, + "balance_loss_mlp": 1.04153216, + "epoch": 0.9963447479799923, + "flos": 550785111552.0, + "grad_norm": 0.07135517223335183, + "language_loss": 0.74197721, + "learning_rate": 3.5037749309851927e-08, + "loss": 0.75262594, + "num_input_tokens_seen": 428808560, + "router_z_loss_mlp": 0.2331543, + "step": 5179, + "time_per_iteration": 2.6507856845855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066091, + "balance_loss_mlp": 1.04279709, + "epoch": 0.9965371296652559, + "flos": 625873402368.0, + "grad_norm": 0.06151129189851597, + "language_loss": 0.8901037, + "learning_rate": 3.1446660292755446e-08, + "loss": 0.90076458, + "num_input_tokens_seen": 428880688, + "router_z_loss_mlp": 0.23291016, + "step": 5180, + "time_per_iteration": 2.7257490158081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067144, + "balance_loss_mlp": 1.04370713, + "epoch": 0.9967295113505195, + "flos": 639522044928.0, + "grad_norm": 0.05801908419016484, + "language_loss": 0.82134813, + "learning_rate": 2.8049676282504433e-08, + "loss": 0.83201957, + "num_input_tokens_seen": 428960096, + "router_z_loss_mlp": 0.23425293, + "step": 5181, + "time_per_iteration": 2.9021897315979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061511, + "balance_loss_mlp": 1.03774095, + "epoch": 0.996921893035783, + "flos": 607389124608.0, + "grad_norm": 0.06523374059991581, + "language_loss": 0.76779681, + "learning_rate": 2.484679859793282e-08, + "loss": 0.77841198, + "num_input_tokens_seen": 429031296, + "router_z_loss_mlp": 0.23754883, + "step": 5182, + "time_per_iteration": 2.7176480293273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065298, + "balance_loss_mlp": 1.04209948, + "epoch": 0.9971142747210465, + "flos": 644162388480.0, + "grad_norm": 0.06922055405359702, + "language_loss": 0.81937838, + "learning_rate": 2.183802848243488e-08, + "loss": 0.83003139, + "num_input_tokens_seen": 429103312, + "router_z_loss_mlp": 0.23168945, + "step": 5183, + "time_per_iteration": 2.7730648517608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061879, + "balance_loss_mlp": 1.03976595, + "epoch": 0.9973066564063101, + "flos": 1040773722624.0, + "grad_norm": 0.05364949272503117, + "language_loss": 0.81309438, + "learning_rate": 1.9023367104187285e-08, + "loss": 0.82371318, + "num_input_tokens_seen": 429194896, + "router_z_loss_mlp": 0.22094727, + "step": 5184, + "time_per_iteration": 3.3700714111328125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068176, + "balance_loss_mlp": 1.04495406, + "epoch": 0.9974990380915737, + "flos": 665095131648.0, + "grad_norm": 0.0564934876502354, + "language_loss": 0.83195555, + "learning_rate": 1.640281555587153e-08, + "loss": 0.84263736, + "num_input_tokens_seen": 429267664, + "router_z_loss_mlp": 0.23205566, + "step": 5185, + "time_per_iteration": 2.8419084548950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106209, + "balance_loss_mlp": 1.03943992, + "epoch": 0.9976914197768373, + "flos": 718121521152.0, + "grad_norm": 0.061013510739593706, + "language_loss": 0.77592587, + "learning_rate": 1.3976374855007024e-08, + "loss": 0.78654671, + "num_input_tokens_seen": 429343472, + "router_z_loss_mlp": 0.22644043, + "step": 5186, + "time_per_iteration": 2.900660991668701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067293, + "balance_loss_mlp": 1.04322433, + "epoch": 0.9978838014621008, + "flos": 518328419328.0, + "grad_norm": 0.07107533241149569, + "language_loss": 0.79306746, + "learning_rate": 1.1744045943451464e-08, + "loss": 0.80374032, + "num_input_tokens_seen": 429411472, + "router_z_loss_mlp": 0.24060059, + "step": 5187, + "time_per_iteration": 2.6121342182159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060009, + "balance_loss_mlp": 1.03775311, + "epoch": 0.9980761831473643, + "flos": 603430829568.0, + "grad_norm": 0.05826890310585048, + "language_loss": 0.84421951, + "learning_rate": 9.70582968801148e-09, + "loss": 0.8548196, + "num_input_tokens_seen": 429486704, + "router_z_loss_mlp": 0.22241211, + "step": 5188, + "time_per_iteration": 2.809272050857544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064719, + "balance_loss_mlp": 1.04069877, + "epoch": 0.9982685648326279, + "flos": 453523691520.0, + "grad_norm": 0.056344907455401605, + "language_loss": 0.89689124, + "learning_rate": 7.861726879943021e-09, + "loss": 0.90753841, + "num_input_tokens_seen": 429554736, + "router_z_loss_mlp": 0.24023438, + "step": 5189, + "time_per_iteration": 2.566086769104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061619, + "balance_loss_mlp": 1.03783727, + "epoch": 0.9984609465178915, + "flos": 481424103936.0, + "grad_norm": 0.06655173686317807, + "language_loss": 0.78688771, + "learning_rate": 6.211738235173403e-09, + "loss": 0.79750389, + "num_input_tokens_seen": 429623216, + "router_z_loss_mlp": 0.2376709, + "step": 5190, + "time_per_iteration": 2.6371493339538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067596, + "balance_loss_mlp": 1.04430294, + "epoch": 0.9986533282031551, + "flos": 476941976064.0, + "grad_norm": 0.06114669924659269, + "language_loss": 0.84300482, + "learning_rate": 4.755864394301312e-09, + "loss": 0.85368079, + "num_input_tokens_seen": 429695808, + "router_z_loss_mlp": 0.23266602, + "step": 5191, + "time_per_iteration": 2.6488306522369385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067181, + "balance_loss_mlp": 1.04337525, + "epoch": 0.9988457098884186, + "flos": 641948488704.0, + "grad_norm": 0.05899170995522276, + "language_loss": 0.86477023, + "learning_rate": 3.494105922541291e-09, + "loss": 0.87544203, + "num_input_tokens_seen": 429774464, + "router_z_loss_mlp": 0.23791504, + "step": 5192, + "time_per_iteration": 2.828174114227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064616, + "balance_loss_mlp": 1.04143012, + "epoch": 0.9990380915736822, + "flos": 396321693696.0, + "grad_norm": 0.07115874499342345, + "language_loss": 0.8789829, + "learning_rate": 2.4264633097237365e-09, + "loss": 0.88962901, + "num_input_tokens_seen": 429835872, + "router_z_loss_mlp": 0.23181152, + "step": 5193, + "time_per_iteration": 2.4448673725128174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106445, + "balance_loss_mlp": 1.04135919, + "epoch": 0.9992304732589458, + "flos": 576123259392.0, + "grad_norm": 0.06564969851429614, + "language_loss": 0.84961963, + "learning_rate": 1.552936970405927e-09, + "loss": 0.86026412, + "num_input_tokens_seen": 429911440, + "router_z_loss_mlp": 0.23083496, + "step": 5194, + "time_per_iteration": 2.733811855316162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069185, + "balance_loss_mlp": 1.046404, + "epoch": 0.9994228549442093, + "flos": 544291716096.0, + "grad_norm": 0.07182096607307784, + "language_loss": 0.75418997, + "learning_rate": 8.735272437054853e-10, + "loss": 0.76488185, + "num_input_tokens_seen": 429982512, + "router_z_loss_mlp": 0.2277832, + "step": 5195, + "time_per_iteration": 2.661844253540039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106477, + "balance_loss_mlp": 1.04167914, + "epoch": 0.9996152366294728, + "flos": 1471314502656.0, + "grad_norm": 0.06834659341862818, + "language_loss": 0.81183863, + "learning_rate": 3.882343933003796e-10, + "loss": 0.82248634, + "num_input_tokens_seen": 430070944, + "router_z_loss_mlp": 0.23071289, + "step": 5196, + "time_per_iteration": 3.754668951034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048308, + "balance_loss_mlp": 1.02844799, + "epoch": 0.9998076183147364, + "flos": 618950149632.0, + "grad_norm": 0.1142694827050757, + "language_loss": 0.69779372, + "learning_rate": 9.70586077619906e-11, + "loss": 0.70827675, + "num_input_tokens_seen": 430164864, + "router_z_loss_mlp": 0.19836426, + "step": 5197, + "time_per_iteration": 4.012174844741821 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026206, + "balance_loss_mlp": 1.01353955, + "epoch": 1.0, + "flos": 1290737617920.0, + "grad_norm": 0.028700024398426234, + "language_loss": 0.84229785, + "learning_rate": 0.0, + "loss": 0.85255992, + "num_input_tokens_seen": 430340944, + "router_z_loss_mlp": 0.12670898, + "step": 5198, + "time_per_iteration": 5.750073432922363 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 430340944, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.171926856433664e+16, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/training_args.bin b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3b28f0633932ff84d8e0fde7beb2f9c59f0d04be --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b54b92ce31f27a60f5f91da41c22febbdc5fe6a9ac82c4d361c2b9dbc9096639 +size 7992 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/zero_to_fp32.py b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/checkpoint-5198/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft_pretrain/Full_smoe_sigmoidgating/config.json b/sft_pretrain/Full_smoe_sigmoidgating/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a6e980e9f2970ef75f4606820f66d7af3d36c5e3 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_sigmoidgating", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": true, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/generation_config.json b/sft_pretrain/Full_smoe_sigmoidgating/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/model-00001-of-00002.safetensors b/sft_pretrain/Full_smoe_sigmoidgating/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/model-00002-of-00002.safetensors b/sft_pretrain/Full_smoe_sigmoidgating/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a4d239cd18f38895f9c0d72f4844238ac8a2c4ab --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f279a4fbf39064aa08c5b72e2bdde9242074e1b223240741dbb926caad995eed +size 3759025152 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/model.safetensors.index.json b/sft_pretrain/Full_smoe_sigmoidgating/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..2b3448fcaafe26e098595b9e2e5bd9e68d63ee24 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/model.safetensors.index.json @@ -0,0 +1,672 @@ +{ + "metadata": { + "total_size": 8731424736 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/special_tokens_map.json b/sft_pretrain/Full_smoe_sigmoidgating/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/tokenizer.model b/sft_pretrain/Full_smoe_sigmoidgating/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_smoe_sigmoidgating/tokenizer_config.json b/sft_pretrain/Full_smoe_sigmoidgating/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/trainer_state.json b/sft_pretrain/Full_smoe_sigmoidgating/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..80b370aa0122765b462764436c9cadcbe7bbd85c --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/trainer_state.json @@ -0,0 +1,78013 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 5198, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03935784, + "balance_loss_mlp": 2.84935808, + "epoch": 0.00019238168526356292, + "flos": 470575609344.0, + "grad_norm": 13.498251331228948, + "language_loss": 2.81572914, + "learning_rate": 0.0, + "loss": 1.90346789, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 10.859375, + "step": 1, + "time_per_iteration": 24.30480647087097 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0351246, + "balance_loss_mlp": 2.65644169, + "epoch": 0.00038476337052712584, + "flos": 504556065792.0, + "grad_norm": 27.482987886380492, + "language_loss": 8.76816368, + "learning_rate": 0.00013726078121135892, + "loss": 8.80328846, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 8.578125, + "step": 2, + "time_per_iteration": 2.6929261684417725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03513305, + "balance_loss_mlp": 2.65728736, + "epoch": 0.0005771450557906887, + "flos": 599161245696.0, + "grad_norm": 28.576563245741852, + "language_loss": 9.00053596, + "learning_rate": 0.00021755319103969496, + "loss": 9.03566933, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 8.578125, + "step": 3, + "time_per_iteration": 2.7945075035095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03667009, + "balance_loss_mlp": 2.78657675, + "epoch": 0.0007695267410542517, + "flos": 580405326336.0, + "grad_norm": 15.694146018083416, + "language_loss": 2.74122858, + "learning_rate": 0.00027452156242271784, + "loss": 2.77789879, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 8.828125, + "step": 4, + "time_per_iteration": 2.7773804664611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03933422, + "balance_loss_mlp": 3.01102829, + "epoch": 0.0009619084263178145, + "flos": 486116204544.0, + "grad_norm": 3.505338851882968, + "language_loss": 1.83478093, + "learning_rate": 0.0003187096642208417, + "loss": 1.87411511, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 9.2109375, + "step": 5, + "time_per_iteration": 2.651094675064087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04005588, + "balance_loss_mlp": 3.05420256, + "epoch": 0.0011542901115813775, + "flos": 560028349440.0, + "grad_norm": 3.050600048840319, + "language_loss": 1.61776543, + "learning_rate": 0.0003548139722510539, + "loss": 1.65782118, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 9.4921875, + "step": 6, + "time_per_iteration": 2.697614908218384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03708502, + "balance_loss_mlp": 2.7708497, + "epoch": 0.0013466717968449403, + "flos": 533966307840.0, + "grad_norm": 0.7974788691124679, + "language_loss": 1.32417345, + "learning_rate": 0.00038533972973918044, + "loss": 1.36125851, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 9.3515625, + "step": 7, + "time_per_iteration": 2.6407949924468994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0332405, + "balance_loss_mlp": 2.38868618, + "epoch": 0.0015390534821085034, + "flos": 492295739904.0, + "grad_norm": 0.7144720842381633, + "language_loss": 1.25956392, + "learning_rate": 0.0004117823436340768, + "loss": 1.29280448, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 9.3359375, + "step": 8, + "time_per_iteration": 2.6287930011749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02785454, + "balance_loss_mlp": 1.8508532, + "epoch": 0.0017314351673720662, + "flos": 564676033536.0, + "grad_norm": 0.3140255221758466, + "language_loss": 1.29993415, + "learning_rate": 0.00043510638207938993, + "loss": 1.32778871, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 9.3203125, + "step": 9, + "time_per_iteration": 2.8048858642578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0244685, + "balance_loss_mlp": 1.50004196, + "epoch": 0.001923816852635629, + "flos": 593406798336.0, + "grad_norm": 0.19799802642524775, + "language_loss": 1.19032216, + "learning_rate": 0.00045597044543220066, + "loss": 1.2147907, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 9.4453125, + "step": 10, + "time_per_iteration": 2.7669434547424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02310187, + "balance_loss_mlp": 1.35117221, + "epoch": 0.002116198537899192, + "flos": 609625046016.0, + "grad_norm": 0.14485632700798082, + "language_loss": 1.18421102, + "learning_rate": 0.00047484428652143135, + "loss": 1.20731282, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 9.5703125, + "step": 11, + "time_per_iteration": 2.9067423343658447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02309394, + "balance_loss_mlp": 1.33740926, + "epoch": 0.002308580223162755, + "flos": 545129409024.0, + "grad_norm": 0.1366980934684776, + "language_loss": 1.24379897, + "learning_rate": 0.0004920747534624128, + "loss": 1.26689291, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 9.703125, + "step": 12, + "time_per_iteration": 2.612813949584961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.022984, + "balance_loss_mlp": 1.32565212, + "epoch": 0.002500961908426318, + "flos": 644750461440.0, + "grad_norm": 0.11957957623458634, + "language_loss": 1.26615512, + "learning_rate": 0.0005079252465375872, + "loss": 1.28913903, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 9.7109375, + "step": 13, + "time_per_iteration": 2.879688262939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02311662, + "balance_loss_mlp": 1.34730673, + "epoch": 0.0026933435936898806, + "flos": 487853259264.0, + "grad_norm": 0.10749127497061137, + "language_loss": 1.14448667, + "learning_rate": 0.0005226005109505393, + "loss": 1.16760325, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 9.625, + "step": 14, + "time_per_iteration": 2.568699836730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02285502, + "balance_loss_mlp": 1.3615818, + "epoch": 0.0028857252789534437, + "flos": 434599644672.0, + "grad_norm": 0.11405493545380829, + "language_loss": 1.20514369, + "learning_rate": 0.0005362628552605367, + "loss": 1.22799873, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 9.21875, + "step": 15, + "time_per_iteration": 2.6814210414886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02243131, + "balance_loss_mlp": 1.36117291, + "epoch": 0.0030781069642170067, + "flos": 596739944448.0, + "grad_norm": 0.10465613456634369, + "language_loss": 1.24307358, + "learning_rate": 0.0005490431248454357, + "loss": 1.26550484, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 8.84375, + "step": 16, + "time_per_iteration": 2.681443929672241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02323403, + "balance_loss_mlp": 1.52994621, + "epoch": 0.0032704886494805694, + "flos": 1538188102656.0, + "grad_norm": 0.2929644268686402, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.78028512, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 7.90625, + "step": 17, + "time_per_iteration": 6.376815319061279 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02154669, + "balance_loss_mlp": 1.37418151, + "epoch": 0.0034628703347441324, + "flos": 473969677824.0, + "grad_norm": 0.15081794947454089, + "language_loss": 1.11159086, + "learning_rate": 0.0005723671632907488, + "loss": 1.13313746, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 7.80078125, + "step": 18, + "time_per_iteration": 2.721731424331665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02067628, + "balance_loss_mlp": 1.35466075, + "epoch": 0.0036552520200076955, + "flos": 448537554432.0, + "grad_norm": 0.11430094844987627, + "language_loss": 1.15730095, + "learning_rate": 0.0005830738490244919, + "loss": 1.1779772, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 7.12890625, + "step": 19, + "time_per_iteration": 2.691012382507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01966178, + "balance_loss_mlp": 1.31958628, + "epoch": 0.003847633705271258, + "flos": 636174217728.0, + "grad_norm": 0.10166759343386816, + "language_loss": 1.17760253, + "learning_rate": 0.0005932312266435596, + "loss": 1.19726431, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 6.46484375, + "step": 20, + "time_per_iteration": 2.779218912124634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01836812, + "balance_loss_mlp": 1.26727819, + "epoch": 0.004040015390534821, + "flos": 589495491072.0, + "grad_norm": 0.12846528828878043, + "language_loss": 1.12106359, + "learning_rate": 0.0006028929207788754, + "loss": 1.13943172, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 5.70703125, + "step": 21, + "time_per_iteration": 2.716970443725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01720951, + "balance_loss_mlp": 1.21970022, + "epoch": 0.004232397075798384, + "flos": 756574940160.0, + "grad_norm": 0.09445288880840001, + "language_loss": 1.16516471, + "learning_rate": 0.0006121050677327902, + "loss": 1.18237424, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 5.0078125, + "step": 22, + "time_per_iteration": 2.92696475982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01630624, + "balance_loss_mlp": 1.19193399, + "epoch": 0.004424778761061947, + "flos": 526692119040.0, + "grad_norm": 0.11621712848760359, + "language_loss": 1.06380248, + "learning_rate": 0.0006209076479463684, + "loss": 1.08010876, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 4.3984375, + "step": 23, + "time_per_iteration": 2.666133165359497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01572853, + "balance_loss_mlp": 1.18394423, + "epoch": 0.00461716044632551, + "flos": 548168518656.0, + "grad_norm": 0.10970997088624258, + "language_loss": 1.16519284, + "learning_rate": 0.0006293355346737718, + "loss": 1.18092132, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 3.88476562, + "step": 24, + "time_per_iteration": 2.727487802505493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0152954, + "balance_loss_mlp": 1.18755198, + "epoch": 0.004809542131589073, + "flos": 567584091648.0, + "grad_norm": 0.09735665571869598, + "language_loss": 1.12784922, + "learning_rate": 0.0006374193284416834, + "loss": 1.14314473, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 3.42382812, + "step": 25, + "time_per_iteration": 2.7919249534606934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0148827, + "balance_loss_mlp": 1.19282198, + "epoch": 0.005001923816852636, + "flos": 470629191168.0, + "grad_norm": 0.09233879954989622, + "language_loss": 1.11062908, + "learning_rate": 0.0006451860277489461, + "loss": 1.12551177, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 2.953125, + "step": 26, + "time_per_iteration": 2.581066131591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462989, + "balance_loss_mlp": 1.20988345, + "epoch": 0.005194305502116198, + "flos": 415502701056.0, + "grad_norm": 0.12330238493557526, + "language_loss": 1.19441557, + "learning_rate": 0.0006526595731190848, + "loss": 1.20904553, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 2.52929688, + "step": 27, + "time_per_iteration": 2.49725604057312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01423898, + "balance_loss_mlp": 1.20874906, + "epoch": 0.005386687187379761, + "flos": 628771548672.0, + "grad_norm": 0.09841719698503415, + "language_loss": 1.12322927, + "learning_rate": 0.0006598612921618983, + "loss": 1.13746822, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 2.15625, + "step": 28, + "time_per_iteration": 2.822068929672241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01399446, + "balance_loss_mlp": 1.21443295, + "epoch": 0.005579068872643324, + "flos": 886483201536.0, + "grad_norm": 0.2589331093265968, + "language_loss": 1.06232262, + "learning_rate": 0.0006668102665011454, + "loss": 1.07631707, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 1.84765625, + "step": 29, + "time_per_iteration": 3.2402820587158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01444994, + "balance_loss_mlp": 1.28353739, + "epoch": 0.005771450557906887, + "flos": 547560622080.0, + "grad_norm": 0.1317361033328709, + "language_loss": 1.14859319, + "learning_rate": 0.0006735236364718957, + "loss": 1.16304302, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 1.61425781, + "step": 30, + "time_per_iteration": 2.6861231327056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01333301, + "balance_loss_mlp": 1.20445967, + "epoch": 0.00596383224317045, + "flos": 532026620928.0, + "grad_norm": 0.07039345614882069, + "language_loss": 1.13512135, + "learning_rate": 0.0006800168558381346, + "loss": 1.14845431, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 1.28808594, + "step": 31, + "time_per_iteration": 2.6444640159606934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01254242, + "balance_loss_mlp": 1.153772, + "epoch": 0.0061562139284340135, + "flos": 589082886144.0, + "grad_norm": 0.07602265872136475, + "language_loss": 1.1720531, + "learning_rate": 0.0006863039060567947, + "loss": 1.18459558, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 1.00439453, + "step": 32, + "time_per_iteration": 2.7225399017333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117915, + "balance_loss_mlp": 1.10071015, + "epoch": 0.006348595613697576, + "flos": 618231025152.0, + "grad_norm": 0.062098451262649575, + "language_loss": 1.09530759, + "learning_rate": 0.0006923974775611263, + "loss": 1.10709918, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 0.78417969, + "step": 33, + "time_per_iteration": 2.795565366744995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155392, + "balance_loss_mlp": 1.09416604, + "epoch": 0.006540977298961139, + "flos": 777910376448.0, + "grad_norm": 0.0750568617782567, + "language_loss": 1.06307364, + "learning_rate": 0.0006983091239737814, + "loss": 1.0746274, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 0.61132812, + "step": 34, + "time_per_iteration": 3.0703423023223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138238, + "balance_loss_mlp": 1.0903163, + "epoch": 0.006733358984224702, + "flos": 667143475200.0, + "grad_norm": 0.057198892540160154, + "language_loss": 1.05094206, + "learning_rate": 0.0007040493939600222, + "loss": 1.06232452, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 0.47949219, + "step": 35, + "time_per_iteration": 2.8476996421813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136821, + "balance_loss_mlp": 1.09926963, + "epoch": 0.006925740669488265, + "flos": 564372085248.0, + "grad_norm": 0.07105443011946577, + "language_loss": 1.05056715, + "learning_rate": 0.0007096279445021078, + "loss": 1.06193542, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 0.37548828, + "step": 36, + "time_per_iteration": 2.8306472301483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115252, + "balance_loss_mlp": 1.12274194, + "epoch": 0.007118122354751828, + "flos": 549887947776.0, + "grad_norm": 0.09366404592926651, + "language_loss": 1.11846077, + "learning_rate": 0.0007150536386503726, + "loss": 1.12998605, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 0.29736328, + "step": 37, + "time_per_iteration": 2.875190258026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150569, + "balance_loss_mlp": 1.12677491, + "epoch": 0.007310504040015391, + "flos": 702490973184.0, + "grad_norm": 0.0928332145488954, + "language_loss": 1.04548562, + "learning_rate": 0.0007203346302358509, + "loss": 1.05699134, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 0.23791504, + "step": 38, + "time_per_iteration": 3.0075292587280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128748, + "balance_loss_mlp": 1.10757613, + "epoch": 0.007502885725278953, + "flos": 599316890112.0, + "grad_norm": 0.056043607360260886, + "language_loss": 1.09224963, + "learning_rate": 0.000725478437577282, + "loss": 1.10353708, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 0.21179199, + "step": 39, + "time_per_iteration": 2.78564715385437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111769, + "balance_loss_mlp": 1.09953475, + "epoch": 0.007695267410542516, + "flos": 560285309952.0, + "grad_norm": 0.2122838817863008, + "language_loss": 1.04638147, + "learning_rate": 0.0007304920078549186, + "loss": 1.0575583, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 0.18151855, + "step": 40, + "time_per_iteration": 2.745100975036621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133734, + "balance_loss_mlp": 1.11621058, + "epoch": 0.007887649095806078, + "flos": 508170765312.0, + "grad_norm": 0.14528393981530327, + "language_loss": 1.06509054, + "learning_rate": 0.0007353817735343603, + "loss": 1.07642794, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 0.17529297, + "step": 41, + "time_per_iteration": 2.7425575256347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119894, + "balance_loss_mlp": 1.10357416, + "epoch": 0.008080030781069641, + "flos": 503893840896.0, + "grad_norm": 0.06769616325508275, + "language_loss": 1.0188365, + "learning_rate": 0.0007401537019902344, + "loss": 1.03003538, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 0.16308594, + "step": 42, + "time_per_iteration": 2.6797902584075928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118919, + "balance_loss_mlp": 1.10271883, + "epoch": 0.008272412466333205, + "flos": 518031811584.0, + "grad_norm": 0.14916902722339276, + "language_loss": 1.05194306, + "learning_rate": 0.0007448133392900729, + "loss": 1.06313229, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 0.1619873, + "step": 43, + "time_per_iteration": 2.779276132583618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116153, + "balance_loss_mlp": 1.09945166, + "epoch": 0.008464794151596768, + "flos": 607974626304.0, + "grad_norm": 0.052417895665492535, + "language_loss": 1.00651026, + "learning_rate": 0.0007493658489441491, + "loss": 1.0176717, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 0.16711426, + "step": 44, + "time_per_iteration": 2.965435028076172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108906, + "balance_loss_mlp": 1.09195447, + "epoch": 0.00865717583686033, + "flos": 537929372160.0, + "grad_norm": 0.04248825884697869, + "language_loss": 1.04600978, + "learning_rate": 0.0007538160463002316, + "loss": 1.05709875, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 0.16967773, + "step": 45, + "time_per_iteration": 2.7024173736572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105316, + "balance_loss_mlp": 1.08735132, + "epoch": 0.008849557522123894, + "flos": 508007780352.0, + "grad_norm": 0.08538228051147774, + "language_loss": 1.08093452, + "learning_rate": 0.0007581684291577274, + "loss": 1.09198785, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 0.17980957, + "step": 46, + "time_per_iteration": 2.6020169258117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105368, + "balance_loss_mlp": 1.08635402, + "epoch": 0.009041939207387457, + "flos": 625339657728.0, + "grad_norm": 0.04723509056908367, + "language_loss": 1.10695386, + "learning_rate": 0.0007624272050891776, + "loss": 1.11800754, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 0.19006348, + "step": 47, + "time_per_iteration": 2.8620407581329346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103189, + "balance_loss_mlp": 1.08244705, + "epoch": 0.00923432089265102, + "flos": 549421014528.0, + "grad_norm": 0.07235265954126073, + "language_loss": 1.00601125, + "learning_rate": 0.0007665963158851307, + "loss": 1.01704311, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 0.20751953, + "step": 48, + "time_per_iteration": 2.8312995433807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114938, + "balance_loss_mlp": 1.09308696, + "epoch": 0.009426702577914583, + "flos": 562496638464.0, + "grad_norm": 0.10505304652404167, + "language_loss": 1.09914839, + "learning_rate": 0.0007706794594783609, + "loss": 1.1102978, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 0.21850586, + "step": 49, + "time_per_iteration": 2.779561758041382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110011, + "balance_loss_mlp": 1.0874207, + "epoch": 0.009619084263178146, + "flos": 616773325824.0, + "grad_norm": 0.04709564792407722, + "language_loss": 1.08694363, + "learning_rate": 0.0007746801096530423, + "loss": 1.09804368, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 0.22583008, + "step": 50, + "time_per_iteration": 2.785332441329956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111628, + "balance_loss_mlp": 1.09285581, + "epoch": 0.009811465948441709, + "flos": 541437986304.0, + "grad_norm": 0.09574874491356838, + "language_loss": 1.13402438, + "learning_rate": 0.0007786015338021173, + "loss": 1.14518726, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 0.23425293, + "step": 51, + "time_per_iteration": 2.676326274871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118129, + "balance_loss_mlp": 1.09500206, + "epoch": 0.010003847633705272, + "flos": 535881028608.0, + "grad_norm": 0.12325193255180054, + "language_loss": 1.06019998, + "learning_rate": 0.0007824468089603051, + "loss": 1.07138121, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 0.23144531, + "step": 52, + "time_per_iteration": 2.688828945159912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113509, + "balance_loss_mlp": 1.11038983, + "epoch": 0.010196229318968833, + "flos": 909254315520.0, + "grad_norm": 0.07208467676878935, + "language_loss": 1.05329835, + "learning_rate": 0.0007862188363098669, + "loss": 1.06464922, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 0.24707031, + "step": 53, + "time_per_iteration": 3.3342933654785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126914, + "balance_loss_mlp": 1.10158229, + "epoch": 0.010388611004232396, + "flos": 585868308480.0, + "grad_norm": 0.09794855088059086, + "language_loss": 1.06043434, + "learning_rate": 0.0007899203543304438, + "loss": 1.07170355, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 0.25354004, + "step": 54, + "time_per_iteration": 2.933236837387085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145083, + "balance_loss_mlp": 1.12053776, + "epoch": 0.01058099268949596, + "flos": 502480558080.0, + "grad_norm": 0.1404118977896248, + "language_loss": 1.20000231, + "learning_rate": 0.0007935539507422731, + "loss": 1.2114532, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 0.24536133, + "step": 55, + "time_per_iteration": 2.8257975578308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153246, + "balance_loss_mlp": 1.12969017, + "epoch": 0.010773374374759523, + "flos": 544447360512.0, + "grad_norm": 0.05382700946372506, + "language_loss": 1.10560298, + "learning_rate": 0.0007971220733732573, + "loss": 1.11713552, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 0.2355957, + "step": 56, + "time_per_iteration": 2.749382495880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154989, + "balance_loss_mlp": 1.13151693, + "epoch": 0.010965756060023086, + "flos": 526155803136.0, + "grad_norm": 0.17392462927294325, + "language_loss": 1.05995011, + "learning_rate": 0.0008006270400641869, + "loss": 1.07150006, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 0.23474121, + "step": 57, + "time_per_iteration": 2.743929147720337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125904, + "balance_loss_mlp": 1.10234821, + "epoch": 0.011158137745286649, + "flos": 576941128704.0, + "grad_norm": 0.10169017538987117, + "language_loss": 1.06833839, + "learning_rate": 0.0008040710477125043, + "loss": 1.07959747, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 0.23547363, + "step": 58, + "time_per_iteration": 2.7300469875335693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111797, + "balance_loss_mlp": 1.08861065, + "epoch": 0.011350519430550212, + "flos": 529281547776.0, + "grad_norm": 0.059941584643697095, + "language_loss": 1.07409072, + "learning_rate": 0.0008074561805429771, + "loss": 1.08520865, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 0.23181152, + "step": 59, + "time_per_iteration": 2.6550745964050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123772, + "balance_loss_mlp": 1.09970331, + "epoch": 0.011542901115813775, + "flos": 555879905280.0, + "grad_norm": 0.06438674129900752, + "language_loss": 1.04891515, + "learning_rate": 0.0008107844176832545, + "loss": 1.06015277, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 0.24072266, + "step": 60, + "time_per_iteration": 2.7009053230285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139619, + "balance_loss_mlp": 1.11569333, + "epoch": 0.011735282801077338, + "flos": 572095954944.0, + "grad_norm": 0.09833112160800331, + "language_loss": 1.0671711, + "learning_rate": 0.0008140576401132568, + "loss": 1.07856739, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 0.23913574, + "step": 61, + "time_per_iteration": 2.678501844406128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114169, + "balance_loss_mlp": 1.11887348, + "epoch": 0.0119276644863409, + "flos": 615589839360.0, + "grad_norm": 0.11014501355567002, + "language_loss": 1.07748628, + "learning_rate": 0.0008172776370494935, + "loss": 1.08890319, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 0.22814941, + "step": 62, + "time_per_iteration": 2.7718141078948975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116479, + "balance_loss_mlp": 1.09356666, + "epoch": 0.012120046171604464, + "flos": 501084527616.0, + "grad_norm": 0.06441650429015075, + "language_loss": 1.15269816, + "learning_rate": 0.0008204461118185703, + "loss": 1.16386294, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 0.22912598, + "step": 63, + "time_per_iteration": 2.5839178562164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117138, + "balance_loss_mlp": 1.09543014, + "epoch": 0.012312427856868027, + "flos": 473347100160.0, + "grad_norm": 0.06608006175674933, + "language_loss": 1.04523873, + "learning_rate": 0.0008235646872681536, + "loss": 1.05641007, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 0.21728516, + "step": 64, + "time_per_iteration": 2.5611703395843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127906, + "balance_loss_mlp": 1.10659182, + "epoch": 0.012504809542131588, + "flos": 538369141248.0, + "grad_norm": 0.07834673611922068, + "language_loss": 1.04319417, + "learning_rate": 0.0008266349107584288, + "loss": 1.05447328, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 0.21313477, + "step": 65, + "time_per_iteration": 2.727666139602661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141841, + "balance_loss_mlp": 1.1207881, + "epoch": 0.012697191227395151, + "flos": 608730826752.0, + "grad_norm": 0.06003338375813584, + "language_loss": 1.07126927, + "learning_rate": 0.0008296582587724851, + "loss": 1.08268762, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 0.21057129, + "step": 66, + "time_per_iteration": 2.716701030731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127113, + "balance_loss_mlp": 1.10609627, + "epoch": 0.012889572912658714, + "flos": 768079065600.0, + "grad_norm": 0.04807876202194694, + "language_loss": 1.04662776, + "learning_rate": 0.0008326361411800136, + "loss": 1.05789876, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 0.21008301, + "step": 67, + "time_per_iteration": 2.9571592807769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114337, + "balance_loss_mlp": 1.09446514, + "epoch": 0.013081954597922277, + "flos": 533887013376.0, + "grad_norm": 0.05551510449528945, + "language_loss": 1.05008268, + "learning_rate": 0.0008355699051851403, + "loss": 1.06122601, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 0.1986084, + "step": 68, + "time_per_iteration": 2.725504159927368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143621, + "balance_loss_mlp": 1.1242373, + "epoch": 0.01327433628318584, + "flos": 573096632832.0, + "grad_norm": 0.0697970629442659, + "language_loss": 1.12296045, + "learning_rate": 0.0008384608389860635, + "loss": 1.13439655, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 0.19372559, + "step": 69, + "time_per_iteration": 2.685215711593628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141098, + "balance_loss_mlp": 1.122311, + "epoch": 0.013466717968449404, + "flos": 497274536448.0, + "grad_norm": 0.08511613263061502, + "language_loss": 1.02745342, + "learning_rate": 0.000841310175171381, + "loss": 1.03886437, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 0.18774414, + "step": 70, + "time_per_iteration": 2.649937868118286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142561, + "balance_loss_mlp": 1.12464356, + "epoch": 0.013659099653712967, + "flos": 565511155200.0, + "grad_norm": 0.055787325190813475, + "language_loss": 1.0065217, + "learning_rate": 0.000844119093875517, + "loss": 1.0179472, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 0.17944336, + "step": 71, + "time_per_iteration": 2.753220319747925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152267, + "balance_loss_mlp": 1.13508892, + "epoch": 0.01385148133897653, + "flos": 573820526592.0, + "grad_norm": 0.08668312915327946, + "language_loss": 1.05463254, + "learning_rate": 0.0008468887257134666, + "loss": 1.0661552, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 0.17199707, + "step": 72, + "time_per_iteration": 2.7056305408477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117134, + "balance_loss_mlp": 1.15478206, + "epoch": 0.014043863024240093, + "flos": 576822560256.0, + "grad_norm": 0.07356095482564125, + "language_loss": 1.08388793, + "learning_rate": 0.0008496201545131264, + "loss": 1.09560132, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 0.16564941, + "step": 73, + "time_per_iteration": 2.7202537059783936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152437, + "balance_loss_mlp": 1.13545001, + "epoch": 0.014236244709503656, + "flos": 938681809920.0, + "grad_norm": 0.06787935984484554, + "language_loss": 1.06090975, + "learning_rate": 0.0008523144198617317, + "loss": 1.07243395, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 0.16992188, + "step": 74, + "time_per_iteration": 3.2090003490448 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139286, + "balance_loss_mlp": 1.1223346, + "epoch": 0.014428626394767219, + "flos": 528483502080.0, + "grad_norm": 0.04825332815792917, + "language_loss": 1.053195, + "learning_rate": 0.0008549725194813783, + "loss": 1.06458783, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 0.16967773, + "step": 75, + "time_per_iteration": 2.654343605041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112208, + "balance_loss_mlp": 1.10599899, + "epoch": 0.014621008080030782, + "flos": 803752533504.0, + "grad_norm": 0.03887402020767282, + "language_loss": 1.04797029, + "learning_rate": 0.0008575954114472099, + "loss": 1.05919111, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 0.1607666, + "step": 76, + "time_per_iteration": 3.119884967803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134479, + "balance_loss_mlp": 1.1187191, + "epoch": 0.014813389765294343, + "flos": 696941356032.0, + "grad_norm": 0.056937643991546806, + "language_loss": 1.02038705, + "learning_rate": 0.0008601840162606118, + "loss": 1.03173184, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 0.1574707, + "step": 77, + "time_per_iteration": 3.025688886642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146725, + "balance_loss_mlp": 1.13034582, + "epoch": 0.015005771450557906, + "flos": 596994333696.0, + "grad_norm": 0.04989291514363055, + "language_loss": 1.08127129, + "learning_rate": 0.000862739218788641, + "loss": 1.09273863, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 0.16381836, + "step": 78, + "time_per_iteration": 2.7922520637512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149681, + "balance_loss_mlp": 1.13339734, + "epoch": 0.01519815313582147, + "flos": 549416245248.0, + "grad_norm": 0.06709094188277621, + "language_loss": 1.06189477, + "learning_rate": 0.0008652618700799138, + "loss": 1.07339156, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 0.1628418, + "step": 79, + "time_per_iteration": 2.6902618408203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153255, + "balance_loss_mlp": 1.1367681, + "epoch": 0.015390534821085032, + "flos": 430532692992.0, + "grad_norm": 0.062162504049989416, + "language_loss": 1.05161238, + "learning_rate": 0.0008677527890662774, + "loss": 1.06314492, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 0.16491699, + "step": 80, + "time_per_iteration": 2.475771188735962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147193, + "balance_loss_mlp": 1.13076603, + "epoch": 0.015582916506348595, + "flos": 524119942656.0, + "grad_norm": 0.04934081686369646, + "language_loss": 1.06529951, + "learning_rate": 0.0008702127641587799, + "loss": 1.0767715, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 0.16430664, + "step": 81, + "time_per_iteration": 2.634038209915161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142434, + "balance_loss_mlp": 1.12558985, + "epoch": 0.015775298191612157, + "flos": 575443782144.0, + "grad_norm": 0.08879987127008451, + "language_loss": 1.0221808, + "learning_rate": 0.0008726425547457192, + "loss": 1.0336051, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 0.16845703, + "step": 82, + "time_per_iteration": 2.74308705329895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147568, + "balance_loss_mlp": 1.13108134, + "epoch": 0.01596767987687572, + "flos": 610319577600.0, + "grad_norm": 0.06313420095488197, + "language_loss": 1.01906681, + "learning_rate": 0.0008750428925998964, + "loss": 1.03054249, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 0.16491699, + "step": 83, + "time_per_iteration": 2.777132511138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146759, + "balance_loss_mlp": 1.13009322, + "epoch": 0.016160061562139283, + "flos": 567136982016.0, + "grad_norm": 0.11663644047392754, + "language_loss": 1.07169831, + "learning_rate": 0.0008774144832015932, + "loss": 1.08316588, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 0.16674805, + "step": 84, + "time_per_iteration": 2.733287811279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01524523, + "balance_loss_mlp": 1.51412809, + "epoch": 0.016352443247402846, + "flos": 1411343543808.0, + "grad_norm": 0.22860236459315994, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.76298833, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 0.10400391, + "step": 85, + "time_per_iteration": 4.57580041885376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166169, + "balance_loss_mlp": 1.1501826, + "epoch": 0.01654482493266641, + "flos": 730497844224.0, + "grad_norm": 0.05249425037579876, + "language_loss": 1.01959693, + "learning_rate": 0.0008820741205014318, + "loss": 1.03125858, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 0.15979004, + "step": 86, + "time_per_iteration": 2.8773865699768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223619, + "balance_loss_mlp": 1.20703709, + "epoch": 0.016737206617929972, + "flos": 536293633536.0, + "grad_norm": 0.10761462625124436, + "language_loss": 1.03955913, + "learning_rate": 0.0008843634575408404, + "loss": 1.05179524, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 0.16577148, + "step": 87, + "time_per_iteration": 2.6694159507751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228231, + "balance_loss_mlp": 1.21267366, + "epoch": 0.016929588303193535, + "flos": 536990363136.0, + "grad_norm": 0.10737104518045529, + "language_loss": 1.05078888, + "learning_rate": 0.0008866266301555082, + "loss": 1.06307125, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 0.15551758, + "step": 88, + "time_per_iteration": 2.7686069011688232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212138, + "balance_loss_mlp": 1.19609249, + "epoch": 0.017121969988457098, + "flos": 526756359168.0, + "grad_norm": 0.1616084590878673, + "language_loss": 1.0609467, + "learning_rate": 0.0008888642296509615, + "loss": 1.07306814, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 0.16040039, + "step": 89, + "time_per_iteration": 2.625988721847534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199649, + "balance_loss_mlp": 1.18316197, + "epoch": 0.01731435167372066, + "flos": 625596618240.0, + "grad_norm": 0.07585409016808545, + "language_loss": 1.1065979, + "learning_rate": 0.0008910768275115906, + "loss": 1.11859453, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 0.16491699, + "step": 90, + "time_per_iteration": 2.793017864227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173524, + "balance_loss_mlp": 1.15697813, + "epoch": 0.017506733358984224, + "flos": 496402338816.0, + "grad_norm": 0.07277460951060387, + "language_loss": 1.06493175, + "learning_rate": 0.0008932649762767675, + "loss": 1.07666695, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 0.16552734, + "step": 91, + "time_per_iteration": 2.5919723510742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169355, + "balance_loss_mlp": 1.15323818, + "epoch": 0.017699115044247787, + "flos": 745933100544.0, + "grad_norm": 0.10172519854243242, + "language_loss": 1.09112859, + "learning_rate": 0.0008954292103690864, + "loss": 1.10282218, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 0.16113281, + "step": 92, + "time_per_iteration": 2.9366836547851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174542, + "balance_loss_mlp": 1.15828145, + "epoch": 0.01789149672951135, + "flos": 515509194240.0, + "grad_norm": 0.07803491111319032, + "language_loss": 1.10981905, + "learning_rate": 0.0008975700468778296, + "loss": 1.12156439, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 0.16259766, + "step": 93, + "time_per_iteration": 2.592458963394165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156862, + "balance_loss_mlp": 1.14067388, + "epoch": 0.018083878414774913, + "flos": 586125268992.0, + "grad_norm": 0.09102852745954727, + "language_loss": 1.04703569, + "learning_rate": 0.0008996879863005366, + "loss": 1.05860424, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 0.16186523, + "step": 94, + "time_per_iteration": 2.71566104888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148536, + "balance_loss_mlp": 1.13235974, + "epoch": 0.018276260100038477, + "flos": 497356028928.0, + "grad_norm": 0.03859462796979438, + "language_loss": 1.04768109, + "learning_rate": 0.0009017835132453337, + "loss": 1.05916631, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 0.16174316, + "step": 95, + "time_per_iteration": 2.664511203765869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137887, + "balance_loss_mlp": 1.121889, + "epoch": 0.01846864178530204, + "flos": 640058360832.0, + "grad_norm": 0.060963703759419355, + "language_loss": 1.04675508, + "learning_rate": 0.0009038570970964896, + "loss": 1.05813384, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 0.15991211, + "step": 96, + "time_per_iteration": 2.7669789791107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112506, + "balance_loss_mlp": 1.10899043, + "epoch": 0.018661023470565603, + "flos": 511662127104.0, + "grad_norm": 0.0943042692373462, + "language_loss": 1.02071011, + "learning_rate": 0.0009059091926454854, + "loss": 1.03196073, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 0.16064453, + "step": 97, + "time_per_iteration": 2.6028668880462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126665, + "balance_loss_mlp": 1.11052442, + "epoch": 0.018853405155829166, + "flos": 931106244096.0, + "grad_norm": 0.06745462513624549, + "language_loss": 1.0144124, + "learning_rate": 0.0009079402406897198, + "loss": 1.02567911, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 0.16137695, + "step": 98, + "time_per_iteration": 3.2679431438446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127975, + "balance_loss_mlp": 1.11166739, + "epoch": 0.01904578684109273, + "flos": 576484107264.0, + "grad_norm": 0.10523687850003575, + "language_loss": 1.03251696, + "learning_rate": 0.0009099506686008212, + "loss": 1.04379678, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 0.16308594, + "step": 99, + "time_per_iteration": 2.8251914978027344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116643, + "balance_loss_mlp": 1.10100293, + "epoch": 0.019238168526356292, + "flos": 558442169856.0, + "grad_norm": 0.08495157768411668, + "language_loss": 1.0609076, + "learning_rate": 0.0009119408908644013, + "loss": 1.07207406, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 0.15625, + "step": 100, + "time_per_iteration": 2.6573309898376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113727, + "balance_loss_mlp": 1.12211871, + "epoch": 0.019430550211619855, + "flos": 723851375616.0, + "grad_norm": 0.09022378013673595, + "language_loss": 1.11755276, + "learning_rate": 0.0009139113095929519, + "loss": 1.12892556, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 0.15124512, + "step": 101, + "time_per_iteration": 2.844698429107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159661, + "balance_loss_mlp": 1.14373517, + "epoch": 0.019622931896883418, + "flos": 499478524416.0, + "grad_norm": 0.0892612752622512, + "language_loss": 1.05698013, + "learning_rate": 0.0009158623150134762, + "loss": 1.06857681, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 0.15917969, + "step": 102, + "time_per_iteration": 2.589857339859009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137313, + "balance_loss_mlp": 1.12158906, + "epoch": 0.01981531358214698, + "flos": 509188695552.0, + "grad_norm": 0.06508497546963277, + "language_loss": 1.05496848, + "learning_rate": 0.000917794285931332, + "loss": 1.06634164, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 0.15710449, + "step": 103, + "time_per_iteration": 2.6433918476104736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117696, + "balance_loss_mlp": 1.1019367, + "epoch": 0.020007695267410544, + "flos": 521347705344.0, + "grad_norm": 0.07675487095909958, + "language_loss": 0.97610366, + "learning_rate": 0.0009197075901716639, + "loss": 0.98728061, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 0.1574707, + "step": 104, + "time_per_iteration": 2.709157943725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137693, + "balance_loss_mlp": 1.12159956, + "epoch": 0.020200076952674107, + "flos": 533298940416.0, + "grad_norm": 0.05257934075389246, + "language_loss": 1.0758431, + "learning_rate": 0.0009216025849997171, + "loss": 1.08722019, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 0.16088867, + "step": 105, + "time_per_iteration": 2.7638583183288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111903, + "balance_loss_mlp": 1.09596467, + "epoch": 0.020392458637937667, + "flos": 684760324608.0, + "grad_norm": 0.07457888312135433, + "language_loss": 1.02261579, + "learning_rate": 0.0009234796175212258, + "loss": 1.03373492, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 0.15930176, + "step": 106, + "time_per_iteration": 2.9391980171203613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117989, + "balance_loss_mlp": 1.10228872, + "epoch": 0.02058484032320123, + "flos": 702115444224.0, + "grad_norm": 0.06024423434996524, + "language_loss": 1.05948544, + "learning_rate": 0.000925339025064007, + "loss": 1.07066536, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 0.15686035, + "step": 107, + "time_per_iteration": 2.975294828414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118819, + "balance_loss_mlp": 1.10334611, + "epoch": 0.020777222008464793, + "flos": 639082275840.0, + "grad_norm": 0.07105297051955457, + "language_loss": 0.99294066, + "learning_rate": 0.0009271811355418027, + "loss": 1.00412893, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 0.15454102, + "step": 108, + "time_per_iteration": 2.8750014305114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125269, + "balance_loss_mlp": 1.10940242, + "epoch": 0.020969603693728356, + "flos": 682091974656.0, + "grad_norm": 0.09212378946406244, + "language_loss": 1.05636311, + "learning_rate": 0.0009290062678013548, + "loss": 1.06761575, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 0.15856934, + "step": 109, + "time_per_iteration": 2.8552017211914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119898, + "balance_loss_mlp": 1.10393572, + "epoch": 0.02116198537899192, + "flos": 533395487232.0, + "grad_norm": 0.059465971869905314, + "language_loss": 1.04477715, + "learning_rate": 0.0009308147319536321, + "loss": 1.05597615, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 0.1595459, + "step": 110, + "time_per_iteration": 2.6493232250213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129754, + "balance_loss_mlp": 1.11385095, + "epoch": 0.021354367064255482, + "flos": 717479119872.0, + "grad_norm": 0.08324280754141193, + "language_loss": 1.10257316, + "learning_rate": 0.0009326068296900676, + "loss": 1.11387074, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 0.15893555, + "step": 111, + "time_per_iteration": 2.8384125232696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112769, + "balance_loss_mlp": 1.11171615, + "epoch": 0.021546748749519045, + "flos": 519556322304.0, + "grad_norm": 0.06941460102767082, + "language_loss": 1.01355243, + "learning_rate": 0.0009343828545846161, + "loss": 1.02482939, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 0.15966797, + "step": 112, + "time_per_iteration": 2.7743477821350098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114893, + "balance_loss_mlp": 1.13326573, + "epoch": 0.021739130434782608, + "flos": 505161391104.0, + "grad_norm": 0.047977415311889204, + "language_loss": 1.05199587, + "learning_rate": 0.0009361430923823841, + "loss": 1.06348515, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 0.15649414, + "step": 113, + "time_per_iteration": 2.6022982597351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118143, + "balance_loss_mlp": 1.10308659, + "epoch": 0.02193151212004617, + "flos": 463486053888.0, + "grad_norm": 0.080001842017843, + "language_loss": 1.09258401, + "learning_rate": 0.0009378878212755459, + "loss": 1.10376549, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 0.15039062, + "step": 114, + "time_per_iteration": 2.491594076156616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115419, + "balance_loss_mlp": 1.09967113, + "epoch": 0.022123893805309734, + "flos": 552272546304.0, + "grad_norm": 0.05036418666557463, + "language_loss": 0.9906168, + "learning_rate": 0.0009396173121672103, + "loss": 1.00177097, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 0.15734863, + "step": 115, + "time_per_iteration": 2.668848991394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112479, + "balance_loss_mlp": 1.10945916, + "epoch": 0.022316275490573297, + "flos": 636211293696.0, + "grad_norm": 0.05918191636932359, + "language_loss": 1.04414749, + "learning_rate": 0.0009413318289238633, + "loss": 1.05539548, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 0.15307617, + "step": 116, + "time_per_iteration": 2.7496132850646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106823, + "balance_loss_mlp": 1.09139705, + "epoch": 0.02250865717583686, + "flos": 798890107392.0, + "grad_norm": 0.1124204963758038, + "language_loss": 0.96924931, + "learning_rate": 0.0009430316286169771, + "loss": 0.98031747, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 0.15405273, + "step": 117, + "time_per_iteration": 3.026118278503418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135302, + "balance_loss_mlp": 1.11998308, + "epoch": 0.022701038861100423, + "flos": 456093296640.0, + "grad_norm": 0.03693994945601898, + "language_loss": 1.02417183, + "learning_rate": 0.0009447169617543361, + "loss": 1.03552485, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 0.15307617, + "step": 118, + "time_per_iteration": 2.575666666030884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156925, + "balance_loss_mlp": 1.14185703, + "epoch": 0.022893420546363986, + "flos": 583086159360.0, + "grad_norm": 0.10959367855453626, + "language_loss": 1.09001684, + "learning_rate": 0.0009463880725016029, + "loss": 1.1015861, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 0.15039062, + "step": 119, + "time_per_iteration": 2.6811347007751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115422, + "balance_loss_mlp": 1.10052109, + "epoch": 0.02308580223162755, + "flos": 561303240192.0, + "grad_norm": 0.05068852434870314, + "language_loss": 1.03909945, + "learning_rate": 0.0009480451988946134, + "loss": 1.05025363, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 0.14880371, + "step": 120, + "time_per_iteration": 2.801814079284668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106982, + "balance_loss_mlp": 1.09179425, + "epoch": 0.023278183916891113, + "flos": 771300983808.0, + "grad_norm": 0.05688398470992871, + "language_loss": 1.05377555, + "learning_rate": 0.0009496885730428627, + "loss": 1.06484532, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 0.1517334, + "step": 121, + "time_per_iteration": 3.04720139503479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121469, + "balance_loss_mlp": 1.10574555, + "epoch": 0.023470565602154676, + "flos": 553374540288.0, + "grad_norm": 0.08369646841136469, + "language_loss": 1.03908122, + "learning_rate": 0.0009513184213246156, + "loss": 1.05029583, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 0.15710449, + "step": 122, + "time_per_iteration": 2.61790132522583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129626, + "balance_loss_mlp": 1.11406958, + "epoch": 0.02366294728741824, + "flos": 560028349440.0, + "grad_norm": 0.05522871343558165, + "language_loss": 1.07008672, + "learning_rate": 0.0009529349645740552, + "loss": 1.08138299, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 0.15539551, + "step": 123, + "time_per_iteration": 2.69759464263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129797, + "balance_loss_mlp": 1.11481285, + "epoch": 0.0238553289726818, + "flos": 468553683456.0, + "grad_norm": 0.053769267634074955, + "language_loss": 1.05687594, + "learning_rate": 0.0009545384182608524, + "loss": 1.06817389, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 0.1496582, + "step": 124, + "time_per_iteration": 2.550584316253662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126, + "balance_loss_mlp": 1.11114669, + "epoch": 0.024047710657945365, + "flos": 560030920704.0, + "grad_norm": 0.08700167249890467, + "language_loss": 1.02945745, + "learning_rate": 0.0009561289926625252, + "loss": 1.04071736, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 0.14831543, + "step": 125, + "time_per_iteration": 2.6619794368743896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123522, + "balance_loss_mlp": 1.10831082, + "epoch": 0.024240092343208928, + "flos": 504775950336.0, + "grad_norm": 0.07114777459455598, + "language_loss": 1.07932711, + "learning_rate": 0.0009577068930299292, + "loss": 1.09056234, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 0.15209961, + "step": 126, + "time_per_iteration": 2.553642749786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125226, + "balance_loss_mlp": 1.11038458, + "epoch": 0.02443247402847249, + "flos": 435763307520.0, + "grad_norm": 0.08279894264625885, + "language_loss": 1.03556633, + "learning_rate": 0.0009592723197462087, + "loss": 1.04681861, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 0.14819336, + "step": 127, + "time_per_iteration": 2.7255966663360596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124545, + "balance_loss_mlp": 1.10936916, + "epoch": 0.024624855713736054, + "flos": 683769558528.0, + "grad_norm": 0.07600858050716931, + "language_loss": 0.99905002, + "learning_rate": 0.0009608254684795125, + "loss": 1.01029539, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 0.15148926, + "step": 128, + "time_per_iteration": 2.9839587211608887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113247, + "balance_loss_mlp": 1.11718702, + "epoch": 0.024817237398999614, + "flos": 524999480832.0, + "grad_norm": 0.08573045125619827, + "language_loss": 1.02976727, + "learning_rate": 0.0009623665303297678, + "loss": 1.04109192, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 0.15258789, + "step": 129, + "time_per_iteration": 2.7344865798950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119398, + "balance_loss_mlp": 1.10497391, + "epoch": 0.025009619084263177, + "flos": 655656602112.0, + "grad_norm": 0.07510500588649292, + "language_loss": 1.07057762, + "learning_rate": 0.0009638956919697878, + "loss": 1.08177161, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 0.14416504, + "step": 130, + "time_per_iteration": 2.864952802658081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104135, + "balance_loss_mlp": 1.08930528, + "epoch": 0.02520200076952674, + "flos": 454423053312.0, + "grad_norm": 0.0567118244953117, + "language_loss": 0.99135083, + "learning_rate": 0.0009654131357809714, + "loss": 1.00239229, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 0.14819336, + "step": 131, + "time_per_iteration": 2.6095099449157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123713, + "balance_loss_mlp": 1.1081202, + "epoch": 0.025394382454790303, + "flos": 839794563072.0, + "grad_norm": 0.05892082702998288, + "language_loss": 1.08188879, + "learning_rate": 0.0009669190399838441, + "loss": 1.09312594, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 0.15576172, + "step": 132, + "time_per_iteration": 3.096733331680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100803, + "balance_loss_mlp": 1.08531809, + "epoch": 0.025586764140053866, + "flos": 581025332736.0, + "grad_norm": 0.09564892115109941, + "language_loss": 1.01233923, + "learning_rate": 0.0009684135787636724, + "loss": 1.02334726, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 0.15478516, + "step": 133, + "time_per_iteration": 2.8120856285095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111325, + "balance_loss_mlp": 1.09529161, + "epoch": 0.02577914582531743, + "flos": 790249623552.0, + "grad_norm": 0.04870542745948935, + "language_loss": 1.05797207, + "learning_rate": 0.0009698969223913726, + "loss": 1.06908536, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 0.16027832, + "step": 134, + "time_per_iteration": 3.0269176959991455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123378, + "balance_loss_mlp": 1.10735679, + "epoch": 0.025971527510580992, + "flos": 594958473216.0, + "grad_norm": 0.04083122637660085, + "language_loss": 1.08225274, + "learning_rate": 0.0009713692373399265, + "loss": 1.09348655, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 0.16015625, + "step": 135, + "time_per_iteration": 2.690932273864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01798361, + "balance_loss_mlp": 1.75773478, + "epoch": 0.026163909195844555, + "flos": 1577629716480.0, + "grad_norm": 0.2058674005568875, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.8125459, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 0.40625, + "step": 136, + "time_per_iteration": 5.460411548614502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01507549, + "balance_loss_mlp": 1.47512448, + "epoch": 0.026356290881108118, + "flos": 1502074865664.0, + "grad_norm": 0.12866590611947104, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.79318589, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 0.32421875, + "step": 137, + "time_per_iteration": 4.989046335220337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146765, + "balance_loss_mlp": 1.13081443, + "epoch": 0.02654867256637168, + "flos": 597140066304.0, + "grad_norm": 0.04917093034878699, + "language_loss": 1.00934815, + "learning_rate": 0.0009757216201974225, + "loss": 1.02081585, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 0.1595459, + "step": 138, + "time_per_iteration": 2.9566736221313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162305, + "balance_loss_mlp": 1.1448524, + "epoch": 0.026741054251635244, + "flos": 545035433472.0, + "grad_norm": 0.06281235859244827, + "language_loss": 1.0596863, + "learning_rate": 0.0009771514130396581, + "loss": 1.07130933, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 0.17468262, + "step": 139, + "time_per_iteration": 2.683931350708008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150087, + "balance_loss_mlp": 1.1330874, + "epoch": 0.026933435936898807, + "flos": 506841546240.0, + "grad_norm": 0.09254080332591261, + "language_loss": 1.06202602, + "learning_rate": 0.00097857095638274, + "loss": 1.07352686, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 0.17016602, + "step": 140, + "time_per_iteration": 2.558708906173706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149417, + "balance_loss_mlp": 1.13241768, + "epoch": 0.02712581762216237, + "flos": 740860328448.0, + "grad_norm": 0.03864103733020509, + "language_loss": 0.97399604, + "learning_rate": 0.0009799803961288726, + "loss": 0.9854902, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 0.17016602, + "step": 141, + "time_per_iteration": 2.992034673690796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112347, + "balance_loss_mlp": 1.10685217, + "epoch": 0.027318199307425933, + "flos": 848373378048.0, + "grad_norm": 0.06378420241673269, + "language_loss": 1.03629804, + "learning_rate": 0.000981379875086876, + "loss": 1.0475328, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 0.16625977, + "step": 142, + "time_per_iteration": 3.063534736633301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121821, + "balance_loss_mlp": 1.10560894, + "epoch": 0.027510580992689496, + "flos": 575557581312.0, + "grad_norm": 0.046520134554953796, + "language_loss": 0.98784387, + "learning_rate": 0.0009827695330590185, + "loss": 0.99906206, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 0.1619873, + "step": 143, + "time_per_iteration": 2.6495330333709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124398, + "balance_loss_mlp": 1.1078757, + "epoch": 0.02770296267795306, + "flos": 772420230144.0, + "grad_norm": 0.05485832849515215, + "language_loss": 0.98036379, + "learning_rate": 0.0009841495069248256, + "loss": 0.99160779, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 0.1652832, + "step": 144, + "time_per_iteration": 2.9577834606170654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145392, + "balance_loss_mlp": 1.12901306, + "epoch": 0.027895344363216622, + "flos": 569387957760.0, + "grad_norm": 0.09798795242100523, + "language_loss": 0.97478735, + "learning_rate": 0.0009855199307219871, + "loss": 0.98624128, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 0.16381836, + "step": 145, + "time_per_iteration": 2.6759142875671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148365, + "balance_loss_mlp": 1.13168764, + "epoch": 0.028087726048480186, + "flos": 547360561152.0, + "grad_norm": 0.1254453322996171, + "language_loss": 0.99733889, + "learning_rate": 0.0009868809357244854, + "loss": 1.00882256, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 0.16687012, + "step": 146, + "time_per_iteration": 2.66375994682312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113683, + "balance_loss_mlp": 1.11978364, + "epoch": 0.02828010773374375, + "flos": 524789508096.0, + "grad_norm": 0.08248071954181796, + "language_loss": 1.03600287, + "learning_rate": 0.0009882326505180556, + "loss": 1.04737115, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 0.1706543, + "step": 147, + "time_per_iteration": 2.719353437423706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151222, + "balance_loss_mlp": 1.13280392, + "epoch": 0.02847248941900731, + "flos": 772440053760.0, + "grad_norm": 0.12761243433758393, + "language_loss": 1.02101135, + "learning_rate": 0.0009895752010730906, + "loss": 1.03252351, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 0.1842041, + "step": 148, + "time_per_iteration": 2.9704201221466064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141454, + "balance_loss_mlp": 1.12377512, + "epoch": 0.028664871104270875, + "flos": 534413417472.0, + "grad_norm": 0.07962775403881484, + "language_loss": 1.0825479, + "learning_rate": 0.0009909087108150867, + "loss": 1.09396255, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 0.17687988, + "step": 149, + "time_per_iteration": 2.7516071796417236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151554, + "balance_loss_mlp": 1.13330352, + "epoch": 0.028857252789534438, + "flos": 367766396928.0, + "grad_norm": 0.10196194967952074, + "language_loss": 1.09083438, + "learning_rate": 0.0009922333006927371, + "loss": 1.10235, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 0.18249512, + "step": 150, + "time_per_iteration": 2.4685099124908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170515, + "balance_loss_mlp": 1.15218103, + "epoch": 0.029049634474798, + "flos": 515482030080.0, + "grad_norm": 0.13259475383105176, + "language_loss": 1.020684, + "learning_rate": 0.0009935490892437632, + "loss": 1.03238916, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 0.18322754, + "step": 151, + "time_per_iteration": 2.5665087699890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166904, + "balance_loss_mlp": 1.14880824, + "epoch": 0.029242016160061564, + "flos": 588141305856.0, + "grad_norm": 0.10481585745820837, + "language_loss": 1.00390673, + "learning_rate": 0.0009948561926585687, + "loss": 1.01557577, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 0.18103027, + "step": 152, + "time_per_iteration": 2.7641003131866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139325, + "balance_loss_mlp": 1.122576, + "epoch": 0.029434397845325123, + "flos": 552079825920.0, + "grad_norm": 0.09697971136145118, + "language_loss": 1.05073512, + "learning_rate": 0.0009961547248418122, + "loss": 1.06212831, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 0.16760254, + "step": 153, + "time_per_iteration": 2.631476402282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123418, + "balance_loss_mlp": 1.10662186, + "epoch": 0.029626779530588686, + "flos": 603497640960.0, + "grad_norm": 0.05437877185758658, + "language_loss": 1.01441622, + "learning_rate": 0.0009974447974719707, + "loss": 1.0256505, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 0.16809082, + "step": 154, + "time_per_iteration": 2.709644317626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129323, + "balance_loss_mlp": 1.11151338, + "epoch": 0.02981916121585225, + "flos": 621089897472.0, + "grad_norm": 0.09703401576709127, + "language_loss": 1.03478801, + "learning_rate": 0.0009987265200589763, + "loss": 1.0460813, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 0.17810059, + "step": 155, + "time_per_iteration": 2.77809739112854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140894, + "balance_loss_mlp": 1.12376344, + "epoch": 0.030011542901115813, + "flos": 661633505280.0, + "grad_norm": 0.08300490544518559, + "language_loss": 1.02959824, + "learning_rate": 0.001, + "loss": 1.04100728, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 0.17150879, + "step": 156, + "time_per_iteration": 2.845790386199951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144802, + "balance_loss_mlp": 1.12720668, + "epoch": 0.030203924586379376, + "flos": 651569826816.0, + "grad_norm": 0.07590676388764007, + "language_loss": 1.00599122, + "learning_rate": 0.0009999999029413921, + "loss": 1.01743913, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 0.17614746, + "step": 157, + "time_per_iteration": 2.833735227584839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142594, + "balance_loss_mlp": 1.12554669, + "epoch": 0.03039630627164294, + "flos": 531354484224.0, + "grad_norm": 0.06607639809804342, + "language_loss": 1.01453137, + "learning_rate": 0.0009999996117656068, + "loss": 1.02595735, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 0.1706543, + "step": 158, + "time_per_iteration": 2.803636074066162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011301, + "balance_loss_mlp": 1.11345792, + "epoch": 0.030588687956906502, + "flos": 586189509120.0, + "grad_norm": 0.08769352458743468, + "language_loss": 0.94982773, + "learning_rate": 0.0009999991264727564, + "loss": 0.96112871, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 0.16638184, + "step": 159, + "time_per_iteration": 2.7776851654052734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135859, + "balance_loss_mlp": 1.11870432, + "epoch": 0.030781069642170065, + "flos": 513278042112.0, + "grad_norm": 0.05788098803643346, + "language_loss": 1.06247735, + "learning_rate": 0.0009999984470630296, + "loss": 1.07383585, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 0.17163086, + "step": 160, + "time_per_iteration": 2.6311371326446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125321, + "balance_loss_mlp": 1.10836911, + "epoch": 0.030973451327433628, + "flos": 718123719168.0, + "grad_norm": 0.05159431076001957, + "language_loss": 0.94850963, + "learning_rate": 0.0009999975735366902, + "loss": 0.95976287, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 0.16955566, + "step": 161, + "time_per_iteration": 3.0904829502105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148114, + "balance_loss_mlp": 1.13099504, + "epoch": 0.03116583301269719, + "flos": 1109771311104.0, + "grad_norm": 0.0692270455282635, + "language_loss": 0.96706492, + "learning_rate": 0.0009999965058940775, + "loss": 0.97854608, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 0.17138672, + "step": 162, + "time_per_iteration": 3.490063428878784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150632, + "balance_loss_mlp": 1.13323975, + "epoch": 0.031358214697960754, + "flos": 450907098624.0, + "grad_norm": 0.08572766411177644, + "language_loss": 1.03267431, + "learning_rate": 0.0009999952441356057, + "loss": 1.04418063, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 0.17382812, + "step": 163, + "time_per_iteration": 2.497690439224243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130614, + "balance_loss_mlp": 1.11405563, + "epoch": 0.031550596383224314, + "flos": 1255176870912.0, + "grad_norm": 0.05784293330097489, + "language_loss": 1.03805065, + "learning_rate": 0.000999993788261765, + "loss": 1.0493567, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 0.16564941, + "step": 164, + "time_per_iteration": 3.6041390895843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132475, + "balance_loss_mlp": 1.1152972, + "epoch": 0.03174297806848788, + "flos": 668136812544.0, + "grad_norm": 0.05766532368121917, + "language_loss": 1.05311596, + "learning_rate": 0.00099999213827312, + "loss": 1.06444073, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 0.171875, + "step": 165, + "time_per_iteration": 2.806014060974121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142431, + "balance_loss_mlp": 1.12589669, + "epoch": 0.03193535975375144, + "flos": 551299032576.0, + "grad_norm": 0.05992608893057494, + "language_loss": 1.00112009, + "learning_rate": 0.000999990294170312, + "loss": 1.01254439, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 0.16540527, + "step": 166, + "time_per_iteration": 2.6405951976776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113035, + "balance_loss_mlp": 1.11351717, + "epoch": 0.032127741439015006, + "flos": 543649314816.0, + "grad_norm": 0.05363857392651908, + "language_loss": 1.03767109, + "learning_rate": 0.0009999882559540566, + "loss": 1.04897451, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 0.16845703, + "step": 167, + "time_per_iteration": 2.69801664352417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127606, + "balance_loss_mlp": 1.11079764, + "epoch": 0.032320123124278566, + "flos": 548385831936.0, + "grad_norm": 0.03971308084427602, + "language_loss": 1.00767386, + "learning_rate": 0.000999986023625145, + "loss": 1.01894999, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 0.16821289, + "step": 168, + "time_per_iteration": 2.710706949234009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04227602, + "balance_loss_mlp": 3.93005633, + "epoch": 0.03251250480954213, + "flos": 1305886551552.0, + "grad_norm": 0.49669676383753814, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.8315202, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 2.96875, + "step": 169, + "time_per_iteration": 4.921034574508667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178384, + "balance_loss_mlp": 1.15987098, + "epoch": 0.03270488649480569, + "flos": 561132914688.0, + "grad_norm": 0.11256254520903143, + "language_loss": 1.01289928, + "learning_rate": 0.0009999809766328958, + "loss": 1.02468312, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 0.18518066, + "step": 170, + "time_per_iteration": 2.6784250736236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236994, + "balance_loss_mlp": 1.21676469, + "epoch": 0.03289726818006926, + "flos": 482363112960.0, + "grad_norm": 0.13219145589868983, + "language_loss": 1.0357101, + "learning_rate": 0.0009999781619715177, + "loss": 1.04807997, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 0.20227051, + "step": 171, + "time_per_iteration": 2.5412755012512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234758, + "balance_loss_mlp": 1.21518433, + "epoch": 0.03308964986533282, + "flos": 674647460352.0, + "grad_norm": 0.05193788120122226, + "language_loss": 1.03408492, + "learning_rate": 0.000999975153201402, + "loss": 1.0464325, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 0.19567871, + "step": 172, + "time_per_iteration": 2.864586353302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236351, + "balance_loss_mlp": 1.21688426, + "epoch": 0.033282031550596385, + "flos": 609217583616.0, + "grad_norm": 0.0814546252210238, + "language_loss": 1.01345742, + "learning_rate": 0.0009999719503237174, + "loss": 1.02582097, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 0.19470215, + "step": 173, + "time_per_iteration": 2.765923261642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266228, + "balance_loss_mlp": 1.24583161, + "epoch": 0.033474413235859944, + "flos": 468039762432.0, + "grad_norm": 0.11494520888694326, + "language_loss": 1.10141742, + "learning_rate": 0.0009999685533397073, + "loss": 1.11407971, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 0.20410156, + "step": 174, + "time_per_iteration": 2.5439114570617676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264706, + "balance_loss_mlp": 1.24525094, + "epoch": 0.03366679492112351, + "flos": 579634444800.0, + "grad_norm": 0.12313705571337571, + "language_loss": 1.01947784, + "learning_rate": 0.00099996496225069, + "loss": 1.03212488, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 0.19445801, + "step": 175, + "time_per_iteration": 2.6815552711486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01257561, + "balance_loss_mlp": 1.23677111, + "epoch": 0.03385917660638707, + "flos": 637678904832.0, + "grad_norm": 0.07888015485072913, + "language_loss": 1.04929149, + "learning_rate": 0.0009999611770580604, + "loss": 1.06186724, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 0.20788574, + "step": 176, + "time_per_iteration": 2.841484785079956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258013, + "balance_loss_mlp": 1.23668683, + "epoch": 0.03405155829165064, + "flos": 441816933888.0, + "grad_norm": 0.1202186920466195, + "language_loss": 1.03394961, + "learning_rate": 0.0009999571977632876, + "loss": 1.04652977, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 0.21350098, + "step": 177, + "time_per_iteration": 2.567788600921631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01271496, + "balance_loss_mlp": 1.25026441, + "epoch": 0.034243939976914196, + "flos": 466332443136.0, + "grad_norm": 0.09201820914192435, + "language_loss": 1.05765235, + "learning_rate": 0.0009999530243679166, + "loss": 1.07036722, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 0.21240234, + "step": 178, + "time_per_iteration": 2.5753743648529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258548, + "balance_loss_mlp": 1.23935485, + "epoch": 0.03443632166217776, + "flos": 779276671488.0, + "grad_norm": 0.06529189645852858, + "language_loss": 1.00495052, + "learning_rate": 0.0009999486568735675, + "loss": 1.01753592, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 0.19177246, + "step": 179, + "time_per_iteration": 3.0607473850250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251876, + "balance_loss_mlp": 1.23275518, + "epoch": 0.03462870334744132, + "flos": 1263777707520.0, + "grad_norm": 0.07628849485304477, + "language_loss": 1.00889277, + "learning_rate": 0.0009999440952819362, + "loss": 1.02141166, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 0.19116211, + "step": 180, + "time_per_iteration": 3.6515376567840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01248658, + "balance_loss_mlp": 1.22853494, + "epoch": 0.03482108503270489, + "flos": 607179151872.0, + "grad_norm": 0.05983966318213213, + "language_loss": 1.0115366, + "learning_rate": 0.0009999393395947935, + "loss": 1.02402306, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 0.2010498, + "step": 181, + "time_per_iteration": 2.799633502960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01253433, + "balance_loss_mlp": 1.23378766, + "epoch": 0.03501346671796845, + "flos": 538270396416.0, + "grad_norm": 0.0770350968764605, + "language_loss": 1.04747987, + "learning_rate": 0.0009999343898139858, + "loss": 1.06001413, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 0.19641113, + "step": 182, + "time_per_iteration": 2.627434253692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258891, + "balance_loss_mlp": 1.23675334, + "epoch": 0.035205848403232015, + "flos": 518484063744.0, + "grad_norm": 0.06485795323962908, + "language_loss": 1.03381288, + "learning_rate": 0.0009999292459414348, + "loss": 1.04640174, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 0.22131348, + "step": 183, + "time_per_iteration": 2.5552356243133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227697, + "balance_loss_mlp": 1.20765769, + "epoch": 0.035398230088495575, + "flos": 472373586432.0, + "grad_norm": 0.06837915158031915, + "language_loss": 1.07873201, + "learning_rate": 0.0009999239079791374, + "loss": 1.0910089, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 0.20031738, + "step": 184, + "time_per_iteration": 2.5553643703460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225953, + "balance_loss_mlp": 1.20453107, + "epoch": 0.03559061177375914, + "flos": 512074732032.0, + "grad_norm": 0.05538225102727573, + "language_loss": 1.00595856, + "learning_rate": 0.0009999183759291659, + "loss": 1.01821804, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 0.21435547, + "step": 185, + "time_per_iteration": 2.6955769062042236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199054, + "balance_loss_mlp": 1.17938447, + "epoch": 0.0357829934590227, + "flos": 477386887680.0, + "grad_norm": 0.052094207769016576, + "language_loss": 1.02581143, + "learning_rate": 0.0009999126497936682, + "loss": 1.03780198, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 0.1965332, + "step": 186, + "time_per_iteration": 2.5304598808288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198293, + "balance_loss_mlp": 1.1770494, + "epoch": 0.03597537514428627, + "flos": 644656485888.0, + "grad_norm": 0.057723222775786294, + "language_loss": 1.05774581, + "learning_rate": 0.0009999067295748676, + "loss": 1.06972873, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 0.21252441, + "step": 187, + "time_per_iteration": 2.797293186187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225876, + "balance_loss_mlp": 1.20496714, + "epoch": 0.03616775682954983, + "flos": 581186119680.0, + "grad_norm": 0.0756096280824464, + "language_loss": 1.03738201, + "learning_rate": 0.000999900615275062, + "loss": 1.04964077, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 0.20922852, + "step": 188, + "time_per_iteration": 2.677471399307251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211466, + "balance_loss_mlp": 1.18979406, + "epoch": 0.03636013851481339, + "flos": 382420859904.0, + "grad_norm": 0.0898221855427691, + "language_loss": 1.09605587, + "learning_rate": 0.0009998943068966256, + "loss": 1.10817051, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 0.21679688, + "step": 189, + "time_per_iteration": 2.4233202934265137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217638, + "balance_loss_mlp": 1.19651425, + "epoch": 0.03655252020007695, + "flos": 583224551424.0, + "grad_norm": 0.10338446511893212, + "language_loss": 1.03747463, + "learning_rate": 0.0009998878044420072, + "loss": 1.04965115, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 0.21130371, + "step": 190, + "time_per_iteration": 2.6978025436401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177731, + "balance_loss_mlp": 1.15573716, + "epoch": 0.03674490188534051, + "flos": 471619957248.0, + "grad_norm": 0.06881722524262912, + "language_loss": 0.99768066, + "learning_rate": 0.0009998811079137318, + "loss": 1.00945807, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 0.22009277, + "step": 191, + "time_per_iteration": 2.5934321880340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114348, + "balance_loss_mlp": 1.12218916, + "epoch": 0.03693728357060408, + "flos": 528372274176.0, + "grad_norm": 0.0852793637050772, + "language_loss": 1.0086391, + "learning_rate": 0.0009998742173143987, + "loss": 1.02007401, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 0.2130127, + "step": 192, + "time_per_iteration": 2.6706249713897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139307, + "balance_loss_mlp": 1.1180048, + "epoch": 0.03712966525586764, + "flos": 798993994752.0, + "grad_norm": 0.07456835679934387, + "language_loss": 1.01398337, + "learning_rate": 0.0009998671326466833, + "loss": 1.02537644, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 0.21313477, + "step": 193, + "time_per_iteration": 2.992595672607422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126772, + "balance_loss_mlp": 1.10519516, + "epoch": 0.037322046941131205, + "flos": 829973164032.0, + "grad_norm": 0.08171257283174432, + "language_loss": 1.02813613, + "learning_rate": 0.0009998598539133362, + "loss": 1.03940392, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 0.21594238, + "step": 194, + "time_per_iteration": 3.0081543922424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113199, + "balance_loss_mlp": 1.11179638, + "epoch": 0.037514428626394765, + "flos": 437685742080.0, + "grad_norm": 0.05573112518601677, + "language_loss": 1.02892375, + "learning_rate": 0.0009998523811171828, + "loss": 1.04024363, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 0.2019043, + "step": 195, + "time_per_iteration": 2.507708787918091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149122, + "balance_loss_mlp": 1.12843966, + "epoch": 0.03770681031165833, + "flos": 511625051136.0, + "grad_norm": 0.0935188115694547, + "language_loss": 1.0387187, + "learning_rate": 0.0009998447142611248, + "loss": 1.05020976, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 0.20690918, + "step": 196, + "time_per_iteration": 2.6388566493988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160139, + "balance_loss_mlp": 1.13986123, + "epoch": 0.03789919199692189, + "flos": 807449098752.0, + "grad_norm": 0.047444937864230444, + "language_loss": 0.96302813, + "learning_rate": 0.0009998368533481387, + "loss": 0.97462952, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 0.20275879, + "step": 197, + "time_per_iteration": 3.033572196960449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132809, + "balance_loss_mlp": 1.11254394, + "epoch": 0.03809157368218546, + "flos": 690576814080.0, + "grad_norm": 0.08710369828361038, + "language_loss": 0.9995833, + "learning_rate": 0.0009998287983812762, + "loss": 1.01091146, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 0.20263672, + "step": 198, + "time_per_iteration": 2.8421950340270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155397, + "balance_loss_mlp": 1.13373709, + "epoch": 0.03828395536744902, + "flos": 517940407296.0, + "grad_norm": 0.10277508525357126, + "language_loss": 1.05776644, + "learning_rate": 0.0009998205493636646, + "loss": 1.06932044, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 0.2166748, + "step": 199, + "time_per_iteration": 2.6924569606781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141939, + "balance_loss_mlp": 1.12035084, + "epoch": 0.038476337052712584, + "flos": 581662964736.0, + "grad_norm": 0.09429923895154278, + "language_loss": 0.98451054, + "learning_rate": 0.0009998121062985063, + "loss": 0.99592984, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 0.21594238, + "step": 200, + "time_per_iteration": 2.6926732063293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171328, + "balance_loss_mlp": 1.15014482, + "epoch": 0.03866871873797614, + "flos": 577086861312.0, + "grad_norm": 0.08332681767957313, + "language_loss": 1.00419915, + "learning_rate": 0.0009998034691890794, + "loss": 1.01591253, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 0.21203613, + "step": 201, + "time_per_iteration": 2.7643332481384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165409, + "balance_loss_mlp": 1.14516699, + "epoch": 0.03886110042323971, + "flos": 540731344896.0, + "grad_norm": 0.11326578301102472, + "language_loss": 1.05536067, + "learning_rate": 0.0009997946380387369, + "loss": 1.06701469, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 0.20251465, + "step": 202, + "time_per_iteration": 2.630284070968628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157571, + "balance_loss_mlp": 1.13723421, + "epoch": 0.03905348210850327, + "flos": 718002952704.0, + "grad_norm": 0.09790094078320352, + "language_loss": 1.07388449, + "learning_rate": 0.0009997856128509076, + "loss": 1.08546019, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 0.20336914, + "step": 203, + "time_per_iteration": 2.8435540199279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144349, + "balance_loss_mlp": 1.12458408, + "epoch": 0.039245863793766836, + "flos": 427493583360.0, + "grad_norm": 0.1356659453961297, + "language_loss": 1.02559984, + "learning_rate": 0.0009997763936290952, + "loss": 1.03704333, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 0.19750977, + "step": 204, + "time_per_iteration": 2.503309965133667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138207, + "balance_loss_mlp": 1.11642766, + "epoch": 0.039438245479030395, + "flos": 663096347136.0, + "grad_norm": 0.053010676996176516, + "language_loss": 1.07603145, + "learning_rate": 0.0009997669803768789, + "loss": 1.08741355, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 0.21789551, + "step": 205, + "time_per_iteration": 2.7773749828338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011146, + "balance_loss_mlp": 1.09366679, + "epoch": 0.03963062716429396, + "flos": 635349007872.0, + "grad_norm": 0.07785432610828748, + "language_loss": 1.0289582, + "learning_rate": 0.0009997573730979134, + "loss": 1.04010415, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 0.20947266, + "step": 206, + "time_per_iteration": 2.7241222858428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04720912, + "balance_loss_mlp": 3.71993518, + "epoch": 0.03982300884955752, + "flos": 1418565975552.0, + "grad_norm": 0.31672297251450016, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.83914113, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 10.0, + "step": 207, + "time_per_iteration": 4.65311074256897 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160001, + "balance_loss_mlp": 1.13651657, + "epoch": 0.04001539053482109, + "flos": 689118741504.0, + "grad_norm": 0.09244016287770654, + "language_loss": 1.01599813, + "learning_rate": 0.0009997375764747294, + "loss": 1.02759814, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 0.23449707, + "step": 208, + "time_per_iteration": 2.999249219894409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144547, + "balance_loss_mlp": 1.12159967, + "epoch": 0.04020777222008465, + "flos": 533639964672.0, + "grad_norm": 0.10768555369795524, + "language_loss": 0.98886019, + "learning_rate": 0.0009997273871381967, + "loss": 1.00030565, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 0.22949219, + "step": 209, + "time_per_iteration": 2.740895986557007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154635, + "balance_loss_mlp": 1.13075733, + "epoch": 0.040400153905348214, + "flos": 567927687168.0, + "grad_norm": 0.0670178022721504, + "language_loss": 1.03911638, + "learning_rate": 0.0009997170037902862, + "loss": 1.05066276, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 0.23876953, + "step": 210, + "time_per_iteration": 2.7199809551239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161677, + "balance_loss_mlp": 1.13826418, + "epoch": 0.040592535590611774, + "flos": 713439332352.0, + "grad_norm": 0.062356382061819024, + "language_loss": 1.06535935, + "learning_rate": 0.0009997064264350292, + "loss": 1.07697606, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 0.23413086, + "step": 211, + "time_per_iteration": 2.85477614402771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164794, + "balance_loss_mlp": 1.14111865, + "epoch": 0.04078491727587533, + "flos": 578100022272.0, + "grad_norm": 0.11782714892356931, + "language_loss": 1.00570273, + "learning_rate": 0.0009996956550765317, + "loss": 1.01735067, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 0.23657227, + "step": 212, + "time_per_iteration": 2.683258295059204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178964, + "balance_loss_mlp": 1.15452623, + "epoch": 0.0409772989611389, + "flos": 552299710464.0, + "grad_norm": 0.07352585681220185, + "language_loss": 0.95357072, + "learning_rate": 0.0009996846897189762, + "loss": 0.9653604, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 0.24438477, + "step": 213, + "time_per_iteration": 2.64486026763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171157, + "balance_loss_mlp": 1.14665973, + "epoch": 0.04116968064640246, + "flos": 555630285312.0, + "grad_norm": 0.06101080420793073, + "language_loss": 1.01569629, + "learning_rate": 0.0009996735303666193, + "loss": 1.02740788, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 0.24499512, + "step": 214, + "time_per_iteration": 2.719754934310913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189275, + "balance_loss_mlp": 1.16434813, + "epoch": 0.041362062331666026, + "flos": 578492803584.0, + "grad_norm": 0.09805160088916984, + "language_loss": 1.03784573, + "learning_rate": 0.0009996621770237937, + "loss": 1.04973853, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 0.24938965, + "step": 215, + "time_per_iteration": 2.7283520698547363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202725, + "balance_loss_mlp": 1.17728579, + "epoch": 0.041554444016929586, + "flos": 611443593216.0, + "grad_norm": 0.05858333324383458, + "language_loss": 0.99328029, + "learning_rate": 0.0009996506296949073, + "loss": 1.00530756, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 0.25463867, + "step": 216, + "time_per_iteration": 2.8774044513702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175124, + "balance_loss_mlp": 1.14957714, + "epoch": 0.04174682570219315, + "flos": 528115313664.0, + "grad_norm": 0.09898600739692984, + "language_loss": 0.99386859, + "learning_rate": 0.0009996388883844428, + "loss": 1.00561976, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 0.25561523, + "step": 217, + "time_per_iteration": 2.5985324382781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155134, + "balance_loss_mlp": 1.13007665, + "epoch": 0.04193920738745671, + "flos": 511506482688.0, + "grad_norm": 0.06208913439552352, + "language_loss": 1.03500867, + "learning_rate": 0.0009996269530969588, + "loss": 1.04656017, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 0.25048828, + "step": 218, + "time_per_iteration": 2.591993808746338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152332, + "balance_loss_mlp": 1.12778735, + "epoch": 0.04213158907272028, + "flos": 571490629632.0, + "grad_norm": 0.08789931910276294, + "language_loss": 1.02762055, + "learning_rate": 0.0009996148238370888, + "loss": 1.0391438, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 0.24536133, + "step": 219, + "time_per_iteration": 2.7247660160064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146753, + "balance_loss_mlp": 1.12125421, + "epoch": 0.04232397075798384, + "flos": 964222589952.0, + "grad_norm": 0.059765696203788965, + "language_loss": 0.98427057, + "learning_rate": 0.0009996025006095421, + "loss": 0.99573809, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 0.25524902, + "step": 220, + "time_per_iteration": 3.314250946044922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04012538, + "balance_loss_mlp": 3.61886096, + "epoch": 0.042516352443247404, + "flos": 1469595778560.0, + "grad_norm": 0.18322335632445477, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.81795681, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 3.921875, + "step": 221, + "time_per_iteration": 5.397853851318359 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142792, + "balance_loss_mlp": 1.11779404, + "epoch": 0.042708734128510964, + "flos": 654712823808.0, + "grad_norm": 0.10045289138425088, + "language_loss": 0.98726314, + "learning_rate": 0.0009995772722706307, + "loss": 0.99869102, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 0.25, + "step": 222, + "time_per_iteration": 2.8346786499023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168149, + "balance_loss_mlp": 1.14130318, + "epoch": 0.04290111581377453, + "flos": 431827407360.0, + "grad_norm": 0.07395583213906755, + "language_loss": 1.12709904, + "learning_rate": 0.0009995643671690604, + "loss": 1.13878047, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 0.26879883, + "step": 223, + "time_per_iteration": 2.4760169982910156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157966, + "balance_loss_mlp": 1.1317513, + "epoch": 0.04309349749903809, + "flos": 644676309504.0, + "grad_norm": 0.08239055528326475, + "language_loss": 1.00208497, + "learning_rate": 0.0009995512681194023, + "loss": 1.01366448, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 0.26257324, + "step": 224, + "time_per_iteration": 2.833751916885376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151672, + "balance_loss_mlp": 1.12492132, + "epoch": 0.04328587918430166, + "flos": 831267505152.0, + "grad_norm": 0.058356102807926864, + "language_loss": 0.97854793, + "learning_rate": 0.0009995379751267417, + "loss": 0.99006462, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 0.2677002, + "step": 225, + "time_per_iteration": 3.295761823654175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182559, + "balance_loss_mlp": 1.1551652, + "epoch": 0.043478260869565216, + "flos": 525066292224.0, + "grad_norm": 0.09032086206875983, + "language_loss": 0.99067688, + "learning_rate": 0.0009995244881962398, + "loss": 1.00250244, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 0.27416992, + "step": 226, + "time_per_iteration": 2.6147754192352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162924, + "balance_loss_mlp": 1.1352675, + "epoch": 0.04367064255482878, + "flos": 439484465664.0, + "grad_norm": 0.05273235380658081, + "language_loss": 1.00220668, + "learning_rate": 0.0009995108073331323, + "loss": 1.01383591, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 0.27661133, + "step": 227, + "time_per_iteration": 2.575477361679077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165107, + "balance_loss_mlp": 1.13835633, + "epoch": 0.04386302424009234, + "flos": 507380060160.0, + "grad_norm": 0.07222661628022838, + "language_loss": 1.03328192, + "learning_rate": 0.0009994969325427309, + "loss": 1.04493296, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 0.26733398, + "step": 228, + "time_per_iteration": 2.7351901531219482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159475, + "balance_loss_mlp": 1.13215184, + "epoch": 0.04405540592535591, + "flos": 540694268928.0, + "grad_norm": 0.05690950477809338, + "language_loss": 0.99788582, + "learning_rate": 0.0009994828638304218, + "loss": 1.0094806, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 0.2734375, + "step": 229, + "time_per_iteration": 2.6617660522460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160216, + "balance_loss_mlp": 1.13327467, + "epoch": 0.04424778761061947, + "flos": 446370642432.0, + "grad_norm": 0.0671245201901001, + "language_loss": 1.05080867, + "learning_rate": 0.0009994686012016675, + "loss": 1.06241083, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 0.26953125, + "step": 230, + "time_per_iteration": 2.5507686138153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200075, + "balance_loss_mlp": 1.17368245, + "epoch": 0.044440169295883035, + "flos": 700702161408.0, + "grad_norm": 0.08083200993131012, + "language_loss": 1.04836714, + "learning_rate": 0.000999454144662005, + "loss": 1.06036782, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 0.26416016, + "step": 231, + "time_per_iteration": 2.872386932373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177085, + "balance_loss_mlp": 1.15090632, + "epoch": 0.044632550981146595, + "flos": 588329256960.0, + "grad_norm": 0.06521500069668446, + "language_loss": 0.98697901, + "learning_rate": 0.0009994394942170468, + "loss": 0.99874985, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 0.26208496, + "step": 232, + "time_per_iteration": 2.6734542846679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160129, + "balance_loss_mlp": 1.13452244, + "epoch": 0.04482493266641016, + "flos": 554797734912.0, + "grad_norm": 0.06848368332912834, + "language_loss": 0.96340638, + "learning_rate": 0.0009994246498724808, + "loss": 0.97500765, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 0.25598145, + "step": 233, + "time_per_iteration": 2.735145330429077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169344, + "balance_loss_mlp": 1.14341569, + "epoch": 0.04501731435167372, + "flos": 722813621760.0, + "grad_norm": 0.09664881582101635, + "language_loss": 0.99309772, + "learning_rate": 0.00099940961163407, + "loss": 1.00479114, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 0.25964355, + "step": 234, + "time_per_iteration": 2.8988683223724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142226, + "balance_loss_mlp": 1.11722803, + "epoch": 0.04520969603693728, + "flos": 511790607360.0, + "grad_norm": 0.06003753756121682, + "language_loss": 1.01686716, + "learning_rate": 0.0009993943795076528, + "loss": 1.02828944, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 0.25012207, + "step": 235, + "time_per_iteration": 2.6333067417144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132836, + "balance_loss_mlp": 1.10618043, + "epoch": 0.04540207772220085, + "flos": 365058399744.0, + "grad_norm": 0.08170413586498586, + "language_loss": 1.0374043, + "learning_rate": 0.0009993789534991427, + "loss": 1.04873264, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 0.26708984, + "step": 236, + "time_per_iteration": 2.4350106716156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112254, + "balance_loss_mlp": 1.0960753, + "epoch": 0.045594459407464406, + "flos": 522669583872.0, + "grad_norm": 0.0440176634981383, + "language_loss": 0.99063611, + "learning_rate": 0.0009993633336145287, + "loss": 1.00186157, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 0.26513672, + "step": 237, + "time_per_iteration": 2.6414294242858887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134799, + "balance_loss_mlp": 1.10904956, + "epoch": 0.04578684109272797, + "flos": 671776104960.0, + "grad_norm": 0.04213473561248219, + "language_loss": 1.02718055, + "learning_rate": 0.0009993475198598752, + "loss": 1.03852856, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 0.25756836, + "step": 238, + "time_per_iteration": 2.9781904220581055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152995, + "balance_loss_mlp": 1.12614954, + "epoch": 0.04597922277799153, + "flos": 541633277952.0, + "grad_norm": 0.08613106589232603, + "language_loss": 1.00055635, + "learning_rate": 0.0009993315122413212, + "loss": 1.01208627, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 0.26879883, + "step": 239, + "time_per_iteration": 2.6395275592803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162384, + "balance_loss_mlp": 1.13594294, + "epoch": 0.0461716044632551, + "flos": 458984102400.0, + "grad_norm": 0.06839694959482054, + "language_loss": 0.99973977, + "learning_rate": 0.0009993153107650818, + "loss": 1.01136363, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 0.2644043, + "step": 240, + "time_per_iteration": 2.563133716583252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160824, + "balance_loss_mlp": 1.13391829, + "epoch": 0.04636398614851866, + "flos": 455240922624.0, + "grad_norm": 0.06471449859153773, + "language_loss": 0.98970807, + "learning_rate": 0.0009992989154374468, + "loss": 1.00131631, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 0.26928711, + "step": 241, + "time_per_iteration": 2.5339503288269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145463, + "balance_loss_mlp": 1.11914206, + "epoch": 0.046556367833782225, + "flos": 556826254848.0, + "grad_norm": 0.06957696695924716, + "language_loss": 1.05868769, + "learning_rate": 0.0009992823262647817, + "loss": 1.07014227, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 0.26342773, + "step": 242, + "time_per_iteration": 2.6841883659362793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111302, + "balance_loss_mlp": 1.08692503, + "epoch": 0.046748749519045785, + "flos": 592917470208.0, + "grad_norm": 0.0649477492764712, + "language_loss": 0.99848783, + "learning_rate": 0.0009992655432535264, + "loss": 1.00961804, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 0.2611084, + "step": 243, + "time_per_iteration": 2.7613234519958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107198, + "balance_loss_mlp": 1.08162785, + "epoch": 0.04694113120430935, + "flos": 569864802816.0, + "grad_norm": 0.05612685480258275, + "language_loss": 1.00329947, + "learning_rate": 0.0009992485664101973, + "loss": 1.01437151, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 0.25598145, + "step": 244, + "time_per_iteration": 2.717280387878418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122363, + "balance_loss_mlp": 1.09556472, + "epoch": 0.04713351288957291, + "flos": 863768987136.0, + "grad_norm": 0.10316769075352135, + "language_loss": 1.02662849, + "learning_rate": 0.000999231395741385, + "loss": 1.03785205, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 0.26831055, + "step": 245, + "time_per_iteration": 3.095249891281128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144012, + "balance_loss_mlp": 1.11837006, + "epoch": 0.04732589457483648, + "flos": 537215390208.0, + "grad_norm": 0.09647975042234339, + "language_loss": 1.01015186, + "learning_rate": 0.0009992140312537557, + "loss": 1.02159202, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 0.25671387, + "step": 246, + "time_per_iteration": 2.633258819580078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123665, + "balance_loss_mlp": 1.09845233, + "epoch": 0.04751827626010004, + "flos": 761906870784.0, + "grad_norm": 0.09798218580430706, + "language_loss": 0.95550418, + "learning_rate": 0.000999196472954051, + "loss": 0.96674085, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 0.25231934, + "step": 247, + "time_per_iteration": 3.024939775466919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02466762, + "balance_loss_mlp": 2.43700695, + "epoch": 0.0477106579453636, + "flos": 1579791859200.0, + "grad_norm": 0.2831653982047738, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.81891614, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 0.296875, + "step": 248, + "time_per_iteration": 5.486468076705933 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162703, + "balance_loss_mlp": 1.13626289, + "epoch": 0.04790303963062716, + "flos": 457766111232.0, + "grad_norm": 0.12969478117477343, + "language_loss": 1.03178453, + "learning_rate": 0.0009991607749457578, + "loss": 1.04341149, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 0.26464844, + "step": 249, + "time_per_iteration": 2.5253713130950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119774, + "balance_loss_mlp": 1.16941571, + "epoch": 0.04809542131589073, + "flos": 782419668480.0, + "grad_norm": 0.09425507858465235, + "language_loss": 1.01008546, + "learning_rate": 0.0009991426352510286, + "loss": 1.0220629, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 0.28295898, + "step": 250, + "time_per_iteration": 3.0042202472686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204128, + "balance_loss_mlp": 1.174016, + "epoch": 0.04828780300115429, + "flos": 559260039168.0, + "grad_norm": 0.07677732337183582, + "language_loss": 1.0282234, + "learning_rate": 0.0009991243017719422, + "loss": 1.04026473, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 0.30126953, + "step": 251, + "time_per_iteration": 2.709934711456299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206766, + "balance_loss_mlp": 1.17522311, + "epoch": 0.048480184686417856, + "flos": 501929561088.0, + "grad_norm": 0.1103729500964747, + "language_loss": 0.97436613, + "learning_rate": 0.0009991057745156165, + "loss": 0.9864338, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 0.31518555, + "step": 252, + "time_per_iteration": 2.5961716175079346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03348202, + "balance_loss_mlp": 3.30471396, + "epoch": 0.048672566371681415, + "flos": 1536360016896.0, + "grad_norm": 0.3811060337507454, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.85259187, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 0.43554688, + "step": 253, + "time_per_iteration": 5.0377867221832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195312, + "balance_loss_mlp": 1.1623621, + "epoch": 0.04886494805694498, + "flos": 537922031616.0, + "grad_norm": 0.07473951959737497, + "language_loss": 1.05491519, + "learning_rate": 0.0009990681387000943, + "loss": 1.06686831, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 0.3293457, + "step": 254, + "time_per_iteration": 2.7937283515930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121698, + "balance_loss_mlp": 1.18345821, + "epoch": 0.04905732974220854, + "flos": 680169540096.0, + "grad_norm": 0.06898181212790383, + "language_loss": 1.01063621, + "learning_rate": 0.0009990490301555093, + "loss": 1.02280605, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 0.33544922, + "step": 255, + "time_per_iteration": 2.9615726470947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.05252755, + "balance_loss_mlp": 5.12458086, + "epoch": 0.04924971142747211, + "flos": 1421179997184.0, + "grad_norm": 0.5609302024280507, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.84467912, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 1.28125, + "step": 256, + "time_per_iteration": 4.8413920402526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03162439, + "balance_loss_mlp": 3.09758925, + "epoch": 0.04944209311273567, + "flos": 1558006742016.0, + "grad_norm": 0.1723793408951341, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.8240518, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 0.6484375, + "step": 257, + "time_per_iteration": 4.985513687133789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03630928, + "balance_loss_mlp": 3.55844903, + "epoch": 0.04963447479799923, + "flos": 1570820262912.0, + "grad_norm": 0.4079591987734508, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.73606813, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 0.7265625, + "step": 258, + "time_per_iteration": 4.858096361160278 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01403117, + "balance_loss_mlp": 1.35569584, + "epoch": 0.049826856483262794, + "flos": 625349569536.0, + "grad_norm": 0.11330256318865821, + "language_loss": 0.95339322, + "learning_rate": 0.0009989706585723202, + "loss": 0.96742439, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 0.47436523, + "step": 259, + "time_per_iteration": 2.794419765472412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01437412, + "balance_loss_mlp": 1.38651013, + "epoch": 0.05001923816852635, + "flos": 504160713216.0, + "grad_norm": 0.10381773722922016, + "language_loss": 1.0219605, + "learning_rate": 0.0009989505813633442, + "loss": 1.03633475, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 0.50927734, + "step": 260, + "time_per_iteration": 2.6660099029541016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145174, + "balance_loss_mlp": 1.39776254, + "epoch": 0.05021161985378992, + "flos": 587345831424.0, + "grad_norm": 0.12909552841436595, + "language_loss": 1.02080631, + "learning_rate": 0.000998930310444573, + "loss": 1.03532374, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 0.5402832, + "step": 261, + "time_per_iteration": 2.7547266483306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01429363, + "balance_loss_mlp": 1.37698281, + "epoch": 0.05040400153905348, + "flos": 633303235584.0, + "grad_norm": 0.08616818959721087, + "language_loss": 0.99936116, + "learning_rate": 0.0009989098458238765, + "loss": 1.01365471, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 0.52441406, + "step": 262, + "time_per_iteration": 2.804656982421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01431577, + "balance_loss_mlp": 1.38310647, + "epoch": 0.050596383224317046, + "flos": 553636270080.0, + "grad_norm": 0.10103635045761167, + "language_loss": 0.99213421, + "learning_rate": 0.0009988891875091998, + "loss": 1.00644994, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 0.48486328, + "step": 263, + "time_per_iteration": 2.780696392059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01359367, + "balance_loss_mlp": 1.31771505, + "epoch": 0.050788764909580605, + "flos": 549663293952.0, + "grad_norm": 0.09437475228894394, + "language_loss": 0.93793595, + "learning_rate": 0.0009988683355085636, + "loss": 0.95152962, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 0.41625977, + "step": 264, + "time_per_iteration": 2.758275032043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01314446, + "balance_loss_mlp": 1.27684712, + "epoch": 0.05098114659484417, + "flos": 605118325248.0, + "grad_norm": 0.09784246378207673, + "language_loss": 1.02612829, + "learning_rate": 0.000998847289830063, + "loss": 1.03927279, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 0.37524414, + "step": 265, + "time_per_iteration": 2.8752288818359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01289086, + "balance_loss_mlp": 1.25468266, + "epoch": 0.05117352828010773, + "flos": 438548027904.0, + "grad_norm": 0.06973466471853282, + "language_loss": 0.95293748, + "learning_rate": 0.0009988260504818682, + "loss": 0.9658283, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 0.34423828, + "step": 266, + "time_per_iteration": 2.5960564613342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01290407, + "balance_loss_mlp": 1.2563374, + "epoch": 0.0513659099653713, + "flos": 505032910848.0, + "grad_norm": 0.0971565340820806, + "language_loss": 1.02148294, + "learning_rate": 0.000998804617472226, + "loss": 1.03438699, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 0.34082031, + "step": 267, + "time_per_iteration": 2.658709764480591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275377, + "balance_loss_mlp": 1.24085402, + "epoch": 0.05155829165063486, + "flos": 695488799232.0, + "grad_norm": 0.10761719469623075, + "language_loss": 0.96939588, + "learning_rate": 0.0009987829908094568, + "loss": 0.98214972, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 0.34545898, + "step": 268, + "time_per_iteration": 2.8270740509033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01271333, + "balance_loss_mlp": 1.23785877, + "epoch": 0.051750673335898424, + "flos": 1348260111360.0, + "grad_norm": 0.1226169977774822, + "language_loss": 1.04002702, + "learning_rate": 0.0009987611705019569, + "loss": 1.05274034, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 0.33496094, + "step": 269, + "time_per_iteration": 4.483954429626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01277218, + "balance_loss_mlp": 1.24267149, + "epoch": 0.051943055021161984, + "flos": 489607566336.0, + "grad_norm": 0.07374197309260985, + "language_loss": 1.02401245, + "learning_rate": 0.0009987391565581978, + "loss": 1.03678453, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 0.34594727, + "step": 270, + "time_per_iteration": 2.627356767654419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01304636, + "balance_loss_mlp": 1.26977956, + "epoch": 0.05213543670642555, + "flos": 545779150848.0, + "grad_norm": 0.06923057034816653, + "language_loss": 0.94496262, + "learning_rate": 0.000998716948986726, + "loss": 0.95800889, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 0.34887695, + "step": 271, + "time_per_iteration": 2.804185628890991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01322736, + "balance_loss_mlp": 1.28718746, + "epoch": 0.05232781839168911, + "flos": 603561881088.0, + "grad_norm": 0.1173780328671846, + "language_loss": 0.97372609, + "learning_rate": 0.0009986945477961633, + "loss": 0.9869535, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 0.35571289, + "step": 272, + "time_per_iteration": 2.739595890045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01297409, + "balance_loss_mlp": 1.2620039, + "epoch": 0.052520200076952676, + "flos": 538504962048.0, + "grad_norm": 0.07261359465506025, + "language_loss": 1.02136993, + "learning_rate": 0.0009986719529952066, + "loss": 1.03434396, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 0.35424805, + "step": 273, + "time_per_iteration": 2.8717877864837646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239184, + "balance_loss_mlp": 1.20389819, + "epoch": 0.052712581762216236, + "flos": 463384737792.0, + "grad_norm": 0.13624684616705834, + "language_loss": 1.01736569, + "learning_rate": 0.000998649164592628, + "loss": 1.0297575, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 0.35327148, + "step": 274, + "time_per_iteration": 2.590993642807007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206885, + "balance_loss_mlp": 1.16945291, + "epoch": 0.0529049634474798, + "flos": 548020214784.0, + "grad_norm": 0.061304815826305474, + "language_loss": 0.99439085, + "learning_rate": 0.0009986261825972748, + "loss": 1.00645971, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 0.37426758, + "step": 275, + "time_per_iteration": 2.702202081680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183431, + "balance_loss_mlp": 1.14466429, + "epoch": 0.05309734513274336, + "flos": 618021052416.0, + "grad_norm": 0.10486338408500256, + "language_loss": 1.01433325, + "learning_rate": 0.000998603007018069, + "loss": 1.02616751, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 0.38745117, + "step": 276, + "time_per_iteration": 2.876267671585083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190326, + "balance_loss_mlp": 1.15055728, + "epoch": 0.05328972681800693, + "flos": 605498996736.0, + "grad_norm": 0.08719890934761923, + "language_loss": 0.99445826, + "learning_rate": 0.0009985796378640089, + "loss": 1.00636148, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 0.39746094, + "step": 277, + "time_per_iteration": 2.74886155128479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165278, + "balance_loss_mlp": 1.12720275, + "epoch": 0.05348210850327049, + "flos": 604503088128.0, + "grad_norm": 0.06292174667602014, + "language_loss": 0.99806106, + "learning_rate": 0.0009985560751441665, + "loss": 1.00971389, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 0.38061523, + "step": 278, + "time_per_iteration": 2.8894753456115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175743, + "balance_loss_mlp": 1.13790607, + "epoch": 0.053674490188534055, + "flos": 630782816256.0, + "grad_norm": 0.06329003141341145, + "language_loss": 1.01538157, + "learning_rate": 0.00099853231886769, + "loss": 1.02713895, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 0.37792969, + "step": 279, + "time_per_iteration": 2.783085823059082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183406, + "balance_loss_mlp": 1.14633179, + "epoch": 0.053866871873797614, + "flos": 479185611264.0, + "grad_norm": 0.06545769746199957, + "language_loss": 1.01316965, + "learning_rate": 0.0009985083690438024, + "loss": 1.02500367, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 0.37084961, + "step": 280, + "time_per_iteration": 2.707329511642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147788, + "balance_loss_mlp": 1.11245418, + "epoch": 0.054059253559061174, + "flos": 788035723776.0, + "grad_norm": 0.05305898567294309, + "language_loss": 0.9175781, + "learning_rate": 0.0009984842256818016, + "loss": 0.92905599, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 0.35400391, + "step": 281, + "time_per_iteration": 3.1014201641082764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165943, + "balance_loss_mlp": 1.13106215, + "epoch": 0.05425163524432474, + "flos": 628361515008.0, + "grad_norm": 0.05782684737590577, + "language_loss": 1.02446878, + "learning_rate": 0.0009984598887910613, + "loss": 1.03612816, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 0.34912109, + "step": 282, + "time_per_iteration": 2.75343656539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180579, + "balance_loss_mlp": 1.14555514, + "epoch": 0.0544440169295883, + "flos": 615760164864.0, + "grad_norm": 0.0631633618899466, + "language_loss": 0.98333299, + "learning_rate": 0.0009984353583810297, + "loss": 0.99513876, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 0.3503418, + "step": 283, + "time_per_iteration": 2.8092565536499023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186173, + "balance_loss_mlp": 1.15350997, + "epoch": 0.05463639861485187, + "flos": 647762406912.0, + "grad_norm": 0.0821933313576245, + "language_loss": 1.00416183, + "learning_rate": 0.0009984106344612302, + "loss": 1.01602352, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 0.32666016, + "step": 284, + "time_per_iteration": 2.7632908821105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163515, + "balance_loss_mlp": 1.1310904, + "epoch": 0.054828780300115426, + "flos": 797192699904.0, + "grad_norm": 0.06349155766627652, + "language_loss": 0.95740765, + "learning_rate": 0.0009983857170412615, + "loss": 0.96904278, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 0.32421875, + "step": 285, + "time_per_iteration": 2.9946134090423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130334, + "balance_loss_mlp": 1.09912539, + "epoch": 0.05502116198537899, + "flos": 549690458112.0, + "grad_norm": 0.0487694941790178, + "language_loss": 0.95326382, + "learning_rate": 0.000998360606130798, + "loss": 0.96456718, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 0.31176758, + "step": 286, + "time_per_iteration": 2.8205370903015137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.09512836, + "balance_loss_mlp": 7.26674223, + "epoch": 0.05521354367064255, + "flos": 1407753437184.0, + "grad_norm": 0.42812971022266805, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.78585953, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 22.5, + "step": 287, + "time_per_iteration": 4.986966848373413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173658, + "balance_loss_mlp": 1.14278328, + "epoch": 0.05540592535590612, + "flos": 645420026880.0, + "grad_norm": 0.08917023960137904, + "language_loss": 1.01027536, + "learning_rate": 0.0009983098038774552, + "loss": 1.02201188, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 0.30834961, + "step": 288, + "time_per_iteration": 2.8100168704986572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.06110836, + "balance_loss_mlp": 5.25634384, + "epoch": 0.05559830704116968, + "flos": 1511095647744.0, + "grad_norm": 0.4031517895181362, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.84281063, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 8.5625, + "step": 289, + "time_per_iteration": 4.790200233459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126699, + "balance_loss_mlp": 1.23435044, + "epoch": 0.055790688726433245, + "flos": 508328980992.0, + "grad_norm": 0.18275347501036113, + "language_loss": 0.9955281, + "learning_rate": 0.0009982582277800948, + "loss": 1.00819802, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 0.32641602, + "step": 290, + "time_per_iteration": 2.5976333618164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281728, + "balance_loss_mlp": 1.24694288, + "epoch": 0.055983070411696804, + "flos": 657870501888.0, + "grad_norm": 0.14603269886404707, + "language_loss": 1.06751418, + "learning_rate": 0.0009982321495648908, + "loss": 1.08033144, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 0.34838867, + "step": 291, + "time_per_iteration": 2.8513312339782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250537, + "balance_loss_mlp": 1.21348643, + "epoch": 0.05617545209696037, + "flos": 587335919616.0, + "grad_norm": 0.09283742859778188, + "language_loss": 0.97403693, + "learning_rate": 0.0009982058779188115, + "loss": 0.98654234, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 0.37011719, + "step": 292, + "time_per_iteration": 2.728203773498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230786, + "balance_loss_mlp": 1.19170928, + "epoch": 0.05636783378222393, + "flos": 611621632512.0, + "grad_norm": 0.08826519450204054, + "language_loss": 1.05705655, + "learning_rate": 0.0009981794128520567, + "loss": 1.06936455, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 0.39038086, + "step": 293, + "time_per_iteration": 2.79616379737854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01253904, + "balance_loss_mlp": 1.21258569, + "epoch": 0.0565602154674875, + "flos": 668161405440.0, + "grad_norm": 0.08065602932127632, + "language_loss": 1.01724029, + "learning_rate": 0.000998152754374901, + "loss": 1.02977943, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 0.41333008, + "step": 294, + "time_per_iteration": 2.9352946281433105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232141, + "balance_loss_mlp": 1.19132411, + "epoch": 0.05675259715275106, + "flos": 617242830336.0, + "grad_norm": 0.07309017642696977, + "language_loss": 0.9826439, + "learning_rate": 0.0009981259024976943, + "loss": 0.99496531, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 0.40820312, + "step": 295, + "time_per_iteration": 2.7376105785369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244019, + "balance_loss_mlp": 1.20112753, + "epoch": 0.05694497883801462, + "flos": 751769040384.0, + "grad_norm": 0.07769478500482971, + "language_loss": 0.96765345, + "learning_rate": 0.0009980988572308612, + "loss": 0.9800936, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 0.42871094, + "step": 296, + "time_per_iteration": 3.001779556274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226011, + "balance_loss_mlp": 1.18197489, + "epoch": 0.05713736052327818, + "flos": 712010995200.0, + "grad_norm": 0.0588150430335769, + "language_loss": 0.99343681, + "learning_rate": 0.0009980716185849015, + "loss": 1.00569689, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 0.44067383, + "step": 297, + "time_per_iteration": 2.9817121028900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223805, + "balance_loss_mlp": 1.18153381, + "epoch": 0.05732974220854175, + "flos": 468976200192.0, + "grad_norm": 0.06400414638033543, + "language_loss": 0.95616293, + "learning_rate": 0.0009980441865703904, + "loss": 0.96840101, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 0.4230957, + "step": 298, + "time_per_iteration": 2.615875244140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122669, + "balance_loss_mlp": 1.18513405, + "epoch": 0.05752212389380531, + "flos": 601422133248.0, + "grad_norm": 0.09089975305964836, + "language_loss": 1.03662193, + "learning_rate": 0.000998016561197978, + "loss": 1.04888892, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 0.41577148, + "step": 299, + "time_per_iteration": 2.765833854675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219698, + "balance_loss_mlp": 1.17835617, + "epoch": 0.057714505579068875, + "flos": 678664852992.0, + "grad_norm": 0.05662219614280908, + "language_loss": 0.94978034, + "learning_rate": 0.0009979887424783895, + "loss": 0.96197736, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 0.41357422, + "step": 300, + "time_per_iteration": 2.8931760787963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122099, + "balance_loss_mlp": 1.17850339, + "epoch": 0.057906887264332435, + "flos": 595884999168.0, + "grad_norm": 0.05388706690809858, + "language_loss": 0.94851983, + "learning_rate": 0.0009979607304224248, + "loss": 0.96072972, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 0.42504883, + "step": 301, + "time_per_iteration": 2.719282388687134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213648, + "balance_loss_mlp": 1.16951644, + "epoch": 0.058099268949596, + "flos": 552116901888.0, + "grad_norm": 0.0564182452216587, + "language_loss": 1.02312028, + "learning_rate": 0.000997932525040959, + "loss": 1.03525686, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 0.44140625, + "step": 302, + "time_per_iteration": 2.7084572315216064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185834, + "balance_loss_mlp": 1.14165473, + "epoch": 0.05829165063485956, + "flos": 508170765312.0, + "grad_norm": 0.07525794393376325, + "language_loss": 1.04335976, + "learning_rate": 0.000997904126344943, + "loss": 1.05521822, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 0.44165039, + "step": 303, + "time_per_iteration": 2.6271631717681885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121438, + "balance_loss_mlp": 1.17055893, + "epoch": 0.05848403232012313, + "flos": 615231562752.0, + "grad_norm": 0.0664075129682053, + "language_loss": 1.00263453, + "learning_rate": 0.0009978755343454018, + "loss": 1.01477838, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 0.43823242, + "step": 304, + "time_per_iteration": 2.791146993637085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182664, + "balance_loss_mlp": 1.13869941, + "epoch": 0.05867641400538669, + "flos": 500083849728.0, + "grad_norm": 0.07350056034493838, + "language_loss": 1.01461756, + "learning_rate": 0.0009978467490534355, + "loss": 1.0264442, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 0.43969727, + "step": 305, + "time_per_iteration": 2.614455461502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186922, + "balance_loss_mlp": 1.14424467, + "epoch": 0.05886879569065025, + "flos": 531290244096.0, + "grad_norm": 0.056638515612222363, + "language_loss": 0.97774673, + "learning_rate": 0.00099781777048022, + "loss": 0.98961592, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 0.42700195, + "step": 306, + "time_per_iteration": 2.717700481414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011718, + "balance_loss_mlp": 1.12855101, + "epoch": 0.05906117737591381, + "flos": 489056569344.0, + "grad_norm": 0.056560878082468485, + "language_loss": 0.99827361, + "learning_rate": 0.0009977885986370057, + "loss": 1.00999165, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 0.43310547, + "step": 307, + "time_per_iteration": 2.557203531265259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164879, + "balance_loss_mlp": 1.12263095, + "epoch": 0.05925355906117737, + "flos": 591511527936.0, + "grad_norm": 0.05991229640473007, + "language_loss": 0.9525907, + "learning_rate": 0.000997759233535118, + "loss": 0.9642396, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 0.42285156, + "step": 308, + "time_per_iteration": 2.8033511638641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174986, + "balance_loss_mlp": 1.1345737, + "epoch": 0.05944594074644094, + "flos": 563655532032.0, + "grad_norm": 0.06710738832596337, + "language_loss": 1.01122141, + "learning_rate": 0.0009977296751859576, + "loss": 1.02297115, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 0.40405273, + "step": 309, + "time_per_iteration": 2.8259334564208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164837, + "balance_loss_mlp": 1.12487829, + "epoch": 0.0596383224317045, + "flos": 538747241472.0, + "grad_norm": 0.05223481097130428, + "language_loss": 1.03482628, + "learning_rate": 0.0009976999236009998, + "loss": 1.0464747, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 0.39941406, + "step": 310, + "time_per_iteration": 2.769092321395874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164403, + "balance_loss_mlp": 1.1263994, + "epoch": 0.059830704116968066, + "flos": 560957446656.0, + "grad_norm": 0.05685909644716586, + "language_loss": 1.04877043, + "learning_rate": 0.0009976699787917955, + "loss": 1.06041443, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 0.37963867, + "step": 311, + "time_per_iteration": 2.6526851654052734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08775091, + "balance_loss_mlp": 7.79852915, + "epoch": 0.060023085802231625, + "flos": 1570615059456.0, + "grad_norm": 0.2725707199289832, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.82218087, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 9.75, + "step": 312, + "time_per_iteration": 5.006884813308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161181, + "balance_loss_mlp": 1.12172294, + "epoch": 0.06021546748749519, + "flos": 482657149440.0, + "grad_norm": 0.06726838636277511, + "language_loss": 0.96427834, + "learning_rate": 0.0009976095095472243, + "loss": 0.97589004, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 0.39428711, + "step": 313, + "time_per_iteration": 2.5785915851593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166252, + "balance_loss_mlp": 1.12738967, + "epoch": 0.06040784917275875, + "flos": 620195304960.0, + "grad_norm": 0.0761643630364548, + "language_loss": 0.97957367, + "learning_rate": 0.0009975789851353334, + "loss": 0.99123621, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 0.38818359, + "step": 314, + "time_per_iteration": 2.814901828765869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173499, + "balance_loss_mlp": 1.13191843, + "epoch": 0.06060023085802232, + "flos": 483553939968.0, + "grad_norm": 0.07475166161853689, + "language_loss": 1.00319684, + "learning_rate": 0.0009975482675461487, + "loss": 1.0149318, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 0.41601562, + "step": 315, + "time_per_iteration": 2.65468692779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159286, + "balance_loss_mlp": 1.11591756, + "epoch": 0.06079261254328588, + "flos": 581892761088.0, + "grad_norm": 0.08252555003670439, + "language_loss": 0.98425788, + "learning_rate": 0.0009975173567915952, + "loss": 0.99585068, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 0.43383789, + "step": 316, + "time_per_iteration": 2.6916940212249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173408, + "balance_loss_mlp": 1.12767935, + "epoch": 0.060984994228549444, + "flos": 687794664960.0, + "grad_norm": 0.0640207679679256, + "language_loss": 0.91960573, + "learning_rate": 0.000997486252883674, + "loss": 0.93133986, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 0.45727539, + "step": 317, + "time_per_iteration": 2.8535635471343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188261, + "balance_loss_mlp": 1.13979006, + "epoch": 0.061177375913813004, + "flos": 1314775577088.0, + "grad_norm": 0.0671416603225842, + "language_loss": 0.97457695, + "learning_rate": 0.0009974549558344602, + "loss": 0.98645949, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 0.484375, + "step": 318, + "time_per_iteration": 3.6911113262176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189393, + "balance_loss_mlp": 1.14037383, + "epoch": 0.06136975759907657, + "flos": 574337018880.0, + "grad_norm": 0.09268216800999254, + "language_loss": 1.06808639, + "learning_rate": 0.000997423465656105, + "loss": 1.07998025, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 0.49023438, + "step": 319, + "time_per_iteration": 2.727130651473999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147465, + "balance_loss_mlp": 1.096205, + "epoch": 0.06156213928434013, + "flos": 527537152512.0, + "grad_norm": 0.06029287427116143, + "language_loss": 1.04509127, + "learning_rate": 0.0009973917823608335, + "loss": 1.05656588, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 0.51318359, + "step": 320, + "time_per_iteration": 2.654794454574585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148103, + "balance_loss_mlp": 1.09605646, + "epoch": 0.061754520969603696, + "flos": 495507746304.0, + "grad_norm": 0.03213952729051003, + "language_loss": 0.98612553, + "learning_rate": 0.0009973599059609462, + "loss": 0.99760658, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 0.52075195, + "step": 321, + "time_per_iteration": 2.7024786472320557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142475, + "balance_loss_mlp": 1.09133446, + "epoch": 0.061946902654867256, + "flos": 440079879168.0, + "grad_norm": 0.04984356389382333, + "language_loss": 0.97161096, + "learning_rate": 0.000997327836468819, + "loss": 0.9830358, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 0.51147461, + "step": 322, + "time_per_iteration": 2.6242218017578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142913, + "balance_loss_mlp": 1.0917964, + "epoch": 0.06213928434013082, + "flos": 598800397824.0, + "grad_norm": 0.06671524152363617, + "language_loss": 0.99795449, + "learning_rate": 0.000997295573896902, + "loss": 1.00938356, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 0.51171875, + "step": 323, + "time_per_iteration": 2.834237813949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03299168, + "balance_loss_mlp": 3.12445545, + "epoch": 0.06233166602539438, + "flos": 1450135789056.0, + "grad_norm": 0.43556355854402456, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.84495211, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 1.75, + "step": 324, + "time_per_iteration": 4.770992040634155 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02151431, + "balance_loss_mlp": 1.9545927, + "epoch": 0.06252404771065795, + "flos": 1463327036928.0, + "grad_norm": 0.14082611715048204, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.80723369, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 1.9609375, + "step": 325, + "time_per_iteration": 4.8816118240356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196226, + "balance_loss_mlp": 1.14768362, + "epoch": 0.06271642939592151, + "flos": 464294011392.0, + "grad_norm": 0.08367806581965369, + "language_loss": 0.93651855, + "learning_rate": 0.000997197627828043, + "loss": 0.94848073, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 0.4855957, + "step": 326, + "time_per_iteration": 2.5508148670196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215208, + "balance_loss_mlp": 1.16862106, + "epoch": 0.06290881108118507, + "flos": 532374985728.0, + "grad_norm": 0.06635735350324974, + "language_loss": 0.89348811, + "learning_rate": 0.0009971645930629716, + "loss": 0.90564024, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 0.46533203, + "step": 327, + "time_per_iteration": 2.711386203765869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125047, + "balance_loss_mlp": 1.20192814, + "epoch": 0.06310119276644863, + "flos": 673562718720.0, + "grad_norm": 0.08863859510008423, + "language_loss": 1.03147936, + "learning_rate": 0.0009971313652814872, + "loss": 1.04398406, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 0.48486328, + "step": 328, + "time_per_iteration": 2.8484854698181152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225004, + "balance_loss_mlp": 1.17553234, + "epoch": 0.0632935744517122, + "flos": 770732734464.0, + "grad_norm": 0.08503417282278386, + "language_loss": 1.0059731, + "learning_rate": 0.0009970979444964903, + "loss": 1.01822317, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 0.49487305, + "step": 329, + "time_per_iteration": 2.957482099533081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197604, + "balance_loss_mlp": 1.14846587, + "epoch": 0.06348595613697576, + "flos": 561913708032.0, + "grad_norm": 0.06790724972181753, + "language_loss": 1.01849604, + "learning_rate": 0.0009970643307209556, + "loss": 1.03047216, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 0.49121094, + "step": 330, + "time_per_iteration": 2.8220374584198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170349, + "balance_loss_mlp": 1.1215446, + "epoch": 0.06367833782223932, + "flos": 676189223424.0, + "grad_norm": 0.06721894230078661, + "language_loss": 0.98097444, + "learning_rate": 0.0009970305239679334, + "loss": 0.99267793, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 0.48803711, + "step": 331, + "time_per_iteration": 2.8813369274139404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176679, + "balance_loss_mlp": 1.12754059, + "epoch": 0.06387071950750288, + "flos": 495297773568.0, + "grad_norm": 0.056286161373139375, + "language_loss": 1.03013992, + "learning_rate": 0.0009969965242505483, + "loss": 1.04190671, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 0.4909668, + "step": 332, + "time_per_iteration": 2.6662604808807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168774, + "balance_loss_mlp": 1.11932611, + "epoch": 0.06406310119276645, + "flos": 533447244288.0, + "grad_norm": 0.06031850484613652, + "language_loss": 0.99096131, + "learning_rate": 0.0009969623315820007, + "loss": 1.00264907, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 0.49487305, + "step": 333, + "time_per_iteration": 2.6671581268310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153261, + "balance_loss_mlp": 1.10619712, + "epoch": 0.06425548287803001, + "flos": 456184700928.0, + "grad_norm": 0.06229524640691676, + "language_loss": 0.99215055, + "learning_rate": 0.000996927945975565, + "loss": 1.00368309, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 0.47070312, + "step": 334, + "time_per_iteration": 2.568838357925415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115937, + "balance_loss_mlp": 1.1125921, + "epoch": 0.06444786456329357, + "flos": 560077908480.0, + "grad_norm": 0.05620099657237302, + "language_loss": 0.95852566, + "learning_rate": 0.0009968933674445906, + "loss": 0.97011936, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 0.46728516, + "step": 335, + "time_per_iteration": 2.6725666522979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160514, + "balance_loss_mlp": 1.1122818, + "epoch": 0.06464024624855713, + "flos": 666085897728.0, + "grad_norm": 0.05589062806096766, + "language_loss": 0.97974062, + "learning_rate": 0.0009968585960025028, + "loss": 0.99134576, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 0.48217773, + "step": 336, + "time_per_iteration": 2.945194959640503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0396516, + "balance_loss_mlp": 3.85834861, + "epoch": 0.0648326279338207, + "flos": 1521371870208.0, + "grad_norm": 0.42886267506062575, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.81618351, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 1.0703125, + "step": 337, + "time_per_iteration": 4.802944183349609 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215082, + "balance_loss_mlp": 1.16968668, + "epoch": 0.06502500961908426, + "flos": 1143339909120.0, + "grad_norm": 0.09324534870618859, + "language_loss": 0.96021777, + "learning_rate": 0.0009967884744390583, + "loss": 0.9723686, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 0.45361328, + "step": 338, + "time_per_iteration": 3.5247950553894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251582, + "balance_loss_mlp": 1.2060678, + "epoch": 0.06521739130434782, + "flos": 582609314304.0, + "grad_norm": 0.09123718626917265, + "language_loss": 0.97373873, + "learning_rate": 0.0009967531243449256, + "loss": 0.98625457, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 0.45507812, + "step": 339, + "time_per_iteration": 2.681973695755005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211309, + "balance_loss_mlp": 1.163077, + "epoch": 0.06540977298961138, + "flos": 497650065408.0, + "grad_norm": 0.06030156589334856, + "language_loss": 1.04525125, + "learning_rate": 0.000996717581394126, + "loss": 1.05736434, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 0.48242188, + "step": 340, + "time_per_iteration": 2.6031126976013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205107, + "balance_loss_mlp": 1.15630233, + "epoch": 0.06560215467487496, + "flos": 542871092736.0, + "grad_norm": 0.06934362388274598, + "language_loss": 1.05133414, + "learning_rate": 0.000996681845600459, + "loss": 1.06338525, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 0.48803711, + "step": 341, + "time_per_iteration": 2.6689491271972656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190009, + "balance_loss_mlp": 1.1402986, + "epoch": 0.06579453636013852, + "flos": 413454357504.0, + "grad_norm": 0.07929020766121274, + "language_loss": 0.97276402, + "learning_rate": 0.0009966459169777982, + "loss": 0.98466408, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 0.49731445, + "step": 342, + "time_per_iteration": 2.5235347747802734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183772, + "balance_loss_mlp": 1.13444376, + "epoch": 0.06598691804540208, + "flos": 560618993664.0, + "grad_norm": 0.06503113555429127, + "language_loss": 1.05431008, + "learning_rate": 0.0009966097955400924, + "loss": 1.0661478, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 0.4934082, + "step": 343, + "time_per_iteration": 2.6987814903259277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195626, + "balance_loss_mlp": 1.14772749, + "epoch": 0.06617929973066564, + "flos": 572090812416.0, + "grad_norm": 0.05810753199069879, + "language_loss": 0.99792945, + "learning_rate": 0.0009965734813013652, + "loss": 1.00988579, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 0.47924805, + "step": 344, + "time_per_iteration": 2.8092823028564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211149, + "balance_loss_mlp": 1.16191518, + "epoch": 0.06637168141592921, + "flos": 490479763968.0, + "grad_norm": 0.08606224500635251, + "language_loss": 1.02011895, + "learning_rate": 0.0009965369742757151, + "loss": 1.03223062, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 0.49243164, + "step": 345, + "time_per_iteration": 2.5981764793395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193116, + "balance_loss_mlp": 1.14435959, + "epoch": 0.06656406310119277, + "flos": 1079194834944.0, + "grad_norm": 0.0619511290056959, + "language_loss": 0.98293203, + "learning_rate": 0.0009965002744773152, + "loss": 0.99486327, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 0.48730469, + "step": 346, + "time_per_iteration": 3.4968950748443604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178364, + "balance_loss_mlp": 1.13115668, + "epoch": 0.06675644478645633, + "flos": 513680735232.0, + "grad_norm": 0.04856723246232052, + "language_loss": 0.95658922, + "learning_rate": 0.0009964633819204139, + "loss": 0.96837282, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 0.47167969, + "step": 347, + "time_per_iteration": 2.6705336570739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04576048, + "balance_loss_mlp": 4.3029151, + "epoch": 0.06694882647171989, + "flos": 1447192479744.0, + "grad_norm": 0.32603271390487504, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.86377156, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 2.734375, + "step": 348, + "time_per_iteration": 4.961863994598389 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03789769, + "balance_loss_mlp": 3.60590124, + "epoch": 0.06714120815698346, + "flos": 1552061772288.0, + "grad_norm": 0.16497869204612428, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.78943658, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 1.8359375, + "step": 349, + "time_per_iteration": 4.876751184463501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181375, + "balance_loss_mlp": 1.13578987, + "epoch": 0.06733358984224702, + "flos": 880073869824.0, + "grad_norm": 0.07770510755269132, + "language_loss": 0.96067584, + "learning_rate": 0.000996351547842304, + "loss": 0.9724896, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 0.45581055, + "step": 350, + "time_per_iteration": 3.166680097579956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217287, + "balance_loss_mlp": 1.16969919, + "epoch": 0.06752597152751058, + "flos": 518906580480.0, + "grad_norm": 0.06167835917893234, + "language_loss": 0.94333142, + "learning_rate": 0.0009963138843953744, + "loss": 0.9555043, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 0.47558594, + "step": 351, + "time_per_iteration": 2.5784904956817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122803, + "balance_loss_mlp": 1.18005991, + "epoch": 0.06771835321277414, + "flos": 539668624896.0, + "grad_norm": 0.06188972934791396, + "language_loss": 0.98543227, + "learning_rate": 0.000996276028262306, + "loss": 0.99771261, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 0.47924805, + "step": 352, + "time_per_iteration": 2.7985076904296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216963, + "balance_loss_mlp": 1.16760993, + "epoch": 0.0679107348980377, + "flos": 460666828800.0, + "grad_norm": 0.0659402302829914, + "language_loss": 1.04801619, + "learning_rate": 0.0009962379794577964, + "loss": 1.06018579, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 0.49365234, + "step": 353, + "time_per_iteration": 2.608032703399658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123128, + "balance_loss_mlp": 1.18266606, + "epoch": 0.06810311658330127, + "flos": 635922026496.0, + "grad_norm": 0.051231802586423875, + "language_loss": 0.94352609, + "learning_rate": 0.000996199737996617, + "loss": 0.95583886, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 0.48657227, + "step": 354, + "time_per_iteration": 2.903005599975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227436, + "balance_loss_mlp": 1.17770219, + "epoch": 0.06829549826856483, + "flos": 464679452160.0, + "grad_norm": 0.05676190931504088, + "language_loss": 1.03759205, + "learning_rate": 0.0009961613038936149, + "loss": 1.04986644, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 0.49755859, + "step": 355, + "time_per_iteration": 2.617859125137329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216553, + "balance_loss_mlp": 1.16572189, + "epoch": 0.06848787995382839, + "flos": 634647135744.0, + "grad_norm": 0.04878484453506707, + "language_loss": 0.95482612, + "learning_rate": 0.000996122677163711, + "loss": 0.96699166, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 0.50878906, + "step": 356, + "time_per_iteration": 2.8171308040618896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230039, + "balance_loss_mlp": 1.18037653, + "epoch": 0.06868026163909195, + "flos": 806374268928.0, + "grad_norm": 0.06504242786199886, + "language_loss": 1.01527905, + "learning_rate": 0.000996083857821902, + "loss": 1.02757955, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 0.49682617, + "step": 357, + "time_per_iteration": 3.0562636852264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221322, + "balance_loss_mlp": 1.17237508, + "epoch": 0.06887264332435553, + "flos": 439227505152.0, + "grad_norm": 0.043415107047687695, + "language_loss": 0.99947309, + "learning_rate": 0.0009960448458832588, + "loss": 1.01168633, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 0.48925781, + "step": 358, + "time_per_iteration": 2.6778266429901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224961, + "balance_loss_mlp": 1.17675292, + "epoch": 0.06906502500961909, + "flos": 484767161856.0, + "grad_norm": 0.061398357107108094, + "language_loss": 0.99686754, + "learning_rate": 0.000996005641362927, + "loss": 1.00911713, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 0.48193359, + "step": 359, + "time_per_iteration": 2.5839953422546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218039, + "balance_loss_mlp": 1.16792321, + "epoch": 0.06925740669488265, + "flos": 733611105792.0, + "grad_norm": 0.045504813624839685, + "language_loss": 1.02907789, + "learning_rate": 0.0009959662442761274, + "loss": 1.04125834, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 0.5012207, + "step": 360, + "time_per_iteration": 2.9012227058410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225991, + "balance_loss_mlp": 1.17504108, + "epoch": 0.0694497883801462, + "flos": 552415707648.0, + "grad_norm": 0.05242893208235044, + "language_loss": 0.96392268, + "learning_rate": 0.000995926654638155, + "loss": 0.97618258, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 0.50976562, + "step": 361, + "time_per_iteration": 2.7972850799560547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120421, + "balance_loss_mlp": 1.15323579, + "epoch": 0.06964217006540978, + "flos": 678015111168.0, + "grad_norm": 0.0452718414118582, + "language_loss": 0.98678619, + "learning_rate": 0.00099588687246438, + "loss": 0.99882829, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 0.51025391, + "step": 362, + "time_per_iteration": 2.845742702484131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011953, + "balance_loss_mlp": 1.14241886, + "epoch": 0.06983455175067334, + "flos": 524241082368.0, + "grad_norm": 0.06654716127982052, + "language_loss": 1.06146324, + "learning_rate": 0.0009958468977702471, + "loss": 1.07341623, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 0.52978516, + "step": 363, + "time_per_iteration": 2.5876591205596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.05386722, + "balance_loss_mlp": 5.09527922, + "epoch": 0.0700269334359369, + "flos": 1576787254272.0, + "grad_norm": 0.35536528906135745, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.85121429, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 2.921875, + "step": 364, + "time_per_iteration": 4.7958595752716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183221, + "balance_loss_mlp": 1.12800324, + "epoch": 0.07021931512120046, + "flos": 1013248839168.0, + "grad_norm": 0.06493728064972926, + "language_loss": 0.94085538, + "learning_rate": 0.0009957663708830612, + "loss": 0.95268762, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 0.55273438, + "step": 365, + "time_per_iteration": 3.238919258117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188034, + "balance_loss_mlp": 1.13048029, + "epoch": 0.07041169680646403, + "flos": 822983099904.0, + "grad_norm": 0.06418297657416602, + "language_loss": 0.98210049, + "learning_rate": 0.0009957258187212714, + "loss": 0.99398077, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 0.57470703, + "step": 366, + "time_per_iteration": 3.0337131023406982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0292345, + "balance_loss_mlp": 2.78612089, + "epoch": 0.07060407849172759, + "flos": 1414392938496.0, + "grad_norm": 0.09868001986151984, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.82118309, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 1.375, + "step": 367, + "time_per_iteration": 4.825684070587158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118916, + "balance_loss_mlp": 1.12988925, + "epoch": 0.07079646017699115, + "flos": 512909853696.0, + "grad_norm": 0.06345017711900697, + "language_loss": 0.94456601, + "learning_rate": 0.0009956441370400167, + "loss": 0.95645761, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 0.59179688, + "step": 368, + "time_per_iteration": 2.6685595512390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203671, + "balance_loss_mlp": 1.14411354, + "epoch": 0.07098884186225471, + "flos": 540501548544.0, + "grad_norm": 0.07550644934377632, + "language_loss": 1.00098681, + "learning_rate": 0.0009956030075522636, + "loss": 1.0130235, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 0.59472656, + "step": 369, + "time_per_iteration": 2.7824065685272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185298, + "balance_loss_mlp": 1.12555027, + "epoch": 0.07118122354751828, + "flos": 548682439680.0, + "grad_norm": 0.0634963537383221, + "language_loss": 1.00245738, + "learning_rate": 0.0009955616856543587, + "loss": 1.01431036, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 0.59667969, + "step": 370, + "time_per_iteration": 2.6869115829467773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117739, + "balance_loss_mlp": 1.11649847, + "epoch": 0.07137360523278184, + "flos": 620904517632.0, + "grad_norm": 0.04749901473855408, + "language_loss": 0.92605507, + "learning_rate": 0.0009955201713623448, + "loss": 0.93782902, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 0.60791016, + "step": 371, + "time_per_iteration": 2.7894065380096436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03553003, + "balance_loss_mlp": 3.34700894, + "epoch": 0.0715659869180454, + "flos": 1502672477184.0, + "grad_norm": 0.1539254818196356, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.80225718, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 2.0625, + "step": 372, + "time_per_iteration": 5.025646924972534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188763, + "balance_loss_mlp": 1.12739396, + "epoch": 0.07175836860330896, + "flos": 495493065216.0, + "grad_norm": 0.05697389015463885, + "language_loss": 1.05361807, + "learning_rate": 0.0009954365656605333, + "loss": 1.06550562, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 0.61328125, + "step": 373, + "time_per_iteration": 2.5767741203308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203971, + "balance_loss_mlp": 1.13878703, + "epoch": 0.07195075028857253, + "flos": 785725650432.0, + "grad_norm": 0.0561234241567743, + "language_loss": 0.98981488, + "learning_rate": 0.0009953944742831947, + "loss": 1.00185454, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 0.65185547, + "step": 374, + "time_per_iteration": 3.0126912593841553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209318, + "balance_loss_mlp": 1.14351439, + "epoch": 0.0721431319738361, + "flos": 593107619328.0, + "grad_norm": 0.05197007853134015, + "language_loss": 1.02623391, + "learning_rate": 0.0009953521905766642, + "loss": 1.0383271, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 0.65820312, + "step": 375, + "time_per_iteration": 2.9678027629852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207965, + "balance_loss_mlp": 1.14464104, + "epoch": 0.07233551365909965, + "flos": 548250011136.0, + "grad_norm": 0.05250799377029981, + "language_loss": 1.01212132, + "learning_rate": 0.0009953097145573577, + "loss": 1.02420104, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 0.6328125, + "step": 376, + "time_per_iteration": 2.7048561573028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121329, + "balance_loss_mlp": 1.1502521, + "epoch": 0.07252789534436321, + "flos": 957568780800.0, + "grad_norm": 0.050651846587156886, + "language_loss": 0.98499894, + "learning_rate": 0.000995267046241766, + "loss": 0.99713182, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 0.62988281, + "step": 377, + "time_per_iteration": 3.287705421447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225924, + "balance_loss_mlp": 1.16341114, + "epoch": 0.07272027702962677, + "flos": 507649503744.0, + "grad_norm": 0.05776369312695448, + "language_loss": 0.98701203, + "learning_rate": 0.0009952241856464547, + "loss": 0.99927127, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 0.62451172, + "step": 378, + "time_per_iteration": 2.5897629261016846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220563, + "balance_loss_mlp": 1.16010034, + "epoch": 0.07291265871489035, + "flos": 612412337664.0, + "grad_norm": 0.05450855675542614, + "language_loss": 1.05642247, + "learning_rate": 0.0009951811327880632, + "loss": 1.06862807, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 0.60351562, + "step": 379, + "time_per_iteration": 2.7320594787597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220943, + "balance_loss_mlp": 1.15924072, + "epoch": 0.0731050404001539, + "flos": 495750025728.0, + "grad_norm": 0.04947645913164449, + "language_loss": 0.99005401, + "learning_rate": 0.0009951378876833063, + "loss": 1.00226343, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 0.61669922, + "step": 380, + "time_per_iteration": 2.595810651779175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196634, + "balance_loss_mlp": 1.13798296, + "epoch": 0.07329742208541747, + "flos": 639966956544.0, + "grad_norm": 0.058807068798268386, + "language_loss": 1.05567527, + "learning_rate": 0.0009950944503489736, + "loss": 1.06764162, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 0.5859375, + "step": 381, + "time_per_iteration": 2.733560562133789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197608, + "balance_loss_mlp": 1.13914812, + "epoch": 0.07348980377068103, + "flos": 816346543104.0, + "grad_norm": 0.06747680453051412, + "language_loss": 0.99337935, + "learning_rate": 0.0009950508208019285, + "loss": 1.00535548, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 0.58398438, + "step": 382, + "time_per_iteration": 2.9895970821380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176507, + "balance_loss_mlp": 1.12062192, + "epoch": 0.0736821854559446, + "flos": 508640269824.0, + "grad_norm": 0.05827239016363537, + "language_loss": 1.03707182, + "learning_rate": 0.0009950069990591096, + "loss": 1.04883695, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 0.55908203, + "step": 383, + "time_per_iteration": 2.6856980323791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.05393736, + "balance_loss_mlp": 5.19079447, + "epoch": 0.07387456714120816, + "flos": 1554648629760.0, + "grad_norm": 0.38241300139143997, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.81795102, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 2.03125, + "step": 384, + "time_per_iteration": 4.860661268234253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128436, + "balance_loss_mlp": 1.07369518, + "epoch": 0.07406694882647172, + "flos": 525503490048.0, + "grad_norm": 0.06005395599718801, + "language_loss": 0.96679938, + "learning_rate": 0.0009949187790542777, + "loss": 0.97808379, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 0.54760742, + "step": 385, + "time_per_iteration": 2.7245922088623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146737, + "balance_loss_mlp": 1.09042215, + "epoch": 0.07425933051173528, + "flos": 497738898432.0, + "grad_norm": 0.06780842756482337, + "language_loss": 0.9270733, + "learning_rate": 0.0009948743808265148, + "loss": 0.93854064, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 0.56298828, + "step": 386, + "time_per_iteration": 2.6745331287384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187036, + "balance_loss_mlp": 1.13334417, + "epoch": 0.07445171219699885, + "flos": 505003175424.0, + "grad_norm": 0.04295711334598506, + "language_loss": 1.02854586, + "learning_rate": 0.0009948297904714782, + "loss": 1.04041624, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 0.53759766, + "step": 387, + "time_per_iteration": 2.681718111038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202671, + "balance_loss_mlp": 1.15167296, + "epoch": 0.07464409388226241, + "flos": 553977294336.0, + "grad_norm": 0.05564614333293379, + "language_loss": 0.94366896, + "learning_rate": 0.0009947850080064796, + "loss": 0.95569569, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 0.51049805, + "step": 388, + "time_per_iteration": 2.788663148880005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216483, + "balance_loss_mlp": 1.16817975, + "epoch": 0.07483647556752597, + "flos": 776862710784.0, + "grad_norm": 0.07112384111458, + "language_loss": 0.99713415, + "learning_rate": 0.0009947400334489047, + "loss": 1.00929892, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 0.48291016, + "step": 389, + "time_per_iteration": 2.9905049800872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227498, + "balance_loss_mlp": 1.17926562, + "epoch": 0.07502885725278953, + "flos": 612540817920.0, + "grad_norm": 0.06900212518032732, + "language_loss": 0.91264081, + "learning_rate": 0.0009946948668162145, + "loss": 0.92491579, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 0.48168945, + "step": 390, + "time_per_iteration": 2.767531394958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012247, + "balance_loss_mlp": 1.17277205, + "epoch": 0.0752212389380531, + "flos": 688629786624.0, + "grad_norm": 0.052104168644034804, + "language_loss": 0.95126128, + "learning_rate": 0.0009946495081259441, + "loss": 0.96350825, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 0.52001953, + "step": 391, + "time_per_iteration": 2.816908597946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192311, + "balance_loss_mlp": 1.14057434, + "epoch": 0.07541362062331666, + "flos": 765699609600.0, + "grad_norm": 0.051504782312047234, + "language_loss": 0.99421549, + "learning_rate": 0.0009946039573957035, + "loss": 1.00613856, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 0.51782227, + "step": 392, + "time_per_iteration": 2.9265222549438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116666, + "balance_loss_mlp": 1.11478019, + "epoch": 0.07560600230858022, + "flos": 588749202432.0, + "grad_norm": 0.055053573084277836, + "language_loss": 0.95799196, + "learning_rate": 0.000994558214643177, + "loss": 0.96965855, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 0.51928711, + "step": 393, + "time_per_iteration": 2.766477584838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165121, + "balance_loss_mlp": 1.11352682, + "epoch": 0.07579838399384378, + "flos": 749834496000.0, + "grad_norm": 0.05925711706254076, + "language_loss": 0.97585773, + "learning_rate": 0.000994512279886123, + "loss": 0.98750889, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 0.51660156, + "step": 394, + "time_per_iteration": 3.0709142684936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143606, + "balance_loss_mlp": 1.09191656, + "epoch": 0.07599076567910736, + "flos": 523457717760.0, + "grad_norm": 0.04191079383555719, + "language_loss": 0.97239089, + "learning_rate": 0.0009944661531423758, + "loss": 0.98382699, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 0.51757812, + "step": 395, + "time_per_iteration": 2.7044599056243896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134219, + "balance_loss_mlp": 1.08338809, + "epoch": 0.07618314736437092, + "flos": 551086488576.0, + "grad_norm": 0.05545815376917658, + "language_loss": 0.96390671, + "learning_rate": 0.000994419834429843, + "loss": 0.97524893, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 0.50854492, + "step": 396, + "time_per_iteration": 2.6767609119415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135922, + "balance_loss_mlp": 1.08525789, + "epoch": 0.07637552904963447, + "flos": 698206708224.0, + "grad_norm": 0.05307630449121137, + "language_loss": 1.01208472, + "learning_rate": 0.0009943733237665069, + "loss": 1.02344394, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 0.50683594, + "step": 397, + "time_per_iteration": 2.819148302078247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124426, + "balance_loss_mlp": 1.07502615, + "epoch": 0.07656791073489803, + "flos": 579379682304.0, + "grad_norm": 0.049844903289807924, + "language_loss": 0.99488425, + "learning_rate": 0.0009943266211704248, + "loss": 1.00612843, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 0.49389648, + "step": 398, + "time_per_iteration": 2.9555482864379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125466, + "balance_loss_mlp": 1.07675719, + "epoch": 0.0767602924201616, + "flos": 417145780224.0, + "grad_norm": 0.05620775813161816, + "language_loss": 1.01430082, + "learning_rate": 0.000994279726659728, + "loss": 1.02555549, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 0.48706055, + "step": 399, + "time_per_iteration": 2.5138003826141357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127137, + "balance_loss_mlp": 1.07761765, + "epoch": 0.07695267410542517, + "flos": 482914109952.0, + "grad_norm": 0.05674792404596756, + "language_loss": 0.99883693, + "learning_rate": 0.0009942326402526231, + "loss": 1.01010823, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 0.49511719, + "step": 400, + "time_per_iteration": 2.5245604515075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127721, + "balance_loss_mlp": 1.07793891, + "epoch": 0.07714505579068873, + "flos": 530998778880.0, + "grad_norm": 0.036646942736225624, + "language_loss": 0.9767518, + "learning_rate": 0.0009941853619673902, + "loss": 0.98802906, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 0.49804688, + "step": 401, + "time_per_iteration": 2.644771099090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123434, + "balance_loss_mlp": 1.07451057, + "epoch": 0.07733743747595229, + "flos": 804995490816.0, + "grad_norm": 0.057554732491620374, + "language_loss": 1.01884329, + "learning_rate": 0.0009941378918223844, + "loss": 1.0300777, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 0.48876953, + "step": 402, + "time_per_iteration": 3.051617383956909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112416, + "balance_loss_mlp": 1.07618988, + "epoch": 0.07752981916121585, + "flos": 622476016128.0, + "grad_norm": 0.04510164642433069, + "language_loss": 0.94372368, + "learning_rate": 0.0009940902298360354, + "loss": 0.95496523, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 0.47924805, + "step": 403, + "time_per_iteration": 2.7302582263946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118279, + "balance_loss_mlp": 1.0687592, + "epoch": 0.07772220084647942, + "flos": 728276603904.0, + "grad_norm": 0.062376946911402976, + "language_loss": 1.04687834, + "learning_rate": 0.0009940423760268473, + "loss": 1.05806112, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 0.49536133, + "step": 404, + "time_per_iteration": 2.856938600540161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118682, + "balance_loss_mlp": 1.07009196, + "epoch": 0.07791458253174298, + "flos": 555412972032.0, + "grad_norm": 0.046838991637930295, + "language_loss": 0.97888398, + "learning_rate": 0.0009939943304133982, + "loss": 0.99007082, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 0.48608398, + "step": 405, + "time_per_iteration": 2.6161091327667236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115161, + "balance_loss_mlp": 1.06881261, + "epoch": 0.07810696421700654, + "flos": 553181819904.0, + "grad_norm": 0.04496148345425058, + "language_loss": 1.04081011, + "learning_rate": 0.0009939460930143416, + "loss": 1.0519619, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 0.46337891, + "step": 406, + "time_per_iteration": 2.6310677528381348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119218, + "balance_loss_mlp": 1.07332289, + "epoch": 0.0782993459022701, + "flos": 650633389056.0, + "grad_norm": 0.037201804651944344, + "language_loss": 0.98071587, + "learning_rate": 0.0009938976638484043, + "loss": 0.99190807, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 0.45874023, + "step": 407, + "time_per_iteration": 2.8977036476135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112887, + "balance_loss_mlp": 1.06844616, + "epoch": 0.07849172758753367, + "flos": 496172542464.0, + "grad_norm": 0.04629061554837057, + "language_loss": 0.97991359, + "learning_rate": 0.0009938490429343887, + "loss": 0.99104249, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 0.44458008, + "step": 408, + "time_per_iteration": 2.562168836593628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111855, + "balance_loss_mlp": 1.07315516, + "epoch": 0.07868410927279723, + "flos": 577971542016.0, + "grad_norm": 0.04004461216150975, + "language_loss": 0.97974342, + "learning_rate": 0.0009938002302911709, + "loss": 0.99092889, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 0.4543457, + "step": 409, + "time_per_iteration": 2.738518238067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123547, + "balance_loss_mlp": 1.07915401, + "epoch": 0.07887649095806079, + "flos": 522970960896.0, + "grad_norm": 0.07048914756312923, + "language_loss": 1.00401747, + "learning_rate": 0.0009937512259377015, + "loss": 1.01525307, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 0.44384766, + "step": 410, + "time_per_iteration": 2.670149564743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110678, + "balance_loss_mlp": 1.0668565, + "epoch": 0.07906887264332435, + "flos": 557253540864.0, + "grad_norm": 0.049646402233970426, + "language_loss": 0.99659574, + "learning_rate": 0.000993702029893006, + "loss": 1.00770259, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 0.4387207, + "step": 411, + "time_per_iteration": 2.7853777408599854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118473, + "balance_loss_mlp": 1.07200527, + "epoch": 0.07926125432858792, + "flos": 821984993280.0, + "grad_norm": 0.04880092350488667, + "language_loss": 0.98862529, + "learning_rate": 0.0009936526421761838, + "loss": 0.99981004, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 0.46435547, + "step": 412, + "time_per_iteration": 3.030674457550049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114951, + "balance_loss_mlp": 1.07043815, + "epoch": 0.07945363601385148, + "flos": 562336224768.0, + "grad_norm": 0.04383720282943398, + "language_loss": 1.01490402, + "learning_rate": 0.000993603062806409, + "loss": 1.02605367, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 0.4453125, + "step": 413, + "time_per_iteration": 2.7101500034332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109682, + "balance_loss_mlp": 1.0637151, + "epoch": 0.07964601769911504, + "flos": 517868826624.0, + "grad_norm": 0.046157231925668944, + "language_loss": 1.04664707, + "learning_rate": 0.0009935532918029298, + "loss": 1.05774391, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 0.45947266, + "step": 414, + "time_per_iteration": 2.593390941619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118947, + "balance_loss_mlp": 1.07278943, + "epoch": 0.0798383993843786, + "flos": 539224086528.0, + "grad_norm": 0.058468816323775735, + "language_loss": 0.97956645, + "learning_rate": 0.0009935033291850694, + "loss": 0.99075592, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 0.46166992, + "step": 415, + "time_per_iteration": 2.6693851947784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111466, + "balance_loss_mlp": 1.0654031, + "epoch": 0.08003078106964218, + "flos": 485145262080.0, + "grad_norm": 0.061030352209764355, + "language_loss": 1.00225627, + "learning_rate": 0.0009934531749722247, + "loss": 1.01337099, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 0.46044922, + "step": 416, + "time_per_iteration": 2.578746795654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119366, + "balance_loss_mlp": 1.07337523, + "epoch": 0.08022316275490574, + "flos": 518254267392.0, + "grad_norm": 0.05071064772829009, + "language_loss": 0.98778659, + "learning_rate": 0.0009934028291838672, + "loss": 0.99898028, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 0.45996094, + "step": 417, + "time_per_iteration": 2.7096333503723145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106202, + "balance_loss_mlp": 1.06166553, + "epoch": 0.0804155444401693, + "flos": 494012971008.0, + "grad_norm": 0.045680808340910005, + "language_loss": 0.94326293, + "learning_rate": 0.0009933522918395433, + "loss": 0.95432496, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 0.44555664, + "step": 418, + "time_per_iteration": 2.644414186477661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04959176, + "balance_loss_mlp": 4.71808767, + "epoch": 0.08060792612543285, + "flos": 1581422455296.0, + "grad_norm": 0.3214703434406663, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.83210278, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 2.40625, + "step": 419, + "time_per_iteration": 4.868964195251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115626, + "balance_loss_mlp": 1.07108891, + "epoch": 0.08080030781069643, + "flos": 525343076352.0, + "grad_norm": 0.08060687528614664, + "language_loss": 1.13036489, + "learning_rate": 0.000993250642561551, + "loss": 1.14152122, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 0.4453125, + "step": 420, + "time_per_iteration": 2.632162094116211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121548, + "balance_loss_mlp": 1.07538986, + "epoch": 0.08099268949595999, + "flos": 546718159872.0, + "grad_norm": 0.08633853635548816, + "language_loss": 0.9784801, + "learning_rate": 0.0009931995306673466, + "loss": 0.98969555, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 0.46118164, + "step": 421, + "time_per_iteration": 2.7046737670898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134412, + "balance_loss_mlp": 1.08815861, + "epoch": 0.08118507118122355, + "flos": 510367412736.0, + "grad_norm": 0.038770411105538145, + "language_loss": 1.03907061, + "learning_rate": 0.000993148227296103, + "loss": 1.05041468, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 0.4621582, + "step": 422, + "time_per_iteration": 2.669496536254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133253, + "balance_loss_mlp": 1.08707166, + "epoch": 0.08137745286648711, + "flos": 720671302656.0, + "grad_norm": 0.053095831055692516, + "language_loss": 0.9112367, + "learning_rate": 0.000993096732467738, + "loss": 0.92256927, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 0.46166992, + "step": 423, + "time_per_iteration": 2.961660861968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150855, + "balance_loss_mlp": 1.10498345, + "epoch": 0.08156983455175067, + "flos": 679613773824.0, + "grad_norm": 0.08137036582560589, + "language_loss": 0.99760056, + "learning_rate": 0.0009930450462022435, + "loss": 1.00910902, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 0.45874023, + "step": 424, + "time_per_iteration": 2.7952311038970947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03600409, + "balance_loss_mlp": 3.48901963, + "epoch": 0.08176221623701424, + "flos": 1453377157632.0, + "grad_norm": 0.18349806711668631, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.82790214, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 1.1171875, + "step": 425, + "time_per_iteration": 4.8854875564575195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159177, + "balance_loss_mlp": 1.11344862, + "epoch": 0.0819545979222778, + "flos": 1556602292736.0, + "grad_norm": 0.06491953183218531, + "language_loss": 0.9776966, + "learning_rate": 0.0009929410994402065, + "loss": 0.98928833, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 0.45703125, + "step": 426, + "time_per_iteration": 4.275091886520386 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169515, + "balance_loss_mlp": 1.12223697, + "epoch": 0.08214697960754136, + "flos": 512724473856.0, + "grad_norm": 0.07437504582125473, + "language_loss": 1.02033544, + "learning_rate": 0.0009928888389840196, + "loss": 1.03203058, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 0.47241211, + "step": 427, + "time_per_iteration": 2.7036454677581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145234, + "balance_loss_mlp": 1.09941018, + "epoch": 0.08233936129280492, + "flos": 595124029440.0, + "grad_norm": 0.05964472172349544, + "language_loss": 1.03706717, + "learning_rate": 0.0009928363871714147, + "loss": 1.04851961, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 0.45849609, + "step": 428, + "time_per_iteration": 2.6669116020202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115004, + "balance_loss_mlp": 1.10254741, + "epoch": 0.08253174297806849, + "flos": 572039055360.0, + "grad_norm": 0.07530468467255677, + "language_loss": 0.97491598, + "learning_rate": 0.0009927837440227556, + "loss": 0.98641634, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 0.47485352, + "step": 429, + "time_per_iteration": 2.8463807106018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120703, + "balance_loss_mlp": 1.07588065, + "epoch": 0.08272412466333205, + "flos": 623380147200.0, + "grad_norm": 0.04140843961960757, + "language_loss": 0.92054397, + "learning_rate": 0.0009927309095584798, + "loss": 0.93175101, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 0.44824219, + "step": 430, + "time_per_iteration": 2.9767606258392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116415, + "balance_loss_mlp": 1.07278419, + "epoch": 0.08291650634859561, + "flos": 513994595328.0, + "grad_norm": 0.04726827868993605, + "language_loss": 1.04780793, + "learning_rate": 0.0009926778837991, + "loss": 1.05897212, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 0.43652344, + "step": 431, + "time_per_iteration": 2.5883395671844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112082, + "balance_loss_mlp": 1.06749809, + "epoch": 0.08310888803385917, + "flos": 667365931008.0, + "grad_norm": 0.049074519776006666, + "language_loss": 1.0243988, + "learning_rate": 0.000992624666765202, + "loss": 1.0355196, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 0.44604492, + "step": 432, + "time_per_iteration": 2.7943906784057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115829, + "balance_loss_mlp": 1.07200766, + "epoch": 0.08330126971912274, + "flos": 583293560832.0, + "grad_norm": 0.04417562175093811, + "language_loss": 1.00109053, + "learning_rate": 0.000992571258477447, + "loss": 1.01224887, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 0.43823242, + "step": 433, + "time_per_iteration": 2.836127758026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116804, + "balance_loss_mlp": 1.07260084, + "epoch": 0.0834936514043863, + "flos": 561350227968.0, + "grad_norm": 0.04319706549365549, + "language_loss": 0.93695247, + "learning_rate": 0.0009925176589565695, + "loss": 0.94812053, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 0.44213867, + "step": 434, + "time_per_iteration": 2.8157734870910645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131219, + "balance_loss_mlp": 1.08756483, + "epoch": 0.08368603308964986, + "flos": 494519551488.0, + "grad_norm": 0.04172416189060796, + "language_loss": 1.04242814, + "learning_rate": 0.0009924638682233791, + "loss": 1.05374026, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 0.43652344, + "step": 435, + "time_per_iteration": 2.5577316284179688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02503783, + "balance_loss_mlp": 2.3527205, + "epoch": 0.08387841477491342, + "flos": 1389017714688.0, + "grad_norm": 0.06968128915635463, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.82068378, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 1.5078125, + "step": 436, + "time_per_iteration": 4.594938516616821 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129118, + "balance_loss_mlp": 1.08348453, + "epoch": 0.084070796460177, + "flos": 798984082944.0, + "grad_norm": 0.0610737753852808, + "language_loss": 0.94037408, + "learning_rate": 0.0009923557132036668, + "loss": 0.95166528, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 0.45629883, + "step": 437, + "time_per_iteration": 3.0716845989227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118795, + "balance_loss_mlp": 1.07430601, + "epoch": 0.08426317814544056, + "flos": 558963431424.0, + "grad_norm": 0.04662895628051273, + "language_loss": 0.97730738, + "learning_rate": 0.0009923013489591345, + "loss": 0.98849535, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 0.4453125, + "step": 438, + "time_per_iteration": 2.726792812347412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110576, + "balance_loss_mlp": 1.06685066, + "epoch": 0.08445555983070412, + "flos": 810421396992.0, + "grad_norm": 0.04626496214247174, + "language_loss": 0.96079296, + "learning_rate": 0.0009922467935862681, + "loss": 0.97189873, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 0.4375, + "step": 439, + "time_per_iteration": 3.0908052921295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119416, + "balance_loss_mlp": 1.07273376, + "epoch": 0.08464794151596768, + "flos": 510184604160.0, + "grad_norm": 0.048922855388473234, + "language_loss": 0.99432743, + "learning_rate": 0.0009921920471062478, + "loss": 1.00552154, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 0.46655273, + "step": 440, + "time_per_iteration": 2.622451066970825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117818, + "balance_loss_mlp": 1.07342434, + "epoch": 0.08484032320123125, + "flos": 556413649920.0, + "grad_norm": 0.07502031783190574, + "language_loss": 0.9797709, + "learning_rate": 0.0009921371095403281, + "loss": 0.99094903, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 0.44433594, + "step": 441, + "time_per_iteration": 2.705152750015259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011116, + "balance_loss_mlp": 1.06863689, + "epoch": 0.08503270488649481, + "flos": 527354343936.0, + "grad_norm": 0.04941418140969711, + "language_loss": 1.00754833, + "learning_rate": 0.0009920819809098379, + "loss": 1.01866436, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 0.42993164, + "step": 442, + "time_per_iteration": 2.5887317657470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119689, + "balance_loss_mlp": 1.07715499, + "epoch": 0.08522508657175837, + "flos": 614267960832.0, + "grad_norm": 0.06964486535702215, + "language_loss": 0.96275294, + "learning_rate": 0.0009920266612361798, + "loss": 0.97394979, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 0.42578125, + "step": 443, + "time_per_iteration": 2.745222330093384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107696, + "balance_loss_mlp": 1.06587708, + "epoch": 0.08541746825702193, + "flos": 619792611840.0, + "grad_norm": 0.05163049083883061, + "language_loss": 0.96866751, + "learning_rate": 0.0009919711505408308, + "loss": 0.97974443, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 0.41821289, + "step": 444, + "time_per_iteration": 2.780095100402832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106314, + "balance_loss_mlp": 1.0654248, + "epoch": 0.08560984994228549, + "flos": 482914109952.0, + "grad_norm": 0.054748359311131624, + "language_loss": 0.94535226, + "learning_rate": 0.000991915448845342, + "loss": 0.95641541, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 0.40893555, + "step": 445, + "time_per_iteration": 2.5229337215423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110283, + "balance_loss_mlp": 1.06279922, + "epoch": 0.08580223162754906, + "flos": 517152273408.0, + "grad_norm": 0.0575820537988498, + "language_loss": 1.03181779, + "learning_rate": 0.000991859556171339, + "loss": 1.04284596, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 0.40039062, + "step": 446, + "time_per_iteration": 2.5957653522491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105649, + "balance_loss_mlp": 1.06497526, + "epoch": 0.08599461331281262, + "flos": 531475623936.0, + "grad_norm": 0.04289742759235468, + "language_loss": 1.05262291, + "learning_rate": 0.000991803472540521, + "loss": 1.06367946, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 0.40673828, + "step": 447, + "time_per_iteration": 2.6220486164093018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105752, + "balance_loss_mlp": 1.06550729, + "epoch": 0.08618699499807618, + "flos": 790299182592.0, + "grad_norm": 0.04330621576945977, + "language_loss": 1.00096428, + "learning_rate": 0.0009917471979746615, + "loss": 1.01202178, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 0.40234375, + "step": 448, + "time_per_iteration": 2.9767467975616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114799, + "balance_loss_mlp": 1.07379115, + "epoch": 0.08637937668333974, + "flos": 565988000256.0, + "grad_norm": 0.03609686036920932, + "language_loss": 0.98485255, + "learning_rate": 0.0009916907324956086, + "loss": 0.99600053, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 0.41015625, + "step": 449, + "time_per_iteration": 2.701143980026245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117003, + "balance_loss_mlp": 1.07480288, + "epoch": 0.08657175836860331, + "flos": 445167332352.0, + "grad_norm": 0.04834207301210501, + "language_loss": 0.95441091, + "learning_rate": 0.0009916340761252837, + "loss": 0.965581, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 0.42211914, + "step": 450, + "time_per_iteration": 2.6036393642425537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129901, + "balance_loss_mlp": 1.08910751, + "epoch": 0.08676414005386687, + "flos": 844148210688.0, + "grad_norm": 0.07269963588094165, + "language_loss": 0.9243114, + "learning_rate": 0.0009915772288856832, + "loss": 0.93561041, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 0.40820312, + "step": 451, + "time_per_iteration": 3.05719256401062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125865, + "balance_loss_mlp": 1.08359361, + "epoch": 0.08695652173913043, + "flos": 603292437504.0, + "grad_norm": 0.05954656443346509, + "language_loss": 0.93746579, + "learning_rate": 0.000991520190798877, + "loss": 0.94872439, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 0.42285156, + "step": 452, + "time_per_iteration": 2.804128885269165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120027, + "balance_loss_mlp": 1.07723105, + "epoch": 0.08714890342439399, + "flos": 730737552384.0, + "grad_norm": 0.05604676795867647, + "language_loss": 1.04000187, + "learning_rate": 0.0009914629618870089, + "loss": 1.05120206, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 0.42797852, + "step": 453, + "time_per_iteration": 2.8959083557128906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02032313, + "balance_loss_mlp": 1.86675501, + "epoch": 0.08734128510965757, + "flos": 1482303214080.0, + "grad_norm": 0.06678910630402063, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.80708182, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 1.65625, + "step": 454, + "time_per_iteration": 4.753306865692139 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01974747, + "balance_loss_mlp": 1.80537415, + "epoch": 0.08753366679492113, + "flos": 1523022289920.0, + "grad_norm": 0.06350102966569023, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.83402705, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 1.6953125, + "step": 455, + "time_per_iteration": 4.909627914428711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100081, + "balance_loss_mlp": 1.05778539, + "epoch": 0.08772604848018468, + "flos": 721252035072.0, + "grad_norm": 0.07384563339861851, + "language_loss": 0.95938599, + "learning_rate": 0.0009912901304235883, + "loss": 0.97038674, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 0.42333984, + "step": 456, + "time_per_iteration": 3.0303096771240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093114, + "balance_loss_mlp": 1.05112898, + "epoch": 0.08791843016544824, + "flos": 708233310720.0, + "grad_norm": 0.061767025741825826, + "language_loss": 0.93898749, + "learning_rate": 0.000991232138434397, + "loss": 0.94991863, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 0.41992188, + "step": 457, + "time_per_iteration": 2.834221601486206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089137, + "balance_loss_mlp": 1.04824805, + "epoch": 0.08811081185071182, + "flos": 473043151872.0, + "grad_norm": 0.05183647995223567, + "language_loss": 1.00765896, + "learning_rate": 0.000991173955731976, + "loss": 1.0185504, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 0.40869141, + "step": 458, + "time_per_iteration": 2.628783702850342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098492, + "balance_loss_mlp": 1.05569601, + "epoch": 0.08830319353597538, + "flos": 684980209152.0, + "grad_norm": 0.052575936673692925, + "language_loss": 1.04489028, + "learning_rate": 0.0009911155823389137, + "loss": 1.0558753, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 0.42797852, + "step": 459, + "time_per_iteration": 2.964416742324829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106983, + "balance_loss_mlp": 1.06523609, + "epoch": 0.08849557522123894, + "flos": 573509237760.0, + "grad_norm": 0.05270293395412616, + "language_loss": 1.00385904, + "learning_rate": 0.000991057018277873, + "loss": 1.01492882, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 0.41748047, + "step": 460, + "time_per_iteration": 2.6944808959960938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104651, + "balance_loss_mlp": 1.06245136, + "epoch": 0.0886879569065025, + "flos": 564567376896.0, + "grad_norm": 0.04953210926048159, + "language_loss": 1.01399374, + "learning_rate": 0.0009909982635715898, + "loss": 1.02504039, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 0.42236328, + "step": 461, + "time_per_iteration": 2.6137924194335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096015, + "balance_loss_mlp": 1.05374336, + "epoch": 0.08888033859176607, + "flos": 563877987840.0, + "grad_norm": 0.050729417377465176, + "language_loss": 1.00123549, + "learning_rate": 0.0009909393182428751, + "loss": 1.01219559, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 0.42285156, + "step": 462, + "time_per_iteration": 2.6657960414886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109539, + "balance_loss_mlp": 1.06891286, + "epoch": 0.08907272027702963, + "flos": 465761622528.0, + "grad_norm": 0.043715633324142876, + "language_loss": 0.94138575, + "learning_rate": 0.000990880182314614, + "loss": 0.95248115, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 0.40625, + "step": 463, + "time_per_iteration": 2.733408212661743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101315, + "balance_loss_mlp": 1.06121325, + "epoch": 0.08926510196229319, + "flos": 681528494592.0, + "grad_norm": 0.051961844945365605, + "language_loss": 0.94176865, + "learning_rate": 0.0009908208558097643, + "loss": 0.9527818, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 0.40087891, + "step": 464, + "time_per_iteration": 2.9006474018096924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105621, + "balance_loss_mlp": 1.06508923, + "epoch": 0.08945748364755675, + "flos": 596692956672.0, + "grad_norm": 0.04470923680131565, + "language_loss": 0.9716863, + "learning_rate": 0.000990761338751359, + "loss": 0.98274255, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 0.40527344, + "step": 465, + "time_per_iteration": 2.775830030441284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01410893, + "balance_loss_mlp": 1.25296497, + "epoch": 0.08964986533282032, + "flos": 1585931747328.0, + "grad_norm": 0.0425617539044403, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.75070524, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 1.578125, + "step": 466, + "time_per_iteration": 5.023500919342041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100083, + "balance_loss_mlp": 1.05869305, + "epoch": 0.08984224701808388, + "flos": 533523967488.0, + "grad_norm": 0.04007163966797277, + "language_loss": 0.9983623, + "learning_rate": 0.0009906417330663815, + "loss": 1.00936306, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 0.41381836, + "step": 467, + "time_per_iteration": 2.6194305419921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099405, + "balance_loss_mlp": 1.05889773, + "epoch": 0.09003462870334744, + "flos": 478931222016.0, + "grad_norm": 0.03985353179312445, + "language_loss": 0.96447593, + "learning_rate": 0.0009905816444862442, + "loss": 0.97546995, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 0.4050293, + "step": 468, + "time_per_iteration": 2.623267889022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107145, + "balance_loss_mlp": 1.06568456, + "epoch": 0.090227010388611, + "flos": 653625510912.0, + "grad_norm": 0.038840192804800056, + "language_loss": 0.93513083, + "learning_rate": 0.0009905213654454216, + "loss": 0.94620228, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 0.41455078, + "step": 469, + "time_per_iteration": 2.9024641513824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105337, + "balance_loss_mlp": 1.06466317, + "epoch": 0.09041939207387456, + "flos": 618186608640.0, + "grad_norm": 0.04985478927164425, + "language_loss": 1.01848495, + "learning_rate": 0.0009904608959673158, + "loss": 1.02953827, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 0.40649414, + "step": 470, + "time_per_iteration": 2.7711682319641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097659, + "balance_loss_mlp": 1.0588448, + "epoch": 0.09061177375913813, + "flos": 454368724992.0, + "grad_norm": 0.04989175862356038, + "language_loss": 1.02851224, + "learning_rate": 0.000990400236075403, + "loss": 1.03948903, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 0.38793945, + "step": 471, + "time_per_iteration": 2.536189317703247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109095, + "balance_loss_mlp": 1.05113411, + "epoch": 0.0908041554444017, + "flos": 544247299584.0, + "grad_norm": 0.03738902964718639, + "language_loss": 0.98994756, + "learning_rate": 0.0009903393857932338, + "loss": 1.000857, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 0.39794922, + "step": 472, + "time_per_iteration": 2.6588857173919678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097802, + "balance_loss_mlp": 1.05908275, + "epoch": 0.09099653712966525, + "flos": 564335009280.0, + "grad_norm": 0.045733529486957185, + "language_loss": 0.97091877, + "learning_rate": 0.0009902783451444317, + "loss": 0.98189688, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 0.38720703, + "step": 473, + "time_per_iteration": 2.6981122493743896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091239, + "balance_loss_mlp": 1.05406976, + "epoch": 0.09118891881492881, + "flos": 474540498432.0, + "grad_norm": 0.04942472768420212, + "language_loss": 1.00819659, + "learning_rate": 0.0009902171141526956, + "loss": 1.01910901, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 0.37158203, + "step": 474, + "time_per_iteration": 2.527256727218628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099497, + "balance_loss_mlp": 1.06225586, + "epoch": 0.09138130050019239, + "flos": 545860643328.0, + "grad_norm": 0.04275448033987936, + "language_loss": 0.88210893, + "learning_rate": 0.000990155692841797, + "loss": 0.8931039, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 0.37231445, + "step": 475, + "time_per_iteration": 2.989063262939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097088, + "balance_loss_mlp": 1.06084871, + "epoch": 0.09157368218545595, + "flos": 732711744000.0, + "grad_norm": 0.04412440376655801, + "language_loss": 1.00229144, + "learning_rate": 0.0009900940812355818, + "loss": 1.01326227, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 0.36254883, + "step": 476, + "time_per_iteration": 2.8778445720672607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105736, + "balance_loss_mlp": 1.07011676, + "epoch": 0.0917660638707195, + "flos": 610981802496.0, + "grad_norm": 0.06417087981964828, + "language_loss": 0.97168529, + "learning_rate": 0.00099003227935797, + "loss": 0.98274267, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 0.35620117, + "step": 477, + "time_per_iteration": 2.708608627319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101416, + "balance_loss_mlp": 1.06369829, + "epoch": 0.09195844555598306, + "flos": 655851893760.0, + "grad_norm": 0.06707216335576115, + "language_loss": 1.01291215, + "learning_rate": 0.000989970287232955, + "loss": 1.02392626, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 0.37695312, + "step": 478, + "time_per_iteration": 2.783325672149658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090795, + "balance_loss_mlp": 1.05431736, + "epoch": 0.09215082724124664, + "flos": 476578930176.0, + "grad_norm": 0.05564878549890474, + "language_loss": 0.9726451, + "learning_rate": 0.0009899081048846043, + "loss": 0.98355305, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 0.36474609, + "step": 479, + "time_per_iteration": 2.6017916202545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097049, + "balance_loss_mlp": 1.05964088, + "epoch": 0.0923432089265102, + "flos": 524305322496.0, + "grad_norm": 0.06044394784495309, + "language_loss": 1.03484094, + "learning_rate": 0.0009898457323370593, + "loss": 1.04581141, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 0.37402344, + "step": 480, + "time_per_iteration": 2.575676918029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091173, + "balance_loss_mlp": 1.0533123, + "epoch": 0.09253559061177376, + "flos": 545569178112.0, + "grad_norm": 0.05778783373137127, + "language_loss": 0.99753714, + "learning_rate": 0.000989783169614535, + "loss": 1.00844884, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 0.37817383, + "step": 481, + "time_per_iteration": 2.646942615509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283887, + "balance_loss_mlp": 1.15876544, + "epoch": 0.09272797229703732, + "flos": 1538042370048.0, + "grad_norm": 0.01956789957612316, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.80036646, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 1.25, + "step": 482, + "time_per_iteration": 4.860741376876831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098828, + "balance_loss_mlp": 1.06158745, + "epoch": 0.09292035398230089, + "flos": 689813273088.0, + "grad_norm": 0.06801501049369231, + "language_loss": 0.97102278, + "learning_rate": 0.000989657473741779, + "loss": 0.98201108, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 0.37231445, + "step": 483, + "time_per_iteration": 2.819138526916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095911, + "balance_loss_mlp": 1.05979109, + "epoch": 0.09311273566756445, + "flos": 509749604352.0, + "grad_norm": 0.038333848574242754, + "language_loss": 0.98462784, + "learning_rate": 0.0009895943406403465, + "loss": 0.99558693, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 0.36132812, + "step": 484, + "time_per_iteration": 2.7088170051574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103881, + "balance_loss_mlp": 1.06854701, + "epoch": 0.09330511735282801, + "flos": 659404924416.0, + "grad_norm": 0.05828015098596693, + "language_loss": 0.92231822, + "learning_rate": 0.0009895310174615338, + "loss": 0.933357, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 0.35351562, + "step": 485, + "time_per_iteration": 2.760511636734009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265799, + "balance_loss_mlp": 1.14983261, + "epoch": 0.09349749903809157, + "flos": 1452845984256.0, + "grad_norm": 0.018538812380254305, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.76984316, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 1.15625, + "step": 486, + "time_per_iteration": 4.656491994857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105382, + "balance_loss_mlp": 1.0699296, + "epoch": 0.09368988072335514, + "flos": 520870860288.0, + "grad_norm": 0.04721263549483299, + "language_loss": 0.95839012, + "learning_rate": 0.0009894038009701782, + "loss": 0.96944392, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 0.35498047, + "step": 487, + "time_per_iteration": 2.6169629096984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103683, + "balance_loss_mlp": 1.06868315, + "epoch": 0.0938822624086187, + "flos": 497751381504.0, + "grad_norm": 0.05102581257360949, + "language_loss": 0.98848963, + "learning_rate": 0.0009893399077070253, + "loss": 0.99952644, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 0.3503418, + "step": 488, + "time_per_iteration": 2.5845744609832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107076, + "balance_loss_mlp": 1.07193291, + "epoch": 0.09407464409388226, + "flos": 533202766848.0, + "grad_norm": 0.05918319403016569, + "language_loss": 0.92944884, + "learning_rate": 0.0009892758244652718, + "loss": 0.94051951, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 0.35180664, + "step": 489, + "time_per_iteration": 2.660200357437134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091731, + "balance_loss_mlp": 1.05801892, + "epoch": 0.09426702577914582, + "flos": 586006700544.0, + "grad_norm": 0.041386989889926534, + "language_loss": 1.00010514, + "learning_rate": 0.0009892115512697968, + "loss": 1.01102245, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 0.33740234, + "step": 490, + "time_per_iteration": 2.6571907997131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108998, + "balance_loss_mlp": 1.05631554, + "epoch": 0.0944594074644094, + "flos": 503357524992.0, + "grad_norm": 0.04182034264497562, + "language_loss": 1.00108159, + "learning_rate": 0.0009891470881455537, + "loss": 1.01198137, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 0.33666992, + "step": 491, + "time_per_iteration": 2.746169328689575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108736, + "balance_loss_mlp": 1.05319476, + "epoch": 0.09465178914967295, + "flos": 571021125120.0, + "grad_norm": 0.0458284589248403, + "language_loss": 0.98654628, + "learning_rate": 0.0009890824351175692, + "loss": 0.99741989, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 0.34204102, + "step": 492, + "time_per_iteration": 2.665170431137085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090612, + "balance_loss_mlp": 1.05654192, + "epoch": 0.09484417083493651, + "flos": 549361916928.0, + "grad_norm": 0.041327442652051224, + "language_loss": 1.0219661, + "learning_rate": 0.0009890175922109435, + "loss": 1.0328722, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 0.34082031, + "step": 493, + "time_per_iteration": 2.6482973098754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010971, + "balance_loss_mlp": 1.06086028, + "epoch": 0.09503655252020007, + "flos": 823894944768.0, + "grad_norm": 0.06926989533772566, + "language_loss": 1.01090789, + "learning_rate": 0.0009889525594508513, + "loss": 1.02187896, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 0.36254883, + "step": 494, + "time_per_iteration": 3.0095505714416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092013, + "balance_loss_mlp": 1.05596447, + "epoch": 0.09522893420546363, + "flos": 404621153280.0, + "grad_norm": 0.04986765426945594, + "language_loss": 0.94310975, + "learning_rate": 0.0009888873368625404, + "loss": 0.95402986, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 0.3605957, + "step": 495, + "time_per_iteration": 2.5451042652130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089381, + "balance_loss_mlp": 1.05426204, + "epoch": 0.0954213158907272, + "flos": 691016583168.0, + "grad_norm": 0.05650320770937666, + "language_loss": 0.98877072, + "learning_rate": 0.0009888219244713326, + "loss": 0.99966443, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 0.3515625, + "step": 496, + "time_per_iteration": 2.8157310485839844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086342, + "balance_loss_mlp": 1.05100799, + "epoch": 0.09561369757599077, + "flos": 519005325312.0, + "grad_norm": 0.05039739829653265, + "language_loss": 0.99588835, + "learning_rate": 0.0009887563223026229, + "loss": 1.00675178, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 0.35375977, + "step": 497, + "time_per_iteration": 2.6563401222229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244906, + "balance_loss_mlp": 1.14648652, + "epoch": 0.09580607926125433, + "flos": 1385614812672.0, + "grad_norm": 0.01649790273231252, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.80313075, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 0.98046875, + "step": 498, + "time_per_iteration": 4.8689799308776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098776, + "balance_loss_mlp": 1.0630604, + "epoch": 0.09599846094651789, + "flos": 717436901376.0, + "grad_norm": 0.06260101269903841, + "language_loss": 0.97272921, + "learning_rate": 0.0009886245487346482, + "loss": 0.98371696, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 0.35742188, + "step": 499, + "time_per_iteration": 3.0292818546295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117577, + "balance_loss_mlp": 1.08159947, + "epoch": 0.09619084263178146, + "flos": 386038130688.0, + "grad_norm": 0.055723050712230264, + "language_loss": 1.00704551, + "learning_rate": 0.0009885583773865422, + "loss": 1.01822114, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 0.35986328, + "step": 500, + "time_per_iteration": 2.395846366882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117756, + "balance_loss_mlp": 1.08137345, + "epoch": 0.09638322431704502, + "flos": 534129292800.0, + "grad_norm": 0.06268683986847115, + "language_loss": 0.9714855, + "learning_rate": 0.0009884920163632524, + "loss": 0.98266304, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 0.36352539, + "step": 501, + "time_per_iteration": 2.666341781616211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111747, + "balance_loss_mlp": 1.07638931, + "epoch": 0.09657560600230858, + "flos": 500671922688.0, + "grad_norm": 0.04553274405873497, + "language_loss": 1.01245189, + "learning_rate": 0.000988425465690543, + "loss": 1.02356935, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 0.35375977, + "step": 502, + "time_per_iteration": 2.55082106590271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103127, + "balance_loss_mlp": 1.06867552, + "epoch": 0.09676798768757214, + "flos": 529261724160.0, + "grad_norm": 0.04373339165225573, + "language_loss": 0.99427342, + "learning_rate": 0.0009883587253942505, + "loss": 1.00530469, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 0.34472656, + "step": 503, + "time_per_iteration": 2.7674455642700195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108284, + "balance_loss_mlp": 1.07378531, + "epoch": 0.09696036937283571, + "flos": 463614534144.0, + "grad_norm": 0.051161986083573203, + "language_loss": 1.04393589, + "learning_rate": 0.0009882917955002862, + "loss": 1.05501866, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 0.3449707, + "step": 504, + "time_per_iteration": 2.549203872680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105617, + "balance_loss_mlp": 1.07116556, + "epoch": 0.09715275105809927, + "flos": 534974326272.0, + "grad_norm": 0.04840022534917253, + "language_loss": 0.95342839, + "learning_rate": 0.0009882246760346343, + "loss": 0.96448457, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 0.3449707, + "step": 505, + "time_per_iteration": 2.653627872467041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115925, + "balance_loss_mlp": 1.08128262, + "epoch": 0.09734513274336283, + "flos": 454946886144.0, + "grad_norm": 0.08271599518488834, + "language_loss": 1.02799106, + "learning_rate": 0.0009881573670233533, + "loss": 1.03915036, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 0.34692383, + "step": 506, + "time_per_iteration": 2.5279319286346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104761, + "balance_loss_mlp": 1.07061946, + "epoch": 0.09753751442862639, + "flos": 508805826048.0, + "grad_norm": 0.05291653517072512, + "language_loss": 0.96169406, + "learning_rate": 0.0009880898684925747, + "loss": 0.97274166, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 0.34179688, + "step": 507, + "time_per_iteration": 2.648574113845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095827, + "balance_loss_mlp": 1.06039834, + "epoch": 0.09772989611388996, + "flos": 484273064448.0, + "grad_norm": 0.053809005456099755, + "language_loss": 0.94680405, + "learning_rate": 0.0009880221804685037, + "loss": 0.95776224, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 0.35424805, + "step": 508, + "time_per_iteration": 2.529299736022949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245061, + "balance_loss_mlp": 1.15503371, + "epoch": 0.09792227779915352, + "flos": 1566106140672.0, + "grad_norm": 0.024665830319341657, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.80589479, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 0.8984375, + "step": 509, + "time_per_iteration": 4.705655574798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094606, + "balance_loss_mlp": 1.05932045, + "epoch": 0.09811465948441708, + "flos": 587805424128.0, + "grad_norm": 0.06644626598388864, + "language_loss": 1.02131915, + "learning_rate": 0.0009878862360456733, + "loss": 1.03226519, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 0.35327148, + "step": 510, + "time_per_iteration": 2.682035446166992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097961, + "balance_loss_mlp": 1.06336641, + "epoch": 0.09830704116968064, + "flos": 613000410624.0, + "grad_norm": 0.06543943311749917, + "language_loss": 0.9266718, + "learning_rate": 0.0009878179796996922, + "loss": 0.9376514, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 0.34619141, + "step": 511, + "time_per_iteration": 2.6972057819366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105256, + "balance_loss_mlp": 1.07030368, + "epoch": 0.09849942285494422, + "flos": 538808910336.0, + "grad_norm": 0.054213046356477584, + "language_loss": 0.96428764, + "learning_rate": 0.0009877495339659754, + "loss": 0.97534013, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 0.34985352, + "step": 512, + "time_per_iteration": 2.746337413787842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105714, + "balance_loss_mlp": 1.07190621, + "epoch": 0.09869180454020778, + "flos": 620474660352.0, + "grad_norm": 0.0573170093193853, + "language_loss": 0.91841626, + "learning_rate": 0.000987680898871096, + "loss": 0.9294734, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 0.33837891, + "step": 513, + "time_per_iteration": 2.7060482501983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110186, + "balance_loss_mlp": 1.07675993, + "epoch": 0.09888418622547133, + "flos": 811711342080.0, + "grad_norm": 0.0786420176645203, + "language_loss": 0.95400196, + "learning_rate": 0.0009876120744417, + "loss": 0.96510386, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 0.33447266, + "step": 514, + "time_per_iteration": 2.9473536014556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105767, + "balance_loss_mlp": 1.07071972, + "epoch": 0.0990765679107349, + "flos": 535809447936.0, + "grad_norm": 0.04861145683213968, + "language_loss": 1.01586378, + "learning_rate": 0.0009875430607045078, + "loss": 1.02692139, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 0.35058594, + "step": 515, + "time_per_iteration": 2.6745734214782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095325, + "balance_loss_mlp": 1.06044412, + "epoch": 0.09926894959599845, + "flos": 587879576064.0, + "grad_norm": 0.061184004848699555, + "language_loss": 0.96467805, + "learning_rate": 0.000987473857686313, + "loss": 0.97563124, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 0.34887695, + "step": 516, + "time_per_iteration": 2.70771861076355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103454, + "balance_loss_mlp": 1.06909752, + "epoch": 0.09946133128126203, + "flos": 641234506752.0, + "grad_norm": 0.06268031252544905, + "language_loss": 1.01795554, + "learning_rate": 0.0009874044654139824, + "loss": 1.02899015, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 0.34399414, + "step": 517, + "time_per_iteration": 2.7501027584075928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104488, + "balance_loss_mlp": 1.07020378, + "epoch": 0.09965371296652559, + "flos": 465781446144.0, + "grad_norm": 0.05802057466070587, + "language_loss": 1.01047516, + "learning_rate": 0.0009873348839144563, + "loss": 1.02152014, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 0.34301758, + "step": 518, + "time_per_iteration": 2.5247762203216553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125408, + "balance_loss_mlp": 1.09100425, + "epoch": 0.09984609465178915, + "flos": 483603499008.0, + "grad_norm": 0.057276560313135924, + "language_loss": 1.0153054, + "learning_rate": 0.000987265113214749, + "loss": 1.02655947, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 0.34448242, + "step": 519, + "time_per_iteration": 2.569776773452759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151248, + "balance_loss_mlp": 1.11705852, + "epoch": 0.1000384763370527, + "flos": 569029681152.0, + "grad_norm": 0.06886779278024428, + "language_loss": 1.05486548, + "learning_rate": 0.0009871951533419476, + "loss": 1.066378, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 0.34204102, + "step": 520, + "time_per_iteration": 2.646489381790161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155904, + "balance_loss_mlp": 1.12085652, + "epoch": 0.10023085802231628, + "flos": 545796403200.0, + "grad_norm": 0.06947260655531057, + "language_loss": 0.93715644, + "learning_rate": 0.0009871250043232132, + "loss": 0.94871557, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 0.35058594, + "step": 521, + "time_per_iteration": 2.729825258255005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145676, + "balance_loss_mlp": 1.11196363, + "epoch": 0.10042323970757984, + "flos": 503454071808.0, + "grad_norm": 0.05700460680955029, + "language_loss": 0.94319808, + "learning_rate": 0.0009870546661857797, + "loss": 0.95465487, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 0.33740234, + "step": 522, + "time_per_iteration": 2.589205026626587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113939, + "balance_loss_mlp": 1.10572577, + "epoch": 0.1006156213928434, + "flos": 770411533824.0, + "grad_norm": 0.0627280587118585, + "language_loss": 1.04607201, + "learning_rate": 0.0009869841389569553, + "loss": 1.05746591, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 0.33666992, + "step": 523, + "time_per_iteration": 3.007927656173706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112546, + "balance_loss_mlp": 1.07816648, + "epoch": 0.10080800307810696, + "flos": 490030083072.0, + "grad_norm": 0.07025860249961899, + "language_loss": 0.94709289, + "learning_rate": 0.0009869134226641206, + "loss": 0.95821834, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 0.34399414, + "step": 524, + "time_per_iteration": 2.5647661685943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096367, + "balance_loss_mlp": 1.06134343, + "epoch": 0.10100038476337053, + "flos": 454724430336.0, + "grad_norm": 0.0754869647085307, + "language_loss": 0.96719551, + "learning_rate": 0.0009868425173347303, + "loss": 0.97815919, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 0.3503418, + "step": 525, + "time_per_iteration": 2.675762414932251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081637, + "balance_loss_mlp": 1.04816294, + "epoch": 0.10119276644863409, + "flos": 556438242816.0, + "grad_norm": 0.04461045481777941, + "language_loss": 1.01427031, + "learning_rate": 0.0009867714229963125, + "loss": 1.02508664, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 0.3347168, + "step": 526, + "time_per_iteration": 2.7551424503326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101413, + "balance_loss_mlp": 1.06672287, + "epoch": 0.10138514813389765, + "flos": 516235659264.0, + "grad_norm": 0.06519670287778681, + "language_loss": 0.99495387, + "learning_rate": 0.000986700139676468, + "loss": 1.00596797, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 0.34716797, + "step": 527, + "time_per_iteration": 2.5689845085144043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111806, + "balance_loss_mlp": 1.08317983, + "epoch": 0.10157752981916121, + "flos": 500570606592.0, + "grad_norm": 0.055001529425537175, + "language_loss": 0.97175169, + "learning_rate": 0.0009866286674028717, + "loss": 0.98293233, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 0.34936523, + "step": 528, + "time_per_iteration": 2.6308236122131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118149, + "balance_loss_mlp": 1.08307743, + "epoch": 0.10176991150442478, + "flos": 656773277184.0, + "grad_norm": 0.06791274268555884, + "language_loss": 0.93964088, + "learning_rate": 0.0009865570062032717, + "loss": 0.95082229, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 0.35083008, + "step": 529, + "time_per_iteration": 2.931939125061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117806, + "balance_loss_mlp": 1.08104193, + "epoch": 0.10196229318968834, + "flos": 573259617792.0, + "grad_norm": 0.05469252484924326, + "language_loss": 0.97321147, + "learning_rate": 0.0009864851561054893, + "loss": 0.98438954, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 0.36743164, + "step": 530, + "time_per_iteration": 2.75875186920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091703, + "balance_loss_mlp": 1.0567745, + "epoch": 0.1021546748749519, + "flos": 518207279616.0, + "grad_norm": 0.053032092698093954, + "language_loss": 0.97237867, + "learning_rate": 0.0009864131171374191, + "loss": 0.9832958, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 0.34936523, + "step": 531, + "time_per_iteration": 2.671963930130005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109157, + "balance_loss_mlp": 1.05704737, + "epoch": 0.10234705656021546, + "flos": 609766009344.0, + "grad_norm": 0.037042660663456926, + "language_loss": 0.97530323, + "learning_rate": 0.0009863408893270292, + "loss": 0.98621887, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 0.34521484, + "step": 532, + "time_per_iteration": 2.8692965507507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080567, + "balance_loss_mlp": 1.0459249, + "epoch": 0.10253943824547904, + "flos": 601760586240.0, + "grad_norm": 0.045189468397627275, + "language_loss": 0.93818736, + "learning_rate": 0.0009862684727023605, + "loss": 0.94899297, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 0.34692383, + "step": 533, + "time_per_iteration": 2.768873691558838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084569, + "balance_loss_mlp": 1.04978406, + "epoch": 0.1027318199307426, + "flos": 662948043264.0, + "grad_norm": 0.041807858593286534, + "language_loss": 0.94846106, + "learning_rate": 0.0009861958672915283, + "loss": 0.95930672, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 0.34814453, + "step": 534, + "time_per_iteration": 2.7894833087921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088029, + "balance_loss_mlp": 1.05348206, + "epoch": 0.10292420161600616, + "flos": 683275461120.0, + "grad_norm": 0.04113334704287127, + "language_loss": 0.93477535, + "learning_rate": 0.0009861230731227201, + "loss": 0.94565558, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 0.34570312, + "step": 535, + "time_per_iteration": 2.8369100093841553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101084, + "balance_loss_mlp": 1.06589389, + "epoch": 0.10311658330126972, + "flos": 490287043584.0, + "grad_norm": 0.06472741174466715, + "language_loss": 0.9716177, + "learning_rate": 0.0009860500902241973, + "loss": 0.98262858, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 0.35205078, + "step": 536, + "time_per_iteration": 2.6308608055114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100072, + "balance_loss_mlp": 1.06559658, + "epoch": 0.10330896498653329, + "flos": 431743343616.0, + "grad_norm": 0.06015330648509861, + "language_loss": 1.02488375, + "learning_rate": 0.0009859769186242942, + "loss": 1.0358845, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 0.34521484, + "step": 537, + "time_per_iteration": 2.4846572875976562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094076, + "balance_loss_mlp": 1.06188989, + "epoch": 0.10350134667179685, + "flos": 549591713280.0, + "grad_norm": 0.04182272700248836, + "language_loss": 0.96166039, + "learning_rate": 0.0009859035583514187, + "loss": 0.97260106, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 0.32177734, + "step": 538, + "time_per_iteration": 2.665483236312866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107546, + "balance_loss_mlp": 1.07497787, + "epoch": 0.10369372835706041, + "flos": 640626610176.0, + "grad_norm": 0.03728554890083732, + "language_loss": 0.9932602, + "learning_rate": 0.0009858300094340517, + "loss": 1.00433564, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 0.32568359, + "step": 539, + "time_per_iteration": 2.772207021713257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110889, + "balance_loss_mlp": 1.07908368, + "epoch": 0.10388611004232397, + "flos": 521752969728.0, + "grad_norm": 0.05284254114338104, + "language_loss": 0.91679931, + "learning_rate": 0.0009857562719007473, + "loss": 0.92790818, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 0.31787109, + "step": 540, + "time_per_iteration": 2.633002519607544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110116, + "balance_loss_mlp": 1.06964111, + "epoch": 0.10407849172758753, + "flos": 702436644864.0, + "grad_norm": 0.07454941449424961, + "language_loss": 0.93962657, + "learning_rate": 0.0009856823457801331, + "loss": 0.95063812, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 0.31494141, + "step": 541, + "time_per_iteration": 2.888354539871216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098965, + "balance_loss_mlp": 1.06682634, + "epoch": 0.1042708734128511, + "flos": 502910415360.0, + "grad_norm": 0.06016078646373104, + "language_loss": 1.01014686, + "learning_rate": 0.00098560823110091, + "loss": 1.02113652, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 0.32128906, + "step": 542, + "time_per_iteration": 2.612365484237671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088807, + "balance_loss_mlp": 1.05664408, + "epoch": 0.10446325509811466, + "flos": 485592371712.0, + "grad_norm": 0.07331709746631812, + "language_loss": 0.99634022, + "learning_rate": 0.000985533927891851, + "loss": 1.00722837, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 0.3215332, + "step": 543, + "time_per_iteration": 2.6642584800720215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087349, + "balance_loss_mlp": 1.05406535, + "epoch": 0.10465563678337822, + "flos": 568634328576.0, + "grad_norm": 0.07406485241554656, + "language_loss": 0.99318308, + "learning_rate": 0.0009854594361818044, + "loss": 1.00405657, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 0.33300781, + "step": 544, + "time_per_iteration": 2.650541067123413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087044, + "balance_loss_mlp": 1.05357027, + "epoch": 0.10484801846864178, + "flos": 626093286912.0, + "grad_norm": 0.05515562757052397, + "language_loss": 0.98072803, + "learning_rate": 0.0009853847559996897, + "loss": 0.99159849, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 0.3347168, + "step": 545, + "time_per_iteration": 2.7268693447113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098973, + "balance_loss_mlp": 1.0640682, + "epoch": 0.10504040015390535, + "flos": 743412681216.0, + "grad_norm": 0.05014767442192859, + "language_loss": 0.9781934, + "learning_rate": 0.0009853098873745, + "loss": 0.98918307, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 0.34936523, + "step": 546, + "time_per_iteration": 3.001844644546509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094885, + "balance_loss_mlp": 1.06010008, + "epoch": 0.10523278183916891, + "flos": 586673694720.0, + "grad_norm": 0.06665960072991474, + "language_loss": 0.96499509, + "learning_rate": 0.0009852348303353027, + "loss": 0.97594392, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 0.34814453, + "step": 547, + "time_per_iteration": 2.7768120765686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109085, + "balance_loss_mlp": 1.05692363, + "epoch": 0.10542516352443247, + "flos": 869644574208.0, + "grad_norm": 0.04477171592325676, + "language_loss": 0.89746928, + "learning_rate": 0.000985159584911237, + "loss": 0.90837783, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 0.33959961, + "step": 548, + "time_per_iteration": 3.1397063732147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109064, + "balance_loss_mlp": 1.0567131, + "epoch": 0.10561754520969603, + "flos": 505428263424.0, + "grad_norm": 0.057455808878804256, + "language_loss": 0.97617745, + "learning_rate": 0.0009850841511315162, + "loss": 0.98708391, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 0.33959961, + "step": 549, + "time_per_iteration": 2.6143858432769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090343, + "balance_loss_mlp": 1.05660701, + "epoch": 0.1058099268949596, + "flos": 559981361664.0, + "grad_norm": 0.04134640300819554, + "language_loss": 0.97230792, + "learning_rate": 0.0009850085290254256, + "loss": 0.98321134, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 0.33740234, + "step": 550, + "time_per_iteration": 2.784057855606079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108804, + "balance_loss_mlp": 1.05478084, + "epoch": 0.10600230858022316, + "flos": 562049528832.0, + "grad_norm": 0.041486348142279396, + "language_loss": 0.9340632, + "learning_rate": 0.0009849327186223246, + "loss": 0.94494367, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 0.33276367, + "step": 551, + "time_per_iteration": 2.822755813598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086641, + "balance_loss_mlp": 1.0536921, + "epoch": 0.10619469026548672, + "flos": 494326831104.0, + "grad_norm": 0.044652358506572586, + "language_loss": 1.00453854, + "learning_rate": 0.000984856719951646, + "loss": 1.01540482, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 0.32958984, + "step": 552, + "time_per_iteration": 2.561384439468384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088507, + "balance_loss_mlp": 1.05577254, + "epoch": 0.10638707195075028, + "flos": 676166828544.0, + "grad_norm": 0.05595352831954139, + "language_loss": 0.98322356, + "learning_rate": 0.0009847805330428943, + "loss": 0.99410868, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 0.32739258, + "step": 553, + "time_per_iteration": 2.8988356590270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082477, + "balance_loss_mlp": 1.04940784, + "epoch": 0.10657945363601386, + "flos": 488055891456.0, + "grad_norm": 0.05618387686115577, + "language_loss": 1.02895415, + "learning_rate": 0.0009847041579256481, + "loss": 1.03977895, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 0.33081055, + "step": 554, + "time_per_iteration": 2.567885398864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088152, + "balance_loss_mlp": 1.05548859, + "epoch": 0.10677183532127742, + "flos": 482958526464.0, + "grad_norm": 0.04459262579832553, + "language_loss": 0.99802542, + "learning_rate": 0.0009846275946295592, + "loss": 1.00890684, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 0.32641602, + "step": 555, + "time_per_iteration": 2.6283833980560303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108533, + "balance_loss_mlp": 1.05347764, + "epoch": 0.10696421700654098, + "flos": 656249444352.0, + "grad_norm": 0.04108965909817336, + "language_loss": 0.92502242, + "learning_rate": 0.0009845508431843518, + "loss": 0.93587577, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 0.31835938, + "step": 556, + "time_per_iteration": 3.0189473628997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087957, + "balance_loss_mlp": 1.05612838, + "epoch": 0.10715659869180454, + "flos": 567744878592.0, + "grad_norm": 0.05029379164990677, + "language_loss": 0.95060432, + "learning_rate": 0.0009844739036198233, + "loss": 0.96148396, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 0.31811523, + "step": 557, + "time_per_iteration": 2.6461007595062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096142, + "balance_loss_mlp": 1.06340766, + "epoch": 0.10734898037706811, + "flos": 540694268928.0, + "grad_norm": 0.047100661757994676, + "language_loss": 1.0152961, + "learning_rate": 0.0009843967759658448, + "loss": 1.02625763, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 0.32739258, + "step": 558, + "time_per_iteration": 2.6677682399749756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264894, + "balance_loss_mlp": 1.19775486, + "epoch": 0.10754136206233167, + "flos": 1476640171008.0, + "grad_norm": 0.03689581784010691, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.74032652, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 0.671875, + "step": 559, + "time_per_iteration": 4.873044013977051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105652, + "balance_loss_mlp": 1.07234466, + "epoch": 0.10773374374759523, + "flos": 512405844480.0, + "grad_norm": 0.06480790167761245, + "language_loss": 1.01098323, + "learning_rate": 0.000984241956509384, + "loss": 1.02203977, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 0.33325195, + "step": 560, + "time_per_iteration": 2.655430555343628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095265, + "balance_loss_mlp": 1.0617907, + "epoch": 0.10792612543285879, + "flos": 496503654912.0, + "grad_norm": 0.05361377514900226, + "language_loss": 1.00074768, + "learning_rate": 0.0009841642647670078, + "loss": 1.01170027, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 0.33496094, + "step": 561, + "time_per_iteration": 2.5627329349517822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089424, + "balance_loss_mlp": 1.05633116, + "epoch": 0.10811850711812235, + "flos": 735471498240.0, + "grad_norm": 0.04993888185520414, + "language_loss": 0.93071151, + "learning_rate": 0.0009840863850553944, + "loss": 0.94160575, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 0.33105469, + "step": 562, + "time_per_iteration": 3.0020592212677 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108807, + "balance_loss_mlp": 1.05686092, + "epoch": 0.10831088880338592, + "flos": 611540140032.0, + "grad_norm": 0.046287089248472475, + "language_loss": 0.97956204, + "learning_rate": 0.0009840083174047782, + "loss": 0.99044275, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 0.31176758, + "step": 563, + "time_per_iteration": 2.7123258113861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093988, + "balance_loss_mlp": 1.06275535, + "epoch": 0.10850327048864948, + "flos": 556317103104.0, + "grad_norm": 0.036863902598139514, + "language_loss": 0.91394317, + "learning_rate": 0.0009839300618454685, + "loss": 0.92488301, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 0.31176758, + "step": 564, + "time_per_iteration": 2.855482578277588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086433, + "balance_loss_mlp": 1.05386496, + "epoch": 0.10869565217391304, + "flos": 603208373760.0, + "grad_norm": 0.0447892393855046, + "language_loss": 0.97269231, + "learning_rate": 0.0009838516184078466, + "loss": 0.98355657, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 0.32568359, + "step": 565, + "time_per_iteration": 2.8027093410491943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090742, + "balance_loss_mlp": 1.05881739, + "epoch": 0.1088880338591766, + "flos": 526178198016.0, + "grad_norm": 0.039430635834492286, + "language_loss": 0.95326865, + "learning_rate": 0.0009837729871223669, + "loss": 0.964176, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 0.3190918, + "step": 566, + "time_per_iteration": 2.621044158935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097443, + "balance_loss_mlp": 1.06473231, + "epoch": 0.10908041554444017, + "flos": 620272028160.0, + "grad_norm": 0.03524126234366562, + "language_loss": 0.96988255, + "learning_rate": 0.0009836941680195568, + "loss": 0.98085701, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 0.32714844, + "step": 567, + "time_per_iteration": 2.8241846561431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095182, + "balance_loss_mlp": 1.06359148, + "epoch": 0.10927279722970373, + "flos": 898125719040.0, + "grad_norm": 0.05940738915226433, + "language_loss": 0.94011569, + "learning_rate": 0.0009836151611300166, + "loss": 0.95106757, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 0.31567383, + "step": 568, + "time_per_iteration": 3.2259325981140137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093362, + "balance_loss_mlp": 1.06327355, + "epoch": 0.10946517891496729, + "flos": 528666310656.0, + "grad_norm": 0.04952949609465528, + "language_loss": 1.01886261, + "learning_rate": 0.0009835359664844194, + "loss": 1.02979624, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 0.30029297, + "step": 569, + "time_per_iteration": 2.61936616897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235986, + "balance_loss_mlp": 1.17113578, + "epoch": 0.10965756060023085, + "flos": 1560751815168.0, + "grad_norm": 0.02580255803672051, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.82272792, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 0.6484375, + "step": 570, + "time_per_iteration": 4.946800470352173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102328, + "balance_loss_mlp": 1.06947398, + "epoch": 0.10984994228549443, + "flos": 513075409920.0, + "grad_norm": 0.04088785760268294, + "language_loss": 0.98121774, + "learning_rate": 0.0009833770140481118, + "loss": 0.99224108, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 0.32861328, + "step": 571, + "time_per_iteration": 2.6676580905914307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103589, + "balance_loss_mlp": 1.07113993, + "epoch": 0.11004232397075799, + "flos": 954705139200.0, + "grad_norm": 0.04146527084622454, + "language_loss": 0.88084227, + "learning_rate": 0.000983297256319112, + "loss": 0.89187813, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 0.32446289, + "step": 572, + "time_per_iteration": 3.1977450847625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098726, + "balance_loss_mlp": 1.06503749, + "epoch": 0.11023470565602154, + "flos": 488181800448.0, + "grad_norm": 0.11112801331440751, + "language_loss": 0.93675387, + "learning_rate": 0.000983217310957477, + "loss": 0.94774115, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 0.33691406, + "step": 573, + "time_per_iteration": 2.771477222442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118998, + "balance_loss_mlp": 1.08530974, + "epoch": 0.1104270873412851, + "flos": 655814817792.0, + "grad_norm": 0.046936313049011164, + "language_loss": 0.98079342, + "learning_rate": 0.000983137177994244, + "loss": 0.99198341, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 0.3371582, + "step": 574, + "time_per_iteration": 2.842641830444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127826, + "balance_loss_mlp": 1.0945909, + "epoch": 0.11061946902654868, + "flos": 723426287616.0, + "grad_norm": 0.047970587572460185, + "language_loss": 0.91368234, + "learning_rate": 0.0009830568574605235, + "loss": 0.92496061, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 0.33227539, + "step": 575, + "time_per_iteration": 2.9841148853302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136053, + "balance_loss_mlp": 1.10260296, + "epoch": 0.11081185071181224, + "flos": 835463310336.0, + "grad_norm": 0.06212944390612344, + "language_loss": 0.95608473, + "learning_rate": 0.0009829763493874992, + "loss": 0.96744525, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 0.3347168, + "step": 576, + "time_per_iteration": 3.094599485397339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122949, + "balance_loss_mlp": 1.08918953, + "epoch": 0.1110042323970758, + "flos": 609076620288.0, + "grad_norm": 0.040009357062280086, + "language_loss": 1.0022918, + "learning_rate": 0.0009828956538064264, + "loss": 1.01352131, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 0.33764648, + "step": 577, + "time_per_iteration": 2.7913765907287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128001, + "balance_loss_mlp": 1.09428823, + "epoch": 0.11119661408233936, + "flos": 595922075136.0, + "grad_norm": 0.07834189266391174, + "language_loss": 0.97103804, + "learning_rate": 0.0009828147707486344, + "loss": 0.98231804, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 0.33740234, + "step": 578, + "time_per_iteration": 2.6967506408691406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099492, + "balance_loss_mlp": 1.0659467, + "epoch": 0.11138899576760293, + "flos": 555835488768.0, + "grad_norm": 0.066476002167881, + "language_loss": 0.94244707, + "learning_rate": 0.0009827337002455245, + "loss": 0.95344198, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 0.33544922, + "step": 579, + "time_per_iteration": 2.6212143898010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010909, + "balance_loss_mlp": 1.05940461, + "epoch": 0.11158137745286649, + "flos": 689746461696.0, + "grad_norm": 0.0598380025645264, + "language_loss": 0.93403691, + "learning_rate": 0.0009826524423285712, + "loss": 0.94494587, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 0.31469727, + "step": 580, + "time_per_iteration": 2.916363000869751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086732, + "balance_loss_mlp": 1.05466461, + "epoch": 0.11177375913813005, + "flos": 763011436032.0, + "grad_norm": 0.051352596452175936, + "language_loss": 0.95457065, + "learning_rate": 0.0009825709970293218, + "loss": 0.96543789, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 0.32055664, + "step": 581, + "time_per_iteration": 2.975459575653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094975, + "balance_loss_mlp": 1.06414759, + "epoch": 0.11196614082339361, + "flos": 806574329856.0, + "grad_norm": 0.06330579048660655, + "language_loss": 1.01360774, + "learning_rate": 0.0009824893643793956, + "loss": 1.02455735, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 0.30810547, + "step": 582, + "time_per_iteration": 3.0850436687469482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109989, + "balance_loss_mlp": 1.06772757, + "epoch": 0.11215852250865718, + "flos": 558624978432.0, + "grad_norm": 0.05517621871728721, + "language_loss": 0.96568394, + "learning_rate": 0.0009824075444104857, + "loss": 0.9766829, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 0.3215332, + "step": 583, + "time_per_iteration": 2.7017738819122314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104353, + "balance_loss_mlp": 1.07214284, + "epoch": 0.11235090419392074, + "flos": 513572078592.0, + "grad_norm": 0.05273776870459213, + "language_loss": 1.00669086, + "learning_rate": 0.000982325537154357, + "loss": 1.01773441, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 0.32202148, + "step": 584, + "time_per_iteration": 2.566066265106201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109419, + "balance_loss_mlp": 1.07768583, + "epoch": 0.1125432858791843, + "flos": 491453277696.0, + "grad_norm": 0.05755454669423396, + "language_loss": 1.01869726, + "learning_rate": 0.0009822433426428484, + "loss": 1.02979159, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 0.31713867, + "step": 585, + "time_per_iteration": 2.611968994140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122157, + "balance_loss_mlp": 1.08987498, + "epoch": 0.11273566756444786, + "flos": 510725689344.0, + "grad_norm": 0.06034275506000564, + "language_loss": 0.93750811, + "learning_rate": 0.0009821609609078697, + "loss": 0.94872963, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 0.32275391, + "step": 586, + "time_per_iteration": 2.584847927093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104971, + "balance_loss_mlp": 1.0726887, + "epoch": 0.11292804924971142, + "flos": 622446280704.0, + "grad_norm": 0.06416707827025614, + "language_loss": 0.95279968, + "learning_rate": 0.0009820783919814045, + "loss": 0.96384937, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 0.32275391, + "step": 587, + "time_per_iteration": 2.7885184288024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096142, + "balance_loss_mlp": 1.06359744, + "epoch": 0.113120430934975, + "flos": 478056453120.0, + "grad_norm": 0.049104346633589514, + "language_loss": 0.92135406, + "learning_rate": 0.0009819956358955095, + "loss": 0.93231547, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 0.32543945, + "step": 588, + "time_per_iteration": 2.560117483139038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086794, + "balance_loss_mlp": 1.05427432, + "epoch": 0.11331281262023855, + "flos": 467039084544.0, + "grad_norm": 0.05114307144868452, + "language_loss": 0.93675017, + "learning_rate": 0.0009819126926823127, + "loss": 0.94761813, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 0.32519531, + "step": 589, + "time_per_iteration": 2.517035722732544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093229, + "balance_loss_mlp": 1.05966008, + "epoch": 0.11350519430550211, + "flos": 650453151744.0, + "grad_norm": 0.04613241529975588, + "language_loss": 0.94437975, + "learning_rate": 0.000981829562374016, + "loss": 0.95531201, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 0.33569336, + "step": 590, + "time_per_iteration": 2.8174262046813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093153, + "balance_loss_mlp": 1.05913091, + "epoch": 0.11369757599076567, + "flos": 557809680384.0, + "grad_norm": 0.05348492004263644, + "language_loss": 1.04949331, + "learning_rate": 0.0009817462450028933, + "loss": 1.0604248, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 0.34057617, + "step": 591, + "time_per_iteration": 2.6302859783172607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101242, + "balance_loss_mlp": 1.0668143, + "epoch": 0.11388995767602925, + "flos": 571080222720.0, + "grad_norm": 0.2030818500746725, + "language_loss": 0.92329478, + "learning_rate": 0.0009816627406012916, + "loss": 0.93430716, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 0.34472656, + "step": 592, + "time_per_iteration": 2.8384313583374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135985, + "balance_loss_mlp": 1.09943521, + "epoch": 0.1140823393612928, + "flos": 740403307008.0, + "grad_norm": 0.0774704650100976, + "language_loss": 0.91851664, + "learning_rate": 0.0009815790492016295, + "loss": 0.92987645, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 0.36523438, + "step": 593, + "time_per_iteration": 2.9409682750701904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136834, + "balance_loss_mlp": 1.10192943, + "epoch": 0.11427472104655637, + "flos": 699004753920.0, + "grad_norm": 0.09332707993556091, + "language_loss": 0.94690275, + "learning_rate": 0.0009814951708363993, + "loss": 0.95827115, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 0.34912109, + "step": 594, + "time_per_iteration": 2.8599631786346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221657, + "balance_loss_mlp": 1.16023993, + "epoch": 0.11446710273181993, + "flos": 1477178684928.0, + "grad_norm": 0.030934197408724044, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79212642, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 0.61328125, + "step": 595, + "time_per_iteration": 4.801583766937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137315, + "balance_loss_mlp": 1.10138512, + "epoch": 0.1146594844170835, + "flos": 494895080448.0, + "grad_norm": 0.0746127254366864, + "language_loss": 0.94972038, + "learning_rate": 0.0009813268533395648, + "loss": 0.96109354, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 0.359375, + "step": 596, + "time_per_iteration": 2.6236753463745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123077, + "balance_loss_mlp": 1.0882678, + "epoch": 0.11485186610234706, + "flos": 474834534912.0, + "grad_norm": 0.061536990211155544, + "language_loss": 0.95371294, + "learning_rate": 0.0009812424142733073, + "loss": 0.96494377, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 0.34765625, + "step": 597, + "time_per_iteration": 2.5663998126983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108592, + "balance_loss_mlp": 1.07387781, + "epoch": 0.11504424778761062, + "flos": 731209254912.0, + "grad_norm": 0.04795398370622496, + "language_loss": 0.91199464, + "learning_rate": 0.000981157788372175, + "loss": 0.92308056, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 0.34716797, + "step": 598, + "time_per_iteration": 3.004436492919922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110274, + "balance_loss_mlp": 1.06864619, + "epoch": 0.11523662947287418, + "flos": 545823567360.0, + "grad_norm": 0.04762632796488997, + "language_loss": 0.94997883, + "learning_rate": 0.0009810729756690223, + "loss": 0.96100628, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 0.34106445, + "step": 599, + "time_per_iteration": 2.704676628112793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104593, + "balance_loss_mlp": 1.06947374, + "epoch": 0.11542901115813775, + "flos": 775066558464.0, + "grad_norm": 0.06699944809564747, + "language_loss": 0.98224139, + "learning_rate": 0.0009809879761967766, + "loss": 0.99328732, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 0.35107422, + "step": 600, + "time_per_iteration": 2.953348159790039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113963, + "balance_loss_mlp": 1.07922578, + "epoch": 0.11562139284340131, + "flos": 730910449152.0, + "grad_norm": 0.06801646297960097, + "language_loss": 0.96874714, + "learning_rate": 0.0009809027899884378, + "loss": 0.97988677, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 0.34765625, + "step": 601, + "time_per_iteration": 2.896559953689575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104267, + "balance_loss_mlp": 1.07014918, + "epoch": 0.11581377452866487, + "flos": 535878457344.0, + "grad_norm": 0.062436318450634756, + "language_loss": 0.9484992, + "learning_rate": 0.0009808174170770779, + "loss": 0.95954192, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 0.34130859, + "step": 602, + "time_per_iteration": 2.814558982849121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220455, + "balance_loss_mlp": 1.16704941, + "epoch": 0.11600615621392843, + "flos": 1555814863872.0, + "grad_norm": 0.025680107820064087, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.86118698, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 0.53515625, + "step": 603, + "time_per_iteration": 4.897503614425659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118739, + "balance_loss_mlp": 1.08566999, + "epoch": 0.116198537899192, + "flos": 537435274752.0, + "grad_norm": 0.05533944227900463, + "language_loss": 1.0028702, + "learning_rate": 0.0009806461112779462, + "loss": 1.01405764, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 0.33081055, + "step": 604, + "time_per_iteration": 2.6172194480895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115094, + "balance_loss_mlp": 1.08281231, + "epoch": 0.11639091958445556, + "flos": 454203168768.0, + "grad_norm": 0.07231087595972972, + "language_loss": 0.97971618, + "learning_rate": 0.0009805601784566814, + "loss": 0.99086702, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 0.32250977, + "step": 605, + "time_per_iteration": 2.4791650772094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125941, + "balance_loss_mlp": 1.09208584, + "epoch": 0.11658330126971912, + "flos": 555081859584.0, + "grad_norm": 0.06015253149930396, + "language_loss": 1.02430916, + "learning_rate": 0.0009804740590654089, + "loss": 1.03556848, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 0.33862305, + "step": 606, + "time_per_iteration": 2.614476442337036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124787, + "balance_loss_mlp": 1.09229016, + "epoch": 0.11677568295498268, + "flos": 716340049920.0, + "grad_norm": 0.08034134565527169, + "language_loss": 0.97153747, + "learning_rate": 0.0009803877531375635, + "loss": 0.9827854, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 0.32495117, + "step": 607, + "time_per_iteration": 2.851011276245117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128193, + "balance_loss_mlp": 1.09228706, + "epoch": 0.11696806464024626, + "flos": 609758668800.0, + "grad_norm": 0.05400582488055185, + "language_loss": 0.97512484, + "learning_rate": 0.0009803012607066523, + "loss": 0.9864068, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 0.35913086, + "step": 608, + "time_per_iteration": 2.700596570968628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128537, + "balance_loss_mlp": 1.09294093, + "epoch": 0.11716044632550981, + "flos": 520384103424.0, + "grad_norm": 0.15792902837654846, + "language_loss": 0.95375645, + "learning_rate": 0.0009802145818062543, + "loss": 0.96504182, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 0.35620117, + "step": 609, + "time_per_iteration": 2.693417549133301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123637, + "balance_loss_mlp": 1.08742094, + "epoch": 0.11735282801077337, + "flos": 507493859328.0, + "grad_norm": 0.06851059455565046, + "language_loss": 0.99132365, + "learning_rate": 0.0009801277164700212, + "loss": 1.00256002, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 0.36254883, + "step": 610, + "time_per_iteration": 2.5825185775756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131797, + "balance_loss_mlp": 1.09541452, + "epoch": 0.11754520969603693, + "flos": 686638342656.0, + "grad_norm": 0.1113382534985323, + "language_loss": 0.96033651, + "learning_rate": 0.0009800406647316776, + "loss": 0.97165447, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 0.36376953, + "step": 611, + "time_per_iteration": 2.8625166416168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231096, + "balance_loss_mlp": 1.18112373, + "epoch": 0.1177375913813005, + "flos": 1542487421952.0, + "grad_norm": 0.03346184177846584, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.78145558, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 0.49804688, + "step": 612, + "time_per_iteration": 4.748431444168091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137214, + "balance_loss_mlp": 1.09880471, + "epoch": 0.11792997306656407, + "flos": 520522495488.0, + "grad_norm": 0.07612220197102978, + "language_loss": 0.95326376, + "learning_rate": 0.000979866002183916, + "loss": 0.96463591, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 0.38378906, + "step": 613, + "time_per_iteration": 2.6311473846435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155666, + "balance_loss_mlp": 1.11482501, + "epoch": 0.11812235475182763, + "flos": 666281189376.0, + "grad_norm": 0.0832714106614858, + "language_loss": 0.96221644, + "learning_rate": 0.0009797783914423082, + "loss": 0.97377312, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 0.40844727, + "step": 614, + "time_per_iteration": 2.8568782806396484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126933, + "balance_loss_mlp": 1.08721232, + "epoch": 0.11831473643709119, + "flos": 621317122560.0, + "grad_norm": 0.08355321383380138, + "language_loss": 0.91733479, + "learning_rate": 0.0009796905944342094, + "loss": 0.92860413, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 0.3972168, + "step": 615, + "time_per_iteration": 2.8348331451416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113873, + "balance_loss_mlp": 1.07517743, + "epoch": 0.11850711812235475, + "flos": 456688710144.0, + "grad_norm": 0.05175964705030883, + "language_loss": 0.94486296, + "learning_rate": 0.0009796026111937057, + "loss": 0.9560017, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 0.38671875, + "step": 616, + "time_per_iteration": 2.609276056289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111065, + "balance_loss_mlp": 1.07393384, + "epoch": 0.11869949980761832, + "flos": 513863543808.0, + "grad_norm": 0.1779679576065946, + "language_loss": 0.94108498, + "learning_rate": 0.0009795144417549552, + "loss": 0.95219147, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 0.3671875, + "step": 617, + "time_per_iteration": 2.7469558715820312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114581, + "balance_loss_mlp": 1.07760203, + "epoch": 0.11889188149288188, + "flos": 535016171520.0, + "grad_norm": 0.0639893702788804, + "language_loss": 0.95137906, + "learning_rate": 0.0009794260861521883, + "loss": 0.96252483, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 0.36987305, + "step": 618, + "time_per_iteration": 2.779780387878418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125632, + "balance_loss_mlp": 1.08908224, + "epoch": 0.11908426317814544, + "flos": 498603755520.0, + "grad_norm": 0.062080445707157726, + "language_loss": 0.94238096, + "learning_rate": 0.0009793375444197075, + "loss": 0.95363724, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 0.3659668, + "step": 619, + "time_per_iteration": 2.6269500255584717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159762, + "balance_loss_mlp": 1.12132859, + "epoch": 0.119276644863409, + "flos": 659891681280.0, + "grad_norm": 0.05728911446624217, + "language_loss": 0.93181753, + "learning_rate": 0.000979248816591888, + "loss": 0.94341516, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 0.38452148, + "step": 620, + "time_per_iteration": 2.7879464626312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155629, + "balance_loss_mlp": 1.11600351, + "epoch": 0.11946902654867257, + "flos": 758746621440.0, + "grad_norm": 0.05539388103354017, + "language_loss": 0.93241715, + "learning_rate": 0.0009791599027031766, + "loss": 0.94397342, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 0.39624023, + "step": 621, + "time_per_iteration": 3.058497667312622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152711, + "balance_loss_mlp": 1.11439681, + "epoch": 0.11966140823393613, + "flos": 680999892480.0, + "grad_norm": 0.05959109763307043, + "language_loss": 0.93889141, + "learning_rate": 0.0009790708027880932, + "loss": 0.95041847, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 0.38330078, + "step": 622, + "time_per_iteration": 2.857905864715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217773, + "balance_loss_mlp": 1.17447615, + "epoch": 0.11985378991919969, + "flos": 1451071853568.0, + "grad_norm": 0.033264976771994935, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.78645062, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 0.43359375, + "step": 623, + "time_per_iteration": 4.817517518997192 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130228, + "balance_loss_mlp": 1.09372652, + "epoch": 0.12004617160446325, + "flos": 527848441344.0, + "grad_norm": 0.07130736684785184, + "language_loss": 0.99442542, + "learning_rate": 0.0009788920450172487, + "loss": 1.00572777, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 0.36499023, + "step": 624, + "time_per_iteration": 2.6089231967926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134217, + "balance_loss_mlp": 1.0987401, + "epoch": 0.12023855328972682, + "flos": 474219297792.0, + "grad_norm": 0.053387747347518576, + "language_loss": 0.97139525, + "learning_rate": 0.0009788023872308875, + "loss": 0.98273742, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 0.35522461, + "step": 625, + "time_per_iteration": 2.5482659339904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171918, + "balance_loss_mlp": 1.12614214, + "epoch": 0.12043093497499038, + "flos": 1531771430400.0, + "grad_norm": 0.016755812295179123, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.76600921, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 0.45703125, + "step": 626, + "time_per_iteration": 4.767898797988892 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142548, + "balance_loss_mlp": 1.10609388, + "epoch": 0.12062331666025394, + "flos": 539839323648.0, + "grad_norm": 0.053046953839951706, + "language_loss": 0.99526918, + "learning_rate": 0.0009786225140303285, + "loss": 1.00669467, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 0.36425781, + "step": 627, + "time_per_iteration": 2.666975975036621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145866, + "balance_loss_mlp": 1.10974586, + "epoch": 0.1208156983455175, + "flos": 511906604544.0, + "grad_norm": 0.06539343990980159, + "language_loss": 0.97403502, + "learning_rate": 0.0009785322986859634, + "loss": 0.98549366, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 0.36132812, + "step": 628, + "time_per_iteration": 2.6613006591796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116443, + "balance_loss_mlp": 1.12830925, + "epoch": 0.12100808003078108, + "flos": 596473072128.0, + "grad_norm": 0.05337423256033143, + "language_loss": 0.99038112, + "learning_rate": 0.0009784418975588838, + "loss": 1.00202537, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 0.36108398, + "step": 629, + "time_per_iteration": 2.7266693115234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148011, + "balance_loss_mlp": 1.11248696, + "epoch": 0.12120046171604464, + "flos": 522970960896.0, + "grad_norm": 0.06598420413892771, + "language_loss": 0.97636682, + "learning_rate": 0.0009783513106841862, + "loss": 0.98784697, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 0.35522461, + "step": 630, + "time_per_iteration": 2.7734336853027344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122661, + "balance_loss_mlp": 1.17663717, + "epoch": 0.1213928434013082, + "flos": 1554463249920.0, + "grad_norm": 0.0364602282496576, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.77959311, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 0.5, + "step": 631, + "time_per_iteration": 4.955650091171265 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118689, + "balance_loss_mlp": 1.08283055, + "epoch": 0.12158522508657175, + "flos": 495391749120.0, + "grad_norm": 0.061523486228641615, + "language_loss": 0.94419873, + "learning_rate": 0.0009781695798326854, + "loss": 0.95538557, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 0.35888672, + "step": 632, + "time_per_iteration": 2.6072514057159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111815, + "balance_loss_mlp": 1.08319819, + "epoch": 0.12177760677183531, + "flos": 475585592832.0, + "grad_norm": 0.05761126083629287, + "language_loss": 0.93996418, + "learning_rate": 0.0009780784359264365, + "loss": 0.95114571, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 0.34985352, + "step": 633, + "time_per_iteration": 2.6186299324035645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201074, + "balance_loss_mlp": 1.15548825, + "epoch": 0.12196998845709889, + "flos": 1468458906624.0, + "grad_norm": 0.024414945484573326, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.75389773, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 0.45507812, + "step": 634, + "time_per_iteration": 4.757866144180298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090802, + "balance_loss_mlp": 1.05732846, + "epoch": 0.12216237014236245, + "flos": 586572378624.0, + "grad_norm": 0.05071444395915749, + "language_loss": 0.91919303, + "learning_rate": 0.000977895591329867, + "loss": 0.93010104, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 0.3347168, + "step": 635, + "time_per_iteration": 2.7802233695983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094425, + "balance_loss_mlp": 1.06006885, + "epoch": 0.12235475182762601, + "flos": 597997582848.0, + "grad_norm": 0.05652682698430024, + "language_loss": 0.93613631, + "learning_rate": 0.000977803890710533, + "loss": 0.94708061, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 0.34399414, + "step": 636, + "time_per_iteration": 2.719989538192749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109182, + "balance_loss_mlp": 1.0546267, + "epoch": 0.12254713351288957, + "flos": 497741469696.0, + "grad_norm": 0.05019916823038997, + "language_loss": 0.97873759, + "learning_rate": 0.0009777120045912774, + "loss": 0.98965579, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 0.37231445, + "step": 637, + "time_per_iteration": 2.5960683822631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099212, + "balance_loss_mlp": 1.06139851, + "epoch": 0.12273951519815314, + "flos": 605847361536.0, + "grad_norm": 0.05186361253186237, + "language_loss": 0.97095829, + "learning_rate": 0.0009776199330077736, + "loss": 0.9819504, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 0.37841797, + "step": 638, + "time_per_iteration": 2.7152581214904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088192, + "balance_loss_mlp": 1.05121303, + "epoch": 0.1229318968834167, + "flos": 597859190784.0, + "grad_norm": 0.05467339203371928, + "language_loss": 0.99154645, + "learning_rate": 0.0009775276759957667, + "loss": 1.00242841, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 0.37011719, + "step": 639, + "time_per_iteration": 2.6985981464385986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090176, + "balance_loss_mlp": 1.05465198, + "epoch": 0.12312427856868026, + "flos": 678383299584.0, + "grad_norm": 0.06600893718108056, + "language_loss": 0.97933781, + "learning_rate": 0.0009774352335910745, + "loss": 0.99023956, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 0.35546875, + "step": 640, + "time_per_iteration": 2.813744306564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086602, + "balance_loss_mlp": 1.05298471, + "epoch": 0.12331666025394382, + "flos": 608933458944.0, + "grad_norm": 0.05927901471916764, + "language_loss": 0.99468219, + "learning_rate": 0.000977342605829586, + "loss": 1.00554824, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 0.33642578, + "step": 641, + "time_per_iteration": 2.73280668258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110577, + "balance_loss_mlp": 1.07240582, + "epoch": 0.12350904193920739, + "flos": 762504855552.0, + "grad_norm": 0.07046674646118828, + "language_loss": 0.92099506, + "learning_rate": 0.0009772497927472623, + "loss": 0.93210077, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 0.38183594, + "step": 642, + "time_per_iteration": 3.1258397102355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134817, + "balance_loss_mlp": 1.09514427, + "epoch": 0.12370142362447095, + "flos": 540968481792.0, + "grad_norm": 0.07438352262018386, + "language_loss": 0.93366879, + "learning_rate": 0.0009771567943801368, + "loss": 0.94501698, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 0.3972168, + "step": 643, + "time_per_iteration": 2.6720776557922363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149366, + "balance_loss_mlp": 1.10912085, + "epoch": 0.12389380530973451, + "flos": 548128871424.0, + "grad_norm": 0.055730629552303436, + "language_loss": 0.96261084, + "learning_rate": 0.0009770636107643152, + "loss": 0.97410446, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 0.40234375, + "step": 644, + "time_per_iteration": 2.7093722820281982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144915, + "balance_loss_mlp": 1.10734022, + "epoch": 0.12408618699499807, + "flos": 540308828160.0, + "grad_norm": 0.05250459899213186, + "language_loss": 0.92937833, + "learning_rate": 0.0009769702419359738, + "loss": 0.94082749, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 0.37597656, + "step": 645, + "time_per_iteration": 2.661512613296509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173428, + "balance_loss_mlp": 1.13146591, + "epoch": 0.12427856868026164, + "flos": 745792137216.0, + "grad_norm": 0.052890865129340166, + "language_loss": 0.94770992, + "learning_rate": 0.000976876687931362, + "loss": 0.95944417, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 0.41943359, + "step": 646, + "time_per_iteration": 2.972522258758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164317, + "balance_loss_mlp": 1.12555003, + "epoch": 0.1244709503655252, + "flos": 533716687872.0, + "grad_norm": 0.07033761546633982, + "language_loss": 0.91270661, + "learning_rate": 0.0009767829487868005, + "loss": 0.92434984, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 0.38769531, + "step": 647, + "time_per_iteration": 2.6150805950164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164183, + "balance_loss_mlp": 1.12281775, + "epoch": 0.12466333205078876, + "flos": 508099184640.0, + "grad_norm": 0.07269814667774141, + "language_loss": 0.95938772, + "learning_rate": 0.000976689024538682, + "loss": 0.97102952, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 0.41381836, + "step": 648, + "time_per_iteration": 2.6567764282226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154104, + "balance_loss_mlp": 1.11497951, + "epoch": 0.12485571373605232, + "flos": 681345686016.0, + "grad_norm": 0.06659282576896536, + "language_loss": 0.94783676, + "learning_rate": 0.0009765949152234716, + "loss": 0.95937783, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 0.39135742, + "step": 649, + "time_per_iteration": 2.9032628536224365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118823, + "balance_loss_mlp": 1.15084565, + "epoch": 0.1250480954213159, + "flos": 1330159781376.0, + "grad_norm": 0.027365485913225348, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.79874313, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 0.37304688, + "step": 650, + "time_per_iteration": 4.6781816482543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145487, + "balance_loss_mlp": 1.10395491, + "epoch": 0.12524047710657946, + "flos": 938550758400.0, + "grad_norm": 0.07758701561639549, + "language_loss": 0.88880539, + "learning_rate": 0.0009764061415379919, + "loss": 0.90026021, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 0.41552734, + "step": 651, + "time_per_iteration": 3.2588987350463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137454, + "balance_loss_mlp": 1.09766221, + "epoch": 0.12543285879184302, + "flos": 513893279232.0, + "grad_norm": 0.08409279007421946, + "language_loss": 0.94380724, + "learning_rate": 0.0009763114772410109, + "loss": 0.95518184, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 0.39794922, + "step": 652, + "time_per_iteration": 2.5698702335357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112103, + "balance_loss_mlp": 1.08359814, + "epoch": 0.12562524047710658, + "flos": 718328922624.0, + "grad_norm": 0.056536251661147445, + "language_loss": 0.92061114, + "learning_rate": 0.0009762166280235146, + "loss": 0.93182147, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 0.37451172, + "step": 653, + "time_per_iteration": 2.938668966293335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117513, + "balance_loss_mlp": 1.08191729, + "epoch": 0.12581762216237014, + "flos": 563712431616.0, + "grad_norm": 0.0771848817407848, + "language_loss": 0.94092464, + "learning_rate": 0.0009761215939223267, + "loss": 0.95209974, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 0.35644531, + "step": 654, + "time_per_iteration": 2.7028610706329346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102912, + "balance_loss_mlp": 1.06834149, + "epoch": 0.1260100038476337, + "flos": 481893608448.0, + "grad_norm": 0.07424845664771389, + "language_loss": 0.9475044, + "learning_rate": 0.0009760263749743428, + "loss": 0.95853353, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 0.34570312, + "step": 655, + "time_per_iteration": 2.5710902214050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101838, + "balance_loss_mlp": 1.06771994, + "epoch": 0.12620238553289725, + "flos": 575555010048.0, + "grad_norm": 0.053259035011575195, + "language_loss": 0.94285154, + "learning_rate": 0.0009759309712165299, + "loss": 0.95386994, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 0.34130859, + "step": 656, + "time_per_iteration": 2.70626163482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101868, + "balance_loss_mlp": 1.06858444, + "epoch": 0.12639476721816084, + "flos": 531164335104.0, + "grad_norm": 0.0693418830287988, + "language_loss": 0.9812479, + "learning_rate": 0.0009758353826859272, + "loss": 0.99226654, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 0.33300781, + "step": 657, + "time_per_iteration": 2.566787004470825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110312, + "balance_loss_mlp": 1.0663563, + "epoch": 0.1265871489034244, + "flos": 689968917504.0, + "grad_norm": 0.06782991509763603, + "language_loss": 0.96008623, + "learning_rate": 0.0009757396094196456, + "loss": 0.97111744, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 0.36791992, + "step": 658, + "time_per_iteration": 2.8277065753936768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115911, + "balance_loss_mlp": 1.07926583, + "epoch": 0.12677953058868796, + "flos": 537138667008.0, + "grad_norm": 0.053606842709613675, + "language_loss": 0.89398581, + "learning_rate": 0.0009756436514548673, + "loss": 0.90514493, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 0.36645508, + "step": 659, + "time_per_iteration": 2.796175718307495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120986, + "balance_loss_mlp": 1.0811224, + "epoch": 0.12697191227395152, + "flos": 519022577664.0, + "grad_norm": 0.060525818769901533, + "language_loss": 0.92384607, + "learning_rate": 0.0009755475088288466, + "loss": 0.93505597, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 0.39916992, + "step": 660, + "time_per_iteration": 2.678682804107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133341, + "balance_loss_mlp": 1.09271395, + "epoch": 0.12716429395921508, + "flos": 566605808640.0, + "grad_norm": 0.08191197530717065, + "language_loss": 0.958794, + "learning_rate": 0.0009754511815789095, + "loss": 0.97012746, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 0.40600586, + "step": 661, + "time_per_iteration": 2.7371177673339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130898, + "balance_loss_mlp": 1.09093928, + "epoch": 0.12735667564447864, + "flos": 514103251968.0, + "grad_norm": 0.08687138171908054, + "language_loss": 0.92166948, + "learning_rate": 0.0009753546697424533, + "loss": 0.93297845, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 0.39941406, + "step": 662, + "time_per_iteration": 2.704432249069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125889, + "balance_loss_mlp": 1.08700323, + "epoch": 0.1275490573297422, + "flos": 541282341888.0, + "grad_norm": 0.06194581367760624, + "language_loss": 0.95628935, + "learning_rate": 0.0009752579733569475, + "loss": 0.96754825, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 0.38891602, + "step": 663, + "time_per_iteration": 2.682892084121704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165087, + "balance_loss_mlp": 1.1326623, + "epoch": 0.12774143901500576, + "flos": 1558700900352.0, + "grad_norm": 0.0245621431528993, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.76046479, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 0.32421875, + "step": 664, + "time_per_iteration": 4.981603622436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146598, + "balance_loss_mlp": 1.1060189, + "epoch": 0.12793382070026935, + "flos": 613744128000.0, + "grad_norm": 0.07818489478946229, + "language_loss": 0.96962506, + "learning_rate": 0.0009750640270890217, + "loss": 0.98109102, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 0.40576172, + "step": 665, + "time_per_iteration": 2.7139556407928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139516, + "balance_loss_mlp": 1.10115409, + "epoch": 0.1281262023855329, + "flos": 707731499520.0, + "grad_norm": 0.10418725554084544, + "language_loss": 1.02824736, + "learning_rate": 0.0009749667772818983, + "loss": 1.03964257, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 0.38354492, + "step": 666, + "time_per_iteration": 3.000227689743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148195, + "balance_loss_mlp": 1.11481678, + "epoch": 0.12831858407079647, + "flos": 1425034404864.0, + "grad_norm": 0.027847994605201966, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.78084135, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 0.33398438, + "step": 667, + "time_per_iteration": 4.858838319778442 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161858, + "balance_loss_mlp": 1.1255703, + "epoch": 0.12851096575606002, + "flos": 449098463232.0, + "grad_norm": 0.0747922247275706, + "language_loss": 1.00932169, + "learning_rate": 0.0009747717245101093, + "loss": 1.0209403, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 0.36303711, + "step": 668, + "time_per_iteration": 2.4917514324188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172854, + "balance_loss_mlp": 1.13518405, + "epoch": 0.12870334744132358, + "flos": 479939240448.0, + "grad_norm": 0.0795363237311063, + "language_loss": 0.91087645, + "learning_rate": 0.00097467392162117, + "loss": 0.92260504, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 0.37719727, + "step": 669, + "time_per_iteration": 2.601151466369629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196603, + "balance_loss_mlp": 1.15540457, + "epoch": 0.12889572912658714, + "flos": 638936543232.0, + "grad_norm": 0.0744221392925499, + "language_loss": 0.95630497, + "learning_rate": 0.0009745759344474708, + "loss": 0.96827102, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 0.41162109, + "step": 670, + "time_per_iteration": 2.878068447113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200159, + "balance_loss_mlp": 1.16012812, + "epoch": 0.1290881108118507, + "flos": 509944896000.0, + "grad_norm": 0.07162427386273244, + "language_loss": 0.95158428, + "learning_rate": 0.0009744777630270536, + "loss": 0.96358585, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 0.40063477, + "step": 671, + "time_per_iteration": 2.5778517723083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220294, + "balance_loss_mlp": 1.17752171, + "epoch": 0.12928049249711426, + "flos": 671054782464.0, + "grad_norm": 0.07459259564874297, + "language_loss": 0.99775112, + "learning_rate": 0.000974379407398032, + "loss": 1.00995398, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 0.42797852, + "step": 672, + "time_per_iteration": 2.862168073654175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191716, + "balance_loss_mlp": 1.15175724, + "epoch": 0.12947287418237785, + "flos": 793525870080.0, + "grad_norm": 0.05795101219152752, + "language_loss": 0.86696863, + "learning_rate": 0.0009742808675985913, + "loss": 0.87888587, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 0.3996582, + "step": 673, + "time_per_iteration": 3.0987160205841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011832, + "balance_loss_mlp": 1.14142871, + "epoch": 0.1296652558676414, + "flos": 485466462720.0, + "grad_norm": 0.06292984682523013, + "language_loss": 0.96893597, + "learning_rate": 0.0009741821436669876, + "loss": 0.98076797, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 0.41772461, + "step": 674, + "time_per_iteration": 2.565317153930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160814, + "balance_loss_mlp": 1.12123656, + "epoch": 0.12985763755290497, + "flos": 453459451392.0, + "grad_norm": 0.07127578315040689, + "language_loss": 0.99621803, + "learning_rate": 0.0009740832356415492, + "loss": 1.00782621, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 0.39550781, + "step": 675, + "time_per_iteration": 2.4777724742889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144275, + "balance_loss_mlp": 1.10538852, + "epoch": 0.13005001923816853, + "flos": 825061178880.0, + "grad_norm": 0.07563598794059366, + "language_loss": 0.94837546, + "learning_rate": 0.0009739841435606756, + "loss": 0.95981824, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 0.38867188, + "step": 676, + "time_per_iteration": 2.9838767051696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131112, + "balance_loss_mlp": 1.09186864, + "epoch": 0.1302424009234321, + "flos": 531381648384.0, + "grad_norm": 0.06693149578557214, + "language_loss": 0.94293654, + "learning_rate": 0.0009738848674628377, + "loss": 0.95424765, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 0.39233398, + "step": 677, + "time_per_iteration": 2.7052054405212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130656, + "balance_loss_mlp": 1.0923903, + "epoch": 0.13043478260869565, + "flos": 525884161536.0, + "grad_norm": 0.05501746541124835, + "language_loss": 0.94784498, + "learning_rate": 0.000973785407386578, + "loss": 0.95915151, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 0.38232422, + "step": 678, + "time_per_iteration": 2.7535152435302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137257, + "balance_loss_mlp": 1.09727383, + "epoch": 0.1306271642939592, + "flos": 626172208128.0, + "grad_norm": 0.05430769504454563, + "language_loss": 0.91185606, + "learning_rate": 0.0009736857633705103, + "loss": 0.92322862, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 0.3996582, + "step": 679, + "time_per_iteration": 2.8686013221740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135266, + "balance_loss_mlp": 1.09575987, + "epoch": 0.13081954597922277, + "flos": 550718300160.0, + "grad_norm": 0.06387426976514826, + "language_loss": 0.97335434, + "learning_rate": 0.0009735859354533196, + "loss": 0.984707, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 0.39501953, + "step": 680, + "time_per_iteration": 2.6952273845672607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136459, + "balance_loss_mlp": 1.09626174, + "epoch": 0.13101192766448633, + "flos": 536911441920.0, + "grad_norm": 0.07637025474680663, + "language_loss": 0.97434723, + "learning_rate": 0.0009734859236737628, + "loss": 0.98571181, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 0.40185547, + "step": 681, + "time_per_iteration": 2.607431173324585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136827, + "balance_loss_mlp": 1.09720194, + "epoch": 0.13120430934974991, + "flos": 503508400128.0, + "grad_norm": 0.06515090437153119, + "language_loss": 0.9831785, + "learning_rate": 0.0009733857280706678, + "loss": 0.99454683, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 0.39599609, + "step": 682, + "time_per_iteration": 2.5730957984924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140739, + "balance_loss_mlp": 1.1007328, + "epoch": 0.13139669103501347, + "flos": 614295124992.0, + "grad_norm": 0.08408851923922504, + "language_loss": 0.89817083, + "learning_rate": 0.000973285348682934, + "loss": 0.90957826, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 0.39990234, + "step": 683, + "time_per_iteration": 2.7041609287261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120077, + "balance_loss_mlp": 1.08460057, + "epoch": 0.13158907272027703, + "flos": 1484971564032.0, + "grad_norm": 0.021197399820989362, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.7901845, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 0.35546875, + "step": 684, + "time_per_iteration": 4.7803051471710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145399, + "balance_loss_mlp": 1.10579789, + "epoch": 0.1317814544055406, + "flos": 985461852672.0, + "grad_norm": 0.06796914093678033, + "language_loss": 0.90116858, + "learning_rate": 0.0009730840387095046, + "loss": 0.91262257, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 0.39575195, + "step": 685, + "time_per_iteration": 3.289513111114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154301, + "balance_loss_mlp": 1.11412716, + "epoch": 0.13197383609080415, + "flos": 611456076288.0, + "grad_norm": 0.0690044047280534, + "language_loss": 0.95956922, + "learning_rate": 0.0009729831082019642, + "loss": 0.97111225, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 0.40185547, + "step": 686, + "time_per_iteration": 2.8214356899261475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131343, + "balance_loss_mlp": 1.09383941, + "epoch": 0.1321662177760677, + "flos": 494403181056.0, + "grad_norm": 0.08080780289155233, + "language_loss": 0.93596351, + "learning_rate": 0.0009728819940660958, + "loss": 0.94727689, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 0.375, + "step": 687, + "time_per_iteration": 2.749385118484497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011246, + "balance_loss_mlp": 1.08542764, + "epoch": 0.13235859946133127, + "flos": 495841430016.0, + "grad_norm": 0.08853955851107219, + "language_loss": 0.91695315, + "learning_rate": 0.0009727806963411557, + "loss": 0.92819917, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 0.39135742, + "step": 688, + "time_per_iteration": 2.592099666595459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128329, + "balance_loss_mlp": 1.08777368, + "epoch": 0.13255098114659483, + "flos": 511686720000.0, + "grad_norm": 0.06370494383790047, + "language_loss": 0.92130053, + "learning_rate": 0.000972679215066471, + "loss": 0.93258381, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 0.40551758, + "step": 689, + "time_per_iteration": 2.7344043254852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114145, + "balance_loss_mlp": 1.10246885, + "epoch": 0.13274336283185842, + "flos": 547370472960.0, + "grad_norm": 0.08478699193898473, + "language_loss": 1.04583168, + "learning_rate": 0.0009725775502814401, + "loss": 1.05724621, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 0.3894043, + "step": 690, + "time_per_iteration": 2.5881311893463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155383, + "balance_loss_mlp": 1.1147325, + "epoch": 0.13293574451712198, + "flos": 640772342784.0, + "grad_norm": 0.07994389842197654, + "language_loss": 0.90077579, + "learning_rate": 0.0009724757020255327, + "loss": 0.91232961, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 0.40649414, + "step": 691, + "time_per_iteration": 2.8452539443969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164889, + "balance_loss_mlp": 1.12566948, + "epoch": 0.13312812620238554, + "flos": 491480441856.0, + "grad_norm": 0.09039906445052394, + "language_loss": 0.91914684, + "learning_rate": 0.0009723736703382902, + "loss": 0.93079573, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 0.39208984, + "step": 692, + "time_per_iteration": 2.5472824573516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198661, + "balance_loss_mlp": 1.15557849, + "epoch": 0.1333205078876491, + "flos": 508944218112.0, + "grad_norm": 0.07689546631051256, + "language_loss": 0.86461794, + "learning_rate": 0.0009722714552593244, + "loss": 0.87660456, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 0.4309082, + "step": 693, + "time_per_iteration": 2.6273465156555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199876, + "balance_loss_mlp": 1.15560198, + "epoch": 0.13351288957291266, + "flos": 418697455104.0, + "grad_norm": 0.08142665414192346, + "language_loss": 1.00438499, + "learning_rate": 0.000972169056828319, + "loss": 1.01638389, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 0.44262695, + "step": 694, + "time_per_iteration": 2.477491617202759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221806, + "balance_loss_mlp": 1.17741275, + "epoch": 0.13370527125817622, + "flos": 615901128192.0, + "grad_norm": 0.07001491486919184, + "language_loss": 0.90590984, + "learning_rate": 0.0009720664750850283, + "loss": 0.91812789, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 0.4440918, + "step": 695, + "time_per_iteration": 2.7817704677581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209285, + "balance_loss_mlp": 1.16870594, + "epoch": 0.13389765294343978, + "flos": 626038958592.0, + "grad_norm": 0.07077521288835904, + "language_loss": 0.97240067, + "learning_rate": 0.0009719637100692784, + "loss": 0.98449349, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 0.40625, + "step": 696, + "time_per_iteration": 2.7099833488464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214589, + "balance_loss_mlp": 1.17069626, + "epoch": 0.13409003462870334, + "flos": 609691857408.0, + "grad_norm": 0.06395797985697109, + "language_loss": 0.87399805, + "learning_rate": 0.0009718607618209661, + "loss": 0.88614392, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 0.43896484, + "step": 697, + "time_per_iteration": 2.8280160427093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226846, + "balance_loss_mlp": 1.18445516, + "epoch": 0.13428241631396692, + "flos": 683816546304.0, + "grad_norm": 0.08853583224950028, + "language_loss": 0.91527486, + "learning_rate": 0.0009717576303800595, + "loss": 0.92754334, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 0.42382812, + "step": 698, + "time_per_iteration": 3.0102553367614746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206724, + "balance_loss_mlp": 1.16385674, + "epoch": 0.13447479799923048, + "flos": 508815737856.0, + "grad_norm": 0.07140979809376953, + "language_loss": 0.90443981, + "learning_rate": 0.0009716543157865975, + "loss": 0.91650712, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 0.4284668, + "step": 699, + "time_per_iteration": 2.713191509246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192988, + "balance_loss_mlp": 1.15047789, + "epoch": 0.13466717968449404, + "flos": 897510481920.0, + "grad_norm": 0.0971528894423257, + "language_loss": 0.87731719, + "learning_rate": 0.0009715508180806907, + "loss": 0.88924706, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 0.42504883, + "step": 700, + "time_per_iteration": 3.183608055114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164502, + "balance_loss_mlp": 1.12189686, + "epoch": 0.1348595613697576, + "flos": 989938838016.0, + "grad_norm": 0.07253928509691168, + "language_loss": 0.94940412, + "learning_rate": 0.0009714471373025202, + "loss": 0.96104908, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 0.42578125, + "step": 701, + "time_per_iteration": 3.4071736335754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154491, + "balance_loss_mlp": 1.10978746, + "epoch": 0.13505194305502116, + "flos": 487826095104.0, + "grad_norm": 0.07349692890686976, + "language_loss": 0.93387866, + "learning_rate": 0.0009713432734923386, + "loss": 0.94542348, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 0.44702148, + "step": 702, + "time_per_iteration": 2.61545467376709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149917, + "balance_loss_mlp": 1.10523736, + "epoch": 0.13524432474028472, + "flos": 613385851392.0, + "grad_norm": 0.07475145021416552, + "language_loss": 0.90919894, + "learning_rate": 0.0009712392266904696, + "loss": 0.92069811, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 0.44702148, + "step": 703, + "time_per_iteration": 2.739295482635498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156115, + "balance_loss_mlp": 1.11219811, + "epoch": 0.13543670642554828, + "flos": 904794582528.0, + "grad_norm": 0.09690331363255131, + "language_loss": 0.90325272, + "learning_rate": 0.0009711349969373076, + "loss": 0.91481388, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 0.43945312, + "step": 704, + "time_per_iteration": 3.1653053760528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175158, + "balance_loss_mlp": 1.12780786, + "epoch": 0.13562908811081184, + "flos": 550616984064.0, + "grad_norm": 0.09111648779989767, + "language_loss": 0.84997714, + "learning_rate": 0.0009710305842733178, + "loss": 0.86172873, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 0.47314453, + "step": 705, + "time_per_iteration": 2.7402727603912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117117, + "balance_loss_mlp": 1.12737262, + "epoch": 0.1358214697960754, + "flos": 508044856320.0, + "grad_norm": 0.10189351673448747, + "language_loss": 0.9379847, + "learning_rate": 0.0009709259887390373, + "loss": 0.94969636, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 0.43774414, + "step": 706, + "time_per_iteration": 2.5640039443969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147043, + "balance_loss_mlp": 1.10467625, + "epoch": 0.136013851481339, + "flos": 528896107008.0, + "grad_norm": 0.07946562356881365, + "language_loss": 0.95178437, + "learning_rate": 0.0009708212103750737, + "loss": 0.96325481, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 0.42382812, + "step": 707, + "time_per_iteration": 2.6138036251068115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153312, + "balance_loss_mlp": 1.1095618, + "epoch": 0.13620623316660255, + "flos": 659081152512.0, + "grad_norm": 0.07708082078191984, + "language_loss": 0.91549516, + "learning_rate": 0.0009707162492221051, + "loss": 0.9270283, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 0.43725586, + "step": 708, + "time_per_iteration": 2.879612684249878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143626, + "balance_loss_mlp": 1.10121179, + "epoch": 0.1363986148518661, + "flos": 671882563584.0, + "grad_norm": 0.08764140181907645, + "language_loss": 0.92509496, + "learning_rate": 0.0009706111053208815, + "loss": 0.93653119, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 0.42431641, + "step": 709, + "time_per_iteration": 2.804469347000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156089, + "balance_loss_mlp": 1.10947847, + "epoch": 0.13659099653712967, + "flos": 473062975488.0, + "grad_norm": 0.07097269092186763, + "language_loss": 0.89579999, + "learning_rate": 0.0009705057787122232, + "loss": 0.90736091, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 0.46630859, + "step": 710, + "time_per_iteration": 2.568406105041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174212, + "balance_loss_mlp": 1.12874603, + "epoch": 0.13678337822239323, + "flos": 452715734016.0, + "grad_norm": 0.06463299548184855, + "language_loss": 0.94250202, + "learning_rate": 0.0009704002694370216, + "loss": 0.9542442, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 0.45410156, + "step": 711, + "time_per_iteration": 2.525240659713745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116324, + "balance_loss_mlp": 1.11820245, + "epoch": 0.13697575990765679, + "flos": 519623133696.0, + "grad_norm": 0.06677275778781674, + "language_loss": 0.90675253, + "learning_rate": 0.0009702945775362388, + "loss": 0.91838491, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 0.45043945, + "step": 712, + "time_per_iteration": 2.572566270828247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171995, + "balance_loss_mlp": 1.12478852, + "epoch": 0.13716814159292035, + "flos": 480388921344.0, + "grad_norm": 0.06549167744569931, + "language_loss": 0.91151595, + "learning_rate": 0.0009701887030509086, + "loss": 0.92323589, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 0.47167969, + "step": 713, + "time_per_iteration": 2.645202875137329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156874, + "balance_loss_mlp": 1.11450684, + "epoch": 0.1373605232781839, + "flos": 545650670592.0, + "grad_norm": 0.07696267649297317, + "language_loss": 0.95333648, + "learning_rate": 0.0009700826460221346, + "loss": 0.96490526, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 0.42382812, + "step": 714, + "time_per_iteration": 2.649831771850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187773, + "balance_loss_mlp": 1.13980293, + "epoch": 0.1375529049634475, + "flos": 708791648256.0, + "grad_norm": 0.08597126409557068, + "language_loss": 0.96336859, + "learning_rate": 0.0009699764064910921, + "loss": 0.97524625, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 0.47998047, + "step": 715, + "time_per_iteration": 2.8645238876342773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178216, + "balance_loss_mlp": 1.1317718, + "epoch": 0.13774528664871105, + "flos": 486696936960.0, + "grad_norm": 0.08366808602410432, + "language_loss": 0.90892398, + "learning_rate": 0.0009698699844990268, + "loss": 0.92070615, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 0.46435547, + "step": 716, + "time_per_iteration": 2.635460376739502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171183, + "balance_loss_mlp": 1.12731409, + "epoch": 0.1379376683339746, + "flos": 680199275520.0, + "grad_norm": 0.051528021496160425, + "language_loss": 0.91132116, + "learning_rate": 0.0009697633800872555, + "loss": 0.923033, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 0.4387207, + "step": 717, + "time_per_iteration": 2.887854814529419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189757, + "balance_loss_mlp": 1.1432178, + "epoch": 0.13813005001923817, + "flos": 610946924544.0, + "grad_norm": 0.07388540586481528, + "language_loss": 0.94422555, + "learning_rate": 0.0009696565932971655, + "loss": 0.95612311, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 0.46557617, + "step": 718, + "time_per_iteration": 2.8565313816070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171127, + "balance_loss_mlp": 1.12580407, + "epoch": 0.13832243170450173, + "flos": 588729378816.0, + "grad_norm": 0.06166568969162735, + "language_loss": 0.92794299, + "learning_rate": 0.0009695496241702153, + "loss": 0.93965423, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 0.45361328, + "step": 719, + "time_per_iteration": 2.827193021774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178611, + "balance_loss_mlp": 1.13152349, + "epoch": 0.1385148133897653, + "flos": 700002860544.0, + "grad_norm": 0.07046673128739296, + "language_loss": 0.8903814, + "learning_rate": 0.0009694424727479339, + "loss": 0.9021675, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 0.47094727, + "step": 720, + "time_per_iteration": 2.958855628967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011684, + "balance_loss_mlp": 1.12150323, + "epoch": 0.13870719507502885, + "flos": 598254543360.0, + "grad_norm": 0.07332050167219753, + "language_loss": 0.91946507, + "learning_rate": 0.0009693351390719213, + "loss": 0.93114913, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 0.46899414, + "step": 721, + "time_per_iteration": 2.6910197734832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012083, + "balance_loss_mlp": 1.15742183, + "epoch": 0.1388995767602924, + "flos": 586572378624.0, + "grad_norm": 0.06188248769550966, + "language_loss": 0.93531096, + "learning_rate": 0.000969227623183848, + "loss": 0.94739395, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 0.50830078, + "step": 722, + "time_per_iteration": 2.791097640991211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119483, + "balance_loss_mlp": 1.14776587, + "epoch": 0.139091958445556, + "flos": 651120145920.0, + "grad_norm": 0.06666345220966835, + "language_loss": 0.93550557, + "learning_rate": 0.0009691199251254554, + "loss": 0.94745386, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 0.47045898, + "step": 723, + "time_per_iteration": 2.8282151222229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173107, + "balance_loss_mlp": 1.13059711, + "epoch": 0.13928434013081956, + "flos": 575737818624.0, + "grad_norm": 0.07191970231420823, + "language_loss": 0.88703346, + "learning_rate": 0.0009690120449385555, + "loss": 0.89876461, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 0.42504883, + "step": 724, + "time_per_iteration": 2.775456190109253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158197, + "balance_loss_mlp": 1.11332655, + "epoch": 0.13947672181608312, + "flos": 563225674752.0, + "grad_norm": 0.06680700276551169, + "language_loss": 0.95181078, + "learning_rate": 0.0009689039826650312, + "loss": 0.96339279, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 0.44824219, + "step": 725, + "time_per_iteration": 2.7623417377471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164951, + "balance_loss_mlp": 1.12756717, + "epoch": 0.13966910350134668, + "flos": 1521546964992.0, + "grad_norm": 0.03995326528410751, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.77688015, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 0.37304688, + "step": 726, + "time_per_iteration": 4.914167642593384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146003, + "balance_loss_mlp": 1.09567261, + "epoch": 0.13986148518661023, + "flos": 499854053376.0, + "grad_norm": 0.07822541163530779, + "language_loss": 0.90488958, + "learning_rate": 0.0009686873120259941, + "loss": 0.91634959, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 0.50341797, + "step": 727, + "time_per_iteration": 2.563333749771118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132914, + "balance_loss_mlp": 1.09092879, + "epoch": 0.1400538668718738, + "flos": 598674488832.0, + "grad_norm": 0.0725242002086287, + "language_loss": 0.89161742, + "learning_rate": 0.0009685787037446004, + "loss": 0.90294659, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 0.41992188, + "step": 728, + "time_per_iteration": 2.7803192138671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137226, + "balance_loss_mlp": 1.09192598, + "epoch": 0.14024624855713735, + "flos": 594039287808.0, + "grad_norm": 0.10183800223701604, + "language_loss": 0.9064362, + "learning_rate": 0.0009684699135448201, + "loss": 0.91780847, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 0.453125, + "step": 729, + "time_per_iteration": 2.750023603439331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142476, + "balance_loss_mlp": 1.0995841, + "epoch": 0.1404386302424009, + "flos": 506584585728.0, + "grad_norm": 0.06503689668024501, + "language_loss": 0.94054115, + "learning_rate": 0.0009683609414688895, + "loss": 0.95196593, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 0.42895508, + "step": 730, + "time_per_iteration": 2.708470344543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116576, + "balance_loss_mlp": 1.11652613, + "epoch": 0.14063101192766447, + "flos": 573407921664.0, + "grad_norm": 0.07277464462784268, + "language_loss": 0.89072424, + "learning_rate": 0.0009682517875591154, + "loss": 0.9023819, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 0.49243164, + "step": 731, + "time_per_iteration": 2.734145402908325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173563, + "balance_loss_mlp": 1.12640429, + "epoch": 0.14082339361292806, + "flos": 564619133952.0, + "grad_norm": 0.08810260071203486, + "language_loss": 0.88790858, + "learning_rate": 0.0009681424518578749, + "loss": 0.8996442, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 0.47192383, + "step": 732, + "time_per_iteration": 2.7139203548431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166119, + "balance_loss_mlp": 1.11900759, + "epoch": 0.14101577529819162, + "flos": 463584798720.0, + "grad_norm": 0.07053265121681873, + "language_loss": 0.9010576, + "learning_rate": 0.000968032934407616, + "loss": 0.91271877, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 0.47143555, + "step": 733, + "time_per_iteration": 2.625128746032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161975, + "balance_loss_mlp": 1.11514974, + "epoch": 0.14120815698345518, + "flos": 596085060096.0, + "grad_norm": 0.08143861058365946, + "language_loss": 0.84579933, + "learning_rate": 0.0009679232352508571, + "loss": 0.85741913, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 0.46850586, + "step": 734, + "time_per_iteration": 2.7461798191070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145124, + "balance_loss_mlp": 1.10046864, + "epoch": 0.14140053866871874, + "flos": 535137311232.0, + "grad_norm": 0.0788084271092868, + "language_loss": 0.83272535, + "learning_rate": 0.0009678133544301871, + "loss": 0.84417665, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 0.44677734, + "step": 735, + "time_per_iteration": 2.68129301071167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130971, + "balance_loss_mlp": 1.08731616, + "epoch": 0.1415929203539823, + "flos": 520265534976.0, + "grad_norm": 0.05044431767963513, + "language_loss": 0.93706036, + "learning_rate": 0.0009677032919882658, + "loss": 0.94837004, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 0.43652344, + "step": 736, + "time_per_iteration": 2.663874387741089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141167, + "balance_loss_mlp": 1.0970124, + "epoch": 0.14178530203924586, + "flos": 482335948800.0, + "grad_norm": 0.07155994363363784, + "language_loss": 0.94151366, + "learning_rate": 0.000967593047967823, + "loss": 0.95292532, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 0.44116211, + "step": 737, + "time_per_iteration": 2.512871265411377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150426, + "balance_loss_mlp": 1.10376751, + "epoch": 0.14197768372450942, + "flos": 676638904320.0, + "grad_norm": 0.07145762863961741, + "language_loss": 0.89657855, + "learning_rate": 0.0009674826224116593, + "loss": 0.90808284, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 0.46655273, + "step": 738, + "time_per_iteration": 2.797337293624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145365, + "balance_loss_mlp": 1.09865868, + "epoch": 0.14217006540977298, + "flos": 446039529984.0, + "grad_norm": 0.07589062836694223, + "language_loss": 0.89765012, + "learning_rate": 0.0009673720153626455, + "loss": 0.90910375, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 0.46728516, + "step": 739, + "time_per_iteration": 2.5743062496185303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113843, + "balance_loss_mlp": 1.09274864, + "epoch": 0.14236244709503657, + "flos": 496503654912.0, + "grad_norm": 0.07239717331604524, + "language_loss": 0.89863205, + "learning_rate": 0.0009672612268637235, + "loss": 0.9100163, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 0.45678711, + "step": 740, + "time_per_iteration": 2.6074059009552 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125723, + "balance_loss_mlp": 1.08125818, + "epoch": 0.14255482878030012, + "flos": 648313403904.0, + "grad_norm": 0.08552249660547784, + "language_loss": 0.8725301, + "learning_rate": 0.0009671502569579048, + "loss": 0.88378727, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 0.44458008, + "step": 741, + "time_per_iteration": 2.729733467102051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116563, + "balance_loss_mlp": 1.07338512, + "epoch": 0.14274721046556368, + "flos": 536165153280.0, + "grad_norm": 0.05753110737252733, + "language_loss": 0.92330521, + "learning_rate": 0.0009670391056882719, + "loss": 0.93447083, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 0.43188477, + "step": 742, + "time_per_iteration": 2.69399356842041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115871, + "balance_loss_mlp": 1.07367063, + "epoch": 0.14293959215082724, + "flos": 957057431040.0, + "grad_norm": 0.06711892894426404, + "language_loss": 0.91615599, + "learning_rate": 0.0009669277730979776, + "loss": 0.92731464, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 0.421875, + "step": 743, + "time_per_iteration": 3.1732802391052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123454, + "balance_loss_mlp": 1.079561, + "epoch": 0.1431319738360908, + "flos": 693089519616.0, + "grad_norm": 0.07488288596065623, + "language_loss": 0.88249421, + "learning_rate": 0.0009668162592302449, + "loss": 0.89372879, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 0.43896484, + "step": 744, + "time_per_iteration": 2.88962459564209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114179, + "balance_loss_mlp": 1.09551311, + "epoch": 0.14332435552135436, + "flos": 565439574528.0, + "grad_norm": 0.08170086657731683, + "language_loss": 0.8873378, + "learning_rate": 0.0009667045641283676, + "loss": 0.89875567, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 0.46289062, + "step": 745, + "time_per_iteration": 2.6374380588531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136601, + "balance_loss_mlp": 1.09158731, + "epoch": 0.14351673720661792, + "flos": 738374787072.0, + "grad_norm": 0.07376324969806651, + "language_loss": 0.9752661, + "learning_rate": 0.0009665926878357092, + "loss": 0.98663211, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 0.44995117, + "step": 746, + "time_per_iteration": 2.908377170562744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138589, + "balance_loss_mlp": 1.09283662, + "epoch": 0.14370911889188148, + "flos": 549230865408.0, + "grad_norm": 0.055840413500964095, + "language_loss": 0.93229979, + "learning_rate": 0.0009664806303957043, + "loss": 0.94368571, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 0.45751953, + "step": 747, + "time_per_iteration": 2.6940197944641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116452, + "balance_loss_mlp": 1.11397541, + "epoch": 0.14390150057714507, + "flos": 590295734784.0, + "grad_norm": 0.07422855656653271, + "language_loss": 0.89923358, + "learning_rate": 0.0009663683918518571, + "loss": 0.91087878, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 0.50463867, + "step": 748, + "time_per_iteration": 2.8905599117279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162632, + "balance_loss_mlp": 1.10977423, + "epoch": 0.14409388226240863, + "flos": 591047165952.0, + "grad_norm": 0.06951396400432043, + "language_loss": 0.88074797, + "learning_rate": 0.0009662559722477428, + "loss": 0.89237428, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 0.52880859, + "step": 749, + "time_per_iteration": 2.6684882640838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111749, + "balance_loss_mlp": 1.09059644, + "epoch": 0.1442862639476722, + "flos": 1511263401984.0, + "grad_norm": 0.031134761916572575, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77280462, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 0.26953125, + "step": 750, + "time_per_iteration": 4.978729009628296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141518, + "balance_loss_mlp": 1.09359622, + "epoch": 0.14447864563293575, + "flos": 496765384704.0, + "grad_norm": 0.06451546089111031, + "language_loss": 0.9124738, + "learning_rate": 0.0009660305900333632, + "loss": 0.92388898, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 0.47973633, + "step": 751, + "time_per_iteration": 2.6556403636932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145582, + "balance_loss_mlp": 1.09849465, + "epoch": 0.1446710273181993, + "flos": 589678299648.0, + "grad_norm": 0.08083819383046088, + "language_loss": 0.8480792, + "learning_rate": 0.0009659176275105992, + "loss": 0.85953498, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 0.47070312, + "step": 752, + "time_per_iteration": 2.6868016719818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154886, + "balance_loss_mlp": 1.10667825, + "epoch": 0.14486340900346287, + "flos": 585818749440.0, + "grad_norm": 0.0601727082776222, + "language_loss": 0.87400204, + "learning_rate": 0.0009658044841025701, + "loss": 0.88555086, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 0.48217773, + "step": 753, + "time_per_iteration": 2.7701456546783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189813, + "balance_loss_mlp": 1.136765, + "epoch": 0.14505579068872643, + "flos": 504672062976.0, + "grad_norm": 0.0800468655776831, + "language_loss": 0.83957088, + "learning_rate": 0.0009656911598532021, + "loss": 0.85146904, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 0.53051758, + "step": 754, + "time_per_iteration": 2.630211353302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192149, + "balance_loss_mlp": 1.13943434, + "epoch": 0.14524817237399, + "flos": 486815505408.0, + "grad_norm": 0.0631545589319864, + "language_loss": 0.9278729, + "learning_rate": 0.0009655776548064917, + "loss": 0.93979442, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 0.52758789, + "step": 755, + "time_per_iteration": 2.6447510719299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176758, + "balance_loss_mlp": 1.12506902, + "epoch": 0.14544055405925355, + "flos": 728175287808.0, + "grad_norm": 0.06497808848967317, + "language_loss": 0.90460694, + "learning_rate": 0.0009654639690065054, + "loss": 0.91637456, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 0.51708984, + "step": 756, + "time_per_iteration": 2.910578727722168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116602, + "balance_loss_mlp": 1.11785972, + "epoch": 0.14563293574451713, + "flos": 593643935232.0, + "grad_norm": 0.0580393303136577, + "language_loss": 0.90340179, + "learning_rate": 0.00096535010249738, + "loss": 0.91506201, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 0.48120117, + "step": 757, + "time_per_iteration": 2.7232277393341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149847, + "balance_loss_mlp": 1.10092402, + "epoch": 0.1458253174297807, + "flos": 560478030336.0, + "grad_norm": 0.07370663524734816, + "language_loss": 0.8531146, + "learning_rate": 0.0009652360553233224, + "loss": 0.86461306, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 0.48901367, + "step": 758, + "time_per_iteration": 2.7501397132873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064081, + "balance_loss_mlp": 1.03528047, + "epoch": 0.14601769911504425, + "flos": 1557855866880.0, + "grad_norm": 0.02263224740377231, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.74837828, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 0.28710938, + "step": 759, + "time_per_iteration": 4.953639268875122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150341, + "balance_loss_mlp": 1.1019187, + "epoch": 0.1462100808003078, + "flos": 866301516288.0, + "grad_norm": 0.05750780582661247, + "language_loss": 0.83513778, + "learning_rate": 0.0009650074191575883, + "loss": 0.84664118, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 0.48388672, + "step": 760, + "time_per_iteration": 3.202252149581909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152626, + "balance_loss_mlp": 1.10179496, + "epoch": 0.14640246248557137, + "flos": 522943796736.0, + "grad_norm": 0.05303129095981597, + "language_loss": 0.88240772, + "learning_rate": 0.0009648928302546766, + "loss": 0.89393395, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 0.50878906, + "step": 761, + "time_per_iteration": 2.65380859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147286, + "balance_loss_mlp": 1.09960222, + "epoch": 0.14659484417083493, + "flos": 1030544487936.0, + "grad_norm": 0.06114398209353547, + "language_loss": 0.87573165, + "learning_rate": 0.0009647780608643613, + "loss": 0.88720453, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 0.47705078, + "step": 762, + "time_per_iteration": 3.3394339084625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153999, + "balance_loss_mlp": 1.10831833, + "epoch": 0.1467872258560985, + "flos": 500671922688.0, + "grad_norm": 0.09093438426480749, + "language_loss": 0.90765309, + "learning_rate": 0.0009646631110312001, + "loss": 0.91919315, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 0.45678711, + "step": 763, + "time_per_iteration": 2.622671604156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157806, + "balance_loss_mlp": 1.11200595, + "epoch": 0.14697960754136205, + "flos": 547797758976.0, + "grad_norm": 0.047784585244551814, + "language_loss": 0.90468627, + "learning_rate": 0.0009645479807998203, + "loss": 0.91626436, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 0.45751953, + "step": 764, + "time_per_iteration": 2.7322580814361572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156125, + "balance_loss_mlp": 1.11487842, + "epoch": 0.14717198922662564, + "flos": 517849003008.0, + "grad_norm": 0.06523928090243644, + "language_loss": 0.94106412, + "learning_rate": 0.0009644326702149196, + "loss": 0.95262539, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 0.41235352, + "step": 765, + "time_per_iteration": 2.7013158798217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174535, + "balance_loss_mlp": 1.12761474, + "epoch": 0.1473643709118892, + "flos": 732024552960.0, + "grad_norm": 0.08055574364553787, + "language_loss": 0.86730242, + "learning_rate": 0.0009643171793212653, + "loss": 0.87904775, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 0.46923828, + "step": 766, + "time_per_iteration": 3.083709478378296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162662, + "balance_loss_mlp": 1.11473966, + "epoch": 0.14755675259715276, + "flos": 620538900480.0, + "grad_norm": 0.07722330054572468, + "language_loss": 0.92188174, + "learning_rate": 0.0009642015081636952, + "loss": 0.93350834, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 0.47949219, + "step": 767, + "time_per_iteration": 2.6836585998535156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161372, + "balance_loss_mlp": 1.1132586, + "epoch": 0.14774913428241632, + "flos": 452219065344.0, + "grad_norm": 0.07123168873353844, + "language_loss": 0.90995437, + "learning_rate": 0.0009640856567871166, + "loss": 0.9215681, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 0.48168945, + "step": 768, + "time_per_iteration": 2.543670177459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156907, + "balance_loss_mlp": 1.10626745, + "epoch": 0.14794151596767988, + "flos": 837234869760.0, + "grad_norm": 0.07039727350928661, + "language_loss": 0.9123286, + "learning_rate": 0.0009639696252365072, + "loss": 0.92389768, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 0.50634766, + "step": 769, + "time_per_iteration": 3.027188539505005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146634, + "balance_loss_mlp": 1.10326576, + "epoch": 0.14813389765294344, + "flos": 686092114944.0, + "grad_norm": 0.06094559984807647, + "language_loss": 0.83659029, + "learning_rate": 0.0009638534135569144, + "loss": 0.84805667, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 0.43359375, + "step": 770, + "time_per_iteration": 2.9126267433166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140788, + "balance_loss_mlp": 1.09489226, + "epoch": 0.148326279338207, + "flos": 509887996416.0, + "grad_norm": 0.06702358278695762, + "language_loss": 0.92293191, + "learning_rate": 0.0009637370217934554, + "loss": 0.93433982, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 0.45922852, + "step": 771, + "time_per_iteration": 2.6426541805267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129633, + "balance_loss_mlp": 1.08600211, + "epoch": 0.14851866102347056, + "flos": 588161129472.0, + "grad_norm": 0.04968709901212579, + "language_loss": 0.84857935, + "learning_rate": 0.0009636204499913175, + "loss": 0.85987568, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 0.43603516, + "step": 772, + "time_per_iteration": 2.830029010772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122236, + "balance_loss_mlp": 1.08478057, + "epoch": 0.14871104270873411, + "flos": 691026494976.0, + "grad_norm": 0.06444605868824185, + "language_loss": 0.90028566, + "learning_rate": 0.0009635036981957581, + "loss": 0.91150796, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 0.37451172, + "step": 773, + "time_per_iteration": 2.850893259048462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128047, + "balance_loss_mlp": 1.08546507, + "epoch": 0.1489034243939977, + "flos": 655098264576.0, + "grad_norm": 0.07558916443605426, + "language_loss": 0.92137265, + "learning_rate": 0.0009633867664521043, + "loss": 0.93265319, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 0.42553711, + "step": 774, + "time_per_iteration": 2.8405416011810303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154281, + "balance_loss_mlp": 1.10614467, + "epoch": 0.14909580607926126, + "flos": 475835212800.0, + "grad_norm": 0.07793461844194936, + "language_loss": 0.8938297, + "learning_rate": 0.0009632696548057527, + "loss": 0.9053725, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 0.48168945, + "step": 775, + "time_per_iteration": 2.5543088912963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158921, + "balance_loss_mlp": 1.11419404, + "epoch": 0.14928818776452482, + "flos": 611087887872.0, + "grad_norm": 0.07948352168051111, + "language_loss": 0.86982578, + "learning_rate": 0.0009631523633021704, + "loss": 0.88141501, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 0.44702148, + "step": 776, + "time_per_iteration": 2.8373982906341553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151944, + "balance_loss_mlp": 1.10726452, + "epoch": 0.14948056944978838, + "flos": 561772744704.0, + "grad_norm": 0.07613081492567164, + "language_loss": 0.90593684, + "learning_rate": 0.0009630348919868936, + "loss": 0.91745627, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 0.4465332, + "step": 777, + "time_per_iteration": 2.688340187072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164011, + "balance_loss_mlp": 1.1162796, + "epoch": 0.14967295113505194, + "flos": 449199779328.0, + "grad_norm": 0.07284380806791231, + "language_loss": 0.83743048, + "learning_rate": 0.0009629172409055293, + "loss": 0.84907055, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 0.47753906, + "step": 778, + "time_per_iteration": 2.496121406555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173184, + "balance_loss_mlp": 1.13260555, + "epoch": 0.1498653328203155, + "flos": 571285426176.0, + "grad_norm": 0.0582041699055768, + "language_loss": 0.89173234, + "learning_rate": 0.0009627994101037531, + "loss": 0.9034642, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 0.40576172, + "step": 779, + "time_per_iteration": 2.7287445068359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116917, + "balance_loss_mlp": 1.12670779, + "epoch": 0.15005771450557906, + "flos": 631215244800.0, + "grad_norm": 0.06429714570378213, + "language_loss": 0.91374522, + "learning_rate": 0.0009626813996273114, + "loss": 0.92543697, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 0.42431641, + "step": 780, + "time_per_iteration": 2.8357532024383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174237, + "balance_loss_mlp": 1.13258517, + "epoch": 0.15025009619084262, + "flos": 577939235328.0, + "grad_norm": 0.07735356487079731, + "language_loss": 0.90820873, + "learning_rate": 0.0009625632095220198, + "loss": 0.91995108, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 0.41625977, + "step": 781, + "time_per_iteration": 2.8360986709594727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165333, + "balance_loss_mlp": 1.12408686, + "epoch": 0.1504424778761062, + "flos": 483887623680.0, + "grad_norm": 0.07591811383481707, + "language_loss": 0.88784671, + "learning_rate": 0.0009624448398337637, + "loss": 0.89950007, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 0.41259766, + "step": 782, + "time_per_iteration": 2.550873041152954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138762, + "balance_loss_mlp": 1.09920812, + "epoch": 0.15063485956136977, + "flos": 762512196096.0, + "grad_norm": 0.06500535683801296, + "language_loss": 0.90907973, + "learning_rate": 0.0009623262906084984, + "loss": 0.92046738, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 0.39550781, + "step": 783, + "time_per_iteration": 3.002237319946289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127947, + "balance_loss_mlp": 1.08622408, + "epoch": 0.15082724124663333, + "flos": 497630241792.0, + "grad_norm": 0.06722303964642193, + "language_loss": 0.92323947, + "learning_rate": 0.0009622075618922486, + "loss": 0.93451893, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 0.41699219, + "step": 784, + "time_per_iteration": 2.669541120529175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117088, + "balance_loss_mlp": 1.07636571, + "epoch": 0.15101962293189689, + "flos": 509725011456.0, + "grad_norm": 0.06286377137641418, + "language_loss": 0.88948303, + "learning_rate": 0.0009620886537311091, + "loss": 0.90065384, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 0.40722656, + "step": 785, + "time_per_iteration": 2.6505391597747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132411, + "balance_loss_mlp": 1.08563375, + "epoch": 0.15121200461716044, + "flos": 457756199424.0, + "grad_norm": 0.06858268632652799, + "language_loss": 0.87318397, + "learning_rate": 0.000961969566171244, + "loss": 0.88450807, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 0.46777344, + "step": 786, + "time_per_iteration": 2.5492002964019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143812, + "balance_loss_mlp": 1.10037243, + "epoch": 0.151404386302424, + "flos": 537986271744.0, + "grad_norm": 0.06762455123923776, + "language_loss": 0.9226557, + "learning_rate": 0.0009618502992588873, + "loss": 0.93409383, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 0.43481445, + "step": 787, + "time_per_iteration": 2.6596381664276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153043, + "balance_loss_mlp": 1.10714722, + "epoch": 0.15159676798768756, + "flos": 688209467904.0, + "grad_norm": 0.07210135364095939, + "language_loss": 0.90213263, + "learning_rate": 0.0009617308530403424, + "loss": 0.91366303, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 0.45922852, + "step": 788, + "time_per_iteration": 2.9965012073516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133874, + "balance_loss_mlp": 1.09358144, + "epoch": 0.15178914967295112, + "flos": 545319558144.0, + "grad_norm": 0.0646084728999688, + "language_loss": 0.89177096, + "learning_rate": 0.0009616112275619825, + "loss": 0.90310967, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 0.40283203, + "step": 789, + "time_per_iteration": 2.702927350997925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128515, + "balance_loss_mlp": 1.08760214, + "epoch": 0.1519815313582147, + "flos": 511770783744.0, + "grad_norm": 0.04914514873585108, + "language_loss": 0.85434246, + "learning_rate": 0.0009614914228702503, + "loss": 0.86562753, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 0.40917969, + "step": 790, + "time_per_iteration": 2.734309196472168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120439, + "balance_loss_mlp": 1.08031344, + "epoch": 0.15217391304347827, + "flos": 684088187904.0, + "grad_norm": 0.0510031662309952, + "language_loss": 0.90581405, + "learning_rate": 0.0009613714390116581, + "loss": 0.91701841, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 0.40112305, + "step": 791, + "time_per_iteration": 2.9846036434173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119178, + "balance_loss_mlp": 1.07890868, + "epoch": 0.15236629472874183, + "flos": 644186981376.0, + "grad_norm": 0.06466161117660295, + "language_loss": 0.87842512, + "learning_rate": 0.0009612512760327879, + "loss": 0.88961697, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 0.40283203, + "step": 792, + "time_per_iteration": 2.879507303237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112132, + "balance_loss_mlp": 1.0749234, + "epoch": 0.1525586764140054, + "flos": 412876196352.0, + "grad_norm": 0.06761791569724282, + "language_loss": 0.86834276, + "learning_rate": 0.0009611309339802909, + "loss": 0.87955594, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 0.46435547, + "step": 793, + "time_per_iteration": 2.4628419876098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125819, + "balance_loss_mlp": 1.08180666, + "epoch": 0.15275105809926895, + "flos": 802801414656.0, + "grad_norm": 0.06955338926819006, + "language_loss": 0.85776877, + "learning_rate": 0.0009610104129008881, + "loss": 0.86902696, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 0.43994141, + "step": 794, + "time_per_iteration": 3.1157610416412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112048, + "balance_loss_mlp": 1.07751703, + "epoch": 0.1529434397845325, + "flos": 612422249472.0, + "grad_norm": 0.0812849574801687, + "language_loss": 0.89832217, + "learning_rate": 0.0009608897128413701, + "loss": 0.90952694, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 0.4296875, + "step": 795, + "time_per_iteration": 2.7580387592315674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121281, + "balance_loss_mlp": 1.08070254, + "epoch": 0.15313582146979607, + "flos": 615246243840.0, + "grad_norm": 0.07320179377966478, + "language_loss": 0.87414771, + "learning_rate": 0.0009607688338485965, + "loss": 0.88536048, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 0.40576172, + "step": 796, + "time_per_iteration": 2.8428006172180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112803, + "balance_loss_mlp": 1.08358848, + "epoch": 0.15332820315505963, + "flos": 793602593280.0, + "grad_norm": 0.08676784428227541, + "language_loss": 0.92063487, + "learning_rate": 0.0009606477759694969, + "loss": 0.93191516, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 0.4440918, + "step": 797, + "time_per_iteration": 3.0136139392852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129989, + "balance_loss_mlp": 1.08547592, + "epoch": 0.1535205848403232, + "flos": 550206950400.0, + "grad_norm": 0.07379760567815713, + "language_loss": 0.89430279, + "learning_rate": 0.0009605265392510703, + "loss": 0.90560269, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 0.44555664, + "step": 798, + "time_per_iteration": 2.604297161102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114561, + "balance_loss_mlp": 1.10169339, + "epoch": 0.15371296652558677, + "flos": 535947840000.0, + "grad_norm": 0.06797963908333281, + "language_loss": 0.93481082, + "learning_rate": 0.0009604051237403846, + "loss": 0.94626689, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 0.43896484, + "step": 799, + "time_per_iteration": 2.613255262374878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167168, + "balance_loss_mlp": 1.1217972, + "epoch": 0.15390534821085033, + "flos": 395219699712.0, + "grad_norm": 0.06891264186704958, + "language_loss": 0.88271165, + "learning_rate": 0.0009602835294845776, + "loss": 0.89438331, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 0.45361328, + "step": 800, + "time_per_iteration": 2.4739739894866943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011684, + "balance_loss_mlp": 1.12188447, + "epoch": 0.1540977298961139, + "flos": 535846523904.0, + "grad_norm": 0.06820302888180714, + "language_loss": 0.91848779, + "learning_rate": 0.0009601617565308565, + "loss": 0.93017173, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 0.46557617, + "step": 801, + "time_per_iteration": 2.599102020263672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196072, + "balance_loss_mlp": 1.14941311, + "epoch": 0.15429011158137745, + "flos": 723727664640.0, + "grad_norm": 0.08155438121007776, + "language_loss": 0.88506758, + "learning_rate": 0.0009600398049264977, + "loss": 0.89702827, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 0.46679688, + "step": 802, + "time_per_iteration": 2.9645981788635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193217, + "balance_loss_mlp": 1.14574742, + "epoch": 0.154482493266641, + "flos": 620516505600.0, + "grad_norm": 0.10468166660144326, + "language_loss": 0.93512642, + "learning_rate": 0.0009599176747188469, + "loss": 0.94705856, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 0.47485352, + "step": 803, + "time_per_iteration": 2.7997000217437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160191, + "balance_loss_mlp": 1.11856318, + "epoch": 0.15467487495190457, + "flos": 525624629760.0, + "grad_norm": 0.07174757520021151, + "language_loss": 0.84728193, + "learning_rate": 0.0009597953659553196, + "loss": 0.85888386, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 0.41625977, + "step": 804, + "time_per_iteration": 2.700530529022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133685, + "balance_loss_mlp": 1.09408379, + "epoch": 0.15486725663716813, + "flos": 527729872896.0, + "grad_norm": 0.4143347029392257, + "language_loss": 0.9033978, + "learning_rate": 0.0009596728786833997, + "loss": 0.91473466, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 0.39575195, + "step": 805, + "time_per_iteration": 2.6122889518737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150784, + "balance_loss_mlp": 1.10772574, + "epoch": 0.1550596383224317, + "flos": 1048549349376.0, + "grad_norm": 0.061887733402931855, + "language_loss": 0.91321814, + "learning_rate": 0.0009595502129506415, + "loss": 0.92472601, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 0.43066406, + "step": 806, + "time_per_iteration": 3.336061716079712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180508, + "balance_loss_mlp": 1.13694847, + "epoch": 0.15525202000769528, + "flos": 613716963840.0, + "grad_norm": 0.06807019640067784, + "language_loss": 0.84292483, + "learning_rate": 0.0009594273688046678, + "loss": 0.85472989, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 0.43579102, + "step": 807, + "time_per_iteration": 2.709182024002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210101, + "balance_loss_mlp": 1.15960383, + "epoch": 0.15544440169295884, + "flos": 533064374784.0, + "grad_norm": 0.0856522073787927, + "language_loss": 0.8780278, + "learning_rate": 0.000959304346293171, + "loss": 0.89012885, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 0.50512695, + "step": 808, + "time_per_iteration": 2.6307153701782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236008, + "balance_loss_mlp": 1.18305564, + "epoch": 0.1556367833782224, + "flos": 644723297280.0, + "grad_norm": 0.09531038088821206, + "language_loss": 0.90107393, + "learning_rate": 0.0009591811454639125, + "loss": 0.91343403, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 0.52954102, + "step": 809, + "time_per_iteration": 2.742725372314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197955, + "balance_loss_mlp": 1.15184498, + "epoch": 0.15582916506348596, + "flos": 543822211584.0, + "grad_norm": 0.06212883071305714, + "language_loss": 0.902493, + "learning_rate": 0.0009590577663647234, + "loss": 0.91447246, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 0.4609375, + "step": 810, + "time_per_iteration": 2.711411237716675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187108, + "balance_loss_mlp": 1.13837492, + "epoch": 0.15602154674874952, + "flos": 580034566656.0, + "grad_norm": 0.06321996034865444, + "language_loss": 0.88015836, + "learning_rate": 0.0009589342090435036, + "loss": 0.8920294, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 0.48779297, + "step": 811, + "time_per_iteration": 2.763784170150757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173697, + "balance_loss_mlp": 1.12610841, + "epoch": 0.15621392843401308, + "flos": 535248539136.0, + "grad_norm": 0.07315119709604147, + "language_loss": 0.89953744, + "learning_rate": 0.0009588104735482223, + "loss": 0.91127443, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 0.47631836, + "step": 812, + "time_per_iteration": 2.645106077194214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169234, + "balance_loss_mlp": 1.12019134, + "epoch": 0.15640631011927664, + "flos": 550903680000.0, + "grad_norm": 0.06895714089970095, + "language_loss": 0.86002952, + "learning_rate": 0.0009586865599269177, + "loss": 0.87172186, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 0.49047852, + "step": 813, + "time_per_iteration": 2.6313953399658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144128, + "balance_loss_mlp": 1.09851837, + "epoch": 0.1565986918045402, + "flos": 637478843904.0, + "grad_norm": 0.06467027207336487, + "language_loss": 0.90443802, + "learning_rate": 0.0009585624682276977, + "loss": 0.91587937, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 0.45605469, + "step": 814, + "time_per_iteration": 2.7377047538757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144046, + "balance_loss_mlp": 1.09705353, + "epoch": 0.15679107348980378, + "flos": 490810876416.0, + "grad_norm": 0.06824176290368998, + "language_loss": 0.89156437, + "learning_rate": 0.0009584381984987386, + "loss": 0.90300483, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 0.47021484, + "step": 815, + "time_per_iteration": 2.5524120330810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134862, + "balance_loss_mlp": 1.09225655, + "epoch": 0.15698345517506734, + "flos": 529951113216.0, + "grad_norm": 0.061358262400161866, + "language_loss": 0.92449033, + "learning_rate": 0.0009583137507882864, + "loss": 0.93583906, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 0.42626953, + "step": 816, + "time_per_iteration": 2.699207305908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134768, + "balance_loss_mlp": 1.08698916, + "epoch": 0.1571758368603309, + "flos": 546038682624.0, + "grad_norm": 0.06309616730716378, + "language_loss": 0.82620019, + "learning_rate": 0.000958189125144656, + "loss": 0.8375479, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 0.47851562, + "step": 817, + "time_per_iteration": 2.6626293659210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142354, + "balance_loss_mlp": 1.09493256, + "epoch": 0.15736821854559446, + "flos": 565649547264.0, + "grad_norm": 0.08013787804574789, + "language_loss": 0.90297949, + "learning_rate": 0.0009580643216162313, + "loss": 0.91440302, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 0.47436523, + "step": 818, + "time_per_iteration": 2.6708288192749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143086, + "balance_loss_mlp": 1.09368527, + "epoch": 0.15756060023085802, + "flos": 500956047360.0, + "grad_norm": 0.06582812199168771, + "language_loss": 0.82167578, + "learning_rate": 0.0009579393402514652, + "loss": 0.83310658, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 0.49389648, + "step": 819, + "time_per_iteration": 2.577592611312866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142904, + "balance_loss_mlp": 1.09898734, + "epoch": 0.15775298191612158, + "flos": 519264857088.0, + "grad_norm": 0.07647809261390527, + "language_loss": 0.92362559, + "learning_rate": 0.0009578141810988801, + "loss": 0.93505466, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 0.43920898, + "step": 820, + "time_per_iteration": 2.5464515686035156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152369, + "balance_loss_mlp": 1.10678363, + "epoch": 0.15794536360138514, + "flos": 466129810944.0, + "grad_norm": 0.07136182637629812, + "language_loss": 0.92042351, + "learning_rate": 0.0009576888442070668, + "loss": 0.93194717, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 0.45556641, + "step": 821, + "time_per_iteration": 2.5755786895751953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114609, + "balance_loss_mlp": 1.10288835, + "epoch": 0.1581377452866487, + "flos": 517162185216.0, + "grad_norm": 0.08295395391365894, + "language_loss": 0.94583452, + "learning_rate": 0.0009575633296246854, + "loss": 0.95729542, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 0.43212891, + "step": 822, + "time_per_iteration": 2.5701425075531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162298, + "balance_loss_mlp": 1.11821485, + "epoch": 0.15833012697191226, + "flos": 549784433664.0, + "grad_norm": 0.06548151577025092, + "language_loss": 0.85385978, + "learning_rate": 0.0009574376374004652, + "loss": 0.86548281, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 0.44116211, + "step": 823, + "time_per_iteration": 2.622905731201172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169738, + "balance_loss_mlp": 1.12019491, + "epoch": 0.15852250865717585, + "flos": 487457906688.0, + "grad_norm": 0.1009087476503521, + "language_loss": 0.82624936, + "learning_rate": 0.000957311767583204, + "loss": 0.83794677, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 0.49536133, + "step": 824, + "time_per_iteration": 2.5683999061584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196161, + "balance_loss_mlp": 1.1752758, + "epoch": 0.1587148903424394, + "flos": 1309770694656.0, + "grad_norm": 0.05150472419389455, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.83267754, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 0.20898438, + "step": 825, + "time_per_iteration": 4.722898960113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176776, + "balance_loss_mlp": 1.12170124, + "epoch": 0.15890727202770297, + "flos": 466873528320.0, + "grad_norm": 0.10062471557735768, + "language_loss": 0.94017303, + "learning_rate": 0.0009570594953650961, + "loss": 0.95194077, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 0.55029297, + "step": 826, + "time_per_iteration": 2.5394840240478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173437, + "balance_loss_mlp": 1.12091362, + "epoch": 0.15909965371296653, + "flos": 777107188224.0, + "grad_norm": 0.0719939675894647, + "language_loss": 0.8219676, + "learning_rate": 0.00095693309306219, + "loss": 0.83370197, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 0.52612305, + "step": 827, + "time_per_iteration": 3.0926811695098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178421, + "balance_loss_mlp": 1.12434745, + "epoch": 0.1592920353982301, + "flos": 1078273451520.0, + "grad_norm": 0.06038838021195225, + "language_loss": 0.90083122, + "learning_rate": 0.0009568065133621244, + "loss": 0.91261542, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 0.54077148, + "step": 828, + "time_per_iteration": 3.315122604370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164888, + "balance_loss_mlp": 1.12013662, + "epoch": 0.15948441708349365, + "flos": 725622935040.0, + "grad_norm": 0.07025990147709567, + "language_loss": 0.87178355, + "learning_rate": 0.0009566797563140422, + "loss": 0.88343245, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 0.44775391, + "step": 829, + "time_per_iteration": 2.8680243492126465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116659, + "balance_loss_mlp": 1.11912107, + "epoch": 0.1596767987687572, + "flos": 578771785728.0, + "grad_norm": 0.061296828426512996, + "language_loss": 0.89984798, + "learning_rate": 0.0009565528219671547, + "loss": 0.91151381, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 0.47460938, + "step": 830, + "time_per_iteration": 2.9325318336486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160899, + "balance_loss_mlp": 1.1076839, + "epoch": 0.15986918045402077, + "flos": 528987511296.0, + "grad_norm": 0.07652275644998038, + "language_loss": 0.86699682, + "learning_rate": 0.0009564257103707418, + "loss": 0.87860584, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 0.53198242, + "step": 831, + "time_per_iteration": 2.598191976547241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184474, + "balance_loss_mlp": 1.12973261, + "epoch": 0.16006156213928435, + "flos": 574584067584.0, + "grad_norm": 0.08337472663089728, + "language_loss": 0.92543364, + "learning_rate": 0.0009562984215741533, + "loss": 0.93727839, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 0.54736328, + "step": 832, + "time_per_iteration": 2.676666736602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160796, + "balance_loss_mlp": 1.11177731, + "epoch": 0.1602539438245479, + "flos": 515541127680.0, + "grad_norm": 0.05762908483075192, + "language_loss": 0.8408711, + "learning_rate": 0.0009561709556268065, + "loss": 0.85247904, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 0.49047852, + "step": 833, + "time_per_iteration": 2.7075538635253906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162528, + "balance_loss_mlp": 1.11141133, + "epoch": 0.16044632550981147, + "flos": 621015745536.0, + "grad_norm": 0.06044842900072245, + "language_loss": 0.96042889, + "learning_rate": 0.0009560433125781884, + "loss": 0.97205412, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 0.51171875, + "step": 834, + "time_per_iteration": 2.7619521617889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144978, + "balance_loss_mlp": 1.09130979, + "epoch": 0.16063870719507503, + "flos": 561078586368.0, + "grad_norm": 0.06441579465763399, + "language_loss": 0.94159138, + "learning_rate": 0.0009559154924778544, + "loss": 0.95304114, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 0.53686523, + "step": 835, + "time_per_iteration": 2.7467222213745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112951, + "balance_loss_mlp": 1.08218372, + "epoch": 0.1608310888803386, + "flos": 805133882880.0, + "grad_norm": 0.07312538570388089, + "language_loss": 0.86469144, + "learning_rate": 0.0009557874953754284, + "loss": 0.87598646, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 0.47314453, + "step": 836, + "time_per_iteration": 3.0907793045043945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126281, + "balance_loss_mlp": 1.07618928, + "epoch": 0.16102347056560215, + "flos": 600587011584.0, + "grad_norm": 0.08101808751207061, + "language_loss": 0.85894346, + "learning_rate": 0.0009556593213206038, + "loss": 0.87020624, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 0.5012207, + "step": 837, + "time_per_iteration": 2.7060487270355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122479, + "balance_loss_mlp": 1.07765627, + "epoch": 0.1612158522508657, + "flos": 553510361088.0, + "grad_norm": 0.060960398488271, + "language_loss": 0.89031309, + "learning_rate": 0.0009555309703631414, + "loss": 0.9015379, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 0.44848633, + "step": 838, + "time_per_iteration": 2.6838622093200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131245, + "balance_loss_mlp": 1.07853079, + "epoch": 0.16140823393612927, + "flos": 555963969024.0, + "grad_norm": 0.0637381399971671, + "language_loss": 0.88547724, + "learning_rate": 0.0009554024425528722, + "loss": 0.89678967, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 0.52685547, + "step": 839, + "time_per_iteration": 2.7301504611968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124486, + "balance_loss_mlp": 1.07978272, + "epoch": 0.16160061562139286, + "flos": 543871770624.0, + "grad_norm": 0.0692663948027758, + "language_loss": 0.90811443, + "learning_rate": 0.0009552737379396948, + "loss": 0.91935933, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 0.44726562, + "step": 840, + "time_per_iteration": 2.6181893348693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129368, + "balance_loss_mlp": 1.08208978, + "epoch": 0.16179299730665642, + "flos": 603873169920.0, + "grad_norm": 0.06449676765287365, + "language_loss": 0.89640445, + "learning_rate": 0.0009551448565735767, + "loss": 0.90769809, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 0.47265625, + "step": 841, + "time_per_iteration": 2.731602907180786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135502, + "balance_loss_mlp": 1.08555281, + "epoch": 0.16198537899191998, + "flos": 787166097408.0, + "grad_norm": 0.07291825437583387, + "language_loss": 0.86443651, + "learning_rate": 0.0009550157985045543, + "loss": 0.87579155, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 0.49926758, + "step": 842, + "time_per_iteration": 3.0523600578308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113388, + "balance_loss_mlp": 1.08724499, + "epoch": 0.16217776067718354, + "flos": 519805942272.0, + "grad_norm": 0.06222432903322319, + "language_loss": 0.90556312, + "learning_rate": 0.0009548865637827321, + "loss": 0.91690183, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 0.46630859, + "step": 843, + "time_per_iteration": 2.6370396614074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113271, + "balance_loss_mlp": 1.08757734, + "epoch": 0.1623701423624471, + "flos": 505262707200.0, + "grad_norm": 0.07459586377830821, + "language_loss": 0.91347718, + "learning_rate": 0.0009547571524582838, + "loss": 0.92480427, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 0.45141602, + "step": 844, + "time_per_iteration": 2.5717859268188477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142931, + "balance_loss_mlp": 1.09460354, + "epoch": 0.16256252404771065, + "flos": 497183132160.0, + "grad_norm": 0.08463351541898638, + "language_loss": 0.94371468, + "learning_rate": 0.0009546275645814512, + "loss": 0.95514405, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 0.48339844, + "step": 845, + "time_per_iteration": 2.632861375808716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117157, + "balance_loss_mlp": 1.12107265, + "epoch": 0.16275490573297421, + "flos": 502344737280.0, + "grad_norm": 0.08033911629378378, + "language_loss": 0.92129737, + "learning_rate": 0.0009544978002025446, + "loss": 0.93301302, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 0.50561523, + "step": 846, + "time_per_iteration": 2.7044737339019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193795, + "balance_loss_mlp": 1.14096177, + "epoch": 0.16294728741823777, + "flos": 507231756288.0, + "grad_norm": 0.052695226385161484, + "language_loss": 0.88037688, + "learning_rate": 0.0009543678593719434, + "loss": 0.89231491, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 0.52880859, + "step": 847, + "time_per_iteration": 2.798231601715088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208721, + "balance_loss_mlp": 1.15734136, + "epoch": 0.16313966910350133, + "flos": 509685364224.0, + "grad_norm": 0.056853368929671785, + "language_loss": 0.88963962, + "learning_rate": 0.0009542377421400945, + "loss": 0.90172684, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 0.51391602, + "step": 848, + "time_per_iteration": 2.7955727577209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122402, + "balance_loss_mlp": 1.16584587, + "epoch": 0.16333205078876492, + "flos": 543980427264.0, + "grad_norm": 0.06352967983147602, + "language_loss": 0.85259467, + "learning_rate": 0.0009541074485575145, + "loss": 0.86483485, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 0.58154297, + "step": 849, + "time_per_iteration": 2.703871488571167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225002, + "balance_loss_mlp": 1.17088127, + "epoch": 0.16352443247402848, + "flos": 507723655680.0, + "grad_norm": 0.07774946886845908, + "language_loss": 0.93468195, + "learning_rate": 0.0009539769786747874, + "loss": 0.94693196, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 0.54125977, + "step": 850, + "time_per_iteration": 2.6687557697296143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012154, + "balance_loss_mlp": 1.16130245, + "epoch": 0.16371681415929204, + "flos": 542124804096.0, + "grad_norm": 0.057605035940766894, + "language_loss": 0.82393861, + "learning_rate": 0.0009538463325425665, + "loss": 0.83609259, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 0.54101562, + "step": 851, + "time_per_iteration": 2.751335382461548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199288, + "balance_loss_mlp": 1.1491015, + "epoch": 0.1639091958445556, + "flos": 520752291840.0, + "grad_norm": 0.06621147850271279, + "language_loss": 0.87526274, + "learning_rate": 0.0009537155102115728, + "loss": 0.88725561, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 0.50170898, + "step": 852, + "time_per_iteration": 2.568573474884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168884, + "balance_loss_mlp": 1.12236834, + "epoch": 0.16410157752981916, + "flos": 547414889472.0, + "grad_norm": 0.07419725806034035, + "language_loss": 0.85374665, + "learning_rate": 0.0009535845117325961, + "loss": 0.86543554, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 0.46533203, + "step": 853, + "time_per_iteration": 2.628973960876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137862, + "balance_loss_mlp": 1.09511375, + "epoch": 0.16429395921508272, + "flos": 582853791744.0, + "grad_norm": 0.05551255594321189, + "language_loss": 0.94495642, + "learning_rate": 0.0009534533371564946, + "loss": 0.95633507, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 0.42724609, + "step": 854, + "time_per_iteration": 2.780510902404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133546, + "balance_loss_mlp": 1.09003448, + "epoch": 0.16448634090034628, + "flos": 530934538752.0, + "grad_norm": 0.08632067881035285, + "language_loss": 0.90547508, + "learning_rate": 0.0009533219865341949, + "loss": 0.91681051, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 0.43530273, + "step": 855, + "time_per_iteration": 2.583874464035034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116377, + "balance_loss_mlp": 1.07188785, + "epoch": 0.16467872258560984, + "flos": 491890475520.0, + "grad_norm": 0.06082853882497287, + "language_loss": 0.88071746, + "learning_rate": 0.0009531904599166916, + "loss": 0.89188123, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 0.44482422, + "step": 856, + "time_per_iteration": 2.626354217529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107396, + "balance_loss_mlp": 1.06231081, + "epoch": 0.16487110427087343, + "flos": 506263385088.0, + "grad_norm": 0.0709999882269981, + "language_loss": 0.86807954, + "learning_rate": 0.0009530587573550478, + "loss": 0.87915355, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 0.45068359, + "step": 857, + "time_per_iteration": 2.5761454105377197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142125, + "balance_loss_mlp": 1.11237001, + "epoch": 0.16506348595613698, + "flos": 1432824712704.0, + "grad_norm": 0.04095057850479287, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75461513, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 0.296875, + "step": 858, + "time_per_iteration": 5.055138349533081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101038, + "balance_loss_mlp": 1.06165087, + "epoch": 0.16525586764140054, + "flos": 477129927168.0, + "grad_norm": 0.08838989258306214, + "language_loss": 0.91845137, + "learning_rate": 0.0009527948246039337, + "loss": 0.92946172, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 0.39379883, + "step": 859, + "time_per_iteration": 2.582608461380005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111338, + "balance_loss_mlp": 1.0715934, + "epoch": 0.1654482493266641, + "flos": 881096942592.0, + "grad_norm": 0.06489567580347368, + "language_loss": 0.89263308, + "learning_rate": 0.000952662594516931, + "loss": 0.90374649, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 0.39746094, + "step": 860, + "time_per_iteration": 3.067707061767578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110521, + "balance_loss_mlp": 1.07018054, + "epoch": 0.16564063101192766, + "flos": 626841773568.0, + "grad_norm": 0.055059247831062384, + "language_loss": 0.88479781, + "learning_rate": 0.0009525301886907234, + "loss": 0.89590299, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 0.40307617, + "step": 861, + "time_per_iteration": 2.8873865604400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112184, + "balance_loss_mlp": 1.07758975, + "epoch": 0.16583301269719122, + "flos": 561518355456.0, + "grad_norm": 0.06995538812096423, + "language_loss": 0.89499515, + "learning_rate": 0.0009523976071767155, + "loss": 0.90621358, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 0.44262695, + "step": 862, + "time_per_iteration": 2.6588613986968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124987, + "balance_loss_mlp": 1.08183372, + "epoch": 0.16602539438245478, + "flos": 567803976192.0, + "grad_norm": 0.06313062043432274, + "language_loss": 0.89038265, + "learning_rate": 0.00095226485002638, + "loss": 0.90163255, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 0.43115234, + "step": 863, + "time_per_iteration": 2.797896146774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113265, + "balance_loss_mlp": 1.07232881, + "epoch": 0.16621777606771834, + "flos": 574875532800.0, + "grad_norm": 0.054774526957085325, + "language_loss": 0.90381318, + "learning_rate": 0.0009521319172912576, + "loss": 0.91494584, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 0.40917969, + "step": 864, + "time_per_iteration": 2.7238612174987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126771, + "balance_loss_mlp": 1.08132839, + "epoch": 0.16641015775298193, + "flos": 514552932864.0, + "grad_norm": 0.05854649520245602, + "language_loss": 0.96491337, + "learning_rate": 0.0009519988090229579, + "loss": 0.97618109, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 0.4543457, + "step": 865, + "time_per_iteration": 2.683509111404419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123132, + "balance_loss_mlp": 1.07907248, + "epoch": 0.1666025394382455, + "flos": 621685310976.0, + "grad_norm": 0.05699467986566688, + "language_loss": 0.89545953, + "learning_rate": 0.0009518655252731576, + "loss": 0.90669084, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 0.44067383, + "step": 866, + "time_per_iteration": 2.729865550994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131126, + "balance_loss_mlp": 1.08456326, + "epoch": 0.16679492112350905, + "flos": 548808348672.0, + "grad_norm": 0.06482393342324422, + "language_loss": 0.9171015, + "learning_rate": 0.0009517320660936022, + "loss": 0.9284128, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 0.46557617, + "step": 867, + "time_per_iteration": 2.732815742492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133142, + "balance_loss_mlp": 1.08843839, + "epoch": 0.1669873028087726, + "flos": 665675864064.0, + "grad_norm": 0.06614373571764609, + "language_loss": 0.84472704, + "learning_rate": 0.0009515984315361051, + "loss": 0.85605848, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 0.44702148, + "step": 868, + "time_per_iteration": 2.796868085861206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121806, + "balance_loss_mlp": 1.07657838, + "epoch": 0.16717968449403617, + "flos": 538564432896.0, + "grad_norm": 0.08270078218547869, + "language_loss": 0.88773656, + "learning_rate": 0.000951464621652548, + "loss": 0.89895463, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 0.45239258, + "step": 869, + "time_per_iteration": 2.666438341140747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141117, + "balance_loss_mlp": 1.09751046, + "epoch": 0.16737206617929973, + "flos": 530121438720.0, + "grad_norm": 0.06072661062765564, + "language_loss": 0.80103016, + "learning_rate": 0.0009513306364948804, + "loss": 0.81244129, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 0.43579102, + "step": 870, + "time_per_iteration": 2.799009084701538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148868, + "balance_loss_mlp": 1.10373545, + "epoch": 0.1675644478645633, + "flos": 480774362112.0, + "grad_norm": 0.09261319168225486, + "language_loss": 0.90277344, + "learning_rate": 0.0009511964761151197, + "loss": 0.91426206, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 0.45117188, + "step": 871, + "time_per_iteration": 2.5934712886810303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158743, + "balance_loss_mlp": 1.1145407, + "epoch": 0.16775682954982685, + "flos": 494556627456.0, + "grad_norm": 0.06739805293344515, + "language_loss": 0.91524243, + "learning_rate": 0.0009510621405653521, + "loss": 0.92682987, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 0.44213867, + "step": 872, + "time_per_iteration": 2.5557620525360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156856, + "balance_loss_mlp": 1.11627746, + "epoch": 0.1679492112350904, + "flos": 752035912704.0, + "grad_norm": 0.06267535529199315, + "language_loss": 0.85553813, + "learning_rate": 0.0009509276298977309, + "loss": 0.86710668, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 0.40576172, + "step": 873, + "time_per_iteration": 2.9965007305145264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187981, + "balance_loss_mlp": 1.13760364, + "epoch": 0.168141592920354, + "flos": 1135875571200.0, + "grad_norm": 0.07409010972210926, + "language_loss": 0.82916558, + "learning_rate": 0.0009507929441644778, + "loss": 0.84104538, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 0.50415039, + "step": 874, + "time_per_iteration": 3.5573699474334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118449, + "balance_loss_mlp": 1.14097893, + "epoch": 0.16833397460561755, + "flos": 632401302528.0, + "grad_norm": 0.07388150752212762, + "language_loss": 0.8737148, + "learning_rate": 0.0009506580834178826, + "loss": 0.88555974, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 0.43530273, + "step": 875, + "time_per_iteration": 2.7659120559692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215839, + "balance_loss_mlp": 1.16841793, + "epoch": 0.1685263562908811, + "flos": 541445326848.0, + "grad_norm": 0.06935842584614806, + "language_loss": 0.92793226, + "learning_rate": 0.0009505230477103028, + "loss": 0.94009066, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 0.47436523, + "step": 876, + "time_per_iteration": 2.7306137084960938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226641, + "balance_loss_mlp": 1.18224776, + "epoch": 0.16871873797614467, + "flos": 619325678592.0, + "grad_norm": 0.10053146783154573, + "language_loss": 0.82997662, + "learning_rate": 0.0009503878370941641, + "loss": 0.84224302, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 0.44433594, + "step": 877, + "time_per_iteration": 2.7356183528900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211684, + "balance_loss_mlp": 1.16793382, + "epoch": 0.16891111966140823, + "flos": 606344030208.0, + "grad_norm": 0.10508781605450683, + "language_loss": 0.9020679, + "learning_rate": 0.0009502524516219595, + "loss": 0.91418481, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 0.43798828, + "step": 878, + "time_per_iteration": 2.7525370121002197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185028, + "balance_loss_mlp": 1.14232683, + "epoch": 0.1691035013466718, + "flos": 552326874624.0, + "grad_norm": 0.07887273759437702, + "language_loss": 0.91364408, + "learning_rate": 0.0009501168913462506, + "loss": 0.92549431, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 0.42724609, + "step": 879, + "time_per_iteration": 2.7009639739990234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115086, + "balance_loss_mlp": 1.11919844, + "epoch": 0.16929588303193535, + "flos": 1476294377472.0, + "grad_norm": 0.04902821320434346, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.80272782, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 0.31640625, + "step": 880, + "time_per_iteration": 4.812703609466553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116281, + "balance_loss_mlp": 1.11748707, + "epoch": 0.1694882647171989, + "flos": 926248587264.0, + "grad_norm": 0.06555145426806878, + "language_loss": 0.86756283, + "learning_rate": 0.0009498452465949042, + "loss": 0.87919092, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 0.453125, + "step": 881, + "time_per_iteration": 3.230407476425171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159747, + "balance_loss_mlp": 1.1133033, + "epoch": 0.1696806464024625, + "flos": 546093010944.0, + "grad_norm": 0.0753185527775994, + "language_loss": 0.92756218, + "learning_rate": 0.0009497091622247285, + "loss": 0.93915963, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 0.46459961, + "step": 882, + "time_per_iteration": 2.7412030696868896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141259, + "balance_loss_mlp": 1.09734213, + "epoch": 0.16987302808772606, + "flos": 529234560000.0, + "grad_norm": 0.07197762243887564, + "language_loss": 0.94941783, + "learning_rate": 0.0009495729032619723, + "loss": 0.96083045, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 0.43945312, + "step": 883, + "time_per_iteration": 2.6705245971679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141279, + "balance_loss_mlp": 1.09724283, + "epoch": 0.17006540977298962, + "flos": 755178909696.0, + "grad_norm": 0.07033792867334165, + "language_loss": 0.85310471, + "learning_rate": 0.0009494364697595354, + "loss": 0.86451751, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 0.44018555, + "step": 884, + "time_per_iteration": 2.9024457931518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115677, + "balance_loss_mlp": 1.10977769, + "epoch": 0.17025779145825318, + "flos": 558800446464.0, + "grad_norm": 0.0673266035955572, + "language_loss": 0.90739167, + "learning_rate": 0.0009492998617703867, + "loss": 0.91895938, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 0.47045898, + "step": 885, + "time_per_iteration": 2.6497459411621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151127, + "balance_loss_mlp": 1.10813999, + "epoch": 0.17045017314351674, + "flos": 512213124096.0, + "grad_norm": 0.0863252086663651, + "language_loss": 0.89101255, + "learning_rate": 0.0009491630793475619, + "loss": 0.90252388, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 0.42993164, + "step": 886, + "time_per_iteration": 2.6258063316345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159756, + "balance_loss_mlp": 1.11231089, + "epoch": 0.1706425548287803, + "flos": 508941646848.0, + "grad_norm": 0.0686214928272948, + "language_loss": 0.85993534, + "learning_rate": 0.0009490261225441643, + "loss": 0.87153292, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 0.47412109, + "step": 887, + "time_per_iteration": 2.9036519527435303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168173, + "balance_loss_mlp": 1.12370825, + "epoch": 0.17083493651404386, + "flos": 717355408896.0, + "grad_norm": 0.07914830411429463, + "language_loss": 0.91452426, + "learning_rate": 0.0009488889914133656, + "loss": 0.92620599, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 0.44458008, + "step": 888, + "time_per_iteration": 3.0038132667541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155268, + "balance_loss_mlp": 1.10706019, + "epoch": 0.17102731819930742, + "flos": 559121647104.0, + "grad_norm": 0.07300075385020723, + "language_loss": 0.90558064, + "learning_rate": 0.0009487516860084047, + "loss": 0.91713333, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 0.48193359, + "step": 889, + "time_per_iteration": 2.7158679962158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147984, + "balance_loss_mlp": 1.0996089, + "epoch": 0.17121969988457098, + "flos": 494786423808.0, + "grad_norm": 0.09172908653222724, + "language_loss": 0.90068781, + "learning_rate": 0.0009486142063825884, + "loss": 0.91216767, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 0.48364258, + "step": 890, + "time_per_iteration": 2.5330443382263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084927, + "balance_loss_mlp": 1.06175303, + "epoch": 0.17141208156983456, + "flos": 1548889413120.0, + "grad_norm": 0.031797672969882694, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.73511147, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 0.23144531, + "step": 891, + "time_per_iteration": 4.953175783157349 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167611, + "balance_loss_mlp": 1.11835372, + "epoch": 0.17160446325509812, + "flos": 619565386752.0, + "grad_norm": 0.06989736404119995, + "language_loss": 0.91231126, + "learning_rate": 0.0009483387246819542, + "loss": 0.92398739, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 0.49243164, + "step": 892, + "time_per_iteration": 2.7500009536743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010653, + "balance_loss_mlp": 1.0426023, + "epoch": 0.17179684494036168, + "flos": 1381758206976.0, + "grad_norm": 0.022698270048783192, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.83350885, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 0.2265625, + "step": 893, + "time_per_iteration": 4.662828683853149 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166562, + "balance_loss_mlp": 1.12312233, + "epoch": 0.17198922662562524, + "flos": 492636764160.0, + "grad_norm": 0.06047387129149895, + "language_loss": 0.90360647, + "learning_rate": 0.0009480625467392688, + "loss": 0.91527206, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 0.43481445, + "step": 894, + "time_per_iteration": 2.615447521209717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046079, + "balance_loss_mlp": 1.02433491, + "epoch": 0.1721816083108888, + "flos": 1458318878208.0, + "grad_norm": 0.017910617622931155, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79040754, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 0.21777344, + "step": 895, + "time_per_iteration": 4.802469968795776 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196327, + "balance_loss_mlp": 1.15264833, + "epoch": 0.17237398999615236, + "flos": 528122654208.0, + "grad_norm": 0.0591778940977726, + "language_loss": 0.88960874, + "learning_rate": 0.0009477856729834196, + "loss": 0.90157199, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 0.43652344, + "step": 896, + "time_per_iteration": 2.743036985397339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214543, + "balance_loss_mlp": 1.17217648, + "epoch": 0.17256637168141592, + "flos": 603920157696.0, + "grad_norm": 0.09709817551063968, + "language_loss": 0.91585428, + "learning_rate": 0.0009476469753098809, + "loss": 0.92799973, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 0.42358398, + "step": 897, + "time_per_iteration": 2.688457489013672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206917, + "balance_loss_mlp": 1.16080689, + "epoch": 0.17275875336667948, + "flos": 509687935488.0, + "grad_norm": 0.08785360527314089, + "language_loss": 0.87616539, + "learning_rate": 0.0009475081038443738, + "loss": 0.88823456, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 0.46118164, + "step": 898, + "time_per_iteration": 2.5958664417266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178905, + "balance_loss_mlp": 1.13436794, + "epoch": 0.17295113505194307, + "flos": 665260687872.0, + "grad_norm": 0.08099470404026293, + "language_loss": 0.87109447, + "learning_rate": 0.0009473690586408124, + "loss": 0.88288355, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 0.44482422, + "step": 899, + "time_per_iteration": 2.885279417037964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176953, + "balance_loss_mlp": 1.13184392, + "epoch": 0.17314351673720663, + "flos": 555385807872.0, + "grad_norm": 0.060075693842180825, + "language_loss": 0.87349975, + "learning_rate": 0.0009472298397531792, + "loss": 0.88526928, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 0.45141602, + "step": 900, + "time_per_iteration": 2.6987335681915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117213, + "balance_loss_mlp": 1.12244344, + "epoch": 0.17333589842247019, + "flos": 503609716224.0, + "grad_norm": 0.06597136758704356, + "language_loss": 0.87749296, + "learning_rate": 0.0009470904472355235, + "loss": 0.88921428, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 0.49707031, + "step": 901, + "time_per_iteration": 2.6920526027679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133522, + "balance_loss_mlp": 1.08898544, + "epoch": 0.17352828010773375, + "flos": 556208446464.0, + "grad_norm": 0.06929151708835651, + "language_loss": 0.8084361, + "learning_rate": 0.0009469508811419626, + "loss": 0.81977129, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 0.44555664, + "step": 902, + "time_per_iteration": 2.7087764739990234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037611, + "balance_loss_mlp": 1.01825094, + "epoch": 0.1737206617929973, + "flos": 1554525292032.0, + "grad_norm": 0.018918236495105482, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.7265144, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 0.19335938, + "step": 903, + "time_per_iteration": 4.831868648529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130376, + "balance_loss_mlp": 1.08429003, + "epoch": 0.17391304347826086, + "flos": 516662945280.0, + "grad_norm": 0.06904883588321564, + "language_loss": 0.84871197, + "learning_rate": 0.0009466712284439292, + "loss": 0.86001575, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 0.46118164, + "step": 904, + "time_per_iteration": 2.727154493331909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135856, + "balance_loss_mlp": 1.08867335, + "epoch": 0.17410542516352442, + "flos": 541049974272.0, + "grad_norm": 0.0797697294198037, + "language_loss": 0.90077758, + "learning_rate": 0.0009465311419480276, + "loss": 0.9121362, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 0.47216797, + "step": 905, + "time_per_iteration": 2.659696340560913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130539, + "balance_loss_mlp": 1.0859549, + "epoch": 0.17429780684878798, + "flos": 623849651712.0, + "grad_norm": 0.0780460064240459, + "language_loss": 0.89685637, + "learning_rate": 0.0009463908820933622, + "loss": 0.90816176, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 0.44604492, + "step": 906, + "time_per_iteration": 2.845508337020874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153159, + "balance_loss_mlp": 1.10657179, + "epoch": 0.17449018853405157, + "flos": 575663666688.0, + "grad_norm": 0.06621529993663824, + "language_loss": 0.83420271, + "learning_rate": 0.0009462504489343868, + "loss": 0.84573436, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 0.46582031, + "step": 907, + "time_per_iteration": 2.7415342330932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152101, + "balance_loss_mlp": 1.10246193, + "epoch": 0.17468257021931513, + "flos": 533753763840.0, + "grad_norm": 0.0823987818854668, + "language_loss": 0.9018122, + "learning_rate": 0.0009461098425256222, + "loss": 0.91333324, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 0.49633789, + "step": 908, + "time_per_iteration": 2.5904529094696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160396, + "balance_loss_mlp": 1.11457169, + "epoch": 0.1748749519045787, + "flos": 540758509056.0, + "grad_norm": 0.0762262609163865, + "language_loss": 0.87090451, + "learning_rate": 0.0009459690629216567, + "loss": 0.88250846, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 0.45874023, + "step": 909, + "time_per_iteration": 2.61710524559021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155202, + "balance_loss_mlp": 1.10921121, + "epoch": 0.17506733358984225, + "flos": 498623579136.0, + "grad_norm": 0.06657664395828655, + "language_loss": 0.88943893, + "learning_rate": 0.0009458281101771457, + "loss": 0.90099096, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 0.46020508, + "step": 910, + "time_per_iteration": 2.6421282291412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176316, + "balance_loss_mlp": 1.12810779, + "epoch": 0.1752597152751058, + "flos": 622923125760.0, + "grad_norm": 0.08799417436837091, + "language_loss": 0.8354404, + "learning_rate": 0.0009456869843468122, + "loss": 0.84720349, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 0.48217773, + "step": 911, + "time_per_iteration": 2.8633837699890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178078, + "balance_loss_mlp": 1.12688971, + "epoch": 0.17545209696036937, + "flos": 520972176384.0, + "grad_norm": 0.08410877580390771, + "language_loss": 0.79552639, + "learning_rate": 0.0009455456854854459, + "loss": 0.80730712, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 0.51220703, + "step": 912, + "time_per_iteration": 2.661038875579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180916, + "balance_loss_mlp": 1.13564038, + "epoch": 0.17564447864563293, + "flos": 461988707328.0, + "grad_norm": 0.17307911593328887, + "language_loss": 0.85480136, + "learning_rate": 0.0009454042136479039, + "loss": 0.86661053, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 0.45263672, + "step": 913, + "time_per_iteration": 2.561790943145752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198041, + "balance_loss_mlp": 1.15183568, + "epoch": 0.1758368603308965, + "flos": 480655793664.0, + "grad_norm": 0.06959724621682493, + "language_loss": 0.8438077, + "learning_rate": 0.0009452625688891103, + "loss": 0.85578811, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 0.4621582, + "step": 914, + "time_per_iteration": 2.5396227836608887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092507, + "balance_loss_mlp": 1.07600832, + "epoch": 0.17602924201616005, + "flos": 1478942903808.0, + "grad_norm": 0.034614734916794516, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79827243, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 0.16503906, + "step": 915, + "time_per_iteration": 4.550157308578491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264875, + "balance_loss_mlp": 1.21347213, + "epoch": 0.17622162370142364, + "flos": 602301671424.0, + "grad_norm": 0.08235911171958209, + "language_loss": 0.94223297, + "learning_rate": 0.0009449787608278015, + "loss": 0.95488179, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 0.51489258, + "step": 916, + "time_per_iteration": 2.8292665481567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243722, + "balance_loss_mlp": 1.19525158, + "epoch": 0.1764140053866872, + "flos": 442699043328.0, + "grad_norm": 0.08361954447634375, + "language_loss": 0.9338274, + "learning_rate": 0.0009448365976354704, + "loss": 0.94626462, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 0.48461914, + "step": 917, + "time_per_iteration": 2.543883800506592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216482, + "balance_loss_mlp": 1.16622329, + "epoch": 0.17660638707195075, + "flos": 500607682560.0, + "grad_norm": 0.08482517786251102, + "language_loss": 0.91736883, + "learning_rate": 0.0009446942617422558, + "loss": 0.9295336, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 0.50317383, + "step": 918, + "time_per_iteration": 2.6130669116973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118235, + "balance_loss_mlp": 1.13740778, + "epoch": 0.17679876875721431, + "flos": 538892974080.0, + "grad_norm": 0.07957198864097685, + "language_loss": 0.8648746, + "learning_rate": 0.0009445517532034176, + "loss": 0.87669808, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 0.44970703, + "step": 919, + "time_per_iteration": 2.7341010570526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116033, + "balance_loss_mlp": 1.11002386, + "epoch": 0.17699115044247787, + "flos": 497724217344.0, + "grad_norm": 0.08371374964142012, + "language_loss": 0.9020586, + "learning_rate": 0.0009444090720742824, + "loss": 0.9136619, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 0.50341797, + "step": 920, + "time_per_iteration": 2.628169298171997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158892, + "balance_loss_mlp": 1.1083951, + "epoch": 0.17718353212774143, + "flos": 662738070528.0, + "grad_norm": 0.07483188289837522, + "language_loss": 0.89025688, + "learning_rate": 0.0009442662184102439, + "loss": 0.90184581, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 0.50512695, + "step": 921, + "time_per_iteration": 2.7538435459136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154828, + "balance_loss_mlp": 1.11210358, + "epoch": 0.177375913813005, + "flos": 582641247744.0, + "grad_norm": 0.05276545299780942, + "language_loss": 0.88537991, + "learning_rate": 0.000944123192266763, + "loss": 0.89692819, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 0.42724609, + "step": 922, + "time_per_iteration": 2.788759469985962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190709, + "balance_loss_mlp": 1.13887644, + "epoch": 0.17756829549826855, + "flos": 552564011520.0, + "grad_norm": 0.07681776188261369, + "language_loss": 0.84657156, + "learning_rate": 0.0009439799936993671, + "loss": 0.85847867, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 0.51904297, + "step": 923, + "time_per_iteration": 2.7123734951019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196866, + "balance_loss_mlp": 1.14787149, + "epoch": 0.17776067718353214, + "flos": 556322245632.0, + "grad_norm": 0.09732559260361714, + "language_loss": 0.89131558, + "learning_rate": 0.0009438366227636511, + "loss": 0.90328419, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 0.49047852, + "step": 924, + "time_per_iteration": 2.6907341480255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171193, + "balance_loss_mlp": 1.12396216, + "epoch": 0.1779530588687957, + "flos": 658458574848.0, + "grad_norm": 0.07379366042998667, + "language_loss": 0.86971134, + "learning_rate": 0.0009436930795152763, + "loss": 0.88142323, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 0.47241211, + "step": 925, + "time_per_iteration": 2.865673065185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168215, + "balance_loss_mlp": 1.12174773, + "epoch": 0.17814544055405926, + "flos": 644483589120.0, + "grad_norm": 0.07469970420174622, + "language_loss": 0.8767308, + "learning_rate": 0.0009435493640099713, + "loss": 0.88841295, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 0.46411133, + "step": 926, + "time_per_iteration": 2.779188394546509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154616, + "balance_loss_mlp": 1.10388088, + "epoch": 0.17833782223932282, + "flos": 460913877504.0, + "grad_norm": 0.06972760602295516, + "language_loss": 0.85458124, + "learning_rate": 0.0009434054763035314, + "loss": 0.86612737, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 0.50756836, + "step": 927, + "time_per_iteration": 2.5972957611083984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147995, + "balance_loss_mlp": 1.09983397, + "epoch": 0.17853020392458638, + "flos": 759539897856.0, + "grad_norm": 0.05666425765353489, + "language_loss": 0.86302543, + "learning_rate": 0.0009432614164518185, + "loss": 0.8745054, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 0.48168945, + "step": 928, + "time_per_iteration": 3.0064406394958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150828, + "balance_loss_mlp": 1.09780383, + "epoch": 0.17872258560984994, + "flos": 782666717184.0, + "grad_norm": 0.07484249942420804, + "language_loss": 0.85464913, + "learning_rate": 0.000943117184510762, + "loss": 0.86615741, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 0.53027344, + "step": 929, + "time_per_iteration": 2.9855945110321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124448, + "balance_loss_mlp": 1.10556555, + "epoch": 0.1789149672951135, + "flos": 1459880464896.0, + "grad_norm": 0.03465095249088487, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.79914415, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 0.18847656, + "step": 930, + "time_per_iteration": 5.016055583953857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148447, + "balance_loss_mlp": 1.09997642, + "epoch": 0.17910734898037706, + "flos": 503864105472.0, + "grad_norm": 0.07304481613225793, + "language_loss": 0.89790976, + "learning_rate": 0.0009428282045846674, + "loss": 0.90939426, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 0.48461914, + "step": 931, + "time_per_iteration": 2.787473678588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134009, + "balance_loss_mlp": 1.08797026, + "epoch": 0.17929973066564064, + "flos": 746249158656.0, + "grad_norm": 0.05043968313129053, + "language_loss": 0.90432143, + "learning_rate": 0.0009426834567118214, + "loss": 0.91566151, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 0.46044922, + "step": 932, + "time_per_iteration": 3.1106340885162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149699, + "balance_loss_mlp": 1.10091829, + "epoch": 0.1794921123509042, + "flos": 713214305280.0, + "grad_norm": 0.0884624873286247, + "language_loss": 0.81563932, + "learning_rate": 0.0009425385369740155, + "loss": 0.82713628, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 0.48779297, + "step": 933, + "time_per_iteration": 3.056328296661377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164469, + "balance_loss_mlp": 1.1138767, + "epoch": 0.17968449403616776, + "flos": 633142448640.0, + "grad_norm": 0.0672899912264689, + "language_loss": 0.88411558, + "learning_rate": 0.0009423934454275125, + "loss": 0.8957603, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 0.50561523, + "step": 934, + "time_per_iteration": 2.827507495880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162381, + "balance_loss_mlp": 1.11333871, + "epoch": 0.17987687572143132, + "flos": 536323368960.0, + "grad_norm": 0.07880287247644589, + "language_loss": 0.92845738, + "learning_rate": 0.0009422481821286418, + "loss": 0.94008112, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 0.49072266, + "step": 935, + "time_per_iteration": 2.7188265323638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164456, + "balance_loss_mlp": 1.11918044, + "epoch": 0.18006925740669488, + "flos": 538077676032.0, + "grad_norm": 0.07978340192275198, + "language_loss": 0.88968349, + "learning_rate": 0.0009421027471337998, + "loss": 0.90132797, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 0.45239258, + "step": 936, + "time_per_iteration": 2.6140947341918945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176728, + "balance_loss_mlp": 1.1271131, + "epoch": 0.18026163909195844, + "flos": 539510782464.0, + "grad_norm": 0.07049523693926517, + "language_loss": 0.83782339, + "learning_rate": 0.0009419571404994493, + "loss": 0.84959066, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 0.49584961, + "step": 937, + "time_per_iteration": 2.641847610473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162302, + "balance_loss_mlp": 1.11354589, + "epoch": 0.180454020777222, + "flos": 500642187264.0, + "grad_norm": 0.06745021535989586, + "language_loss": 0.91665328, + "learning_rate": 0.00094181136228212, + "loss": 0.92827624, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 0.48803711, + "step": 938, + "time_per_iteration": 2.622314453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146811, + "balance_loss_mlp": 1.10334706, + "epoch": 0.18064640246248556, + "flos": 498952120320.0, + "grad_norm": 0.06209482952821168, + "language_loss": 0.87085009, + "learning_rate": 0.0009416654125384077, + "loss": 0.88231826, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 0.43432617, + "step": 939, + "time_per_iteration": 2.735565423965454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167753, + "balance_loss_mlp": 1.15230346, + "epoch": 0.18083878414774912, + "flos": 1519313988096.0, + "grad_norm": 0.039552666267989665, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.80940127, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 0.15429688, + "step": 940, + "time_per_iteration": 4.9464662075042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147706, + "balance_loss_mlp": 1.10293126, + "epoch": 0.1810311658330127, + "flos": 727337594880.0, + "grad_norm": 0.06405620484007693, + "language_loss": 0.85002685, + "learning_rate": 0.000941372998698552, + "loss": 0.86150396, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 0.44750977, + "step": 941, + "time_per_iteration": 2.9421255588531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152914, + "balance_loss_mlp": 1.10344219, + "epoch": 0.18122354751827627, + "flos": 564923082240.0, + "grad_norm": 0.07883971857950696, + "language_loss": 0.82437575, + "learning_rate": 0.0009412265347159336, + "loss": 0.8359049, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 0.49487305, + "step": 942, + "time_per_iteration": 2.727071762084961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135445, + "balance_loss_mlp": 1.09083664, + "epoch": 0.18141592920353983, + "flos": 519282109440.0, + "grad_norm": 0.10057326993772005, + "language_loss": 0.85614288, + "learning_rate": 0.0009410798994339829, + "loss": 0.86749732, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 0.44604492, + "step": 943, + "time_per_iteration": 2.6305696964263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134993, + "balance_loss_mlp": 1.09248304, + "epoch": 0.1816083108888034, + "flos": 512470084608.0, + "grad_norm": 0.05478952043416941, + "language_loss": 0.88907182, + "learning_rate": 0.000940933092909628, + "loss": 0.90042174, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 0.42529297, + "step": 944, + "time_per_iteration": 2.631101369857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149384, + "balance_loss_mlp": 1.10530019, + "epoch": 0.18180069257406695, + "flos": 492389715456.0, + "grad_norm": 0.06051663433249254, + "language_loss": 0.84961444, + "learning_rate": 0.0009407861151998649, + "loss": 0.8611083, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 0.44067383, + "step": 945, + "time_per_iteration": 2.5717978477478027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116837, + "balance_loss_mlp": 1.12040067, + "epoch": 0.1819930742593305, + "flos": 570158839296.0, + "grad_norm": 0.06666982795430461, + "language_loss": 0.87044382, + "learning_rate": 0.0009406389663617552, + "loss": 0.88212758, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 0.47998047, + "step": 946, + "time_per_iteration": 2.6768407821655273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170796, + "balance_loss_mlp": 1.12757087, + "epoch": 0.18218545594459407, + "flos": 605975841792.0, + "grad_norm": 0.0759743739596538, + "language_loss": 0.87192827, + "learning_rate": 0.000940491646452427, + "loss": 0.88363624, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 0.43212891, + "step": 947, + "time_per_iteration": 2.7174758911132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174804, + "balance_loss_mlp": 1.1271199, + "epoch": 0.18237783762985763, + "flos": 548682439680.0, + "grad_norm": 0.06285362616764655, + "language_loss": 0.91503757, + "learning_rate": 0.000940344155529075, + "loss": 0.92678559, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 0.47680664, + "step": 948, + "time_per_iteration": 2.6130924224853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175811, + "balance_loss_mlp": 1.12643504, + "epoch": 0.1825702193151212, + "flos": 450741542400.0, + "grad_norm": 0.07182633578445446, + "language_loss": 0.88395435, + "learning_rate": 0.0009401964936489605, + "loss": 0.89571244, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 0.4934082, + "step": 949, + "time_per_iteration": 2.518735885620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154077, + "balance_loss_mlp": 1.11173368, + "epoch": 0.18276260100038477, + "flos": 589245871104.0, + "grad_norm": 0.08616214546245322, + "language_loss": 0.86381257, + "learning_rate": 0.0009400486608694108, + "loss": 0.87535334, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 0.42358398, + "step": 950, + "time_per_iteration": 2.7356269359588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147825, + "balance_loss_mlp": 1.10071373, + "epoch": 0.18295498268564833, + "flos": 787331653632.0, + "grad_norm": 0.05684050086710682, + "language_loss": 0.88146299, + "learning_rate": 0.0009399006572478195, + "loss": 0.89294124, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 0.47119141, + "step": 951, + "time_per_iteration": 3.0829784870147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113898, + "balance_loss_mlp": 1.09449124, + "epoch": 0.1831473643709119, + "flos": 578147010048.0, + "grad_norm": 0.06809630737889293, + "language_loss": 0.91594249, + "learning_rate": 0.0009397524828416468, + "loss": 0.92733228, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 0.44482422, + "step": 952, + "time_per_iteration": 2.710500478744507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141316, + "balance_loss_mlp": 1.09339356, + "epoch": 0.18333974605617545, + "flos": 566889933312.0, + "grad_norm": 0.06814185159234107, + "language_loss": 0.97457635, + "learning_rate": 0.0009396041377084192, + "loss": 0.98598951, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 0.47949219, + "step": 953, + "time_per_iteration": 2.6530585289001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011339, + "balance_loss_mlp": 1.08716977, + "epoch": 0.183532127741439, + "flos": 526993496064.0, + "grad_norm": 0.06688505748067412, + "language_loss": 0.88496006, + "learning_rate": 0.0009394556219057295, + "loss": 0.896299, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 0.46704102, + "step": 954, + "time_per_iteration": 2.662543773651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135904, + "balance_loss_mlp": 1.08948374, + "epoch": 0.18372450942670257, + "flos": 594535956480.0, + "grad_norm": 0.08148035498798997, + "language_loss": 0.84775722, + "learning_rate": 0.0009393069354912362, + "loss": 0.85911626, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 0.46386719, + "step": 955, + "time_per_iteration": 2.7262632846832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139168, + "balance_loss_mlp": 1.0954181, + "epoch": 0.18391689111196613, + "flos": 645032014848.0, + "grad_norm": 0.07343823471440349, + "language_loss": 0.83466816, + "learning_rate": 0.0009391580785226649, + "loss": 0.8460598, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 0.43774414, + "step": 956, + "time_per_iteration": 2.8661141395568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066727, + "balance_loss_mlp": 1.04708123, + "epoch": 0.18410927279722972, + "flos": 1457073349632.0, + "grad_norm": 0.029557521366383285, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.80407178, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 0.19628906, + "step": 957, + "time_per_iteration": 4.751030921936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134727, + "balance_loss_mlp": 1.08978534, + "epoch": 0.18430165448249328, + "flos": 658750040064.0, + "grad_norm": 0.06490118531587029, + "language_loss": 0.87677503, + "learning_rate": 0.0009388598531545196, + "loss": 0.88812232, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 0.44946289, + "step": 958, + "time_per_iteration": 2.8378970623016357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143042, + "balance_loss_mlp": 1.09702718, + "epoch": 0.18449403616775684, + "flos": 517933066752.0, + "grad_norm": 0.07391212127287443, + "language_loss": 0.86896807, + "learning_rate": 0.000938710484870727, + "loss": 0.88039851, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 0.46044922, + "step": 959, + "time_per_iteration": 4.31168794631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128823, + "balance_loss_mlp": 1.08416748, + "epoch": 0.1846864178530204, + "flos": 552749391360.0, + "grad_norm": 0.0638837232249089, + "language_loss": 0.86957002, + "learning_rate": 0.0009385609462644189, + "loss": 0.88085824, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 0.44702148, + "step": 960, + "time_per_iteration": 2.6793572902679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118991, + "balance_loss_mlp": 1.07233214, + "epoch": 0.18487879953828396, + "flos": 466166886912.0, + "grad_norm": 0.07248975394705585, + "language_loss": 0.86711299, + "learning_rate": 0.0009384112373936514, + "loss": 0.87830293, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 0.46679688, + "step": 961, + "time_per_iteration": 2.6220860481262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119858, + "balance_loss_mlp": 1.07334304, + "epoch": 0.18507118122354752, + "flos": 648496212480.0, + "grad_norm": 0.06813544125014795, + "language_loss": 0.92053163, + "learning_rate": 0.0009382613583165467, + "loss": 0.93173021, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 0.46533203, + "step": 962, + "time_per_iteration": 2.8032093048095703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108588, + "balance_loss_mlp": 1.06142831, + "epoch": 0.18526356290881107, + "flos": 626772764160.0, + "grad_norm": 0.07296294799157402, + "language_loss": 0.9064188, + "learning_rate": 0.0009381113090912928, + "loss": 0.91750467, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 0.47167969, + "step": 963, + "time_per_iteration": 2.7358789443969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109929, + "balance_loss_mlp": 1.06741881, + "epoch": 0.18545594459407463, + "flos": 432726769152.0, + "grad_norm": 0.07962159601741099, + "language_loss": 0.90353996, + "learning_rate": 0.000937961089776144, + "loss": 0.91463923, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 0.42480469, + "step": 964, + "time_per_iteration": 2.5761237144470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128672, + "balance_loss_mlp": 1.07924736, + "epoch": 0.1856483262793382, + "flos": 749061043200.0, + "grad_norm": 0.09082243760489998, + "language_loss": 0.83673573, + "learning_rate": 0.0009378107004294208, + "loss": 0.84802246, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 0.49438477, + "step": 965, + "time_per_iteration": 2.9681291580200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132442, + "balance_loss_mlp": 1.08542585, + "epoch": 0.18584070796460178, + "flos": 530326642176.0, + "grad_norm": 0.08405098410424734, + "language_loss": 0.92054594, + "learning_rate": 0.0009376601411095096, + "loss": 0.93187034, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 0.4699707, + "step": 966, + "time_per_iteration": 2.696122407913208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138698, + "balance_loss_mlp": 1.09773731, + "epoch": 0.18603308964986534, + "flos": 483106830336.0, + "grad_norm": 0.07104128547690361, + "language_loss": 0.87554526, + "learning_rate": 0.0009375094118748622, + "loss": 0.88693225, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 0.40991211, + "step": 967, + "time_per_iteration": 2.6025850772857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179223, + "balance_loss_mlp": 1.13373268, + "epoch": 0.1862254713351289, + "flos": 801316551168.0, + "grad_norm": 0.0728928893981835, + "language_loss": 0.91626799, + "learning_rate": 0.0009373585127839976, + "loss": 0.92806023, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 0.45507812, + "step": 968, + "time_per_iteration": 2.9854021072387695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212732, + "balance_loss_mlp": 1.16905367, + "epoch": 0.18641785302039246, + "flos": 478323325440.0, + "grad_norm": 0.08777237711590531, + "language_loss": 0.91368866, + "learning_rate": 0.0009372074438954994, + "loss": 0.92581606, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 0.43652344, + "step": 969, + "time_per_iteration": 2.5014536380767822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211792, + "balance_loss_mlp": 1.16539574, + "epoch": 0.18661023470565602, + "flos": 388911684096.0, + "grad_norm": 0.0704882552763471, + "language_loss": 0.92436379, + "learning_rate": 0.0009370562052680181, + "loss": 0.93648171, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 0.46411133, + "step": 970, + "time_per_iteration": 2.453458070755005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120766, + "balance_loss_mlp": 1.16183591, + "epoch": 0.18680261639091958, + "flos": 564676033536.0, + "grad_norm": 0.07372597108689087, + "language_loss": 0.89988613, + "learning_rate": 0.0009369047969602695, + "loss": 0.91196281, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 0.45825195, + "step": 971, + "time_per_iteration": 2.703948497772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192702, + "balance_loss_mlp": 1.14396954, + "epoch": 0.18699499807618314, + "flos": 479259763200.0, + "grad_norm": 0.08557962606734577, + "language_loss": 0.8750906, + "learning_rate": 0.0009367532190310357, + "loss": 0.88701761, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 0.48657227, + "step": 972, + "time_per_iteration": 4.1564977169036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148896, + "balance_loss_mlp": 1.1052649, + "epoch": 0.1871873797614467, + "flos": 553283136000.0, + "grad_norm": 0.06811184838385763, + "language_loss": 0.89467651, + "learning_rate": 0.0009366014715391644, + "loss": 0.90616548, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 0.43603516, + "step": 973, + "time_per_iteration": 2.695730209350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134701, + "balance_loss_mlp": 1.09307301, + "epoch": 0.18737976144671029, + "flos": 552811060224.0, + "grad_norm": 0.054567817192194557, + "language_loss": 0.84347546, + "learning_rate": 0.0009364495545435693, + "loss": 0.85482252, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 0.41625977, + "step": 974, + "time_per_iteration": 2.828831672668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146218, + "balance_loss_mlp": 1.09970224, + "epoch": 0.18757214313197385, + "flos": 502250761728.0, + "grad_norm": 0.08256927623824414, + "language_loss": 0.89333141, + "learning_rate": 0.0009362974681032297, + "loss": 0.90479362, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 0.46484375, + "step": 975, + "time_per_iteration": 2.5982418060302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143654, + "balance_loss_mlp": 1.09909391, + "epoch": 0.1877645248172374, + "flos": 675010506240.0, + "grad_norm": 0.07754570301250979, + "language_loss": 0.89447427, + "learning_rate": 0.0009361452122771907, + "loss": 0.90591079, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 0.44555664, + "step": 976, + "time_per_iteration": 2.881242275238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133689, + "balance_loss_mlp": 1.08834195, + "epoch": 0.18795690650250096, + "flos": 404989341696.0, + "grad_norm": 0.0965092241218366, + "language_loss": 0.84541976, + "learning_rate": 0.0009359927871245635, + "loss": 0.85675669, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 0.45361328, + "step": 977, + "time_per_iteration": 2.4720265865325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113596, + "balance_loss_mlp": 1.09039843, + "epoch": 0.18814928818776452, + "flos": 637891448832.0, + "grad_norm": 0.09227923665031239, + "language_loss": 0.87538362, + "learning_rate": 0.0009358401927045246, + "loss": 0.88674331, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 0.45581055, + "step": 978, + "time_per_iteration": 2.8225297927856445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140746, + "balance_loss_mlp": 1.0945406, + "epoch": 0.18834166987302808, + "flos": 1138282191360.0, + "grad_norm": 0.05953389716062443, + "language_loss": 0.88990903, + "learning_rate": 0.0009356874290763166, + "loss": 0.90131652, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 0.46264648, + "step": 979, + "time_per_iteration": 3.4754927158355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140535, + "balance_loss_mlp": 1.09494936, + "epoch": 0.18853405155829164, + "flos": 504793202688.0, + "grad_norm": 0.06969100284100371, + "language_loss": 0.89955008, + "learning_rate": 0.0009355344962992474, + "loss": 0.91095543, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 0.45581055, + "step": 980, + "time_per_iteration": 2.6008429527282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138568, + "balance_loss_mlp": 1.09291101, + "epoch": 0.1887264332435552, + "flos": 608177258496.0, + "grad_norm": 0.07021551702573088, + "language_loss": 0.88888156, + "learning_rate": 0.0009353813944326908, + "loss": 0.90026724, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 0.45654297, + "step": 981, + "time_per_iteration": 2.9102253913879395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141833, + "balance_loss_mlp": 1.09352899, + "epoch": 0.1889188149288188, + "flos": 552529506816.0, + "grad_norm": 0.0640154196439605, + "language_loss": 0.83560127, + "learning_rate": 0.0009352281235360863, + "loss": 0.84701967, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 0.4831543, + "step": 982, + "time_per_iteration": 2.690695285797119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149234, + "balance_loss_mlp": 1.10627127, + "epoch": 0.18911119661408235, + "flos": 418559063040.0, + "grad_norm": 0.06254433649037737, + "language_loss": 0.85791624, + "learning_rate": 0.0009350746836689389, + "loss": 0.86940861, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 0.4296875, + "step": 983, + "time_per_iteration": 2.524491548538208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104727, + "balance_loss_mlp": 1.02905524, + "epoch": 0.1893035782993459, + "flos": 1481974299648.0, + "grad_norm": 0.024687708549402564, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82486492, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.18261719, + "step": 984, + "time_per_iteration": 5.200335741043091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156154, + "balance_loss_mlp": 1.1069684, + "epoch": 0.18949595998460947, + "flos": 508467373056.0, + "grad_norm": 0.08202626484000469, + "language_loss": 0.84151661, + "learning_rate": 0.0009347672972613634, + "loss": 0.85307819, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.49145508, + "step": 985, + "time_per_iteration": 2.6939473152160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011517, + "balance_loss_mlp": 1.10756862, + "epoch": 0.18968834166987303, + "flos": 531087611904.0, + "grad_norm": 0.061889675774481866, + "language_loss": 0.8651796, + "learning_rate": 0.0009346133508402735, + "loss": 0.87669659, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.44140625, + "step": 986, + "time_per_iteration": 2.695004463195801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146205, + "balance_loss_mlp": 1.1000948, + "epoch": 0.1898807233551366, + "flos": 499762649088.0, + "grad_norm": 0.07730871241699967, + "language_loss": 0.84821075, + "learning_rate": 0.0009344592356873166, + "loss": 0.85967278, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.46118164, + "step": 987, + "time_per_iteration": 2.635143518447876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143677, + "balance_loss_mlp": 1.0975666, + "epoch": 0.19007310504040015, + "flos": 602220178944.0, + "grad_norm": 0.058246004489727894, + "language_loss": 0.79289091, + "learning_rate": 0.0009343049518623255, + "loss": 0.80432773, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.46142578, + "step": 988, + "time_per_iteration": 2.7257165908813477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126709, + "balance_loss_mlp": 1.08503366, + "epoch": 0.1902654867256637, + "flos": 601651929600.0, + "grad_norm": 0.06464318177286693, + "language_loss": 0.83752143, + "learning_rate": 0.0009341504994251985, + "loss": 0.84878862, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.41674805, + "step": 989, + "time_per_iteration": 2.8336057662963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052089, + "balance_loss_mlp": 1.03692603, + "epoch": 0.19045786841092727, + "flos": 1575784005120.0, + "grad_norm": 0.01962059038868396, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74572587, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.15136719, + "step": 990, + "time_per_iteration": 4.980287551879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118455, + "balance_loss_mlp": 1.07682681, + "epoch": 0.19065025009619085, + "flos": 681634579968.0, + "grad_norm": 0.06360467015426281, + "language_loss": 0.82411575, + "learning_rate": 0.0009338410889544574, + "loss": 0.83530033, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 0.41601562, + "step": 991, + "time_per_iteration": 3.0192768573760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123102, + "balance_loss_mlp": 1.0790422, + "epoch": 0.1908426317814544, + "flos": 602264595456.0, + "grad_norm": 0.06107834506241764, + "language_loss": 0.88440853, + "learning_rate": 0.000933686131040967, + "loss": 0.89563954, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 0.44067383, + "step": 992, + "time_per_iteration": 2.795952796936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118187, + "balance_loss_mlp": 1.07479525, + "epoch": 0.19103501346671797, + "flos": 586308077568.0, + "grad_norm": 0.08075044213119366, + "language_loss": 0.91145802, + "learning_rate": 0.0009335310047555883, + "loss": 0.92263985, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 0.43383789, + "step": 993, + "time_per_iteration": 2.6966800689697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144036, + "balance_loss_mlp": 1.10052443, + "epoch": 0.19122739515198153, + "flos": 545761898496.0, + "grad_norm": 0.06789475617385991, + "language_loss": 0.89048505, + "learning_rate": 0.0009333757101585467, + "loss": 0.90192544, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 0.43554688, + "step": 994, + "time_per_iteration": 2.659120559692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159671, + "balance_loss_mlp": 1.11687493, + "epoch": 0.1914197768372451, + "flos": 521446450176.0, + "grad_norm": 0.05475551086737561, + "language_loss": 0.94071913, + "learning_rate": 0.0009332202473101329, + "loss": 0.95231587, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 0.42822266, + "step": 995, + "time_per_iteration": 2.672307014465332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153956, + "balance_loss_mlp": 1.11011088, + "epoch": 0.19161215852250865, + "flos": 611246103552.0, + "grad_norm": 0.060816834986447306, + "language_loss": 0.8370983, + "learning_rate": 0.0009330646162707028, + "loss": 0.84863788, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 0.4387207, + "step": 996, + "time_per_iteration": 2.7483248710632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155376, + "balance_loss_mlp": 1.11274719, + "epoch": 0.1918045402077722, + "flos": 846660916224.0, + "grad_norm": 0.05013127115514869, + "language_loss": 0.85195571, + "learning_rate": 0.0009329088171006779, + "loss": 0.86350954, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.42626953, + "step": 997, + "time_per_iteration": 3.1445202827453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163134, + "balance_loss_mlp": 1.1197654, + "epoch": 0.19199692189303577, + "flos": 465937090560.0, + "grad_norm": 0.07353815647154911, + "language_loss": 0.86074895, + "learning_rate": 0.0009327528498605446, + "loss": 0.87238026, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 0.43383789, + "step": 998, + "time_per_iteration": 2.536146402359009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159094, + "balance_loss_mlp": 1.11844337, + "epoch": 0.19218930357829936, + "flos": 531576940032.0, + "grad_norm": 0.06861677349241169, + "language_loss": 0.9080506, + "learning_rate": 0.0009325967146108548, + "loss": 0.91964149, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 0.40649414, + "step": 999, + "time_per_iteration": 2.634549617767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151834, + "balance_loss_mlp": 1.11049271, + "epoch": 0.19238168526356292, + "flos": 601624765440.0, + "grad_norm": 0.0672850368289366, + "language_loss": 0.88138115, + "learning_rate": 0.0009324404114122258, + "loss": 0.89289951, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 0.41357422, + "step": 1000, + "time_per_iteration": 2.677651882171631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164653, + "balance_loss_mlp": 1.12221444, + "epoch": 0.19257406694882648, + "flos": 571982155776.0, + "grad_norm": 0.06402741154285656, + "language_loss": 0.8710497, + "learning_rate": 0.0009322839403253397, + "loss": 0.88269627, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 0.42431641, + "step": 1001, + "time_per_iteration": 2.7528679370880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169088, + "balance_loss_mlp": 1.12440836, + "epoch": 0.19276644863409004, + "flos": 801813219840.0, + "grad_norm": 0.07104878229054386, + "language_loss": 0.84949791, + "learning_rate": 0.0009321273014109439, + "loss": 0.86118877, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.44702148, + "step": 1002, + "time_per_iteration": 2.9990484714508057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114889, + "balance_loss_mlp": 1.10523582, + "epoch": 0.1929588303193536, + "flos": 563314507776.0, + "grad_norm": 0.0673469195429183, + "language_loss": 0.85240018, + "learning_rate": 0.0009319704947298513, + "loss": 0.8638891, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.43676758, + "step": 1003, + "time_per_iteration": 2.8755459785461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141118, + "balance_loss_mlp": 1.10127831, + "epoch": 0.19315121200461716, + "flos": 626837004288.0, + "grad_norm": 0.0925310675323854, + "language_loss": 0.89122581, + "learning_rate": 0.0009318135203429393, + "loss": 0.902637, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.3984375, + "step": 1004, + "time_per_iteration": 2.771192789077759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127979, + "balance_loss_mlp": 1.0866611, + "epoch": 0.19334359368988072, + "flos": 517451079168.0, + "grad_norm": 0.05779097302789, + "language_loss": 0.88602638, + "learning_rate": 0.0009316563783111511, + "loss": 0.8973062, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.41308594, + "step": 1005, + "time_per_iteration": 2.7739861011505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113092, + "balance_loss_mlp": 1.08638334, + "epoch": 0.19353597537514428, + "flos": 694080285696.0, + "grad_norm": 0.06006842888316194, + "language_loss": 0.83199531, + "learning_rate": 0.0009314990686954943, + "loss": 0.84330451, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.44506836, + "step": 1006, + "time_per_iteration": 2.935081720352173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140843, + "balance_loss_mlp": 1.09561515, + "epoch": 0.19372835706040784, + "flos": 1210170585600.0, + "grad_norm": 0.0666735983489841, + "language_loss": 0.81657201, + "learning_rate": 0.000931341591557042, + "loss": 0.82798046, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.45263672, + "step": 1007, + "time_per_iteration": 3.7212610244750977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155404, + "balance_loss_mlp": 1.1041683, + "epoch": 0.19392073874567142, + "flos": 520631152128.0, + "grad_norm": 0.08115294197805281, + "language_loss": 0.87899536, + "learning_rate": 0.0009311839469569325, + "loss": 0.89054936, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.51171875, + "step": 1008, + "time_per_iteration": 2.6384472846984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150065, + "balance_loss_mlp": 1.10030699, + "epoch": 0.19411312043093498, + "flos": 588816013824.0, + "grad_norm": 0.07776470075981182, + "language_loss": 0.88065994, + "learning_rate": 0.0009310261349563687, + "loss": 0.89216053, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.49804688, + "step": 1009, + "time_per_iteration": 2.703058958053589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157544, + "balance_loss_mlp": 1.11160064, + "epoch": 0.19430550211619854, + "flos": 579382253568.0, + "grad_norm": 0.05519618089274153, + "language_loss": 0.86250293, + "learning_rate": 0.0009308681556166186, + "loss": 0.87407839, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.45922852, + "step": 1010, + "time_per_iteration": 2.8404791355133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177928, + "balance_loss_mlp": 1.12480855, + "epoch": 0.1944978838014621, + "flos": 621126973440.0, + "grad_norm": 0.10323239067467582, + "language_loss": 0.8870275, + "learning_rate": 0.0009307100089990152, + "loss": 0.89880681, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.53100586, + "step": 1011, + "time_per_iteration": 2.7103512287139893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185201, + "balance_loss_mlp": 1.13530004, + "epoch": 0.19469026548672566, + "flos": 598714136064.0, + "grad_norm": 0.08766026563197518, + "language_loss": 0.84582877, + "learning_rate": 0.0009305516951649568, + "loss": 0.8576808, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.49902344, + "step": 1012, + "time_per_iteration": 2.6905276775360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175674, + "balance_loss_mlp": 1.12818122, + "epoch": 0.19488264717198922, + "flos": 552161318400.0, + "grad_norm": 0.07259628373080033, + "language_loss": 0.87723738, + "learning_rate": 0.0009303932141759057, + "loss": 0.8889941, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.47485352, + "step": 1013, + "time_per_iteration": 2.7738490104675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161359, + "balance_loss_mlp": 1.11200666, + "epoch": 0.19507502885725278, + "flos": 666135456768.0, + "grad_norm": 0.07589756885314788, + "language_loss": 0.84698361, + "learning_rate": 0.0009302345660933902, + "loss": 0.85859716, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.49291992, + "step": 1014, + "time_per_iteration": 2.7809414863586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152692, + "balance_loss_mlp": 1.10579538, + "epoch": 0.19526741054251634, + "flos": 671081946624.0, + "grad_norm": 0.06636914889533592, + "language_loss": 0.85938931, + "learning_rate": 0.0009300757509790026, + "loss": 0.87091625, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.46875, + "step": 1015, + "time_per_iteration": 2.886200189590454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151123, + "balance_loss_mlp": 1.10324848, + "epoch": 0.19545979222777993, + "flos": 447215675904.0, + "grad_norm": 0.08384883211824797, + "language_loss": 0.91210115, + "learning_rate": 0.0009299167688944005, + "loss": 0.92361236, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.47827148, + "step": 1016, + "time_per_iteration": 2.5308799743652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135215, + "balance_loss_mlp": 1.09036839, + "epoch": 0.1956521739130435, + "flos": 569084009472.0, + "grad_norm": 0.07612639660839114, + "language_loss": 0.86733758, + "learning_rate": 0.0009297576199013063, + "loss": 0.87868977, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.44873047, + "step": 1017, + "time_per_iteration": 2.699352264404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156475, + "balance_loss_mlp": 1.14159799, + "epoch": 0.19584455559830705, + "flos": 1455749273088.0, + "grad_norm": 0.04987694814110311, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.74158609, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.1484375, + "step": 1018, + "time_per_iteration": 4.927512168884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099249, + "balance_loss_mlp": 1.08494341, + "epoch": 0.1960369372835706, + "flos": 1591150252032.0, + "grad_norm": 0.032347612483235935, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.80525547, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.14257812, + "step": 1019, + "time_per_iteration": 5.494646787643433 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128896, + "balance_loss_mlp": 1.08855522, + "epoch": 0.19622931896883417, + "flos": 616017125376.0, + "grad_norm": 0.06601293097738069, + "language_loss": 0.87223667, + "learning_rate": 0.0009292791720892659, + "loss": 0.88352561, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.40332031, + "step": 1020, + "time_per_iteration": 2.8718464374542236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133682, + "balance_loss_mlp": 1.08823943, + "epoch": 0.19642170065409773, + "flos": 466201391616.0, + "grad_norm": 0.07136038826441608, + "language_loss": 0.89387941, + "learning_rate": 0.0009291193560807218, + "loss": 0.90521628, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.45483398, + "step": 1021, + "time_per_iteration": 2.588604211807251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132851, + "balance_loss_mlp": 1.09141409, + "epoch": 0.19661408233936128, + "flos": 515289309696.0, + "grad_norm": 0.06738480994857221, + "language_loss": 0.87651652, + "learning_rate": 0.0009289593734732688, + "loss": 0.88784504, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.41430664, + "step": 1022, + "time_per_iteration": 2.5915818214416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129633, + "balance_loss_mlp": 1.09036541, + "epoch": 0.19680646402462484, + "flos": 392640182784.0, + "grad_norm": 0.06942729809827348, + "language_loss": 0.94984972, + "learning_rate": 0.0009287992243290175, + "loss": 0.96114612, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.39282227, + "step": 1023, + "time_per_iteration": 2.4477546215057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142342, + "balance_loss_mlp": 1.09880638, + "epoch": 0.19699884570988843, + "flos": 626421828096.0, + "grad_norm": 0.1017247644504036, + "language_loss": 0.91891634, + "learning_rate": 0.0009286389087101435, + "loss": 0.93033981, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.43554688, + "step": 1024, + "time_per_iteration": 2.765334129333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142412, + "balance_loss_mlp": 1.09942544, + "epoch": 0.197191227395152, + "flos": 557982577152.0, + "grad_norm": 0.07195718640229302, + "language_loss": 0.8893857, + "learning_rate": 0.0009284784266788864, + "loss": 0.90080982, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.42993164, + "step": 1025, + "time_per_iteration": 2.7323853969573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141995, + "balance_loss_mlp": 1.10327554, + "epoch": 0.19738360908041555, + "flos": 664993815552.0, + "grad_norm": 0.069193395974369, + "language_loss": 0.93259764, + "learning_rate": 0.0009283177782975512, + "loss": 0.94401753, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.38696289, + "step": 1026, + "time_per_iteration": 2.9729068279266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114551, + "balance_loss_mlp": 1.10142589, + "epoch": 0.1975759907656791, + "flos": 522496687104.0, + "grad_norm": 0.08755988500201482, + "language_loss": 0.88955659, + "learning_rate": 0.000928156963628507, + "loss": 0.90101171, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.44067383, + "step": 1027, + "time_per_iteration": 2.594200849533081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138947, + "balance_loss_mlp": 1.09855926, + "epoch": 0.19776837245094267, + "flos": 462482804736.0, + "grad_norm": 0.07316483198701504, + "language_loss": 0.89277303, + "learning_rate": 0.0009279959827341877, + "loss": 0.90416259, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.40405273, + "step": 1028, + "time_per_iteration": 2.7378368377685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140451, + "balance_loss_mlp": 1.09727335, + "epoch": 0.19796075413620623, + "flos": 503058719232.0, + "grad_norm": 0.059550544329949856, + "language_loss": 0.88526183, + "learning_rate": 0.0009278348356770915, + "loss": 0.89666629, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.43188477, + "step": 1029, + "time_per_iteration": 2.5737922191619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133825, + "balance_loss_mlp": 1.0914098, + "epoch": 0.1981531358214698, + "flos": 507538275840.0, + "grad_norm": 0.06393748023743129, + "language_loss": 0.8587814, + "learning_rate": 0.0009276735225197814, + "loss": 0.87011963, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.42431641, + "step": 1030, + "time_per_iteration": 2.648477077484131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146668, + "balance_loss_mlp": 1.10170269, + "epoch": 0.19834551750673335, + "flos": 531547204608.0, + "grad_norm": 0.06069855374703422, + "language_loss": 0.86812896, + "learning_rate": 0.0009275120433248847, + "loss": 0.87959564, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.44946289, + "step": 1031, + "time_per_iteration": 2.6862802505493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148453, + "balance_loss_mlp": 1.10327268, + "epoch": 0.1985378991919969, + "flos": 775511096832.0, + "grad_norm": 0.06482797348212818, + "language_loss": 0.87033594, + "learning_rate": 0.0009273503981550931, + "loss": 0.8818205, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.45166016, + "step": 1032, + "time_per_iteration": 3.0549416542053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157268, + "balance_loss_mlp": 1.11235023, + "epoch": 0.1987302808772605, + "flos": 434288355840.0, + "grad_norm": 0.07571303407420105, + "language_loss": 0.87661642, + "learning_rate": 0.0009271885870731626, + "loss": 0.88818914, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.44946289, + "step": 1033, + "time_per_iteration": 2.4938008785247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172373, + "balance_loss_mlp": 1.12495148, + "epoch": 0.19892266256252406, + "flos": 553604336640.0, + "grad_norm": 0.07801561202279184, + "language_loss": 0.89466584, + "learning_rate": 0.0009270266101419143, + "loss": 0.90638959, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.47460938, + "step": 1034, + "time_per_iteration": 2.61181378364563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169875, + "balance_loss_mlp": 1.12681675, + "epoch": 0.19911504424778761, + "flos": 549865926144.0, + "grad_norm": 0.07487269237991181, + "language_loss": 0.85762119, + "learning_rate": 0.0009268644674242328, + "loss": 0.86931992, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.43066406, + "step": 1035, + "time_per_iteration": 2.6761085987091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163027, + "balance_loss_mlp": 1.1147716, + "epoch": 0.19930742593305117, + "flos": 518281431552.0, + "grad_norm": 0.06997084642295975, + "language_loss": 0.81697071, + "learning_rate": 0.0009267021589830678, + "loss": 0.828601, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.4831543, + "step": 1036, + "time_per_iteration": 2.6166343688964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162737, + "balance_loss_mlp": 1.14547551, + "epoch": 0.19949980761831473, + "flos": 1509338769408.0, + "grad_norm": 0.04224955266067769, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.78789818, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 0.17285156, + "step": 1037, + "time_per_iteration": 4.932336330413818 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124804, + "balance_loss_mlp": 1.08224678, + "epoch": 0.1996921893035783, + "flos": 698129985024.0, + "grad_norm": 0.07370646472771722, + "language_loss": 0.9354341, + "learning_rate": 0.000926377045182406, + "loss": 0.94668216, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.42553711, + "step": 1038, + "time_per_iteration": 2.89486026763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122228, + "balance_loss_mlp": 1.07704759, + "epoch": 0.19988457098884185, + "flos": 727023734784.0, + "grad_norm": 0.06351485696264159, + "language_loss": 0.88915765, + "learning_rate": 0.0009262142399491296, + "loss": 0.9003799, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.4519043, + "step": 1039, + "time_per_iteration": 3.0843544006347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132553, + "balance_loss_mlp": 1.08784938, + "epoch": 0.2000769526741054, + "flos": 560544841728.0, + "grad_norm": 0.06429886269356283, + "language_loss": 0.89007306, + "learning_rate": 0.0009260512692448105, + "loss": 0.9013986, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.44677734, + "step": 1040, + "time_per_iteration": 2.7221181392669678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143871, + "balance_loss_mlp": 1.10071695, + "epoch": 0.200269334359369, + "flos": 572039055360.0, + "grad_norm": 0.0714265416650486, + "language_loss": 0.85044324, + "learning_rate": 0.000925888133132719, + "loss": 0.86188197, + "num_input_tokens_seen": 86289824, + "router_z_loss_mlp": 0.43164062, + "step": 1041, + "time_per_iteration": 2.7112865447998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113685, + "balance_loss_mlp": 1.09566069, + "epoch": 0.20046171604463256, + "flos": 1486118347776.0, + "grad_norm": 0.0301437897992815, + "language_loss": 0.79610431, + "learning_rate": 0.0009257248316761906, + "loss": 0.8072412, + "num_input_tokens_seen": 86516384, + "router_z_loss_mlp": 0.18066406, + "step": 1042, + "time_per_iteration": 4.913869380950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179762, + "balance_loss_mlp": 1.13338971, + "epoch": 0.20065409772989612, + "flos": 496528247808.0, + "grad_norm": 0.11345429965909062, + "language_loss": 0.82242954, + "learning_rate": 0.0009255613649386244, + "loss": 0.83422714, + "num_input_tokens_seen": 86587296, + "router_z_loss_mlp": 0.46337891, + "step": 1043, + "time_per_iteration": 2.6586339473724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153581, + "balance_loss_mlp": 1.11133325, + "epoch": 0.20084647941515968, + "flos": 579367572480.0, + "grad_norm": 0.07362734504976313, + "language_loss": 0.79954398, + "learning_rate": 0.0009253977329834838, + "loss": 0.81107974, + "num_input_tokens_seen": 86662656, + "router_z_loss_mlp": 0.42236328, + "step": 1044, + "time_per_iteration": 2.7028462886810303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143902, + "balance_loss_mlp": 1.0951457, + "epoch": 0.20103886110042324, + "flos": 642076968960.0, + "grad_norm": 0.07842723007783056, + "language_loss": 0.8753317, + "learning_rate": 0.0009252339358742965, + "loss": 0.88677073, + "num_input_tokens_seen": 86734704, + "router_z_loss_mlp": 0.48779297, + "step": 1045, + "time_per_iteration": 2.8069612979888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139165, + "balance_loss_mlp": 1.0902648, + "epoch": 0.2012312427856868, + "flos": 441970007040.0, + "grad_norm": 0.07197327624603128, + "language_loss": 0.84128577, + "learning_rate": 0.000925069973674654, + "loss": 0.85267735, + "num_input_tokens_seen": 86806512, + "router_z_loss_mlp": 0.48925781, + "step": 1046, + "time_per_iteration": 2.603602409362793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136955, + "balance_loss_mlp": 1.09303868, + "epoch": 0.20142362447095036, + "flos": 554402382336.0, + "grad_norm": 0.06199919012721526, + "language_loss": 0.89849102, + "learning_rate": 0.000924905846448212, + "loss": 0.90986055, + "num_input_tokens_seen": 86883440, + "router_z_loss_mlp": 0.43896484, + "step": 1047, + "time_per_iteration": 2.733009099960327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166193, + "balance_loss_mlp": 1.11726964, + "epoch": 0.20161600615621392, + "flos": 670301153280.0, + "grad_norm": 0.08010189097684783, + "language_loss": 0.86224002, + "learning_rate": 0.0009247415542586906, + "loss": 0.87390196, + "num_input_tokens_seen": 86960208, + "router_z_loss_mlp": 0.48950195, + "step": 1048, + "time_per_iteration": 2.8471555709838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186895, + "balance_loss_mlp": 1.13675559, + "epoch": 0.2018083878414775, + "flos": 573091490304.0, + "grad_norm": 0.050762349186412876, + "language_loss": 0.83535373, + "learning_rate": 0.0009245770971698735, + "loss": 0.84722269, + "num_input_tokens_seen": 87044144, + "router_z_loss_mlp": 0.50170898, + "step": 1049, + "time_per_iteration": 2.889474630355835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183513, + "balance_loss_mlp": 1.13671136, + "epoch": 0.20200076952674106, + "flos": 425857844736.0, + "grad_norm": 0.07506320746734087, + "language_loss": 0.8918047, + "learning_rate": 0.0009244124752456087, + "loss": 0.90363979, + "num_input_tokens_seen": 87109136, + "router_z_loss_mlp": 0.46826172, + "step": 1050, + "time_per_iteration": 2.5762786865234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205722, + "balance_loss_mlp": 1.15453339, + "epoch": 0.20219315121200462, + "flos": 536597581824.0, + "grad_norm": 0.08917577036116058, + "language_loss": 0.86475039, + "learning_rate": 0.0009242476885498081, + "loss": 0.87680757, + "num_input_tokens_seen": 87184320, + "router_z_loss_mlp": 0.51220703, + "step": 1051, + "time_per_iteration": 2.720395565032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193502, + "balance_loss_mlp": 1.14009643, + "epoch": 0.20238553289726818, + "flos": 477873644544.0, + "grad_norm": 0.08090891256677915, + "language_loss": 0.81871718, + "learning_rate": 0.0009240827371464474, + "loss": 0.83065224, + "num_input_tokens_seen": 87248224, + "router_z_loss_mlp": 0.53442383, + "step": 1052, + "time_per_iteration": 2.535388231277466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162702, + "balance_loss_mlp": 1.11833251, + "epoch": 0.20257791458253174, + "flos": 1152057116160.0, + "grad_norm": 0.08177732735855556, + "language_loss": 0.84886205, + "learning_rate": 0.0009239176210995666, + "loss": 0.86048913, + "num_input_tokens_seen": 87333088, + "router_z_loss_mlp": 0.4440918, + "step": 1053, + "time_per_iteration": 3.4955379962921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148392, + "balance_loss_mlp": 1.0973227, + "epoch": 0.2027702962677953, + "flos": 666913678848.0, + "grad_norm": 0.9822109545682867, + "language_loss": 0.94933617, + "learning_rate": 0.0009237523404732695, + "loss": 0.96082008, + "num_input_tokens_seen": 87413840, + "router_z_loss_mlp": 0.51074219, + "step": 1054, + "time_per_iteration": 2.90132737159729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137464, + "balance_loss_mlp": 1.09118664, + "epoch": 0.20296267795305886, + "flos": 641298746880.0, + "grad_norm": 0.09331279688006895, + "language_loss": 0.85504258, + "learning_rate": 0.0009235868953317235, + "loss": 0.86641729, + "num_input_tokens_seen": 87487168, + "router_z_loss_mlp": 0.46264648, + "step": 1055, + "time_per_iteration": 2.813202381134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212355, + "balance_loss_mlp": 1.16388512, + "epoch": 0.20315505963832242, + "flos": 930575070720.0, + "grad_norm": 0.08645469446577787, + "language_loss": 0.86679947, + "learning_rate": 0.0009234212857391602, + "loss": 0.87892294, + "num_input_tokens_seen": 87573184, + "router_z_loss_mlp": 0.48486328, + "step": 1056, + "time_per_iteration": 3.184723377227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01289494, + "balance_loss_mlp": 1.23723245, + "epoch": 0.20334744132358598, + "flos": 562111197696.0, + "grad_norm": 0.11402704661401492, + "language_loss": 0.90548229, + "learning_rate": 0.000923255511759875, + "loss": 0.91837716, + "num_input_tokens_seen": 87651968, + "router_z_loss_mlp": 0.52319336, + "step": 1057, + "time_per_iteration": 2.8404476642608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01374128, + "balance_loss_mlp": 1.3215096, + "epoch": 0.20353982300884957, + "flos": 644206804992.0, + "grad_norm": 0.12448379126392096, + "language_loss": 0.86306804, + "learning_rate": 0.000923089573458227, + "loss": 0.87680936, + "num_input_tokens_seen": 87727792, + "router_z_loss_mlp": 0.52661133, + "step": 1058, + "time_per_iteration": 2.921942949295044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01411943, + "balance_loss_mlp": 1.35701096, + "epoch": 0.20373220469411313, + "flos": 651421522944.0, + "grad_norm": 0.12614323996078466, + "language_loss": 0.84856015, + "learning_rate": 0.0009229234708986392, + "loss": 0.8626796, + "num_input_tokens_seen": 87806048, + "router_z_loss_mlp": 0.54931641, + "step": 1059, + "time_per_iteration": 2.922795057296753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01629047, + "balance_loss_mlp": 1.60253465, + "epoch": 0.2039245863793767, + "flos": 1437628787712.0, + "grad_norm": 0.12493252943786969, + "language_loss": 0.81666899, + "learning_rate": 0.0009227572041455982, + "loss": 0.83295941, + "num_input_tokens_seen": 88018160, + "router_z_loss_mlp": 0.265625, + "step": 1060, + "time_per_iteration": 4.733684062957764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01333622, + "balance_loss_mlp": 1.27976346, + "epoch": 0.20411696806464025, + "flos": 596967169536.0, + "grad_norm": 0.0936460184690869, + "language_loss": 0.86563337, + "learning_rate": 0.0009225907732636548, + "loss": 0.87896961, + "num_input_tokens_seen": 88090864, + "router_z_loss_mlp": 0.53881836, + "step": 1061, + "time_per_iteration": 2.761353015899658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01296883, + "balance_loss_mlp": 1.24183202, + "epoch": 0.2043093497499038, + "flos": 573803274240.0, + "grad_norm": 0.09002543594031559, + "language_loss": 0.87698424, + "learning_rate": 0.0009224241783174227, + "loss": 0.88995302, + "num_input_tokens_seen": 88161360, + "router_z_loss_mlp": 0.55078125, + "step": 1062, + "time_per_iteration": 2.7161052227020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252808, + "balance_loss_mlp": 1.19947362, + "epoch": 0.20450173143516737, + "flos": 630352958976.0, + "grad_norm": 0.08928798499879465, + "language_loss": 0.87254798, + "learning_rate": 0.0009222574193715802, + "loss": 0.88507611, + "num_input_tokens_seen": 88234960, + "router_z_loss_mlp": 0.53369141, + "step": 1063, + "time_per_iteration": 2.779623031616211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122391, + "balance_loss_mlp": 1.16757131, + "epoch": 0.20469411312043093, + "flos": 574003335168.0, + "grad_norm": 0.06606001070927259, + "language_loss": 0.87212694, + "learning_rate": 0.000922090496490869, + "loss": 0.88436604, + "num_input_tokens_seen": 88308176, + "router_z_loss_mlp": 0.56323242, + "step": 1064, + "time_per_iteration": 2.7111196517944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217897, + "balance_loss_mlp": 1.16120076, + "epoch": 0.20488649480569449, + "flos": 637053755904.0, + "grad_norm": 0.3109146854617931, + "language_loss": 0.90918952, + "learning_rate": 0.0009219234097400937, + "loss": 0.92136848, + "num_input_tokens_seen": 88386768, + "router_z_loss_mlp": 0.56665039, + "step": 1065, + "time_per_iteration": 2.804588556289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245438, + "balance_loss_mlp": 1.18359244, + "epoch": 0.20507887649095807, + "flos": 975793526784.0, + "grad_norm": 0.06908392980849179, + "language_loss": 0.84456235, + "learning_rate": 0.0009217561591841237, + "loss": 0.85701674, + "num_input_tokens_seen": 88476576, + "router_z_loss_mlp": 0.61816406, + "step": 1066, + "time_per_iteration": 3.303875207901001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01287048, + "balance_loss_mlp": 1.21867001, + "epoch": 0.20527125817622163, + "flos": 486183015936.0, + "grad_norm": 0.1162597514909173, + "language_loss": 0.82140827, + "learning_rate": 0.0009215887448878913, + "loss": 0.83427876, + "num_input_tokens_seen": 88541968, + "router_z_loss_mlp": 0.68408203, + "step": 1067, + "time_per_iteration": 2.568690776824951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01293452, + "balance_loss_mlp": 1.22288036, + "epoch": 0.2054636398614852, + "flos": 527178875904.0, + "grad_norm": 0.08586469474305494, + "language_loss": 0.85986763, + "learning_rate": 0.0009214211669163922, + "loss": 0.87280214, + "num_input_tokens_seen": 88615296, + "router_z_loss_mlp": 0.70654297, + "step": 1068, + "time_per_iteration": 2.700090169906616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01279646, + "balance_loss_mlp": 1.21408105, + "epoch": 0.20565602154674875, + "flos": 558182638080.0, + "grad_norm": 0.06609725061841937, + "language_loss": 0.94520444, + "learning_rate": 0.0009212534253346862, + "loss": 0.95800096, + "num_input_tokens_seen": 88691584, + "router_z_loss_mlp": 0.65478516, + "step": 1069, + "time_per_iteration": 2.696699857711792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01285979, + "balance_loss_mlp": 1.21912634, + "epoch": 0.2058484032320123, + "flos": 504224953344.0, + "grad_norm": 0.07442061186670905, + "language_loss": 0.85475862, + "learning_rate": 0.0009210855202078964, + "loss": 0.86761844, + "num_input_tokens_seen": 88756592, + "router_z_loss_mlp": 0.66845703, + "step": 1070, + "time_per_iteration": 2.5769481658935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01284239, + "balance_loss_mlp": 1.21771979, + "epoch": 0.20604078491727587, + "flos": 433169109504.0, + "grad_norm": 0.07631989099853977, + "language_loss": 0.88063252, + "learning_rate": 0.0009209174516012091, + "loss": 0.89347488, + "num_input_tokens_seen": 88820928, + "router_z_loss_mlp": 0.66601562, + "step": 1071, + "time_per_iteration": 2.6154239177703857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261362, + "balance_loss_mlp": 1.19317448, + "epoch": 0.20623316660253943, + "flos": 608711003136.0, + "grad_norm": 0.05883273983798781, + "language_loss": 0.90461957, + "learning_rate": 0.0009207492195798747, + "loss": 0.91723317, + "num_input_tokens_seen": 88895440, + "router_z_loss_mlp": 0.68164062, + "step": 1072, + "time_per_iteration": 2.764965534210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261739, + "balance_loss_mlp": 1.18997467, + "epoch": 0.206425548287803, + "flos": 480425997312.0, + "grad_norm": 0.07316980575900926, + "language_loss": 0.86156094, + "learning_rate": 0.0009205808242092061, + "loss": 0.87417829, + "num_input_tokens_seen": 88964400, + "router_z_loss_mlp": 0.71728516, + "step": 1073, + "time_per_iteration": 2.6222856044769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258718, + "balance_loss_mlp": 1.18952858, + "epoch": 0.20661792997306658, + "flos": 949429734912.0, + "grad_norm": 0.06600331144021966, + "language_loss": 0.83598334, + "learning_rate": 0.0009204122655545808, + "loss": 0.84857053, + "num_input_tokens_seen": 89049600, + "router_z_loss_mlp": 0.69189453, + "step": 1074, + "time_per_iteration": 3.313964605331421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252953, + "balance_loss_mlp": 1.18571925, + "epoch": 0.20681031165833014, + "flos": 603487729152.0, + "grad_norm": 0.06834339296378739, + "language_loss": 0.82186073, + "learning_rate": 0.0009202435436814388, + "loss": 0.83439028, + "num_input_tokens_seen": 89119024, + "router_z_loss_mlp": 0.67236328, + "step": 1075, + "time_per_iteration": 2.68725848197937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260409, + "balance_loss_mlp": 1.1926024, + "epoch": 0.2070026933435937, + "flos": 708984368640.0, + "grad_norm": 0.07476886245144747, + "language_loss": 0.91110998, + "learning_rate": 0.0009200746586552836, + "loss": 0.92371404, + "num_input_tokens_seen": 89197344, + "router_z_loss_mlp": 0.67773438, + "step": 1076, + "time_per_iteration": 2.889910936355591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238308, + "balance_loss_mlp": 1.17145491, + "epoch": 0.20719507502885726, + "flos": 829814948352.0, + "grad_norm": 0.06855298516082668, + "language_loss": 0.84957182, + "learning_rate": 0.0009199056105416825, + "loss": 0.86195493, + "num_input_tokens_seen": 89280464, + "router_z_loss_mlp": 0.66894531, + "step": 1077, + "time_per_iteration": 3.0826096534729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242457, + "balance_loss_mlp": 1.17312455, + "epoch": 0.20738745671412082, + "flos": 638294141952.0, + "grad_norm": 0.0732932371665923, + "language_loss": 0.87494361, + "learning_rate": 0.0009197363994062654, + "loss": 0.8873682, + "num_input_tokens_seen": 89353344, + "router_z_loss_mlp": 0.69287109, + "step": 1078, + "time_per_iteration": 2.814481735229492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121373, + "balance_loss_mlp": 1.15455508, + "epoch": 0.20757983839938438, + "flos": 685602786816.0, + "grad_norm": 0.060498447021287705, + "language_loss": 0.85097158, + "learning_rate": 0.0009195670253147262, + "loss": 0.86310887, + "num_input_tokens_seen": 89439328, + "router_z_loss_mlp": 0.59179688, + "step": 1079, + "time_per_iteration": 2.989818572998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216427, + "balance_loss_mlp": 1.15286458, + "epoch": 0.20777222008464794, + "flos": 519282109440.0, + "grad_norm": 0.0563328194871683, + "language_loss": 0.83052152, + "learning_rate": 0.0009193974883328216, + "loss": 0.84268576, + "num_input_tokens_seen": 89510160, + "router_z_loss_mlp": 0.63574219, + "step": 1080, + "time_per_iteration": 2.611929416656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209864, + "balance_loss_mlp": 1.14553857, + "epoch": 0.2079646017699115, + "flos": 511402595328.0, + "grad_norm": 0.06150097183917509, + "language_loss": 0.87932825, + "learning_rate": 0.0009192277885263718, + "loss": 0.89142686, + "num_input_tokens_seen": 89582960, + "router_z_loss_mlp": 0.64306641, + "step": 1081, + "time_per_iteration": 2.65731143951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198356, + "balance_loss_mlp": 1.13264751, + "epoch": 0.20815698345517505, + "flos": 931820226048.0, + "grad_norm": 0.05302154537588453, + "language_loss": 0.86579674, + "learning_rate": 0.0009190579259612602, + "loss": 0.87778032, + "num_input_tokens_seen": 89675488, + "router_z_loss_mlp": 0.65722656, + "step": 1082, + "time_per_iteration": 3.2999303340911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207666, + "balance_loss_mlp": 1.14300656, + "epoch": 0.20834936514043864, + "flos": 632401302528.0, + "grad_norm": 0.07988409962843289, + "language_loss": 0.87673134, + "learning_rate": 0.000918887900703433, + "loss": 0.88880801, + "num_input_tokens_seen": 89747872, + "router_z_loss_mlp": 0.64648438, + "step": 1083, + "time_per_iteration": 2.7956981658935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204411, + "balance_loss_mlp": 1.14361465, + "epoch": 0.2085417468257022, + "flos": 394384578048.0, + "grad_norm": 0.07357181622228276, + "language_loss": 0.91242653, + "learning_rate": 0.0009187177128188999, + "loss": 0.92447066, + "num_input_tokens_seen": 89810176, + "router_z_loss_mlp": 0.60693359, + "step": 1084, + "time_per_iteration": 2.4656450748443604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194774, + "balance_loss_mlp": 1.16902518, + "epoch": 0.20873412851096576, + "flos": 1402147293696.0, + "grad_norm": 0.038082499218869, + "language_loss": 0.77156538, + "learning_rate": 0.0009185473623737339, + "loss": 0.78351313, + "num_input_tokens_seen": 90038432, + "router_z_loss_mlp": 0.2578125, + "step": 1085, + "time_per_iteration": 4.855400323867798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181967, + "balance_loss_mlp": 1.12419796, + "epoch": 0.20892651019622932, + "flos": 447830913024.0, + "grad_norm": 0.07376491342946172, + "language_loss": 0.86747313, + "learning_rate": 0.000918376849434071, + "loss": 0.87929279, + "num_input_tokens_seen": 90101568, + "router_z_loss_mlp": 0.57739258, + "step": 1086, + "time_per_iteration": 2.5493998527526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192292, + "balance_loss_mlp": 1.1305418, + "epoch": 0.20911889188149288, + "flos": 493106268672.0, + "grad_norm": 0.07728027722551846, + "language_loss": 0.9155581, + "learning_rate": 0.0009182061740661098, + "loss": 0.92748106, + "num_input_tokens_seen": 90169344, + "router_z_loss_mlp": 0.61767578, + "step": 1087, + "time_per_iteration": 2.5755503177642822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192286, + "balance_loss_mlp": 1.13144195, + "epoch": 0.20931127356675644, + "flos": 841291909632.0, + "grad_norm": 0.057753656338862314, + "language_loss": 0.85712528, + "learning_rate": 0.0009180353363361127, + "loss": 0.86904812, + "num_input_tokens_seen": 90252416, + "router_z_loss_mlp": 0.60888672, + "step": 1088, + "time_per_iteration": 3.1143646240234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180296, + "balance_loss_mlp": 1.11868906, + "epoch": 0.20950365525202, + "flos": 757140618240.0, + "grad_norm": 0.07221088423930573, + "language_loss": 0.83469599, + "learning_rate": 0.0009178643363104044, + "loss": 0.84649897, + "num_input_tokens_seen": 90337952, + "router_z_loss_mlp": 0.61621094, + "step": 1089, + "time_per_iteration": 3.092656135559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199477, + "balance_loss_mlp": 1.138394, + "epoch": 0.20969603693728356, + "flos": 472539142656.0, + "grad_norm": 0.08745424257973078, + "language_loss": 0.92463166, + "learning_rate": 0.0009176931740553735, + "loss": 0.93662637, + "num_input_tokens_seen": 90401488, + "router_z_loss_mlp": 0.61083984, + "step": 1090, + "time_per_iteration": 2.53558349609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207875, + "balance_loss_mlp": 1.14850855, + "epoch": 0.20988841862254715, + "flos": 976930025472.0, + "grad_norm": 0.07295134358518522, + "language_loss": 0.83623219, + "learning_rate": 0.0009175218496374708, + "loss": 0.84831095, + "num_input_tokens_seen": 90486144, + "router_z_loss_mlp": 0.59277344, + "step": 1091, + "time_per_iteration": 3.3514459133148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226261, + "balance_loss_mlp": 1.16503549, + "epoch": 0.2100808003078107, + "flos": 1093120634880.0, + "grad_norm": 0.0645587086921242, + "language_loss": 0.86590576, + "learning_rate": 0.0009173503631232103, + "loss": 0.87816834, + "num_input_tokens_seen": 90571504, + "router_z_loss_mlp": 0.61181641, + "step": 1092, + "time_per_iteration": 3.3893167972564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122226, + "balance_loss_mlp": 1.16194034, + "epoch": 0.21027318199307427, + "flos": 1012964714496.0, + "grad_norm": 0.12026645314545058, + "language_loss": 0.8245008, + "learning_rate": 0.0009171787145791691, + "loss": 0.83672333, + "num_input_tokens_seen": 90646016, + "router_z_loss_mlp": 0.60351562, + "step": 1093, + "time_per_iteration": 3.251084327697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251584, + "balance_loss_mlp": 1.18854666, + "epoch": 0.21046556367833782, + "flos": 521394693120.0, + "grad_norm": 0.08481501206118727, + "language_loss": 0.8143028, + "learning_rate": 0.000917006904071987, + "loss": 0.82681859, + "num_input_tokens_seen": 90713440, + "router_z_loss_mlp": 0.63037109, + "step": 1094, + "time_per_iteration": 2.613060712814331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272512, + "balance_loss_mlp": 1.20551634, + "epoch": 0.21065794536360138, + "flos": 603717525504.0, + "grad_norm": 0.08143629367900677, + "language_loss": 0.87639427, + "learning_rate": 0.0009168349316683669, + "loss": 0.88911939, + "num_input_tokens_seen": 90788208, + "router_z_loss_mlp": 0.66992188, + "step": 1095, + "time_per_iteration": 2.705172538757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269472, + "balance_loss_mlp": 1.20462179, + "epoch": 0.21085032704886494, + "flos": 603346765824.0, + "grad_norm": 0.05512017255927588, + "language_loss": 0.83512938, + "learning_rate": 0.0009166627974350741, + "loss": 0.8478241, + "num_input_tokens_seen": 90873776, + "router_z_loss_mlp": 0.64746094, + "step": 1096, + "time_per_iteration": 2.8979411125183105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01259233, + "balance_loss_mlp": 1.19390619, + "epoch": 0.2110427087341285, + "flos": 637671564288.0, + "grad_norm": 0.06519728045913388, + "language_loss": 0.90715098, + "learning_rate": 0.0009164905014389373, + "loss": 0.91974336, + "num_input_tokens_seen": 90945872, + "router_z_loss_mlp": 0.65283203, + "step": 1097, + "time_per_iteration": 2.7965359687805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291926, + "balance_loss_mlp": 1.22445381, + "epoch": 0.21123509041939206, + "flos": 522919203840.0, + "grad_norm": 0.07891140172991894, + "language_loss": 0.87571776, + "learning_rate": 0.0009163180437468476, + "loss": 0.88863701, + "num_input_tokens_seen": 91016224, + "router_z_loss_mlp": 0.67480469, + "step": 1098, + "time_per_iteration": 2.678684949874878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012877, + "balance_loss_mlp": 1.22065675, + "epoch": 0.21142747210465565, + "flos": 451188652032.0, + "grad_norm": 0.06282838131309415, + "language_loss": 0.86816525, + "learning_rate": 0.000916145424425759, + "loss": 0.88104224, + "num_input_tokens_seen": 91086752, + "router_z_loss_mlp": 0.67041016, + "step": 1099, + "time_per_iteration": 2.6685678958892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01305165, + "balance_loss_mlp": 1.23554707, + "epoch": 0.2116198537899192, + "flos": 876175045632.0, + "grad_norm": 0.08616648204830919, + "language_loss": 0.916682, + "learning_rate": 0.0009159726435426885, + "loss": 0.92973363, + "num_input_tokens_seen": 91162960, + "router_z_loss_mlp": 0.69628906, + "step": 1100, + "time_per_iteration": 3.0852713584899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01282199, + "balance_loss_mlp": 1.21677744, + "epoch": 0.21181223547518277, + "flos": 523662921216.0, + "grad_norm": 0.07323647205544051, + "language_loss": 0.91053265, + "learning_rate": 0.0009157997011647154, + "loss": 0.92335469, + "num_input_tokens_seen": 91229840, + "router_z_loss_mlp": 0.65380859, + "step": 1101, + "time_per_iteration": 2.6137943267822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01270647, + "balance_loss_mlp": 1.20784807, + "epoch": 0.21200461716044633, + "flos": 572296015872.0, + "grad_norm": 0.05451247925490285, + "language_loss": 0.87014931, + "learning_rate": 0.0009156265973589817, + "loss": 0.88285577, + "num_input_tokens_seen": 91307936, + "router_z_loss_mlp": 0.62792969, + "step": 1102, + "time_per_iteration": 2.7920916080474854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01255362, + "balance_loss_mlp": 1.1928488, + "epoch": 0.2121969988457099, + "flos": 545129409024.0, + "grad_norm": 0.06310879580708054, + "language_loss": 0.90527534, + "learning_rate": 0.0009154533321926926, + "loss": 0.91782892, + "num_input_tokens_seen": 91372848, + "router_z_loss_mlp": 0.62548828, + "step": 1103, + "time_per_iteration": 2.646440029144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234037, + "balance_loss_mlp": 1.17214394, + "epoch": 0.21238938053097345, + "flos": 843861514752.0, + "grad_norm": 0.07831819024350671, + "language_loss": 0.88472342, + "learning_rate": 0.0009152799057331156, + "loss": 0.89706385, + "num_input_tokens_seen": 91452768, + "router_z_loss_mlp": 0.61865234, + "step": 1104, + "time_per_iteration": 3.122450590133667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214804, + "balance_loss_mlp": 1.15462673, + "epoch": 0.212581762216237, + "flos": 446214998016.0, + "grad_norm": 0.06719929320387279, + "language_loss": 0.91964042, + "learning_rate": 0.0009151063180475805, + "loss": 0.9317885, + "num_input_tokens_seen": 91519888, + "router_z_loss_mlp": 0.6015625, + "step": 1105, + "time_per_iteration": 2.5321173667907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181276, + "balance_loss_mlp": 1.12772751, + "epoch": 0.21277414390150057, + "flos": 514380036096.0, + "grad_norm": 0.07726558156265032, + "language_loss": 0.8518455, + "learning_rate": 0.0009149325692034803, + "loss": 0.86365819, + "num_input_tokens_seen": 91585744, + "router_z_loss_mlp": 0.53613281, + "step": 1106, + "time_per_iteration": 2.6019790172576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129115, + "balance_loss_mlp": 1.10660839, + "epoch": 0.21296652558676413, + "flos": 1485532846080.0, + "grad_norm": 0.0458739418309424, + "language_loss": 0.79203427, + "learning_rate": 0.0009147586592682702, + "loss": 0.80332541, + "num_input_tokens_seen": 91805840, + "router_z_loss_mlp": 0.22460938, + "step": 1107, + "time_per_iteration": 4.859830856323242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180766, + "balance_loss_mlp": 1.12478542, + "epoch": 0.21315890727202771, + "flos": 846040909824.0, + "grad_norm": 0.08338906086238376, + "language_loss": 0.88186961, + "learning_rate": 0.0009145845883094678, + "loss": 0.89367729, + "num_input_tokens_seen": 91885936, + "router_z_loss_mlp": 0.56005859, + "step": 1108, + "time_per_iteration": 3.04249906539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153455, + "balance_loss_mlp": 1.10114598, + "epoch": 0.21335128895729127, + "flos": 629379445248.0, + "grad_norm": 0.07708602471843919, + "language_loss": 0.85793281, + "learning_rate": 0.000914410356394654, + "loss": 0.86946738, + "num_input_tokens_seen": 91959888, + "router_z_loss_mlp": 0.5234375, + "step": 1109, + "time_per_iteration": 4.412867307662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163449, + "balance_loss_mlp": 1.10751617, + "epoch": 0.21354367064255483, + "flos": 710975812608.0, + "grad_norm": 0.08187458054057056, + "language_loss": 0.85334879, + "learning_rate": 0.0009142359635914709, + "loss": 0.86498332, + "num_input_tokens_seen": 92043728, + "router_z_loss_mlp": 0.55957031, + "step": 1110, + "time_per_iteration": 3.023928642272949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148781, + "balance_loss_mlp": 1.09570932, + "epoch": 0.2137360523278184, + "flos": 456201953280.0, + "grad_norm": 0.0669404625356857, + "language_loss": 0.85089076, + "learning_rate": 0.0009140614099676245, + "loss": 0.86237848, + "num_input_tokens_seen": 92114096, + "router_z_loss_mlp": 0.53076172, + "step": 1111, + "time_per_iteration": 2.625797748565674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148537, + "balance_loss_mlp": 1.09632301, + "epoch": 0.21392843401308195, + "flos": 666051393024.0, + "grad_norm": 0.06784083874149466, + "language_loss": 0.83744586, + "learning_rate": 0.0009138866955908821, + "loss": 0.84893119, + "num_input_tokens_seen": 92193552, + "router_z_loss_mlp": 0.52246094, + "step": 1112, + "time_per_iteration": 2.9033186435699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152374, + "balance_loss_mlp": 1.10042286, + "epoch": 0.2141208156983455, + "flos": 748996803072.0, + "grad_norm": 0.0756009236441896, + "language_loss": 0.81778276, + "learning_rate": 0.0009137118205290738, + "loss": 0.82930648, + "num_input_tokens_seen": 92279248, + "router_z_loss_mlp": 0.51977539, + "step": 1113, + "time_per_iteration": 3.00955867767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163421, + "balance_loss_mlp": 1.10677314, + "epoch": 0.21431319738360907, + "flos": 419119971840.0, + "grad_norm": 0.07649003777848401, + "language_loss": 0.90946341, + "learning_rate": 0.0009135367848500924, + "loss": 0.92109764, + "num_input_tokens_seen": 92344064, + "router_z_loss_mlp": 0.56591797, + "step": 1114, + "time_per_iteration": 2.50858211517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167845, + "balance_loss_mlp": 1.11472559, + "epoch": 0.21450557906887263, + "flos": 609126179328.0, + "grad_norm": 0.0823134598214501, + "language_loss": 0.87556803, + "learning_rate": 0.0009133615886218927, + "loss": 0.88724649, + "num_input_tokens_seen": 92410544, + "router_z_loss_mlp": 0.53125, + "step": 1115, + "time_per_iteration": 2.717454195022583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178358, + "balance_loss_mlp": 1.11651218, + "epoch": 0.21469796075413622, + "flos": 561913708032.0, + "grad_norm": 0.06887665628973552, + "language_loss": 0.89567351, + "learning_rate": 0.0009131862319124917, + "loss": 0.90745711, + "num_input_tokens_seen": 92480272, + "router_z_loss_mlp": 0.61816406, + "step": 1116, + "time_per_iteration": 2.623767852783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176568, + "balance_loss_mlp": 1.1235671, + "epoch": 0.21489034243939978, + "flos": 594637272576.0, + "grad_norm": 0.08365937432877864, + "language_loss": 0.85244483, + "learning_rate": 0.0009130107147899691, + "loss": 0.86421049, + "num_input_tokens_seen": 92555584, + "router_z_loss_mlp": 0.53051758, + "step": 1117, + "time_per_iteration": 2.795011281967163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178642, + "balance_loss_mlp": 1.12561774, + "epoch": 0.21508272412466334, + "flos": 441898426368.0, + "grad_norm": 0.06665693704910039, + "language_loss": 0.8600654, + "learning_rate": 0.0009128350373224665, + "loss": 0.8718518, + "num_input_tokens_seen": 92623136, + "router_z_loss_mlp": 0.53076172, + "step": 1118, + "time_per_iteration": 2.5644795894622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011837, + "balance_loss_mlp": 1.15928602, + "epoch": 0.2152751058099269, + "flos": 1496162202624.0, + "grad_norm": 0.058896568697900505, + "language_loss": 0.81456429, + "learning_rate": 0.0009126591995781883, + "loss": 0.82640129, + "num_input_tokens_seen": 92842608, + "router_z_loss_mlp": 0.24414062, + "step": 1119, + "time_per_iteration": 4.683669090270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204932, + "balance_loss_mlp": 1.15031052, + "epoch": 0.21546748749519046, + "flos": 494005630464.0, + "grad_norm": 0.07135490421069918, + "language_loss": 0.85804355, + "learning_rate": 0.0009124832016254005, + "loss": 0.87009287, + "num_input_tokens_seen": 92912960, + "router_z_loss_mlp": 0.54663086, + "step": 1120, + "time_per_iteration": 2.6158647537231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206508, + "balance_loss_mlp": 1.14571166, + "epoch": 0.21565986918045402, + "flos": 634531138560.0, + "grad_norm": 0.055578106746994274, + "language_loss": 0.89113355, + "learning_rate": 0.0009123070435324316, + "loss": 0.9031986, + "num_input_tokens_seen": 92982272, + "router_z_loss_mlp": 0.60791016, + "step": 1121, + "time_per_iteration": 2.755823850631714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102102, + "balance_loss_mlp": 1.07988179, + "epoch": 0.21585225086571758, + "flos": 1583359570944.0, + "grad_norm": 0.03051163671975961, + "language_loss": 0.77875781, + "learning_rate": 0.0009121307253676722, + "loss": 0.78977883, + "num_input_tokens_seen": 93218752, + "router_z_loss_mlp": 0.22265625, + "step": 1122, + "time_per_iteration": 4.996071100234985 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211089, + "balance_loss_mlp": 1.15358257, + "epoch": 0.21604463255098114, + "flos": 684103242240.0, + "grad_norm": 0.06035521524280068, + "language_loss": 0.87145722, + "learning_rate": 0.0009119542471995752, + "loss": 0.88356811, + "num_input_tokens_seen": 93293968, + "router_z_loss_mlp": 0.57446289, + "step": 1123, + "time_per_iteration": 2.8323612213134766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204972, + "balance_loss_mlp": 1.14675009, + "epoch": 0.2162370142362447, + "flos": 780989133312.0, + "grad_norm": 0.060035653180353525, + "language_loss": 0.8248235, + "learning_rate": 0.0009117776090966554, + "loss": 0.83687323, + "num_input_tokens_seen": 93367088, + "router_z_loss_mlp": 0.58251953, + "step": 1124, + "time_per_iteration": 2.954216480255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216387, + "balance_loss_mlp": 1.1558764, + "epoch": 0.21642939592150828, + "flos": 1002147406848.0, + "grad_norm": 0.07791040933307145, + "language_loss": 0.876288, + "learning_rate": 0.0009116008111274899, + "loss": 0.88845193, + "num_input_tokens_seen": 93452944, + "router_z_loss_mlp": 0.60498047, + "step": 1125, + "time_per_iteration": 3.2826616764068604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102381, + "balance_loss_mlp": 1.08216333, + "epoch": 0.21662177760677184, + "flos": 1482644238336.0, + "grad_norm": 0.030294405796961115, + "language_loss": 0.79106927, + "learning_rate": 0.0009114238533607176, + "loss": 0.80209303, + "num_input_tokens_seen": 93677328, + "router_z_loss_mlp": 0.20214844, + "step": 1126, + "time_per_iteration": 4.8284173011779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202163, + "balance_loss_mlp": 1.1455152, + "epoch": 0.2168141592920354, + "flos": 887395046400.0, + "grad_norm": 0.10762952047928877, + "language_loss": 0.8553561, + "learning_rate": 0.0009112467358650396, + "loss": 0.86737764, + "num_input_tokens_seen": 93756848, + "router_z_loss_mlp": 0.56640625, + "step": 1127, + "time_per_iteration": 3.1621291637420654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192643, + "balance_loss_mlp": 1.13561273, + "epoch": 0.21700654097729896, + "flos": 545961959424.0, + "grad_norm": 0.06435190440672867, + "language_loss": 0.87181705, + "learning_rate": 0.0009110694587092192, + "loss": 0.88374346, + "num_input_tokens_seen": 93834704, + "router_z_loss_mlp": 0.56982422, + "step": 1128, + "time_per_iteration": 2.7597765922546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194699, + "balance_loss_mlp": 1.13452196, + "epoch": 0.21719892266256252, + "flos": 509522379264.0, + "grad_norm": 0.06894978951163175, + "language_loss": 0.8223331, + "learning_rate": 0.0009108920219620815, + "loss": 0.83428001, + "num_input_tokens_seen": 93904448, + "router_z_loss_mlp": 0.6015625, + "step": 1129, + "time_per_iteration": 2.6658482551574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198898, + "balance_loss_mlp": 1.14072335, + "epoch": 0.21739130434782608, + "flos": 543412177920.0, + "grad_norm": 0.06550313542995663, + "language_loss": 0.90210444, + "learning_rate": 0.0009107144256925133, + "loss": 0.91409343, + "num_input_tokens_seen": 93979312, + "router_z_loss_mlp": 0.58154297, + "step": 1130, + "time_per_iteration": 2.7298777103424072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211101, + "balance_loss_mlp": 1.15464389, + "epoch": 0.21758368603308964, + "flos": 616847477760.0, + "grad_norm": 0.08430456831611369, + "language_loss": 0.82975614, + "learning_rate": 0.0009105366699694638, + "loss": 0.84186715, + "num_input_tokens_seen": 94052032, + "router_z_loss_mlp": 0.56445312, + "step": 1131, + "time_per_iteration": 2.7422807216644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121305, + "balance_loss_mlp": 1.15263498, + "epoch": 0.2177760677183532, + "flos": 635116640256.0, + "grad_norm": 0.05499133039406014, + "language_loss": 0.82219702, + "learning_rate": 0.0009103587548619439, + "loss": 0.83432752, + "num_input_tokens_seen": 94124944, + "router_z_loss_mlp": 0.60400391, + "step": 1132, + "time_per_iteration": 2.8834011554718018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202147, + "balance_loss_mlp": 1.14468873, + "epoch": 0.2179684494036168, + "flos": 532463818752.0, + "grad_norm": 0.12855794167944481, + "language_loss": 0.87174821, + "learning_rate": 0.0009101806804390261, + "loss": 0.88376963, + "num_input_tokens_seen": 94200384, + "router_z_loss_mlp": 0.57421875, + "step": 1133, + "time_per_iteration": 2.8493435382843018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186046, + "balance_loss_mlp": 1.13082814, + "epoch": 0.21816083108888035, + "flos": 475219975680.0, + "grad_norm": 0.07046865468216726, + "language_loss": 0.91345453, + "learning_rate": 0.0009100024467698453, + "loss": 0.92531502, + "num_input_tokens_seen": 94266992, + "router_z_loss_mlp": 0.55175781, + "step": 1134, + "time_per_iteration": 2.6036603450775146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184337, + "balance_loss_mlp": 1.12613893, + "epoch": 0.2183532127741439, + "flos": 577467532800.0, + "grad_norm": 0.07929007457036284, + "language_loss": 0.8353889, + "learning_rate": 0.0009098240539235981, + "loss": 0.84723222, + "num_input_tokens_seen": 94334304, + "router_z_loss_mlp": 0.58227539, + "step": 1135, + "time_per_iteration": 2.6736483573913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176396, + "balance_loss_mlp": 1.12122619, + "epoch": 0.21854559445940747, + "flos": 594120780288.0, + "grad_norm": 0.06661367385494366, + "language_loss": 0.88575935, + "learning_rate": 0.0009096455019695423, + "loss": 0.89752328, + "num_input_tokens_seen": 94413296, + "router_z_loss_mlp": 0.55224609, + "step": 1136, + "time_per_iteration": 2.8438823223114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172318, + "balance_loss_mlp": 1.1156702, + "epoch": 0.21873797614467103, + "flos": 408680764416.0, + "grad_norm": 0.07075177433605506, + "language_loss": 0.90707165, + "learning_rate": 0.000909466790976998, + "loss": 0.91879487, + "num_input_tokens_seen": 94475840, + "router_z_loss_mlp": 0.56616211, + "step": 1137, + "time_per_iteration": 2.4795870780944824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185042, + "balance_loss_mlp": 1.12801182, + "epoch": 0.21893035782993459, + "flos": 894189818880.0, + "grad_norm": 0.07051320604800417, + "language_loss": 0.83409071, + "learning_rate": 0.0009092879210153473, + "loss": 0.84594113, + "num_input_tokens_seen": 94555184, + "router_z_loss_mlp": 0.57080078, + "step": 1138, + "time_per_iteration": 3.1328911781311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186779, + "balance_loss_mlp": 1.13284826, + "epoch": 0.21912273951519814, + "flos": 467627157504.0, + "grad_norm": 0.06458215213012623, + "language_loss": 0.89566886, + "learning_rate": 0.0009091088921540333, + "loss": 0.90753663, + "num_input_tokens_seen": 94622656, + "router_z_loss_mlp": 0.54003906, + "step": 1139, + "time_per_iteration": 2.5608675479888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046887, + "balance_loss_mlp": 1.03115106, + "epoch": 0.2193151212004617, + "flos": 1532043445248.0, + "grad_norm": 0.027642480599540168, + "language_loss": 0.75508678, + "learning_rate": 0.0009089297044625615, + "loss": 0.76555562, + "num_input_tokens_seen": 94856496, + "router_z_loss_mlp": 0.15722656, + "step": 1140, + "time_per_iteration": 4.908522605895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117392, + "balance_loss_mlp": 1.11908412, + "epoch": 0.2195075028857253, + "flos": 591175646208.0, + "grad_norm": 0.0906322081519832, + "language_loss": 0.84775734, + "learning_rate": 0.0009087503580104985, + "loss": 0.85949653, + "num_input_tokens_seen": 94926880, + "router_z_loss_mlp": 0.54882812, + "step": 1141, + "time_per_iteration": 2.696129083633423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181851, + "balance_loss_mlp": 1.12558413, + "epoch": 0.21969988457098885, + "flos": 636329862144.0, + "grad_norm": 0.16226849767110665, + "language_loss": 0.80068243, + "learning_rate": 0.0009085708528674728, + "loss": 0.81250095, + "num_input_tokens_seen": 95000528, + "router_z_loss_mlp": 0.56347656, + "step": 1142, + "time_per_iteration": 2.7995505332946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157793, + "balance_loss_mlp": 1.09985733, + "epoch": 0.2198922662562524, + "flos": 912350324736.0, + "grad_norm": 0.08217329602320493, + "language_loss": 0.874843, + "learning_rate": 0.0009083911891031745, + "loss": 0.88642091, + "num_input_tokens_seen": 95081040, + "router_z_loss_mlp": 0.57958984, + "step": 1143, + "time_per_iteration": 3.1351919174194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115076, + "balance_loss_mlp": 1.09578109, + "epoch": 0.22008464794151597, + "flos": 822980528640.0, + "grad_norm": 0.06169995263224583, + "language_loss": 0.92273706, + "learning_rate": 0.0009082113667873553, + "loss": 0.93424463, + "num_input_tokens_seen": 95167328, + "router_z_loss_mlp": 0.55029297, + "step": 1144, + "time_per_iteration": 3.1171934604644775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153616, + "balance_loss_mlp": 1.10087752, + "epoch": 0.22027702962677953, + "flos": 459656239104.0, + "grad_norm": 0.07183124767141379, + "language_loss": 0.91221762, + "learning_rate": 0.0009080313859898283, + "loss": 0.9237538, + "num_input_tokens_seen": 95230304, + "router_z_loss_mlp": 0.52758789, + "step": 1145, + "time_per_iteration": 2.506591796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153774, + "balance_loss_mlp": 1.09986758, + "epoch": 0.2204694113120431, + "flos": 531255739392.0, + "grad_norm": 0.07077080612529597, + "language_loss": 0.92340779, + "learning_rate": 0.0009078512467804684, + "loss": 0.93494552, + "num_input_tokens_seen": 95299520, + "router_z_loss_mlp": 0.53881836, + "step": 1146, + "time_per_iteration": 2.591327667236328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172392, + "balance_loss_mlp": 1.11800838, + "epoch": 0.22066179299730665, + "flos": 522642419712.0, + "grad_norm": 0.07651793216141736, + "language_loss": 0.91144007, + "learning_rate": 0.0009076709492292119, + "loss": 0.92316401, + "num_input_tokens_seen": 95368912, + "router_z_loss_mlp": 0.54418945, + "step": 1147, + "time_per_iteration": 2.609628438949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169189, + "balance_loss_mlp": 1.11723804, + "epoch": 0.2208541746825702, + "flos": 546451287552.0, + "grad_norm": 0.07920780045429675, + "language_loss": 0.89603102, + "learning_rate": 0.0009074904934060562, + "loss": 0.90772295, + "num_input_tokens_seen": 95440800, + "router_z_loss_mlp": 0.51928711, + "step": 1148, + "time_per_iteration": 2.6755712032318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173959, + "balance_loss_mlp": 1.11697721, + "epoch": 0.22104655636783377, + "flos": 708734748672.0, + "grad_norm": 0.08245317941840166, + "language_loss": 0.8559376, + "learning_rate": 0.0009073098793810607, + "loss": 0.86767721, + "num_input_tokens_seen": 95519904, + "router_z_loss_mlp": 0.57006836, + "step": 1149, + "time_per_iteration": 2.9874348640441895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177675, + "balance_loss_mlp": 1.12293434, + "epoch": 0.22123893805309736, + "flos": 584867630592.0, + "grad_norm": 0.08525751827962168, + "language_loss": 0.88982397, + "learning_rate": 0.000907129107224346, + "loss": 0.90160072, + "num_input_tokens_seen": 95591568, + "router_z_loss_mlp": 0.54785156, + "step": 1150, + "time_per_iteration": 2.739461660385132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180589, + "balance_loss_mlp": 1.12658715, + "epoch": 0.22143131973836092, + "flos": 492251323392.0, + "grad_norm": 0.05205595876874212, + "language_loss": 0.88991034, + "learning_rate": 0.0009069481770060939, + "loss": 0.90171623, + "num_input_tokens_seen": 95664480, + "router_z_loss_mlp": 0.54077148, + "step": 1151, + "time_per_iteration": 2.7024669647216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187248, + "balance_loss_mlp": 1.13212562, + "epoch": 0.22162370142362448, + "flos": 1079674251264.0, + "grad_norm": 0.06739531662392768, + "language_loss": 0.84448045, + "learning_rate": 0.000906767088796548, + "loss": 0.85635293, + "num_input_tokens_seen": 95754400, + "router_z_loss_mlp": 0.55126953, + "step": 1152, + "time_per_iteration": 3.4467508792877197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117836, + "balance_loss_mlp": 1.12571764, + "epoch": 0.22181608310888803, + "flos": 492508283904.0, + "grad_norm": 0.05411857974090042, + "language_loss": 0.8779093, + "learning_rate": 0.0009065858426660127, + "loss": 0.8896929, + "num_input_tokens_seen": 95826944, + "router_z_loss_mlp": 0.52661133, + "step": 1153, + "time_per_iteration": 2.6216752529144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182109, + "balance_loss_mlp": 1.12736845, + "epoch": 0.2220084647941516, + "flos": 724014360576.0, + "grad_norm": 0.07769931213358174, + "language_loss": 0.84979808, + "learning_rate": 0.0009064044386848543, + "loss": 0.86161917, + "num_input_tokens_seen": 95902688, + "router_z_loss_mlp": 0.54833984, + "step": 1154, + "time_per_iteration": 2.91601824760437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172512, + "balance_loss_mlp": 1.11381316, + "epoch": 0.22220084647941515, + "flos": 489239377920.0, + "grad_norm": 0.0711084155390928, + "language_loss": 0.89741302, + "learning_rate": 0.0009062228769234997, + "loss": 0.90913814, + "num_input_tokens_seen": 95969952, + "router_z_loss_mlp": 0.58691406, + "step": 1155, + "time_per_iteration": 2.5972864627838135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116208, + "balance_loss_mlp": 1.10690951, + "epoch": 0.2223932281646787, + "flos": 536278952448.0, + "grad_norm": 0.09100503083112628, + "language_loss": 0.81526613, + "learning_rate": 0.0009060411574524376, + "loss": 0.82688695, + "num_input_tokens_seen": 96037344, + "router_z_loss_mlp": 0.55224609, + "step": 1156, + "time_per_iteration": 2.6763274669647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182591, + "balance_loss_mlp": 1.12684917, + "epoch": 0.22258560984994227, + "flos": 931420104192.0, + "grad_norm": 0.06563385289017937, + "language_loss": 0.88585329, + "learning_rate": 0.0009058592803422178, + "loss": 0.89767921, + "num_input_tokens_seen": 96115616, + "router_z_loss_mlp": 0.55810547, + "step": 1157, + "time_per_iteration": 3.1414153575897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026819, + "balance_loss_mlp": 1.00955701, + "epoch": 0.22277799153520586, + "flos": 1199675930112.0, + "grad_norm": 0.012760142008093896, + "language_loss": 0.78710288, + "learning_rate": 0.0009056772456634512, + "loss": 0.79737109, + "num_input_tokens_seen": 96333600, + "router_z_loss_mlp": 0.17285156, + "step": 1158, + "time_per_iteration": 4.802858352661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171905, + "balance_loss_mlp": 1.12126482, + "epoch": 0.22297037322046942, + "flos": 501304412160.0, + "grad_norm": 0.060083734909452326, + "language_loss": 0.90886426, + "learning_rate": 0.00090549505348681, + "loss": 0.92058331, + "num_input_tokens_seen": 96402544, + "router_z_loss_mlp": 0.50683594, + "step": 1159, + "time_per_iteration": 2.5810928344726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168069, + "balance_loss_mlp": 1.11137354, + "epoch": 0.22316275490573298, + "flos": 752752465920.0, + "grad_norm": 0.07069918091424116, + "language_loss": 0.85149121, + "learning_rate": 0.0009053127038830275, + "loss": 0.86317194, + "num_input_tokens_seen": 96487600, + "router_z_loss_mlp": 0.56689453, + "step": 1160, + "time_per_iteration": 3.009434223175049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162107, + "balance_loss_mlp": 1.1050297, + "epoch": 0.22335513659099654, + "flos": 514802552832.0, + "grad_norm": 0.07200535138488619, + "language_loss": 0.87409687, + "learning_rate": 0.000905130196922898, + "loss": 0.88571799, + "num_input_tokens_seen": 96554912, + "router_z_loss_mlp": 0.57080078, + "step": 1161, + "time_per_iteration": 2.5972068309783936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157457, + "balance_loss_mlp": 1.10223973, + "epoch": 0.2235475182762601, + "flos": 484530024960.0, + "grad_norm": 0.053497533436564174, + "language_loss": 0.8808614, + "learning_rate": 0.0009049475326772769, + "loss": 0.89243597, + "num_input_tokens_seen": 96624192, + "router_z_loss_mlp": 0.55224609, + "step": 1162, + "time_per_iteration": 2.580254316329956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167432, + "balance_loss_mlp": 1.11092722, + "epoch": 0.22373989996152366, + "flos": 469971735552.0, + "grad_norm": 0.105825736895628, + "language_loss": 0.83639884, + "learning_rate": 0.0009047647112170811, + "loss": 0.84807312, + "num_input_tokens_seen": 96701040, + "router_z_loss_mlp": 0.56469727, + "step": 1163, + "time_per_iteration": 2.7509572505950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170262, + "balance_loss_mlp": 1.11041939, + "epoch": 0.22393228164678722, + "flos": 1271012249088.0, + "grad_norm": 0.11729347611284674, + "language_loss": 0.8833853, + "learning_rate": 0.0009045817326132876, + "loss": 0.89508796, + "num_input_tokens_seen": 96791200, + "router_z_loss_mlp": 0.59814453, + "step": 1164, + "time_per_iteration": 3.6648380756378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170775, + "balance_loss_mlp": 1.11226714, + "epoch": 0.22412466333205078, + "flos": 596334680064.0, + "grad_norm": 0.05704665841604838, + "language_loss": 0.83974147, + "learning_rate": 0.0009043985969369357, + "loss": 0.85144925, + "num_input_tokens_seen": 96869360, + "router_z_loss_mlp": 0.58544922, + "step": 1165, + "time_per_iteration": 2.868560314178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176977, + "balance_loss_mlp": 1.11665666, + "epoch": 0.22431704501731436, + "flos": 608434219008.0, + "grad_norm": 0.059940537627208516, + "language_loss": 0.84960037, + "learning_rate": 0.0009042153042591245, + "loss": 0.86137015, + "num_input_tokens_seen": 96945840, + "router_z_loss_mlp": 0.60302734, + "step": 1166, + "time_per_iteration": 2.8023743629455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116839, + "balance_loss_mlp": 1.11271954, + "epoch": 0.22450942670257792, + "flos": 906583394304.0, + "grad_norm": 0.054742371261080745, + "language_loss": 0.85761929, + "learning_rate": 0.0009040318546510146, + "loss": 0.86930317, + "num_input_tokens_seen": 97029296, + "router_z_loss_mlp": 0.55639648, + "step": 1167, + "time_per_iteration": 3.141993999481201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117745, + "balance_loss_mlp": 1.1215651, + "epoch": 0.22470180838784148, + "flos": 565301182464.0, + "grad_norm": 0.07712318573741421, + "language_loss": 0.8582288, + "learning_rate": 0.0009038482481838275, + "loss": 0.87000328, + "num_input_tokens_seen": 97097776, + "router_z_loss_mlp": 0.55957031, + "step": 1168, + "time_per_iteration": 2.675204038619995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116517, + "balance_loss_mlp": 1.1128844, + "epoch": 0.22489419007310504, + "flos": 834469972992.0, + "grad_norm": 0.05640688657343365, + "language_loss": 0.88303328, + "learning_rate": 0.0009036644849288455, + "loss": 0.89468497, + "num_input_tokens_seen": 97181424, + "router_z_loss_mlp": 0.52319336, + "step": 1169, + "time_per_iteration": 3.0777511596679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148681, + "balance_loss_mlp": 1.09441662, + "epoch": 0.2250865717583686, + "flos": 581057639424.0, + "grad_norm": 0.07174166621143864, + "language_loss": 0.86291218, + "learning_rate": 0.0009034805649574118, + "loss": 0.87439895, + "num_input_tokens_seen": 97252128, + "router_z_loss_mlp": 0.54394531, + "step": 1170, + "time_per_iteration": 2.7120091915130615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157496, + "balance_loss_mlp": 1.10513926, + "epoch": 0.22527895344363216, + "flos": 600406401024.0, + "grad_norm": 0.05497638968028837, + "language_loss": 0.85883957, + "learning_rate": 0.0009032964883409308, + "loss": 0.87041461, + "num_input_tokens_seen": 97326640, + "router_z_loss_mlp": 0.52441406, + "step": 1171, + "time_per_iteration": 2.8770556449890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104728, + "balance_loss_mlp": 1.03001809, + "epoch": 0.22547133512889572, + "flos": 1440751587840.0, + "grad_norm": 0.027786176955518046, + "language_loss": 0.73050535, + "learning_rate": 0.000903112255150867, + "loss": 0.74097812, + "num_input_tokens_seen": 97553952, + "router_z_loss_mlp": 0.17285156, + "step": 1172, + "time_per_iteration": 4.997943639755249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150837, + "balance_loss_mlp": 1.0977174, + "epoch": 0.22566371681415928, + "flos": 490618156032.0, + "grad_norm": 0.06380875138992877, + "language_loss": 0.87640917, + "learning_rate": 0.0009029278654587462, + "loss": 0.88791752, + "num_input_tokens_seen": 97623584, + "router_z_loss_mlp": 0.53173828, + "step": 1173, + "time_per_iteration": 2.6070940494537354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148484, + "balance_loss_mlp": 1.09546018, + "epoch": 0.22585609849942284, + "flos": 604616887296.0, + "grad_norm": 0.057211485944593306, + "language_loss": 0.83189976, + "learning_rate": 0.0009027433193361548, + "loss": 0.84338462, + "num_input_tokens_seen": 97695952, + "router_z_loss_mlp": 0.53027344, + "step": 1174, + "time_per_iteration": 2.7072205543518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114932, + "balance_loss_mlp": 1.09708285, + "epoch": 0.22604848018468643, + "flos": 635568892416.0, + "grad_norm": 0.06182212989299174, + "language_loss": 0.86948568, + "learning_rate": 0.00090255861685474, + "loss": 0.88097882, + "num_input_tokens_seen": 97764544, + "router_z_loss_mlp": 0.52246094, + "step": 1175, + "time_per_iteration": 2.7387607097625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146248, + "balance_loss_mlp": 1.09284246, + "epoch": 0.22624086186995, + "flos": 479875000320.0, + "grad_norm": 0.06871471519475823, + "language_loss": 0.92170686, + "learning_rate": 0.0009023737580862095, + "loss": 0.93316931, + "num_input_tokens_seen": 97830976, + "router_z_loss_mlp": 0.53442383, + "step": 1176, + "time_per_iteration": 2.6016342639923096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160546, + "balance_loss_mlp": 1.11035883, + "epoch": 0.22643324355521355, + "flos": 495814265856.0, + "grad_norm": 0.0563237464245993, + "language_loss": 0.83948356, + "learning_rate": 0.0009021887431023321, + "loss": 0.851089, + "num_input_tokens_seen": 97898800, + "router_z_loss_mlp": 0.50219727, + "step": 1177, + "time_per_iteration": 2.5911412239074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161678, + "balance_loss_mlp": 1.11063254, + "epoch": 0.2266256252404771, + "flos": 561552860160.0, + "grad_norm": 0.06510699727290163, + "language_loss": 0.88054293, + "learning_rate": 0.0009020035719749369, + "loss": 0.8921597, + "num_input_tokens_seen": 97974112, + "router_z_loss_mlp": 0.51098633, + "step": 1178, + "time_per_iteration": 2.747715473175049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182179, + "balance_loss_mlp": 1.1255312, + "epoch": 0.22681800692574067, + "flos": 579688399872.0, + "grad_norm": 0.0760827261000747, + "language_loss": 0.78592283, + "learning_rate": 0.0009018182447759136, + "loss": 0.79774463, + "num_input_tokens_seen": 98056640, + "router_z_loss_mlp": 0.56616211, + "step": 1179, + "time_per_iteration": 2.9912376403808594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177946, + "balance_loss_mlp": 1.12287188, + "epoch": 0.22701038861100423, + "flos": 740166170112.0, + "grad_norm": 0.05857060866656224, + "language_loss": 0.80403864, + "learning_rate": 0.0009016327615772126, + "loss": 0.81581813, + "num_input_tokens_seen": 98135952, + "router_z_loss_mlp": 0.55126953, + "step": 1180, + "time_per_iteration": 2.951934337615967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178867, + "balance_loss_mlp": 1.1241498, + "epoch": 0.2272027702962678, + "flos": 577257560064.0, + "grad_norm": 0.07803208794693026, + "language_loss": 0.88654709, + "learning_rate": 0.0009014471224508451, + "loss": 0.8983357, + "num_input_tokens_seen": 98204288, + "router_z_loss_mlp": 0.54711914, + "step": 1181, + "time_per_iteration": 2.6834704875946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175396, + "balance_loss_mlp": 1.12280107, + "epoch": 0.22739515198153135, + "flos": 544267123200.0, + "grad_norm": 0.07891792311297686, + "language_loss": 0.84171915, + "learning_rate": 0.0009012613274688823, + "loss": 0.85347319, + "num_input_tokens_seen": 98269856, + "router_z_loss_mlp": 0.52636719, + "step": 1182, + "time_per_iteration": 2.6773135662078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193932, + "balance_loss_mlp": 1.13711679, + "epoch": 0.22758753366679493, + "flos": 440163942912.0, + "grad_norm": 0.06685387295915801, + "language_loss": 0.88334668, + "learning_rate": 0.0009010753767034565, + "loss": 0.89528602, + "num_input_tokens_seen": 98335632, + "router_z_loss_mlp": 0.56811523, + "step": 1183, + "time_per_iteration": 2.53671932220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192681, + "balance_loss_mlp": 1.13732028, + "epoch": 0.2277799153520585, + "flos": 729447607296.0, + "grad_norm": 0.05676884979808662, + "language_loss": 0.79381895, + "learning_rate": 0.0009008892702267599, + "loss": 0.80574578, + "num_input_tokens_seen": 98420592, + "router_z_loss_mlp": 0.55297852, + "step": 1184, + "time_per_iteration": 2.9609317779541016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218637, + "balance_loss_mlp": 1.16055822, + "epoch": 0.22797229703732205, + "flos": 526894751232.0, + "grad_norm": 0.11080255811352213, + "language_loss": 0.897048, + "learning_rate": 0.0009007030081110457, + "loss": 0.9092344, + "num_input_tokens_seen": 98488096, + "router_z_loss_mlp": 0.58105469, + "step": 1185, + "time_per_iteration": 2.6087658405303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212656, + "balance_loss_mlp": 1.15872598, + "epoch": 0.2281646787225856, + "flos": 535431347712.0, + "grad_norm": 0.06215110995007368, + "language_loss": 0.8510564, + "learning_rate": 0.000900516590428627, + "loss": 0.8631829, + "num_input_tokens_seen": 98561664, + "router_z_loss_mlp": 0.53955078, + "step": 1186, + "time_per_iteration": 2.66469407081604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206955, + "balance_loss_mlp": 1.15416956, + "epoch": 0.22835706040784917, + "flos": 541381086720.0, + "grad_norm": 0.07510292852734143, + "language_loss": 0.90231287, + "learning_rate": 0.0009003300172518778, + "loss": 0.91438246, + "num_input_tokens_seen": 98634336, + "router_z_loss_mlp": 0.52807617, + "step": 1187, + "time_per_iteration": 2.6872987747192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189379, + "balance_loss_mlp": 1.13559163, + "epoch": 0.22854944209311273, + "flos": 790637635584.0, + "grad_norm": 0.06187047573177096, + "language_loss": 0.84854043, + "learning_rate": 0.0009001432886532321, + "loss": 0.86043417, + "num_input_tokens_seen": 98709600, + "router_z_loss_mlp": 0.53808594, + "step": 1188, + "time_per_iteration": 2.961327314376831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185796, + "balance_loss_mlp": 1.13248527, + "epoch": 0.2287418237783763, + "flos": 469280148480.0, + "grad_norm": 0.0670290505569486, + "language_loss": 0.87277937, + "learning_rate": 0.0008999564047051843, + "loss": 0.88463724, + "num_input_tokens_seen": 98775024, + "router_z_loss_mlp": 0.53320312, + "step": 1189, + "time_per_iteration": 2.5120058059692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119823, + "balance_loss_mlp": 1.14773321, + "epoch": 0.22893420546363985, + "flos": 468029850624.0, + "grad_norm": 0.07775817493182749, + "language_loss": 0.85562766, + "learning_rate": 0.0008997693654802894, + "loss": 0.86760998, + "num_input_tokens_seen": 98845248, + "router_z_loss_mlp": 0.50537109, + "step": 1190, + "time_per_iteration": 2.6584115028381348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203195, + "balance_loss_mlp": 1.15343666, + "epoch": 0.22912658714890344, + "flos": 626258843136.0, + "grad_norm": 0.08092173087187808, + "language_loss": 0.87245274, + "learning_rate": 0.0008995821710511625, + "loss": 0.88448465, + "num_input_tokens_seen": 98913584, + "router_z_loss_mlp": 0.49780273, + "step": 1191, + "time_per_iteration": 2.75514817237854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189711, + "balance_loss_mlp": 1.14376771, + "epoch": 0.229318968834167, + "flos": 503031555072.0, + "grad_norm": 0.058050392882622655, + "language_loss": 0.85975361, + "learning_rate": 0.0008993948214904786, + "loss": 0.8716507, + "num_input_tokens_seen": 98978608, + "router_z_loss_mlp": 0.45922852, + "step": 1192, + "time_per_iteration": 2.5808064937591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132885, + "balance_loss_mlp": 1.11629128, + "epoch": 0.22951135051943056, + "flos": 1374827613696.0, + "grad_norm": 0.04438752541684951, + "language_loss": 0.78422213, + "learning_rate": 0.0008992073168709733, + "loss": 0.795551, + "num_input_tokens_seen": 99207424, + "router_z_loss_mlp": 0.16601562, + "step": 1193, + "time_per_iteration": 4.915351629257202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170271, + "balance_loss_mlp": 1.11338401, + "epoch": 0.22970373220469412, + "flos": 644345197056.0, + "grad_norm": 0.06516354982073377, + "language_loss": 0.79226351, + "learning_rate": 0.0008990196572654427, + "loss": 0.80396616, + "num_input_tokens_seen": 99290592, + "router_z_loss_mlp": 0.56933594, + "step": 1194, + "time_per_iteration": 2.914353609085083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159508, + "balance_loss_mlp": 1.10982203, + "epoch": 0.22989611388995768, + "flos": 500209758720.0, + "grad_norm": 0.053033431306196574, + "language_loss": 0.88186455, + "learning_rate": 0.0008988318427467426, + "loss": 0.89345956, + "num_input_tokens_seen": 99366096, + "router_z_loss_mlp": 0.49707031, + "step": 1195, + "time_per_iteration": 2.763303756713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146949, + "balance_loss_mlp": 1.09754825, + "epoch": 0.23008849557522124, + "flos": 1096522790400.0, + "grad_norm": 0.06471781599702997, + "language_loss": 0.87142104, + "learning_rate": 0.0008986438733877887, + "loss": 0.88289052, + "num_input_tokens_seen": 99456768, + "router_z_loss_mlp": 0.49414062, + "step": 1196, + "time_per_iteration": 3.453037738800049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138036, + "balance_loss_mlp": 1.08901691, + "epoch": 0.2302808772604848, + "flos": 683648418816.0, + "grad_norm": 0.05831436273017673, + "language_loss": 0.84795159, + "learning_rate": 0.0008984557492615576, + "loss": 0.85933197, + "num_input_tokens_seen": 99539616, + "router_z_loss_mlp": 0.49023438, + "step": 1197, + "time_per_iteration": 2.9209883213043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147831, + "balance_loss_mlp": 1.09816873, + "epoch": 0.23047325894574835, + "flos": 528923271168.0, + "grad_norm": 0.06183090029168821, + "language_loss": 0.90001792, + "learning_rate": 0.0008982674704410854, + "loss": 0.91149628, + "num_input_tokens_seen": 99612064, + "router_z_loss_mlp": 0.49658203, + "step": 1198, + "time_per_iteration": 2.723980665206909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115357, + "balance_loss_mlp": 1.10364521, + "epoch": 0.23066564063101191, + "flos": 682766309376.0, + "grad_norm": 0.06439147944581719, + "language_loss": 0.78128076, + "learning_rate": 0.0008980790369994682, + "loss": 0.7928164, + "num_input_tokens_seen": 99691040, + "router_z_loss_mlp": 0.49926758, + "step": 1199, + "time_per_iteration": 2.968733787536621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148741, + "balance_loss_mlp": 1.09817219, + "epoch": 0.2308580223162755, + "flos": 558523662336.0, + "grad_norm": 0.060755539801175186, + "language_loss": 0.8790828, + "learning_rate": 0.000897890449009863, + "loss": 0.89057022, + "num_input_tokens_seen": 99762016, + "router_z_loss_mlp": 0.50561523, + "step": 1200, + "time_per_iteration": 2.7373695373535156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159063, + "balance_loss_mlp": 1.11052144, + "epoch": 0.23105040400153906, + "flos": 555669932544.0, + "grad_norm": 0.09508340337221405, + "language_loss": 0.9041636, + "learning_rate": 0.0008977017065454853, + "loss": 0.91575426, + "num_input_tokens_seen": 99835552, + "router_z_loss_mlp": 0.4855957, + "step": 1201, + "time_per_iteration": 2.6561479568481445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172289, + "balance_loss_mlp": 1.12393796, + "epoch": 0.23124278568680262, + "flos": 704788936704.0, + "grad_norm": 0.06896397472633412, + "language_loss": 0.8110497, + "learning_rate": 0.0008975128096796121, + "loss": 0.82277262, + "num_input_tokens_seen": 99910784, + "router_z_loss_mlp": 0.48413086, + "step": 1202, + "time_per_iteration": 2.850882053375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166428, + "balance_loss_mlp": 1.11583591, + "epoch": 0.23143516737206618, + "flos": 612768043008.0, + "grad_norm": 0.07234791297382964, + "language_loss": 0.86751068, + "learning_rate": 0.0008973237584855794, + "loss": 0.87917495, + "num_input_tokens_seen": 99991120, + "router_z_loss_mlp": 0.50610352, + "step": 1203, + "time_per_iteration": 2.898651599884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201199, + "balance_loss_mlp": 1.14912796, + "epoch": 0.23162754905732974, + "flos": 389242796544.0, + "grad_norm": 0.0647782155366788, + "language_loss": 0.82535917, + "learning_rate": 0.0008971345530367832, + "loss": 0.83737111, + "num_input_tokens_seen": 100053888, + "router_z_loss_mlp": 0.52172852, + "step": 1204, + "time_per_iteration": 2.479710102081299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188056, + "balance_loss_mlp": 1.13743997, + "epoch": 0.2318199307425933, + "flos": 667778535936.0, + "grad_norm": 0.07976239468268423, + "language_loss": 0.86050093, + "learning_rate": 0.0008969451934066799, + "loss": 0.87238145, + "num_input_tokens_seen": 100124176, + "router_z_loss_mlp": 0.50561523, + "step": 1205, + "time_per_iteration": 2.7891948223114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190172, + "balance_loss_mlp": 1.13834012, + "epoch": 0.23201231242785686, + "flos": 666399757824.0, + "grad_norm": 0.08603625620414594, + "language_loss": 0.8068459, + "learning_rate": 0.0008967556796687854, + "loss": 0.81874764, + "num_input_tokens_seen": 100205296, + "router_z_loss_mlp": 0.51879883, + "step": 1206, + "time_per_iteration": 2.879742383956909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182453, + "balance_loss_mlp": 1.1313839, + "epoch": 0.23220469411312042, + "flos": 748816565760.0, + "grad_norm": 0.06613018456643845, + "language_loss": 0.8416872, + "learning_rate": 0.0008965660118966752, + "loss": 0.85351169, + "num_input_tokens_seen": 100279440, + "router_z_loss_mlp": 0.51098633, + "step": 1207, + "time_per_iteration": 2.8900513648986816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163905, + "balance_loss_mlp": 1.11610246, + "epoch": 0.232397075798384, + "flos": 667061982720.0, + "grad_norm": 0.06058209183838784, + "language_loss": 0.90754479, + "learning_rate": 0.0008963761901639851, + "loss": 0.91918385, + "num_input_tokens_seen": 100354512, + "router_z_loss_mlp": 0.47802734, + "step": 1208, + "time_per_iteration": 2.805534601211548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176094, + "balance_loss_mlp": 1.12457156, + "epoch": 0.23258945748364757, + "flos": 610218261504.0, + "grad_norm": 0.06993420403149982, + "language_loss": 0.83909518, + "learning_rate": 0.0008961862145444103, + "loss": 0.85085618, + "num_input_tokens_seen": 100426848, + "router_z_loss_mlp": 0.51538086, + "step": 1209, + "time_per_iteration": 2.6882550716400146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197419, + "balance_loss_mlp": 1.14587319, + "epoch": 0.23278183916891113, + "flos": 489651982848.0, + "grad_norm": 0.08646594069324176, + "language_loss": 0.85994279, + "learning_rate": 0.0008959960851117059, + "loss": 0.87191701, + "num_input_tokens_seen": 100496176, + "router_z_loss_mlp": 0.51611328, + "step": 1210, + "time_per_iteration": 2.6176648139953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118703, + "balance_loss_mlp": 1.13340998, + "epoch": 0.23297422085417469, + "flos": 511585403904.0, + "grad_norm": 0.06670419812311852, + "language_loss": 0.84013158, + "learning_rate": 0.0008958058019396868, + "loss": 0.85200191, + "num_input_tokens_seen": 100575072, + "router_z_loss_mlp": 0.53637695, + "step": 1211, + "time_per_iteration": 2.7867624759674072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177443, + "balance_loss_mlp": 1.12754154, + "epoch": 0.23316660253943824, + "flos": 546421552128.0, + "grad_norm": 0.08722593193124767, + "language_loss": 0.87226063, + "learning_rate": 0.0008956153651022274, + "loss": 0.88403505, + "num_input_tokens_seen": 100648304, + "router_z_loss_mlp": 0.49926758, + "step": 1212, + "time_per_iteration": 2.671705961227417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169147, + "balance_loss_mlp": 1.11726665, + "epoch": 0.2333589842247018, + "flos": 510256184832.0, + "grad_norm": 0.06082314874639417, + "language_loss": 0.84296238, + "learning_rate": 0.0008954247746732618, + "loss": 0.85465384, + "num_input_tokens_seen": 100717616, + "router_z_loss_mlp": 0.51904297, + "step": 1213, + "time_per_iteration": 2.58005952835083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163838, + "balance_loss_mlp": 1.1156534, + "epoch": 0.23355136590996536, + "flos": 663148104192.0, + "grad_norm": 0.06006865966510304, + "language_loss": 0.91204965, + "learning_rate": 0.0008952340307267837, + "loss": 0.92368799, + "num_input_tokens_seen": 100797056, + "router_z_loss_mlp": 0.48144531, + "step": 1214, + "time_per_iteration": 2.842824697494507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149903, + "balance_loss_mlp": 1.09983516, + "epoch": 0.23374374759522892, + "flos": 508457461248.0, + "grad_norm": 0.07140080071894721, + "language_loss": 0.84202802, + "learning_rate": 0.0008950431333368468, + "loss": 0.85352707, + "num_input_tokens_seen": 100863632, + "router_z_loss_mlp": 0.50097656, + "step": 1215, + "time_per_iteration": 2.5616672039031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155221, + "balance_loss_mlp": 1.10656011, + "epoch": 0.2339361292804925, + "flos": 1294455499776.0, + "grad_norm": 0.083723319453273, + "language_loss": 0.85366404, + "learning_rate": 0.0008948520825775634, + "loss": 0.86521626, + "num_input_tokens_seen": 100950272, + "router_z_loss_mlp": 0.48657227, + "step": 1216, + "time_per_iteration": 3.652561664581299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114764, + "balance_loss_mlp": 1.09895492, + "epoch": 0.23412851096575607, + "flos": 705928006656.0, + "grad_norm": 0.05781662545039131, + "language_loss": 0.84181142, + "learning_rate": 0.0008946608785231067, + "loss": 0.85328782, + "num_input_tokens_seen": 101031008, + "router_z_loss_mlp": 0.48706055, + "step": 1217, + "time_per_iteration": 2.861449956893921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131497, + "balance_loss_mlp": 1.08352745, + "epoch": 0.23432089265101963, + "flos": 438263903232.0, + "grad_norm": 0.06428977242182035, + "language_loss": 0.85432529, + "learning_rate": 0.0008944695212477084, + "loss": 0.86564028, + "num_input_tokens_seen": 101094688, + "router_z_loss_mlp": 0.47973633, + "step": 1218, + "time_per_iteration": 2.540524959564209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148618, + "balance_loss_mlp": 1.09907508, + "epoch": 0.2345132743362832, + "flos": 480939918336.0, + "grad_norm": 0.060914019840806265, + "language_loss": 0.86493349, + "learning_rate": 0.0008942780108256599, + "loss": 0.87641972, + "num_input_tokens_seen": 101163744, + "router_z_loss_mlp": 0.49560547, + "step": 1219, + "time_per_iteration": 2.613769769668579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142119, + "balance_loss_mlp": 1.09100199, + "epoch": 0.23470565602154675, + "flos": 411453001728.0, + "grad_norm": 0.05108155821019921, + "language_loss": 0.87340164, + "learning_rate": 0.0008940863473313121, + "loss": 0.88482285, + "num_input_tokens_seen": 101226480, + "router_z_loss_mlp": 0.51123047, + "step": 1220, + "time_per_iteration": 2.4549899101257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145999, + "balance_loss_mlp": 1.09702742, + "epoch": 0.2348980377068103, + "flos": 545450609664.0, + "grad_norm": 0.07702998226564757, + "language_loss": 0.8851074, + "learning_rate": 0.0008938945308390756, + "loss": 0.8965674, + "num_input_tokens_seen": 101291824, + "router_z_loss_mlp": 0.48974609, + "step": 1221, + "time_per_iteration": 2.6133854389190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149321, + "balance_loss_mlp": 1.10211444, + "epoch": 0.23509041939207387, + "flos": 575740389888.0, + "grad_norm": 0.057479910137590906, + "language_loss": 0.88199294, + "learning_rate": 0.00089370256142342, + "loss": 0.89348614, + "num_input_tokens_seen": 101367216, + "router_z_loss_mlp": 0.47192383, + "step": 1222, + "time_per_iteration": 2.713489532470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141929, + "balance_loss_mlp": 1.09286284, + "epoch": 0.23528280107733743, + "flos": 588843177984.0, + "grad_norm": 0.05442066188859713, + "language_loss": 0.85879123, + "learning_rate": 0.0008935104391588746, + "loss": 0.87021047, + "num_input_tokens_seen": 101438992, + "router_z_loss_mlp": 0.49121094, + "step": 1223, + "time_per_iteration": 2.7304563522338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145924, + "balance_loss_mlp": 1.09447336, + "epoch": 0.235475182762601, + "flos": 823328893440.0, + "grad_norm": 0.05049406517739995, + "language_loss": 0.8341555, + "learning_rate": 0.0008933181641200276, + "loss": 0.84561473, + "num_input_tokens_seen": 101534464, + "router_z_loss_mlp": 0.51513672, + "step": 1224, + "time_per_iteration": 3.122603416442871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139619, + "balance_loss_mlp": 1.09279394, + "epoch": 0.23566756444786457, + "flos": 680164770816.0, + "grad_norm": 0.0678885239417847, + "language_loss": 0.8627063, + "learning_rate": 0.0008931257363815271, + "loss": 0.87410253, + "num_input_tokens_seen": 101616496, + "router_z_loss_mlp": 0.46826172, + "step": 1225, + "time_per_iteration": 2.86014986038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142208, + "balance_loss_mlp": 1.09490585, + "epoch": 0.23585994613312813, + "flos": 701811495936.0, + "grad_norm": 0.0639396043769501, + "language_loss": 0.90318632, + "learning_rate": 0.0008929331560180798, + "loss": 0.91460842, + "num_input_tokens_seen": 101694496, + "router_z_loss_mlp": 0.47338867, + "step": 1226, + "time_per_iteration": 2.9069020748138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158077, + "balance_loss_mlp": 1.10924876, + "epoch": 0.2360523278183917, + "flos": 524176842240.0, + "grad_norm": 0.05735405278544162, + "language_loss": 0.9124881, + "learning_rate": 0.0008927404231044525, + "loss": 0.92406881, + "num_input_tokens_seen": 101766160, + "router_z_loss_mlp": 0.48828125, + "step": 1227, + "time_per_iteration": 2.745591163635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154284, + "balance_loss_mlp": 1.10571766, + "epoch": 0.23624470950365525, + "flos": 524310091776.0, + "grad_norm": 0.062458312515348655, + "language_loss": 0.8233285, + "learning_rate": 0.0008925475377154703, + "loss": 0.83487129, + "num_input_tokens_seen": 101844160, + "router_z_loss_mlp": 0.48583984, + "step": 1228, + "time_per_iteration": 2.7165796756744385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147881, + "balance_loss_mlp": 1.09664452, + "epoch": 0.2364370911889188, + "flos": 596811525120.0, + "grad_norm": 0.06307879716822463, + "language_loss": 0.82915187, + "learning_rate": 0.0008923544999260183, + "loss": 0.84063065, + "num_input_tokens_seen": 101917968, + "router_z_loss_mlp": 0.51293945, + "step": 1229, + "time_per_iteration": 2.787444829940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156102, + "balance_loss_mlp": 1.10567617, + "epoch": 0.23662947287418237, + "flos": 756849153024.0, + "grad_norm": 0.06236445133400911, + "language_loss": 0.92471206, + "learning_rate": 0.00089216130981104, + "loss": 0.9362731, + "num_input_tokens_seen": 101996880, + "router_z_loss_mlp": 0.50439453, + "step": 1230, + "time_per_iteration": 3.0671463012695312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148349, + "balance_loss_mlp": 1.09816241, + "epoch": 0.23682185455944593, + "flos": 546167162880.0, + "grad_norm": 0.06420697058211047, + "language_loss": 0.82893002, + "learning_rate": 0.000891967967445539, + "loss": 0.84041357, + "num_input_tokens_seen": 102067936, + "router_z_loss_mlp": 0.50195312, + "step": 1231, + "time_per_iteration": 2.692819356918335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147263, + "balance_loss_mlp": 1.09733796, + "epoch": 0.2370142362447095, + "flos": 662285818368.0, + "grad_norm": 0.044472050821074895, + "language_loss": 0.89257467, + "learning_rate": 0.0008917744729045772, + "loss": 0.90404725, + "num_input_tokens_seen": 102147552, + "router_z_loss_mlp": 0.49975586, + "step": 1232, + "time_per_iteration": 2.911123037338257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151473, + "balance_loss_mlp": 1.10190618, + "epoch": 0.23720661792997308, + "flos": 683670813696.0, + "grad_norm": 0.055115174481180494, + "language_loss": 0.84317499, + "learning_rate": 0.0008915808262632757, + "loss": 0.85468972, + "num_input_tokens_seen": 102224480, + "router_z_loss_mlp": 0.49633789, + "step": 1233, + "time_per_iteration": 2.8429055213928223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164245, + "balance_loss_mlp": 1.1117928, + "epoch": 0.23739899961523664, + "flos": 558909103104.0, + "grad_norm": 0.07089823280283834, + "language_loss": 0.93916011, + "learning_rate": 0.0008913870275968148, + "loss": 0.95080256, + "num_input_tokens_seen": 102297392, + "router_z_loss_mlp": 0.52392578, + "step": 1234, + "time_per_iteration": 2.7355082035064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152305, + "balance_loss_mlp": 1.10321498, + "epoch": 0.2375913813005002, + "flos": 889546904064.0, + "grad_norm": 0.06512180670183462, + "language_loss": 0.87916219, + "learning_rate": 0.0008911930769804342, + "loss": 0.8906852, + "num_input_tokens_seen": 102386032, + "router_z_loss_mlp": 0.49145508, + "step": 1235, + "time_per_iteration": 3.320653200149536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115702, + "balance_loss_mlp": 1.10549772, + "epoch": 0.23778376298576376, + "flos": 641120707584.0, + "grad_norm": 0.04926889071384256, + "language_loss": 0.91928077, + "learning_rate": 0.0008909989744894318, + "loss": 0.93085092, + "num_input_tokens_seen": 102463504, + "router_z_loss_mlp": 0.51513672, + "step": 1236, + "time_per_iteration": 2.860095500946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114863, + "balance_loss_mlp": 1.09808517, + "epoch": 0.23797614467102732, + "flos": 616820313600.0, + "grad_norm": 0.06373579401102465, + "language_loss": 0.81724823, + "learning_rate": 0.0008908047201991649, + "loss": 0.82873452, + "num_input_tokens_seen": 102529632, + "router_z_loss_mlp": 0.50512695, + "step": 1237, + "time_per_iteration": 2.7173092365264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146815, + "balance_loss_mlp": 1.10065758, + "epoch": 0.23816852635629088, + "flos": 624245004288.0, + "grad_norm": 0.06973577397583665, + "language_loss": 0.86895192, + "learning_rate": 0.0008906103141850502, + "loss": 0.88042009, + "num_input_tokens_seen": 102610192, + "router_z_loss_mlp": 0.46142578, + "step": 1238, + "time_per_iteration": 2.9070518016815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149112, + "balance_loss_mlp": 1.10068893, + "epoch": 0.23836090804155444, + "flos": 521431769088.0, + "grad_norm": 0.07438040904238923, + "language_loss": 0.88608682, + "learning_rate": 0.0008904157565225621, + "loss": 0.897578, + "num_input_tokens_seen": 102681216, + "router_z_loss_mlp": 0.48461914, + "step": 1239, + "time_per_iteration": 2.598175287246704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114606, + "balance_loss_mlp": 1.09758997, + "epoch": 0.238553289726818, + "flos": 1153991660544.0, + "grad_norm": 0.07265689268382322, + "language_loss": 0.82424903, + "learning_rate": 0.000890221047287235, + "loss": 0.83570957, + "num_input_tokens_seen": 102777184, + "router_z_loss_mlp": 0.48486328, + "step": 1240, + "time_per_iteration": 3.5255463123321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149116, + "balance_loss_mlp": 1.10207629, + "epoch": 0.23874567141208156, + "flos": 499861393920.0, + "grad_norm": 0.07692592831537566, + "language_loss": 0.91524613, + "learning_rate": 0.0008900261865546615, + "loss": 0.92673725, + "num_input_tokens_seen": 102845744, + "router_z_loss_mlp": 0.47021484, + "step": 1241, + "time_per_iteration": 2.626298189163208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150585, + "balance_loss_mlp": 1.10101807, + "epoch": 0.23893805309734514, + "flos": 556934911488.0, + "grad_norm": 0.06193436068824588, + "language_loss": 0.85487348, + "learning_rate": 0.0008898311744004936, + "loss": 0.86637932, + "num_input_tokens_seen": 102918064, + "router_z_loss_mlp": 0.49584961, + "step": 1242, + "time_per_iteration": 2.6845884323120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143159, + "balance_loss_mlp": 1.09638107, + "epoch": 0.2391304347826087, + "flos": 549270512640.0, + "grad_norm": 0.06489370510499948, + "language_loss": 0.87195957, + "learning_rate": 0.0008896360109004414, + "loss": 0.88339114, + "num_input_tokens_seen": 102983920, + "router_z_loss_mlp": 0.46801758, + "step": 1243, + "time_per_iteration": 2.6279244422912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149339, + "balance_loss_mlp": 1.10239482, + "epoch": 0.23932281646787226, + "flos": 516050279424.0, + "grad_norm": 0.05690023470638135, + "language_loss": 0.84913921, + "learning_rate": 0.0008894406961302742, + "loss": 0.8606326, + "num_input_tokens_seen": 103053328, + "router_z_loss_mlp": 0.46948242, + "step": 1244, + "time_per_iteration": 2.5823607444763184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161498, + "balance_loss_mlp": 1.11591244, + "epoch": 0.23951519815313582, + "flos": 743682124800.0, + "grad_norm": 0.06599652790645752, + "language_loss": 0.84225279, + "learning_rate": 0.0008892452301658201, + "loss": 0.85386777, + "num_input_tokens_seen": 103128208, + "router_z_loss_mlp": 0.45581055, + "step": 1245, + "time_per_iteration": 3.0007240772247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153792, + "balance_loss_mlp": 1.1045351, + "epoch": 0.23970757983839938, + "flos": 554118257664.0, + "grad_norm": 0.05569216777143309, + "language_loss": 0.83851659, + "learning_rate": 0.0008890496130829653, + "loss": 0.8500545, + "num_input_tokens_seen": 103197392, + "router_z_loss_mlp": 0.49316406, + "step": 1246, + "time_per_iteration": 2.656524658203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149976, + "balance_loss_mlp": 1.10424757, + "epoch": 0.23989996152366294, + "flos": 480655793664.0, + "grad_norm": 0.0643203237989141, + "language_loss": 0.85808307, + "learning_rate": 0.0008888538449576555, + "loss": 0.86958289, + "num_input_tokens_seen": 103265328, + "router_z_loss_mlp": 0.45751953, + "step": 1247, + "time_per_iteration": 2.5420141220092773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148571, + "balance_loss_mlp": 1.09993315, + "epoch": 0.2400923432089265, + "flos": 485310818304.0, + "grad_norm": 0.07931889136759729, + "language_loss": 0.83083689, + "learning_rate": 0.0008886579258658944, + "loss": 0.84232259, + "num_input_tokens_seen": 103331632, + "router_z_loss_mlp": 0.48632812, + "step": 1248, + "time_per_iteration": 2.574025869369507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136833, + "balance_loss_mlp": 1.08786154, + "epoch": 0.24028472489419006, + "flos": 623555615232.0, + "grad_norm": 0.057547694087262784, + "language_loss": 0.85210383, + "learning_rate": 0.0008884618558837446, + "loss": 0.8634721, + "num_input_tokens_seen": 103405408, + "router_z_loss_mlp": 0.48974609, + "step": 1249, + "time_per_iteration": 2.808790922164917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146334, + "balance_loss_mlp": 1.09407234, + "epoch": 0.24047710657945365, + "flos": 601602370560.0, + "grad_norm": 0.05843363394571656, + "language_loss": 0.87170362, + "learning_rate": 0.0008882656350873273, + "loss": 0.88316691, + "num_input_tokens_seen": 103487216, + "router_z_loss_mlp": 0.52319336, + "step": 1250, + "time_per_iteration": 2.839341163635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139888, + "balance_loss_mlp": 1.08998704, + "epoch": 0.2406694882647172, + "flos": 841558781952.0, + "grad_norm": 0.06920486589868534, + "language_loss": 0.87495792, + "learning_rate": 0.0008880692635528219, + "loss": 0.88635677, + "num_input_tokens_seen": 103568640, + "router_z_loss_mlp": 0.49975586, + "step": 1251, + "time_per_iteration": 3.0422415733337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141134, + "balance_loss_mlp": 1.09404635, + "epoch": 0.24086186994998077, + "flos": 527057736192.0, + "grad_norm": 0.09445201185980338, + "language_loss": 0.89987123, + "learning_rate": 0.0008878727413564669, + "loss": 0.91128266, + "num_input_tokens_seen": 103640784, + "router_z_loss_mlp": 0.47094727, + "step": 1252, + "time_per_iteration": 2.7974343299865723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110917, + "balance_loss_mlp": 1.09066832, + "epoch": 0.24105425163524433, + "flos": 1338261378048.0, + "grad_norm": 0.0270998190046769, + "language_loss": 0.80135596, + "learning_rate": 0.0008876760685745588, + "loss": 0.81244767, + "num_input_tokens_seen": 103865824, + "router_z_loss_mlp": 0.18457031, + "step": 1253, + "time_per_iteration": 4.892668724060059 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150056, + "balance_loss_mlp": 1.09707963, + "epoch": 0.24124663332050789, + "flos": 614102404608.0, + "grad_norm": 0.06472275672686992, + "language_loss": 0.79044139, + "learning_rate": 0.0008874792452834528, + "loss": 0.80194199, + "num_input_tokens_seen": 103939872, + "router_z_loss_mlp": 0.53076172, + "step": 1254, + "time_per_iteration": 2.759533643722534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144784, + "balance_loss_mlp": 1.09397733, + "epoch": 0.24143901500577145, + "flos": 575540328960.0, + "grad_norm": 0.08671647217417044, + "language_loss": 0.87847424, + "learning_rate": 0.0008872822715595626, + "loss": 0.88992208, + "num_input_tokens_seen": 104011120, + "router_z_loss_mlp": 0.50878906, + "step": 1255, + "time_per_iteration": 2.6758921146392822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136115, + "balance_loss_mlp": 1.08731091, + "epoch": 0.241631396691035, + "flos": 495181776384.0, + "grad_norm": 0.07818195128513271, + "language_loss": 0.87750483, + "learning_rate": 0.0008870851474793598, + "loss": 0.88886595, + "num_input_tokens_seen": 104077040, + "router_z_loss_mlp": 0.48803711, + "step": 1256, + "time_per_iteration": 2.5903451442718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140246, + "balance_loss_mlp": 1.09196591, + "epoch": 0.24182377837629856, + "flos": 636191470080.0, + "grad_norm": 0.06462138017812241, + "language_loss": 0.90108514, + "learning_rate": 0.0008868878731193752, + "loss": 0.91248751, + "num_input_tokens_seen": 104150880, + "router_z_loss_mlp": 0.48291016, + "step": 1257, + "time_per_iteration": 2.9156484603881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131411, + "balance_loss_mlp": 1.08611095, + "epoch": 0.24201616006156215, + "flos": 515219927040.0, + "grad_norm": 0.06839520252820154, + "language_loss": 0.89823216, + "learning_rate": 0.0008866904485561973, + "loss": 0.90954626, + "num_input_tokens_seen": 104223696, + "router_z_loss_mlp": 0.45361328, + "step": 1258, + "time_per_iteration": 2.709073066711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128152, + "balance_loss_mlp": 1.07698727, + "epoch": 0.2422085417468257, + "flos": 615144927744.0, + "grad_norm": 0.061516465429869265, + "language_loss": 0.83619797, + "learning_rate": 0.000886492873866473, + "loss": 0.84747952, + "num_input_tokens_seen": 104301728, + "router_z_loss_mlp": 0.51245117, + "step": 1259, + "time_per_iteration": 2.8063783645629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122174, + "balance_loss_mlp": 1.07315516, + "epoch": 0.24240092343208927, + "flos": 585794156544.0, + "grad_norm": 0.07532562043269028, + "language_loss": 0.85057306, + "learning_rate": 0.000886295149126908, + "loss": 0.86179483, + "num_input_tokens_seen": 104374480, + "router_z_loss_mlp": 0.49023438, + "step": 1260, + "time_per_iteration": 2.7702596187591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120309, + "balance_loss_mlp": 1.07291138, + "epoch": 0.24259330511735283, + "flos": 762257806848.0, + "grad_norm": 0.06506459806255929, + "language_loss": 0.86249155, + "learning_rate": 0.0008860972744142655, + "loss": 0.87369466, + "num_input_tokens_seen": 104452384, + "router_z_loss_mlp": 0.47363281, + "step": 1261, + "time_per_iteration": 2.9010353088378906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111356, + "balance_loss_mlp": 1.06575668, + "epoch": 0.2427856868026164, + "flos": 626878849536.0, + "grad_norm": 0.05333874014607912, + "language_loss": 0.82215619, + "learning_rate": 0.0008858992498053671, + "loss": 0.83329183, + "num_input_tokens_seen": 104532576, + "router_z_loss_mlp": 0.47729492, + "step": 1262, + "time_per_iteration": 2.8307647705078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105759, + "balance_loss_mlp": 1.08506405, + "epoch": 0.24297806848787995, + "flos": 1511653985280.0, + "grad_norm": 0.04388178085496151, + "language_loss": 0.7658875, + "learning_rate": 0.0008857010753770934, + "loss": 0.77694511, + "num_input_tokens_seen": 104765216, + "router_z_loss_mlp": 0.20703125, + "step": 1263, + "time_per_iteration": 4.839150428771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113047, + "balance_loss_mlp": 1.06517243, + "epoch": 0.2431704501731435, + "flos": 541949336064.0, + "grad_norm": 0.07576677138650743, + "language_loss": 0.83877796, + "learning_rate": 0.0008855027512063817, + "loss": 0.84990847, + "num_input_tokens_seen": 104836912, + "router_z_loss_mlp": 0.47924805, + "step": 1264, + "time_per_iteration": 2.6955387592315674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116702, + "balance_loss_mlp": 1.06847, + "epoch": 0.24336283185840707, + "flos": 523845729792.0, + "grad_norm": 0.08737911579836782, + "language_loss": 0.86160326, + "learning_rate": 0.0008853042773702292, + "loss": 0.87277025, + "num_input_tokens_seen": 104909280, + "router_z_loss_mlp": 0.48217773, + "step": 1265, + "time_per_iteration": 2.718477725982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123795, + "balance_loss_mlp": 1.07191551, + "epoch": 0.24355521354367063, + "flos": 537111502848.0, + "grad_norm": 0.05410456343654981, + "language_loss": 0.87916005, + "learning_rate": 0.0008851056539456896, + "loss": 0.89039803, + "num_input_tokens_seen": 104982560, + "router_z_loss_mlp": 0.51855469, + "step": 1266, + "time_per_iteration": 2.668398380279541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127352, + "balance_loss_mlp": 1.07792759, + "epoch": 0.24374759522893422, + "flos": 930461271552.0, + "grad_norm": 0.06341671281787149, + "language_loss": 0.82546353, + "learning_rate": 0.0008849068810098755, + "loss": 0.8367371, + "num_input_tokens_seen": 105075056, + "router_z_loss_mlp": 0.49414062, + "step": 1267, + "time_per_iteration": 3.348644971847534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132731, + "balance_loss_mlp": 1.08523834, + "epoch": 0.24393997691419778, + "flos": 427787619840.0, + "grad_norm": 0.08675992555990221, + "language_loss": 0.8333391, + "learning_rate": 0.0008847079586399575, + "loss": 0.84466636, + "num_input_tokens_seen": 105137536, + "router_z_loss_mlp": 0.47509766, + "step": 1268, + "time_per_iteration": 2.549433946609497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126198, + "balance_loss_mlp": 1.07994461, + "epoch": 0.24413235859946134, + "flos": 578853651456.0, + "grad_norm": 0.07249150513377325, + "language_loss": 0.8672694, + "learning_rate": 0.0008845088869131641, + "loss": 0.87853134, + "num_input_tokens_seen": 105204848, + "router_z_loss_mlp": 0.46289062, + "step": 1269, + "time_per_iteration": 2.6586451530456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149975, + "balance_loss_mlp": 1.10145724, + "epoch": 0.2443247402847249, + "flos": 529859708928.0, + "grad_norm": 0.06266770628228314, + "language_loss": 0.89411461, + "learning_rate": 0.0008843096659067818, + "loss": 0.90561438, + "num_input_tokens_seen": 105273456, + "router_z_loss_mlp": 0.48510742, + "step": 1270, + "time_per_iteration": 2.626946210861206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146652, + "balance_loss_mlp": 1.10228229, + "epoch": 0.24451712196998845, + "flos": 696321349632.0, + "grad_norm": 0.056965438466979365, + "language_loss": 0.86992264, + "learning_rate": 0.000884110295698155, + "loss": 0.88138914, + "num_input_tokens_seen": 105355488, + "router_z_loss_mlp": 0.44335938, + "step": 1271, + "time_per_iteration": 2.970078706741333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160922, + "balance_loss_mlp": 1.11080623, + "epoch": 0.24470950365525201, + "flos": 529832544768.0, + "grad_norm": 0.06894839907125858, + "language_loss": 0.86557794, + "learning_rate": 0.0008839107763646861, + "loss": 0.87718713, + "num_input_tokens_seen": 105421568, + "router_z_loss_mlp": 0.5012207, + "step": 1272, + "time_per_iteration": 2.592349052429199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183532, + "balance_loss_mlp": 1.13437057, + "epoch": 0.24490188534051557, + "flos": 491342049792.0, + "grad_norm": 0.06647703149266906, + "language_loss": 0.90856385, + "learning_rate": 0.0008837111079838353, + "loss": 0.92039919, + "num_input_tokens_seen": 105493072, + "router_z_loss_mlp": 0.49194336, + "step": 1273, + "time_per_iteration": 2.7098910808563232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118943, + "balance_loss_mlp": 1.14289117, + "epoch": 0.24509426702577913, + "flos": 474155057664.0, + "grad_norm": 0.05923779703064254, + "language_loss": 0.90316379, + "learning_rate": 0.000883511290633121, + "loss": 0.91505814, + "num_input_tokens_seen": 105559840, + "router_z_loss_mlp": 0.46533203, + "step": 1274, + "time_per_iteration": 2.5714197158813477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186005, + "balance_loss_mlp": 1.13739181, + "epoch": 0.24528664871104272, + "flos": 550592391168.0, + "grad_norm": 0.060927364177961095, + "language_loss": 0.92697686, + "learning_rate": 0.000883311324390119, + "loss": 0.93883693, + "num_input_tokens_seen": 105634448, + "router_z_loss_mlp": 0.48608398, + "step": 1275, + "time_per_iteration": 2.740896224975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189584, + "balance_loss_mlp": 1.13474798, + "epoch": 0.24547903039630628, + "flos": 825903641088.0, + "grad_norm": 0.07775603238406727, + "language_loss": 0.82056022, + "learning_rate": 0.0008831112093324629, + "loss": 0.83245611, + "num_input_tokens_seen": 105711936, + "router_z_loss_mlp": 0.5480957, + "step": 1276, + "time_per_iteration": 3.0821468830108643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190059, + "balance_loss_mlp": 1.13927567, + "epoch": 0.24567141208156984, + "flos": 591598162944.0, + "grad_norm": 0.05600773018776359, + "language_loss": 0.89543378, + "learning_rate": 0.0008829109455378444, + "loss": 0.90733445, + "num_input_tokens_seen": 105780240, + "router_z_loss_mlp": 0.50830078, + "step": 1277, + "time_per_iteration": 2.7299413681030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192458, + "balance_loss_mlp": 1.14241397, + "epoch": 0.2458637937668334, + "flos": 547874482176.0, + "grad_norm": 0.05156937738675093, + "language_loss": 0.87083036, + "learning_rate": 0.000882710533084013, + "loss": 0.88275498, + "num_input_tokens_seen": 105849840, + "router_z_loss_mlp": 0.5, + "step": 1278, + "time_per_iteration": 2.6295228004455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185847, + "balance_loss_mlp": 1.13568354, + "epoch": 0.24605617545209696, + "flos": 515894635008.0, + "grad_norm": 0.05927927368096647, + "language_loss": 0.90088928, + "learning_rate": 0.0008825099720487755, + "loss": 0.91274774, + "num_input_tokens_seen": 105921488, + "router_z_loss_mlp": 0.50195312, + "step": 1279, + "time_per_iteration": 2.630868434906006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149494, + "balance_loss_mlp": 1.13461673, + "epoch": 0.24624855713736052, + "flos": 1511772553728.0, + "grad_norm": 0.04555367127523109, + "language_loss": 0.7526114, + "learning_rate": 0.0008823092625099967, + "loss": 0.76410633, + "num_input_tokens_seen": 106146816, + "router_z_loss_mlp": 0.1484375, + "step": 1280, + "time_per_iteration": 4.843670129776001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118256, + "balance_loss_mlp": 1.10366488, + "epoch": 0.24644093882262408, + "flos": 1527608305152.0, + "grad_norm": 0.038204832859796624, + "language_loss": 0.77944112, + "learning_rate": 0.0008821084045455987, + "loss": 0.79062366, + "num_input_tokens_seen": 106361568, + "router_z_loss_mlp": 0.14550781, + "step": 1281, + "time_per_iteration": 4.784554481506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115452, + "balance_loss_mlp": 1.10547721, + "epoch": 0.24663332050788764, + "flos": 659118228480.0, + "grad_norm": 0.05852441511604794, + "language_loss": 0.89541078, + "learning_rate": 0.0008819073982335619, + "loss": 0.90695602, + "num_input_tokens_seen": 106435296, + "router_z_loss_mlp": 0.49047852, + "step": 1282, + "time_per_iteration": 2.8370161056518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141297, + "balance_loss_mlp": 1.09726083, + "epoch": 0.24682570219315123, + "flos": 541769098752.0, + "grad_norm": 0.07515840278086762, + "language_loss": 0.84908974, + "learning_rate": 0.0008817062436519235, + "loss": 0.86050272, + "num_input_tokens_seen": 106507184, + "router_z_loss_mlp": 0.44042969, + "step": 1283, + "time_per_iteration": 2.6532042026519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114078, + "balance_loss_mlp": 1.09164214, + "epoch": 0.24701808387841478, + "flos": 440695116288.0, + "grad_norm": 0.051214690731677004, + "language_loss": 0.9022612, + "learning_rate": 0.0008815049408787788, + "loss": 0.91366905, + "num_input_tokens_seen": 106571472, + "router_z_loss_mlp": 0.49072266, + "step": 1284, + "time_per_iteration": 2.577040195465088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145806, + "balance_loss_mlp": 1.09857535, + "epoch": 0.24721046556367834, + "flos": 468066926592.0, + "grad_norm": 0.06399849872592922, + "language_loss": 0.86388409, + "learning_rate": 0.0008813034899922805, + "loss": 0.87534213, + "num_input_tokens_seen": 106638368, + "router_z_loss_mlp": 0.47216797, + "step": 1285, + "time_per_iteration": 2.586411476135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153157, + "balance_loss_mlp": 1.10366094, + "epoch": 0.2474028472489419, + "flos": 504427585536.0, + "grad_norm": 0.05962621730359375, + "language_loss": 0.90523338, + "learning_rate": 0.0008811018910706387, + "loss": 0.91676497, + "num_input_tokens_seen": 106705312, + "router_z_loss_mlp": 0.49536133, + "step": 1286, + "time_per_iteration": 2.558340311050415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150564, + "balance_loss_mlp": 1.0996381, + "epoch": 0.24759522893420546, + "flos": 479956492800.0, + "grad_norm": 0.08171747444285254, + "language_loss": 0.82914776, + "learning_rate": 0.0008809001441921211, + "loss": 0.84065336, + "num_input_tokens_seen": 106778624, + "router_z_loss_mlp": 0.50976562, + "step": 1287, + "time_per_iteration": 2.7096829414367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134679, + "balance_loss_mlp": 1.08651865, + "epoch": 0.24778761061946902, + "flos": 533706776064.0, + "grad_norm": 0.061876473909820096, + "language_loss": 0.86037469, + "learning_rate": 0.0008806982494350528, + "loss": 0.87172151, + "num_input_tokens_seen": 106847744, + "router_z_loss_mlp": 0.48144531, + "step": 1288, + "time_per_iteration": 2.6826744079589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138715, + "balance_loss_mlp": 1.0885514, + "epoch": 0.24797999230473258, + "flos": 559798553088.0, + "grad_norm": 0.05818805427718153, + "language_loss": 0.90965348, + "learning_rate": 0.0008804962068778161, + "loss": 0.92104065, + "num_input_tokens_seen": 106927584, + "router_z_loss_mlp": 0.50195312, + "step": 1289, + "time_per_iteration": 2.9314775466918945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137271, + "balance_loss_mlp": 1.08872867, + "epoch": 0.24817237398999614, + "flos": 624225180672.0, + "grad_norm": 0.06661216201088474, + "language_loss": 0.81390089, + "learning_rate": 0.0008802940165988511, + "loss": 0.82527363, + "num_input_tokens_seen": 107006656, + "router_z_loss_mlp": 0.48510742, + "step": 1290, + "time_per_iteration": 2.8629136085510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113117, + "balance_loss_mlp": 1.08389127, + "epoch": 0.2483647556752597, + "flos": 612281286144.0, + "grad_norm": 0.06960392685137955, + "language_loss": 0.89268786, + "learning_rate": 0.000880091678676655, + "loss": 0.90399957, + "num_input_tokens_seen": 107084352, + "router_z_loss_mlp": 0.47265625, + "step": 1291, + "time_per_iteration": 2.8345038890838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136195, + "balance_loss_mlp": 1.08882165, + "epoch": 0.2485571373605233, + "flos": 583553092608.0, + "grad_norm": 0.058047960295431696, + "language_loss": 0.89150697, + "learning_rate": 0.0008798891931897821, + "loss": 0.90286887, + "num_input_tokens_seen": 107158368, + "router_z_loss_mlp": 0.47338867, + "step": 1292, + "time_per_iteration": 2.7299227714538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128008, + "balance_loss_mlp": 1.07949018, + "epoch": 0.24874951904578685, + "flos": 494749347840.0, + "grad_norm": 0.09954343743221296, + "language_loss": 0.84998739, + "learning_rate": 0.0008796865602168447, + "loss": 0.86126745, + "num_input_tokens_seen": 107224256, + "router_z_loss_mlp": 0.48535156, + "step": 1293, + "time_per_iteration": 2.5342278480529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127533, + "balance_loss_mlp": 1.08220935, + "epoch": 0.2489419007310504, + "flos": 456174789120.0, + "grad_norm": 0.05777797953149353, + "language_loss": 0.89527249, + "learning_rate": 0.0008794837798365115, + "loss": 0.90654784, + "num_input_tokens_seen": 107292720, + "router_z_loss_mlp": 0.45361328, + "step": 1294, + "time_per_iteration": 2.6889185905456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136807, + "balance_loss_mlp": 1.08886147, + "epoch": 0.24913428241631397, + "flos": 485471232000.0, + "grad_norm": 0.07754051928079464, + "language_loss": 0.89232659, + "learning_rate": 0.0008792808521275089, + "loss": 0.90369469, + "num_input_tokens_seen": 107368576, + "router_z_loss_mlp": 0.47924805, + "step": 1295, + "time_per_iteration": 2.7635927200317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136837, + "balance_loss_mlp": 1.09027398, + "epoch": 0.24932666410157753, + "flos": 518906580480.0, + "grad_norm": 0.09989296116771008, + "language_loss": 0.87984705, + "learning_rate": 0.0008790777771686206, + "loss": 0.89121538, + "num_input_tokens_seen": 107433856, + "router_z_loss_mlp": 0.46557617, + "step": 1296, + "time_per_iteration": 2.579235076904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124595, + "balance_loss_mlp": 1.07853234, + "epoch": 0.2495190457868411, + "flos": 472603382784.0, + "grad_norm": 0.08251132162328097, + "language_loss": 0.85680348, + "learning_rate": 0.0008788745550386872, + "loss": 0.86804938, + "num_input_tokens_seen": 107500944, + "router_z_loss_mlp": 0.46044922, + "step": 1297, + "time_per_iteration": 2.598031759262085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128617, + "balance_loss_mlp": 1.08152938, + "epoch": 0.24971142747210465, + "flos": 745886112768.0, + "grad_norm": 0.06717402893383145, + "language_loss": 0.80945367, + "learning_rate": 0.0008786711858166063, + "loss": 0.82073987, + "num_input_tokens_seen": 107580000, + "router_z_loss_mlp": 0.47070312, + "step": 1298, + "time_per_iteration": 2.9720141887664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133144, + "balance_loss_mlp": 1.08696246, + "epoch": 0.2499038091573682, + "flos": 749557711872.0, + "grad_norm": 0.058753985131359356, + "language_loss": 0.84356344, + "learning_rate": 0.0008784676695813332, + "loss": 0.85489488, + "num_input_tokens_seen": 107660384, + "router_z_loss_mlp": 0.46166992, + "step": 1299, + "time_per_iteration": 3.003113031387329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154452, + "balance_loss_mlp": 1.10700631, + "epoch": 0.2500961908426318, + "flos": 745060902912.0, + "grad_norm": 0.07081449776085671, + "language_loss": 0.85444576, + "learning_rate": 0.0008782640064118796, + "loss": 0.86599028, + "num_input_tokens_seen": 107736320, + "router_z_loss_mlp": 0.47436523, + "step": 1300, + "time_per_iteration": 2.8769848346710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166343, + "balance_loss_mlp": 1.14946294, + "epoch": 0.2502885725278953, + "flos": 1417424334336.0, + "grad_norm": 0.041859158942630086, + "language_loss": 0.7618475, + "learning_rate": 0.0008780601963873149, + "loss": 0.77351093, + "num_input_tokens_seen": 107972608, + "router_z_loss_mlp": 0.16894531, + "step": 1301, + "time_per_iteration": 4.951652526855469 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191692, + "balance_loss_mlp": 1.14701271, + "epoch": 0.2504809542131589, + "flos": 515215157760.0, + "grad_norm": 0.07273634964220443, + "language_loss": 0.8750245, + "learning_rate": 0.0008778562395867648, + "loss": 0.88694143, + "num_input_tokens_seen": 108043312, + "router_z_loss_mlp": 0.44677734, + "step": 1302, + "time_per_iteration": 2.604402542114258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181408, + "balance_loss_mlp": 1.13629961, + "epoch": 0.25067333589842244, + "flos": 525819921408.0, + "grad_norm": 0.07562070017846675, + "language_loss": 0.84288502, + "learning_rate": 0.0008776521360894127, + "loss": 0.85469913, + "num_input_tokens_seen": 108114144, + "router_z_loss_mlp": 0.45092773, + "step": 1303, + "time_per_iteration": 2.5878565311431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103671, + "balance_loss_mlp": 1.08784008, + "epoch": 0.25086571758368603, + "flos": 1473897295872.0, + "grad_norm": 0.0317480068151838, + "language_loss": 0.78962064, + "learning_rate": 0.0008774478859744984, + "loss": 0.80065739, + "num_input_tokens_seen": 108338720, + "router_z_loss_mlp": 0.15820312, + "step": 1304, + "time_per_iteration": 4.7717835903167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116688, + "balance_loss_mlp": 1.12220049, + "epoch": 0.2510580992689496, + "flos": 528382185984.0, + "grad_norm": 0.05690422496958516, + "language_loss": 0.90951985, + "learning_rate": 0.0008772434893213186, + "loss": 0.92118865, + "num_input_tokens_seen": 108405456, + "router_z_loss_mlp": 0.44702148, + "step": 1305, + "time_per_iteration": 2.604490280151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160948, + "balance_loss_mlp": 1.11405063, + "epoch": 0.25125048095421315, + "flos": 517446309888.0, + "grad_norm": 0.058263181320018995, + "language_loss": 0.85050523, + "learning_rate": 0.0008770389462092276, + "loss": 0.86211473, + "num_input_tokens_seen": 108474368, + "router_z_loss_mlp": 0.46875, + "step": 1306, + "time_per_iteration": 2.6470468044281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011567, + "balance_loss_mlp": 1.1099937, + "epoch": 0.25144286263947674, + "flos": 620462177280.0, + "grad_norm": 0.058464254330546805, + "language_loss": 0.87023067, + "learning_rate": 0.0008768342567176357, + "loss": 0.88179767, + "num_input_tokens_seen": 108548864, + "router_z_loss_mlp": 0.46704102, + "step": 1307, + "time_per_iteration": 2.8168630599975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155047, + "balance_loss_mlp": 1.10753012, + "epoch": 0.25163524432474027, + "flos": 503799865344.0, + "grad_norm": 0.05479935706331158, + "language_loss": 0.90999937, + "learning_rate": 0.0008766294209260107, + "loss": 0.9215498, + "num_input_tokens_seen": 108623072, + "router_z_loss_mlp": 0.4753418, + "step": 1308, + "time_per_iteration": 2.721531629562378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144469, + "balance_loss_mlp": 1.09704781, + "epoch": 0.25182762601000386, + "flos": 509072698368.0, + "grad_norm": 0.06755027454964987, + "language_loss": 0.91936618, + "learning_rate": 0.0008764244389138767, + "loss": 0.93081093, + "num_input_tokens_seen": 108690128, + "router_z_loss_mlp": 0.47436523, + "step": 1309, + "time_per_iteration": 2.574913263320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146846, + "balance_loss_mlp": 1.10061693, + "epoch": 0.2520200076952674, + "flos": 633896077824.0, + "grad_norm": 0.09614568206927013, + "language_loss": 0.82912982, + "learning_rate": 0.000876219310760815, + "loss": 0.84059829, + "num_input_tokens_seen": 108770272, + "router_z_loss_mlp": 0.46240234, + "step": 1310, + "time_per_iteration": 2.8861234188079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140262, + "balance_loss_mlp": 1.09419942, + "epoch": 0.252212389380531, + "flos": 494638119936.0, + "grad_norm": 0.07943381545238665, + "language_loss": 0.82026285, + "learning_rate": 0.0008760140365464631, + "loss": 0.83166546, + "num_input_tokens_seen": 108840592, + "router_z_loss_mlp": 0.46020508, + "step": 1311, + "time_per_iteration": 2.615981340408325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157686, + "balance_loss_mlp": 1.11212397, + "epoch": 0.2524047710657945, + "flos": 490544004096.0, + "grad_norm": 0.0923524312347507, + "language_loss": 0.8768574, + "learning_rate": 0.0008758086163505156, + "loss": 0.88843429, + "num_input_tokens_seen": 108910064, + "router_z_loss_mlp": 0.45532227, + "step": 1312, + "time_per_iteration": 2.6723434925079346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144164, + "balance_loss_mlp": 1.09872115, + "epoch": 0.2525971527510581, + "flos": 647431294464.0, + "grad_norm": 0.06443576206069311, + "language_loss": 0.90026277, + "learning_rate": 0.0008756030502527239, + "loss": 0.91170442, + "num_input_tokens_seen": 108986336, + "router_z_loss_mlp": 0.45458984, + "step": 1313, + "time_per_iteration": 2.841367721557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114771, + "balance_loss_mlp": 1.10291111, + "epoch": 0.2527895344363217, + "flos": 569266818048.0, + "grad_norm": 0.057466156915965357, + "language_loss": 0.90976274, + "learning_rate": 0.0008753973383328954, + "loss": 0.92123979, + "num_input_tokens_seen": 109059712, + "router_z_loss_mlp": 0.44824219, + "step": 1314, + "time_per_iteration": 2.7198092937469482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135642, + "balance_loss_mlp": 1.08912706, + "epoch": 0.2529819161215852, + "flos": 514048923648.0, + "grad_norm": 0.0651730634150067, + "language_loss": 0.84640622, + "learning_rate": 0.0008751914806708952, + "loss": 0.85776269, + "num_input_tokens_seen": 109127504, + "router_z_loss_mlp": 0.46508789, + "step": 1315, + "time_per_iteration": 2.619739532470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138249, + "balance_loss_mlp": 1.0955956, + "epoch": 0.2531742978068488, + "flos": 531253168128.0, + "grad_norm": 0.06535523514746128, + "language_loss": 0.82706141, + "learning_rate": 0.0008749854773466439, + "loss": 0.83844388, + "num_input_tokens_seen": 109198080, + "router_z_loss_mlp": 0.42700195, + "step": 1316, + "time_per_iteration": 2.6750850677490234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126734, + "balance_loss_mlp": 1.08594072, + "epoch": 0.25336667949211233, + "flos": 596638628352.0, + "grad_norm": 0.07438570972797282, + "language_loss": 0.85103095, + "learning_rate": 0.0008747793284401192, + "loss": 0.86229837, + "num_input_tokens_seen": 109268368, + "router_z_loss_mlp": 0.40771484, + "step": 1317, + "time_per_iteration": 2.667684316635132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127851, + "balance_loss_mlp": 1.08231306, + "epoch": 0.2535590611773759, + "flos": 602061963264.0, + "grad_norm": 0.06662830476911753, + "language_loss": 0.8637262, + "learning_rate": 0.0008745730340313551, + "loss": 0.87500465, + "num_input_tokens_seen": 109344112, + "router_z_loss_mlp": 0.45532227, + "step": 1318, + "time_per_iteration": 2.783167839050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126523, + "balance_loss_mlp": 1.08298802, + "epoch": 0.25375144286263945, + "flos": 495327508992.0, + "grad_norm": 0.06014849970215255, + "language_loss": 0.84828806, + "learning_rate": 0.0008743665942004422, + "loss": 0.85955328, + "num_input_tokens_seen": 109414112, + "router_z_loss_mlp": 0.43554688, + "step": 1319, + "time_per_iteration": 2.6454880237579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128022, + "balance_loss_mlp": 1.08334279, + "epoch": 0.25394382454790304, + "flos": 512470084608.0, + "grad_norm": 0.10116204644494126, + "language_loss": 0.93301231, + "learning_rate": 0.0008741600090275277, + "loss": 0.94429255, + "num_input_tokens_seen": 109484336, + "router_z_loss_mlp": 0.44702148, + "step": 1320, + "time_per_iteration": 2.565373182296753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112488, + "balance_loss_mlp": 1.07884121, + "epoch": 0.25413620623316663, + "flos": 959038589952.0, + "grad_norm": 0.06655436432492466, + "language_loss": 0.84446663, + "learning_rate": 0.0008739532785928151, + "loss": 0.85571539, + "num_input_tokens_seen": 109590128, + "router_z_loss_mlp": 0.45996094, + "step": 1321, + "time_per_iteration": 3.479727268218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080328, + "balance_loss_mlp": 1.06325758, + "epoch": 0.25432858791843016, + "flos": 1577283922944.0, + "grad_norm": 0.0281051137535917, + "language_loss": 0.74893582, + "learning_rate": 0.0008737464029765639, + "loss": 0.7597391, + "num_input_tokens_seen": 109816592, + "router_z_loss_mlp": 0.17089844, + "step": 1322, + "time_per_iteration": 4.7930076122283936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136178, + "balance_loss_mlp": 1.08921003, + "epoch": 0.25452096960369375, + "flos": 583802712576.0, + "grad_norm": 0.06285601142266005, + "language_loss": 0.83366752, + "learning_rate": 0.0008735393822590908, + "loss": 0.84502923, + "num_input_tokens_seen": 109890464, + "router_z_loss_mlp": 0.46923828, + "step": 1323, + "time_per_iteration": 2.672137498855591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145864, + "balance_loss_mlp": 1.10192394, + "epoch": 0.2547133512889573, + "flos": 508603193856.0, + "grad_norm": 0.05471127015298985, + "language_loss": 0.8775813, + "learning_rate": 0.0008733322165207681, + "loss": 0.88903993, + "num_input_tokens_seen": 109963408, + "router_z_loss_mlp": 0.43969727, + "step": 1324, + "time_per_iteration": 2.6422736644744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157775, + "balance_loss_mlp": 1.11292815, + "epoch": 0.25490573297422087, + "flos": 782619729408.0, + "grad_norm": 0.058409122955484685, + "language_loss": 0.83687508, + "learning_rate": 0.0008731249058420247, + "loss": 0.84845281, + "num_input_tokens_seen": 110048800, + "router_z_loss_mlp": 0.44824219, + "step": 1325, + "time_per_iteration": 3.02577805519104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165947, + "balance_loss_mlp": 1.11995602, + "epoch": 0.2550981146594844, + "flos": 509878084608.0, + "grad_norm": 0.0843662219595253, + "language_loss": 0.90814316, + "learning_rate": 0.0008729174503033459, + "loss": 0.91980267, + "num_input_tokens_seen": 110118096, + "router_z_loss_mlp": 0.45947266, + "step": 1326, + "time_per_iteration": 2.700956344604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160817, + "balance_loss_mlp": 1.11418188, + "epoch": 0.255290496344748, + "flos": 676673409024.0, + "grad_norm": 0.07395752020353057, + "language_loss": 0.83274329, + "learning_rate": 0.0008727098499852728, + "loss": 0.84435147, + "num_input_tokens_seen": 110190160, + "router_z_loss_mlp": 0.46630859, + "step": 1327, + "time_per_iteration": 2.8289363384246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138805, + "balance_loss_mlp": 1.0946734, + "epoch": 0.2554828780300115, + "flos": 537815572992.0, + "grad_norm": 0.05433597882612883, + "language_loss": 0.90389377, + "learning_rate": 0.0008725021049684034, + "loss": 0.91528177, + "num_input_tokens_seen": 110268000, + "router_z_loss_mlp": 0.44165039, + "step": 1328, + "time_per_iteration": 2.766871452331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134733, + "balance_loss_mlp": 1.09057808, + "epoch": 0.2556752597152751, + "flos": 824186409984.0, + "grad_norm": 0.04999939134312536, + "language_loss": 0.83732843, + "learning_rate": 0.000872294215333391, + "loss": 0.84867573, + "num_input_tokens_seen": 110354816, + "router_z_loss_mlp": 0.44165039, + "step": 1329, + "time_per_iteration": 3.181687116622925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133543, + "balance_loss_mlp": 1.08941174, + "epoch": 0.2558676414005387, + "flos": 570791328768.0, + "grad_norm": 0.053270875218317436, + "language_loss": 0.83338815, + "learning_rate": 0.0008720861811609457, + "loss": 0.84472358, + "num_input_tokens_seen": 110427968, + "router_z_loss_mlp": 0.44140625, + "step": 1330, + "time_per_iteration": 2.753095865249634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139869, + "balance_loss_mlp": 1.09282851, + "epoch": 0.2560600230858022, + "flos": 486684453888.0, + "grad_norm": 0.0744958299593676, + "language_loss": 0.83801699, + "learning_rate": 0.0008718780025318338, + "loss": 0.84941566, + "num_input_tokens_seen": 110501184, + "router_z_loss_mlp": 0.4699707, + "step": 1331, + "time_per_iteration": 2.74076771736145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141571, + "balance_loss_mlp": 1.09913218, + "epoch": 0.2562524047710658, + "flos": 513122397696.0, + "grad_norm": 0.06658506014654758, + "language_loss": 0.84681445, + "learning_rate": 0.0008716696795268771, + "loss": 0.85823017, + "num_input_tokens_seen": 110573008, + "router_z_loss_mlp": 0.42456055, + "step": 1332, + "time_per_iteration": 2.6771953105926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141914, + "balance_loss_mlp": 1.09718704, + "epoch": 0.25644478645632934, + "flos": 634820032512.0, + "grad_norm": 0.06458865940403113, + "language_loss": 0.86108088, + "learning_rate": 0.0008714612122269538, + "loss": 0.87250006, + "num_input_tokens_seen": 110646704, + "router_z_loss_mlp": 0.44750977, + "step": 1333, + "time_per_iteration": 2.872405767440796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145867, + "balance_loss_mlp": 1.09944701, + "epoch": 0.25663716814159293, + "flos": 436591088640.0, + "grad_norm": 0.06078246423813374, + "language_loss": 0.89285004, + "learning_rate": 0.0008712526007129982, + "loss": 0.90430868, + "num_input_tokens_seen": 110712208, + "router_z_loss_mlp": 0.46411133, + "step": 1334, + "time_per_iteration": 2.575467586517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148778, + "balance_loss_mlp": 1.10517156, + "epoch": 0.25682954982685646, + "flos": 498161415168.0, + "grad_norm": 0.06822349657501799, + "language_loss": 0.91275418, + "learning_rate": 0.0008710438450660003, + "loss": 0.92424202, + "num_input_tokens_seen": 110783936, + "router_z_loss_mlp": 0.43603516, + "step": 1335, + "time_per_iteration": 2.6461987495422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149209, + "balance_loss_mlp": 1.10157323, + "epoch": 0.25702193151212005, + "flos": 457701871104.0, + "grad_norm": 0.08158488021096956, + "language_loss": 0.88278055, + "learning_rate": 0.0008708349453670064, + "loss": 0.89427269, + "num_input_tokens_seen": 110848560, + "router_z_loss_mlp": 0.47583008, + "step": 1336, + "time_per_iteration": 2.5001657009124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128588, + "balance_loss_mlp": 1.08297849, + "epoch": 0.2572143131973836, + "flos": 598281707520.0, + "grad_norm": 0.0603403973753485, + "language_loss": 0.91654134, + "learning_rate": 0.0008706259016971185, + "loss": 0.92782724, + "num_input_tokens_seen": 110922672, + "router_z_loss_mlp": 0.45629883, + "step": 1337, + "time_per_iteration": 2.817657947540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127771, + "balance_loss_mlp": 1.07865644, + "epoch": 0.25740669488264717, + "flos": 698308024320.0, + "grad_norm": 0.08421665296665147, + "language_loss": 0.83723027, + "learning_rate": 0.0008704167141374944, + "loss": 0.848508, + "num_input_tokens_seen": 110995456, + "router_z_loss_mlp": 0.49145508, + "step": 1338, + "time_per_iteration": 2.808487892150879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128146, + "balance_loss_mlp": 1.08003271, + "epoch": 0.25759907656791076, + "flos": 502379241984.0, + "grad_norm": 0.05813050369368248, + "language_loss": 0.88781357, + "learning_rate": 0.0008702073827693482, + "loss": 0.89909494, + "num_input_tokens_seen": 111069568, + "router_z_loss_mlp": 0.48144531, + "step": 1339, + "time_per_iteration": 2.687836170196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131918, + "balance_loss_mlp": 1.08711886, + "epoch": 0.2577914582531743, + "flos": 773880500736.0, + "grad_norm": 0.05714278292432699, + "language_loss": 0.89388514, + "learning_rate": 0.0008699979076739494, + "loss": 0.9052043, + "num_input_tokens_seen": 111142608, + "router_z_loss_mlp": 0.44799805, + "step": 1340, + "time_per_iteration": 2.9907524585723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157888, + "balance_loss_mlp": 1.11089551, + "epoch": 0.2579838399384379, + "flos": 459666150912.0, + "grad_norm": 0.06321899043923618, + "language_loss": 0.8949765, + "learning_rate": 0.0008697882889326234, + "loss": 0.90655541, + "num_input_tokens_seen": 111206336, + "router_z_loss_mlp": 0.4699707, + "step": 1341, + "time_per_iteration": 2.5261731147766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182653, + "balance_loss_mlp": 1.13513625, + "epoch": 0.2581762216237014, + "flos": 569185325568.0, + "grad_norm": 0.06545350512623192, + "language_loss": 0.87013066, + "learning_rate": 0.0008695785266267515, + "loss": 0.88195717, + "num_input_tokens_seen": 111276736, + "router_z_loss_mlp": 0.4753418, + "step": 1342, + "time_per_iteration": 2.719949722290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194656, + "balance_loss_mlp": 1.14585173, + "epoch": 0.258368603308965, + "flos": 604201711104.0, + "grad_norm": 0.07227104516109029, + "language_loss": 0.8379634, + "learning_rate": 0.0008693686208377704, + "loss": 0.84991002, + "num_input_tokens_seen": 111353856, + "router_z_loss_mlp": 0.48828125, + "step": 1343, + "time_per_iteration": 2.789046049118042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011909, + "balance_loss_mlp": 1.14572012, + "epoch": 0.2585609849942285, + "flos": 491460618240.0, + "grad_norm": 0.08291144049116697, + "language_loss": 0.89388204, + "learning_rate": 0.0008691585716471733, + "loss": 0.90579104, + "num_input_tokens_seen": 111424960, + "router_z_loss_mlp": 0.45214844, + "step": 1344, + "time_per_iteration": 2.63281512260437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182868, + "balance_loss_mlp": 1.1348505, + "epoch": 0.2587533666794921, + "flos": 640755090432.0, + "grad_norm": 0.05462335243620436, + "language_loss": 0.86349607, + "learning_rate": 0.0008689483791365079, + "loss": 0.87532479, + "num_input_tokens_seen": 111505248, + "router_z_loss_mlp": 0.48022461, + "step": 1345, + "time_per_iteration": 2.8293464183807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165648, + "balance_loss_mlp": 1.11879873, + "epoch": 0.2589457483647557, + "flos": 576849724416.0, + "grad_norm": 0.060641418043912716, + "language_loss": 0.89744675, + "learning_rate": 0.0008687380433873786, + "loss": 0.90910327, + "num_input_tokens_seen": 111581936, + "router_z_loss_mlp": 0.46875, + "step": 1346, + "time_per_iteration": 2.757361650466919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150314, + "balance_loss_mlp": 1.100389, + "epoch": 0.25913813005001923, + "flos": 535424007168.0, + "grad_norm": 0.0738804898683007, + "language_loss": 0.83070856, + "learning_rate": 0.0008685275644814448, + "loss": 0.84221172, + "num_input_tokens_seen": 111651456, + "router_z_loss_mlp": 0.49926758, + "step": 1347, + "time_per_iteration": 2.716006278991699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147842, + "balance_loss_mlp": 1.10087395, + "epoch": 0.2593305117352828, + "flos": 721039491072.0, + "grad_norm": 0.07544817120788133, + "language_loss": 0.85244781, + "learning_rate": 0.0008683169425004216, + "loss": 0.86392623, + "num_input_tokens_seen": 111731712, + "router_z_loss_mlp": 0.46972656, + "step": 1348, + "time_per_iteration": 2.900754451751709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114914, + "balance_loss_mlp": 1.09842825, + "epoch": 0.25952289342054635, + "flos": 710096274432.0, + "grad_norm": 0.08404854247051008, + "language_loss": 0.83688962, + "learning_rate": 0.0008681061775260799, + "loss": 0.84838104, + "num_input_tokens_seen": 111800752, + "router_z_loss_mlp": 0.50708008, + "step": 1349, + "time_per_iteration": 2.8356235027313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140271, + "balance_loss_mlp": 1.09356534, + "epoch": 0.25971527510580994, + "flos": 455920399872.0, + "grad_norm": 0.08196022848482862, + "language_loss": 0.92983842, + "learning_rate": 0.0008678952696402458, + "loss": 0.94124115, + "num_input_tokens_seen": 111866752, + "router_z_loss_mlp": 0.46704102, + "step": 1350, + "time_per_iteration": 2.5051889419555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132188, + "balance_loss_mlp": 1.0865308, + "epoch": 0.25990765679107347, + "flos": 612528334848.0, + "grad_norm": 0.052642437263987304, + "language_loss": 0.86759204, + "learning_rate": 0.000867684218924801, + "loss": 0.87891388, + "num_input_tokens_seen": 111951328, + "router_z_loss_mlp": 0.45629883, + "step": 1351, + "time_per_iteration": 2.8635144233703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089623, + "balance_loss_mlp": 1.0725522, + "epoch": 0.26010003847633706, + "flos": 1537963075584.0, + "grad_norm": 0.04013302579778462, + "language_loss": 0.78947091, + "learning_rate": 0.0008674730254616827, + "loss": 0.80036712, + "num_input_tokens_seen": 112182272, + "router_z_loss_mlp": 0.17089844, + "step": 1352, + "time_per_iteration": 4.89817476272583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121624, + "balance_loss_mlp": 1.07587171, + "epoch": 0.2602924201616006, + "flos": 716265897984.0, + "grad_norm": 0.055845692832442596, + "language_loss": 0.85694808, + "learning_rate": 0.0008672616893328834, + "loss": 0.8681643, + "num_input_tokens_seen": 112261760, + "router_z_loss_mlp": 0.45751953, + "step": 1353, + "time_per_iteration": 2.9335103034973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123767, + "balance_loss_mlp": 1.07877684, + "epoch": 0.2604848018468642, + "flos": 643529899008.0, + "grad_norm": 0.07010977425409264, + "language_loss": 0.9082427, + "learning_rate": 0.0008670502106204512, + "loss": 0.91948032, + "num_input_tokens_seen": 112339136, + "router_z_loss_mlp": 0.44970703, + "step": 1354, + "time_per_iteration": 2.8469178676605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138616, + "balance_loss_mlp": 1.08840501, + "epoch": 0.26067718353212777, + "flos": 517033704960.0, + "grad_norm": 0.056353527093492256, + "language_loss": 0.82360619, + "learning_rate": 0.0008668385894064892, + "loss": 0.83499235, + "num_input_tokens_seen": 112409872, + "router_z_loss_mlp": 0.50195312, + "step": 1355, + "time_per_iteration": 2.672883987426758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149756, + "balance_loss_mlp": 1.10321617, + "epoch": 0.2608695652173913, + "flos": 822733479936.0, + "grad_norm": 0.05383030346289838, + "language_loss": 0.89593899, + "learning_rate": 0.0008666268257731562, + "loss": 0.90743661, + "num_input_tokens_seen": 112495616, + "router_z_loss_mlp": 0.46557617, + "step": 1356, + "time_per_iteration": 3.1050939559936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169178, + "balance_loss_mlp": 1.12127948, + "epoch": 0.2610619469026549, + "flos": 1007850097152.0, + "grad_norm": 0.05849819020383372, + "language_loss": 0.85968256, + "learning_rate": 0.0008664149198026662, + "loss": 0.87137431, + "num_input_tokens_seen": 112575168, + "router_z_loss_mlp": 0.47900391, + "step": 1357, + "time_per_iteration": 3.226966619491577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156465, + "balance_loss_mlp": 1.10932934, + "epoch": 0.2612543285879184, + "flos": 536782961664.0, + "grad_norm": 0.07293583935871151, + "language_loss": 0.89518476, + "learning_rate": 0.0008662028715772883, + "loss": 0.90674949, + "num_input_tokens_seen": 112648480, + "router_z_loss_mlp": 0.47143555, + "step": 1358, + "time_per_iteration": 2.5949370861053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163078, + "balance_loss_mlp": 1.11718237, + "epoch": 0.261446710273182, + "flos": 519420501504.0, + "grad_norm": 0.05890556701012809, + "language_loss": 0.86217821, + "learning_rate": 0.0008659906811793467, + "loss": 0.87380904, + "num_input_tokens_seen": 112719856, + "router_z_loss_mlp": 0.45898438, + "step": 1359, + "time_per_iteration": 2.651193857192993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151481, + "balance_loss_mlp": 1.10699224, + "epoch": 0.26163909195844554, + "flos": 583259056128.0, + "grad_norm": 0.06298146111957026, + "language_loss": 0.90418088, + "learning_rate": 0.0008657783486912215, + "loss": 0.91569573, + "num_input_tokens_seen": 112795088, + "router_z_loss_mlp": 0.44482422, + "step": 1360, + "time_per_iteration": 2.723550319671631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156338, + "balance_loss_mlp": 1.11022782, + "epoch": 0.2618314736437091, + "flos": 958762179072.0, + "grad_norm": 0.055299708084911615, + "language_loss": 0.90110713, + "learning_rate": 0.0008655658741953472, + "loss": 0.91267049, + "num_input_tokens_seen": 112879888, + "router_z_loss_mlp": 0.4609375, + "step": 1361, + "time_per_iteration": 3.216830015182495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139946, + "balance_loss_mlp": 1.09564757, + "epoch": 0.26202385532897265, + "flos": 574803952128.0, + "grad_norm": 0.04868556149108388, + "language_loss": 0.89168048, + "learning_rate": 0.0008653532577742136, + "loss": 0.90307987, + "num_input_tokens_seen": 112952208, + "router_z_loss_mlp": 0.44311523, + "step": 1362, + "time_per_iteration": 2.718886375427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143457, + "balance_loss_mlp": 1.0986346, + "epoch": 0.26221623701423624, + "flos": 445471280640.0, + "grad_norm": 0.058057923999792295, + "language_loss": 0.87558335, + "learning_rate": 0.0008651404995103659, + "loss": 0.88701797, + "num_input_tokens_seen": 113017472, + "router_z_loss_mlp": 0.44824219, + "step": 1363, + "time_per_iteration": 2.594294309616089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137373, + "balance_loss_mlp": 1.09338474, + "epoch": 0.26240861869949983, + "flos": 535718043648.0, + "grad_norm": 0.06330728330165165, + "language_loss": 0.87334514, + "learning_rate": 0.0008649275994864041, + "loss": 0.88471884, + "num_input_tokens_seen": 113090000, + "router_z_loss_mlp": 0.43994141, + "step": 1364, + "time_per_iteration": 2.707449197769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144999, + "balance_loss_mlp": 1.09879303, + "epoch": 0.26260100038476336, + "flos": 565249052160.0, + "grad_norm": 0.05276541609050752, + "language_loss": 0.84391934, + "learning_rate": 0.0008647145577849834, + "loss": 0.85536933, + "num_input_tokens_seen": 113169424, + "router_z_loss_mlp": 0.46191406, + "step": 1365, + "time_per_iteration": 2.8216350078582764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131283, + "balance_loss_mlp": 1.08560157, + "epoch": 0.26279338207002695, + "flos": 613059508224.0, + "grad_norm": 0.05376997595185902, + "language_loss": 0.83317888, + "learning_rate": 0.0008645013744888139, + "loss": 0.84449172, + "num_input_tokens_seen": 113256752, + "router_z_loss_mlp": 0.45678711, + "step": 1366, + "time_per_iteration": 2.866891622543335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149616, + "balance_loss_mlp": 1.10536587, + "epoch": 0.2629857637552905, + "flos": 522832568832.0, + "grad_norm": 0.06316724717597957, + "language_loss": 0.87992281, + "learning_rate": 0.0008642880496806607, + "loss": 0.89141893, + "num_input_tokens_seen": 113330512, + "router_z_loss_mlp": 0.44287109, + "step": 1367, + "time_per_iteration": 2.7763173580169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142909, + "balance_loss_mlp": 1.09772861, + "epoch": 0.26317814544055407, + "flos": 534549238272.0, + "grad_norm": 0.05877759558608074, + "language_loss": 0.84959197, + "learning_rate": 0.0008640745834433437, + "loss": 0.86102104, + "num_input_tokens_seen": 113409088, + "router_z_loss_mlp": 0.4519043, + "step": 1368, + "time_per_iteration": 2.738328218460083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134336, + "balance_loss_mlp": 1.09018087, + "epoch": 0.2633705271258176, + "flos": 555543650304.0, + "grad_norm": 0.05935956886320276, + "language_loss": 0.87054664, + "learning_rate": 0.000863860975859738, + "loss": 0.88189, + "num_input_tokens_seen": 113486624, + "router_z_loss_mlp": 0.44165039, + "step": 1369, + "time_per_iteration": 2.9206831455230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131372, + "balance_loss_mlp": 1.0855242, + "epoch": 0.2635629088110812, + "flos": 552401026560.0, + "grad_norm": 0.06691392922801855, + "language_loss": 0.88684422, + "learning_rate": 0.0008636472270127733, + "loss": 0.89815795, + "num_input_tokens_seen": 113555776, + "router_z_loss_mlp": 0.45825195, + "step": 1370, + "time_per_iteration": 2.6078739166259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116186, + "balance_loss_mlp": 1.07021928, + "epoch": 0.2637552904963448, + "flos": 455984640000.0, + "grad_norm": 0.06515524250359679, + "language_loss": 0.90367895, + "learning_rate": 0.0008634333369854345, + "loss": 0.91484082, + "num_input_tokens_seen": 113624208, + "router_z_loss_mlp": 0.45947266, + "step": 1371, + "time_per_iteration": 2.6001384258270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110327, + "balance_loss_mlp": 1.0667206, + "epoch": 0.2639476721816083, + "flos": 613128890880.0, + "grad_norm": 0.056061894150206536, + "language_loss": 0.87892628, + "learning_rate": 0.0008632193058607608, + "loss": 0.89002955, + "num_input_tokens_seen": 113698544, + "router_z_loss_mlp": 0.43554688, + "step": 1372, + "time_per_iteration": 2.711435317993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113538, + "balance_loss_mlp": 1.06628299, + "epoch": 0.2641400538668719, + "flos": 571920486912.0, + "grad_norm": 0.060513983317086996, + "language_loss": 0.81023312, + "learning_rate": 0.0008630051337218466, + "loss": 0.82136846, + "num_input_tokens_seen": 113769024, + "router_z_loss_mlp": 0.47314453, + "step": 1373, + "time_per_iteration": 2.656416893005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110026, + "balance_loss_mlp": 1.0668484, + "epoch": 0.2643324355521354, + "flos": 582251037696.0, + "grad_norm": 0.0689512550651149, + "language_loss": 0.82808203, + "learning_rate": 0.0008627908206518409, + "loss": 0.83918226, + "num_input_tokens_seen": 113836320, + "router_z_loss_mlp": 0.43188477, + "step": 1374, + "time_per_iteration": 2.673738956451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039854, + "balance_loss_mlp": 1.02716982, + "epoch": 0.264524817237399, + "flos": 1544678926848.0, + "grad_norm": 0.01820003864645097, + "language_loss": 0.75151253, + "learning_rate": 0.0008625763667339472, + "loss": 0.76191109, + "num_input_tokens_seen": 114065040, + "router_z_loss_mlp": 0.12695312, + "step": 1375, + "time_per_iteration": 5.317140817642212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115308, + "balance_loss_mlp": 1.07272696, + "epoch": 0.26471719892266254, + "flos": 518034382848.0, + "grad_norm": 0.062338636090573274, + "language_loss": 0.91769958, + "learning_rate": 0.0008623617720514241, + "loss": 0.92885268, + "num_input_tokens_seen": 114133488, + "router_z_loss_mlp": 0.42578125, + "step": 1376, + "time_per_iteration": 2.666618585586548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117829, + "balance_loss_mlp": 1.07255304, + "epoch": 0.26490958060792613, + "flos": 517189349376.0, + "grad_norm": 0.08321054400070194, + "language_loss": 0.85169828, + "learning_rate": 0.0008621470366875848, + "loss": 0.86287659, + "num_input_tokens_seen": 114200704, + "router_z_loss_mlp": 0.45288086, + "step": 1377, + "time_per_iteration": 2.5939900875091553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011137, + "balance_loss_mlp": 1.0724293, + "epoch": 0.26510196229318966, + "flos": 596574388224.0, + "grad_norm": 0.0756812485553519, + "language_loss": 0.88528687, + "learning_rate": 0.0008619321607257966, + "loss": 0.89642382, + "num_input_tokens_seen": 114272160, + "router_z_loss_mlp": 0.41259766, + "step": 1378, + "time_per_iteration": 2.675719976425171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112322, + "balance_loss_mlp": 1.08109117, + "epoch": 0.26529434397845325, + "flos": 685800649728.0, + "grad_norm": 0.05967522341676015, + "language_loss": 0.8244732, + "learning_rate": 0.000861717144249482, + "loss": 0.8357054, + "num_input_tokens_seen": 114347904, + "router_z_loss_mlp": 0.42138672, + "step": 1379, + "time_per_iteration": 2.8289949893951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132354, + "balance_loss_mlp": 1.09170318, + "epoch": 0.26548672566371684, + "flos": 424353157632.0, + "grad_norm": 0.06486885922060631, + "language_loss": 0.90334523, + "learning_rate": 0.0008615019873421175, + "loss": 0.91466868, + "num_input_tokens_seen": 114409952, + "router_z_loss_mlp": 0.40649414, + "step": 1380, + "time_per_iteration": 2.4665510654449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141805, + "balance_loss_mlp": 1.09798408, + "epoch": 0.26567910734898037, + "flos": 489864526848.0, + "grad_norm": 0.06471812563896691, + "language_loss": 0.86262017, + "learning_rate": 0.0008612866900872349, + "loss": 0.87403822, + "num_input_tokens_seen": 114474832, + "router_z_loss_mlp": 0.43823242, + "step": 1381, + "time_per_iteration": 2.553489923477173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140972, + "balance_loss_mlp": 1.10017824, + "epoch": 0.26587148903424396, + "flos": 534203444736.0, + "grad_norm": 0.07006288293307902, + "language_loss": 0.88817614, + "learning_rate": 0.0008610712525684197, + "loss": 0.89958596, + "num_input_tokens_seen": 114545152, + "router_z_loss_mlp": 0.40771484, + "step": 1382, + "time_per_iteration": 2.623844861984253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156525, + "balance_loss_mlp": 1.11341906, + "epoch": 0.2660638707195075, + "flos": 1017464094720.0, + "grad_norm": 0.06690376769295572, + "language_loss": 0.85084939, + "learning_rate": 0.0008608556748693121, + "loss": 0.8624146, + "num_input_tokens_seen": 114626512, + "router_z_loss_mlp": 0.43115234, + "step": 1383, + "time_per_iteration": 3.248947858810425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149603, + "balance_loss_mlp": 1.10549557, + "epoch": 0.2662562524047711, + "flos": 523981550592.0, + "grad_norm": 0.05893966497122096, + "language_loss": 0.86648834, + "learning_rate": 0.000860639957073607, + "loss": 0.8779844, + "num_input_tokens_seen": 114701008, + "router_z_loss_mlp": 0.44116211, + "step": 1384, + "time_per_iteration": 2.6954376697540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161137, + "balance_loss_mlp": 1.11838901, + "epoch": 0.2664486340900346, + "flos": 552381202944.0, + "grad_norm": 0.05777577847879513, + "language_loss": 0.88325369, + "learning_rate": 0.0008604240992650534, + "loss": 0.8948651, + "num_input_tokens_seen": 114771984, + "router_z_loss_mlp": 0.42749023, + "step": 1385, + "time_per_iteration": 2.6810553073883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116884, + "balance_loss_mlp": 1.12613928, + "epoch": 0.2666410157752982, + "flos": 470157115392.0, + "grad_norm": 0.1266990207417539, + "language_loss": 0.89650941, + "learning_rate": 0.0008602081015274545, + "loss": 0.90819776, + "num_input_tokens_seen": 114844800, + "router_z_loss_mlp": 0.42724609, + "step": 1386, + "time_per_iteration": 2.7079007625579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169207, + "balance_loss_mlp": 1.12602973, + "epoch": 0.2668333974605617, + "flos": 569919131136.0, + "grad_norm": 0.05666517988787923, + "language_loss": 0.83684492, + "learning_rate": 0.0008599919639446684, + "loss": 0.84853697, + "num_input_tokens_seen": 114918544, + "router_z_loss_mlp": 0.43139648, + "step": 1387, + "time_per_iteration": 2.67275333404541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184027, + "balance_loss_mlp": 1.13755894, + "epoch": 0.2670257791458253, + "flos": 398982703104.0, + "grad_norm": 0.06873806966805297, + "language_loss": 0.80686462, + "learning_rate": 0.000859775686600607, + "loss": 0.81870484, + "num_input_tokens_seen": 114984272, + "router_z_loss_mlp": 0.46459961, + "step": 1388, + "time_per_iteration": 2.568384885787964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192065, + "balance_loss_mlp": 1.14676547, + "epoch": 0.2672181608310889, + "flos": 515847647232.0, + "grad_norm": 0.07413400256287127, + "language_loss": 0.85524642, + "learning_rate": 0.0008595592695792367, + "loss": 0.86716712, + "num_input_tokens_seen": 115054800, + "router_z_loss_mlp": 0.453125, + "step": 1389, + "time_per_iteration": 2.6748523712158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182907, + "balance_loss_mlp": 1.13884759, + "epoch": 0.26741054251635243, + "flos": 507521023488.0, + "grad_norm": 0.06676524761439688, + "language_loss": 0.9117986, + "learning_rate": 0.0008593427129645778, + "loss": 0.92362767, + "num_input_tokens_seen": 115120928, + "router_z_loss_mlp": 0.44042969, + "step": 1390, + "time_per_iteration": 2.5506954193115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186114, + "balance_loss_mlp": 1.14205468, + "epoch": 0.267602924201616, + "flos": 576647092224.0, + "grad_norm": 0.056989477345309104, + "language_loss": 0.85532665, + "learning_rate": 0.0008591260168407052, + "loss": 0.86718786, + "num_input_tokens_seen": 115196688, + "router_z_loss_mlp": 0.44067383, + "step": 1391, + "time_per_iteration": 2.759000778198242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181583, + "balance_loss_mlp": 1.13714194, + "epoch": 0.26779530588687955, + "flos": 523984121856.0, + "grad_norm": 0.12230490659722075, + "language_loss": 0.83154678, + "learning_rate": 0.0008589091812917479, + "loss": 0.84336257, + "num_input_tokens_seen": 115264912, + "router_z_loss_mlp": 0.4440918, + "step": 1392, + "time_per_iteration": 2.6213910579681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183464, + "balance_loss_mlp": 1.14030981, + "epoch": 0.26798768757214314, + "flos": 556771926528.0, + "grad_norm": 0.07403824045185783, + "language_loss": 0.8547672, + "learning_rate": 0.0008586922064018887, + "loss": 0.86660182, + "num_input_tokens_seen": 115334672, + "router_z_loss_mlp": 0.43139648, + "step": 1393, + "time_per_iteration": 2.6706490516662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170846, + "balance_loss_mlp": 1.12375855, + "epoch": 0.2681800692574067, + "flos": 930614717952.0, + "grad_norm": 0.06891205333434622, + "language_loss": 0.89827204, + "learning_rate": 0.0008584750922553651, + "loss": 0.90998048, + "num_input_tokens_seen": 115420032, + "router_z_loss_mlp": 0.47021484, + "step": 1394, + "time_per_iteration": 3.1465976238250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164798, + "balance_loss_mlp": 1.1222403, + "epoch": 0.26837245094267026, + "flos": 701080261632.0, + "grad_norm": 0.06253124916771012, + "language_loss": 0.84102368, + "learning_rate": 0.0008582578389364677, + "loss": 0.85267168, + "num_input_tokens_seen": 115492576, + "router_z_loss_mlp": 0.42529297, + "step": 1395, + "time_per_iteration": 2.853278875350952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170721, + "balance_loss_mlp": 1.12573135, + "epoch": 0.26856483262793385, + "flos": 593191683072.0, + "grad_norm": 0.0656545534576685, + "language_loss": 0.92268932, + "learning_rate": 0.0008580404465295422, + "loss": 0.93439656, + "num_input_tokens_seen": 115568368, + "router_z_loss_mlp": 0.44970703, + "step": 1396, + "time_per_iteration": 2.773932695388794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152323, + "balance_loss_mlp": 1.10826349, + "epoch": 0.2687572143131974, + "flos": 714271882752.0, + "grad_norm": 0.07972324646927738, + "language_loss": 0.88789833, + "learning_rate": 0.0008578229151189876, + "loss": 0.89942157, + "num_input_tokens_seen": 115651536, + "router_z_loss_mlp": 0.44067383, + "step": 1397, + "time_per_iteration": 2.934276819229126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144571, + "balance_loss_mlp": 1.10151267, + "epoch": 0.26894959599846097, + "flos": 467718561792.0, + "grad_norm": 0.10010461149900847, + "language_loss": 0.8178823, + "learning_rate": 0.0008576052447892573, + "loss": 0.82932794, + "num_input_tokens_seen": 115715696, + "router_z_loss_mlp": 0.43115234, + "step": 1398, + "time_per_iteration": 2.5337071418762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131122, + "balance_loss_mlp": 1.08768189, + "epoch": 0.2691419776837245, + "flos": 468701987328.0, + "grad_norm": 0.07718983812215899, + "language_loss": 0.86768365, + "learning_rate": 0.000857387435624858, + "loss": 0.87899494, + "num_input_tokens_seen": 115780928, + "router_z_loss_mlp": 0.43457031, + "step": 1399, + "time_per_iteration": 2.5189273357391357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127749, + "balance_loss_mlp": 1.08404672, + "epoch": 0.2693343593689881, + "flos": 937651396608.0, + "grad_norm": 0.0707561541840249, + "language_loss": 0.88852745, + "learning_rate": 0.0008571694877103513, + "loss": 0.89980495, + "num_input_tokens_seen": 115874432, + "router_z_loss_mlp": 0.43701172, + "step": 1400, + "time_per_iteration": 3.287325859069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126684, + "balance_loss_mlp": 1.08372128, + "epoch": 0.2695267410542516, + "flos": 577600782336.0, + "grad_norm": 0.08476375879770352, + "language_loss": 0.88499445, + "learning_rate": 0.0008569514011303515, + "loss": 0.89626133, + "num_input_tokens_seen": 115956608, + "router_z_loss_mlp": 0.4296875, + "step": 1401, + "time_per_iteration": 2.849506378173828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120044, + "balance_loss_mlp": 1.07770109, + "epoch": 0.2697191227395152, + "flos": 556823683584.0, + "grad_norm": 0.12418270059874827, + "language_loss": 0.88531977, + "learning_rate": 0.0008567331759695277, + "loss": 0.89652026, + "num_input_tokens_seen": 116031728, + "router_z_loss_mlp": 0.42358398, + "step": 1402, + "time_per_iteration": 2.7033023834228516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119932, + "balance_loss_mlp": 1.07584798, + "epoch": 0.26991150442477874, + "flos": 529281547776.0, + "grad_norm": 0.09855769315853927, + "language_loss": 0.86756563, + "learning_rate": 0.0008565148123126023, + "loss": 0.87876499, + "num_input_tokens_seen": 116104288, + "router_z_loss_mlp": 0.44091797, + "step": 1403, + "time_per_iteration": 2.645425319671631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119876, + "balance_loss_mlp": 1.07769978, + "epoch": 0.2701038861100423, + "flos": 532006797312.0, + "grad_norm": 0.15226973878739974, + "language_loss": 0.86578166, + "learning_rate": 0.0008562963102443516, + "loss": 0.87698042, + "num_input_tokens_seen": 116177920, + "router_z_loss_mlp": 0.421875, + "step": 1404, + "time_per_iteration": 2.6965179443359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130222, + "balance_loss_mlp": 1.08668637, + "epoch": 0.2702962677953059, + "flos": 735227020800.0, + "grad_norm": 0.09156828725831004, + "language_loss": 0.85926664, + "learning_rate": 0.0008560776698496056, + "loss": 0.87056887, + "num_input_tokens_seen": 116251680, + "router_z_loss_mlp": 0.43530273, + "step": 1405, + "time_per_iteration": 2.868159532546997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141969, + "balance_loss_mlp": 1.09707534, + "epoch": 0.27048864948056944, + "flos": 574761733632.0, + "grad_norm": 0.07226677638641436, + "language_loss": 0.86433703, + "learning_rate": 0.0008558588912132481, + "loss": 0.87575674, + "num_input_tokens_seen": 116327664, + "router_z_loss_mlp": 0.44873047, + "step": 1406, + "time_per_iteration": 2.8309988975524902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066671, + "balance_loss_mlp": 1.05236614, + "epoch": 0.27068103116583303, + "flos": 1423853489664.0, + "grad_norm": 0.03207539465139433, + "language_loss": 0.76458991, + "learning_rate": 0.0008556399744202163, + "loss": 0.77525663, + "num_input_tokens_seen": 116555152, + "router_z_loss_mlp": 0.14257812, + "step": 1407, + "time_per_iteration": 4.926543235778809 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136459, + "balance_loss_mlp": 1.09220862, + "epoch": 0.27087341285109656, + "flos": 531999456768.0, + "grad_norm": 0.06146298960376288, + "language_loss": 0.83448923, + "learning_rate": 0.0008554209195555016, + "loss": 0.84585381, + "num_input_tokens_seen": 116626016, + "router_z_loss_mlp": 0.44287109, + "step": 1408, + "time_per_iteration": 2.6698648929595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136456, + "balance_loss_mlp": 1.08965421, + "epoch": 0.27106579453636015, + "flos": 581378840064.0, + "grad_norm": 0.1627330563817166, + "language_loss": 0.89102834, + "learning_rate": 0.0008552017267041483, + "loss": 0.90239286, + "num_input_tokens_seen": 116699152, + "router_z_loss_mlp": 0.46801758, + "step": 1409, + "time_per_iteration": 2.6957972049713135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127578, + "balance_loss_mlp": 1.08349395, + "epoch": 0.2712581762216237, + "flos": 506801899008.0, + "grad_norm": 0.06560812899143556, + "language_loss": 0.83656335, + "learning_rate": 0.0008549823959512549, + "loss": 0.84783912, + "num_input_tokens_seen": 116770912, + "router_z_loss_mlp": 0.44091797, + "step": 1410, + "time_per_iteration": 2.7068376541137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011101, + "balance_loss_mlp": 1.06708908, + "epoch": 0.27145055790688727, + "flos": 997442823168.0, + "grad_norm": 0.08175260567644033, + "language_loss": 0.87610555, + "learning_rate": 0.0008547629273819728, + "loss": 0.88720655, + "num_input_tokens_seen": 116863088, + "router_z_loss_mlp": 0.43041992, + "step": 1411, + "time_per_iteration": 3.366260290145874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108005, + "balance_loss_mlp": 1.06542349, + "epoch": 0.2716429395921508, + "flos": 546681083904.0, + "grad_norm": 0.10517352924457117, + "language_loss": 0.84009993, + "learning_rate": 0.0008545433210815074, + "loss": 0.85118002, + "num_input_tokens_seen": 116929504, + "router_z_loss_mlp": 0.42578125, + "step": 1412, + "time_per_iteration": 2.630105972290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112252, + "balance_loss_mlp": 1.07931852, + "epoch": 0.2718353212774144, + "flos": 573225113088.0, + "grad_norm": 0.09841738404648297, + "language_loss": 0.87974489, + "learning_rate": 0.0008543235771351176, + "loss": 0.89097011, + "num_input_tokens_seen": 117004064, + "router_z_loss_mlp": 0.43188477, + "step": 1413, + "time_per_iteration": 2.725048065185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129323, + "balance_loss_mlp": 1.08635998, + "epoch": 0.272027702962678, + "flos": 644305549824.0, + "grad_norm": 0.059677420125308425, + "language_loss": 0.84918916, + "learning_rate": 0.0008541036956281154, + "loss": 0.86048239, + "num_input_tokens_seen": 117081328, + "router_z_loss_mlp": 0.42993164, + "step": 1414, + "time_per_iteration": 2.897216796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133545, + "balance_loss_mlp": 1.08898425, + "epoch": 0.2722200846479415, + "flos": 653726827008.0, + "grad_norm": 0.08487151018546404, + "language_loss": 0.82919049, + "learning_rate": 0.0008538836766458665, + "loss": 0.84052598, + "num_input_tokens_seen": 117156544, + "router_z_loss_mlp": 0.44580078, + "step": 1415, + "time_per_iteration": 2.8930981159210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137425, + "balance_loss_mlp": 1.0942955, + "epoch": 0.2724124663332051, + "flos": 579631873536.0, + "grad_norm": 0.09871518143765563, + "language_loss": 0.85738099, + "learning_rate": 0.0008536635202737897, + "loss": 0.86875528, + "num_input_tokens_seen": 117230208, + "router_z_loss_mlp": 0.43164062, + "step": 1416, + "time_per_iteration": 2.7891178131103516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137299, + "balance_loss_mlp": 1.0931915, + "epoch": 0.2726048480184686, + "flos": 537435274752.0, + "grad_norm": 0.10766210404252562, + "language_loss": 0.82790214, + "learning_rate": 0.0008534432265973573, + "loss": 0.83927512, + "num_input_tokens_seen": 117298080, + "router_z_loss_mlp": 0.44091797, + "step": 1417, + "time_per_iteration": 2.6409006118774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141948, + "balance_loss_mlp": 1.09691095, + "epoch": 0.2727972297037322, + "flos": 995797172736.0, + "grad_norm": 0.07824380469589887, + "language_loss": 0.88708508, + "learning_rate": 0.000853222795702095, + "loss": 0.89850456, + "num_input_tokens_seen": 117396256, + "router_z_loss_mlp": 0.45092773, + "step": 1418, + "time_per_iteration": 3.4312241077423096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115343, + "balance_loss_mlp": 1.10767758, + "epoch": 0.27298961138899575, + "flos": 606205638144.0, + "grad_norm": 0.06262628073505326, + "language_loss": 0.84196067, + "learning_rate": 0.0008530022276735813, + "loss": 0.85349494, + "num_input_tokens_seen": 117467936, + "router_z_loss_mlp": 0.45727539, + "step": 1419, + "time_per_iteration": 2.742341995239258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169959, + "balance_loss_mlp": 1.12742519, + "epoch": 0.27318199307425933, + "flos": 529325964288.0, + "grad_norm": 0.07008703106338479, + "language_loss": 0.86301696, + "learning_rate": 0.0008527815225974489, + "loss": 0.87471658, + "num_input_tokens_seen": 117538256, + "router_z_loss_mlp": 0.42529297, + "step": 1420, + "time_per_iteration": 2.643151044845581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172801, + "balance_loss_mlp": 1.12731028, + "epoch": 0.2733743747595229, + "flos": 409029129216.0, + "grad_norm": 0.10800570533054084, + "language_loss": 0.88767672, + "learning_rate": 0.0008525606805593829, + "loss": 0.8994047, + "num_input_tokens_seen": 117599488, + "router_z_loss_mlp": 0.45483398, + "step": 1421, + "time_per_iteration": 2.4374186992645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115892, + "balance_loss_mlp": 1.11283422, + "epoch": 0.27356675644478645, + "flos": 516225747456.0, + "grad_norm": 0.11472023337789067, + "language_loss": 0.83181965, + "learning_rate": 0.0008523397016451213, + "loss": 0.84340894, + "num_input_tokens_seen": 117664240, + "router_z_loss_mlp": 0.46142578, + "step": 1422, + "time_per_iteration": 2.585376739501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152063, + "balance_loss_mlp": 1.10824132, + "epoch": 0.27375913813005004, + "flos": 1052342088192.0, + "grad_norm": 0.08784028487991961, + "language_loss": 0.87910116, + "learning_rate": 0.0008521185859404564, + "loss": 0.89062172, + "num_input_tokens_seen": 117754768, + "router_z_loss_mlp": 0.43847656, + "step": 1423, + "time_per_iteration": 3.399348020553589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150781, + "balance_loss_mlp": 1.10634017, + "epoch": 0.27395151981531357, + "flos": 624805913088.0, + "grad_norm": 0.06323160386311827, + "language_loss": 0.89755672, + "learning_rate": 0.0008518973335312326, + "loss": 0.90906453, + "num_input_tokens_seen": 117832816, + "router_z_loss_mlp": 0.44433594, + "step": 1424, + "time_per_iteration": 2.771397352218628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141797, + "balance_loss_mlp": 1.09628344, + "epoch": 0.27414390150057716, + "flos": 550372506624.0, + "grad_norm": 0.0741893947597381, + "language_loss": 0.83755773, + "learning_rate": 0.0008516759445033477, + "loss": 0.84897572, + "num_input_tokens_seen": 117899168, + "router_z_loss_mlp": 0.45532227, + "step": 1425, + "time_per_iteration": 2.623136520385742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148744, + "balance_loss_mlp": 1.10227656, + "epoch": 0.2743362831858407, + "flos": 539866487808.0, + "grad_norm": 0.08118081060083703, + "language_loss": 0.85448551, + "learning_rate": 0.0008514544189427526, + "loss": 0.865973, + "num_input_tokens_seen": 117972384, + "router_z_loss_mlp": 0.46484375, + "step": 1426, + "time_per_iteration": 2.695749044418335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156426, + "balance_loss_mlp": 1.11208034, + "epoch": 0.2745286648711043, + "flos": 468590759424.0, + "grad_norm": 0.0837156631450272, + "language_loss": 0.86976963, + "learning_rate": 0.0008512327569354511, + "loss": 0.88133389, + "num_input_tokens_seen": 118039584, + "router_z_loss_mlp": 0.44360352, + "step": 1427, + "time_per_iteration": 2.5354061126708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160051, + "balance_loss_mlp": 1.11353528, + "epoch": 0.2747210465563678, + "flos": 472867683840.0, + "grad_norm": 0.09189170382991782, + "language_loss": 0.84034801, + "learning_rate": 0.0008510109585675001, + "loss": 0.8519485, + "num_input_tokens_seen": 118108352, + "router_z_loss_mlp": 0.46508789, + "step": 1428, + "time_per_iteration": 2.5996179580688477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093492, + "balance_loss_mlp": 1.07680273, + "epoch": 0.2749134282416314, + "flos": 1315085372928.0, + "grad_norm": 0.03549776566589832, + "language_loss": 0.81153345, + "learning_rate": 0.0008507890239250093, + "loss": 0.8224684, + "num_input_tokens_seen": 118331120, + "router_z_loss_mlp": 0.16699219, + "step": 1429, + "time_per_iteration": 4.714696407318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172648, + "balance_loss_mlp": 1.1280638, + "epoch": 0.275105809926895, + "flos": 970861718016.0, + "grad_norm": 0.1239425770540774, + "language_loss": 0.81035018, + "learning_rate": 0.0008505669530941415, + "loss": 0.82207668, + "num_input_tokens_seen": 118415872, + "router_z_loss_mlp": 0.44580078, + "step": 1430, + "time_per_iteration": 3.346867322921753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171144, + "balance_loss_mlp": 1.12613082, + "epoch": 0.2752981916121585, + "flos": 527344432128.0, + "grad_norm": 0.0741807723541833, + "language_loss": 0.84519219, + "learning_rate": 0.000850344746161112, + "loss": 0.85690367, + "num_input_tokens_seen": 118483008, + "router_z_loss_mlp": 0.45019531, + "step": 1431, + "time_per_iteration": 2.6365530490875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178527, + "balance_loss_mlp": 1.13418126, + "epoch": 0.2754905732974221, + "flos": 453709071360.0, + "grad_norm": 0.09683250699138053, + "language_loss": 0.88287663, + "learning_rate": 0.0008501224032121894, + "loss": 0.8946619, + "num_input_tokens_seen": 118545840, + "router_z_loss_mlp": 0.44360352, + "step": 1432, + "time_per_iteration": 2.5015640258789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178788, + "balance_loss_mlp": 1.13406062, + "epoch": 0.27568295498268564, + "flos": 497474597376.0, + "grad_norm": 0.06051880699738469, + "language_loss": 0.82098711, + "learning_rate": 0.0008498999243336946, + "loss": 0.832775, + "num_input_tokens_seen": 118615168, + "router_z_loss_mlp": 0.44726562, + "step": 1433, + "time_per_iteration": 2.643663167953491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198526, + "balance_loss_mlp": 1.15129471, + "epoch": 0.2758753366679492, + "flos": 608194510848.0, + "grad_norm": 0.07173936681504893, + "language_loss": 0.87897062, + "learning_rate": 0.0008496773096120021, + "loss": 0.89095587, + "num_input_tokens_seen": 118690384, + "router_z_loss_mlp": 0.47241211, + "step": 1434, + "time_per_iteration": 2.8680803775787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198281, + "balance_loss_mlp": 1.15164685, + "epoch": 0.27606771835321275, + "flos": 740129094144.0, + "grad_norm": 0.07924459326066897, + "language_loss": 0.84949142, + "learning_rate": 0.0008494545591335381, + "loss": 0.86147422, + "num_input_tokens_seen": 118763024, + "router_z_loss_mlp": 0.46630859, + "step": 1435, + "time_per_iteration": 2.9436187744140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197184, + "balance_loss_mlp": 1.15176487, + "epoch": 0.27626010003847634, + "flos": 554572707840.0, + "grad_norm": 0.05338969573395925, + "language_loss": 0.87283278, + "learning_rate": 0.0008492316729847823, + "loss": 0.88480461, + "num_input_tokens_seen": 118845536, + "router_z_loss_mlp": 0.4543457, + "step": 1436, + "time_per_iteration": 2.817201614379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195413, + "balance_loss_mlp": 1.14739525, + "epoch": 0.2764524817237399, + "flos": 542554661376.0, + "grad_norm": 0.08524745340475512, + "language_loss": 0.80082995, + "learning_rate": 0.0008490086512522664, + "loss": 0.81278408, + "num_input_tokens_seen": 118919008, + "router_z_loss_mlp": 0.47998047, + "step": 1437, + "time_per_iteration": 2.7126290798187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196486, + "balance_loss_mlp": 1.14870656, + "epoch": 0.27664486340900346, + "flos": 406246980096.0, + "grad_norm": 0.06867103991167788, + "language_loss": 0.90572739, + "learning_rate": 0.0008487854940225755, + "loss": 0.9176923, + "num_input_tokens_seen": 118981376, + "router_z_loss_mlp": 0.47729492, + "step": 1438, + "time_per_iteration": 2.431755542755127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207177, + "balance_loss_mlp": 1.15858746, + "epoch": 0.27683724509426705, + "flos": 522138410496.0, + "grad_norm": 0.13716227323677116, + "language_loss": 0.90202403, + "learning_rate": 0.0008485622013823466, + "loss": 0.91409582, + "num_input_tokens_seen": 119050560, + "router_z_loss_mlp": 0.48608398, + "step": 1439, + "time_per_iteration": 2.647594451904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198257, + "balance_loss_mlp": 1.15062046, + "epoch": 0.2770296267795306, + "flos": 535349855232.0, + "grad_norm": 0.09985187013126534, + "language_loss": 0.836923, + "learning_rate": 0.00084833877341827, + "loss": 0.84890562, + "num_input_tokens_seen": 119121104, + "router_z_loss_mlp": 0.47680664, + "step": 1440, + "time_per_iteration": 2.652665138244629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215433, + "balance_loss_mlp": 1.16562724, + "epoch": 0.27722200846479417, + "flos": 487991651328.0, + "grad_norm": 0.09777751450797587, + "language_loss": 0.81022394, + "learning_rate": 0.000848115210217088, + "loss": 0.82237822, + "num_input_tokens_seen": 119187712, + "router_z_loss_mlp": 0.49853516, + "step": 1441, + "time_per_iteration": 2.550879955291748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120133, + "balance_loss_mlp": 1.15166724, + "epoch": 0.2774143901500577, + "flos": 618297836544.0, + "grad_norm": 0.06658099231370791, + "language_loss": 0.82249796, + "learning_rate": 0.0008478915118655952, + "loss": 0.83451128, + "num_input_tokens_seen": 119259264, + "router_z_loss_mlp": 0.49658203, + "step": 1442, + "time_per_iteration": 2.7541940212249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209129, + "balance_loss_mlp": 1.16261363, + "epoch": 0.2776067718353213, + "flos": 513819127296.0, + "grad_norm": 0.05385742523937431, + "language_loss": 0.86750221, + "learning_rate": 0.0008476676784506393, + "loss": 0.87959349, + "num_input_tokens_seen": 119328304, + "router_z_loss_mlp": 0.46557617, + "step": 1443, + "time_per_iteration": 2.6595921516418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120895, + "balance_loss_mlp": 1.16083765, + "epoch": 0.2777991535205848, + "flos": 1004395811328.0, + "grad_norm": 0.07541643273231594, + "language_loss": 0.82715142, + "learning_rate": 0.0008474437100591201, + "loss": 0.83924091, + "num_input_tokens_seen": 119412352, + "router_z_loss_mlp": 0.48120117, + "step": 1444, + "time_per_iteration": 3.285985231399536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209577, + "balance_loss_mlp": 1.16258454, + "epoch": 0.2779915352058484, + "flos": 550278531072.0, + "grad_norm": 0.07952238187909891, + "language_loss": 0.8560605, + "learning_rate": 0.0008472196067779898, + "loss": 0.86815625, + "num_input_tokens_seen": 119484464, + "router_z_loss_mlp": 0.47021484, + "step": 1445, + "time_per_iteration": 2.677077293395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204567, + "balance_loss_mlp": 1.15600109, + "epoch": 0.278183916891112, + "flos": 873798160896.0, + "grad_norm": 0.10163023549653756, + "language_loss": 0.86494523, + "learning_rate": 0.0008469953686942531, + "loss": 0.87699091, + "num_input_tokens_seen": 119557280, + "router_z_loss_mlp": 0.48583984, + "step": 1446, + "time_per_iteration": 3.10603928565979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188864, + "balance_loss_mlp": 1.14158559, + "epoch": 0.2783762985763755, + "flos": 624064766976.0, + "grad_norm": 0.0769454608790312, + "language_loss": 0.83537692, + "learning_rate": 0.0008467709958949668, + "loss": 0.84726554, + "num_input_tokens_seen": 119631232, + "router_z_loss_mlp": 0.47265625, + "step": 1447, + "time_per_iteration": 2.7602903842926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116478, + "balance_loss_mlp": 1.11943233, + "epoch": 0.2785686802616391, + "flos": 581838432768.0, + "grad_norm": 0.08244080074007111, + "language_loss": 0.86534739, + "learning_rate": 0.0008465464884672403, + "loss": 0.87699515, + "num_input_tokens_seen": 119700224, + "router_z_loss_mlp": 0.45410156, + "step": 1448, + "time_per_iteration": 2.702974796295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178355, + "balance_loss_mlp": 1.13424778, + "epoch": 0.27876106194690264, + "flos": 587333348352.0, + "grad_norm": 0.061441667483596626, + "language_loss": 0.85982984, + "learning_rate": 0.0008463218464982348, + "loss": 0.87161338, + "num_input_tokens_seen": 119781376, + "router_z_loss_mlp": 0.44091797, + "step": 1449, + "time_per_iteration": 2.832615852355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185601, + "balance_loss_mlp": 1.14058757, + "epoch": 0.27895344363216623, + "flos": 875982325248.0, + "grad_norm": 0.07503412994840371, + "language_loss": 0.88168389, + "learning_rate": 0.0008460970700751645, + "loss": 0.89353991, + "num_input_tokens_seen": 119856672, + "router_z_loss_mlp": 0.45019531, + "step": 1450, + "time_per_iteration": 3.0487136840820312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185626, + "balance_loss_mlp": 1.13977861, + "epoch": 0.27914582531742976, + "flos": 603910245888.0, + "grad_norm": 0.06352945894963989, + "language_loss": 0.88538259, + "learning_rate": 0.000845872159285295, + "loss": 0.89723885, + "num_input_tokens_seen": 119929008, + "router_z_loss_mlp": 0.45849609, + "step": 1451, + "time_per_iteration": 2.715423822402954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067116, + "balance_loss_mlp": 1.04985404, + "epoch": 0.27933820700269335, + "flos": 1497738097152.0, + "grad_norm": 0.02807340123185793, + "language_loss": 0.77766848, + "learning_rate": 0.0008456471142159447, + "loss": 0.78833961, + "num_input_tokens_seen": 120164032, + "router_z_loss_mlp": 0.17285156, + "step": 1452, + "time_per_iteration": 4.906192302703857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197684, + "balance_loss_mlp": 1.15064442, + "epoch": 0.2795305886879569, + "flos": 1031859025920.0, + "grad_norm": 0.06703382456082828, + "language_loss": 0.86617672, + "learning_rate": 0.0008454219349544836, + "loss": 0.87815356, + "num_input_tokens_seen": 120246784, + "router_z_loss_mlp": 0.47045898, + "step": 1453, + "time_per_iteration": 3.3534200191497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198016, + "balance_loss_mlp": 1.15343201, + "epoch": 0.27972297037322047, + "flos": 607058012160.0, + "grad_norm": 0.08552050648295068, + "language_loss": 0.82341981, + "learning_rate": 0.000845196621588334, + "loss": 0.83540004, + "num_input_tokens_seen": 120318208, + "router_z_loss_mlp": 0.44580078, + "step": 1454, + "time_per_iteration": 2.743699073791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204394, + "balance_loss_mlp": 1.1566391, + "epoch": 0.27991535205848406, + "flos": 630380123136.0, + "grad_norm": 0.05325666962256515, + "language_loss": 0.7637955, + "learning_rate": 0.0008449711742049706, + "loss": 0.77583951, + "num_input_tokens_seen": 120393248, + "router_z_loss_mlp": 0.4777832, + "step": 1455, + "time_per_iteration": 2.782561779022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208188, + "balance_loss_mlp": 1.16222095, + "epoch": 0.2801077337437476, + "flos": 549297676800.0, + "grad_norm": 0.09912152167704158, + "language_loss": 0.84447122, + "learning_rate": 0.0008447455928919196, + "loss": 0.85655314, + "num_input_tokens_seen": 120461040, + "router_z_loss_mlp": 0.45996094, + "step": 1456, + "time_per_iteration": 2.597557306289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242882, + "balance_loss_mlp": 1.19460225, + "epoch": 0.2803001154290112, + "flos": 486761177088.0, + "grad_norm": 0.060789109492995964, + "language_loss": 0.87272859, + "learning_rate": 0.0008445198777367595, + "loss": 0.88515741, + "num_input_tokens_seen": 120530400, + "router_z_loss_mlp": 0.48291016, + "step": 1457, + "time_per_iteration": 2.5689990520477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283391, + "balance_loss_mlp": 1.23394287, + "epoch": 0.2804924971142747, + "flos": 522074170368.0, + "grad_norm": 0.0840599244275116, + "language_loss": 0.80820799, + "learning_rate": 0.0008442940288271208, + "loss": 0.82104188, + "num_input_tokens_seen": 120598304, + "router_z_loss_mlp": 0.49365234, + "step": 1458, + "time_per_iteration": 2.674907922744751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01299064, + "balance_loss_mlp": 1.24899602, + "epoch": 0.2806848787995383, + "flos": 527697566208.0, + "grad_norm": 0.06912303271008884, + "language_loss": 0.87410611, + "learning_rate": 0.0008440680462506856, + "loss": 0.88709676, + "num_input_tokens_seen": 120675712, + "router_z_loss_mlp": 0.50073242, + "step": 1459, + "time_per_iteration": 2.73905873298645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01312423, + "balance_loss_mlp": 1.26221192, + "epoch": 0.2808772604848018, + "flos": 485493626880.0, + "grad_norm": 0.11964292138845481, + "language_loss": 0.86650789, + "learning_rate": 0.0008438419300951883, + "loss": 0.87963212, + "num_input_tokens_seen": 120746544, + "router_z_loss_mlp": 0.50219727, + "step": 1460, + "time_per_iteration": 2.6775193214416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01277494, + "balance_loss_mlp": 1.22690177, + "epoch": 0.2810696421700654, + "flos": 618139620864.0, + "grad_norm": 0.08967430845786024, + "language_loss": 0.86711442, + "learning_rate": 0.0008436156804484148, + "loss": 0.87988937, + "num_input_tokens_seen": 120823520, + "router_z_loss_mlp": 0.50610352, + "step": 1461, + "time_per_iteration": 2.8446624279022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225027, + "balance_loss_mlp": 1.17615128, + "epoch": 0.28126202385532895, + "flos": 454754165760.0, + "grad_norm": 0.06778030965882964, + "language_loss": 0.88354933, + "learning_rate": 0.0008433892973982031, + "loss": 0.89579964, + "num_input_tokens_seen": 120889568, + "router_z_loss_mlp": 0.48901367, + "step": 1462, + "time_per_iteration": 2.5101869106292725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212759, + "balance_loss_mlp": 1.16168988, + "epoch": 0.28145440554059253, + "flos": 530704742400.0, + "grad_norm": 0.07940790981700917, + "language_loss": 0.85705763, + "learning_rate": 0.0008431627810324431, + "loss": 0.86918521, + "num_input_tokens_seen": 120958480, + "router_z_loss_mlp": 0.51098633, + "step": 1463, + "time_per_iteration": 2.6701931953430176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208608, + "balance_loss_mlp": 1.15906441, + "epoch": 0.2816467872258561, + "flos": 452228977152.0, + "grad_norm": 0.1112721524597414, + "language_loss": 0.81312853, + "learning_rate": 0.000842936131439076, + "loss": 0.82521462, + "num_input_tokens_seen": 121028032, + "router_z_loss_mlp": 0.49584961, + "step": 1464, + "time_per_iteration": 2.626397132873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182235, + "balance_loss_mlp": 1.13440847, + "epoch": 0.28183916891111965, + "flos": 472712039424.0, + "grad_norm": 0.10805991000078381, + "language_loss": 0.88305855, + "learning_rate": 0.0008427093487060951, + "loss": 0.89488095, + "num_input_tokens_seen": 121099280, + "router_z_loss_mlp": 0.4777832, + "step": 1465, + "time_per_iteration": 2.6287689208984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152025, + "balance_loss_mlp": 1.10815573, + "epoch": 0.28203155059638324, + "flos": 557053479936.0, + "grad_norm": 0.05392746655550109, + "language_loss": 0.85014635, + "learning_rate": 0.000842482432921545, + "loss": 0.86166662, + "num_input_tokens_seen": 121180240, + "router_z_loss_mlp": 0.4387207, + "step": 1466, + "time_per_iteration": 2.843055009841919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140929, + "balance_loss_mlp": 1.09691691, + "epoch": 0.28222393228164677, + "flos": 416980224000.0, + "grad_norm": 0.12216249404138245, + "language_loss": 0.8786549, + "learning_rate": 0.0008422553841735225, + "loss": 0.89006418, + "num_input_tokens_seen": 121242736, + "router_z_loss_mlp": 0.44018555, + "step": 1467, + "time_per_iteration": 2.4870855808258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130953, + "balance_loss_mlp": 1.08686972, + "epoch": 0.28241631396691036, + "flos": 604910923776.0, + "grad_norm": 0.0834179705505054, + "language_loss": 0.85186172, + "learning_rate": 0.0008420282025501757, + "loss": 0.86317128, + "num_input_tokens_seen": 121319248, + "router_z_loss_mlp": 0.44091797, + "step": 1468, + "time_per_iteration": 2.746919631958008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139526, + "balance_loss_mlp": 1.09730196, + "epoch": 0.2826086956521739, + "flos": 572968152576.0, + "grad_norm": 0.07747841896553878, + "language_loss": 0.85862702, + "learning_rate": 0.0008418008881397043, + "loss": 0.8700223, + "num_input_tokens_seen": 121392064, + "router_z_loss_mlp": 0.42236328, + "step": 1469, + "time_per_iteration": 2.7157111167907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011536, + "balance_loss_mlp": 1.11108959, + "epoch": 0.2828010773374375, + "flos": 842756949504.0, + "grad_norm": 0.09196817065592088, + "language_loss": 0.83090472, + "learning_rate": 0.0008415734410303595, + "loss": 0.84244066, + "num_input_tokens_seen": 121475984, + "router_z_loss_mlp": 0.42529297, + "step": 1470, + "time_per_iteration": 3.2546660900115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159701, + "balance_loss_mlp": 1.1166662, + "epoch": 0.28299345902270107, + "flos": 542675801088.0, + "grad_norm": 0.07745609031802311, + "language_loss": 0.91133046, + "learning_rate": 0.0008413458613104444, + "loss": 0.92292744, + "num_input_tokens_seen": 121551024, + "router_z_loss_mlp": 0.43017578, + "step": 1471, + "time_per_iteration": 2.683119773864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124215, + "balance_loss_mlp": 1.08091772, + "epoch": 0.2831858407079646, + "flos": 571606626816.0, + "grad_norm": 0.06716648824100378, + "language_loss": 0.83225214, + "learning_rate": 0.0008411181490683129, + "loss": 0.84349424, + "num_input_tokens_seen": 121624528, + "router_z_loss_mlp": 0.43334961, + "step": 1472, + "time_per_iteration": 2.7247512340545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112102, + "balance_loss_mlp": 1.06692195, + "epoch": 0.2833782223932282, + "flos": 763826734080.0, + "grad_norm": 0.08730853561294576, + "language_loss": 0.83099282, + "learning_rate": 0.0008408903043923707, + "loss": 0.84211385, + "num_input_tokens_seen": 121706736, + "router_z_loss_mlp": 0.45166016, + "step": 1473, + "time_per_iteration": 2.9982750415802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011136, + "balance_loss_mlp": 1.06675041, + "epoch": 0.2835706040784917, + "flos": 539051189760.0, + "grad_norm": 0.09441991509127853, + "language_loss": 0.81456125, + "learning_rate": 0.0008406623273710754, + "loss": 0.82569724, + "num_input_tokens_seen": 121773008, + "router_z_loss_mlp": 0.46826172, + "step": 1474, + "time_per_iteration": 2.6457254886627197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107143, + "balance_loss_mlp": 1.06482363, + "epoch": 0.2837629857637553, + "flos": 530593514496.0, + "grad_norm": 0.08147557265850319, + "language_loss": 0.83874208, + "learning_rate": 0.0008404342180929351, + "loss": 0.84981352, + "num_input_tokens_seen": 121840016, + "router_z_loss_mlp": 0.42358398, + "step": 1475, + "time_per_iteration": 2.6071481704711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110668, + "balance_loss_mlp": 1.06758618, + "epoch": 0.28395536744901884, + "flos": 540032044032.0, + "grad_norm": 0.0682383784230515, + "language_loss": 0.81900609, + "learning_rate": 0.00084020597664651, + "loss": 0.83011281, + "num_input_tokens_seen": 121915008, + "router_z_loss_mlp": 0.43066406, + "step": 1476, + "time_per_iteration": 2.831547260284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118821, + "balance_loss_mlp": 1.07149458, + "epoch": 0.2841477491342824, + "flos": 573635146752.0, + "grad_norm": 0.08199753583087593, + "language_loss": 0.84526181, + "learning_rate": 0.0008399776031204111, + "loss": 0.85645002, + "num_input_tokens_seen": 121987456, + "router_z_loss_mlp": 0.47290039, + "step": 1477, + "time_per_iteration": 2.7336621284484863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112444, + "balance_loss_mlp": 1.07832992, + "epoch": 0.28434013081954596, + "flos": 572068790784.0, + "grad_norm": 0.07183050675580523, + "language_loss": 0.80975109, + "learning_rate": 0.0008397490976033009, + "loss": 0.82099551, + "num_input_tokens_seen": 122058720, + "router_z_loss_mlp": 0.46118164, + "step": 1478, + "time_per_iteration": 2.668337345123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053875, + "balance_loss_mlp": 1.03766239, + "epoch": 0.28453251250480954, + "flos": 1553376310272.0, + "grad_norm": 0.035679392232843235, + "language_loss": 0.77879643, + "learning_rate": 0.000839520460183893, + "loss": 0.78933525, + "num_input_tokens_seen": 122285792, + "router_z_loss_mlp": 0.16210938, + "step": 1479, + "time_per_iteration": 4.813107252120972 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132957, + "balance_loss_mlp": 1.08925462, + "epoch": 0.28472489419007313, + "flos": 749061043200.0, + "grad_norm": 0.06426749014533666, + "language_loss": 0.85708797, + "learning_rate": 0.0008392916909509525, + "loss": 0.86841756, + "num_input_tokens_seen": 122366608, + "router_z_loss_mlp": 0.43725586, + "step": 1480, + "time_per_iteration": 3.105465888977051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145673, + "balance_loss_mlp": 1.10180378, + "epoch": 0.28491727587533666, + "flos": 490158563328.0, + "grad_norm": 0.12099224111333258, + "language_loss": 0.8583495, + "learning_rate": 0.0008390627899932954, + "loss": 0.86980623, + "num_input_tokens_seen": 122435536, + "router_z_loss_mlp": 0.43847656, + "step": 1481, + "time_per_iteration": 2.5961339473724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146403, + "balance_loss_mlp": 1.1041795, + "epoch": 0.28510965756060025, + "flos": 729007838208.0, + "grad_norm": 0.09850404509995118, + "language_loss": 0.88747412, + "learning_rate": 0.000838833757399789, + "loss": 0.89893812, + "num_input_tokens_seen": 122515584, + "router_z_loss_mlp": 0.42211914, + "step": 1482, + "time_per_iteration": 2.9445223808288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160742, + "balance_loss_mlp": 1.11513209, + "epoch": 0.2853020392458638, + "flos": 551573245440.0, + "grad_norm": 0.09258701289693592, + "language_loss": 0.81233478, + "learning_rate": 0.0008386045932593515, + "loss": 0.82394218, + "num_input_tokens_seen": 122585552, + "router_z_loss_mlp": 0.45605469, + "step": 1483, + "time_per_iteration": 2.696171283721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172022, + "balance_loss_mlp": 1.12853456, + "epoch": 0.28549442093112737, + "flos": 754783557120.0, + "grad_norm": 0.07718327666813503, + "language_loss": 0.8687939, + "learning_rate": 0.0008383752976609525, + "loss": 0.88051414, + "num_input_tokens_seen": 122658928, + "router_z_loss_mlp": 0.43481445, + "step": 1484, + "time_per_iteration": 2.948983907699585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159194, + "balance_loss_mlp": 1.11508679, + "epoch": 0.2856868026163909, + "flos": 538589025792.0, + "grad_norm": 0.06564205880415652, + "language_loss": 0.80617285, + "learning_rate": 0.0008381458706936123, + "loss": 0.81776482, + "num_input_tokens_seen": 122729056, + "router_z_loss_mlp": 0.44116211, + "step": 1485, + "time_per_iteration": 2.689715623855591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117177, + "balance_loss_mlp": 1.12740064, + "epoch": 0.2858791843016545, + "flos": 583772977152.0, + "grad_norm": 0.06570872016312425, + "language_loss": 0.87734085, + "learning_rate": 0.0008379163124464025, + "loss": 0.88905853, + "num_input_tokens_seen": 122802832, + "router_z_loss_mlp": 0.44384766, + "step": 1486, + "time_per_iteration": 2.7226197719573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166912, + "balance_loss_mlp": 1.12526059, + "epoch": 0.286071565986918, + "flos": 644812130304.0, + "grad_norm": 0.0915307653224295, + "language_loss": 0.77564812, + "learning_rate": 0.0008376866230084452, + "loss": 0.78731728, + "num_input_tokens_seen": 122881328, + "router_z_loss_mlp": 0.41650391, + "step": 1487, + "time_per_iteration": 2.82708477973938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154293, + "balance_loss_mlp": 1.10901785, + "epoch": 0.2862639476721816, + "flos": 491361873408.0, + "grad_norm": 0.07232162522245564, + "language_loss": 0.86754864, + "learning_rate": 0.000837456802468914, + "loss": 0.87909162, + "num_input_tokens_seen": 122949680, + "router_z_loss_mlp": 0.45239258, + "step": 1488, + "time_per_iteration": 2.6107335090637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115391, + "balance_loss_mlp": 1.1082294, + "epoch": 0.2864563293574452, + "flos": 521639170560.0, + "grad_norm": 0.06580975478488113, + "language_loss": 0.85965604, + "learning_rate": 0.0008372268509170331, + "loss": 0.8711952, + "num_input_tokens_seen": 123024736, + "router_z_loss_mlp": 0.45678711, + "step": 1489, + "time_per_iteration": 2.682190418243408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147981, + "balance_loss_mlp": 1.10554218, + "epoch": 0.2866487110427087, + "flos": 547118281728.0, + "grad_norm": 0.0640942252200205, + "language_loss": 0.85215169, + "learning_rate": 0.0008369967684420779, + "loss": 0.86363149, + "num_input_tokens_seen": 123097344, + "router_z_loss_mlp": 0.42431641, + "step": 1490, + "time_per_iteration": 2.708315372467041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011558, + "balance_loss_mlp": 1.11154985, + "epoch": 0.2868410927279723, + "flos": 482224720896.0, + "grad_norm": 0.07293711729105107, + "language_loss": 0.84566355, + "learning_rate": 0.0008367665551333736, + "loss": 0.85722154, + "num_input_tokens_seen": 123166240, + "router_z_loss_mlp": 0.44262695, + "step": 1491, + "time_per_iteration": 2.605665445327759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159368, + "balance_loss_mlp": 1.11216116, + "epoch": 0.28703347441323585, + "flos": 724889129472.0, + "grad_norm": 0.0802107480821924, + "language_loss": 0.85808468, + "learning_rate": 0.0008365362110802977, + "loss": 0.86967838, + "num_input_tokens_seen": 123238160, + "router_z_loss_mlp": 0.47241211, + "step": 1492, + "time_per_iteration": 2.879655122756958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155437, + "balance_loss_mlp": 1.109303, + "epoch": 0.28722585609849943, + "flos": 634978248192.0, + "grad_norm": 0.06007050516222503, + "language_loss": 0.82957923, + "learning_rate": 0.0008363057363722773, + "loss": 0.84113365, + "num_input_tokens_seen": 123319504, + "router_z_loss_mlp": 0.46142578, + "step": 1493, + "time_per_iteration": 2.8600335121154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154458, + "balance_loss_mlp": 1.11085081, + "epoch": 0.28741823778376296, + "flos": 510229020672.0, + "grad_norm": 0.060904552171674266, + "language_loss": 0.8464222, + "learning_rate": 0.0008360751310987906, + "loss": 0.85796678, + "num_input_tokens_seen": 123387008, + "router_z_loss_mlp": 0.4362793, + "step": 1494, + "time_per_iteration": 2.602029800415039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151781, + "balance_loss_mlp": 1.11160707, + "epoch": 0.28761061946902655, + "flos": 603752030208.0, + "grad_norm": 0.06255193118064963, + "language_loss": 0.86073208, + "learning_rate": 0.0008358443953493666, + "loss": 0.87224984, + "num_input_tokens_seen": 123471056, + "router_z_loss_mlp": 0.40185547, + "step": 1495, + "time_per_iteration": 2.8682689666748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116061, + "balance_loss_mlp": 1.11702669, + "epoch": 0.28780300115429014, + "flos": 407193329664.0, + "grad_norm": 0.06637793594414569, + "language_loss": 0.89093578, + "learning_rate": 0.0008356135292135851, + "loss": 0.90254188, + "num_input_tokens_seen": 123535024, + "router_z_loss_mlp": 0.43579102, + "step": 1496, + "time_per_iteration": 2.519700288772583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162426, + "balance_loss_mlp": 1.11760294, + "epoch": 0.28799538283955367, + "flos": 374929357824.0, + "grad_norm": 0.07926576541007177, + "language_loss": 0.92873323, + "learning_rate": 0.0008353825327810758, + "loss": 0.94035745, + "num_input_tokens_seen": 123596224, + "router_z_loss_mlp": 0.44873047, + "step": 1497, + "time_per_iteration": 2.4195892810821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140417, + "balance_loss_mlp": 1.09852648, + "epoch": 0.28818776452481726, + "flos": 591919363584.0, + "grad_norm": 0.05522330058639147, + "language_loss": 0.81832987, + "learning_rate": 0.00083515140614152, + "loss": 0.82973409, + "num_input_tokens_seen": 123668640, + "router_z_loss_mlp": 0.41894531, + "step": 1498, + "time_per_iteration": 2.6989245414733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151843, + "balance_loss_mlp": 1.10992932, + "epoch": 0.2883801462100808, + "flos": 535075642368.0, + "grad_norm": 0.08112895482541128, + "language_loss": 0.87581354, + "learning_rate": 0.0008349201493846485, + "loss": 0.88733196, + "num_input_tokens_seen": 123740816, + "router_z_loss_mlp": 0.41894531, + "step": 1499, + "time_per_iteration": 2.647165298461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113639, + "balance_loss_mlp": 1.09364128, + "epoch": 0.2885725278953444, + "flos": 480094884864.0, + "grad_norm": 0.06188269799142739, + "language_loss": 0.89485824, + "learning_rate": 0.0008346887626002432, + "loss": 0.90622216, + "num_input_tokens_seen": 123805968, + "router_z_loss_mlp": 0.42724609, + "step": 1500, + "time_per_iteration": 2.546494960784912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138741, + "balance_loss_mlp": 1.09546816, + "epoch": 0.2887649095806079, + "flos": 464044391424.0, + "grad_norm": 0.07756887509348087, + "language_loss": 0.86612689, + "learning_rate": 0.000834457245878137, + "loss": 0.87751424, + "num_input_tokens_seen": 123876576, + "router_z_loss_mlp": 0.43261719, + "step": 1501, + "time_per_iteration": 2.6271145343780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132854, + "balance_loss_mlp": 1.08993816, + "epoch": 0.2889572912658715, + "flos": 931032092160.0, + "grad_norm": 0.07465598629984396, + "language_loss": 0.8176384, + "learning_rate": 0.000834225599308212, + "loss": 0.82896686, + "num_input_tokens_seen": 123967664, + "router_z_loss_mlp": 0.42895508, + "step": 1502, + "time_per_iteration": 3.2550971508026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150677, + "balance_loss_mlp": 1.10580611, + "epoch": 0.28914967295113503, + "flos": 570129103872.0, + "grad_norm": 0.07581203663628927, + "language_loss": 0.85830456, + "learning_rate": 0.0008339938229804016, + "loss": 0.8698113, + "num_input_tokens_seen": 124039680, + "router_z_loss_mlp": 0.44897461, + "step": 1503, + "time_per_iteration": 2.704310417175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132016, + "balance_loss_mlp": 1.11475468, + "epoch": 0.2893420546363986, + "flos": 1486614643200.0, + "grad_norm": 0.04995777902546146, + "language_loss": 0.75434822, + "learning_rate": 0.0008337619169846895, + "loss": 0.76566839, + "num_input_tokens_seen": 124278848, + "router_z_loss_mlp": 0.17285156, + "step": 1504, + "time_per_iteration": 4.959474563598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157244, + "balance_loss_mlp": 1.10965538, + "epoch": 0.2895344363216622, + "flos": 470186850816.0, + "grad_norm": 0.06157445053236475, + "language_loss": 0.84505653, + "learning_rate": 0.0008335298814111094, + "loss": 0.85662901, + "num_input_tokens_seen": 124346736, + "router_z_loss_mlp": 0.47607422, + "step": 1505, + "time_per_iteration": 2.5612986087799072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178976, + "balance_loss_mlp": 1.13374829, + "epoch": 0.28972681800692573, + "flos": 648194835456.0, + "grad_norm": 0.05887296654917154, + "language_loss": 0.88222575, + "learning_rate": 0.0008332977163497455, + "loss": 0.89401549, + "num_input_tokens_seen": 124420816, + "router_z_loss_mlp": 0.4519043, + "step": 1506, + "time_per_iteration": 2.8017849922180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183741, + "balance_loss_mlp": 1.13696313, + "epoch": 0.2899191996921893, + "flos": 572224435200.0, + "grad_norm": 0.07773532252894584, + "language_loss": 0.83964998, + "learning_rate": 0.0008330654218907325, + "loss": 0.8514874, + "num_input_tokens_seen": 124490480, + "router_z_loss_mlp": 0.46801758, + "step": 1507, + "time_per_iteration": 2.6568052768707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167782, + "balance_loss_mlp": 1.12016964, + "epoch": 0.29011158137745285, + "flos": 661356721152.0, + "grad_norm": 0.05364053536005051, + "language_loss": 0.82260346, + "learning_rate": 0.0008328329981242548, + "loss": 0.83428133, + "num_input_tokens_seen": 124564960, + "router_z_loss_mlp": 0.47631836, + "step": 1508, + "time_per_iteration": 2.8732171058654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161954, + "balance_loss_mlp": 1.11479485, + "epoch": 0.29030396306271644, + "flos": 536226822144.0, + "grad_norm": 0.06776855665971031, + "language_loss": 0.88091129, + "learning_rate": 0.0008326004451405475, + "loss": 0.8925308, + "num_input_tokens_seen": 124637424, + "router_z_loss_mlp": 0.47143555, + "step": 1509, + "time_per_iteration": 2.762476921081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156841, + "balance_loss_mlp": 1.11104107, + "epoch": 0.29049634474798, + "flos": 511956163584.0, + "grad_norm": 0.08089915602738365, + "language_loss": 0.82757521, + "learning_rate": 0.0008323677630298957, + "loss": 0.83914363, + "num_input_tokens_seen": 124704832, + "router_z_loss_mlp": 0.45800781, + "step": 1510, + "time_per_iteration": 2.554558753967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152926, + "balance_loss_mlp": 1.1073643, + "epoch": 0.29068872643324356, + "flos": 613758809088.0, + "grad_norm": 0.07106066660777852, + "language_loss": 0.85773015, + "learning_rate": 0.0008321349518826345, + "loss": 0.86925942, + "num_input_tokens_seen": 124779600, + "router_z_loss_mlp": 0.45556641, + "step": 1511, + "time_per_iteration": 2.8341891765594482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144812, + "balance_loss_mlp": 1.09870172, + "epoch": 0.2908811081185071, + "flos": 546424123392.0, + "grad_norm": 0.06994476337169399, + "language_loss": 0.95554525, + "learning_rate": 0.0008319020117891491, + "loss": 0.96699333, + "num_input_tokens_seen": 124844128, + "router_z_loss_mlp": 0.4609375, + "step": 1512, + "time_per_iteration": 2.6152215003967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147304, + "balance_loss_mlp": 1.09902406, + "epoch": 0.2910734898037707, + "flos": 604792355328.0, + "grad_norm": 0.09218377020634298, + "language_loss": 0.87772787, + "learning_rate": 0.0008316689428398751, + "loss": 0.88920093, + "num_input_tokens_seen": 124915376, + "router_z_loss_mlp": 0.4831543, + "step": 1513, + "time_per_iteration": 2.687288522720337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148068, + "balance_loss_mlp": 1.10407972, + "epoch": 0.29126587148903427, + "flos": 574672900608.0, + "grad_norm": 0.05407373665960582, + "language_loss": 0.89050305, + "learning_rate": 0.0008314357451252979, + "loss": 0.90198368, + "num_input_tokens_seen": 124995504, + "router_z_loss_mlp": 0.44018555, + "step": 1514, + "time_per_iteration": 2.7870078086853027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151939, + "balance_loss_mlp": 1.10644853, + "epoch": 0.2914582531742978, + "flos": 571068112896.0, + "grad_norm": 0.11283198751561448, + "language_loss": 0.88657945, + "learning_rate": 0.0008312024187359527, + "loss": 0.89809883, + "num_input_tokens_seen": 125064192, + "router_z_loss_mlp": 0.45483398, + "step": 1515, + "time_per_iteration": 2.6400256156921387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144613, + "balance_loss_mlp": 1.10060108, + "epoch": 0.2916506348595614, + "flos": 730878142464.0, + "grad_norm": 0.08270455580526427, + "language_loss": 0.87534022, + "learning_rate": 0.000830968963762425, + "loss": 0.8867864, + "num_input_tokens_seen": 125150560, + "router_z_loss_mlp": 0.43994141, + "step": 1516, + "time_per_iteration": 3.0442028045654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151597, + "balance_loss_mlp": 1.10617828, + "epoch": 0.2918430165448249, + "flos": 510468728832.0, + "grad_norm": 0.06364079743342543, + "language_loss": 0.84482789, + "learning_rate": 0.0008307353802953497, + "loss": 0.85634387, + "num_input_tokens_seen": 125219264, + "router_z_loss_mlp": 0.45361328, + "step": 1517, + "time_per_iteration": 2.672921895980835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171551, + "balance_loss_mlp": 1.12281811, + "epoch": 0.2920353982300885, + "flos": 630397375488.0, + "grad_norm": 0.060139597091390135, + "language_loss": 0.86612219, + "learning_rate": 0.0008305016684254125, + "loss": 0.87783766, + "num_input_tokens_seen": 125301904, + "router_z_loss_mlp": 0.48803711, + "step": 1518, + "time_per_iteration": 2.7845590114593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174317, + "balance_loss_mlp": 1.12947094, + "epoch": 0.29222777991535204, + "flos": 501662688768.0, + "grad_norm": 0.09151635615922826, + "language_loss": 0.87469971, + "learning_rate": 0.0008302678282433479, + "loss": 0.88644284, + "num_input_tokens_seen": 125367712, + "router_z_loss_mlp": 0.44848633, + "step": 1519, + "time_per_iteration": 2.562605619430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163342, + "balance_loss_mlp": 1.11999798, + "epoch": 0.2924201616006156, + "flos": 486785769984.0, + "grad_norm": 0.07068722957296131, + "language_loss": 0.85016668, + "learning_rate": 0.0008300338598399411, + "loss": 0.86180007, + "num_input_tokens_seen": 125437648, + "router_z_loss_mlp": 0.43359375, + "step": 1520, + "time_per_iteration": 2.61773943901062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155651, + "balance_loss_mlp": 1.11111403, + "epoch": 0.2926125432858792, + "flos": 476450449920.0, + "grad_norm": 0.07704766336953982, + "language_loss": 0.95187533, + "learning_rate": 0.0008297997633060263, + "loss": 0.96343178, + "num_input_tokens_seen": 125502432, + "router_z_loss_mlp": 0.44506836, + "step": 1521, + "time_per_iteration": 2.5206730365753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127672, + "balance_loss_mlp": 1.08468485, + "epoch": 0.29280492497114274, + "flos": 676675980288.0, + "grad_norm": 0.07256926042070597, + "language_loss": 0.85441822, + "learning_rate": 0.0008295655387324883, + "loss": 0.865695, + "num_input_tokens_seen": 125575424, + "router_z_loss_mlp": 0.42993164, + "step": 1522, + "time_per_iteration": 2.8186635971069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126011, + "balance_loss_mlp": 1.08090246, + "epoch": 0.29299730665640633, + "flos": 458408512512.0, + "grad_norm": 0.07210388942873598, + "language_loss": 0.8532753, + "learning_rate": 0.0008293311862102609, + "loss": 0.86453545, + "num_input_tokens_seen": 125639040, + "router_z_loss_mlp": 0.45092773, + "step": 1523, + "time_per_iteration": 2.4982752799987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118339, + "balance_loss_mlp": 1.07334912, + "epoch": 0.29318968834166986, + "flos": 446573274624.0, + "grad_norm": 0.0579845522804068, + "language_loss": 0.89434093, + "learning_rate": 0.0008290967058303275, + "loss": 0.90552431, + "num_input_tokens_seen": 125701712, + "router_z_loss_mlp": 0.44995117, + "step": 1524, + "time_per_iteration": 2.469200611114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116837, + "balance_loss_mlp": 1.07575774, + "epoch": 0.29338207002693345, + "flos": 450319025664.0, + "grad_norm": 0.07735764089304721, + "language_loss": 0.86793721, + "learning_rate": 0.0008288620976837219, + "loss": 0.87910557, + "num_input_tokens_seen": 125765088, + "router_z_loss_mlp": 0.41088867, + "step": 1525, + "time_per_iteration": 2.4877853393554688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112181, + "balance_loss_mlp": 1.06881261, + "epoch": 0.293574451712197, + "flos": 502277925888.0, + "grad_norm": 0.06064034312392981, + "language_loss": 0.83118868, + "learning_rate": 0.000828627361861527, + "loss": 0.84231043, + "num_input_tokens_seen": 125831328, + "router_z_loss_mlp": 0.43383789, + "step": 1526, + "time_per_iteration": 2.567406415939331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109149, + "balance_loss_mlp": 1.06620967, + "epoch": 0.29376683339746057, + "flos": 696462312960.0, + "grad_norm": 0.0729369607745646, + "language_loss": 0.84539104, + "learning_rate": 0.0008283924984548752, + "loss": 0.85648245, + "num_input_tokens_seen": 125903664, + "router_z_loss_mlp": 0.42919922, + "step": 1527, + "time_per_iteration": 2.8396716117858887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117649, + "balance_loss_mlp": 1.07480514, + "epoch": 0.2939592150827241, + "flos": 478590197760.0, + "grad_norm": 0.05516048868040139, + "language_loss": 0.85423326, + "learning_rate": 0.0008281575075549485, + "loss": 0.86540973, + "num_input_tokens_seen": 125971856, + "router_z_loss_mlp": 0.4284668, + "step": 1528, + "time_per_iteration": 2.596402645111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093475, + "balance_loss_mlp": 1.0787884, + "epoch": 0.2941515967679877, + "flos": 1485260831232.0, + "grad_norm": 0.03776357558455706, + "language_loss": 0.77352691, + "learning_rate": 0.000827922389252979, + "loss": 0.78446174, + "num_input_tokens_seen": 126183968, + "router_z_loss_mlp": 0.14648438, + "step": 1529, + "time_per_iteration": 4.641916513442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118614, + "balance_loss_mlp": 1.07436347, + "epoch": 0.2943439784532513, + "flos": 674158132224.0, + "grad_norm": 0.11599739785132454, + "language_loss": 0.90857148, + "learning_rate": 0.0008276871436402469, + "loss": 0.91975754, + "num_input_tokens_seen": 126254448, + "router_z_loss_mlp": 0.44238281, + "step": 1530, + "time_per_iteration": 2.8211593627929688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113901, + "balance_loss_mlp": 1.07239282, + "epoch": 0.2945363601385148, + "flos": 576301298688.0, + "grad_norm": 0.06834093724659761, + "language_loss": 0.87937176, + "learning_rate": 0.000827451770808083, + "loss": 0.8905108, + "num_input_tokens_seen": 126328208, + "router_z_loss_mlp": 0.41503906, + "step": 1531, + "time_per_iteration": 2.7127888202667236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106991, + "balance_loss_mlp": 1.06357539, + "epoch": 0.2947287418237784, + "flos": 480655793664.0, + "grad_norm": 0.06489723039655686, + "language_loss": 0.8385976, + "learning_rate": 0.0008272162708478674, + "loss": 0.84966749, + "num_input_tokens_seen": 126396464, + "router_z_loss_mlp": 0.43457031, + "step": 1532, + "time_per_iteration": 2.580057144165039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119293, + "balance_loss_mlp": 1.07749844, + "epoch": 0.2949211235090419, + "flos": 558185209344.0, + "grad_norm": 0.06938693493012958, + "language_loss": 0.86437017, + "learning_rate": 0.000826980643851029, + "loss": 0.87556309, + "num_input_tokens_seen": 126468960, + "router_z_loss_mlp": 0.41821289, + "step": 1533, + "time_per_iteration": 2.689450740814209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118363, + "balance_loss_mlp": 1.07518554, + "epoch": 0.2951135051943055, + "flos": 483887623680.0, + "grad_norm": 0.057495804655394826, + "language_loss": 0.85101378, + "learning_rate": 0.0008267448899090464, + "loss": 0.8621974, + "num_input_tokens_seen": 126536496, + "router_z_loss_mlp": 0.43188477, + "step": 1534, + "time_per_iteration": 2.5541234016418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139738, + "balance_loss_mlp": 1.09460509, + "epoch": 0.29530588687956905, + "flos": 550295783424.0, + "grad_norm": 0.0763188518859088, + "language_loss": 0.81071836, + "learning_rate": 0.0008265090091134473, + "loss": 0.82211578, + "num_input_tokens_seen": 126614048, + "router_z_loss_mlp": 0.45117188, + "step": 1535, + "time_per_iteration": 2.851494789123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136514, + "balance_loss_mlp": 1.09309804, + "epoch": 0.29549826856483263, + "flos": 673046226432.0, + "grad_norm": 0.06589165398662913, + "language_loss": 0.80565453, + "learning_rate": 0.0008262730015558088, + "loss": 0.8170197, + "num_input_tokens_seen": 126697248, + "router_z_loss_mlp": 0.43432617, + "step": 1536, + "time_per_iteration": 2.8671340942382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113965, + "balance_loss_mlp": 1.09423184, + "epoch": 0.29569065025009617, + "flos": 764666625024.0, + "grad_norm": 0.08099910548300644, + "language_loss": 0.82513618, + "learning_rate": 0.0008260368673277574, + "loss": 0.83653271, + "num_input_tokens_seen": 126782496, + "router_z_loss_mlp": 0.45410156, + "step": 1537, + "time_per_iteration": 3.114685297012329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134888, + "balance_loss_mlp": 1.08973145, + "epoch": 0.29588303193535975, + "flos": 543683819520.0, + "grad_norm": 0.06868209454347093, + "language_loss": 0.84501362, + "learning_rate": 0.0008258006065209682, + "loss": 0.85636258, + "num_input_tokens_seen": 126857328, + "router_z_loss_mlp": 0.45141602, + "step": 1538, + "time_per_iteration": 2.7343428134918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112017, + "balance_loss_mlp": 1.07341647, + "epoch": 0.29607541362062334, + "flos": 596947345920.0, + "grad_norm": 0.07819005704771397, + "language_loss": 0.80795646, + "learning_rate": 0.0008255642192271657, + "loss": 0.8191582, + "num_input_tokens_seen": 126932608, + "router_z_loss_mlp": 0.4675293, + "step": 1539, + "time_per_iteration": 2.7900264263153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123831, + "balance_loss_mlp": 1.0775305, + "epoch": 0.29626779530588687, + "flos": 609877237248.0, + "grad_norm": 0.06984070899888078, + "language_loss": 0.84251219, + "learning_rate": 0.0008253277055381241, + "loss": 0.85375053, + "num_input_tokens_seen": 127008928, + "router_z_loss_mlp": 0.46313477, + "step": 1540, + "time_per_iteration": 2.7936105728149414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126968, + "balance_loss_mlp": 1.08383858, + "epoch": 0.29646017699115046, + "flos": 867430674432.0, + "grad_norm": 0.09213105437911238, + "language_loss": 0.86479163, + "learning_rate": 0.0008250910655456658, + "loss": 0.87606132, + "num_input_tokens_seen": 127097104, + "router_z_loss_mlp": 0.43115234, + "step": 1541, + "time_per_iteration": 3.119706392288208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141818, + "balance_loss_mlp": 1.09723353, + "epoch": 0.296652558676414, + "flos": 495868594176.0, + "grad_norm": 0.06264221574110865, + "language_loss": 0.84348595, + "learning_rate": 0.0008248542993416625, + "loss": 0.85490412, + "num_input_tokens_seen": 127165264, + "router_z_loss_mlp": 0.44628906, + "step": 1542, + "time_per_iteration": 2.6273162364959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136977, + "balance_loss_mlp": 1.09224987, + "epoch": 0.2968449403616776, + "flos": 571544957952.0, + "grad_norm": 0.062187844768518095, + "language_loss": 0.838552, + "learning_rate": 0.0008246174070180352, + "loss": 0.84992176, + "num_input_tokens_seen": 127238992, + "router_z_loss_mlp": 0.44702148, + "step": 1543, + "time_per_iteration": 2.6559441089630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155532, + "balance_loss_mlp": 1.11099529, + "epoch": 0.2970373220469411, + "flos": 794168271360.0, + "grad_norm": 0.09249403217806111, + "language_loss": 0.84424686, + "learning_rate": 0.0008243803886667537, + "loss": 0.85580218, + "num_input_tokens_seen": 127328160, + "router_z_loss_mlp": 0.44506836, + "step": 1544, + "time_per_iteration": 3.161595582962036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155762, + "balance_loss_mlp": 1.11196482, + "epoch": 0.2972297037322047, + "flos": 661038091776.0, + "grad_norm": 0.11473976054569617, + "language_loss": 0.79569989, + "learning_rate": 0.0008241432443798364, + "loss": 0.80725753, + "num_input_tokens_seen": 127407328, + "router_z_loss_mlp": 0.43774414, + "step": 1545, + "time_per_iteration": 2.8056137561798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154045, + "balance_loss_mlp": 1.11160624, + "epoch": 0.29742208541746823, + "flos": 597125385216.0, + "grad_norm": 0.05050947415994233, + "language_loss": 0.86053026, + "learning_rate": 0.0008239059742493512, + "loss": 0.87207067, + "num_input_tokens_seen": 127477136, + "router_z_loss_mlp": 0.42456055, + "step": 1546, + "time_per_iteration": 2.6890687942504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146751, + "balance_loss_mlp": 1.10383546, + "epoch": 0.2976144671027318, + "flos": 769882558464.0, + "grad_norm": 0.060404475813103174, + "language_loss": 0.87675822, + "learning_rate": 0.0008236685783674142, + "loss": 0.88822567, + "num_input_tokens_seen": 127565680, + "router_z_loss_mlp": 0.42944336, + "step": 1547, + "time_per_iteration": 3.0594639778137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176135, + "balance_loss_mlp": 1.15439153, + "epoch": 0.2978068487879954, + "flos": 1484764162560.0, + "grad_norm": 0.05730794129930028, + "language_loss": 0.76221192, + "learning_rate": 0.0008234310568261911, + "loss": 0.77397329, + "num_input_tokens_seen": 127791584, + "router_z_loss_mlp": 0.21777344, + "step": 1548, + "time_per_iteration": 4.907459020614624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115635, + "balance_loss_mlp": 1.11174202, + "epoch": 0.29799923047325894, + "flos": 475328632320.0, + "grad_norm": 0.08902597202075696, + "language_loss": 0.82813615, + "learning_rate": 0.0008231934097178955, + "loss": 0.83969963, + "num_input_tokens_seen": 127860112, + "router_z_loss_mlp": 0.44604492, + "step": 1549, + "time_per_iteration": 2.622082471847534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147675, + "balance_loss_mlp": 1.1013267, + "epoch": 0.2981916121585225, + "flos": 759804198912.0, + "grad_norm": 0.06733871211748228, + "language_loss": 0.85700476, + "learning_rate": 0.0008229556371347903, + "loss": 0.86848152, + "num_input_tokens_seen": 127938752, + "router_z_loss_mlp": 0.46362305, + "step": 1550, + "time_per_iteration": 3.0081942081451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133769, + "balance_loss_mlp": 1.09018564, + "epoch": 0.29838399384378606, + "flos": 875016152064.0, + "grad_norm": 0.09176779567237862, + "language_loss": 0.79384351, + "learning_rate": 0.0008227177391691874, + "loss": 0.80518115, + "num_input_tokens_seen": 128022192, + "router_z_loss_mlp": 0.43554688, + "step": 1551, + "time_per_iteration": 3.1698648929595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126053, + "balance_loss_mlp": 1.08218408, + "epoch": 0.29857637552904964, + "flos": 579661608960.0, + "grad_norm": 0.07033401560901072, + "language_loss": 0.89799201, + "learning_rate": 0.0008224797159134463, + "loss": 0.90925252, + "num_input_tokens_seen": 128097776, + "router_z_loss_mlp": 0.4387207, + "step": 1552, + "time_per_iteration": 2.714494228363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118143, + "balance_loss_mlp": 1.07816052, + "epoch": 0.2987687572143132, + "flos": 836399748096.0, + "grad_norm": 0.05144631995573129, + "language_loss": 0.83942962, + "learning_rate": 0.0008222415674599765, + "loss": 0.85061103, + "num_input_tokens_seen": 128179888, + "router_z_loss_mlp": 0.39990234, + "step": 1553, + "time_per_iteration": 3.0642828941345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130247, + "balance_loss_mlp": 1.08563888, + "epoch": 0.29896113889957676, + "flos": 567072741888.0, + "grad_norm": 0.07574846124683007, + "language_loss": 0.83871847, + "learning_rate": 0.0008220032939012349, + "loss": 0.85002089, + "num_input_tokens_seen": 128251152, + "router_z_loss_mlp": 0.44628906, + "step": 1554, + "time_per_iteration": 2.714172840118408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129637, + "balance_loss_mlp": 1.08810425, + "epoch": 0.29915352058484035, + "flos": 498662853120.0, + "grad_norm": 0.05026836342639273, + "language_loss": 0.8851645, + "learning_rate": 0.0008217648953297277, + "loss": 0.89646089, + "num_input_tokens_seen": 128327600, + "router_z_loss_mlp": 0.41503906, + "step": 1555, + "time_per_iteration": 2.8413305282592773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139651, + "balance_loss_mlp": 1.09692693, + "epoch": 0.2993459022701039, + "flos": 592112083968.0, + "grad_norm": 0.07726233455877282, + "language_loss": 0.78621179, + "learning_rate": 0.0008215263718380095, + "loss": 0.79760832, + "num_input_tokens_seen": 128398432, + "router_z_loss_mlp": 0.42749023, + "step": 1556, + "time_per_iteration": 2.6995439529418945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153013, + "balance_loss_mlp": 1.10766625, + "epoch": 0.29953828395536747, + "flos": 572380079616.0, + "grad_norm": 0.07367356569931041, + "language_loss": 0.8461448, + "learning_rate": 0.0008212877235186833, + "loss": 0.85767496, + "num_input_tokens_seen": 128469696, + "router_z_loss_mlp": 0.45361328, + "step": 1557, + "time_per_iteration": 2.655294895172119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105489, + "balance_loss_mlp": 1.09290004, + "epoch": 0.299730665640631, + "flos": 1504698425856.0, + "grad_norm": 0.039126881386902713, + "language_loss": 0.77737558, + "learning_rate": 0.0008210489504644005, + "loss": 0.78843045, + "num_input_tokens_seen": 128698560, + "router_z_loss_mlp": 0.12597656, + "step": 1558, + "time_per_iteration": 4.953773021697998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148338, + "balance_loss_mlp": 1.10647154, + "epoch": 0.2999230473258946, + "flos": 513791963136.0, + "grad_norm": 0.07045252665170362, + "language_loss": 0.81300378, + "learning_rate": 0.0008208100527678611, + "loss": 0.82448721, + "num_input_tokens_seen": 128765952, + "router_z_loss_mlp": 0.41870117, + "step": 1559, + "time_per_iteration": 2.5706257820129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142691, + "balance_loss_mlp": 1.10223174, + "epoch": 0.3001154290111581, + "flos": 834472544256.0, + "grad_norm": 0.09371754463761041, + "language_loss": 0.79173958, + "learning_rate": 0.0008205710305218135, + "loss": 0.80316657, + "num_input_tokens_seen": 128840048, + "router_z_loss_mlp": 0.40454102, + "step": 1560, + "time_per_iteration": 3.001490354537964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152428, + "balance_loss_mlp": 1.11292171, + "epoch": 0.3003078106964217, + "flos": 556776695808.0, + "grad_norm": 0.06044421333553386, + "language_loss": 0.90459639, + "learning_rate": 0.0008203318838190541, + "loss": 0.91612065, + "num_input_tokens_seen": 128912496, + "router_z_loss_mlp": 0.39501953, + "step": 1561, + "time_per_iteration": 2.753243923187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166566, + "balance_loss_mlp": 1.1229353, + "epoch": 0.30050019238168524, + "flos": 526151033856.0, + "grad_norm": 0.07449479195038491, + "language_loss": 0.85542631, + "learning_rate": 0.0008200926127524281, + "loss": 0.86709195, + "num_input_tokens_seen": 128980624, + "router_z_loss_mlp": 0.43676758, + "step": 1562, + "time_per_iteration": 2.6388282775878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184921, + "balance_loss_mlp": 1.14045644, + "epoch": 0.3006925740669488, + "flos": 577852973568.0, + "grad_norm": 0.07268784417656445, + "language_loss": 0.83160597, + "learning_rate": 0.0008198532174148289, + "loss": 0.8434552, + "num_input_tokens_seen": 129050576, + "router_z_loss_mlp": 0.44482422, + "step": 1563, + "time_per_iteration": 2.71712589263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076623, + "balance_loss_mlp": 1.06308043, + "epoch": 0.3008849557522124, + "flos": 1490246595072.0, + "grad_norm": 0.03416296623034226, + "language_loss": 0.8068617, + "learning_rate": 0.0008196136978991977, + "loss": 0.81762791, + "num_input_tokens_seen": 129278880, + "router_z_loss_mlp": 0.13574219, + "step": 1564, + "time_per_iteration": 4.830719232559204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194058, + "balance_loss_mlp": 1.15185785, + "epoch": 0.30107733743747594, + "flos": 509816415744.0, + "grad_norm": 0.08914748552149089, + "language_loss": 0.88889605, + "learning_rate": 0.0008193740542985244, + "loss": 0.90083665, + "num_input_tokens_seen": 129346560, + "router_z_loss_mlp": 0.421875, + "step": 1565, + "time_per_iteration": 2.6047041416168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199035, + "balance_loss_mlp": 1.15647733, + "epoch": 0.30126971912273953, + "flos": 587704108032.0, + "grad_norm": 0.07863054385005203, + "language_loss": 0.8685202, + "learning_rate": 0.0008191342867058467, + "loss": 0.88051057, + "num_input_tokens_seen": 129420448, + "router_z_loss_mlp": 0.42578125, + "step": 1566, + "time_per_iteration": 2.715708017349243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196816, + "balance_loss_mlp": 1.15280378, + "epoch": 0.30146210080800306, + "flos": 602101610496.0, + "grad_norm": 0.087093537774187, + "language_loss": 0.83839655, + "learning_rate": 0.0008188943952142509, + "loss": 0.85036469, + "num_input_tokens_seen": 129494032, + "router_z_loss_mlp": 0.43994141, + "step": 1567, + "time_per_iteration": 2.831888198852539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118972, + "balance_loss_mlp": 1.14663815, + "epoch": 0.30165448249326665, + "flos": 917796054528.0, + "grad_norm": 0.09637975850341399, + "language_loss": 0.82476509, + "learning_rate": 0.0008186543799168711, + "loss": 0.83666229, + "num_input_tokens_seen": 129569088, + "router_z_loss_mlp": 0.43041992, + "step": 1568, + "time_per_iteration": 3.121755599975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177064, + "balance_loss_mlp": 1.13324285, + "epoch": 0.3018468641785302, + "flos": 777287798784.0, + "grad_norm": 0.08024736909630528, + "language_loss": 0.88665748, + "learning_rate": 0.0008184142409068892, + "loss": 0.89842814, + "num_input_tokens_seen": 129647968, + "router_z_loss_mlp": 0.43847656, + "step": 1569, + "time_per_iteration": 2.990497350692749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163968, + "balance_loss_mlp": 1.12343669, + "epoch": 0.30203924586379377, + "flos": 522358295040.0, + "grad_norm": 0.05684047424393967, + "language_loss": 0.86850333, + "learning_rate": 0.000818173978277536, + "loss": 0.88014305, + "num_input_tokens_seen": 129718928, + "router_z_loss_mlp": 0.40551758, + "step": 1570, + "time_per_iteration": 2.636310338973999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171599, + "balance_loss_mlp": 1.12956595, + "epoch": 0.3022316275490573, + "flos": 524559711744.0, + "grad_norm": 0.07636807389642969, + "language_loss": 0.84349716, + "learning_rate": 0.000817933592122089, + "loss": 0.85521317, + "num_input_tokens_seen": 129790128, + "router_z_loss_mlp": 0.4206543, + "step": 1571, + "time_per_iteration": 2.699178695678711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163998, + "balance_loss_mlp": 1.11984301, + "epoch": 0.3024240092343209, + "flos": 479912076288.0, + "grad_norm": 0.07546742874281152, + "language_loss": 0.83585215, + "learning_rate": 0.0008176930825338749, + "loss": 0.8474921, + "num_input_tokens_seen": 129857536, + "router_z_loss_mlp": 0.44189453, + "step": 1572, + "time_per_iteration": 2.550837516784668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166441, + "balance_loss_mlp": 1.12385964, + "epoch": 0.3026163909195845, + "flos": 687206592000.0, + "grad_norm": 0.07092433148156627, + "language_loss": 0.89086282, + "learning_rate": 0.0008174524496062679, + "loss": 0.90252721, + "num_input_tokens_seen": 129931440, + "router_z_loss_mlp": 0.42578125, + "step": 1573, + "time_per_iteration": 2.883683919906616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116421, + "balance_loss_mlp": 1.11907697, + "epoch": 0.302808772604848, + "flos": 542940102144.0, + "grad_norm": 0.061103918995996154, + "language_loss": 0.8587321, + "learning_rate": 0.0008172116934326894, + "loss": 0.8703742, + "num_input_tokens_seen": 130005200, + "router_z_loss_mlp": 0.45092773, + "step": 1574, + "time_per_iteration": 2.7379467487335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162954, + "balance_loss_mlp": 1.12132585, + "epoch": 0.3030011542901116, + "flos": 475091495424.0, + "grad_norm": 0.07023429776023385, + "language_loss": 0.87709713, + "learning_rate": 0.0008169708141066097, + "loss": 0.88872665, + "num_input_tokens_seen": 130069136, + "router_z_loss_mlp": 0.41625977, + "step": 1575, + "time_per_iteration": 2.571963310241699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154168, + "balance_loss_mlp": 1.11435199, + "epoch": 0.30319353597537513, + "flos": 481481003520.0, + "grad_norm": 0.11601472076904104, + "language_loss": 0.90864658, + "learning_rate": 0.0008167298117215465, + "loss": 0.92018831, + "num_input_tokens_seen": 130135456, + "router_z_loss_mlp": 0.39819336, + "step": 1576, + "time_per_iteration": 2.562636375427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153517, + "balance_loss_mlp": 1.11141217, + "epoch": 0.3033859176606387, + "flos": 704786365440.0, + "grad_norm": 0.08960201833145559, + "language_loss": 0.88355744, + "learning_rate": 0.0008164886863710649, + "loss": 0.89509267, + "num_input_tokens_seen": 130213712, + "router_z_loss_mlp": 0.42138672, + "step": 1577, + "time_per_iteration": 2.921163320541382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151824, + "balance_loss_mlp": 1.11212754, + "epoch": 0.30357829934590225, + "flos": 764696360448.0, + "grad_norm": 0.07034131144929774, + "language_loss": 0.86199445, + "learning_rate": 0.0008162474381487783, + "loss": 0.87351274, + "num_input_tokens_seen": 130290928, + "router_z_loss_mlp": 0.39697266, + "step": 1578, + "time_per_iteration": 3.029076337814331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152316, + "balance_loss_mlp": 1.11016417, + "epoch": 0.30377068103116583, + "flos": 532355162112.0, + "grad_norm": 0.07584256466560314, + "language_loss": 0.85196549, + "learning_rate": 0.0008160060671483475, + "loss": 0.86348867, + "num_input_tokens_seen": 130362672, + "router_z_loss_mlp": 0.42163086, + "step": 1579, + "time_per_iteration": 2.7073986530303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142614, + "balance_loss_mlp": 1.10289371, + "epoch": 0.3039630627164294, + "flos": 510191944704.0, + "grad_norm": 0.08686038732079729, + "language_loss": 0.83729678, + "learning_rate": 0.0008157645734634809, + "loss": 0.84872293, + "num_input_tokens_seen": 130428848, + "router_z_loss_mlp": 0.3972168, + "step": 1580, + "time_per_iteration": 2.6613049507141113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090857, + "balance_loss_mlp": 1.07302368, + "epoch": 0.30415544440169295, + "flos": 1506000854016.0, + "grad_norm": 0.0332286598930082, + "language_loss": 0.76896489, + "learning_rate": 0.000815522957187935, + "loss": 0.77987349, + "num_input_tokens_seen": 130665440, + "router_z_loss_mlp": 0.17871094, + "step": 1581, + "time_per_iteration": 4.915473699569702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074598, + "balance_loss_mlp": 1.05705047, + "epoch": 0.30434782608695654, + "flos": 1458736625664.0, + "grad_norm": 0.028649014265593315, + "language_loss": 0.73214495, + "learning_rate": 0.0008152812184155132, + "loss": 0.74289095, + "num_input_tokens_seen": 130895248, + "router_z_loss_mlp": 0.17578125, + "step": 1582, + "time_per_iteration": 4.889309883117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129405, + "balance_loss_mlp": 1.08827806, + "epoch": 0.3045402077722201, + "flos": 482555833344.0, + "grad_norm": 0.06812522797045092, + "language_loss": 0.84052569, + "learning_rate": 0.000815039357240067, + "loss": 0.85181975, + "num_input_tokens_seen": 130964544, + "router_z_loss_mlp": 0.41113281, + "step": 1583, + "time_per_iteration": 2.6366286277770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138467, + "balance_loss_mlp": 1.09672034, + "epoch": 0.30473258945748366, + "flos": 543501010944.0, + "grad_norm": 0.06492424308297744, + "language_loss": 0.85869169, + "learning_rate": 0.0008147973737554952, + "loss": 0.87007636, + "num_input_tokens_seen": 131041744, + "router_z_loss_mlp": 0.41748047, + "step": 1584, + "time_per_iteration": 2.7854599952697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136804, + "balance_loss_mlp": 1.095963, + "epoch": 0.3049249711427472, + "flos": 567055489536.0, + "grad_norm": 0.08202571879527615, + "language_loss": 0.86834013, + "learning_rate": 0.000814555268055744, + "loss": 0.87970817, + "num_input_tokens_seen": 131108864, + "router_z_loss_mlp": 0.40844727, + "step": 1585, + "time_per_iteration": 2.6199045181274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132861, + "balance_loss_mlp": 1.09130502, + "epoch": 0.3051173528280108, + "flos": 528233882112.0, + "grad_norm": 0.07393752668393892, + "language_loss": 0.87929702, + "learning_rate": 0.0008143130402348073, + "loss": 0.89062566, + "num_input_tokens_seen": 131181104, + "router_z_loss_mlp": 0.41625977, + "step": 1586, + "time_per_iteration": 2.638741970062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129239, + "balance_loss_mlp": 1.08868384, + "epoch": 0.3053097345132743, + "flos": 586396910592.0, + "grad_norm": 0.06849121050203105, + "language_loss": 0.7939502, + "learning_rate": 0.0008140706903867265, + "loss": 0.80524254, + "num_input_tokens_seen": 131258704, + "router_z_loss_mlp": 0.4050293, + "step": 1587, + "time_per_iteration": 2.810335874557495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134042, + "balance_loss_mlp": 1.0908649, + "epoch": 0.3055021161985379, + "flos": 607087747584.0, + "grad_norm": 0.07851663365650921, + "language_loss": 0.91122121, + "learning_rate": 0.0008138282186055897, + "loss": 0.92256165, + "num_input_tokens_seen": 131325712, + "router_z_loss_mlp": 0.43188477, + "step": 1588, + "time_per_iteration": 2.7237448692321777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137411, + "balance_loss_mlp": 1.09661722, + "epoch": 0.3056944978838015, + "flos": 573867514368.0, + "grad_norm": 0.06832590097240848, + "language_loss": 0.8307212, + "learning_rate": 0.0008135856249855331, + "loss": 0.84209532, + "num_input_tokens_seen": 131397568, + "router_z_loss_mlp": 0.40771484, + "step": 1589, + "time_per_iteration": 2.7399301528930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153972, + "balance_loss_mlp": 1.11241579, + "epoch": 0.305886879569065, + "flos": 633925813248.0, + "grad_norm": 0.09162978556143483, + "language_loss": 0.89933717, + "learning_rate": 0.0008133429096207398, + "loss": 0.91087687, + "num_input_tokens_seen": 131467632, + "router_z_loss_mlp": 0.41577148, + "step": 1590, + "time_per_iteration": 2.8074302673339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01029225, + "balance_loss_mlp": 1.0156827, + "epoch": 0.3060792612543286, + "flos": 1369005981696.0, + "grad_norm": 0.025543227678258826, + "language_loss": 0.75312257, + "learning_rate": 0.0008131000726054403, + "loss": 0.76341486, + "num_input_tokens_seen": 131702224, + "router_z_loss_mlp": 0.13574219, + "step": 1591, + "time_per_iteration": 4.961095094680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153411, + "balance_loss_mlp": 1.11330891, + "epoch": 0.30627164293959214, + "flos": 518555644416.0, + "grad_norm": 0.05628096053427355, + "language_loss": 0.87358719, + "learning_rate": 0.0008128571140339123, + "loss": 0.88512129, + "num_input_tokens_seen": 131774608, + "router_z_loss_mlp": 0.40087891, + "step": 1592, + "time_per_iteration": 2.6484899520874023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137482, + "balance_loss_mlp": 1.09497237, + "epoch": 0.3064640246248557, + "flos": 455589287424.0, + "grad_norm": 0.058132540851188214, + "language_loss": 0.87688839, + "learning_rate": 0.0008126140340004805, + "loss": 0.88826323, + "num_input_tokens_seen": 131841216, + "router_z_loss_mlp": 0.42529297, + "step": 1593, + "time_per_iteration": 2.509239912033081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144438, + "balance_loss_mlp": 1.10316801, + "epoch": 0.30665640631011926, + "flos": 850095378432.0, + "grad_norm": 0.06371804566889869, + "language_loss": 0.82466245, + "learning_rate": 0.0008123708325995172, + "loss": 0.83610678, + "num_input_tokens_seen": 131937584, + "router_z_loss_mlp": 0.4128418, + "step": 1594, + "time_per_iteration": 3.1773130893707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133345, + "balance_loss_mlp": 1.09240818, + "epoch": 0.30684878799538284, + "flos": 758319335424.0, + "grad_norm": 0.06060698504548286, + "language_loss": 0.79972136, + "learning_rate": 0.0008121275099254414, + "loss": 0.81105477, + "num_input_tokens_seen": 132012656, + "router_z_loss_mlp": 0.40942383, + "step": 1595, + "time_per_iteration": 2.9426517486572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142693, + "balance_loss_mlp": 1.10244751, + "epoch": 0.3070411696806464, + "flos": 517574790144.0, + "grad_norm": 0.06149446857353131, + "language_loss": 0.88748306, + "learning_rate": 0.0008118840660727194, + "loss": 0.89890993, + "num_input_tokens_seen": 132083728, + "router_z_loss_mlp": 0.40283203, + "step": 1596, + "time_per_iteration": 2.665166139602661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135904, + "balance_loss_mlp": 1.09553957, + "epoch": 0.30723355136590996, + "flos": 844264207872.0, + "grad_norm": 0.15751252363629464, + "language_loss": 0.88104224, + "learning_rate": 0.0008116405011358644, + "loss": 0.89240128, + "num_input_tokens_seen": 132170896, + "router_z_loss_mlp": 0.40380859, + "step": 1597, + "time_per_iteration": 3.1415486335754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145189, + "balance_loss_mlp": 1.10291696, + "epoch": 0.30742593305117355, + "flos": 466139722752.0, + "grad_norm": 0.06428245482632208, + "language_loss": 0.80117774, + "learning_rate": 0.0008113968152094369, + "loss": 0.81262958, + "num_input_tokens_seen": 132234592, + "router_z_loss_mlp": 0.42285156, + "step": 1598, + "time_per_iteration": 2.50484037399292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140725, + "balance_loss_mlp": 1.09781003, + "epoch": 0.3076183147364371, + "flos": 686591354880.0, + "grad_norm": 0.069373282908973, + "language_loss": 0.82692802, + "learning_rate": 0.0008111530083880438, + "loss": 0.83833528, + "num_input_tokens_seen": 132314720, + "router_z_loss_mlp": 0.42895508, + "step": 1599, + "time_per_iteration": 2.9072136878967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155699, + "balance_loss_mlp": 1.11211586, + "epoch": 0.30781069642170067, + "flos": 614018340864.0, + "grad_norm": 0.09326308305844169, + "language_loss": 0.86715603, + "learning_rate": 0.0008109090807663399, + "loss": 0.87871301, + "num_input_tokens_seen": 132388768, + "router_z_loss_mlp": 0.43554688, + "step": 1600, + "time_per_iteration": 2.8556277751922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154517, + "balance_loss_mlp": 1.1142, + "epoch": 0.3080030781069642, + "flos": 590318129664.0, + "grad_norm": 0.07163974647376076, + "language_loss": 0.89029115, + "learning_rate": 0.0008106650324390257, + "loss": 0.90183634, + "num_input_tokens_seen": 132472544, + "router_z_loss_mlp": 0.40307617, + "step": 1601, + "time_per_iteration": 2.8016483783721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115055, + "balance_loss_mlp": 1.10768259, + "epoch": 0.3081954597922278, + "flos": 562620349440.0, + "grad_norm": 0.06437682840273379, + "language_loss": 0.81480461, + "learning_rate": 0.0008104208635008493, + "loss": 0.82631016, + "num_input_tokens_seen": 132541968, + "router_z_loss_mlp": 0.42871094, + "step": 1602, + "time_per_iteration": 2.6587209701538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150496, + "balance_loss_mlp": 1.10631728, + "epoch": 0.3083878414774913, + "flos": 447830913024.0, + "grad_norm": 0.13502170342564263, + "language_loss": 0.8243258, + "learning_rate": 0.0008101765740466058, + "loss": 0.83583081, + "num_input_tokens_seen": 132606976, + "router_z_loss_mlp": 0.44165039, + "step": 1603, + "time_per_iteration": 2.506427049636841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144916, + "balance_loss_mlp": 1.10135674, + "epoch": 0.3085802231627549, + "flos": 493546037760.0, + "grad_norm": 0.0649160929519563, + "language_loss": 0.84340334, + "learning_rate": 0.0008099321641711364, + "loss": 0.85485256, + "num_input_tokens_seen": 132677984, + "router_z_loss_mlp": 0.43579102, + "step": 1604, + "time_per_iteration": 2.6318166255950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151756, + "balance_loss_mlp": 1.10938883, + "epoch": 0.3087726048480185, + "flos": 487687703040.0, + "grad_norm": 0.0523010874933109, + "language_loss": 0.83940029, + "learning_rate": 0.0008096876339693295, + "loss": 0.85091782, + "num_input_tokens_seen": 132749136, + "router_z_loss_mlp": 0.42407227, + "step": 1605, + "time_per_iteration": 2.620199680328369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150228, + "balance_loss_mlp": 1.1086241, + "epoch": 0.308964986533282, + "flos": 730589248512.0, + "grad_norm": 0.07539888612246932, + "language_loss": 0.8184768, + "learning_rate": 0.0008094429835361206, + "loss": 0.82997912, + "num_input_tokens_seen": 132823824, + "router_z_loss_mlp": 0.41625977, + "step": 1606, + "time_per_iteration": 2.9251575469970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147276, + "balance_loss_mlp": 1.10679281, + "epoch": 0.3091573682185456, + "flos": 605407592448.0, + "grad_norm": 0.07700051037162058, + "language_loss": 0.85932112, + "learning_rate": 0.0008091982129664908, + "loss": 0.87079388, + "num_input_tokens_seen": 132895936, + "router_z_loss_mlp": 0.40478516, + "step": 1607, + "time_per_iteration": 2.7032129764556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169169, + "balance_loss_mlp": 1.12427497, + "epoch": 0.30934974990380915, + "flos": 460325804544.0, + "grad_norm": 0.11394505928871175, + "language_loss": 0.83292013, + "learning_rate": 0.0008089533223554687, + "loss": 0.84461182, + "num_input_tokens_seen": 132968960, + "router_z_loss_mlp": 0.44897461, + "step": 1608, + "time_per_iteration": 2.6975207328796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161949, + "balance_loss_mlp": 1.12115526, + "epoch": 0.30954213158907273, + "flos": 553426297344.0, + "grad_norm": 0.06275490202685644, + "language_loss": 0.85402906, + "learning_rate": 0.0008087083117981294, + "loss": 0.86564851, + "num_input_tokens_seen": 133048448, + "router_z_loss_mlp": 0.40795898, + "step": 1609, + "time_per_iteration": 2.8709142208099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158469, + "balance_loss_mlp": 1.11402774, + "epoch": 0.30973451327433627, + "flos": 553043427840.0, + "grad_norm": 0.06357956742359384, + "language_loss": 0.88521934, + "learning_rate": 0.0008084631813895943, + "loss": 0.89680409, + "num_input_tokens_seen": 133121680, + "router_z_loss_mlp": 0.44433594, + "step": 1610, + "time_per_iteration": 2.7704904079437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148821, + "balance_loss_mlp": 1.1059773, + "epoch": 0.30992689495959985, + "flos": 565696535040.0, + "grad_norm": 0.07818022356789546, + "language_loss": 0.84349322, + "learning_rate": 0.0008082179312250315, + "loss": 0.85498142, + "num_input_tokens_seen": 133190176, + "router_z_loss_mlp": 0.42871094, + "step": 1611, + "time_per_iteration": 2.6352171897888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118188, + "balance_loss_mlp": 1.10588562, + "epoch": 0.3101192766448634, + "flos": 1442406776832.0, + "grad_norm": 0.03204939869531237, + "language_loss": 0.79855847, + "learning_rate": 0.0008079725613996555, + "loss": 0.8097403, + "num_input_tokens_seen": 133420512, + "router_z_loss_mlp": 0.12255859, + "step": 1612, + "time_per_iteration": 4.865812301635742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095093, + "balance_loss_mlp": 1.08288634, + "epoch": 0.31031165833012697, + "flos": 1531892570112.0, + "grad_norm": 0.024031397097536, + "language_loss": 0.76629329, + "learning_rate": 0.0008077270720087273, + "loss": 0.77724421, + "num_input_tokens_seen": 133651984, + "router_z_loss_mlp": 0.12207031, + "step": 1613, + "time_per_iteration": 5.057459831237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163336, + "balance_loss_mlp": 1.12020612, + "epoch": 0.31050404001539056, + "flos": 991952676864.0, + "grad_norm": 0.056757119691581794, + "language_loss": 0.82255232, + "learning_rate": 0.0008074814631475545, + "loss": 0.83418566, + "num_input_tokens_seen": 133741648, + "router_z_loss_mlp": 0.43139648, + "step": 1614, + "time_per_iteration": 3.3026204109191895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164621, + "balance_loss_mlp": 1.12153852, + "epoch": 0.3106964217006541, + "flos": 445992542208.0, + "grad_norm": 0.0685570598787085, + "language_loss": 0.79806983, + "learning_rate": 0.0008072357349114907, + "loss": 0.80971605, + "num_input_tokens_seen": 133813344, + "router_z_loss_mlp": 0.4309082, + "step": 1615, + "time_per_iteration": 2.663853645324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187873, + "balance_loss_mlp": 1.14369345, + "epoch": 0.3108888033859177, + "flos": 510505804800.0, + "grad_norm": 0.06371446427292905, + "language_loss": 0.8904891, + "learning_rate": 0.0008069898873959363, + "loss": 0.90236783, + "num_input_tokens_seen": 133884192, + "router_z_loss_mlp": 0.44189453, + "step": 1616, + "time_per_iteration": 2.675607919692993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199097, + "balance_loss_mlp": 1.15773141, + "epoch": 0.3110811850711812, + "flos": 520732468224.0, + "grad_norm": 0.10138062428343411, + "language_loss": 0.8626408, + "learning_rate": 0.0008067439206963375, + "loss": 0.87463176, + "num_input_tokens_seen": 133954848, + "router_z_loss_mlp": 0.41381836, + "step": 1617, + "time_per_iteration": 2.6264841556549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193178, + "balance_loss_mlp": 1.15119278, + "epoch": 0.3112735667564448, + "flos": 686413315584.0, + "grad_norm": 0.06654120721966555, + "language_loss": 0.8650856, + "learning_rate": 0.0008064978349081873, + "loss": 0.87701744, + "num_input_tokens_seen": 134031824, + "router_z_loss_mlp": 0.41967773, + "step": 1618, + "time_per_iteration": 2.9114232063293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180658, + "balance_loss_mlp": 1.13712287, + "epoch": 0.31146594844170833, + "flos": 533061803520.0, + "grad_norm": 0.06279818174684408, + "language_loss": 0.86905777, + "learning_rate": 0.0008062516301270245, + "loss": 0.88086432, + "num_input_tokens_seen": 134104480, + "router_z_loss_mlp": 0.43530273, + "step": 1619, + "time_per_iteration": 2.697016477584839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174783, + "balance_loss_mlp": 1.13341749, + "epoch": 0.3116583301269719, + "flos": 679517227008.0, + "grad_norm": 0.07259268941115717, + "language_loss": 0.89074606, + "learning_rate": 0.0008060053064484343, + "loss": 0.90249389, + "num_input_tokens_seen": 134185632, + "router_z_loss_mlp": 0.41381836, + "step": 1620, + "time_per_iteration": 2.9220941066741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160759, + "balance_loss_mlp": 1.11996579, + "epoch": 0.31185071181223545, + "flos": 586149861888.0, + "grad_norm": 0.054906942105454146, + "language_loss": 0.85286081, + "learning_rate": 0.0008057588639680482, + "loss": 0.8644684, + "num_input_tokens_seen": 134261600, + "router_z_loss_mlp": 0.40795898, + "step": 1621, + "time_per_iteration": 2.7432475090026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161698, + "balance_loss_mlp": 1.11754274, + "epoch": 0.31204309349749904, + "flos": 725403050496.0, + "grad_norm": 0.08428579582226577, + "language_loss": 0.83045304, + "learning_rate": 0.0008055123027815434, + "loss": 0.84207004, + "num_input_tokens_seen": 134334368, + "router_z_loss_mlp": 0.44165039, + "step": 1622, + "time_per_iteration": 2.888124465942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149406, + "balance_loss_mlp": 1.10947073, + "epoch": 0.3122354751827626, + "flos": 576825131520.0, + "grad_norm": 0.06442378780427988, + "language_loss": 0.85635763, + "learning_rate": 0.0008052656229846436, + "loss": 0.86785173, + "num_input_tokens_seen": 134403824, + "router_z_loss_mlp": 0.39916992, + "step": 1623, + "time_per_iteration": 2.7215354442596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154937, + "balance_loss_mlp": 1.11259365, + "epoch": 0.31242785686802615, + "flos": 575943022080.0, + "grad_norm": 0.1013205930173775, + "language_loss": 0.90875685, + "learning_rate": 0.0008050188246731182, + "loss": 0.92030621, + "num_input_tokens_seen": 134471296, + "router_z_loss_mlp": 0.42333984, + "step": 1624, + "time_per_iteration": 2.6636321544647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146822, + "balance_loss_mlp": 1.10655355, + "epoch": 0.31262023855328974, + "flos": 736830452736.0, + "grad_norm": 0.08961406202901398, + "language_loss": 0.82641953, + "learning_rate": 0.0008047719079427834, + "loss": 0.83788776, + "num_input_tokens_seen": 134551360, + "router_z_loss_mlp": 0.40283203, + "step": 1625, + "time_per_iteration": 2.9943442344665527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067888, + "balance_loss_mlp": 1.05425012, + "epoch": 0.3128126202385533, + "flos": 1559232073728.0, + "grad_norm": 0.02225722433359613, + "language_loss": 0.74351704, + "learning_rate": 0.0008045248728895, + "loss": 0.75419593, + "num_input_tokens_seen": 134761328, + "router_z_loss_mlp": 0.13671875, + "step": 1626, + "time_per_iteration": 4.865052700042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124122, + "balance_loss_mlp": 1.0819937, + "epoch": 0.31300500192381686, + "flos": 514921121280.0, + "grad_norm": 0.05828883069087806, + "language_loss": 0.86570215, + "learning_rate": 0.0008042777196091757, + "loss": 0.87694335, + "num_input_tokens_seen": 134833136, + "router_z_loss_mlp": 0.42138672, + "step": 1627, + "time_per_iteration": 2.668349266052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127492, + "balance_loss_mlp": 1.08481538, + "epoch": 0.3131973836090804, + "flos": 526627878912.0, + "grad_norm": 0.08399253674550058, + "language_loss": 0.82332879, + "learning_rate": 0.0008040304481977643, + "loss": 0.83460367, + "num_input_tokens_seen": 134904352, + "router_z_loss_mlp": 0.42675781, + "step": 1628, + "time_per_iteration": 2.6445093154907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130913, + "balance_loss_mlp": 1.09224153, + "epoch": 0.313389765294344, + "flos": 822820114944.0, + "grad_norm": 0.06122809929096989, + "language_loss": 0.86751842, + "learning_rate": 0.0008037830587512649, + "loss": 0.87882763, + "num_input_tokens_seen": 134984880, + "router_z_loss_mlp": 0.38671875, + "step": 1629, + "time_per_iteration": 3.0830209255218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131503, + "balance_loss_mlp": 1.09068549, + "epoch": 0.31358214697960757, + "flos": 393823669248.0, + "grad_norm": 0.06235185724616104, + "language_loss": 0.7940957, + "learning_rate": 0.0008035355513657224, + "loss": 0.80541074, + "num_input_tokens_seen": 135047456, + "router_z_loss_mlp": 0.40820312, + "step": 1630, + "time_per_iteration": 2.4804115295410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135326, + "balance_loss_mlp": 1.09326935, + "epoch": 0.3137745286648711, + "flos": 571908003840.0, + "grad_norm": 0.06249119555349938, + "language_loss": 0.9321425, + "learning_rate": 0.0008032879261372279, + "loss": 0.94349587, + "num_input_tokens_seen": 135124256, + "router_z_loss_mlp": 0.42089844, + "step": 1631, + "time_per_iteration": 2.7995047569274902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0103074, + "balance_loss_mlp": 1.01777005, + "epoch": 0.3139669103501347, + "flos": 1498415376384.0, + "grad_norm": 0.019617221588718974, + "language_loss": 0.79635841, + "learning_rate": 0.0008030401831619178, + "loss": 0.80666578, + "num_input_tokens_seen": 135353024, + "router_z_loss_mlp": 0.12988281, + "step": 1632, + "time_per_iteration": 5.3968565464019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149415, + "balance_loss_mlp": 1.10959888, + "epoch": 0.3141592920353982, + "flos": 525343076352.0, + "grad_norm": 0.05783646939860944, + "language_loss": 0.87576675, + "learning_rate": 0.0008027923225359748, + "loss": 0.88726091, + "num_input_tokens_seen": 135422464, + "router_z_loss_mlp": 0.39819336, + "step": 1633, + "time_per_iteration": 2.5933566093444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153635, + "balance_loss_mlp": 1.11145878, + "epoch": 0.3143516737206618, + "flos": 593268406272.0, + "grad_norm": 0.05944909670445279, + "language_loss": 0.88579285, + "learning_rate": 0.0008025443443556267, + "loss": 0.89732921, + "num_input_tokens_seen": 135490928, + "router_z_loss_mlp": 0.421875, + "step": 1634, + "time_per_iteration": 2.728522777557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149168, + "balance_loss_mlp": 1.109519, + "epoch": 0.31454405540592534, + "flos": 648362589696.0, + "grad_norm": 0.0772983201911997, + "language_loss": 0.88333809, + "learning_rate": 0.000802296248717147, + "loss": 0.89482975, + "num_input_tokens_seen": 135576288, + "router_z_loss_mlp": 0.39648438, + "step": 1635, + "time_per_iteration": 2.9030401706695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140791, + "balance_loss_mlp": 1.0971607, + "epoch": 0.3147364370911889, + "flos": 642847850496.0, + "grad_norm": 0.06629024784700413, + "language_loss": 0.7930302, + "learning_rate": 0.0008020480357168554, + "loss": 0.80443811, + "num_input_tokens_seen": 135652320, + "router_z_loss_mlp": 0.43603516, + "step": 1636, + "time_per_iteration": 2.839134931564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145583, + "balance_loss_mlp": 1.1038121, + "epoch": 0.31492881877645246, + "flos": 471849753600.0, + "grad_norm": 0.06656267016529639, + "language_loss": 0.88396037, + "learning_rate": 0.0008017997054511165, + "loss": 0.89541626, + "num_input_tokens_seen": 135719632, + "router_z_loss_mlp": 0.41796875, + "step": 1637, + "time_per_iteration": 2.5937085151672363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148115, + "balance_loss_mlp": 1.10424566, + "epoch": 0.31512120046171604, + "flos": 629433773568.0, + "grad_norm": 0.06622170213435077, + "language_loss": 0.85649616, + "learning_rate": 0.0008015512580163407, + "loss": 0.86797726, + "num_input_tokens_seen": 135796544, + "router_z_loss_mlp": 0.43896484, + "step": 1638, + "time_per_iteration": 2.8432726860046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138121, + "balance_loss_mlp": 1.09639752, + "epoch": 0.31531358214697963, + "flos": 703778347008.0, + "grad_norm": 0.06676164925493694, + "language_loss": 0.81149763, + "learning_rate": 0.0008013026935089838, + "loss": 0.82287884, + "num_input_tokens_seen": 135871344, + "router_z_loss_mlp": 0.41699219, + "step": 1639, + "time_per_iteration": 2.8703761100769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142857, + "balance_loss_mlp": 1.1031127, + "epoch": 0.31550596383224316, + "flos": 572545635840.0, + "grad_norm": 0.060786667538297263, + "language_loss": 0.84702241, + "learning_rate": 0.0008010540120255472, + "loss": 0.85845095, + "num_input_tokens_seen": 135944320, + "router_z_loss_mlp": 0.3972168, + "step": 1640, + "time_per_iteration": 2.6741273403167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136768, + "balance_loss_mlp": 1.09511614, + "epoch": 0.31569834551750675, + "flos": 658340006400.0, + "grad_norm": 0.06934658167266547, + "language_loss": 0.86723542, + "learning_rate": 0.0008008052136625774, + "loss": 0.8786031, + "num_input_tokens_seen": 136019456, + "router_z_loss_mlp": 0.41650391, + "step": 1641, + "time_per_iteration": 2.8395094871520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135427, + "balance_loss_mlp": 1.09272623, + "epoch": 0.3158907272027703, + "flos": 566282036736.0, + "grad_norm": 0.07613576058544219, + "language_loss": 0.87025082, + "learning_rate": 0.0008005562985166666, + "loss": 0.88160515, + "num_input_tokens_seen": 136091232, + "router_z_loss_mlp": 0.42675781, + "step": 1642, + "time_per_iteration": 2.708812713623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127626, + "balance_loss_mlp": 1.08621287, + "epoch": 0.31608310888803387, + "flos": 536891618304.0, + "grad_norm": 0.05118616143218352, + "language_loss": 0.85440576, + "learning_rate": 0.0008003072666844524, + "loss": 0.86568201, + "num_input_tokens_seen": 136165088, + "router_z_loss_mlp": 0.41430664, + "step": 1643, + "time_per_iteration": 2.74019193649292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127922, + "balance_loss_mlp": 1.08746231, + "epoch": 0.3162754905732974, + "flos": 486669772800.0, + "grad_norm": 0.07457594622010144, + "language_loss": 0.82632107, + "learning_rate": 0.0008000581182626173, + "loss": 0.83760029, + "num_input_tokens_seen": 136230368, + "router_z_loss_mlp": 0.40478516, + "step": 1644, + "time_per_iteration": 2.6125125885009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011327, + "balance_loss_mlp": 1.09159672, + "epoch": 0.316467872258561, + "flos": 530052429312.0, + "grad_norm": 0.0586598658040055, + "language_loss": 0.86714005, + "learning_rate": 0.0007998088533478894, + "loss": 0.87846708, + "num_input_tokens_seen": 136302512, + "router_z_loss_mlp": 0.41137695, + "step": 1645, + "time_per_iteration": 2.674678087234497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130403, + "balance_loss_mlp": 1.08805966, + "epoch": 0.3166602539438245, + "flos": 443440189440.0, + "grad_norm": 0.10428151324619617, + "language_loss": 0.84319067, + "learning_rate": 0.000799559472037042, + "loss": 0.85449469, + "num_input_tokens_seen": 136368064, + "router_z_loss_mlp": 0.4230957, + "step": 1646, + "time_per_iteration": 2.5389983654022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130022, + "balance_loss_mlp": 1.08939528, + "epoch": 0.3168526356290881, + "flos": 645830060544.0, + "grad_norm": 0.05498023868715711, + "language_loss": 0.8798641, + "learning_rate": 0.0007993099744268932, + "loss": 0.8911643, + "num_input_tokens_seen": 136451520, + "router_z_loss_mlp": 0.40625, + "step": 1647, + "time_per_iteration": 2.919410467147827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127448, + "balance_loss_mlp": 1.0858674, + "epoch": 0.3170450173143517, + "flos": 586162344960.0, + "grad_norm": 0.07648109375468225, + "language_loss": 0.88298547, + "learning_rate": 0.000799060360614307, + "loss": 0.89425999, + "num_input_tokens_seen": 136521184, + "router_z_loss_mlp": 0.41577148, + "step": 1648, + "time_per_iteration": 2.679098606109619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132184, + "balance_loss_mlp": 1.09117627, + "epoch": 0.3172373989996152, + "flos": 827124203520.0, + "grad_norm": 0.17676844539598618, + "language_loss": 0.83707428, + "learning_rate": 0.0007988106306961917, + "loss": 0.84839618, + "num_input_tokens_seen": 136612592, + "router_z_loss_mlp": 0.41015625, + "step": 1649, + "time_per_iteration": 3.1304876804351807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139719, + "balance_loss_mlp": 1.09809113, + "epoch": 0.3174297806848788, + "flos": 527408672256.0, + "grad_norm": 0.06731506602110418, + "language_loss": 0.84557772, + "learning_rate": 0.0007985607847695014, + "loss": 0.85697484, + "num_input_tokens_seen": 136684336, + "router_z_loss_mlp": 0.41625977, + "step": 1650, + "time_per_iteration": 2.6152966022491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151307, + "balance_loss_mlp": 1.11087108, + "epoch": 0.31762216237014235, + "flos": 713179800576.0, + "grad_norm": 0.08658277444707524, + "language_loss": 0.83160597, + "learning_rate": 0.0007983108229312345, + "loss": 0.84311903, + "num_input_tokens_seen": 136766400, + "router_z_loss_mlp": 0.40454102, + "step": 1651, + "time_per_iteration": 2.9157605171203613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180085, + "balance_loss_mlp": 1.13864803, + "epoch": 0.31781454405540593, + "flos": 483813471744.0, + "grad_norm": 0.12326743545136284, + "language_loss": 0.86631948, + "learning_rate": 0.0007980607452784351, + "loss": 0.8781203, + "num_input_tokens_seen": 136834016, + "router_z_loss_mlp": 0.4140625, + "step": 1652, + "time_per_iteration": 2.5533528327941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170234, + "balance_loss_mlp": 1.12798643, + "epoch": 0.31800692574066947, + "flos": 548746679808.0, + "grad_norm": 0.07656805667485655, + "language_loss": 0.90550399, + "learning_rate": 0.0007978105519081919, + "loss": 0.91720629, + "num_input_tokens_seen": 136906288, + "router_z_loss_mlp": 0.42236328, + "step": 1653, + "time_per_iteration": 2.683962821960449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162895, + "balance_loss_mlp": 1.12088561, + "epoch": 0.31819930742593305, + "flos": 516895312896.0, + "grad_norm": 0.06859901935764132, + "language_loss": 0.88378012, + "learning_rate": 0.0007975602429176385, + "loss": 0.89540899, + "num_input_tokens_seen": 136972416, + "router_z_loss_mlp": 0.42041016, + "step": 1654, + "time_per_iteration": 2.563507556915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165514, + "balance_loss_mlp": 1.12421989, + "epoch": 0.31839168911119664, + "flos": 455991980544.0, + "grad_norm": 0.07830522948057009, + "language_loss": 0.81779003, + "learning_rate": 0.0007973098184039536, + "loss": 0.82944512, + "num_input_tokens_seen": 137044576, + "router_z_loss_mlp": 0.4128418, + "step": 1655, + "time_per_iteration": 2.6503560543060303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154556, + "balance_loss_mlp": 1.11433494, + "epoch": 0.3185840707964602, + "flos": 626033816064.0, + "grad_norm": 0.07004293098644994, + "language_loss": 0.87212098, + "learning_rate": 0.0007970592784643602, + "loss": 0.88366652, + "num_input_tokens_seen": 137125120, + "router_z_loss_mlp": 0.40185547, + "step": 1656, + "time_per_iteration": 2.8598649501800537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167315, + "balance_loss_mlp": 1.12366056, + "epoch": 0.31877645248172376, + "flos": 567478006272.0, + "grad_norm": 0.08267452239342069, + "language_loss": 0.8563, + "learning_rate": 0.0007968086231961272, + "loss": 0.86797309, + "num_input_tokens_seen": 137195344, + "router_z_loss_mlp": 0.43676758, + "step": 1657, + "time_per_iteration": 2.637216806411743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158327, + "balance_loss_mlp": 1.11343288, + "epoch": 0.3189688341669873, + "flos": 489580402176.0, + "grad_norm": 0.09173012098392071, + "language_loss": 0.83764172, + "learning_rate": 0.0007965578526965671, + "loss": 0.84922498, + "num_input_tokens_seen": 137261040, + "router_z_loss_mlp": 0.44897461, + "step": 1658, + "time_per_iteration": 2.607729911804199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154777, + "balance_loss_mlp": 1.11307764, + "epoch": 0.3191612158522509, + "flos": 576234487296.0, + "grad_norm": 0.08650327787833377, + "language_loss": 0.86397582, + "learning_rate": 0.0007963069670630377, + "loss": 0.87552357, + "num_input_tokens_seen": 137334400, + "router_z_loss_mlp": 0.41723633, + "step": 1659, + "time_per_iteration": 2.7385904788970947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154696, + "balance_loss_mlp": 1.11175728, + "epoch": 0.3193535975375144, + "flos": 538132004352.0, + "grad_norm": 0.06815630012467462, + "language_loss": 0.88107586, + "learning_rate": 0.0007960559663929416, + "loss": 0.89262283, + "num_input_tokens_seen": 137405344, + "router_z_loss_mlp": 0.4296875, + "step": 1660, + "time_per_iteration": 2.696936845779419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155709, + "balance_loss_mlp": 1.11372399, + "epoch": 0.319545979222778, + "flos": 734288011776.0, + "grad_norm": 0.07443207173064395, + "language_loss": 0.8773188, + "learning_rate": 0.0007958048507837259, + "loss": 0.88887584, + "num_input_tokens_seen": 137486016, + "router_z_loss_mlp": 0.41992188, + "step": 1661, + "time_per_iteration": 3.0276992321014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165796, + "balance_loss_mlp": 1.12168884, + "epoch": 0.31973836090804153, + "flos": 764461794816.0, + "grad_norm": 0.07361086812440759, + "language_loss": 0.87900233, + "learning_rate": 0.0007955536203328822, + "loss": 0.89066029, + "num_input_tokens_seen": 137562304, + "router_z_loss_mlp": 0.44116211, + "step": 1662, + "time_per_iteration": 2.9181947708129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167079, + "balance_loss_mlp": 1.12497449, + "epoch": 0.3199307425933051, + "flos": 560549611008.0, + "grad_norm": 0.0536049497981301, + "language_loss": 0.8375597, + "learning_rate": 0.0007953022751379469, + "loss": 0.84923047, + "num_input_tokens_seen": 137639248, + "router_z_loss_mlp": 0.42089844, + "step": 1663, + "time_per_iteration": 2.8502774238586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160364, + "balance_loss_mlp": 1.11749601, + "epoch": 0.3201231242785687, + "flos": 751349094912.0, + "grad_norm": 0.09076105210561375, + "language_loss": 0.82297581, + "learning_rate": 0.000795050815296501, + "loss": 0.83457941, + "num_input_tokens_seen": 137718256, + "router_z_loss_mlp": 0.42871094, + "step": 1664, + "time_per_iteration": 2.990253210067749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149395, + "balance_loss_mlp": 1.10821986, + "epoch": 0.32031550596383224, + "flos": 496402338816.0, + "grad_norm": 0.05392034602485258, + "language_loss": 0.93401325, + "learning_rate": 0.0007947992409061695, + "loss": 0.94550717, + "num_input_tokens_seen": 137785216, + "router_z_loss_mlp": 0.41162109, + "step": 1665, + "time_per_iteration": 2.5734803676605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146101, + "balance_loss_mlp": 1.10456824, + "epoch": 0.3205078876490958, + "flos": 731609750016.0, + "grad_norm": 0.07147454481835314, + "language_loss": 0.86398005, + "learning_rate": 0.0007945475520646226, + "loss": 0.87544107, + "num_input_tokens_seen": 137863424, + "router_z_loss_mlp": 0.4152832, + "step": 1666, + "time_per_iteration": 2.9147067070007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144126, + "balance_loss_mlp": 1.10156846, + "epoch": 0.32070026933435936, + "flos": 549436068864.0, + "grad_norm": 0.08541845552139904, + "language_loss": 0.85159481, + "learning_rate": 0.0007942957488695743, + "loss": 0.8630361, + "num_input_tokens_seen": 137930384, + "router_z_loss_mlp": 0.42578125, + "step": 1667, + "time_per_iteration": 2.6842408180236816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138005, + "balance_loss_mlp": 1.09725952, + "epoch": 0.32089265101962294, + "flos": 745295468544.0, + "grad_norm": 0.06001483498827303, + "language_loss": 0.81309706, + "learning_rate": 0.0007940438314187833, + "loss": 0.82447714, + "num_input_tokens_seen": 138017200, + "router_z_loss_mlp": 0.4074707, + "step": 1668, + "time_per_iteration": 3.0340676307678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128156, + "balance_loss_mlp": 1.08769631, + "epoch": 0.3210850327048865, + "flos": 494188439040.0, + "grad_norm": 0.06998559069767052, + "language_loss": 0.81191337, + "learning_rate": 0.0007937917998100529, + "loss": 0.82319492, + "num_input_tokens_seen": 138084048, + "router_z_loss_mlp": 0.40454102, + "step": 1669, + "time_per_iteration": 2.635629177093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138644, + "balance_loss_mlp": 1.09313023, + "epoch": 0.32127741439015006, + "flos": 530640502272.0, + "grad_norm": 0.08304565240235381, + "language_loss": 0.79254091, + "learning_rate": 0.0007935396541412302, + "loss": 0.80392736, + "num_input_tokens_seen": 138153280, + "router_z_loss_mlp": 0.45532227, + "step": 1670, + "time_per_iteration": 2.6226065158843994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141669, + "balance_loss_mlp": 1.09896851, + "epoch": 0.3214697960754136, + "flos": 501203096064.0, + "grad_norm": 0.07816166477955887, + "language_loss": 0.85914934, + "learning_rate": 0.0007932873945102068, + "loss": 0.87056601, + "num_input_tokens_seen": 138222320, + "router_z_loss_mlp": 0.42724609, + "step": 1671, + "time_per_iteration": 2.559443473815918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047646, + "balance_loss_mlp": 1.03238678, + "epoch": 0.3216621777606772, + "flos": 1383341815296.0, + "grad_norm": 0.025388272809080015, + "language_loss": 0.75761777, + "learning_rate": 0.0007930350210149188, + "loss": 0.76809424, + "num_input_tokens_seen": 138449488, + "router_z_loss_mlp": 0.15234375, + "step": 1672, + "time_per_iteration": 4.8329596519470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176113, + "balance_loss_mlp": 1.13319826, + "epoch": 0.32185455944594077, + "flos": 571535046144.0, + "grad_norm": 0.10680060394368475, + "language_loss": 0.86589128, + "learning_rate": 0.0007927825337533461, + "loss": 0.87765247, + "num_input_tokens_seen": 138522496, + "router_z_loss_mlp": 0.42895508, + "step": 1673, + "time_per_iteration": 2.670067071914673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117015, + "balance_loss_mlp": 1.12651968, + "epoch": 0.3220469411312043, + "flos": 543908846592.0, + "grad_norm": 0.0659920492524482, + "language_loss": 0.84953517, + "learning_rate": 0.0007925299328235131, + "loss": 0.86123669, + "num_input_tokens_seen": 138590096, + "router_z_loss_mlp": 0.43652344, + "step": 1674, + "time_per_iteration": 2.6559884548187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169615, + "balance_loss_mlp": 1.12543643, + "epoch": 0.3222393228164679, + "flos": 491139417600.0, + "grad_norm": 0.10142438885407562, + "language_loss": 0.85307467, + "learning_rate": 0.000792277218323488, + "loss": 0.86477083, + "num_input_tokens_seen": 138658224, + "router_z_loss_mlp": 0.44189453, + "step": 1675, + "time_per_iteration": 2.5843372344970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158765, + "balance_loss_mlp": 1.11673164, + "epoch": 0.3224317045017314, + "flos": 490388359680.0, + "grad_norm": 0.06840501438298492, + "language_loss": 0.85418063, + "learning_rate": 0.0007920243903513833, + "loss": 0.86576831, + "num_input_tokens_seen": 138722864, + "router_z_loss_mlp": 0.4206543, + "step": 1676, + "time_per_iteration": 2.562697649002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138179, + "balance_loss_mlp": 1.09280825, + "epoch": 0.322624086186995, + "flos": 575777465856.0, + "grad_norm": 0.06731593225582447, + "language_loss": 0.84609574, + "learning_rate": 0.0007917714490053556, + "loss": 0.85747755, + "num_input_tokens_seen": 138791472, + "router_z_loss_mlp": 0.45361328, + "step": 1677, + "time_per_iteration": 2.685854434967041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131879, + "balance_loss_mlp": 1.09029913, + "epoch": 0.32281646787225854, + "flos": 629292810240.0, + "grad_norm": 0.06310440112326268, + "language_loss": 0.86562228, + "learning_rate": 0.0007915183943836055, + "loss": 0.87694108, + "num_input_tokens_seen": 138873424, + "router_z_loss_mlp": 0.41601562, + "step": 1678, + "time_per_iteration": 2.8568227291107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128366, + "balance_loss_mlp": 1.08466363, + "epoch": 0.3230088495575221, + "flos": 781389255168.0, + "grad_norm": 0.07690366782162197, + "language_loss": 0.84428912, + "learning_rate": 0.0007912652265843773, + "loss": 0.85557282, + "num_input_tokens_seen": 138956880, + "router_z_loss_mlp": 0.43725586, + "step": 1679, + "time_per_iteration": 3.079998254776001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110817, + "balance_loss_mlp": 1.06930852, + "epoch": 0.3232012312427857, + "flos": 536110824960.0, + "grad_norm": 0.07712564159484636, + "language_loss": 0.8213551, + "learning_rate": 0.0007910119457059597, + "loss": 0.83246326, + "num_input_tokens_seen": 139031296, + "router_z_loss_mlp": 0.4152832, + "step": 1680, + "time_per_iteration": 2.6812973022460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112077, + "balance_loss_mlp": 1.06975782, + "epoch": 0.32339361292804925, + "flos": 704857946112.0, + "grad_norm": 0.10745693955939492, + "language_loss": 0.81109858, + "learning_rate": 0.0007907585518466849, + "loss": 0.82221937, + "num_input_tokens_seen": 139109776, + "router_z_loss_mlp": 0.42333984, + "step": 1681, + "time_per_iteration": 2.9406683444976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115262, + "balance_loss_mlp": 1.07265627, + "epoch": 0.32358599461331283, + "flos": 452330293248.0, + "grad_norm": 0.07157404686533678, + "language_loss": 0.89948541, + "learning_rate": 0.000790505045104929, + "loss": 0.91063797, + "num_input_tokens_seen": 139174736, + "router_z_loss_mlp": 0.42602539, + "step": 1682, + "time_per_iteration": 2.5241646766662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119953, + "balance_loss_mlp": 1.07606041, + "epoch": 0.32377837629857636, + "flos": 600895729152.0, + "grad_norm": 0.06937214564576595, + "language_loss": 0.87034553, + "learning_rate": 0.0007902514255791125, + "loss": 0.88154507, + "num_input_tokens_seen": 139252064, + "router_z_loss_mlp": 0.43896484, + "step": 1683, + "time_per_iteration": 2.8741068840026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111917, + "balance_loss_mlp": 1.076231, + "epoch": 0.32397075798383995, + "flos": 807523250688.0, + "grad_norm": 0.06778435640114842, + "language_loss": 0.87994444, + "learning_rate": 0.0007899976933676986, + "loss": 0.89113617, + "num_input_tokens_seen": 139333328, + "router_z_loss_mlp": 0.42919922, + "step": 1684, + "time_per_iteration": 2.959290027618408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117467, + "balance_loss_mlp": 1.07469463, + "epoch": 0.3241631396691035, + "flos": 601689005568.0, + "grad_norm": 0.06453517439379398, + "language_loss": 0.87573123, + "learning_rate": 0.0007897438485691955, + "loss": 0.88690597, + "num_input_tokens_seen": 139400976, + "router_z_loss_mlp": 0.42773438, + "step": 1685, + "time_per_iteration": 2.6591978073120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131588, + "balance_loss_mlp": 1.08655035, + "epoch": 0.32435552135436707, + "flos": 474219297792.0, + "grad_norm": 0.13512041919643347, + "language_loss": 0.82386112, + "learning_rate": 0.0007894898912821542, + "loss": 0.835177, + "num_input_tokens_seen": 139465664, + "router_z_loss_mlp": 0.45043945, + "step": 1686, + "time_per_iteration": 2.5375750064849854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134689, + "balance_loss_mlp": 1.09201205, + "epoch": 0.3245479030396306, + "flos": 538102268928.0, + "grad_norm": 0.07414292899066016, + "language_loss": 0.8748548, + "learning_rate": 0.0007892358216051695, + "loss": 0.88620168, + "num_input_tokens_seen": 139541984, + "router_z_loss_mlp": 0.42675781, + "step": 1687, + "time_per_iteration": 2.73968243598938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132707, + "balance_loss_mlp": 1.09098339, + "epoch": 0.3247402847248942, + "flos": 547654597632.0, + "grad_norm": 0.06337992950379638, + "language_loss": 0.92269105, + "learning_rate": 0.0007889816396368803, + "loss": 0.93401814, + "num_input_tokens_seen": 139607408, + "router_z_loss_mlp": 0.41699219, + "step": 1688, + "time_per_iteration": 2.6067299842834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131771, + "balance_loss_mlp": 1.08961868, + "epoch": 0.3249326664101578, + "flos": 378151276032.0, + "grad_norm": 0.07885708031147778, + "language_loss": 0.85782814, + "learning_rate": 0.0007887273454759687, + "loss": 0.86914587, + "num_input_tokens_seen": 139670000, + "router_z_loss_mlp": 0.421875, + "step": 1689, + "time_per_iteration": 2.484260320663452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122278, + "balance_loss_mlp": 1.08031607, + "epoch": 0.3251250480954213, + "flos": 528078237696.0, + "grad_norm": 0.06527022407794938, + "language_loss": 0.82859224, + "learning_rate": 0.0007884729392211603, + "loss": 0.83981502, + "num_input_tokens_seen": 139739872, + "router_z_loss_mlp": 0.41943359, + "step": 1690, + "time_per_iteration": 2.642786741256714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129634, + "balance_loss_mlp": 1.08812594, + "epoch": 0.3253174297806849, + "flos": 449659372032.0, + "grad_norm": 0.09568065131307975, + "language_loss": 0.86132944, + "learning_rate": 0.0007882184209712245, + "loss": 0.87262577, + "num_input_tokens_seen": 139802032, + "router_z_loss_mlp": 0.41503906, + "step": 1691, + "time_per_iteration": 2.5199530124664307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123067, + "balance_loss_mlp": 1.08234525, + "epoch": 0.32550981146594843, + "flos": 704181040128.0, + "grad_norm": 0.06282055281729462, + "language_loss": 0.86132228, + "learning_rate": 0.000787963790824974, + "loss": 0.87255299, + "num_input_tokens_seen": 139885648, + "router_z_loss_mlp": 0.40722656, + "step": 1692, + "time_per_iteration": 2.9768075942993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124522, + "balance_loss_mlp": 1.08427668, + "epoch": 0.325702193151212, + "flos": 392704422912.0, + "grad_norm": 0.07612118071262816, + "language_loss": 0.89543802, + "learning_rate": 0.0007877090488812651, + "loss": 0.90668321, + "num_input_tokens_seen": 139947920, + "router_z_loss_mlp": 0.40258789, + "step": 1693, + "time_per_iteration": 2.4604485034942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124012, + "balance_loss_mlp": 1.08207428, + "epoch": 0.32589457483647555, + "flos": 577494696960.0, + "grad_norm": 0.1035661329718289, + "language_loss": 0.83982152, + "learning_rate": 0.0007874541952389973, + "loss": 0.85106164, + "num_input_tokens_seen": 140020048, + "router_z_loss_mlp": 0.41943359, + "step": 1694, + "time_per_iteration": 2.6709587574005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113814, + "balance_loss_mlp": 1.09753752, + "epoch": 0.32608695652173914, + "flos": 498339454464.0, + "grad_norm": 0.08446561178027004, + "language_loss": 0.86949492, + "learning_rate": 0.0007871992299971136, + "loss": 0.8808763, + "num_input_tokens_seen": 140085600, + "router_z_loss_mlp": 0.40625, + "step": 1695, + "time_per_iteration": 2.5585403442382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150743, + "balance_loss_mlp": 1.11023593, + "epoch": 0.32627933820700267, + "flos": 591145910784.0, + "grad_norm": 0.05830689117178756, + "language_loss": 0.84793502, + "learning_rate": 0.0007869441532546001, + "loss": 0.85944247, + "num_input_tokens_seen": 140155152, + "router_z_loss_mlp": 0.4050293, + "step": 1696, + "time_per_iteration": 2.7510788440704346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148317, + "balance_loss_mlp": 1.1100266, + "epoch": 0.32647171989226625, + "flos": 609086532096.0, + "grad_norm": 0.06976949490853021, + "language_loss": 0.79791546, + "learning_rate": 0.0007866889651104867, + "loss": 0.80939865, + "num_input_tokens_seen": 140228560, + "router_z_loss_mlp": 0.38256836, + "step": 1697, + "time_per_iteration": 2.7944459915161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152152, + "balance_loss_mlp": 1.11114383, + "epoch": 0.32666410157752984, + "flos": 477154520064.0, + "grad_norm": 0.06767982610774756, + "language_loss": 0.83777177, + "learning_rate": 0.000786433665663846, + "loss": 0.84929335, + "num_input_tokens_seen": 140297952, + "router_z_loss_mlp": 0.40991211, + "step": 1698, + "time_per_iteration": 2.6864194869995117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167936, + "balance_loss_mlp": 1.12514019, + "epoch": 0.3268564832627934, + "flos": 718385822208.0, + "grad_norm": 0.0725657973515617, + "language_loss": 0.87005848, + "learning_rate": 0.0007861782550137942, + "loss": 0.88173789, + "num_input_tokens_seen": 140373408, + "router_z_loss_mlp": 0.42797852, + "step": 1699, + "time_per_iteration": 2.896897792816162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160393, + "balance_loss_mlp": 1.11986172, + "epoch": 0.32704886494805696, + "flos": 768795618816.0, + "grad_norm": 0.0774952835645251, + "language_loss": 0.86092401, + "learning_rate": 0.0007859227332594901, + "loss": 0.87252796, + "num_input_tokens_seen": 140451840, + "router_z_loss_mlp": 0.40527344, + "step": 1700, + "time_per_iteration": 2.8986380100250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165908, + "balance_loss_mlp": 1.12449527, + "epoch": 0.3272412466333205, + "flos": 849912569856.0, + "grad_norm": 0.09509515836767467, + "language_loss": 0.85007191, + "learning_rate": 0.0007856671005001365, + "loss": 0.86173105, + "num_input_tokens_seen": 140537696, + "router_z_loss_mlp": 0.41381836, + "step": 1701, + "time_per_iteration": 3.148084878921509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168963, + "balance_loss_mlp": 1.12726378, + "epoch": 0.3274336283185841, + "flos": 831586507776.0, + "grad_norm": 0.07560076292899535, + "language_loss": 0.82363045, + "learning_rate": 0.0007854113568349787, + "loss": 0.83532006, + "num_input_tokens_seen": 140623536, + "router_z_loss_mlp": 0.41699219, + "step": 1702, + "time_per_iteration": 3.1411454677581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191314, + "balance_loss_mlp": 1.14882779, + "epoch": 0.3276260100038476, + "flos": 692027172864.0, + "grad_norm": 0.08142047178498793, + "language_loss": 0.81090933, + "learning_rate": 0.0007851555023633052, + "loss": 0.82282251, + "num_input_tokens_seen": 140700688, + "router_z_loss_mlp": 0.42504883, + "step": 1703, + "time_per_iteration": 2.9109766483306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197058, + "balance_loss_mlp": 1.1559788, + "epoch": 0.3278183916891112, + "flos": 436059915264.0, + "grad_norm": 0.07993965020483434, + "language_loss": 0.82561779, + "learning_rate": 0.0007848995371844474, + "loss": 0.83758843, + "num_input_tokens_seen": 140765808, + "router_z_loss_mlp": 0.41088867, + "step": 1704, + "time_per_iteration": 2.531611680984497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197334, + "balance_loss_mlp": 1.15267849, + "epoch": 0.3280107733743748, + "flos": 461109169152.0, + "grad_norm": 0.11293951672356671, + "language_loss": 0.81012988, + "learning_rate": 0.0007846434613977801, + "loss": 0.82210326, + "num_input_tokens_seen": 140830512, + "router_z_loss_mlp": 0.44677734, + "step": 1705, + "time_per_iteration": 2.5413970947265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175519, + "balance_loss_mlp": 1.1340816, + "epoch": 0.3282031550596383, + "flos": 679319737344.0, + "grad_norm": 0.10106481858624654, + "language_loss": 0.78958142, + "learning_rate": 0.0007843872751027203, + "loss": 0.80133665, + "num_input_tokens_seen": 140902816, + "router_z_loss_mlp": 0.41455078, + "step": 1706, + "time_per_iteration": 2.817387580871582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158115, + "balance_loss_mlp": 1.1166296, + "epoch": 0.3283955367449019, + "flos": 545107014144.0, + "grad_norm": 0.06764312208644677, + "language_loss": 0.87366319, + "learning_rate": 0.0007841309783987287, + "loss": 0.88524431, + "num_input_tokens_seen": 140975488, + "router_z_loss_mlp": 0.41503906, + "step": 1707, + "time_per_iteration": 2.7335729598999023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155907, + "balance_loss_mlp": 1.11117959, + "epoch": 0.32858791843016544, + "flos": 481261118976.0, + "grad_norm": 0.06220723681544313, + "language_loss": 0.89445031, + "learning_rate": 0.0007838745713853084, + "loss": 0.90600932, + "num_input_tokens_seen": 141043248, + "router_z_loss_mlp": 0.44702148, + "step": 1708, + "time_per_iteration": 2.6179606914520264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114633, + "balance_loss_mlp": 1.10207939, + "epoch": 0.328780300115429, + "flos": 566805869568.0, + "grad_norm": 0.09473479000062662, + "language_loss": 0.84092307, + "learning_rate": 0.0007836180541620053, + "loss": 0.85238636, + "num_input_tokens_seen": 141119408, + "router_z_loss_mlp": 0.44213867, + "step": 1709, + "time_per_iteration": 2.703660249710083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160153, + "balance_loss_mlp": 1.11723721, + "epoch": 0.32897268180069256, + "flos": 476027933184.0, + "grad_norm": 0.06816782803484764, + "language_loss": 0.86778289, + "learning_rate": 0.0007833614268284082, + "loss": 0.8793844, + "num_input_tokens_seen": 141184112, + "router_z_loss_mlp": 0.42944336, + "step": 1710, + "time_per_iteration": 2.548859119415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077221, + "balance_loss_mlp": 1.06558585, + "epoch": 0.32916506348595614, + "flos": 1577301548544.0, + "grad_norm": 0.029019472878356288, + "language_loss": 0.74109769, + "learning_rate": 0.0007831046894841489, + "loss": 0.75186992, + "num_input_tokens_seen": 141414960, + "router_z_loss_mlp": 0.11621094, + "step": 1711, + "time_per_iteration": 4.9234619140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117236, + "balance_loss_mlp": 1.12934983, + "epoch": 0.3293574451712197, + "flos": 482886945792.0, + "grad_norm": 0.10714861433418864, + "language_loss": 0.78928375, + "learning_rate": 0.0007828478422289016, + "loss": 0.80100739, + "num_input_tokens_seen": 141485744, + "router_z_loss_mlp": 0.43017578, + "step": 1712, + "time_per_iteration": 2.584307909011841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167703, + "balance_loss_mlp": 1.12228465, + "epoch": 0.32954982685648326, + "flos": 622557508608.0, + "grad_norm": 0.08165577234876795, + "language_loss": 0.89409995, + "learning_rate": 0.0007825908851623833, + "loss": 0.90577698, + "num_input_tokens_seen": 141560592, + "router_z_loss_mlp": 0.45410156, + "step": 1713, + "time_per_iteration": 2.7400283813476562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158648, + "balance_loss_mlp": 1.11475515, + "epoch": 0.32974220854174685, + "flos": 544971193344.0, + "grad_norm": 0.08464988169520862, + "language_loss": 0.85764992, + "learning_rate": 0.0007823338183843533, + "loss": 0.86923635, + "num_input_tokens_seen": 141630400, + "router_z_loss_mlp": 0.43896484, + "step": 1714, + "time_per_iteration": 2.671375036239624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157012, + "balance_loss_mlp": 1.11419201, + "epoch": 0.3299345902270104, + "flos": 982155870720.0, + "grad_norm": 0.0730773907324959, + "language_loss": 0.81870985, + "learning_rate": 0.0007820766419946141, + "loss": 0.83028001, + "num_input_tokens_seen": 141721552, + "router_z_loss_mlp": 0.4284668, + "step": 1715, + "time_per_iteration": 3.3361854553222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027512, + "balance_loss_mlp": 1.01473284, + "epoch": 0.33012697191227397, + "flos": 1403664090624.0, + "grad_norm": 0.017749933707714268, + "language_loss": 0.7967248, + "learning_rate": 0.0007818193560930102, + "loss": 0.80699992, + "num_input_tokens_seen": 141956464, + "router_z_loss_mlp": 0.12792969, + "step": 1716, + "time_per_iteration": 4.933880567550659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193401, + "balance_loss_mlp": 1.14895988, + "epoch": 0.3303193535975375, + "flos": 505151479296.0, + "grad_norm": 0.1003306893312863, + "language_loss": 0.76434684, + "learning_rate": 0.0007815619607794288, + "loss": 0.77628088, + "num_input_tokens_seen": 142029552, + "router_z_loss_mlp": 0.4440918, + "step": 1717, + "time_per_iteration": 2.6259148120880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191125, + "balance_loss_mlp": 1.14823365, + "epoch": 0.3305117352828011, + "flos": 937977739776.0, + "grad_norm": 0.07927399877074098, + "language_loss": 0.83156073, + "learning_rate": 0.0007813044561538001, + "loss": 0.84347194, + "num_input_tokens_seen": 142117344, + "router_z_loss_mlp": 0.42895508, + "step": 1718, + "time_per_iteration": 3.1473774909973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190014, + "balance_loss_mlp": 1.145239, + "epoch": 0.3307041169680646, + "flos": 721499083776.0, + "grad_norm": 0.06905487251407855, + "language_loss": 0.88941157, + "learning_rate": 0.0007810468423160958, + "loss": 0.9013117, + "num_input_tokens_seen": 142190096, + "router_z_loss_mlp": 0.44799805, + "step": 1719, + "time_per_iteration": 2.895155906677246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181044, + "balance_loss_mlp": 1.13943982, + "epoch": 0.3308964986533282, + "flos": 583614761472.0, + "grad_norm": 0.06204943336400955, + "language_loss": 0.82643551, + "learning_rate": 0.0007807891193663306, + "loss": 0.83824587, + "num_input_tokens_seen": 142265584, + "router_z_loss_mlp": 0.41625977, + "step": 1720, + "time_per_iteration": 2.7824859619140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165341, + "balance_loss_mlp": 1.12357068, + "epoch": 0.33108888033859174, + "flos": 473576896512.0, + "grad_norm": 0.07732363095630222, + "language_loss": 0.82492876, + "learning_rate": 0.0007805312874045614, + "loss": 0.83658212, + "num_input_tokens_seen": 142330352, + "router_z_loss_mlp": 0.41796875, + "step": 1721, + "time_per_iteration": 2.5710601806640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170989, + "balance_loss_mlp": 1.12807381, + "epoch": 0.3312812620238553, + "flos": 386129534976.0, + "grad_norm": 0.07358039625922873, + "language_loss": 0.86639178, + "learning_rate": 0.0007802733465308874, + "loss": 0.87810171, + "num_input_tokens_seen": 142392208, + "router_z_loss_mlp": 0.42895508, + "step": 1722, + "time_per_iteration": 2.4402778148651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171295, + "balance_loss_mlp": 1.12632966, + "epoch": 0.3314736437091189, + "flos": 494554056192.0, + "grad_norm": 0.06616160911514579, + "language_loss": 0.8424235, + "learning_rate": 0.0007800152968454501, + "loss": 0.85413647, + "num_input_tokens_seen": 142462112, + "router_z_loss_mlp": 0.44970703, + "step": 1723, + "time_per_iteration": 2.689309597015381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115688, + "balance_loss_mlp": 1.11634886, + "epoch": 0.33166602539438245, + "flos": 653662586880.0, + "grad_norm": 0.06191321033146657, + "language_loss": 0.90671206, + "learning_rate": 0.0007797571384484334, + "loss": 0.91828084, + "num_input_tokens_seen": 142539120, + "router_z_loss_mlp": 0.40527344, + "step": 1724, + "time_per_iteration": 2.8473238945007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147699, + "balance_loss_mlp": 1.10421109, + "epoch": 0.33185840707964603, + "flos": 520806620160.0, + "grad_norm": 0.06062690844208358, + "language_loss": 0.92524576, + "learning_rate": 0.0007794988714400633, + "loss": 0.93672276, + "num_input_tokens_seen": 142611520, + "router_z_loss_mlp": 0.43530273, + "step": 1725, + "time_per_iteration": 2.62685227394104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146389, + "balance_loss_mlp": 1.10118532, + "epoch": 0.33205078876490957, + "flos": 436949365248.0, + "grad_norm": 0.09351886782013036, + "language_loss": 0.85586655, + "learning_rate": 0.0007792404959206079, + "loss": 0.86733043, + "num_input_tokens_seen": 142676064, + "router_z_loss_mlp": 0.45214844, + "step": 1726, + "time_per_iteration": 2.487520694732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150134, + "balance_loss_mlp": 1.10707533, + "epoch": 0.33224317045017315, + "flos": 768738719232.0, + "grad_norm": 0.09481341164405561, + "language_loss": 0.81825417, + "learning_rate": 0.0007789820119903774, + "loss": 0.82975549, + "num_input_tokens_seen": 142750944, + "router_z_loss_mlp": 0.4309082, + "step": 1727, + "time_per_iteration": 2.9732954502105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118165, + "balance_loss_mlp": 1.16734493, + "epoch": 0.3324355521354367, + "flos": 1466381574144.0, + "grad_norm": 0.0769954731958624, + "language_loss": 0.78492665, + "learning_rate": 0.0007787234197497242, + "loss": 0.79674315, + "num_input_tokens_seen": 142974032, + "router_z_loss_mlp": 0.14257812, + "step": 1728, + "time_per_iteration": 4.8314409255981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149054, + "balance_loss_mlp": 1.10599601, + "epoch": 0.3326279338207003, + "flos": 496691232768.0, + "grad_norm": 0.06765949793064117, + "language_loss": 0.84123361, + "learning_rate": 0.0007784647192990428, + "loss": 0.85272419, + "num_input_tokens_seen": 143047280, + "router_z_loss_mlp": 0.43041992, + "step": 1729, + "time_per_iteration": 2.715163230895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147649, + "balance_loss_mlp": 1.10799968, + "epoch": 0.33282031550596386, + "flos": 635890093056.0, + "grad_norm": 0.06156065876328187, + "language_loss": 0.80939102, + "learning_rate": 0.0007782059107387696, + "loss": 0.82086754, + "num_input_tokens_seen": 143124224, + "router_z_loss_mlp": 0.39672852, + "step": 1730, + "time_per_iteration": 2.865858554840088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165768, + "balance_loss_mlp": 1.12247074, + "epoch": 0.3330126971912274, + "flos": 689511896064.0, + "grad_norm": 0.07708666526094303, + "language_loss": 0.88668191, + "learning_rate": 0.0007779469941693826, + "loss": 0.89833963, + "num_input_tokens_seen": 143194048, + "router_z_loss_mlp": 0.43261719, + "step": 1731, + "time_per_iteration": 2.8640921115875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166075, + "balance_loss_mlp": 1.12351775, + "epoch": 0.333205078876491, + "flos": 566457504768.0, + "grad_norm": 0.08600344935746515, + "language_loss": 0.76943499, + "learning_rate": 0.0007776879696914029, + "loss": 0.78109574, + "num_input_tokens_seen": 143272976, + "router_z_loss_mlp": 0.42553711, + "step": 1732, + "time_per_iteration": 2.8162899017333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159987, + "balance_loss_mlp": 1.11745262, + "epoch": 0.3333974605617545, + "flos": 640927987200.0, + "grad_norm": 0.07534435583192022, + "language_loss": 0.89131331, + "learning_rate": 0.000777428837405392, + "loss": 0.90291321, + "num_input_tokens_seen": 143346496, + "router_z_loss_mlp": 0.42553711, + "step": 1733, + "time_per_iteration": 2.869436740875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151668, + "balance_loss_mlp": 1.11042213, + "epoch": 0.3335898422470181, + "flos": 461833062912.0, + "grad_norm": 0.0649827105829465, + "language_loss": 0.87220478, + "learning_rate": 0.0007771695974119544, + "loss": 0.88372147, + "num_input_tokens_seen": 143410448, + "router_z_loss_mlp": 0.41259766, + "step": 1734, + "time_per_iteration": 2.5153088569641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138148, + "balance_loss_mlp": 1.0959959, + "epoch": 0.33378222393228163, + "flos": 852870187008.0, + "grad_norm": 0.07614790264044081, + "language_loss": 0.76295686, + "learning_rate": 0.0007769102498117359, + "loss": 0.77433836, + "num_input_tokens_seen": 143492416, + "router_z_loss_mlp": 0.42163086, + "step": 1735, + "time_per_iteration": 3.1105504035949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136381, + "balance_loss_mlp": 1.09430027, + "epoch": 0.3339746056175452, + "flos": 954665491968.0, + "grad_norm": 0.06230250245944302, + "language_loss": 0.80020654, + "learning_rate": 0.000776650794705424, + "loss": 0.81157035, + "num_input_tokens_seen": 143590096, + "router_z_loss_mlp": 0.42089844, + "step": 1736, + "time_per_iteration": 3.269490957260132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141082, + "balance_loss_mlp": 1.09890568, + "epoch": 0.33416698730280875, + "flos": 544825460736.0, + "grad_norm": 0.053956568858798265, + "language_loss": 0.82610357, + "learning_rate": 0.0007763912321937483, + "loss": 0.8375144, + "num_input_tokens_seen": 143663344, + "router_z_loss_mlp": 0.421875, + "step": 1737, + "time_per_iteration": 2.6871769428253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126175, + "balance_loss_mlp": 1.0870508, + "epoch": 0.33435936898807234, + "flos": 1014096070656.0, + "grad_norm": 0.06336651482337263, + "language_loss": 0.82955027, + "learning_rate": 0.0007761315623774799, + "loss": 0.84081209, + "num_input_tokens_seen": 143753072, + "router_z_loss_mlp": 0.39111328, + "step": 1738, + "time_per_iteration": 3.4055540561676025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113225, + "balance_loss_mlp": 1.09088469, + "epoch": 0.3345517506733359, + "flos": 615221650944.0, + "grad_norm": 0.08278309899958468, + "language_loss": 0.88244802, + "learning_rate": 0.0007758717853574313, + "loss": 0.89377058, + "num_input_tokens_seen": 143827280, + "router_z_loss_mlp": 0.41381836, + "step": 1739, + "time_per_iteration": 2.7666313648223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120554, + "balance_loss_mlp": 1.08114362, + "epoch": 0.33474413235859946, + "flos": 494593703424.0, + "grad_norm": 0.0696820530517557, + "language_loss": 0.90798807, + "learning_rate": 0.0007756119012344571, + "loss": 0.91919363, + "num_input_tokens_seen": 143895072, + "router_z_loss_mlp": 0.39404297, + "step": 1740, + "time_per_iteration": 2.5491223335266113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115915, + "balance_loss_mlp": 1.07428706, + "epoch": 0.33493651404386304, + "flos": 628381338624.0, + "grad_norm": 0.06589349032225494, + "language_loss": 0.85103011, + "learning_rate": 0.0007753519101094535, + "loss": 0.86218929, + "num_input_tokens_seen": 143965728, + "router_z_loss_mlp": 0.41625977, + "step": 1741, + "time_per_iteration": 2.765583038330078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112401, + "balance_loss_mlp": 1.0837177, + "epoch": 0.3351288957291266, + "flos": 513727723008.0, + "grad_norm": 0.0662644502369307, + "language_loss": 0.86452365, + "learning_rate": 0.0007750918120833575, + "loss": 0.87576377, + "num_input_tokens_seen": 144030272, + "router_z_loss_mlp": 0.40283203, + "step": 1742, + "time_per_iteration": 2.6085479259490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140409, + "balance_loss_mlp": 1.10240483, + "epoch": 0.33532127741439016, + "flos": 647302814208.0, + "grad_norm": 0.07280628286033199, + "language_loss": 0.87783647, + "learning_rate": 0.0007748316072571485, + "loss": 0.88924056, + "num_input_tokens_seen": 144104048, + "router_z_loss_mlp": 0.37963867, + "step": 1743, + "time_per_iteration": 2.793119192123413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133272, + "balance_loss_mlp": 1.09259784, + "epoch": 0.3355136590996537, + "flos": 768464506368.0, + "grad_norm": 0.0850070564381928, + "language_loss": 0.79522568, + "learning_rate": 0.0007745712957318467, + "loss": 0.80655837, + "num_input_tokens_seen": 144180432, + "router_z_loss_mlp": 0.40698242, + "step": 1744, + "time_per_iteration": 2.943847417831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137205, + "balance_loss_mlp": 1.09700739, + "epoch": 0.3357060407849173, + "flos": 595536634368.0, + "grad_norm": 0.06831295126385283, + "language_loss": 0.86807823, + "learning_rate": 0.0007743108776085141, + "loss": 0.87945032, + "num_input_tokens_seen": 144258704, + "router_z_loss_mlp": 0.40136719, + "step": 1745, + "time_per_iteration": 2.771634101867676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011368, + "balance_loss_mlp": 1.09743714, + "epoch": 0.3358984224701808, + "flos": 598590425088.0, + "grad_norm": 0.05902486087385494, + "language_loss": 0.83364028, + "learning_rate": 0.0007740503529882543, + "loss": 0.84500825, + "num_input_tokens_seen": 144335104, + "router_z_loss_mlp": 0.39331055, + "step": 1746, + "time_per_iteration": 2.7896366119384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139374, + "balance_loss_mlp": 1.09831822, + "epoch": 0.3360908041554444, + "flos": 578329818624.0, + "grad_norm": 0.061665767711377016, + "language_loss": 0.90955931, + "learning_rate": 0.0007737897219722114, + "loss": 0.92095304, + "num_input_tokens_seen": 144402912, + "router_z_loss_mlp": 0.41088867, + "step": 1747, + "time_per_iteration": 2.7088165283203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129332, + "balance_loss_mlp": 1.08725071, + "epoch": 0.336283185840708, + "flos": 513589330944.0, + "grad_norm": 0.08528813851267185, + "language_loss": 0.81553382, + "learning_rate": 0.0007735289846615716, + "loss": 0.82682711, + "num_input_tokens_seen": 144475328, + "router_z_loss_mlp": 0.42089844, + "step": 1748, + "time_per_iteration": 2.635098934173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129982, + "balance_loss_mlp": 1.09119081, + "epoch": 0.3364755675259715, + "flos": 524974887936.0, + "grad_norm": 0.09169024401551043, + "language_loss": 0.82026851, + "learning_rate": 0.0007732681411575621, + "loss": 0.83156836, + "num_input_tokens_seen": 144548288, + "router_z_loss_mlp": 0.38818359, + "step": 1749, + "time_per_iteration": 2.6693224906921387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134785, + "balance_loss_mlp": 1.09437299, + "epoch": 0.3366679492112351, + "flos": 554869315584.0, + "grad_norm": 0.0698579909367107, + "language_loss": 0.88035583, + "learning_rate": 0.0007730071915614514, + "loss": 0.89170372, + "num_input_tokens_seen": 144619488, + "router_z_loss_mlp": 0.40405273, + "step": 1750, + "time_per_iteration": 2.6900789737701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137991, + "balance_loss_mlp": 1.09800839, + "epoch": 0.33686033089649864, + "flos": 427273698816.0, + "grad_norm": 0.09938227861633823, + "language_loss": 0.89158392, + "learning_rate": 0.0007727461359745489, + "loss": 0.90296388, + "num_input_tokens_seen": 144682560, + "router_z_loss_mlp": 0.3996582, + "step": 1751, + "time_per_iteration": 2.5086123943328857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154901, + "balance_loss_mlp": 1.1132257, + "epoch": 0.3370527125817622, + "flos": 541729451520.0, + "grad_norm": 0.06249007419708336, + "language_loss": 0.86569941, + "learning_rate": 0.0007724849744982056, + "loss": 0.87724847, + "num_input_tokens_seen": 144753328, + "router_z_loss_mlp": 0.41674805, + "step": 1752, + "time_per_iteration": 2.700474739074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169913, + "balance_loss_mlp": 1.12737882, + "epoch": 0.33724509426702576, + "flos": 542114892288.0, + "grad_norm": 0.06015013269361517, + "language_loss": 0.8195309, + "learning_rate": 0.0007722237072338131, + "loss": 0.83123004, + "num_input_tokens_seen": 144827312, + "router_z_loss_mlp": 0.42529297, + "step": 1753, + "time_per_iteration": 2.7111313343048096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119284, + "balance_loss_mlp": 1.14816022, + "epoch": 0.33743747595228935, + "flos": 472796103168.0, + "grad_norm": 0.11537307258838475, + "language_loss": 0.85648489, + "learning_rate": 0.0007719623342828046, + "loss": 0.86841327, + "num_input_tokens_seen": 144893488, + "router_z_loss_mlp": 0.44726562, + "step": 1754, + "time_per_iteration": 2.517010450363159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191581, + "balance_loss_mlp": 1.14685392, + "epoch": 0.33762985763755293, + "flos": 469818662400.0, + "grad_norm": 0.06847069318075473, + "language_loss": 0.84535718, + "learning_rate": 0.000771700855746654, + "loss": 0.85727292, + "num_input_tokens_seen": 144961152, + "router_z_loss_mlp": 0.44750977, + "step": 1755, + "time_per_iteration": 2.5961217880249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164795, + "balance_loss_mlp": 1.1231432, + "epoch": 0.33782223932281646, + "flos": 492251323392.0, + "grad_norm": 0.05626734330263072, + "language_loss": 0.8872534, + "learning_rate": 0.0007714392717268763, + "loss": 0.89890134, + "num_input_tokens_seen": 145030576, + "router_z_loss_mlp": 0.41674805, + "step": 1756, + "time_per_iteration": 2.5784223079681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166558, + "balance_loss_mlp": 1.12185431, + "epoch": 0.33801462100808005, + "flos": 465064892928.0, + "grad_norm": 0.07105398160496887, + "language_loss": 0.8649826, + "learning_rate": 0.0007711775823250273, + "loss": 0.87664813, + "num_input_tokens_seen": 145095648, + "router_z_loss_mlp": 0.44702148, + "step": 1757, + "time_per_iteration": 2.5373613834381104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115594, + "balance_loss_mlp": 1.11207056, + "epoch": 0.3382070026933436, + "flos": 795668189184.0, + "grad_norm": 0.06341765106008965, + "language_loss": 0.83797616, + "learning_rate": 0.0007709157876427039, + "loss": 0.84953558, + "num_input_tokens_seen": 145181248, + "router_z_loss_mlp": 0.43896484, + "step": 1758, + "time_per_iteration": 3.1393754482269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144144, + "balance_loss_mlp": 1.10027504, + "epoch": 0.33839938437860717, + "flos": 508430297088.0, + "grad_norm": 0.0573406658982909, + "language_loss": 0.85933769, + "learning_rate": 0.0007706538877815439, + "loss": 0.8707791, + "num_input_tokens_seen": 145252944, + "router_z_loss_mlp": 0.4387207, + "step": 1759, + "time_per_iteration": 2.6080896854400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152987, + "balance_loss_mlp": 1.11054862, + "epoch": 0.3385917660638707, + "flos": 484243329024.0, + "grad_norm": 0.06135171113161323, + "language_loss": 0.83615482, + "learning_rate": 0.0007703918828432259, + "loss": 0.84768468, + "num_input_tokens_seen": 145323168, + "router_z_loss_mlp": 0.42456055, + "step": 1760, + "time_per_iteration": 2.5886309146881104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148897, + "balance_loss_mlp": 1.10464644, + "epoch": 0.3387841477491343, + "flos": 545339381760.0, + "grad_norm": 0.05937499082636783, + "language_loss": 0.88942921, + "learning_rate": 0.000770129772929469, + "loss": 0.90091813, + "num_input_tokens_seen": 145395776, + "router_z_loss_mlp": 0.44238281, + "step": 1761, + "time_per_iteration": 2.645293951034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140708, + "balance_loss_mlp": 1.09629107, + "epoch": 0.3389765294343978, + "flos": 719801676288.0, + "grad_norm": 0.07244625367361128, + "language_loss": 0.88504505, + "learning_rate": 0.0007698675581420334, + "loss": 0.89645213, + "num_input_tokens_seen": 145470576, + "router_z_loss_mlp": 0.4440918, + "step": 1762, + "time_per_iteration": 2.849560022354126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149138, + "balance_loss_mlp": 1.10469711, + "epoch": 0.3391689111196614, + "flos": 699928708608.0, + "grad_norm": 0.06385607916775927, + "language_loss": 0.79163915, + "learning_rate": 0.0007696052385827199, + "loss": 0.80313051, + "num_input_tokens_seen": 145548896, + "router_z_loss_mlp": 0.44458008, + "step": 1763, + "time_per_iteration": 2.9164280891418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138684, + "balance_loss_mlp": 1.09765172, + "epoch": 0.339361292804925, + "flos": 627093964800.0, + "grad_norm": 0.07477333876977248, + "language_loss": 0.78203613, + "learning_rate": 0.00076934281435337, + "loss": 0.79342294, + "num_input_tokens_seen": 145617136, + "router_z_loss_mlp": 0.41040039, + "step": 1764, + "time_per_iteration": 2.7213284969329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131547, + "balance_loss_mlp": 1.08922768, + "epoch": 0.33955367449018853, + "flos": 609600453120.0, + "grad_norm": 0.0661700543843282, + "language_loss": 0.86476332, + "learning_rate": 0.0007690802855558658, + "loss": 0.87607884, + "num_input_tokens_seen": 145696416, + "router_z_loss_mlp": 0.4230957, + "step": 1765, + "time_per_iteration": 2.8648691177368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144213, + "balance_loss_mlp": 1.12981212, + "epoch": 0.3397460561754521, + "flos": 1453310346240.0, + "grad_norm": 0.0393682164062729, + "language_loss": 0.76374954, + "learning_rate": 0.0007688176522921302, + "loss": 0.77519166, + "num_input_tokens_seen": 145919680, + "router_z_loss_mlp": 0.14355469, + "step": 1766, + "time_per_iteration": 4.883134603500366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138855, + "balance_loss_mlp": 1.09441423, + "epoch": 0.33993843786071565, + "flos": 487312174080.0, + "grad_norm": 0.06478844738748038, + "language_loss": 0.89260793, + "learning_rate": 0.0007685549146641262, + "loss": 0.90399647, + "num_input_tokens_seen": 145984272, + "router_z_loss_mlp": 0.44458008, + "step": 1767, + "time_per_iteration": 2.5584475994110107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138308, + "balance_loss_mlp": 1.09780085, + "epoch": 0.34013081954597923, + "flos": 417338500608.0, + "grad_norm": 0.0552886410345199, + "language_loss": 0.8865279, + "learning_rate": 0.0007682920727738579, + "loss": 0.89791095, + "num_input_tokens_seen": 146047248, + "router_z_loss_mlp": 0.4050293, + "step": 1768, + "time_per_iteration": 2.462104558944702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113405, + "balance_loss_mlp": 1.09170651, + "epoch": 0.34032320123124277, + "flos": 437520185856.0, + "grad_norm": 0.07550967393636049, + "language_loss": 0.84987569, + "learning_rate": 0.000768029126723369, + "loss": 0.86121619, + "num_input_tokens_seen": 146111872, + "router_z_loss_mlp": 0.42333984, + "step": 1769, + "time_per_iteration": 2.5362985134124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134777, + "balance_loss_mlp": 1.09360242, + "epoch": 0.34051558291650635, + "flos": 457590643200.0, + "grad_norm": 0.0745429404709064, + "language_loss": 0.82167029, + "learning_rate": 0.0007677660766147447, + "loss": 0.83301806, + "num_input_tokens_seen": 146172608, + "router_z_loss_mlp": 0.41186523, + "step": 1770, + "time_per_iteration": 2.516824960708618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079488, + "balance_loss_mlp": 1.06356168, + "epoch": 0.3407079646017699, + "flos": 1558849204224.0, + "grad_norm": 0.02503514207226814, + "language_loss": 0.72470945, + "learning_rate": 0.0007675029225501102, + "loss": 0.73550433, + "num_input_tokens_seen": 146413584, + "router_z_loss_mlp": 0.15917969, + "step": 1771, + "time_per_iteration": 4.943475008010864 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137395, + "balance_loss_mlp": 1.09543359, + "epoch": 0.3409003462870335, + "flos": 492555271680.0, + "grad_norm": 0.06960763795190199, + "language_loss": 0.80136019, + "learning_rate": 0.0007672396646316306, + "loss": 0.81273413, + "num_input_tokens_seen": 146476992, + "router_z_loss_mlp": 0.41918945, + "step": 1772, + "time_per_iteration": 2.5425803661346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145424, + "balance_loss_mlp": 1.10341442, + "epoch": 0.34109272797229706, + "flos": 808479512064.0, + "grad_norm": 0.05748114386543088, + "language_loss": 0.80760133, + "learning_rate": 0.000766976302961512, + "loss": 0.81905556, + "num_input_tokens_seen": 146552848, + "router_z_loss_mlp": 0.42041016, + "step": 1773, + "time_per_iteration": 2.982287645339966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155937, + "balance_loss_mlp": 1.11330807, + "epoch": 0.3412851096575606, + "flos": 470142434304.0, + "grad_norm": 0.06912006035569716, + "language_loss": 0.81549138, + "learning_rate": 0.0007667128376420003, + "loss": 0.82705075, + "num_input_tokens_seen": 146617504, + "router_z_loss_mlp": 0.42626953, + "step": 1774, + "time_per_iteration": 2.5396063327789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151156, + "balance_loss_mlp": 1.10926604, + "epoch": 0.3414774913428242, + "flos": 595675026432.0, + "grad_norm": 0.07768471353958366, + "language_loss": 0.84963071, + "learning_rate": 0.0007664492687753817, + "loss": 0.86114228, + "num_input_tokens_seen": 146691568, + "router_z_loss_mlp": 0.41894531, + "step": 1775, + "time_per_iteration": 2.7326042652130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139013, + "balance_loss_mlp": 1.09845805, + "epoch": 0.3416698730280877, + "flos": 527463000576.0, + "grad_norm": 0.10552495092435867, + "language_loss": 0.81927752, + "learning_rate": 0.000766185596463983, + "loss": 0.83066773, + "num_input_tokens_seen": 146764208, + "router_z_loss_mlp": 0.40551758, + "step": 1776, + "time_per_iteration": 2.622465133666992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126657, + "balance_loss_mlp": 1.08455205, + "epoch": 0.3418622547133513, + "flos": 874640623104.0, + "grad_norm": 0.06005887645947995, + "language_loss": 0.77224028, + "learning_rate": 0.0007659218208101706, + "loss": 0.78350687, + "num_input_tokens_seen": 146847744, + "router_z_loss_mlp": 0.42114258, + "step": 1777, + "time_per_iteration": 3.099862575531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124902, + "balance_loss_mlp": 1.0852288, + "epoch": 0.34205463639861483, + "flos": 603744689664.0, + "grad_norm": 0.057585659974550854, + "language_loss": 0.85272229, + "learning_rate": 0.0007656579419163515, + "loss": 0.86397129, + "num_input_tokens_seen": 146918336, + "router_z_loss_mlp": 0.39672852, + "step": 1778, + "time_per_iteration": 2.7696709632873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129572, + "balance_loss_mlp": 1.08794475, + "epoch": 0.3422470180838784, + "flos": 463780090368.0, + "grad_norm": 0.07376046533358642, + "language_loss": 0.77272999, + "learning_rate": 0.0007653939598849724, + "loss": 0.78402567, + "num_input_tokens_seen": 146982496, + "router_z_loss_mlp": 0.41650391, + "step": 1779, + "time_per_iteration": 2.5601556301116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131616, + "balance_loss_mlp": 1.11511779, + "epoch": 0.34243939976914195, + "flos": 1586428416000.0, + "grad_norm": 0.05276839393693404, + "language_loss": 0.82880205, + "learning_rate": 0.0007651298748185204, + "loss": 0.84011823, + "num_input_tokens_seen": 147213600, + "router_z_loss_mlp": 0.16503906, + "step": 1780, + "time_per_iteration": 4.96061897277832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112473, + "balance_loss_mlp": 1.08267307, + "epoch": 0.34263178145440554, + "flos": 873017367552.0, + "grad_norm": 0.07129012841004771, + "language_loss": 0.80831903, + "learning_rate": 0.000764865686819522, + "loss": 0.81956631, + "num_input_tokens_seen": 147287664, + "router_z_loss_mlp": 0.4206543, + "step": 1781, + "time_per_iteration": 3.089735507965088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126433, + "balance_loss_mlp": 1.08492422, + "epoch": 0.3428241631396691, + "flos": 506878622208.0, + "grad_norm": 0.0622927262326037, + "language_loss": 0.86375809, + "learning_rate": 0.0007646013959905449, + "loss": 0.87502241, + "num_input_tokens_seen": 147356800, + "router_z_loss_mlp": 0.41503906, + "step": 1782, + "time_per_iteration": 2.6112704277038574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123127, + "balance_loss_mlp": 1.08130884, + "epoch": 0.34301654482493266, + "flos": 880039365120.0, + "grad_norm": 0.10167310682771787, + "language_loss": 0.81018484, + "learning_rate": 0.0007643370024341949, + "loss": 0.82141614, + "num_input_tokens_seen": 147432496, + "router_z_loss_mlp": 0.41821289, + "step": 1783, + "time_per_iteration": 3.1074132919311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115046, + "balance_loss_mlp": 1.07563567, + "epoch": 0.34320892651019624, + "flos": 431763167232.0, + "grad_norm": 0.057781870331099924, + "language_loss": 0.83518296, + "learning_rate": 0.0007640725062531195, + "loss": 0.84633338, + "num_input_tokens_seen": 147495856, + "router_z_loss_mlp": 0.39404297, + "step": 1784, + "time_per_iteration": 2.491313934326172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112121, + "balance_loss_mlp": 1.07228112, + "epoch": 0.3434013081954598, + "flos": 463641698304.0, + "grad_norm": 0.12476428026998775, + "language_loss": 0.86600161, + "learning_rate": 0.0007638079075500047, + "loss": 0.87712288, + "num_input_tokens_seen": 147559632, + "router_z_loss_mlp": 0.39819336, + "step": 1785, + "time_per_iteration": 2.5236706733703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070785, + "balance_loss_mlp": 1.05457258, + "epoch": 0.34359368988072336, + "flos": 1557332034048.0, + "grad_norm": 0.032988320908807454, + "language_loss": 0.75180668, + "learning_rate": 0.0007635432064275772, + "loss": 0.76251453, + "num_input_tokens_seen": 147794576, + "router_z_loss_mlp": 0.16210938, + "step": 1786, + "time_per_iteration": 4.938300609588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011343, + "balance_loss_mlp": 1.09274352, + "epoch": 0.3437860715659869, + "flos": 495527569920.0, + "grad_norm": 0.06899034270313556, + "language_loss": 0.83409935, + "learning_rate": 0.0007632784029886026, + "loss": 0.84544241, + "num_input_tokens_seen": 147866960, + "router_z_loss_mlp": 0.41552734, + "step": 1787, + "time_per_iteration": 2.6218347549438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140121, + "balance_loss_mlp": 1.09968519, + "epoch": 0.3439784532512505, + "flos": 718274594304.0, + "grad_norm": 0.05777013506444436, + "language_loss": 0.85674673, + "learning_rate": 0.0007630134973358873, + "loss": 0.86814797, + "num_input_tokens_seen": 147947808, + "router_z_loss_mlp": 0.40429688, + "step": 1788, + "time_per_iteration": 2.9675180912017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156957, + "balance_loss_mlp": 1.11780846, + "epoch": 0.34417083493651407, + "flos": 565862091264.0, + "grad_norm": 0.11323624876812292, + "language_loss": 0.86969185, + "learning_rate": 0.0007627484895722763, + "loss": 0.88126147, + "num_input_tokens_seen": 148015936, + "router_z_loss_mlp": 0.39160156, + "step": 1789, + "time_per_iteration": 2.6400198936462402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164783, + "balance_loss_mlp": 1.1222018, + "epoch": 0.3443632166217776, + "flos": 796330414080.0, + "grad_norm": 0.06957715435201431, + "language_loss": 0.80509681, + "learning_rate": 0.0007624833798006552, + "loss": 0.81674469, + "num_input_tokens_seen": 148099776, + "router_z_loss_mlp": 0.42602539, + "step": 1790, + "time_per_iteration": 3.042621374130249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162497, + "balance_loss_mlp": 1.11924767, + "epoch": 0.3445555983070412, + "flos": 569313805824.0, + "grad_norm": 0.09367673394256656, + "language_loss": 0.84194326, + "learning_rate": 0.0007622181681239483, + "loss": 0.85356832, + "num_input_tokens_seen": 148169616, + "router_z_loss_mlp": 0.43261719, + "step": 1791, + "time_per_iteration": 2.642648220062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140416, + "balance_loss_mlp": 1.09907472, + "epoch": 0.3447479799923047, + "flos": 568814565888.0, + "grad_norm": 0.07487034842421487, + "language_loss": 0.84962463, + "learning_rate": 0.0007619528546451202, + "loss": 0.86102873, + "num_input_tokens_seen": 148247824, + "router_z_loss_mlp": 0.41333008, + "step": 1792, + "time_per_iteration": 2.8014347553253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130282, + "balance_loss_mlp": 1.08941662, + "epoch": 0.3449403616775683, + "flos": 967723863552.0, + "grad_norm": 0.05771787988130437, + "language_loss": 0.84187096, + "learning_rate": 0.0007616874394671745, + "loss": 0.85317373, + "num_input_tokens_seen": 148333040, + "router_z_loss_mlp": 0.40869141, + "step": 1793, + "time_per_iteration": 3.336076498031616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137422, + "balance_loss_mlp": 1.09276664, + "epoch": 0.34513274336283184, + "flos": 568607164416.0, + "grad_norm": 0.08239177777048284, + "language_loss": 0.85433841, + "learning_rate": 0.0007614219226931547, + "loss": 0.86571258, + "num_input_tokens_seen": 148401840, + "router_z_loss_mlp": 0.44677734, + "step": 1794, + "time_per_iteration": 2.6596035957336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136792, + "balance_loss_mlp": 1.0951401, + "epoch": 0.3453251250480954, + "flos": 460943612928.0, + "grad_norm": 0.06809904369873732, + "language_loss": 0.85092592, + "learning_rate": 0.0007611563044261435, + "loss": 0.86229378, + "num_input_tokens_seen": 148466576, + "router_z_loss_mlp": 0.41674805, + "step": 1795, + "time_per_iteration": 2.545440435409546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140576, + "balance_loss_mlp": 1.09601521, + "epoch": 0.34551750673335896, + "flos": 415621269504.0, + "grad_norm": 0.08865061616635866, + "language_loss": 0.8722235, + "learning_rate": 0.0007608905847692631, + "loss": 0.88362932, + "num_input_tokens_seen": 148530016, + "router_z_loss_mlp": 0.44555664, + "step": 1796, + "time_per_iteration": 2.471306800842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112492, + "balance_loss_mlp": 1.08486605, + "epoch": 0.34570988841862255, + "flos": 587854609920.0, + "grad_norm": 0.07442154430907115, + "language_loss": 0.86828166, + "learning_rate": 0.0007606247638256749, + "loss": 0.87953079, + "num_input_tokens_seen": 148610064, + "router_z_loss_mlp": 0.40039062, + "step": 1797, + "time_per_iteration": 2.8728272914886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045486, + "balance_loss_mlp": 1.03099036, + "epoch": 0.34590227010388613, + "flos": 1567694518272.0, + "grad_norm": 0.022391201486326673, + "language_loss": 0.78170294, + "learning_rate": 0.0007603588416985798, + "loss": 0.79215777, + "num_input_tokens_seen": 148835872, + "router_z_loss_mlp": 0.14453125, + "step": 1798, + "time_per_iteration": 4.99533486366272 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036587, + "balance_loss_mlp": 1.0224725, + "epoch": 0.34609465178914967, + "flos": 1537743564288.0, + "grad_norm": 0.020693498138200886, + "language_loss": 0.79327202, + "learning_rate": 0.0007600928184912179, + "loss": 0.80363786, + "num_input_tokens_seen": 149066864, + "router_z_loss_mlp": 0.14160156, + "step": 1799, + "time_per_iteration": 4.871920347213745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131321, + "balance_loss_mlp": 1.086761, + "epoch": 0.34628703347441325, + "flos": 609363316224.0, + "grad_norm": 0.06425687332848114, + "language_loss": 0.8622126, + "learning_rate": 0.0007598266943068686, + "loss": 0.8735258, + "num_input_tokens_seen": 149141600, + "router_z_loss_mlp": 0.44555664, + "step": 1800, + "time_per_iteration": 2.7352967262268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128705, + "balance_loss_mlp": 1.0892942, + "epoch": 0.3464794151596768, + "flos": 473319936000.0, + "grad_norm": 0.06122285990583016, + "language_loss": 0.84089196, + "learning_rate": 0.0007595604692488507, + "loss": 0.85217899, + "num_input_tokens_seen": 149205888, + "router_z_loss_mlp": 0.39404297, + "step": 1801, + "time_per_iteration": 2.520047664642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145052, + "balance_loss_mlp": 1.10182643, + "epoch": 0.34667179684494037, + "flos": 605681805312.0, + "grad_norm": 0.08959882775364528, + "language_loss": 0.83156121, + "learning_rate": 0.0007592941434205215, + "loss": 0.84301168, + "num_input_tokens_seen": 149281280, + "router_z_loss_mlp": 0.43237305, + "step": 1802, + "time_per_iteration": 2.774533987045288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102388, + "balance_loss_mlp": 1.01191127, + "epoch": 0.3468641785302039, + "flos": 1564912369152.0, + "grad_norm": 0.0173366039721641, + "language_loss": 0.73571062, + "learning_rate": 0.0007590277169252782, + "loss": 0.74594939, + "num_input_tokens_seen": 149525008, + "router_z_loss_mlp": 0.11962891, + "step": 1803, + "time_per_iteration": 5.441190004348755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130945, + "balance_loss_mlp": 1.08481145, + "epoch": 0.3470565602154675, + "flos": 907265442816.0, + "grad_norm": 0.07392614166366455, + "language_loss": 0.80754089, + "learning_rate": 0.0007587611898665566, + "loss": 0.81885034, + "num_input_tokens_seen": 149600624, + "router_z_loss_mlp": 0.4609375, + "step": 1804, + "time_per_iteration": 3.0738565921783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126694, + "balance_loss_mlp": 1.08320653, + "epoch": 0.347248941900731, + "flos": 638902038528.0, + "grad_norm": 0.052717282161679486, + "language_loss": 0.82365519, + "learning_rate": 0.0007584945623478315, + "loss": 0.83492208, + "num_input_tokens_seen": 149674224, + "router_z_loss_mlp": 0.43530273, + "step": 1805, + "time_per_iteration": 2.810065269470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130527, + "balance_loss_mlp": 1.08773112, + "epoch": 0.3474413235859946, + "flos": 847362788352.0, + "grad_norm": 0.0654216117506123, + "language_loss": 0.81839657, + "learning_rate": 0.000758227834472617, + "loss": 0.8297019, + "num_input_tokens_seen": 149758688, + "router_z_loss_mlp": 0.42822266, + "step": 1806, + "time_per_iteration": 3.0400753021240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129234, + "balance_loss_mlp": 1.08631909, + "epoch": 0.3476337052712582, + "flos": 515654926848.0, + "grad_norm": 0.06780310502945991, + "language_loss": 0.77468187, + "learning_rate": 0.0007579610063444664, + "loss": 0.78597426, + "num_input_tokens_seen": 149831648, + "router_z_loss_mlp": 0.42895508, + "step": 1807, + "time_per_iteration": 2.720200538635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123608, + "balance_loss_mlp": 1.0805254, + "epoch": 0.34782608695652173, + "flos": 913551063552.0, + "grad_norm": 0.056464817781099026, + "language_loss": 0.87875664, + "learning_rate": 0.0007576940780669712, + "loss": 0.88999271, + "num_input_tokens_seen": 149919440, + "router_z_loss_mlp": 0.4309082, + "step": 1808, + "time_per_iteration": 3.1972455978393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119319, + "balance_loss_mlp": 1.07723832, + "epoch": 0.3480184686417853, + "flos": 773714944512.0, + "grad_norm": 0.06350201854913072, + "language_loss": 0.84194762, + "learning_rate": 0.0007574270497437624, + "loss": 0.85314083, + "num_input_tokens_seen": 150001632, + "router_z_loss_mlp": 0.42089844, + "step": 1809, + "time_per_iteration": 2.956308364868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112252, + "balance_loss_mlp": 1.08036816, + "epoch": 0.34821085032704885, + "flos": 576839812608.0, + "grad_norm": 0.05949268624371524, + "language_loss": 0.88030243, + "learning_rate": 0.000757159921478509, + "loss": 0.89152765, + "num_input_tokens_seen": 150077552, + "router_z_loss_mlp": 0.42138672, + "step": 1810, + "time_per_iteration": 2.7515318393707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058755, + "balance_loss_mlp": 1.04769194, + "epoch": 0.34840323201231244, + "flos": 1524947295744.0, + "grad_norm": 0.027450813841054106, + "language_loss": 0.74450636, + "learning_rate": 0.0007568926933749201, + "loss": 0.75509393, + "num_input_tokens_seen": 150295328, + "router_z_loss_mlp": 0.11083984, + "step": 1811, + "time_per_iteration": 4.719837427139282 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136641, + "balance_loss_mlp": 1.09272385, + "epoch": 0.34859561369757597, + "flos": 509164102656.0, + "grad_norm": 0.06099509375847796, + "language_loss": 0.87676752, + "learning_rate": 0.0007566253655367423, + "loss": 0.88813394, + "num_input_tokens_seen": 150360496, + "router_z_loss_mlp": 0.43896484, + "step": 1812, + "time_per_iteration": 2.6117310523986816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145498, + "balance_loss_mlp": 1.10196316, + "epoch": 0.34878799538283956, + "flos": 548662616064.0, + "grad_norm": 0.26075237363376164, + "language_loss": 0.90086293, + "learning_rate": 0.000756357938067762, + "loss": 0.91231787, + "num_input_tokens_seen": 150432064, + "router_z_loss_mlp": 0.43554688, + "step": 1813, + "time_per_iteration": 2.6537845134735107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137235, + "balance_loss_mlp": 1.09305573, + "epoch": 0.34898037706810314, + "flos": 983638536192.0, + "grad_norm": 0.07803772738029488, + "language_loss": 0.8299284, + "learning_rate": 0.0007560904110718033, + "loss": 0.84130079, + "num_input_tokens_seen": 150512176, + "router_z_loss_mlp": 0.44165039, + "step": 1814, + "time_per_iteration": 3.2229981422424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131299, + "balance_loss_mlp": 1.08549881, + "epoch": 0.3491727587533667, + "flos": 681605217792.0, + "grad_norm": 0.06602375994559181, + "language_loss": 0.83648008, + "learning_rate": 0.0007558227846527297, + "loss": 0.8477931, + "num_input_tokens_seen": 150586416, + "router_z_loss_mlp": 0.45751953, + "step": 1815, + "time_per_iteration": 2.8217966556549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137186, + "balance_loss_mlp": 1.09300709, + "epoch": 0.34936514043863026, + "flos": 394026301440.0, + "grad_norm": 0.06552880481969095, + "language_loss": 0.83563447, + "learning_rate": 0.0007555550589144429, + "loss": 0.84700632, + "num_input_tokens_seen": 150648944, + "router_z_loss_mlp": 0.44189453, + "step": 1816, + "time_per_iteration": 2.4231276512145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148289, + "balance_loss_mlp": 1.1026082, + "epoch": 0.3495575221238938, + "flos": 461363558400.0, + "grad_norm": 0.05960251663438414, + "language_loss": 0.84705317, + "learning_rate": 0.000755287233960883, + "loss": 0.85853606, + "num_input_tokens_seen": 150717200, + "router_z_loss_mlp": 0.45678711, + "step": 1817, + "time_per_iteration": 2.5598244667053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148754, + "balance_loss_mlp": 1.10297787, + "epoch": 0.3497499038091574, + "flos": 724172576256.0, + "grad_norm": 0.06564730471203778, + "language_loss": 0.78051704, + "learning_rate": 0.0007550193098960292, + "loss": 0.79200459, + "num_input_tokens_seen": 150790368, + "router_z_loss_mlp": 0.45751953, + "step": 1818, + "time_per_iteration": 2.8570642471313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115418, + "balance_loss_mlp": 1.11033523, + "epoch": 0.3499422854944209, + "flos": 827729528832.0, + "grad_norm": 0.05538445579726575, + "language_loss": 0.8654325, + "learning_rate": 0.0007547512868238988, + "loss": 0.87697428, + "num_input_tokens_seen": 150879872, + "router_z_loss_mlp": 0.43847656, + "step": 1819, + "time_per_iteration": 3.1437833309173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170578, + "balance_loss_mlp": 1.12499213, + "epoch": 0.3501346671796845, + "flos": 493479226368.0, + "grad_norm": 0.0822966351911203, + "language_loss": 0.83893883, + "learning_rate": 0.0007544831648485473, + "loss": 0.85064459, + "num_input_tokens_seen": 150953712, + "router_z_loss_mlp": 0.45605469, + "step": 1820, + "time_per_iteration": 2.660233736038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162235, + "balance_loss_mlp": 1.11684048, + "epoch": 0.35032704886494803, + "flos": 578752335360.0, + "grad_norm": 0.06443547558053964, + "language_loss": 0.81439716, + "learning_rate": 0.0007542149440740694, + "loss": 0.82601953, + "num_input_tokens_seen": 151026192, + "router_z_loss_mlp": 0.45385742, + "step": 1821, + "time_per_iteration": 2.6618528366088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154684, + "balance_loss_mlp": 1.10938418, + "epoch": 0.3505194305502116, + "flos": 584672338944.0, + "grad_norm": 0.06960442221541481, + "language_loss": 0.86201102, + "learning_rate": 0.000753946624604597, + "loss": 0.87355781, + "num_input_tokens_seen": 151100720, + "router_z_loss_mlp": 0.45288086, + "step": 1822, + "time_per_iteration": 2.7180583477020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138697, + "balance_loss_mlp": 1.09466076, + "epoch": 0.3507118122354752, + "flos": 526958991360.0, + "grad_norm": 0.11840223630221765, + "language_loss": 0.88456279, + "learning_rate": 0.0007536782065443015, + "loss": 0.89594972, + "num_input_tokens_seen": 151166032, + "router_z_loss_mlp": 0.44042969, + "step": 1823, + "time_per_iteration": 2.6035680770874023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147734, + "balance_loss_mlp": 1.1024822, + "epoch": 0.35090419392073874, + "flos": 511523735040.0, + "grad_norm": 0.08971754998357863, + "language_loss": 0.75357497, + "learning_rate": 0.0007534096899973919, + "loss": 0.76505232, + "num_input_tokens_seen": 151232208, + "router_z_loss_mlp": 0.45263672, + "step": 1824, + "time_per_iteration": 2.592313528060913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136405, + "balance_loss_mlp": 1.095397, + "epoch": 0.3510965756060023, + "flos": 564021149184.0, + "grad_norm": 0.056380284358423516, + "language_loss": 0.8296026, + "learning_rate": 0.0007531410750681154, + "loss": 0.84096658, + "num_input_tokens_seen": 151308128, + "router_z_loss_mlp": 0.41015625, + "step": 1825, + "time_per_iteration": 2.7599031925201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149352, + "balance_loss_mlp": 1.10710466, + "epoch": 0.35128895729126586, + "flos": 1020535137792.0, + "grad_norm": 0.06329210930184016, + "language_loss": 0.8686763, + "learning_rate": 0.0007528723618607575, + "loss": 0.88016987, + "num_input_tokens_seen": 151402560, + "router_z_loss_mlp": 0.42236328, + "step": 1826, + "time_per_iteration": 3.423145055770874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156709, + "balance_loss_mlp": 1.11808527, + "epoch": 0.35148133897652944, + "flos": 588262445568.0, + "grad_norm": 0.05752886424443174, + "language_loss": 0.8293525, + "learning_rate": 0.0007526035504796422, + "loss": 0.84091961, + "num_input_tokens_seen": 151478816, + "router_z_loss_mlp": 0.38598633, + "step": 1827, + "time_per_iteration": 2.774202346801758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164193, + "balance_loss_mlp": 1.12080038, + "epoch": 0.351673720661793, + "flos": 495300344832.0, + "grad_norm": 0.08334994788856638, + "language_loss": 0.87348354, + "learning_rate": 0.0007523346410291312, + "loss": 0.8851254, + "num_input_tokens_seen": 151554528, + "router_z_loss_mlp": 0.43408203, + "step": 1828, + "time_per_iteration": 2.7933921813964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172191, + "balance_loss_mlp": 1.13127816, + "epoch": 0.35186610234705656, + "flos": 762670411776.0, + "grad_norm": 0.05847449829546615, + "language_loss": 0.85163879, + "learning_rate": 0.0007520656336136245, + "loss": 0.86336064, + "num_input_tokens_seen": 151629440, + "router_z_loss_mlp": 0.40942383, + "step": 1829, + "time_per_iteration": 2.9654810428619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167386, + "balance_loss_mlp": 1.12675905, + "epoch": 0.3520584840323201, + "flos": 626135132160.0, + "grad_norm": 0.06508844853371867, + "language_loss": 0.88540596, + "learning_rate": 0.0007517965283375599, + "loss": 0.89707983, + "num_input_tokens_seen": 151708544, + "router_z_loss_mlp": 0.40625, + "step": 1830, + "time_per_iteration": 2.833653211593628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161789, + "balance_loss_mlp": 1.12078059, + "epoch": 0.3522508657175837, + "flos": 537388286976.0, + "grad_norm": 0.05306701185260888, + "language_loss": 0.89636958, + "learning_rate": 0.0007515273253054132, + "loss": 0.90798748, + "num_input_tokens_seen": 151779152, + "router_z_loss_mlp": 0.41015625, + "step": 1831, + "time_per_iteration": 2.648688554763794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162371, + "balance_loss_mlp": 1.11788237, + "epoch": 0.35244324740284727, + "flos": 567384030720.0, + "grad_norm": 0.060637132075448665, + "language_loss": 0.8317945, + "learning_rate": 0.0007512580246216988, + "loss": 0.84341824, + "num_input_tokens_seen": 151853216, + "router_z_loss_mlp": 0.44482422, + "step": 1832, + "time_per_iteration": 2.695558786392212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152178, + "balance_loss_mlp": 1.11288619, + "epoch": 0.3526356290881108, + "flos": 513058157568.0, + "grad_norm": 0.06652239867864222, + "language_loss": 0.8520152, + "learning_rate": 0.000750988626390968, + "loss": 0.86353695, + "num_input_tokens_seen": 151920416, + "router_z_loss_mlp": 0.39306641, + "step": 1833, + "time_per_iteration": 2.5903215408325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114923, + "balance_loss_mlp": 1.10810232, + "epoch": 0.3528280107733744, + "flos": 595791023616.0, + "grad_norm": 0.05520517467567221, + "language_loss": 0.85274744, + "learning_rate": 0.0007507191307178108, + "loss": 0.86423969, + "num_input_tokens_seen": 151990848, + "router_z_loss_mlp": 0.41137695, + "step": 1834, + "time_per_iteration": 2.7567453384399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132557, + "balance_loss_mlp": 1.0890696, + "epoch": 0.3530203924586379, + "flos": 551234792448.0, + "grad_norm": 0.06897138795442613, + "language_loss": 0.75032014, + "learning_rate": 0.0007504495377068543, + "loss": 0.76164567, + "num_input_tokens_seen": 152064864, + "router_z_loss_mlp": 0.43481445, + "step": 1835, + "time_per_iteration": 2.7309370040893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134622, + "balance_loss_mlp": 1.08972788, + "epoch": 0.3532127741439015, + "flos": 652990450176.0, + "grad_norm": 0.09099083327189633, + "language_loss": 0.81936944, + "learning_rate": 0.0007501798474627642, + "loss": 0.8307156, + "num_input_tokens_seen": 152150096, + "router_z_loss_mlp": 0.44873047, + "step": 1836, + "time_per_iteration": 2.9126806259155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113171, + "balance_loss_mlp": 1.08853245, + "epoch": 0.35340515582916504, + "flos": 722791226880.0, + "grad_norm": 0.058808043239055564, + "language_loss": 0.8375026, + "learning_rate": 0.0007499100600902433, + "loss": 0.84881973, + "num_input_tokens_seen": 152232528, + "router_z_loss_mlp": 0.43164062, + "step": 1837, + "time_per_iteration": 2.9810633659362793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124171, + "balance_loss_mlp": 1.08118403, + "epoch": 0.35359753751442863, + "flos": 594894233088.0, + "grad_norm": 0.08552727697149294, + "language_loss": 0.8450433, + "learning_rate": 0.0007496401756940324, + "loss": 0.85628498, + "num_input_tokens_seen": 152299584, + "router_z_loss_mlp": 0.43017578, + "step": 1838, + "time_per_iteration": 2.670412540435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130914, + "balance_loss_mlp": 1.08897638, + "epoch": 0.3537899191996922, + "flos": 632668174848.0, + "grad_norm": 0.06964876492363449, + "language_loss": 0.82608843, + "learning_rate": 0.0007493701943789098, + "loss": 0.83739758, + "num_input_tokens_seen": 152370368, + "router_z_loss_mlp": 0.41967773, + "step": 1839, + "time_per_iteration": 2.772620677947998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113855, + "balance_loss_mlp": 1.09537208, + "epoch": 0.35398230088495575, + "flos": 506364701184.0, + "grad_norm": 0.07045943234490067, + "language_loss": 0.83116889, + "learning_rate": 0.000749100116249692, + "loss": 0.84255433, + "num_input_tokens_seen": 152436928, + "router_z_loss_mlp": 0.43188477, + "step": 1840, + "time_per_iteration": 2.6031582355499268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144616, + "balance_loss_mlp": 1.10110414, + "epoch": 0.35417468257021933, + "flos": 508034944512.0, + "grad_norm": 0.08424265710124153, + "language_loss": 0.86582088, + "learning_rate": 0.0007488299414112321, + "loss": 0.87726706, + "num_input_tokens_seen": 152505952, + "router_z_loss_mlp": 0.43505859, + "step": 1841, + "time_per_iteration": 2.5864784717559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151277, + "balance_loss_mlp": 1.10726476, + "epoch": 0.35436706425548287, + "flos": 656437395456.0, + "grad_norm": 0.058600000923872894, + "language_loss": 0.77847576, + "learning_rate": 0.0007485596699684215, + "loss": 0.78998852, + "num_input_tokens_seen": 152577408, + "router_z_loss_mlp": 0.43994141, + "step": 1842, + "time_per_iteration": 2.8149642944335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156484, + "balance_loss_mlp": 1.11266279, + "epoch": 0.35455944594074645, + "flos": 652634744832.0, + "grad_norm": 0.055073821734726955, + "language_loss": 0.85694617, + "learning_rate": 0.000748289302026189, + "loss": 0.86851102, + "num_input_tokens_seen": 152654480, + "router_z_loss_mlp": 0.43823242, + "step": 1843, + "time_per_iteration": 2.8475751876831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158372, + "balance_loss_mlp": 1.11688685, + "epoch": 0.35475182762601, + "flos": 848593262592.0, + "grad_norm": 0.057565803102883874, + "language_loss": 0.85718876, + "learning_rate": 0.0007480188376895004, + "loss": 0.86877251, + "num_input_tokens_seen": 152732304, + "router_z_loss_mlp": 0.41479492, + "step": 1844, + "time_per_iteration": 3.0344529151916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140478, + "balance_loss_mlp": 1.12693632, + "epoch": 0.3549442093112736, + "flos": 1521468043776.0, + "grad_norm": 0.05127204690943662, + "language_loss": 0.7381134, + "learning_rate": 0.0007477482770633596, + "loss": 0.74951822, + "num_input_tokens_seen": 152965952, + "router_z_loss_mlp": 0.13574219, + "step": 1845, + "time_per_iteration": 4.8589537143707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176931, + "balance_loss_mlp": 1.13518405, + "epoch": 0.3551365909965371, + "flos": 651411611136.0, + "grad_norm": 0.08988090291235612, + "language_loss": 0.78641856, + "learning_rate": 0.0007474776202528074, + "loss": 0.79818785, + "num_input_tokens_seen": 153053088, + "router_z_loss_mlp": 0.41772461, + "step": 1846, + "time_per_iteration": 2.9269866943359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184559, + "balance_loss_mlp": 1.14243031, + "epoch": 0.3553289726818007, + "flos": 897458724864.0, + "grad_norm": 0.08000045078310114, + "language_loss": 0.81513619, + "learning_rate": 0.000747206867362922, + "loss": 0.82698178, + "num_input_tokens_seen": 153129216, + "router_z_loss_mlp": 0.42114258, + "step": 1847, + "time_per_iteration": 3.067870616912842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169442, + "balance_loss_mlp": 1.12573957, + "epoch": 0.3555213543670643, + "flos": 688491394560.0, + "grad_norm": 0.0760432300690223, + "language_loss": 0.84328806, + "learning_rate": 0.0007469360184988194, + "loss": 0.85498255, + "num_input_tokens_seen": 153199360, + "router_z_loss_mlp": 0.43701172, + "step": 1848, + "time_per_iteration": 2.8130369186401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159569, + "balance_loss_mlp": 1.11837053, + "epoch": 0.3557137360523278, + "flos": 538564432896.0, + "grad_norm": 0.08168000095068725, + "language_loss": 0.86707914, + "learning_rate": 0.0007466650737656518, + "loss": 0.87867486, + "num_input_tokens_seen": 153269168, + "router_z_loss_mlp": 0.41162109, + "step": 1849, + "time_per_iteration": 2.592503309249878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115621, + "balance_loss_mlp": 1.11324644, + "epoch": 0.3559061177375914, + "flos": 402261520896.0, + "grad_norm": 0.06757272046168854, + "language_loss": 0.89898217, + "learning_rate": 0.0007463940332686098, + "loss": 0.91054422, + "num_input_tokens_seen": 153333120, + "router_z_loss_mlp": 0.42944336, + "step": 1850, + "time_per_iteration": 2.4776744842529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148398, + "balance_loss_mlp": 1.10607898, + "epoch": 0.35609849942285493, + "flos": 696568398336.0, + "grad_norm": 0.05922624538442341, + "language_loss": 0.84461212, + "learning_rate": 0.0007461228971129205, + "loss": 0.85609609, + "num_input_tokens_seen": 153407600, + "router_z_loss_mlp": 0.42358398, + "step": 1851, + "time_per_iteration": 2.9012656211853027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154694, + "balance_loss_mlp": 1.11387658, + "epoch": 0.3562908811081185, + "flos": 568928365056.0, + "grad_norm": 0.058626739978073765, + "language_loss": 0.85743707, + "learning_rate": 0.0007458516654038483, + "loss": 0.86898398, + "num_input_tokens_seen": 153477408, + "router_z_loss_mlp": 0.40820312, + "step": 1852, + "time_per_iteration": 2.666947603225708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165665, + "balance_loss_mlp": 1.12160563, + "epoch": 0.35648326279338205, + "flos": 682386011136.0, + "grad_norm": 0.06798765543406252, + "language_loss": 0.86475062, + "learning_rate": 0.0007455803382466946, + "loss": 0.87640727, + "num_input_tokens_seen": 153551888, + "router_z_loss_mlp": 0.44042969, + "step": 1853, + "time_per_iteration": 2.804776191711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162987, + "balance_loss_mlp": 1.11985719, + "epoch": 0.35667564447864564, + "flos": 629139737088.0, + "grad_norm": 0.07311152518110202, + "language_loss": 0.87308323, + "learning_rate": 0.0007453089157467979, + "loss": 0.88471317, + "num_input_tokens_seen": 153626912, + "router_z_loss_mlp": 0.43139648, + "step": 1854, + "time_per_iteration": 2.8038864135742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159292, + "balance_loss_mlp": 1.1161381, + "epoch": 0.35686802616390917, + "flos": 814048579584.0, + "grad_norm": 0.06621845487790666, + "language_loss": 0.82129812, + "learning_rate": 0.0007450373980095341, + "loss": 0.83289105, + "num_input_tokens_seen": 153711312, + "router_z_loss_mlp": 0.43164062, + "step": 1855, + "time_per_iteration": 3.0980496406555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154286, + "balance_loss_mlp": 1.11268187, + "epoch": 0.35706040784917276, + "flos": 526178198016.0, + "grad_norm": 0.05908088829108725, + "language_loss": 0.87076378, + "learning_rate": 0.0007447657851403155, + "loss": 0.88230669, + "num_input_tokens_seen": 153780208, + "router_z_loss_mlp": 0.41601562, + "step": 1856, + "time_per_iteration": 2.6393351554870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148054, + "balance_loss_mlp": 1.10609269, + "epoch": 0.35725278953443634, + "flos": 511970844672.0, + "grad_norm": 0.07116077808597938, + "language_loss": 0.79415643, + "learning_rate": 0.0007444940772445915, + "loss": 0.805637, + "num_input_tokens_seen": 153853152, + "router_z_loss_mlp": 0.41943359, + "step": 1857, + "time_per_iteration": 2.7049038410186768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148544, + "balance_loss_mlp": 1.10770321, + "epoch": 0.3574451712196999, + "flos": 487428171264.0, + "grad_norm": 0.06303496934817837, + "language_loss": 0.80443203, + "learning_rate": 0.0007442222744278484, + "loss": 0.81591749, + "num_input_tokens_seen": 153924160, + "router_z_loss_mlp": 0.40844727, + "step": 1858, + "time_per_iteration": 2.6416029930114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140624, + "balance_loss_mlp": 1.10056937, + "epoch": 0.35763755290496346, + "flos": 550671312384.0, + "grad_norm": 0.06290523981550739, + "language_loss": 0.84690839, + "learning_rate": 0.0007439503767956099, + "loss": 0.85831463, + "num_input_tokens_seen": 153998688, + "router_z_loss_mlp": 0.40063477, + "step": 1859, + "time_per_iteration": 2.697295665740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095769, + "balance_loss_mlp": 1.08213139, + "epoch": 0.357829934590227, + "flos": 1504083561984.0, + "grad_norm": 0.02707100394521806, + "language_loss": 0.79671603, + "learning_rate": 0.0007436783844534352, + "loss": 0.80767375, + "num_input_tokens_seen": 154230960, + "router_z_loss_mlp": 0.13671875, + "step": 1860, + "time_per_iteration": 4.896381139755249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157881, + "balance_loss_mlp": 1.11744571, + "epoch": 0.3580223162754906, + "flos": 568695997440.0, + "grad_norm": 0.054355964588402354, + "language_loss": 0.86204398, + "learning_rate": 0.000743406297506922, + "loss": 0.87362283, + "num_input_tokens_seen": 154309104, + "router_z_loss_mlp": 0.40478516, + "step": 1861, + "time_per_iteration": 2.7121450901031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154988, + "balance_loss_mlp": 1.11362243, + "epoch": 0.3582146979607541, + "flos": 626473585152.0, + "grad_norm": 0.056412092641732435, + "language_loss": 0.8442747, + "learning_rate": 0.0007431341160617031, + "loss": 0.85582459, + "num_input_tokens_seen": 154387424, + "router_z_loss_mlp": 0.41381836, + "step": 1862, + "time_per_iteration": 2.902806520462036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172251, + "balance_loss_mlp": 1.13052833, + "epoch": 0.3584070796460177, + "flos": 507271403520.0, + "grad_norm": 0.06986467819319542, + "language_loss": 0.88734752, + "learning_rate": 0.0007428618402234491, + "loss": 0.89907002, + "num_input_tokens_seen": 154459952, + "router_z_loss_mlp": 0.41723633, + "step": 1863, + "time_per_iteration": 2.644352436065674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159657, + "balance_loss_mlp": 1.11831546, + "epoch": 0.3585994613312813, + "flos": 606479851008.0, + "grad_norm": 0.06293448628505635, + "language_loss": 0.8061077, + "learning_rate": 0.0007425894700978668, + "loss": 0.81770432, + "num_input_tokens_seen": 154535456, + "router_z_loss_mlp": 0.41357422, + "step": 1864, + "time_per_iteration": 2.782757043838501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148168, + "balance_loss_mlp": 1.10699308, + "epoch": 0.3587918430165448, + "flos": 1412886799872.0, + "grad_norm": 0.056888458094662434, + "language_loss": 0.79858804, + "learning_rate": 0.0007423170057906996, + "loss": 0.81006974, + "num_input_tokens_seen": 154627568, + "router_z_loss_mlp": 0.41162109, + "step": 1865, + "time_per_iteration": 3.848773956298828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133926, + "balance_loss_mlp": 1.09391952, + "epoch": 0.3589842247018084, + "flos": 478553121792.0, + "grad_norm": 0.06447904861600703, + "language_loss": 0.86500657, + "learning_rate": 0.0007420444474077275, + "loss": 0.87634581, + "num_input_tokens_seen": 154694640, + "router_z_loss_mlp": 0.40014648, + "step": 1866, + "time_per_iteration": 2.542572498321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126566, + "balance_loss_mlp": 1.0855341, + "epoch": 0.35917660638707194, + "flos": 504711710208.0, + "grad_norm": 0.07300351460408123, + "language_loss": 0.8986578, + "learning_rate": 0.0007417717950547671, + "loss": 0.90992349, + "num_input_tokens_seen": 154762048, + "router_z_loss_mlp": 0.41040039, + "step": 1867, + "time_per_iteration": 2.5633254051208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073925, + "balance_loss_mlp": 1.06143153, + "epoch": 0.3593689880723355, + "flos": 1492129382400.0, + "grad_norm": 0.026482390846264015, + "language_loss": 0.75996608, + "learning_rate": 0.0007414990488376713, + "loss": 0.77070534, + "num_input_tokens_seen": 154989952, + "router_z_loss_mlp": 0.125, + "step": 1868, + "time_per_iteration": 4.904905557632446 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111694, + "balance_loss_mlp": 1.07345176, + "epoch": 0.35956136975759906, + "flos": 528629234688.0, + "grad_norm": 0.053992922509511466, + "language_loss": 0.850173, + "learning_rate": 0.0007412262088623299, + "loss": 0.86128998, + "num_input_tokens_seen": 155066992, + "router_z_loss_mlp": 0.38232422, + "step": 1869, + "time_per_iteration": 2.7310874462127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110773, + "balance_loss_mlp": 1.07200575, + "epoch": 0.35975375144286265, + "flos": 534917426688.0, + "grad_norm": 0.08370102618564679, + "language_loss": 0.79675972, + "learning_rate": 0.0007409532752346684, + "loss": 0.80786741, + "num_input_tokens_seen": 155137616, + "router_z_loss_mlp": 0.38769531, + "step": 1870, + "time_per_iteration": 2.6629347801208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110981, + "balance_loss_mlp": 1.07166612, + "epoch": 0.3599461331281262, + "flos": 504941506560.0, + "grad_norm": 0.06403903481871269, + "language_loss": 0.88829064, + "learning_rate": 0.0007406802480606491, + "loss": 0.89940047, + "num_input_tokens_seen": 155209248, + "router_z_loss_mlp": 0.39306641, + "step": 1871, + "time_per_iteration": 2.6200008392333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107121, + "balance_loss_mlp": 1.06835461, + "epoch": 0.36013851481338977, + "flos": 511533646848.0, + "grad_norm": 0.0729370697679506, + "language_loss": 0.90798759, + "learning_rate": 0.0007404071274462707, + "loss": 0.9190588, + "num_input_tokens_seen": 155274176, + "router_z_loss_mlp": 0.38769531, + "step": 1872, + "time_per_iteration": 2.5693628787994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111805, + "balance_loss_mlp": 1.07978415, + "epoch": 0.36033089649865335, + "flos": 547590357504.0, + "grad_norm": 0.06627703814726228, + "language_loss": 0.84024733, + "learning_rate": 0.0007401339134975682, + "loss": 0.85142779, + "num_input_tokens_seen": 155343232, + "router_z_loss_mlp": 0.38208008, + "step": 1873, + "time_per_iteration": 2.7031140327453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127585, + "balance_loss_mlp": 1.08760262, + "epoch": 0.3605232781839169, + "flos": 458655561216.0, + "grad_norm": 0.06845959531373838, + "language_loss": 0.84298885, + "learning_rate": 0.0007398606063206122, + "loss": 0.85426462, + "num_input_tokens_seen": 155410080, + "router_z_loss_mlp": 0.39990234, + "step": 1874, + "time_per_iteration": 2.6090316772460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115185, + "balance_loss_mlp": 1.07598901, + "epoch": 0.36071565986918047, + "flos": 509559455232.0, + "grad_norm": 0.06521397848462201, + "language_loss": 0.78764814, + "learning_rate": 0.0007395872060215101, + "loss": 0.79879999, + "num_input_tokens_seen": 155476240, + "router_z_loss_mlp": 0.3918457, + "step": 1875, + "time_per_iteration": 2.620976448059082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122162, + "balance_loss_mlp": 1.0831089, + "epoch": 0.360908041554444, + "flos": 559195799040.0, + "grad_norm": 0.06345733178575377, + "language_loss": 0.88705117, + "learning_rate": 0.0007393137127064056, + "loss": 0.89827275, + "num_input_tokens_seen": 155543392, + "router_z_loss_mlp": 0.39013672, + "step": 1876, + "time_per_iteration": 2.7320597171783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125006, + "balance_loss_mlp": 1.08511841, + "epoch": 0.3611004232397076, + "flos": 523845729792.0, + "grad_norm": 0.056097062255587686, + "language_loss": 0.84576774, + "learning_rate": 0.0007390401264814779, + "loss": 0.85701776, + "num_input_tokens_seen": 155613264, + "router_z_loss_mlp": 0.39868164, + "step": 1877, + "time_per_iteration": 2.605865478515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123607, + "balance_loss_mlp": 1.08503079, + "epoch": 0.3612928049249711, + "flos": 540988305408.0, + "grad_norm": 0.06159732683880817, + "language_loss": 0.84937686, + "learning_rate": 0.0007387664474529427, + "loss": 0.86061299, + "num_input_tokens_seen": 155683712, + "router_z_loss_mlp": 0.38598633, + "step": 1878, + "time_per_iteration": 2.6548514366149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138034, + "balance_loss_mlp": 1.09750319, + "epoch": 0.3614851866102347, + "flos": 552556670976.0, + "grad_norm": 0.05796680079252983, + "language_loss": 0.91768891, + "learning_rate": 0.0007384926757270518, + "loss": 0.92906928, + "num_input_tokens_seen": 155751760, + "router_z_loss_mlp": 0.40527344, + "step": 1879, + "time_per_iteration": 2.6339149475097656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137039, + "balance_loss_mlp": 1.09791493, + "epoch": 0.36167756829549824, + "flos": 772071865344.0, + "grad_norm": 0.05405313293747941, + "language_loss": 0.79881001, + "learning_rate": 0.0007382188114100924, + "loss": 0.81018037, + "num_input_tokens_seen": 155830464, + "router_z_loss_mlp": 0.39111328, + "step": 1880, + "time_per_iteration": 2.983384132385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139197, + "balance_loss_mlp": 1.09964395, + "epoch": 0.36186994998076183, + "flos": 711885086208.0, + "grad_norm": 0.12141150358978081, + "language_loss": 0.82206392, + "learning_rate": 0.0007379448546083884, + "loss": 0.83345592, + "num_input_tokens_seen": 155906208, + "router_z_loss_mlp": 0.39575195, + "step": 1881, + "time_per_iteration": 2.9186532497406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140707, + "balance_loss_mlp": 1.10127282, + "epoch": 0.3620623316660254, + "flos": 747546444288.0, + "grad_norm": 0.06284373597557333, + "language_loss": 0.88377333, + "learning_rate": 0.0007376708054282992, + "loss": 0.8951804, + "num_input_tokens_seen": 155983584, + "router_z_loss_mlp": 0.39428711, + "step": 1882, + "time_per_iteration": 2.9895970821380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144635, + "balance_loss_mlp": 1.10605919, + "epoch": 0.36225471335128895, + "flos": 482555833344.0, + "grad_norm": 0.05224621202588268, + "language_loss": 0.84316945, + "learning_rate": 0.0007373966639762201, + "loss": 0.85461575, + "num_input_tokens_seen": 156052464, + "router_z_loss_mlp": 0.38574219, + "step": 1883, + "time_per_iteration": 2.623133659362793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147786, + "balance_loss_mlp": 1.10620606, + "epoch": 0.36244709503655254, + "flos": 506905786368.0, + "grad_norm": 0.06751899300287477, + "language_loss": 0.89170045, + "learning_rate": 0.0007371224303585822, + "loss": 0.90317833, + "num_input_tokens_seen": 156121424, + "router_z_loss_mlp": 0.41577148, + "step": 1884, + "time_per_iteration": 2.628394842147827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021984, + "balance_loss_mlp": 1.01154125, + "epoch": 0.36263947672181607, + "flos": 1394050466304.0, + "grad_norm": 0.007236456832270123, + "language_loss": 0.80357069, + "learning_rate": 0.0007368481046818524, + "loss": 0.8137905, + "num_input_tokens_seen": 156346144, + "router_z_loss_mlp": 0.10449219, + "step": 1885, + "time_per_iteration": 4.717620849609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114112, + "balance_loss_mlp": 1.10049307, + "epoch": 0.36283185840707965, + "flos": 653296969728.0, + "grad_norm": 0.057116908748179596, + "language_loss": 0.82560247, + "learning_rate": 0.0007365736870525335, + "loss": 0.83701366, + "num_input_tokens_seen": 156420880, + "router_z_loss_mlp": 0.40625, + "step": 1886, + "time_per_iteration": 2.8198611736297607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132227, + "balance_loss_mlp": 1.09310222, + "epoch": 0.3630242400923432, + "flos": 488863848960.0, + "grad_norm": 0.06530442713985495, + "language_loss": 0.83123338, + "learning_rate": 0.000736299177577164, + "loss": 0.84255564, + "num_input_tokens_seen": 156485616, + "router_z_loss_mlp": 0.39135742, + "step": 1887, + "time_per_iteration": 2.613863945007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128864, + "balance_loss_mlp": 1.08992994, + "epoch": 0.3632166217776068, + "flos": 517159613952.0, + "grad_norm": 0.0666501464088242, + "language_loss": 0.84363097, + "learning_rate": 0.0007360245763623174, + "loss": 0.85491955, + "num_input_tokens_seen": 156557840, + "router_z_loss_mlp": 0.3894043, + "step": 1888, + "time_per_iteration": 2.6378068923950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115221, + "balance_loss_mlp": 1.07702661, + "epoch": 0.36340900346287036, + "flos": 646173656064.0, + "grad_norm": 0.06993226621121658, + "language_loss": 0.90142351, + "learning_rate": 0.0007357498835146039, + "loss": 0.91257572, + "num_input_tokens_seen": 156632496, + "router_z_loss_mlp": 0.38183594, + "step": 1889, + "time_per_iteration": 2.8125081062316895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128167, + "balance_loss_mlp": 1.08878016, + "epoch": 0.3636013851481339, + "flos": 553327552512.0, + "grad_norm": 0.07359030033413445, + "language_loss": 0.87316656, + "learning_rate": 0.0007354750991406684, + "loss": 0.88444823, + "num_input_tokens_seen": 156705296, + "router_z_loss_mlp": 0.39379883, + "step": 1890, + "time_per_iteration": 2.714569568634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121285, + "balance_loss_mlp": 1.0807066, + "epoch": 0.3637937668333975, + "flos": 546653919744.0, + "grad_norm": 0.07836036923074335, + "language_loss": 0.80991101, + "learning_rate": 0.0007352002233471919, + "loss": 0.8211239, + "num_input_tokens_seen": 156773376, + "router_z_loss_mlp": 0.40576172, + "step": 1891, + "time_per_iteration": 2.6287412643432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121974, + "balance_loss_mlp": 1.08180022, + "epoch": 0.363986148518661, + "flos": 538112180736.0, + "grad_norm": 0.058839902089765785, + "language_loss": 0.79524523, + "learning_rate": 0.0007349252562408906, + "loss": 0.80646491, + "num_input_tokens_seen": 156844336, + "router_z_loss_mlp": 0.40161133, + "step": 1892, + "time_per_iteration": 2.669903039932251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125098, + "balance_loss_mlp": 1.08449531, + "epoch": 0.3641785302039246, + "flos": 660217651200.0, + "grad_norm": 0.057079030651025625, + "language_loss": 0.81590033, + "learning_rate": 0.0007346501979285158, + "loss": 0.8271513, + "num_input_tokens_seen": 156918848, + "router_z_loss_mlp": 0.40600586, + "step": 1893, + "time_per_iteration": 2.9146764278411865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083448, + "balance_loss_mlp": 1.07238543, + "epoch": 0.36437091188918813, + "flos": 1468743031296.0, + "grad_norm": 0.036364529291757694, + "language_loss": 0.80539101, + "learning_rate": 0.0007343750485168551, + "loss": 0.81622547, + "num_input_tokens_seen": 157134736, + "router_z_loss_mlp": 0.11083984, + "step": 1894, + "time_per_iteration": 4.784435272216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126267, + "balance_loss_mlp": 1.08444858, + "epoch": 0.3645632935744517, + "flos": 597298281984.0, + "grad_norm": 0.06549610472034906, + "language_loss": 0.86352968, + "learning_rate": 0.0007340998081127308, + "loss": 0.87479234, + "num_input_tokens_seen": 157211920, + "router_z_loss_mlp": 0.41796875, + "step": 1895, + "time_per_iteration": 2.7702367305755615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130662, + "balance_loss_mlp": 1.09113181, + "epoch": 0.36475567525971525, + "flos": 599509610496.0, + "grad_norm": 0.06520113052193731, + "language_loss": 0.91046786, + "learning_rate": 0.0007338244768230007, + "loss": 0.92177445, + "num_input_tokens_seen": 157284224, + "router_z_loss_mlp": 0.39550781, + "step": 1896, + "time_per_iteration": 2.7612760066986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133468, + "balance_loss_mlp": 1.09315181, + "epoch": 0.36494805694497884, + "flos": 798403350528.0, + "grad_norm": 0.058734972315737245, + "language_loss": 0.89108521, + "learning_rate": 0.0007335490547545578, + "loss": 0.90241992, + "num_input_tokens_seen": 157367920, + "router_z_loss_mlp": 0.40307617, + "step": 1897, + "time_per_iteration": 3.024462938308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135084, + "balance_loss_mlp": 1.09343266, + "epoch": 0.3651404386302424, + "flos": 637313287680.0, + "grad_norm": 0.06208128991116815, + "language_loss": 0.82833707, + "learning_rate": 0.0007332735420143308, + "loss": 0.83968788, + "num_input_tokens_seen": 157438672, + "router_z_loss_mlp": 0.41650391, + "step": 1898, + "time_per_iteration": 2.725468158721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112873, + "balance_loss_mlp": 1.08669686, + "epoch": 0.36533282031550596, + "flos": 491581757952.0, + "grad_norm": 0.09645190116324148, + "language_loss": 0.86573303, + "learning_rate": 0.0007329979387092826, + "loss": 0.8770203, + "num_input_tokens_seen": 157505888, + "router_z_loss_mlp": 0.42016602, + "step": 1899, + "time_per_iteration": 2.6357531547546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133626, + "balance_loss_mlp": 1.09259379, + "epoch": 0.36552520200076954, + "flos": 855970965504.0, + "grad_norm": 0.06150604002201611, + "language_loss": 0.84294677, + "learning_rate": 0.0007327222449464124, + "loss": 0.85428298, + "num_input_tokens_seen": 157601568, + "router_z_loss_mlp": 0.41040039, + "step": 1900, + "time_per_iteration": 3.2381174564361572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136855, + "balance_loss_mlp": 1.09382069, + "epoch": 0.3657175836860331, + "flos": 483702243840.0, + "grad_norm": 0.07567830151973255, + "language_loss": 0.89052904, + "learning_rate": 0.0007324464608327538, + "loss": 0.90189761, + "num_input_tokens_seen": 157670992, + "router_z_loss_mlp": 0.4309082, + "step": 1901, + "time_per_iteration": 2.597569227218628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150007, + "balance_loss_mlp": 1.10814035, + "epoch": 0.36590996537129666, + "flos": 434792365056.0, + "grad_norm": 0.07712085030005716, + "language_loss": 0.88794601, + "learning_rate": 0.0007321705864753758, + "loss": 0.89944601, + "num_input_tokens_seen": 157743616, + "router_z_loss_mlp": 0.41870117, + "step": 1902, + "time_per_iteration": 2.6877686977386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151954, + "balance_loss_mlp": 1.11097002, + "epoch": 0.3661023470565602, + "flos": 712206286848.0, + "grad_norm": 0.05591922142148154, + "language_loss": 0.84586883, + "learning_rate": 0.0007318946219813823, + "loss": 0.85738844, + "num_input_tokens_seen": 157823520, + "router_z_loss_mlp": 0.40991211, + "step": 1903, + "time_per_iteration": 3.0283257961273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151611, + "balance_loss_mlp": 1.11341679, + "epoch": 0.3662947287418238, + "flos": 564760097280.0, + "grad_norm": 0.0702623940180467, + "language_loss": 0.90117764, + "learning_rate": 0.000731618567457912, + "loss": 0.91269374, + "num_input_tokens_seen": 157893248, + "router_z_loss_mlp": 0.38208008, + "step": 1904, + "time_per_iteration": 2.651491165161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114788, + "balance_loss_mlp": 1.10522676, + "epoch": 0.3664871104270873, + "flos": 789752954880.0, + "grad_norm": 0.07047012066076976, + "language_loss": 0.87036794, + "learning_rate": 0.000731342423012139, + "loss": 0.88184673, + "num_input_tokens_seen": 157973216, + "router_z_loss_mlp": 0.42700195, + "step": 1905, + "time_per_iteration": 3.0361618995666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143776, + "balance_loss_mlp": 1.10331631, + "epoch": 0.3666794921123509, + "flos": 752557174272.0, + "grad_norm": 0.06969182334255739, + "language_loss": 0.82982039, + "learning_rate": 0.0007310661887512722, + "loss": 0.84125817, + "num_input_tokens_seen": 158051088, + "router_z_loss_mlp": 0.40478516, + "step": 1906, + "time_per_iteration": 3.020333766937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134716, + "balance_loss_mlp": 1.09592557, + "epoch": 0.3668718737976145, + "flos": 523531869696.0, + "grad_norm": 0.056548054453958524, + "language_loss": 0.82503444, + "learning_rate": 0.0007307898647825549, + "loss": 0.83638155, + "num_input_tokens_seen": 158124368, + "router_z_loss_mlp": 0.38793945, + "step": 1907, + "time_per_iteration": 2.6819958686828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128623, + "balance_loss_mlp": 1.08568358, + "epoch": 0.367064255482878, + "flos": 571967474688.0, + "grad_norm": 0.0662764931561561, + "language_loss": 0.89910614, + "learning_rate": 0.0007305134512132659, + "loss": 0.9103924, + "num_input_tokens_seen": 158191472, + "router_z_loss_mlp": 0.42944336, + "step": 1908, + "time_per_iteration": 2.688716411590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120758, + "balance_loss_mlp": 1.08063269, + "epoch": 0.3672566371681416, + "flos": 447114359808.0, + "grad_norm": 0.07972147303822336, + "language_loss": 0.83329952, + "learning_rate": 0.0007302369481507183, + "loss": 0.8445071, + "num_input_tokens_seen": 158254384, + "router_z_loss_mlp": 0.40136719, + "step": 1909, + "time_per_iteration": 2.520551919937134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043123, + "balance_loss_mlp": 1.03272831, + "epoch": 0.36744901885340514, + "flos": 1540090713600.0, + "grad_norm": 0.028970701382128577, + "language_loss": 0.79961759, + "learning_rate": 0.00072996035570226, + "loss": 0.81004882, + "num_input_tokens_seen": 158486160, + "router_z_loss_mlp": 0.10400391, + "step": 1910, + "time_per_iteration": 4.862990140914917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111533, + "balance_loss_mlp": 1.07534695, + "epoch": 0.36764140053866873, + "flos": 563685267456.0, + "grad_norm": 0.0535153553246422, + "language_loss": 0.85860741, + "learning_rate": 0.000729683673975274, + "loss": 0.86976075, + "num_input_tokens_seen": 158555616, + "router_z_loss_mlp": 0.3996582, + "step": 1911, + "time_per_iteration": 2.6834514141082764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117796, + "balance_loss_mlp": 1.07783747, + "epoch": 0.36783378222393226, + "flos": 1216663981056.0, + "grad_norm": 0.07394300555179863, + "language_loss": 0.83108044, + "learning_rate": 0.0007294069030771774, + "loss": 0.84225845, + "num_input_tokens_seen": 158653984, + "router_z_loss_mlp": 0.39941406, + "step": 1912, + "time_per_iteration": 3.6458523273468018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124936, + "balance_loss_mlp": 1.08483398, + "epoch": 0.36802616390919585, + "flos": 498724895232.0, + "grad_norm": 0.05916806609098389, + "language_loss": 0.90897858, + "learning_rate": 0.0007291300431154224, + "loss": 0.920228, + "num_input_tokens_seen": 158719728, + "router_z_loss_mlp": 0.40112305, + "step": 1913, + "time_per_iteration": 2.5737557411193848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0103288, + "balance_loss_mlp": 1.02157927, + "epoch": 0.36821854559445943, + "flos": 1582146349056.0, + "grad_norm": 0.013681752942923219, + "language_loss": 0.70389736, + "learning_rate": 0.0007288530941974955, + "loss": 0.71422619, + "num_input_tokens_seen": 158952544, + "router_z_loss_mlp": 0.11279297, + "step": 1914, + "time_per_iteration": 5.031456232070923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113953, + "balance_loss_mlp": 1.07499564, + "epoch": 0.36841092727972297, + "flos": 835626295296.0, + "grad_norm": 0.06158754254944219, + "language_loss": 0.79961407, + "learning_rate": 0.0007285760564309179, + "loss": 0.81075364, + "num_input_tokens_seen": 159039680, + "router_z_loss_mlp": 0.38964844, + "step": 1915, + "time_per_iteration": 3.152339458465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122924, + "balance_loss_mlp": 1.08346629, + "epoch": 0.36860330896498655, + "flos": 689855118336.0, + "grad_norm": 0.10197178679971165, + "language_loss": 0.85308397, + "learning_rate": 0.0007282989299232448, + "loss": 0.86431319, + "num_input_tokens_seen": 159128128, + "router_z_loss_mlp": 0.39453125, + "step": 1916, + "time_per_iteration": 3.0152268409729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119949, + "balance_loss_mlp": 1.08013296, + "epoch": 0.3687956906502501, + "flos": 554182497792.0, + "grad_norm": 0.05980283450468872, + "language_loss": 0.8385278, + "learning_rate": 0.0007280217147820668, + "loss": 0.84972733, + "num_input_tokens_seen": 159193248, + "router_z_loss_mlp": 0.39794922, + "step": 1917, + "time_per_iteration": 2.625802755355835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114962, + "balance_loss_mlp": 1.07512259, + "epoch": 0.3689880723355137, + "flos": 576703991808.0, + "grad_norm": 0.06755957483710798, + "language_loss": 0.79489267, + "learning_rate": 0.0007277444111150079, + "loss": 0.80604231, + "num_input_tokens_seen": 159265824, + "router_z_loss_mlp": 0.3984375, + "step": 1918, + "time_per_iteration": 2.6753525733947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112846, + "balance_loss_mlp": 1.08785725, + "epoch": 0.3691804540207772, + "flos": 528868942848.0, + "grad_norm": 0.07157808177363079, + "language_loss": 0.84730321, + "learning_rate": 0.0007274670190297272, + "loss": 0.8585878, + "num_input_tokens_seen": 159332992, + "router_z_loss_mlp": 0.40576172, + "step": 1919, + "time_per_iteration": 2.6149959564208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142135, + "balance_loss_mlp": 1.09986341, + "epoch": 0.3693728357060408, + "flos": 561019115520.0, + "grad_norm": 0.05944559747374387, + "language_loss": 0.8264004, + "learning_rate": 0.0007271895386339179, + "loss": 0.83782172, + "num_input_tokens_seen": 159409808, + "router_z_loss_mlp": 0.42285156, + "step": 1920, + "time_per_iteration": 2.7611513137817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140712, + "balance_loss_mlp": 1.09970427, + "epoch": 0.3695652173913043, + "flos": 579770265600.0, + "grad_norm": 0.059089751588204814, + "language_loss": 0.83542717, + "learning_rate": 0.0007269119700353073, + "loss": 0.8468343, + "num_input_tokens_seen": 159486128, + "router_z_loss_mlp": 0.41015625, + "step": 1921, + "time_per_iteration": 2.782167911529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148229, + "balance_loss_mlp": 1.10738814, + "epoch": 0.3697575990765679, + "flos": 512914622976.0, + "grad_norm": 0.06644949508392005, + "language_loss": 0.85268104, + "learning_rate": 0.0007266343133416571, + "loss": 0.8641634, + "num_input_tokens_seen": 159562224, + "router_z_loss_mlp": 0.40844727, + "step": 1922, + "time_per_iteration": 2.7218997478485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076086, + "balance_loss_mlp": 1.06340241, + "epoch": 0.3699499807618315, + "flos": 1570640025600.0, + "grad_norm": 0.03214674667569998, + "language_loss": 0.77116919, + "learning_rate": 0.0007263565686607632, + "loss": 0.78192997, + "num_input_tokens_seen": 159784768, + "router_z_loss_mlp": 0.12695312, + "step": 1923, + "time_per_iteration": 4.837427854537964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145902, + "balance_loss_mlp": 1.1028676, + "epoch": 0.37014236244709503, + "flos": 497338776576.0, + "grad_norm": 0.07518583721861193, + "language_loss": 0.84417462, + "learning_rate": 0.0007260787361004556, + "loss": 0.85563368, + "num_input_tokens_seen": 159848608, + "router_z_loss_mlp": 0.43041992, + "step": 1924, + "time_per_iteration": 2.5874598026275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050683, + "balance_loss_mlp": 1.03880954, + "epoch": 0.3703347441323586, + "flos": 1444368485376.0, + "grad_norm": 0.023888622594867324, + "language_loss": 0.73761505, + "learning_rate": 0.0007258008157685987, + "loss": 0.74812186, + "num_input_tokens_seen": 160080928, + "router_z_loss_mlp": 0.11865234, + "step": 1925, + "time_per_iteration": 4.961286544799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137865, + "balance_loss_mlp": 1.09571242, + "epoch": 0.37052712581762215, + "flos": 563601203712.0, + "grad_norm": 0.05584746966952834, + "language_loss": 0.87657702, + "learning_rate": 0.0007255228077730903, + "loss": 0.88795567, + "num_input_tokens_seen": 160148976, + "router_z_loss_mlp": 0.42163086, + "step": 1926, + "time_per_iteration": 2.663482666015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140781, + "balance_loss_mlp": 1.09786606, + "epoch": 0.37071950750288574, + "flos": 926078261760.0, + "grad_norm": 0.05562014185368244, + "language_loss": 0.81976974, + "learning_rate": 0.0007252447122218632, + "loss": 0.83117759, + "num_input_tokens_seen": 160233504, + "router_z_loss_mlp": 0.42919922, + "step": 1927, + "time_per_iteration": 3.1484758853912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138853, + "balance_loss_mlp": 1.09655809, + "epoch": 0.37091188918814927, + "flos": 418312014336.0, + "grad_norm": 0.06601877155853234, + "language_loss": 0.88791764, + "learning_rate": 0.0007249665292228834, + "loss": 0.89930612, + "num_input_tokens_seen": 160299696, + "router_z_loss_mlp": 0.4230957, + "step": 1928, + "time_per_iteration": 2.5840864181518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140563, + "balance_loss_mlp": 1.09872091, + "epoch": 0.37110427087341286, + "flos": 463182105600.0, + "grad_norm": 0.05314866644458525, + "language_loss": 0.83534646, + "learning_rate": 0.000724688258884151, + "loss": 0.84675211, + "num_input_tokens_seen": 160367904, + "router_z_loss_mlp": 0.41845703, + "step": 1929, + "time_per_iteration": 2.6063482761383057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129765, + "balance_loss_mlp": 1.09166527, + "epoch": 0.3712966525586764, + "flos": 849658180608.0, + "grad_norm": 0.06946275153671234, + "language_loss": 0.86767673, + "learning_rate": 0.0007244099013137002, + "loss": 0.87897444, + "num_input_tokens_seen": 160453600, + "router_z_loss_mlp": 0.38085938, + "step": 1930, + "time_per_iteration": 3.0539071559906006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127594, + "balance_loss_mlp": 1.0873971, + "epoch": 0.37148903424394, + "flos": 925954550784.0, + "grad_norm": 0.05696415350586704, + "language_loss": 0.89040637, + "learning_rate": 0.0007241314566195993, + "loss": 0.90168232, + "num_input_tokens_seen": 160543472, + "router_z_loss_mlp": 0.40185547, + "step": 1931, + "time_per_iteration": 3.2625389099121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111855, + "balance_loss_mlp": 1.07861531, + "epoch": 0.37168141592920356, + "flos": 519815854080.0, + "grad_norm": 0.08463017827171934, + "language_loss": 0.85909784, + "learning_rate": 0.0007238529249099496, + "loss": 0.87028337, + "num_input_tokens_seen": 160614016, + "router_z_loss_mlp": 0.39941406, + "step": 1932, + "time_per_iteration": 2.6740944385528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101582, + "balance_loss_mlp": 1.09080601, + "epoch": 0.3718737976144671, + "flos": 1445895567360.0, + "grad_norm": 0.046016525030599324, + "language_loss": 0.77856874, + "learning_rate": 0.0007235743062928872, + "loss": 0.78958464, + "num_input_tokens_seen": 160828640, + "router_z_loss_mlp": 0.10791016, + "step": 1933, + "time_per_iteration": 4.862685203552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125745, + "balance_loss_mlp": 1.08347321, + "epoch": 0.3720661792997307, + "flos": 759564490752.0, + "grad_norm": 0.10032321862894769, + "language_loss": 0.80747449, + "learning_rate": 0.000723295600876581, + "loss": 0.81873196, + "num_input_tokens_seen": 160913088, + "router_z_loss_mlp": 0.42285156, + "step": 1934, + "time_per_iteration": 2.990391969680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125218, + "balance_loss_mlp": 1.08406699, + "epoch": 0.3722585609849942, + "flos": 516956981760.0, + "grad_norm": 0.057414096803471676, + "language_loss": 0.87956464, + "learning_rate": 0.0007230168087692344, + "loss": 0.89081681, + "num_input_tokens_seen": 160982960, + "router_z_loss_mlp": 0.41162109, + "step": 1935, + "time_per_iteration": 2.656625270843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119009, + "balance_loss_mlp": 1.07924092, + "epoch": 0.3724509426702578, + "flos": 782464084992.0, + "grad_norm": 0.060205825913767164, + "language_loss": 0.82307911, + "learning_rate": 0.0007227379300790839, + "loss": 0.83426917, + "num_input_tokens_seen": 161066000, + "router_z_loss_mlp": 0.39770508, + "step": 1936, + "time_per_iteration": 2.997037649154663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114267, + "balance_loss_mlp": 1.07218599, + "epoch": 0.37264332435552133, + "flos": 391720997376.0, + "grad_norm": 0.06128365804507508, + "language_loss": 0.86067426, + "learning_rate": 0.0007224589649143997, + "loss": 0.87181687, + "num_input_tokens_seen": 161131040, + "router_z_loss_mlp": 0.4206543, + "step": 1937, + "time_per_iteration": 2.5290677547454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124508, + "balance_loss_mlp": 1.08228397, + "epoch": 0.3728357060407849, + "flos": 542861180928.0, + "grad_norm": 0.06605047879793914, + "language_loss": 0.81297445, + "learning_rate": 0.0007221799133834861, + "loss": 0.82421947, + "num_input_tokens_seen": 161201248, + "router_z_loss_mlp": 0.42236328, + "step": 1938, + "time_per_iteration": 2.613140106201172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122203, + "balance_loss_mlp": 1.08195794, + "epoch": 0.3730280877260485, + "flos": 433571802624.0, + "grad_norm": 0.09318016716004435, + "language_loss": 0.8198092, + "learning_rate": 0.00072190077559468, + "loss": 0.83103126, + "num_input_tokens_seen": 161266288, + "router_z_loss_mlp": 0.40209961, + "step": 1939, + "time_per_iteration": 2.517237424850464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115818, + "balance_loss_mlp": 1.07578754, + "epoch": 0.37322046941131204, + "flos": 531485535744.0, + "grad_norm": 0.0553068133661429, + "language_loss": 0.8932575, + "learning_rate": 0.0007216215516563527, + "loss": 0.90441567, + "num_input_tokens_seen": 161335648, + "router_z_loss_mlp": 0.40014648, + "step": 1940, + "time_per_iteration": 2.7175915241241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111901, + "balance_loss_mlp": 1.07089305, + "epoch": 0.3734128510965756, + "flos": 531549775872.0, + "grad_norm": 0.06982995582267476, + "language_loss": 0.83827746, + "learning_rate": 0.0007213422416769083, + "loss": 0.84939647, + "num_input_tokens_seen": 161403440, + "router_z_loss_mlp": 0.41015625, + "step": 1941, + "time_per_iteration": 2.5922279357910156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116664, + "balance_loss_mlp": 1.07684803, + "epoch": 0.37360523278183916, + "flos": 500442126336.0, + "grad_norm": 0.050249137281424494, + "language_loss": 0.75479639, + "learning_rate": 0.0007210628457647849, + "loss": 0.76596296, + "num_input_tokens_seen": 161472864, + "router_z_loss_mlp": 0.39819336, + "step": 1942, + "time_per_iteration": 2.583151340484619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118751, + "balance_loss_mlp": 1.07781446, + "epoch": 0.37379761446710275, + "flos": 547943491584.0, + "grad_norm": 0.0794488438004998, + "language_loss": 0.79022861, + "learning_rate": 0.000720783364028453, + "loss": 0.8014161, + "num_input_tokens_seen": 161548096, + "router_z_loss_mlp": 0.40942383, + "step": 1943, + "time_per_iteration": 2.7737677097320557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114071, + "balance_loss_mlp": 1.07418346, + "epoch": 0.3739899961523663, + "flos": 475761060864.0, + "grad_norm": 0.05694655733140731, + "language_loss": 0.87941283, + "learning_rate": 0.0007205037965764177, + "loss": 0.89055347, + "num_input_tokens_seen": 161615600, + "router_z_loss_mlp": 0.39868164, + "step": 1944, + "time_per_iteration": 2.558089256286621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123121, + "balance_loss_mlp": 1.08430672, + "epoch": 0.37418237783762986, + "flos": 611915668992.0, + "grad_norm": 0.07621334150317126, + "language_loss": 0.85730159, + "learning_rate": 0.0007202241435172161, + "loss": 0.86853278, + "num_input_tokens_seen": 161687408, + "router_z_loss_mlp": 0.38769531, + "step": 1945, + "time_per_iteration": 2.7602779865264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125439, + "balance_loss_mlp": 1.08574176, + "epoch": 0.3743747595228934, + "flos": 766287682560.0, + "grad_norm": 0.07927003262790512, + "language_loss": 0.88465476, + "learning_rate": 0.0007199444049594198, + "loss": 0.89590919, + "num_input_tokens_seen": 161764224, + "router_z_loss_mlp": 0.39697266, + "step": 1946, + "time_per_iteration": 2.9583580493927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119027, + "balance_loss_mlp": 1.07665968, + "epoch": 0.374567141208157, + "flos": 524394155520.0, + "grad_norm": 0.055396154938164174, + "language_loss": 0.8346498, + "learning_rate": 0.0007196645810116322, + "loss": 0.8458401, + "num_input_tokens_seen": 161835520, + "router_z_loss_mlp": 0.42382812, + "step": 1947, + "time_per_iteration": 2.6851320266723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131178, + "balance_loss_mlp": 1.09045637, + "epoch": 0.37475952289342057, + "flos": 681375421440.0, + "grad_norm": 0.05889971918419499, + "language_loss": 0.84302223, + "learning_rate": 0.0007193846717824912, + "loss": 0.854334, + "num_input_tokens_seen": 161912000, + "router_z_loss_mlp": 0.40698242, + "step": 1948, + "time_per_iteration": 2.9035325050354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129423, + "balance_loss_mlp": 1.08848619, + "epoch": 0.3749519045786841, + "flos": 460291299840.0, + "grad_norm": 0.07994215642664601, + "language_loss": 0.88549483, + "learning_rate": 0.0007191046773806669, + "loss": 0.89678907, + "num_input_tokens_seen": 161977296, + "router_z_loss_mlp": 0.40942383, + "step": 1949, + "time_per_iteration": 2.574697256088257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135159, + "balance_loss_mlp": 1.09224343, + "epoch": 0.3751442862639477, + "flos": 954853443072.0, + "grad_norm": 0.07615017139071276, + "language_loss": 0.8356899, + "learning_rate": 0.0007188245979148631, + "loss": 0.84704149, + "num_input_tokens_seen": 162051888, + "router_z_loss_mlp": 0.42919922, + "step": 1950, + "time_per_iteration": 3.216397285461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137761, + "balance_loss_mlp": 1.09475029, + "epoch": 0.3753366679492112, + "flos": 527747125248.0, + "grad_norm": 0.061651705216508604, + "language_loss": 0.87894762, + "learning_rate": 0.0007185444334938157, + "loss": 0.89032525, + "num_input_tokens_seen": 162124384, + "router_z_loss_mlp": 0.43041992, + "step": 1951, + "time_per_iteration": 2.6782584190368652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127424, + "balance_loss_mlp": 1.08879972, + "epoch": 0.3755290496344748, + "flos": 521797386240.0, + "grad_norm": 0.07782676746029546, + "language_loss": 0.84900033, + "learning_rate": 0.0007182641842262947, + "loss": 0.86027455, + "num_input_tokens_seen": 162191440, + "router_z_loss_mlp": 0.38647461, + "step": 1952, + "time_per_iteration": 2.639446258544922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125752, + "balance_loss_mlp": 1.08603168, + "epoch": 0.37572143131973834, + "flos": 621121830912.0, + "grad_norm": 0.05954692469221933, + "language_loss": 0.78027642, + "learning_rate": 0.0007179838502211022, + "loss": 0.79153389, + "num_input_tokens_seen": 162268480, + "router_z_loss_mlp": 0.3972168, + "step": 1953, + "time_per_iteration": 2.84329891204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131364, + "balance_loss_mlp": 1.09028411, + "epoch": 0.37591381300500193, + "flos": 770962530816.0, + "grad_norm": 0.10232430816689406, + "language_loss": 0.86411202, + "learning_rate": 0.0007177034315870738, + "loss": 0.8754257, + "num_input_tokens_seen": 162346752, + "router_z_loss_mlp": 0.41064453, + "step": 1954, + "time_per_iteration": 2.957648992538452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124051, + "balance_loss_mlp": 1.08325803, + "epoch": 0.37610619469026546, + "flos": 520448343552.0, + "grad_norm": 0.06271313302399782, + "language_loss": 0.91398948, + "learning_rate": 0.0007174229284330773, + "loss": 0.92523003, + "num_input_tokens_seen": 162415120, + "router_z_loss_mlp": 0.40795898, + "step": 1955, + "time_per_iteration": 2.5879859924316406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128257, + "balance_loss_mlp": 1.08879828, + "epoch": 0.37629857637552905, + "flos": 598812880896.0, + "grad_norm": 0.06607511431735706, + "language_loss": 0.86850858, + "learning_rate": 0.0007171423408680141, + "loss": 0.87979114, + "num_input_tokens_seen": 162493280, + "router_z_loss_mlp": 0.39453125, + "step": 1956, + "time_per_iteration": 2.7903566360473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123297, + "balance_loss_mlp": 1.08295655, + "epoch": 0.37649095806079264, + "flos": 564952817664.0, + "grad_norm": 0.06886679209235984, + "language_loss": 0.90041375, + "learning_rate": 0.0007168616690008176, + "loss": 0.91164672, + "num_input_tokens_seen": 162560736, + "router_z_loss_mlp": 0.40356445, + "step": 1957, + "time_per_iteration": 2.6327474117279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116463, + "balance_loss_mlp": 1.07705224, + "epoch": 0.37668333974605617, + "flos": 592470360576.0, + "grad_norm": 0.062429689069725576, + "language_loss": 0.85725892, + "learning_rate": 0.0007165809129404545, + "loss": 0.86842352, + "num_input_tokens_seen": 162630688, + "router_z_loss_mlp": 0.39404297, + "step": 1958, + "time_per_iteration": 2.7385900020599365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124898, + "balance_loss_mlp": 1.08527279, + "epoch": 0.37687572143131975, + "flos": 419478248448.0, + "grad_norm": 0.05793527093847313, + "language_loss": 0.85962278, + "learning_rate": 0.0007163000727959239, + "loss": 0.87087178, + "num_input_tokens_seen": 162694304, + "router_z_loss_mlp": 0.39624023, + "step": 1959, + "time_per_iteration": 2.485438585281372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046527, + "balance_loss_mlp": 1.0320313, + "epoch": 0.3770681031165833, + "flos": 1357262148096.0, + "grad_norm": 0.027906108498427614, + "language_loss": 0.77959073, + "learning_rate": 0.0007160191486762575, + "loss": 0.79005599, + "num_input_tokens_seen": 162920336, + "router_z_loss_mlp": 0.14453125, + "step": 1960, + "time_per_iteration": 4.834578275680542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117854, + "balance_loss_mlp": 1.07865775, + "epoch": 0.3772604848018469, + "flos": 644903534592.0, + "grad_norm": 0.05325294699236946, + "language_loss": 0.84349847, + "learning_rate": 0.00071573814069052, + "loss": 0.85467696, + "num_input_tokens_seen": 163000720, + "router_z_loss_mlp": 0.39208984, + "step": 1961, + "time_per_iteration": 2.9086802005767822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120534, + "balance_loss_mlp": 1.08219612, + "epoch": 0.3774528664871104, + "flos": 901651585536.0, + "grad_norm": 0.09498383670658105, + "language_loss": 0.88074362, + "learning_rate": 0.0007154570489478081, + "loss": 0.89194894, + "num_input_tokens_seen": 163085680, + "router_z_loss_mlp": 0.38330078, + "step": 1962, + "time_per_iteration": 3.2217841148376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117183, + "balance_loss_mlp": 1.07889283, + "epoch": 0.377645248172374, + "flos": 788065459200.0, + "grad_norm": 0.05466788938828107, + "language_loss": 0.86278516, + "learning_rate": 0.0007151758735572514, + "loss": 0.87395698, + "num_input_tokens_seen": 163162224, + "router_z_loss_mlp": 0.38232422, + "step": 1963, + "time_per_iteration": 3.01104998588562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130106, + "balance_loss_mlp": 1.08921766, + "epoch": 0.3778376298576376, + "flos": 586718111232.0, + "grad_norm": 0.06218420858169212, + "language_loss": 0.81413925, + "learning_rate": 0.0007148946146280119, + "loss": 0.82544029, + "num_input_tokens_seen": 163237920, + "router_z_loss_mlp": 0.40893555, + "step": 1964, + "time_per_iteration": 2.8039112091064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026607, + "balance_loss_mlp": 1.01440012, + "epoch": 0.3780300115429011, + "flos": 1396743782400.0, + "grad_norm": 0.022738468700431315, + "language_loss": 0.72192144, + "learning_rate": 0.000714613272269284, + "loss": 0.73218751, + "num_input_tokens_seen": 163455760, + "router_z_loss_mlp": 0.12207031, + "step": 1965, + "time_per_iteration": 4.8600172996521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024213, + "balance_loss_mlp": 1.0124352, + "epoch": 0.3782223932281647, + "flos": 1357672555008.0, + "grad_norm": 0.018349030303600054, + "language_loss": 0.75341946, + "learning_rate": 0.0007143318465902943, + "loss": 0.76366156, + "num_input_tokens_seen": 163678064, + "router_z_loss_mlp": 0.11767578, + "step": 1966, + "time_per_iteration": 4.918729782104492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135372, + "balance_loss_mlp": 1.09648633, + "epoch": 0.37841477491342823, + "flos": 704151304704.0, + "grad_norm": 0.2766921299066869, + "language_loss": 0.83812642, + "learning_rate": 0.0007140503377003022, + "loss": 0.84948009, + "num_input_tokens_seen": 163764320, + "router_z_loss_mlp": 0.38891602, + "step": 1967, + "time_per_iteration": 3.015761613845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149326, + "balance_loss_mlp": 1.10862756, + "epoch": 0.3786071565986918, + "flos": 529115991552.0, + "grad_norm": 0.07158509383086724, + "language_loss": 0.8519339, + "learning_rate": 0.000713768745708599, + "loss": 0.8634271, + "num_input_tokens_seen": 163831808, + "router_z_loss_mlp": 0.40698242, + "step": 1968, + "time_per_iteration": 2.6109209060668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140905, + "balance_loss_mlp": 1.09996843, + "epoch": 0.37879953828395535, + "flos": 993277126656.0, + "grad_norm": 0.05954158443482363, + "language_loss": 0.774553, + "learning_rate": 0.0007134870707245085, + "loss": 0.78596205, + "num_input_tokens_seen": 163918128, + "router_z_loss_mlp": 0.40893555, + "step": 1969, + "time_per_iteration": 3.2631757259368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150809, + "balance_loss_mlp": 1.11008716, + "epoch": 0.37899191996921894, + "flos": 626644283904.0, + "grad_norm": 0.05763521765218817, + "language_loss": 0.84313977, + "learning_rate": 0.0007132053128573864, + "loss": 0.85464787, + "num_input_tokens_seen": 163987552, + "router_z_loss_mlp": 0.40698242, + "step": 1970, + "time_per_iteration": 2.7791051864624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143919, + "balance_loss_mlp": 1.10353041, + "epoch": 0.37918430165448247, + "flos": 686307230208.0, + "grad_norm": 0.06905446326925666, + "language_loss": 0.84168518, + "learning_rate": 0.0007129234722166211, + "loss": 0.85312432, + "num_input_tokens_seen": 164063248, + "router_z_loss_mlp": 0.40356445, + "step": 1971, + "time_per_iteration": 2.8210554122924805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149932, + "balance_loss_mlp": 1.11152232, + "epoch": 0.37937668333974606, + "flos": 475622668800.0, + "grad_norm": 0.07023460279096982, + "language_loss": 0.91057038, + "learning_rate": 0.0007126415489116328, + "loss": 0.92206967, + "num_input_tokens_seen": 164133776, + "router_z_loss_mlp": 0.3840332, + "step": 1972, + "time_per_iteration": 2.672755002975464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153021, + "balance_loss_mlp": 1.11210799, + "epoch": 0.37956906502500964, + "flos": 707580997632.0, + "grad_norm": 0.06814261110374484, + "language_loss": 0.81719398, + "learning_rate": 0.0007123595430518736, + "loss": 0.82872415, + "num_input_tokens_seen": 164206672, + "router_z_loss_mlp": 0.40917969, + "step": 1973, + "time_per_iteration": 2.8325109481811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139649, + "balance_loss_mlp": 1.10081029, + "epoch": 0.3797614467102732, + "flos": 426648549888.0, + "grad_norm": 0.06503005991167149, + "language_loss": 0.86840981, + "learning_rate": 0.0007120774547468282, + "loss": 0.87980628, + "num_input_tokens_seen": 164271968, + "router_z_loss_mlp": 0.38793945, + "step": 1974, + "time_per_iteration": 2.6115715503692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148199, + "balance_loss_mlp": 1.10781133, + "epoch": 0.37995382839553676, + "flos": 481846620672.0, + "grad_norm": 0.05441443516000103, + "language_loss": 0.81729043, + "learning_rate": 0.0007117952841060128, + "loss": 0.82877243, + "num_input_tokens_seen": 164342800, + "router_z_loss_mlp": 0.40380859, + "step": 1975, + "time_per_iteration": 2.6378135681152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135454, + "balance_loss_mlp": 1.09389758, + "epoch": 0.3801462100808003, + "flos": 560562094080.0, + "grad_norm": 0.08133175482890537, + "language_loss": 0.83869064, + "learning_rate": 0.0007115130312389756, + "loss": 0.85004514, + "num_input_tokens_seen": 164414928, + "router_z_loss_mlp": 0.41552734, + "step": 1976, + "time_per_iteration": 2.664318084716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139177, + "balance_loss_mlp": 1.0974772, + "epoch": 0.3803385917660639, + "flos": 464936412672.0, + "grad_norm": 0.06620518382871708, + "language_loss": 0.79781663, + "learning_rate": 0.0007112306962552973, + "loss": 0.80920839, + "num_input_tokens_seen": 164483312, + "router_z_loss_mlp": 0.41699219, + "step": 1977, + "time_per_iteration": 2.6198599338531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129586, + "balance_loss_mlp": 1.0891974, + "epoch": 0.3805309734513274, + "flos": 521871538176.0, + "grad_norm": 0.05972767263520316, + "language_loss": 0.85605282, + "learning_rate": 0.0007109482792645896, + "loss": 0.86734867, + "num_input_tokens_seen": 164555760, + "router_z_loss_mlp": 0.40356445, + "step": 1978, + "time_per_iteration": 2.728576898574829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132218, + "balance_loss_mlp": 1.09066188, + "epoch": 0.380723355136591, + "flos": 591412783104.0, + "grad_norm": 0.09572440125940551, + "language_loss": 0.84308225, + "learning_rate": 0.0007106657803764969, + "loss": 0.85440445, + "num_input_tokens_seen": 164626768, + "router_z_loss_mlp": 0.41552734, + "step": 1979, + "time_per_iteration": 2.7279720306396484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126537, + "balance_loss_mlp": 1.08340704, + "epoch": 0.38091573682185453, + "flos": 622685988864.0, + "grad_norm": 0.05862837672704736, + "language_loss": 0.82269728, + "learning_rate": 0.0007103831997006948, + "loss": 0.83396262, + "num_input_tokens_seen": 164698016, + "router_z_loss_mlp": 0.43164062, + "step": 1980, + "time_per_iteration": 2.746915817260742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127621, + "balance_loss_mlp": 1.08663654, + "epoch": 0.3811081185071181, + "flos": 569007286272.0, + "grad_norm": 0.05821983888794681, + "language_loss": 0.85798764, + "learning_rate": 0.0007101005373468908, + "loss": 0.86926389, + "num_input_tokens_seen": 164780320, + "router_z_loss_mlp": 0.40991211, + "step": 1981, + "time_per_iteration": 2.878394365310669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131348, + "balance_loss_mlp": 1.09060264, + "epoch": 0.3813005001923817, + "flos": 584837895168.0, + "grad_norm": 0.057148713710776886, + "language_loss": 0.86977971, + "learning_rate": 0.0007098177934248242, + "loss": 0.88109326, + "num_input_tokens_seen": 164854400, + "router_z_loss_mlp": 0.40771484, + "step": 1982, + "time_per_iteration": 2.7281908988952637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142672, + "balance_loss_mlp": 1.09918451, + "epoch": 0.38149288187764524, + "flos": 621591335424.0, + "grad_norm": 0.07304374640444197, + "language_loss": 0.85583997, + "learning_rate": 0.0007095349680442661, + "loss": 0.86726665, + "num_input_tokens_seen": 164932896, + "router_z_loss_mlp": 0.43505859, + "step": 1983, + "time_per_iteration": 2.831989288330078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132213, + "balance_loss_mlp": 1.09015596, + "epoch": 0.3816852635629088, + "flos": 570690012672.0, + "grad_norm": 0.059661631452858944, + "language_loss": 0.79073238, + "learning_rate": 0.0007092520613150188, + "loss": 0.80205452, + "num_input_tokens_seen": 165002896, + "router_z_loss_mlp": 0.4206543, + "step": 1984, + "time_per_iteration": 2.6566810607910156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136369, + "balance_loss_mlp": 1.09416926, + "epoch": 0.38187764524817236, + "flos": 565585307136.0, + "grad_norm": 0.0624399969319272, + "language_loss": 0.81395054, + "learning_rate": 0.0007089690733469165, + "loss": 0.82531422, + "num_input_tokens_seen": 165074704, + "router_z_loss_mlp": 0.42236328, + "step": 1985, + "time_per_iteration": 2.713041067123413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133128, + "balance_loss_mlp": 1.09023643, + "epoch": 0.38207002693343595, + "flos": 631225156608.0, + "grad_norm": 0.0833415836593691, + "language_loss": 0.83054602, + "learning_rate": 0.000708686004249825, + "loss": 0.84187728, + "num_input_tokens_seen": 165149136, + "router_z_loss_mlp": 0.42895508, + "step": 1986, + "time_per_iteration": 2.7708489894866943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135389, + "balance_loss_mlp": 1.09311724, + "epoch": 0.3822624086186995, + "flos": 548773843968.0, + "grad_norm": 0.050231849807362665, + "language_loss": 0.91983181, + "learning_rate": 0.0007084028541336413, + "loss": 0.93118572, + "num_input_tokens_seen": 165220864, + "router_z_loss_mlp": 0.42260742, + "step": 1987, + "time_per_iteration": 2.7049031257629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135282, + "balance_loss_mlp": 1.09205675, + "epoch": 0.38245479030396307, + "flos": 613870036992.0, + "grad_norm": 0.07987509930436443, + "language_loss": 0.86416399, + "learning_rate": 0.0007081196231082942, + "loss": 0.87551689, + "num_input_tokens_seen": 165301568, + "router_z_loss_mlp": 0.43212891, + "step": 1988, + "time_per_iteration": 2.769559860229492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143055, + "balance_loss_mlp": 1.09949565, + "epoch": 0.38264717198922665, + "flos": 668089824768.0, + "grad_norm": 0.09872496004335095, + "language_loss": 0.80492568, + "learning_rate": 0.0007078363112837436, + "loss": 0.81635618, + "num_input_tokens_seen": 165373152, + "router_z_loss_mlp": 0.43579102, + "step": 1989, + "time_per_iteration": 2.836904525756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144237, + "balance_loss_mlp": 1.10065365, + "epoch": 0.3828395536744902, + "flos": 454754165760.0, + "grad_norm": 0.05755280117815587, + "language_loss": 0.85391158, + "learning_rate": 0.000707552918769981, + "loss": 0.86535394, + "num_input_tokens_seen": 165439136, + "router_z_loss_mlp": 0.43579102, + "step": 1990, + "time_per_iteration": 2.552560806274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114164, + "balance_loss_mlp": 1.09846199, + "epoch": 0.3830319353597538, + "flos": 499448788992.0, + "grad_norm": 0.058237292508227935, + "language_loss": 0.83844453, + "learning_rate": 0.000707269445677029, + "loss": 0.84986091, + "num_input_tokens_seen": 165514624, + "router_z_loss_mlp": 0.43188477, + "step": 1991, + "time_per_iteration": 2.717240571975708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155192, + "balance_loss_mlp": 1.11270583, + "epoch": 0.3832243170450173, + "flos": 744121893888.0, + "grad_norm": 0.08345502818850435, + "language_loss": 0.85774487, + "learning_rate": 0.0007069858921149416, + "loss": 0.86929679, + "num_input_tokens_seen": 165594512, + "router_z_loss_mlp": 0.42480469, + "step": 1992, + "time_per_iteration": 2.937901496887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143498, + "balance_loss_mlp": 1.10120225, + "epoch": 0.3834166987302809, + "flos": 578218590720.0, + "grad_norm": 0.06679457573221616, + "language_loss": 0.86415881, + "learning_rate": 0.0007067022581938043, + "loss": 0.87559378, + "num_input_tokens_seen": 165673968, + "router_z_loss_mlp": 0.4230957, + "step": 1993, + "time_per_iteration": 2.8283159732818604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147458, + "balance_loss_mlp": 1.10614026, + "epoch": 0.3836090804155444, + "flos": 536476442112.0, + "grad_norm": 0.06079929242541683, + "language_loss": 0.83476102, + "learning_rate": 0.0007064185440237334, + "loss": 0.84623557, + "num_input_tokens_seen": 165747664, + "router_z_loss_mlp": 0.41333008, + "step": 1994, + "time_per_iteration": 2.738664150238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148789, + "balance_loss_mlp": 1.10627878, + "epoch": 0.383801462100808, + "flos": 601879154688.0, + "grad_norm": 0.05320553517563596, + "language_loss": 0.8495338, + "learning_rate": 0.0007061347497148764, + "loss": 0.8610217, + "num_input_tokens_seen": 165824624, + "router_z_loss_mlp": 0.42504883, + "step": 1995, + "time_per_iteration": 2.7379775047302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147554, + "balance_loss_mlp": 1.10444832, + "epoch": 0.38399384378607154, + "flos": 572701280256.0, + "grad_norm": 0.059351713178290334, + "language_loss": 0.86747766, + "learning_rate": 0.0007058508753774122, + "loss": 0.87895322, + "num_input_tokens_seen": 165896304, + "router_z_loss_mlp": 0.4309082, + "step": 1996, + "time_per_iteration": 2.6882424354553223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144268, + "balance_loss_mlp": 1.10242534, + "epoch": 0.38418622547133513, + "flos": 536765709312.0, + "grad_norm": 0.08780844300106258, + "language_loss": 0.87086272, + "learning_rate": 0.0007055669211215505, + "loss": 0.88230544, + "num_input_tokens_seen": 165961312, + "router_z_loss_mlp": 0.41870117, + "step": 1997, + "time_per_iteration": 2.5902607440948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136259, + "balance_loss_mlp": 1.09236586, + "epoch": 0.3843786071565987, + "flos": 572940988416.0, + "grad_norm": 0.0743501008638896, + "language_loss": 0.77852333, + "learning_rate": 0.0007052828870575322, + "loss": 0.78988594, + "num_input_tokens_seen": 166028064, + "router_z_loss_mlp": 0.43896484, + "step": 1998, + "time_per_iteration": 2.643887519836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113691, + "balance_loss_mlp": 1.09521055, + "epoch": 0.38457098884186225, + "flos": 728703889920.0, + "grad_norm": 0.05655172042288627, + "language_loss": 0.87035221, + "learning_rate": 0.0007049987732956291, + "loss": 0.88172132, + "num_input_tokens_seen": 166110272, + "router_z_loss_mlp": 0.41723633, + "step": 1999, + "time_per_iteration": 2.9655773639678955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132979, + "balance_loss_mlp": 1.09325886, + "epoch": 0.38476337052712584, + "flos": 583422041088.0, + "grad_norm": 0.061738893850828154, + "language_loss": 0.83046496, + "learning_rate": 0.0007047145799461439, + "loss": 0.84179473, + "num_input_tokens_seen": 166193088, + "router_z_loss_mlp": 0.39746094, + "step": 2000, + "time_per_iteration": 2.8509583473205566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132875, + "balance_loss_mlp": 1.0917958, + "epoch": 0.38495575221238937, + "flos": 553060680192.0, + "grad_norm": 0.06203375299954445, + "language_loss": 0.82530397, + "learning_rate": 0.00070443030711941, + "loss": 0.83663273, + "num_input_tokens_seen": 166271776, + "router_z_loss_mlp": 0.41088867, + "step": 2001, + "time_per_iteration": 2.759324312210083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134639, + "balance_loss_mlp": 1.09386945, + "epoch": 0.38514813389765296, + "flos": 654473115648.0, + "grad_norm": 0.05757301433327453, + "language_loss": 0.83082199, + "learning_rate": 0.0007041459549257924, + "loss": 0.84216839, + "num_input_tokens_seen": 166350000, + "router_z_loss_mlp": 0.40771484, + "step": 2002, + "time_per_iteration": 2.8542449474334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121155, + "balance_loss_mlp": 1.08014655, + "epoch": 0.3853405155829165, + "flos": 868100239872.0, + "grad_norm": 0.07528883527847323, + "language_loss": 0.78547823, + "learning_rate": 0.0007038615234756859, + "loss": 0.79668975, + "num_input_tokens_seen": 166434336, + "router_z_loss_mlp": 0.41015625, + "step": 2003, + "time_per_iteration": 3.211712598800659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125228, + "balance_loss_mlp": 1.08257461, + "epoch": 0.3855328972681801, + "flos": 546424123392.0, + "grad_norm": 0.05751633762771481, + "language_loss": 0.83558142, + "learning_rate": 0.000703577012879517, + "loss": 0.84683371, + "num_input_tokens_seen": 166503952, + "router_z_loss_mlp": 0.42651367, + "step": 2004, + "time_per_iteration": 2.628211498260498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130283, + "balance_loss_mlp": 1.08956099, + "epoch": 0.3857252789534436, + "flos": 534074964480.0, + "grad_norm": 0.08619617913051347, + "language_loss": 0.89379585, + "learning_rate": 0.0007032924232477423, + "loss": 0.90509868, + "num_input_tokens_seen": 166575168, + "router_z_loss_mlp": 0.40722656, + "step": 2005, + "time_per_iteration": 2.631619930267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128848, + "balance_loss_mlp": 1.08743477, + "epoch": 0.3859176606387072, + "flos": 491764566528.0, + "grad_norm": 0.06586843636176778, + "language_loss": 0.80831605, + "learning_rate": 0.0007030077546908493, + "loss": 0.81960452, + "num_input_tokens_seen": 166647552, + "router_z_loss_mlp": 0.4140625, + "step": 2006, + "time_per_iteration": 2.6160101890563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336479, + "balance_loss_mlp": 1.3253212, + "epoch": 0.3861100423239708, + "flos": 1487052214272.0, + "grad_norm": 0.11294410837330418, + "language_loss": 0.83064663, + "learning_rate": 0.0007027230073193561, + "loss": 0.84401143, + "num_input_tokens_seen": 166875088, + "router_z_loss_mlp": 0.11181641, + "step": 2007, + "time_per_iteration": 4.7873475551605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131514, + "balance_loss_mlp": 1.09014845, + "epoch": 0.3863024240092343, + "flos": 473732540928.0, + "grad_norm": 0.06382618687285554, + "language_loss": 0.79329109, + "learning_rate": 0.0007024381812438117, + "loss": 0.8046062, + "num_input_tokens_seen": 166939344, + "router_z_loss_mlp": 0.41381836, + "step": 2008, + "time_per_iteration": 2.5387141704559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152986, + "balance_loss_mlp": 1.11390948, + "epoch": 0.3864948056944979, + "flos": 716601779712.0, + "grad_norm": 0.0811673363837608, + "language_loss": 0.83681285, + "learning_rate": 0.0007021532765747951, + "loss": 0.84834278, + "num_input_tokens_seen": 167014992, + "router_z_loss_mlp": 0.390625, + "step": 2009, + "time_per_iteration": 2.9795420169830322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164171, + "balance_loss_mlp": 1.12082672, + "epoch": 0.38668718737976143, + "flos": 727631631360.0, + "grad_norm": 0.11123688030830275, + "language_loss": 0.7961666, + "learning_rate": 0.0007018682934229162, + "loss": 0.80780828, + "num_input_tokens_seen": 167092096, + "router_z_loss_mlp": 0.43334961, + "step": 2010, + "time_per_iteration": 2.9108352661132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164881, + "balance_loss_mlp": 1.1216315, + "epoch": 0.386879569065025, + "flos": 525471556608.0, + "grad_norm": 0.07913719393788664, + "language_loss": 0.83099723, + "learning_rate": 0.0007015832318988152, + "loss": 0.842646, + "num_input_tokens_seen": 167162144, + "router_z_loss_mlp": 0.43237305, + "step": 2011, + "time_per_iteration": 2.605280637741089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082789, + "balance_loss_mlp": 1.07096386, + "epoch": 0.38707195075028855, + "flos": 1527771663360.0, + "grad_norm": 0.024547203760462325, + "language_loss": 0.73890078, + "learning_rate": 0.000701298092113163, + "loss": 0.74972868, + "num_input_tokens_seen": 167391536, + "router_z_loss_mlp": 0.11816406, + "step": 2012, + "time_per_iteration": 4.955415964126587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161774, + "balance_loss_mlp": 1.12167192, + "epoch": 0.38726433243555214, + "flos": 557313011712.0, + "grad_norm": 0.062010894867637535, + "language_loss": 0.84259552, + "learning_rate": 0.0007010128741766604, + "loss": 0.85421324, + "num_input_tokens_seen": 167466000, + "router_z_loss_mlp": 0.40112305, + "step": 2013, + "time_per_iteration": 2.738905906677246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162675, + "balance_loss_mlp": 1.12080884, + "epoch": 0.38745671412081567, + "flos": 553695740928.0, + "grad_norm": 0.08443522979585812, + "language_loss": 0.84619504, + "learning_rate": 0.0007007275782000391, + "loss": 0.85782182, + "num_input_tokens_seen": 167536144, + "router_z_loss_mlp": 0.41870117, + "step": 2014, + "time_per_iteration": 2.6049582958221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178912, + "balance_loss_mlp": 1.13528132, + "epoch": 0.38764909580607926, + "flos": 458408512512.0, + "grad_norm": 0.05901822901260885, + "language_loss": 0.84836662, + "learning_rate": 0.0007004422042940605, + "loss": 0.8601557, + "num_input_tokens_seen": 167600064, + "router_z_loss_mlp": 0.43603516, + "step": 2015, + "time_per_iteration": 2.5449817180633545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174031, + "balance_loss_mlp": 1.13106763, + "epoch": 0.38784147749134285, + "flos": 522229814784.0, + "grad_norm": 0.07137462797198264, + "language_loss": 0.89881837, + "learning_rate": 0.0007001567525695169, + "loss": 0.9105587, + "num_input_tokens_seen": 167666576, + "router_z_loss_mlp": 0.42993164, + "step": 2016, + "time_per_iteration": 2.5804128646850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191346, + "balance_loss_mlp": 1.14921737, + "epoch": 0.3880338591766064, + "flos": 666036338688.0, + "grad_norm": 0.11416128839824946, + "language_loss": 0.84030014, + "learning_rate": 0.0006998712231372303, + "loss": 0.85221362, + "num_input_tokens_seen": 167753296, + "router_z_loss_mlp": 0.42138672, + "step": 2017, + "time_per_iteration": 2.9779462814331055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182085, + "balance_loss_mlp": 1.13845432, + "epoch": 0.38822624086186996, + "flos": 593962564608.0, + "grad_norm": 0.06300984009010882, + "language_loss": 0.86622429, + "learning_rate": 0.0006995856161080532, + "loss": 0.87804508, + "num_input_tokens_seen": 167834080, + "router_z_loss_mlp": 0.43652344, + "step": 2018, + "time_per_iteration": 2.8405675888061523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160301, + "balance_loss_mlp": 1.11588371, + "epoch": 0.3884186225471335, + "flos": 612540817920.0, + "grad_norm": 0.0764923139512956, + "language_loss": 0.8250891, + "learning_rate": 0.0006992999315928679, + "loss": 0.83669221, + "num_input_tokens_seen": 167912368, + "router_z_loss_mlp": 0.44433594, + "step": 2019, + "time_per_iteration": 2.7929439544677734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146323, + "balance_loss_mlp": 1.10407472, + "epoch": 0.3886110042323971, + "flos": 607038188544.0, + "grad_norm": 0.09156853050649941, + "language_loss": 0.86159158, + "learning_rate": 0.0006990141697025871, + "loss": 0.8730548, + "num_input_tokens_seen": 167991968, + "router_z_loss_mlp": 0.42236328, + "step": 2020, + "time_per_iteration": 2.7913589477539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137863, + "balance_loss_mlp": 1.12422562, + "epoch": 0.3888033859176606, + "flos": 1528067897856.0, + "grad_norm": 0.035838926183426385, + "language_loss": 0.76359642, + "learning_rate": 0.0006987283305481533, + "loss": 0.77497506, + "num_input_tokens_seen": 168212128, + "router_z_loss_mlp": 0.13671875, + "step": 2021, + "time_per_iteration": 4.727250576019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011348, + "balance_loss_mlp": 1.09398317, + "epoch": 0.3889957676029242, + "flos": 692449689600.0, + "grad_norm": 0.0717580829802053, + "language_loss": 0.82676983, + "learning_rate": 0.0006984424142405392, + "loss": 0.8381179, + "num_input_tokens_seen": 168287440, + "router_z_loss_mlp": 0.40771484, + "step": 2022, + "time_per_iteration": 2.810420513153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130527, + "balance_loss_mlp": 1.09006715, + "epoch": 0.3891881492881878, + "flos": 515187993600.0, + "grad_norm": 0.11474151925346394, + "language_loss": 0.8263585, + "learning_rate": 0.0006981564208907474, + "loss": 0.83766377, + "num_input_tokens_seen": 168354704, + "router_z_loss_mlp": 0.40454102, + "step": 2023, + "time_per_iteration": 2.604849100112915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139234, + "balance_loss_mlp": 1.09763026, + "epoch": 0.3893805309734513, + "flos": 629050904064.0, + "grad_norm": 0.05701984367640102, + "language_loss": 0.90312237, + "learning_rate": 0.0006978703506098102, + "loss": 0.91451472, + "num_input_tokens_seen": 168424272, + "router_z_loss_mlp": 0.41601562, + "step": 2024, + "time_per_iteration": 2.7345082759857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115758, + "balance_loss_mlp": 1.11683416, + "epoch": 0.3895729126587149, + "flos": 544155895296.0, + "grad_norm": 0.06830457595999238, + "language_loss": 0.87819719, + "learning_rate": 0.00069758420350879, + "loss": 0.88977301, + "num_input_tokens_seen": 168488912, + "router_z_loss_mlp": 0.40722656, + "step": 2025, + "time_per_iteration": 2.6252336502075195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160672, + "balance_loss_mlp": 1.11689889, + "epoch": 0.38976529434397844, + "flos": 618270672384.0, + "grad_norm": 0.07405760759256953, + "language_loss": 0.8637889, + "learning_rate": 0.000697297979698779, + "loss": 0.87539566, + "num_input_tokens_seen": 168563248, + "router_z_loss_mlp": 0.43774414, + "step": 2026, + "time_per_iteration": 2.709831476211548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151416, + "balance_loss_mlp": 1.11291099, + "epoch": 0.38995767602924203, + "flos": 834882577920.0, + "grad_norm": 0.06812366476721117, + "language_loss": 0.83983821, + "learning_rate": 0.0006970116792908992, + "loss": 0.85135239, + "num_input_tokens_seen": 168648272, + "router_z_loss_mlp": 0.38500977, + "step": 2027, + "time_per_iteration": 3.0651228427886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149777, + "balance_loss_mlp": 1.10976994, + "epoch": 0.39015005771450556, + "flos": 541603542528.0, + "grad_norm": 0.06881031116362346, + "language_loss": 0.82086015, + "learning_rate": 0.000696725302396302, + "loss": 0.832358, + "num_input_tokens_seen": 168721760, + "router_z_loss_mlp": 0.39990234, + "step": 2028, + "time_per_iteration": 2.6441640853881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134129, + "balance_loss_mlp": 1.09400284, + "epoch": 0.39034243939976915, + "flos": 1007509072896.0, + "grad_norm": 0.05768401763088921, + "language_loss": 0.86036873, + "learning_rate": 0.0006964388491261692, + "loss": 0.87171006, + "num_input_tokens_seen": 168803664, + "router_z_loss_mlp": 0.40136719, + "step": 2029, + "time_per_iteration": 3.3004355430603027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129182, + "balance_loss_mlp": 1.08941352, + "epoch": 0.3905348210850327, + "flos": 679025700864.0, + "grad_norm": 0.06928638271863855, + "language_loss": 0.87596297, + "learning_rate": 0.0006961523195917114, + "loss": 0.88725477, + "num_input_tokens_seen": 168879184, + "router_z_loss_mlp": 0.39770508, + "step": 2030, + "time_per_iteration": 2.8312549591064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112269, + "balance_loss_mlp": 1.08041883, + "epoch": 0.39072720277029627, + "flos": 548882500608.0, + "grad_norm": 0.06430070846126967, + "language_loss": 0.78209358, + "learning_rate": 0.0006958657139041696, + "loss": 0.79332048, + "num_input_tokens_seen": 168957808, + "router_z_loss_mlp": 0.4230957, + "step": 2031, + "time_per_iteration": 2.789843797683716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172125, + "balance_loss_mlp": 1.1593461, + "epoch": 0.39091958445555985, + "flos": 1547737860096.0, + "grad_norm": 0.04676690558545683, + "language_loss": 0.76712966, + "learning_rate": 0.0006955790321748136, + "loss": 0.77885091, + "num_input_tokens_seen": 169194416, + "router_z_loss_mlp": 0.12792969, + "step": 2032, + "time_per_iteration": 4.9584527015686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118419, + "balance_loss_mlp": 1.07781672, + "epoch": 0.3911119661408234, + "flos": 504002497536.0, + "grad_norm": 0.06222398192409584, + "language_loss": 0.78433788, + "learning_rate": 0.0006952922745149434, + "loss": 0.79552209, + "num_input_tokens_seen": 169263552, + "router_z_loss_mlp": 0.40600586, + "step": 2033, + "time_per_iteration": 2.6696994304656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125088, + "balance_loss_mlp": 1.08288765, + "epoch": 0.391304347826087, + "flos": 557238859776.0, + "grad_norm": 0.06080690179225973, + "language_loss": 0.88040847, + "learning_rate": 0.000695005441035888, + "loss": 0.89165938, + "num_input_tokens_seen": 169333696, + "router_z_loss_mlp": 0.421875, + "step": 2034, + "time_per_iteration": 2.675685167312622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126781, + "balance_loss_mlp": 1.11333418, + "epoch": 0.3914967295113505, + "flos": 1500034235904.0, + "grad_norm": 0.02489517999219278, + "language_loss": 0.73723435, + "learning_rate": 0.0006947185318490064, + "loss": 0.74850214, + "num_input_tokens_seen": 169556416, + "router_z_loss_mlp": 0.13476562, + "step": 2035, + "time_per_iteration": 4.8780670166015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114086, + "balance_loss_mlp": 1.10006714, + "epoch": 0.3916891111966141, + "flos": 707037341184.0, + "grad_norm": 0.09902005838056731, + "language_loss": 0.81387436, + "learning_rate": 0.0006944315470656863, + "loss": 0.82528299, + "num_input_tokens_seen": 169643312, + "router_z_loss_mlp": 0.40795898, + "step": 2036, + "time_per_iteration": 3.04048228263855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132051, + "balance_loss_mlp": 1.08858752, + "epoch": 0.3918814928818776, + "flos": 556349409792.0, + "grad_norm": 0.07431126960541347, + "language_loss": 0.91352618, + "learning_rate": 0.000694144486797345, + "loss": 0.92484671, + "num_input_tokens_seen": 169712560, + "router_z_loss_mlp": 0.43432617, + "step": 2037, + "time_per_iteration": 2.692013740539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110594, + "balance_loss_mlp": 1.09695601, + "epoch": 0.3920738745671412, + "flos": 1538610992640.0, + "grad_norm": 0.027663679576331687, + "language_loss": 0.79520434, + "learning_rate": 0.0006938573511554296, + "loss": 0.8063103, + "num_input_tokens_seen": 169914912, + "router_z_loss_mlp": 0.13671875, + "step": 2038, + "time_per_iteration": 4.626150369644165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128452, + "balance_loss_mlp": 1.08796859, + "epoch": 0.39226625625240474, + "flos": 498836123136.0, + "grad_norm": 0.06987974305662424, + "language_loss": 0.90060711, + "learning_rate": 0.0006935701402514156, + "loss": 0.91189158, + "num_input_tokens_seen": 169978848, + "router_z_loss_mlp": 0.40454102, + "step": 2039, + "time_per_iteration": 2.5738487243652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099838, + "balance_loss_mlp": 1.0864867, + "epoch": 0.39245863793766833, + "flos": 1347260138496.0, + "grad_norm": 0.03469500580229188, + "language_loss": 0.73034894, + "learning_rate": 0.0006932828541968083, + "loss": 0.74134731, + "num_input_tokens_seen": 170211488, + "router_z_loss_mlp": 0.13378906, + "step": 2040, + "time_per_iteration": 4.957871437072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140825, + "balance_loss_mlp": 1.10112846, + "epoch": 0.3926510196229319, + "flos": 1346113022976.0, + "grad_norm": 0.08036310752647091, + "language_loss": 0.84965599, + "learning_rate": 0.0006929954931031422, + "loss": 0.86106431, + "num_input_tokens_seen": 170298528, + "router_z_loss_mlp": 0.39672852, + "step": 2041, + "time_per_iteration": 4.232867956161499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129686, + "balance_loss_mlp": 1.09039509, + "epoch": 0.39284340130819545, + "flos": 499587181056.0, + "grad_norm": 0.05705738410966496, + "language_loss": 0.8864727, + "learning_rate": 0.0006927080570819805, + "loss": 0.89776957, + "num_input_tokens_seen": 170365680, + "router_z_loss_mlp": 0.39282227, + "step": 2042, + "time_per_iteration": 2.6111459732055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143323, + "balance_loss_mlp": 1.10252953, + "epoch": 0.39303578299345904, + "flos": 520329775104.0, + "grad_norm": 0.08862983476083965, + "language_loss": 0.81371272, + "learning_rate": 0.0006924205462449161, + "loss": 0.82514596, + "num_input_tokens_seen": 170432224, + "router_z_loss_mlp": 0.40795898, + "step": 2043, + "time_per_iteration": 2.6160669326782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128783, + "balance_loss_mlp": 1.08932424, + "epoch": 0.39322816467872257, + "flos": 907929865728.0, + "grad_norm": 0.06601435567751561, + "language_loss": 0.82073617, + "learning_rate": 0.0006921329607035702, + "loss": 0.83202398, + "num_input_tokens_seen": 170517920, + "router_z_loss_mlp": 0.39453125, + "step": 2044, + "time_per_iteration": 3.2338860034942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121467, + "balance_loss_mlp": 1.08441699, + "epoch": 0.39342054636398616, + "flos": 517592042496.0, + "grad_norm": 0.06846789620147704, + "language_loss": 0.88441163, + "learning_rate": 0.0006918453005695938, + "loss": 0.89562631, + "num_input_tokens_seen": 170589072, + "router_z_loss_mlp": 0.37011719, + "step": 2045, + "time_per_iteration": 2.6499555110931396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135104, + "balance_loss_mlp": 1.09426332, + "epoch": 0.3936129280492497, + "flos": 547918898688.0, + "grad_norm": 0.05142411082006327, + "language_loss": 0.84655213, + "learning_rate": 0.0006915575659546662, + "loss": 0.85790318, + "num_input_tokens_seen": 170657856, + "router_z_loss_mlp": 0.40869141, + "step": 2046, + "time_per_iteration": 2.652902364730835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133716, + "balance_loss_mlp": 1.09339929, + "epoch": 0.3938053097345133, + "flos": 526113957888.0, + "grad_norm": 0.08744808643608758, + "language_loss": 0.80837369, + "learning_rate": 0.0006912697569704959, + "loss": 0.81971085, + "num_input_tokens_seen": 170723696, + "router_z_loss_mlp": 0.40307617, + "step": 2047, + "time_per_iteration": 2.6129064559936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131331, + "balance_loss_mlp": 1.09158659, + "epoch": 0.39399769141977686, + "flos": 471629869056.0, + "grad_norm": 0.07468037026935817, + "language_loss": 0.86945641, + "learning_rate": 0.0006909818737288205, + "loss": 0.88076973, + "num_input_tokens_seen": 170789536, + "router_z_loss_mlp": 0.3972168, + "step": 2048, + "time_per_iteration": 2.5576181411743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146235, + "balance_loss_mlp": 1.10632348, + "epoch": 0.3941900731050404, + "flos": 501736840704.0, + "grad_norm": 0.07110132916922086, + "language_loss": 0.81226838, + "learning_rate": 0.000690693916341406, + "loss": 0.82373071, + "num_input_tokens_seen": 170859232, + "router_z_loss_mlp": 0.39916992, + "step": 2049, + "time_per_iteration": 2.5884814262390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154156, + "balance_loss_mlp": 1.11398268, + "epoch": 0.394382454790304, + "flos": 580862347776.0, + "grad_norm": 0.05472880535545416, + "language_loss": 0.82429487, + "learning_rate": 0.0006904058849200475, + "loss": 0.83583641, + "num_input_tokens_seen": 170931568, + "router_z_loss_mlp": 0.40185547, + "step": 2050, + "time_per_iteration": 2.7662599086761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144327, + "balance_loss_mlp": 1.10565519, + "epoch": 0.3945748364755675, + "flos": 513819127296.0, + "grad_norm": 0.06127353443593348, + "language_loss": 0.85204089, + "learning_rate": 0.0006901177795765683, + "loss": 0.86348414, + "num_input_tokens_seen": 170999856, + "router_z_loss_mlp": 0.38647461, + "step": 2051, + "time_per_iteration": 2.577353000640869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011475, + "balance_loss_mlp": 1.10768366, + "epoch": 0.3947672181608311, + "flos": 593957795328.0, + "grad_norm": 0.10882102145067868, + "language_loss": 0.81508064, + "learning_rate": 0.0006898296004228213, + "loss": 0.82655561, + "num_input_tokens_seen": 171072320, + "router_z_loss_mlp": 0.39819336, + "step": 2052, + "time_per_iteration": 2.7242588996887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118361, + "balance_loss_mlp": 1.10605848, + "epoch": 0.39495959984609463, + "flos": 1547671048704.0, + "grad_norm": 0.03880030883121314, + "language_loss": 0.7812674, + "learning_rate": 0.0006895413475706873, + "loss": 0.79245102, + "num_input_tokens_seen": 171304128, + "router_z_loss_mlp": 0.12304688, + "step": 2053, + "time_per_iteration": 4.852335691452026 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160033, + "balance_loss_mlp": 1.1204555, + "epoch": 0.3951519815313582, + "flos": 496520907264.0, + "grad_norm": 0.06533456514809383, + "language_loss": 0.79943091, + "learning_rate": 0.0006892530211320763, + "loss": 0.81103128, + "num_input_tokens_seen": 171377392, + "router_z_loss_mlp": 0.39575195, + "step": 2054, + "time_per_iteration": 2.726592779159546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163981, + "balance_loss_mlp": 1.12528563, + "epoch": 0.39534436321662175, + "flos": 531191499264.0, + "grad_norm": 0.06955061494726521, + "language_loss": 0.8399905, + "learning_rate": 0.000688964621218926, + "loss": 0.85163033, + "num_input_tokens_seen": 171447424, + "router_z_loss_mlp": 0.38696289, + "step": 2055, + "time_per_iteration": 2.6089248657226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156737, + "balance_loss_mlp": 1.11737382, + "epoch": 0.39553674490188534, + "flos": 702523279872.0, + "grad_norm": 0.06754212988294535, + "language_loss": 0.80637926, + "learning_rate": 0.0006886761479432037, + "loss": 0.81794661, + "num_input_tokens_seen": 171519920, + "router_z_loss_mlp": 0.39379883, + "step": 2056, + "time_per_iteration": 2.8334691524505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169364, + "balance_loss_mlp": 1.12866604, + "epoch": 0.3957291265871489, + "flos": 409772846592.0, + "grad_norm": 0.08783588969410645, + "language_loss": 0.85058302, + "learning_rate": 0.0006883876014169045, + "loss": 0.86227667, + "num_input_tokens_seen": 171583856, + "router_z_loss_mlp": 0.40698242, + "step": 2057, + "time_per_iteration": 2.4859981536865234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163618, + "balance_loss_mlp": 1.12344468, + "epoch": 0.39592150827241246, + "flos": 618490556928.0, + "grad_norm": 0.07066278036752763, + "language_loss": 0.90527105, + "learning_rate": 0.000688098981752052, + "loss": 0.91690719, + "num_input_tokens_seen": 171656064, + "router_z_loss_mlp": 0.40161133, + "step": 2058, + "time_per_iteration": 2.737825393676758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169191, + "balance_loss_mlp": 1.12849319, + "epoch": 0.39611388995767605, + "flos": 821332680192.0, + "grad_norm": 0.08574741875980238, + "language_loss": 0.80283022, + "learning_rate": 0.0006878102890606982, + "loss": 0.81452215, + "num_input_tokens_seen": 171738800, + "router_z_loss_mlp": 0.40722656, + "step": 2059, + "time_per_iteration": 3.0589451789855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159966, + "balance_loss_mlp": 1.12034082, + "epoch": 0.3963062716429396, + "flos": 492224159232.0, + "grad_norm": 0.07158976818793618, + "language_loss": 0.81510139, + "learning_rate": 0.0006875215234549239, + "loss": 0.8267011, + "num_input_tokens_seen": 171803664, + "router_z_loss_mlp": 0.39648438, + "step": 2060, + "time_per_iteration": 2.5404529571533203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151611, + "balance_loss_mlp": 1.11150885, + "epoch": 0.39649865332820317, + "flos": 584739150336.0, + "grad_norm": 0.11168111879418678, + "language_loss": 0.86092877, + "learning_rate": 0.0006872326850468376, + "loss": 0.87244487, + "num_input_tokens_seen": 171871968, + "router_z_loss_mlp": 0.40087891, + "step": 2061, + "time_per_iteration": 2.6653215885162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153197, + "balance_loss_mlp": 1.11133087, + "epoch": 0.3966910350134667, + "flos": 458564156928.0, + "grad_norm": 0.0731410886524803, + "language_loss": 0.79433036, + "learning_rate": 0.0006869437739485762, + "loss": 0.80586231, + "num_input_tokens_seen": 171942368, + "router_z_loss_mlp": 0.41870117, + "step": 2062, + "time_per_iteration": 2.6032299995422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147299, + "balance_loss_mlp": 1.1086272, + "epoch": 0.3968834166987303, + "flos": 508632929280.0, + "grad_norm": 0.06685158443863869, + "language_loss": 0.9296748, + "learning_rate": 0.0006866547902723053, + "loss": 0.9411478, + "num_input_tokens_seen": 172012336, + "router_z_loss_mlp": 0.38647461, + "step": 2063, + "time_per_iteration": 2.676166534423828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150184, + "balance_loss_mlp": 1.11148858, + "epoch": 0.3970757983839938, + "flos": 572627128320.0, + "grad_norm": 0.10136223850880095, + "language_loss": 0.80330342, + "learning_rate": 0.000686365734130218, + "loss": 0.81480527, + "num_input_tokens_seen": 172084640, + "router_z_loss_mlp": 0.38696289, + "step": 2064, + "time_per_iteration": 2.6844232082366943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143564, + "balance_loss_mlp": 1.10420108, + "epoch": 0.3972681800692574, + "flos": 481629307392.0, + "grad_norm": 0.06083764513088428, + "language_loss": 0.84282482, + "learning_rate": 0.000686076605634536, + "loss": 0.85426044, + "num_input_tokens_seen": 172152992, + "router_z_loss_mlp": 0.39379883, + "step": 2065, + "time_per_iteration": 2.6315250396728516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156007, + "balance_loss_mlp": 1.11704922, + "epoch": 0.397460561754521, + "flos": 487927411200.0, + "grad_norm": 0.07154960647229537, + "language_loss": 0.84777498, + "learning_rate": 0.0006857874048975088, + "loss": 0.85933506, + "num_input_tokens_seen": 172219312, + "router_z_loss_mlp": 0.38964844, + "step": 2066, + "time_per_iteration": 2.651740074157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144331, + "balance_loss_mlp": 1.10298944, + "epoch": 0.3976529434397845, + "flos": 421993525248.0, + "grad_norm": 0.06215318135177391, + "language_loss": 0.87357152, + "learning_rate": 0.0006854981320314142, + "loss": 0.88501477, + "num_input_tokens_seen": 172282112, + "router_z_loss_mlp": 0.41381836, + "step": 2067, + "time_per_iteration": 2.5062263011932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150736, + "balance_loss_mlp": 1.11089611, + "epoch": 0.3978453251250481, + "flos": 545589001728.0, + "grad_norm": 0.07144157906743025, + "language_loss": 0.87282014, + "learning_rate": 0.0006852087871485579, + "loss": 0.88432747, + "num_input_tokens_seen": 172347872, + "router_z_loss_mlp": 0.3984375, + "step": 2068, + "time_per_iteration": 2.6593010425567627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141379, + "balance_loss_mlp": 1.10206354, + "epoch": 0.39803770681031164, + "flos": 650838592512.0, + "grad_norm": 0.08492249089395289, + "language_loss": 0.82224536, + "learning_rate": 0.0006849193703612735, + "loss": 0.83365911, + "num_input_tokens_seen": 172418560, + "router_z_loss_mlp": 0.39282227, + "step": 2069, + "time_per_iteration": 2.755782127380371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137654, + "balance_loss_mlp": 1.09817159, + "epoch": 0.39823008849557523, + "flos": 740072194560.0, + "grad_norm": 0.07327967142242812, + "language_loss": 0.78054988, + "learning_rate": 0.0006846298817819225, + "loss": 0.79192644, + "num_input_tokens_seen": 172497984, + "router_z_loss_mlp": 0.39477539, + "step": 2070, + "time_per_iteration": 2.987943410873413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148271, + "balance_loss_mlp": 1.10909855, + "epoch": 0.39842247018083876, + "flos": 385037452800.0, + "grad_norm": 0.08050617332568782, + "language_loss": 0.81162381, + "learning_rate": 0.0006843403215228945, + "loss": 0.82310653, + "num_input_tokens_seen": 172560112, + "router_z_loss_mlp": 0.3918457, + "step": 2071, + "time_per_iteration": 2.4827940464019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165055, + "balance_loss_mlp": 1.12585878, + "epoch": 0.39861485186610235, + "flos": 533696864256.0, + "grad_norm": 0.07083437878036915, + "language_loss": 0.80721962, + "learning_rate": 0.0006840506896966065, + "loss": 0.81887019, + "num_input_tokens_seen": 172636192, + "router_z_loss_mlp": 0.3918457, + "step": 2072, + "time_per_iteration": 2.6827309131622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166963, + "balance_loss_mlp": 1.12621748, + "epoch": 0.39880723355136594, + "flos": 643149227520.0, + "grad_norm": 0.06725102297232902, + "language_loss": 0.8278873, + "learning_rate": 0.0006837609864155038, + "loss": 0.83955693, + "num_input_tokens_seen": 172715264, + "router_z_loss_mlp": 0.40771484, + "step": 2073, + "time_per_iteration": 2.9130313396453857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116361, + "balance_loss_mlp": 1.12584436, + "epoch": 0.39899961523662947, + "flos": 515847647232.0, + "grad_norm": 0.07471059517929624, + "language_loss": 0.8375988, + "learning_rate": 0.0006834712117920592, + "loss": 0.84923482, + "num_input_tokens_seen": 172783456, + "router_z_loss_mlp": 0.37768555, + "step": 2074, + "time_per_iteration": 2.61501145362854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162616, + "balance_loss_mlp": 1.12325335, + "epoch": 0.39919199692189306, + "flos": 464385415680.0, + "grad_norm": 0.13245970923224126, + "language_loss": 0.85901093, + "learning_rate": 0.0006831813659387729, + "loss": 0.87063706, + "num_input_tokens_seen": 172848928, + "router_z_loss_mlp": 0.39331055, + "step": 2075, + "time_per_iteration": 2.563549041748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149354, + "balance_loss_mlp": 1.11075377, + "epoch": 0.3993843786071566, + "flos": 531641180160.0, + "grad_norm": 0.06732512968880089, + "language_loss": 0.84738618, + "learning_rate": 0.0006828914489681733, + "loss": 0.85887969, + "num_input_tokens_seen": 172921152, + "router_z_loss_mlp": 0.38574219, + "step": 2076, + "time_per_iteration": 2.7011008262634277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142979, + "balance_loss_mlp": 1.10440326, + "epoch": 0.3995767602924202, + "flos": 503965421568.0, + "grad_norm": 0.050728888200014394, + "language_loss": 0.85780215, + "learning_rate": 0.0006826014609928162, + "loss": 0.86923194, + "num_input_tokens_seen": 172998864, + "router_z_loss_mlp": 0.38598633, + "step": 2077, + "time_per_iteration": 2.699880838394165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026884, + "balance_loss_mlp": 1.01472485, + "epoch": 0.3997691419776837, + "flos": 1454516600832.0, + "grad_norm": 0.012471286598558728, + "language_loss": 0.83199388, + "learning_rate": 0.0006823114021252846, + "loss": 0.84226274, + "num_input_tokens_seen": 173219216, + "router_z_loss_mlp": 0.12158203, + "step": 2078, + "time_per_iteration": 4.819272518157959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112436, + "balance_loss_mlp": 1.08549809, + "epoch": 0.3999615236629473, + "flos": 530684918784.0, + "grad_norm": 0.08765386089658693, + "language_loss": 0.80571902, + "learning_rate": 0.0006820212724781896, + "loss": 0.81696254, + "num_input_tokens_seen": 173292000, + "router_z_loss_mlp": 0.38842773, + "step": 2079, + "time_per_iteration": 2.6927945613861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112693, + "balance_loss_mlp": 1.07526088, + "epoch": 0.4001539053482108, + "flos": 695130522624.0, + "grad_norm": 0.06830833334646268, + "language_loss": 0.84229112, + "learning_rate": 0.0006817310721641694, + "loss": 0.85341799, + "num_input_tokens_seen": 173365568, + "router_z_loss_mlp": 0.37402344, + "step": 2080, + "time_per_iteration": 2.8158507347106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114446, + "balance_loss_mlp": 1.07422495, + "epoch": 0.4003462870334744, + "flos": 520356939264.0, + "grad_norm": 0.0821477508940244, + "language_loss": 0.84532309, + "learning_rate": 0.00068144080129589, + "loss": 0.85646749, + "num_input_tokens_seen": 173430144, + "router_z_loss_mlp": 0.40234375, + "step": 2081, + "time_per_iteration": 2.665823221206665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111145, + "balance_loss_mlp": 1.07206321, + "epoch": 0.400538668718738, + "flos": 492518195712.0, + "grad_norm": 0.06681211266265834, + "language_loss": 0.83178174, + "learning_rate": 0.0006811504599860441, + "loss": 0.84289622, + "num_input_tokens_seen": 173494464, + "router_z_loss_mlp": 0.39379883, + "step": 2082, + "time_per_iteration": 2.517651081085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112401, + "balance_loss_mlp": 1.07382464, + "epoch": 0.40073105040400153, + "flos": 490356052992.0, + "grad_norm": 0.04646658923847655, + "language_loss": 0.86172366, + "learning_rate": 0.0006808600483473526, + "loss": 0.87284768, + "num_input_tokens_seen": 173577168, + "router_z_loss_mlp": 0.38549805, + "step": 2083, + "time_per_iteration": 2.85060715675354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106871, + "balance_loss_mlp": 1.06743646, + "epoch": 0.4009234320892651, + "flos": 562378070016.0, + "grad_norm": 0.05030907040360332, + "language_loss": 0.86459124, + "learning_rate": 0.0006805695664925629, + "loss": 0.87565994, + "num_input_tokens_seen": 173655632, + "router_z_loss_mlp": 0.39379883, + "step": 2084, + "time_per_iteration": 2.775911808013916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117346, + "balance_loss_mlp": 1.07810271, + "epoch": 0.40111581377452865, + "flos": 425998808064.0, + "grad_norm": 0.06453737345570608, + "language_loss": 0.84040797, + "learning_rate": 0.0006802790145344506, + "loss": 0.85158145, + "num_input_tokens_seen": 173719040, + "router_z_loss_mlp": 0.39233398, + "step": 2085, + "time_per_iteration": 2.4470229148864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112227, + "balance_loss_mlp": 1.08459997, + "epoch": 0.40130819545979224, + "flos": 612446842368.0, + "grad_norm": 0.07025741726477988, + "language_loss": 0.87659204, + "learning_rate": 0.0006799883925858176, + "loss": 0.8878147, + "num_input_tokens_seen": 173796704, + "router_z_loss_mlp": 0.37646484, + "step": 2086, + "time_per_iteration": 2.861490249633789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136148, + "balance_loss_mlp": 1.09709549, + "epoch": 0.40150057714505577, + "flos": 523433124864.0, + "grad_norm": 0.06341077230687828, + "language_loss": 0.85575259, + "learning_rate": 0.0006796977007594933, + "loss": 0.86711407, + "num_input_tokens_seen": 173862352, + "router_z_loss_mlp": 0.39038086, + "step": 2087, + "time_per_iteration": 2.619633197784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150049, + "balance_loss_mlp": 1.10920811, + "epoch": 0.40169295883031936, + "flos": 561424379904.0, + "grad_norm": 0.0625455511079972, + "language_loss": 0.86956239, + "learning_rate": 0.0006794069391683345, + "loss": 0.88106287, + "num_input_tokens_seen": 173935408, + "router_z_loss_mlp": 0.40844727, + "step": 2088, + "time_per_iteration": 4.210111618041992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145795, + "balance_loss_mlp": 1.10683715, + "epoch": 0.4018853405155829, + "flos": 518997984768.0, + "grad_norm": 0.0705312667092641, + "language_loss": 0.81334388, + "learning_rate": 0.0006791161079252248, + "loss": 0.8248018, + "num_input_tokens_seen": 174007152, + "router_z_loss_mlp": 0.38916016, + "step": 2089, + "time_per_iteration": 2.614766836166382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141064, + "balance_loss_mlp": 1.10286903, + "epoch": 0.4020777222008465, + "flos": 526222614528.0, + "grad_norm": 0.084499094041807, + "language_loss": 0.82758236, + "learning_rate": 0.0006788252071430747, + "loss": 0.83899295, + "num_input_tokens_seen": 174074976, + "router_z_loss_mlp": 0.38183594, + "step": 2090, + "time_per_iteration": 2.617656707763672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135863, + "balance_loss_mlp": 1.09490228, + "epoch": 0.40227010388611006, + "flos": 525763021824.0, + "grad_norm": 0.0700927477934208, + "language_loss": 0.8703053, + "learning_rate": 0.0006785342369348222, + "loss": 0.88166392, + "num_input_tokens_seen": 174149392, + "router_z_loss_mlp": 0.40942383, + "step": 2091, + "time_per_iteration": 2.7607271671295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122396, + "balance_loss_mlp": 1.08513117, + "epoch": 0.4024624855713736, + "flos": 432304252416.0, + "grad_norm": 0.09140990562062702, + "language_loss": 0.8009733, + "learning_rate": 0.0006782431974134316, + "loss": 0.81219733, + "num_input_tokens_seen": 174214656, + "router_z_loss_mlp": 0.37280273, + "step": 2092, + "time_per_iteration": 2.5610032081604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118064, + "balance_loss_mlp": 1.07889199, + "epoch": 0.4026548672566372, + "flos": 766660640256.0, + "grad_norm": 0.054626907115785994, + "language_loss": 0.89608824, + "learning_rate": 0.0006779520886918949, + "loss": 0.90726894, + "num_input_tokens_seen": 174296064, + "router_z_loss_mlp": 0.3918457, + "step": 2093, + "time_per_iteration": 3.064581871032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110103, + "balance_loss_mlp": 1.07279015, + "epoch": 0.4028472489419007, + "flos": 642931914240.0, + "grad_norm": 0.057101365791561574, + "language_loss": 0.81741238, + "learning_rate": 0.0006776609108832301, + "loss": 0.82851338, + "num_input_tokens_seen": 174370896, + "router_z_loss_mlp": 0.37304688, + "step": 2094, + "time_per_iteration": 2.77875018119812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100298, + "balance_loss_mlp": 1.06403446, + "epoch": 0.4030396306271643, + "flos": 491838718464.0, + "grad_norm": 0.06401566733015203, + "language_loss": 0.85612595, + "learning_rate": 0.0006773696641004828, + "loss": 0.86712897, + "num_input_tokens_seen": 174438448, + "router_z_loss_mlp": 0.36254883, + "step": 2095, + "time_per_iteration": 2.5543506145477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101796, + "balance_loss_mlp": 1.06522298, + "epoch": 0.40323201231242783, + "flos": 901728308736.0, + "grad_norm": 0.06439261414673134, + "language_loss": 0.77821416, + "learning_rate": 0.0006770783484567247, + "loss": 0.78923213, + "num_input_tokens_seen": 174525952, + "router_z_loss_mlp": 0.36572266, + "step": 2096, + "time_per_iteration": 3.14194393157959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114941, + "balance_loss_mlp": 1.07862973, + "epoch": 0.4034243939976914, + "flos": 570558961152.0, + "grad_norm": 0.051673087984505275, + "language_loss": 0.86408114, + "learning_rate": 0.000676786964065055, + "loss": 0.87523055, + "num_input_tokens_seen": 174607200, + "router_z_loss_mlp": 0.36328125, + "step": 2097, + "time_per_iteration": 2.796668529510498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109226, + "balance_loss_mlp": 1.07270014, + "epoch": 0.403616775682955, + "flos": 507456783360.0, + "grad_norm": 0.07558073774647381, + "language_loss": 0.79608446, + "learning_rate": 0.0006764955110385986, + "loss": 0.80717671, + "num_input_tokens_seen": 174680976, + "router_z_loss_mlp": 0.36547852, + "step": 2098, + "time_per_iteration": 2.721588134765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117865, + "balance_loss_mlp": 1.07998002, + "epoch": 0.40380915736821854, + "flos": 519383425536.0, + "grad_norm": 0.06754969850087679, + "language_loss": 0.80409288, + "learning_rate": 0.0006762039894905083, + "loss": 0.8152715, + "num_input_tokens_seen": 174753152, + "router_z_loss_mlp": 0.37890625, + "step": 2099, + "time_per_iteration": 2.6286327838897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126862, + "balance_loss_mlp": 1.08728456, + "epoch": 0.40400153905348213, + "flos": 441925590528.0, + "grad_norm": 0.06639046911061866, + "language_loss": 0.80760598, + "learning_rate": 0.000675912399533962, + "loss": 0.8188746, + "num_input_tokens_seen": 174817184, + "router_z_loss_mlp": 0.39599609, + "step": 2100, + "time_per_iteration": 2.5150249004364014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110396, + "balance_loss_mlp": 1.07420361, + "epoch": 0.40419392073874566, + "flos": 772309002240.0, + "grad_norm": 0.05652757132031041, + "language_loss": 0.85431337, + "learning_rate": 0.0006756207412821656, + "loss": 0.86541736, + "num_input_tokens_seen": 174898128, + "router_z_loss_mlp": 0.36206055, + "step": 2101, + "time_per_iteration": 2.9816384315490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107271, + "balance_loss_mlp": 1.06962454, + "epoch": 0.40438630242400925, + "flos": 766569235968.0, + "grad_norm": 0.08079981189537652, + "language_loss": 0.81269771, + "learning_rate": 0.0006753290148483505, + "loss": 0.8237704, + "num_input_tokens_seen": 174981872, + "router_z_loss_mlp": 0.37670898, + "step": 2102, + "time_per_iteration": 3.0291824340820312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111488, + "balance_loss_mlp": 1.07458103, + "epoch": 0.4045786841092728, + "flos": 415235828736.0, + "grad_norm": 0.07115498960503684, + "language_loss": 0.79040611, + "learning_rate": 0.0006750372203457752, + "loss": 0.80152106, + "num_input_tokens_seen": 175044976, + "router_z_loss_mlp": 0.36914062, + "step": 2103, + "time_per_iteration": 2.5193490982055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111376, + "balance_loss_mlp": 1.07458746, + "epoch": 0.40477106579453637, + "flos": 539214174720.0, + "grad_norm": 0.049732783973711246, + "language_loss": 0.87039417, + "learning_rate": 0.0006747453578877242, + "loss": 0.88150793, + "num_input_tokens_seen": 175121104, + "router_z_loss_mlp": 0.36767578, + "step": 2104, + "time_per_iteration": 2.691030979156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116651, + "balance_loss_mlp": 1.07998228, + "epoch": 0.4049634474797999, + "flos": 826704258048.0, + "grad_norm": 0.06592833756650988, + "language_loss": 0.83420014, + "learning_rate": 0.0006744534275875085, + "loss": 0.8453666, + "num_input_tokens_seen": 175194512, + "router_z_loss_mlp": 0.36669922, + "step": 2105, + "time_per_iteration": 2.9842946529388428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118356, + "balance_loss_mlp": 1.08099532, + "epoch": 0.4051558291650635, + "flos": 572684027904.0, + "grad_norm": 0.07270624559080442, + "language_loss": 0.85434729, + "learning_rate": 0.0006741614295584657, + "loss": 0.86553085, + "num_input_tokens_seen": 175264176, + "router_z_loss_mlp": 0.3737793, + "step": 2106, + "time_per_iteration": 2.63811993598938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117316, + "balance_loss_mlp": 1.08057594, + "epoch": 0.4053482108503271, + "flos": 731881391616.0, + "grad_norm": 0.05922552771988275, + "language_loss": 0.78890157, + "learning_rate": 0.0006738693639139595, + "loss": 0.80007476, + "num_input_tokens_seen": 175347488, + "router_z_loss_mlp": 0.36743164, + "step": 2107, + "time_per_iteration": 2.9618351459503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116371, + "balance_loss_mlp": 1.07746077, + "epoch": 0.4055405925355906, + "flos": 1213059193344.0, + "grad_norm": 0.06915522511623486, + "language_loss": 0.77808583, + "learning_rate": 0.0006735772307673796, + "loss": 0.78924954, + "num_input_tokens_seen": 175438336, + "router_z_loss_mlp": 0.38916016, + "step": 2108, + "time_per_iteration": 3.575981855392456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111094, + "balance_loss_mlp": 1.07380557, + "epoch": 0.4057329742208542, + "flos": 715863204864.0, + "grad_norm": 0.06309901973905298, + "language_loss": 0.83742046, + "learning_rate": 0.0006732850302321421, + "loss": 0.84853137, + "num_input_tokens_seen": 175510912, + "router_z_loss_mlp": 0.37280273, + "step": 2109, + "time_per_iteration": 3.045565605163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114415, + "balance_loss_mlp": 1.0778178, + "epoch": 0.4059253559061177, + "flos": 564888577536.0, + "grad_norm": 0.060704196703692835, + "language_loss": 0.84782875, + "learning_rate": 0.00067299276242169, + "loss": 0.85897285, + "num_input_tokens_seen": 175583040, + "router_z_loss_mlp": 0.3659668, + "step": 2110, + "time_per_iteration": 2.6868693828582764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047442, + "balance_loss_mlp": 1.03666544, + "epoch": 0.4061177375913813, + "flos": 1593744450048.0, + "grad_norm": 0.029253972882140933, + "language_loss": 0.74382168, + "learning_rate": 0.0006727004274494908, + "loss": 0.75429612, + "num_input_tokens_seen": 175817952, + "router_z_loss_mlp": 0.10791016, + "step": 2111, + "time_per_iteration": 4.918604612350464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110434, + "balance_loss_mlp": 1.07281184, + "epoch": 0.40631011927664484, + "flos": 615421711872.0, + "grad_norm": 0.06207465310904933, + "language_loss": 0.78018594, + "learning_rate": 0.0006724080254290395, + "loss": 0.79129028, + "num_input_tokens_seen": 175896352, + "router_z_loss_mlp": 0.37597656, + "step": 2112, + "time_per_iteration": 2.798377752304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116483, + "balance_loss_mlp": 1.08012438, + "epoch": 0.40650250096190843, + "flos": 557661376512.0, + "grad_norm": 0.07195778929743761, + "language_loss": 0.89838338, + "learning_rate": 0.0006721155564738566, + "loss": 0.90954828, + "num_input_tokens_seen": 175967152, + "router_z_loss_mlp": 0.36401367, + "step": 2113, + "time_per_iteration": 2.721280813217163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034826, + "balance_loss_mlp": 1.02395451, + "epoch": 0.40669488264717196, + "flos": 1580147564544.0, + "grad_norm": 0.019551827625956694, + "language_loss": 0.78622639, + "learning_rate": 0.0006718230206974884, + "loss": 0.79657471, + "num_input_tokens_seen": 176205248, + "router_z_loss_mlp": 0.10888672, + "step": 2114, + "time_per_iteration": 4.956322193145752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110507, + "balance_loss_mlp": 1.07269359, + "epoch": 0.40688726433243555, + "flos": 507649503744.0, + "grad_norm": 0.052092004512661015, + "language_loss": 0.85970294, + "learning_rate": 0.0006715304182135078, + "loss": 0.87080801, + "num_input_tokens_seen": 176276208, + "router_z_loss_mlp": 0.37792969, + "step": 2115, + "time_per_iteration": 2.611116647720337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114836, + "balance_loss_mlp": 1.07611692, + "epoch": 0.40707964601769914, + "flos": 589075172352.0, + "grad_norm": 0.051206353593090614, + "language_loss": 0.89130676, + "learning_rate": 0.0006712377491355127, + "loss": 0.90245515, + "num_input_tokens_seen": 176355072, + "router_z_loss_mlp": 0.38696289, + "step": 2116, + "time_per_iteration": 2.8788397312164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120118, + "balance_loss_mlp": 1.0829246, + "epoch": 0.40727202770296267, + "flos": 580437259776.0, + "grad_norm": 0.049235441975469474, + "language_loss": 0.81475073, + "learning_rate": 0.0006709450135771274, + "loss": 0.82595193, + "num_input_tokens_seen": 176444592, + "router_z_loss_mlp": 0.37182617, + "step": 2117, + "time_per_iteration": 2.944436550140381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118262, + "balance_loss_mlp": 1.08233273, + "epoch": 0.40746440938822626, + "flos": 504076649472.0, + "grad_norm": 0.05682697017745506, + "language_loss": 0.86693907, + "learning_rate": 0.0006706522116520023, + "loss": 0.87812167, + "num_input_tokens_seen": 176516144, + "router_z_loss_mlp": 0.35913086, + "step": 2118, + "time_per_iteration": 2.6161422729492188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125881, + "balance_loss_mlp": 1.08766294, + "epoch": 0.4076567910734898, + "flos": 605600312832.0, + "grad_norm": 0.060179733914174166, + "language_loss": 0.83147317, + "learning_rate": 0.0006703593434738127, + "loss": 0.84273201, + "num_input_tokens_seen": 176585712, + "router_z_loss_mlp": 0.38208008, + "step": 2119, + "time_per_iteration": 2.719313383102417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123164, + "balance_loss_mlp": 1.0857563, + "epoch": 0.4078491727587534, + "flos": 479553799680.0, + "grad_norm": 0.06825324786035328, + "language_loss": 0.78421569, + "learning_rate": 0.0006700664091562604, + "loss": 0.79544735, + "num_input_tokens_seen": 176654736, + "router_z_loss_mlp": 0.37402344, + "step": 2120, + "time_per_iteration": 2.569246530532837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125736, + "balance_loss_mlp": 1.09090257, + "epoch": 0.4080415544440169, + "flos": 510384665088.0, + "grad_norm": 0.051920603902655335, + "language_loss": 0.85211694, + "learning_rate": 0.0006697734088130725, + "loss": 0.86337435, + "num_input_tokens_seen": 176722800, + "router_z_loss_mlp": 0.34863281, + "step": 2121, + "time_per_iteration": 2.67394757270813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124636, + "balance_loss_mlp": 1.08732319, + "epoch": 0.4082339361292805, + "flos": 734638947840.0, + "grad_norm": 0.05791753235244458, + "language_loss": 0.85750419, + "learning_rate": 0.0006694803425580018, + "loss": 0.86875051, + "num_input_tokens_seen": 176800320, + "router_z_loss_mlp": 0.37304688, + "step": 2122, + "time_per_iteration": 2.9812121391296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129477, + "balance_loss_mlp": 1.09178257, + "epoch": 0.4084263178145441, + "flos": 457472074752.0, + "grad_norm": 0.06590998571054847, + "language_loss": 0.84986377, + "learning_rate": 0.0006691872105048268, + "loss": 0.86115849, + "num_input_tokens_seen": 176867440, + "router_z_loss_mlp": 0.37646484, + "step": 2123, + "time_per_iteration": 2.56272292137146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137814, + "balance_loss_mlp": 1.10157394, + "epoch": 0.4086186994998076, + "flos": 562931638272.0, + "grad_norm": 0.05742584890727743, + "language_loss": 0.84864831, + "learning_rate": 0.0006688940127673513, + "loss": 0.86002642, + "num_input_tokens_seen": 176942048, + "router_z_loss_mlp": 0.36254883, + "step": 2124, + "time_per_iteration": 2.6935954093933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113197, + "balance_loss_mlp": 1.09642184, + "epoch": 0.4088110811850712, + "flos": 573669651456.0, + "grad_norm": 0.05672589959491125, + "language_loss": 0.85339016, + "learning_rate": 0.0006686007494594049, + "loss": 0.86470985, + "num_input_tokens_seen": 177025104, + "router_z_loss_mlp": 0.35571289, + "step": 2125, + "time_per_iteration": 2.8291172981262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128385, + "balance_loss_mlp": 1.09097719, + "epoch": 0.40900346287033473, + "flos": 456930989568.0, + "grad_norm": 0.06786502616833631, + "language_loss": 0.81025755, + "learning_rate": 0.0006683074206948425, + "loss": 0.82154143, + "num_input_tokens_seen": 177089296, + "router_z_loss_mlp": 0.37402344, + "step": 2126, + "time_per_iteration": 2.5193305015563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126549, + "balance_loss_mlp": 1.09095287, + "epoch": 0.4091958445555983, + "flos": 617395903488.0, + "grad_norm": 0.06065849070073351, + "language_loss": 0.81971312, + "learning_rate": 0.0006680140265875443, + "loss": 0.83097857, + "num_input_tokens_seen": 177163648, + "router_z_loss_mlp": 0.35595703, + "step": 2127, + "time_per_iteration": 2.8254714012145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127015, + "balance_loss_mlp": 1.09184861, + "epoch": 0.40938822624086185, + "flos": 472400750592.0, + "grad_norm": 0.054477830354085016, + "language_loss": 0.95947516, + "learning_rate": 0.0006677205672514162, + "loss": 0.97074527, + "num_input_tokens_seen": 177233856, + "router_z_loss_mlp": 0.35205078, + "step": 2128, + "time_per_iteration": 2.6226608753204346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120165, + "balance_loss_mlp": 1.0867151, + "epoch": 0.40958060792612544, + "flos": 570010535424.0, + "grad_norm": 0.047090391860463696, + "language_loss": 0.88730562, + "learning_rate": 0.000667427042800389, + "loss": 0.8985073, + "num_input_tokens_seen": 177309824, + "router_z_loss_mlp": 0.3347168, + "step": 2129, + "time_per_iteration": 2.7718160152435303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118491, + "balance_loss_mlp": 1.0833478, + "epoch": 0.40977298961138897, + "flos": 609346063872.0, + "grad_norm": 0.05934025192817406, + "language_loss": 0.83200449, + "learning_rate": 0.0006671334533484192, + "loss": 0.84318936, + "num_input_tokens_seen": 177380592, + "router_z_loss_mlp": 0.3515625, + "step": 2130, + "time_per_iteration": 2.7164061069488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126113, + "balance_loss_mlp": 1.09199548, + "epoch": 0.40996537129665256, + "flos": 581744457216.0, + "grad_norm": 0.04849724567471186, + "language_loss": 0.83465552, + "learning_rate": 0.0006668397990094881, + "loss": 0.84591663, + "num_input_tokens_seen": 177454720, + "router_z_loss_mlp": 0.34130859, + "step": 2131, + "time_per_iteration": 2.684115171432495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124098, + "balance_loss_mlp": 1.08738196, + "epoch": 0.41015775298191615, + "flos": 516546948096.0, + "grad_norm": 0.059898700524732326, + "language_loss": 0.84854865, + "learning_rate": 0.0006665460798976027, + "loss": 0.85978961, + "num_input_tokens_seen": 177528224, + "router_z_loss_mlp": 0.3671875, + "step": 2132, + "time_per_iteration": 2.748350143432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114613, + "balance_loss_mlp": 1.07899356, + "epoch": 0.4103501346671797, + "flos": 510354929664.0, + "grad_norm": 0.057665198388541644, + "language_loss": 0.81392014, + "learning_rate": 0.0006662522961267947, + "loss": 0.82506627, + "num_input_tokens_seen": 177598176, + "router_z_loss_mlp": 0.35620117, + "step": 2133, + "time_per_iteration": 2.696699619293213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117281, + "balance_loss_mlp": 1.08192313, + "epoch": 0.41054251635244327, + "flos": 549752126976.0, + "grad_norm": 0.05272213252392562, + "language_loss": 0.87773252, + "learning_rate": 0.0006659584478111211, + "loss": 0.88890535, + "num_input_tokens_seen": 177675840, + "router_z_loss_mlp": 0.35351562, + "step": 2134, + "time_per_iteration": 2.793302536010742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118998, + "balance_loss_mlp": 1.08249605, + "epoch": 0.4107348980377068, + "flos": 839898450432.0, + "grad_norm": 0.06878890228068688, + "language_loss": 0.83315176, + "learning_rate": 0.000665664535064664, + "loss": 0.84434175, + "num_input_tokens_seen": 177751376, + "router_z_loss_mlp": 0.36499023, + "step": 2135, + "time_per_iteration": 3.0627260208129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104736, + "balance_loss_mlp": 1.06987929, + "epoch": 0.4109272797229704, + "flos": 503708461056.0, + "grad_norm": 0.05984370507865806, + "language_loss": 0.83017695, + "learning_rate": 0.0006653705580015303, + "loss": 0.84122425, + "num_input_tokens_seen": 177825264, + "router_z_loss_mlp": 0.34863281, + "step": 2136, + "time_per_iteration": 2.6851253509521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103645, + "balance_loss_mlp": 1.0668807, + "epoch": 0.4111196614082339, + "flos": 610830927360.0, + "grad_norm": 0.07790160743926922, + "language_loss": 0.86554241, + "learning_rate": 0.0006650765167358523, + "loss": 0.87657887, + "num_input_tokens_seen": 177901680, + "router_z_loss_mlp": 0.36743164, + "step": 2137, + "time_per_iteration": 2.7750425338745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111084, + "balance_loss_mlp": 1.07579243, + "epoch": 0.4113120430934975, + "flos": 453165414912.0, + "grad_norm": 0.06074101962252474, + "language_loss": 0.9028185, + "learning_rate": 0.0006647824113817864, + "loss": 0.91392696, + "num_input_tokens_seen": 177965264, + "router_z_loss_mlp": 0.3503418, + "step": 2138, + "time_per_iteration": 2.5466508865356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120271, + "balance_loss_mlp": 1.08348298, + "epoch": 0.41150442477876104, + "flos": 541600971264.0, + "grad_norm": 0.0860402389983067, + "language_loss": 0.81677365, + "learning_rate": 0.000664488242053515, + "loss": 0.82797635, + "num_input_tokens_seen": 178039712, + "router_z_loss_mlp": 0.36767578, + "step": 2139, + "time_per_iteration": 2.7149875164031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114944, + "balance_loss_mlp": 1.08108878, + "epoch": 0.4116968064640246, + "flos": 576291386880.0, + "grad_norm": 0.05168082296105111, + "language_loss": 0.83784723, + "learning_rate": 0.0006641940088652445, + "loss": 0.84899676, + "num_input_tokens_seen": 178114080, + "router_z_loss_mlp": 0.33886719, + "step": 2140, + "time_per_iteration": 2.7871952056884766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118503, + "balance_loss_mlp": 1.08130932, + "epoch": 0.4118891881492882, + "flos": 496115642880.0, + "grad_norm": 0.07036618696819374, + "language_loss": 0.8248812, + "learning_rate": 0.0006638997119312065, + "loss": 0.83606619, + "num_input_tokens_seen": 178188032, + "router_z_loss_mlp": 0.37207031, + "step": 2141, + "time_per_iteration": 2.7391679286956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038617, + "balance_loss_mlp": 1.02841258, + "epoch": 0.41208156983455174, + "flos": 1538395877376.0, + "grad_norm": 0.01970513212166274, + "language_loss": 0.75063306, + "learning_rate": 0.0006636053513656568, + "loss": 0.76101923, + "num_input_tokens_seen": 178395328, + "router_z_loss_mlp": 0.10205078, + "step": 2142, + "time_per_iteration": 4.920190095901489 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113096, + "balance_loss_mlp": 1.09562647, + "epoch": 0.41227395151981533, + "flos": 584968946688.0, + "grad_norm": 0.07114532863779677, + "language_loss": 0.8524918, + "learning_rate": 0.000663310927282877, + "loss": 0.86380136, + "num_input_tokens_seen": 178471952, + "router_z_loss_mlp": 0.35327148, + "step": 2143, + "time_per_iteration": 2.762634515762329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126888, + "balance_loss_mlp": 1.09098172, + "epoch": 0.41246633320507886, + "flos": 442926268416.0, + "grad_norm": 0.06302616573108136, + "language_loss": 0.86451441, + "learning_rate": 0.000663016439797172, + "loss": 0.87578332, + "num_input_tokens_seen": 178542192, + "router_z_loss_mlp": 0.35913086, + "step": 2144, + "time_per_iteration": 2.623093366622925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117577, + "balance_loss_mlp": 1.082816, + "epoch": 0.41265871489034245, + "flos": 579962985984.0, + "grad_norm": 0.054034946771414454, + "language_loss": 0.80777407, + "learning_rate": 0.0006627218890228724, + "loss": 0.81894982, + "num_input_tokens_seen": 178622736, + "router_z_loss_mlp": 0.34765625, + "step": 2145, + "time_per_iteration": 2.79042911529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118351, + "balance_loss_mlp": 1.08373237, + "epoch": 0.412851096575606, + "flos": 761229964800.0, + "grad_norm": 0.06837741268569841, + "language_loss": 0.83587825, + "learning_rate": 0.0006624272750743326, + "loss": 0.84706175, + "num_input_tokens_seen": 178705808, + "router_z_loss_mlp": 0.34643555, + "step": 2146, + "time_per_iteration": 3.0066065788269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110844, + "balance_loss_mlp": 1.07591534, + "epoch": 0.41304347826086957, + "flos": 555353501184.0, + "grad_norm": 0.052525216454956766, + "language_loss": 0.83126348, + "learning_rate": 0.0006621325980659322, + "loss": 0.84237194, + "num_input_tokens_seen": 178781200, + "router_z_loss_mlp": 0.34912109, + "step": 2147, + "time_per_iteration": 2.77634334564209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110797, + "balance_loss_mlp": 1.07429504, + "epoch": 0.41323585994613315, + "flos": 665712940032.0, + "grad_norm": 0.06743799661442922, + "language_loss": 0.82004929, + "learning_rate": 0.000661837858112075, + "loss": 0.83115721, + "num_input_tokens_seen": 178855072, + "router_z_loss_mlp": 0.36499023, + "step": 2148, + "time_per_iteration": 2.8309879302978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108806, + "balance_loss_mlp": 1.07156515, + "epoch": 0.4134282416313967, + "flos": 548699692032.0, + "grad_norm": 0.060878143567582824, + "language_loss": 0.88845801, + "learning_rate": 0.0006615430553271888, + "loss": 0.89954603, + "num_input_tokens_seen": 178927936, + "router_z_loss_mlp": 0.37231445, + "step": 2149, + "time_per_iteration": 2.7831413745880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110838, + "balance_loss_mlp": 1.0737617, + "epoch": 0.4136206233166603, + "flos": 646262489088.0, + "grad_norm": 0.05890657915946428, + "language_loss": 0.85358977, + "learning_rate": 0.0006612481898257264, + "loss": 0.86467361, + "num_input_tokens_seen": 179007792, + "router_z_loss_mlp": 0.34619141, + "step": 2150, + "time_per_iteration": 2.8594231605529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116144, + "balance_loss_mlp": 1.08021438, + "epoch": 0.4138130050019238, + "flos": 517354905600.0, + "grad_norm": 0.0708787663681645, + "language_loss": 0.85383213, + "learning_rate": 0.000660953261722165, + "loss": 0.86499357, + "num_input_tokens_seen": 179075200, + "router_z_loss_mlp": 0.359375, + "step": 2151, + "time_per_iteration": 2.616218090057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110269, + "balance_loss_mlp": 1.07512605, + "epoch": 0.4140053866871874, + "flos": 609254659584.0, + "grad_norm": 0.05740780888166335, + "language_loss": 0.82834315, + "learning_rate": 0.0006606582711310055, + "loss": 0.83944577, + "num_input_tokens_seen": 179144448, + "router_z_loss_mlp": 0.3515625, + "step": 2152, + "time_per_iteration": 2.752922773361206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116071, + "balance_loss_mlp": 1.07918727, + "epoch": 0.4141977683724509, + "flos": 579762925056.0, + "grad_norm": 0.062483875204726216, + "language_loss": 0.83428371, + "learning_rate": 0.0006603632181667736, + "loss": 0.84544444, + "num_input_tokens_seen": 179215776, + "router_z_loss_mlp": 0.36865234, + "step": 2153, + "time_per_iteration": 2.6699299812316895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093007, + "balance_loss_mlp": 1.0828501, + "epoch": 0.4143901500577145, + "flos": 1307312317440.0, + "grad_norm": 0.03944020407638644, + "language_loss": 0.78943324, + "learning_rate": 0.0006600681029440187, + "loss": 0.8003633, + "num_input_tokens_seen": 179436688, + "router_z_loss_mlp": 0.1015625, + "step": 2154, + "time_per_iteration": 4.931839227676392 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117724, + "balance_loss_mlp": 1.0825572, + "epoch": 0.41458253174297804, + "flos": 460189983744.0, + "grad_norm": 0.08977793466970029, + "language_loss": 0.82004881, + "learning_rate": 0.0006597729255773153, + "loss": 0.83122605, + "num_input_tokens_seen": 179503264, + "router_z_loss_mlp": 0.3515625, + "step": 2155, + "time_per_iteration": 2.5100300312042236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114058, + "balance_loss_mlp": 1.07769895, + "epoch": 0.41477491342824163, + "flos": 553364628480.0, + "grad_norm": 0.24704033930451297, + "language_loss": 0.82534748, + "learning_rate": 0.0006594776861812608, + "loss": 0.83648813, + "num_input_tokens_seen": 179574864, + "router_z_loss_mlp": 0.36328125, + "step": 2156, + "time_per_iteration": 2.652275562286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124067, + "balance_loss_mlp": 1.0867784, + "epoch": 0.4149672951135052, + "flos": 697771708416.0, + "grad_norm": 0.053182178449683815, + "language_loss": 0.86615425, + "learning_rate": 0.0006591823848704776, + "loss": 0.87739491, + "num_input_tokens_seen": 179658208, + "router_z_loss_mlp": 0.37280273, + "step": 2157, + "time_per_iteration": 2.958137273788452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123111, + "balance_loss_mlp": 1.08653796, + "epoch": 0.41515967679876875, + "flos": 565750863360.0, + "grad_norm": 0.05319975052329094, + "language_loss": 0.81529272, + "learning_rate": 0.0006588870217596117, + "loss": 0.82652378, + "num_input_tokens_seen": 179732320, + "router_z_loss_mlp": 0.36572266, + "step": 2158, + "time_per_iteration": 2.739755392074585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136072, + "balance_loss_mlp": 1.09847283, + "epoch": 0.41535205848403234, + "flos": 501185843712.0, + "grad_norm": 0.06859141393857857, + "language_loss": 0.85955006, + "learning_rate": 0.0006585915969633334, + "loss": 0.87091076, + "num_input_tokens_seen": 179801616, + "router_z_loss_mlp": 0.37573242, + "step": 2159, + "time_per_iteration": 2.561397075653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138332, + "balance_loss_mlp": 1.1019969, + "epoch": 0.41554444016929587, + "flos": 607554680832.0, + "grad_norm": 0.06079365960323944, + "language_loss": 0.89314306, + "learning_rate": 0.0006582961105963366, + "loss": 0.90452635, + "num_input_tokens_seen": 179876112, + "router_z_loss_mlp": 0.36328125, + "step": 2160, + "time_per_iteration": 2.791609287261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141319, + "balance_loss_mlp": 1.10546052, + "epoch": 0.41573682185455946, + "flos": 529115991552.0, + "grad_norm": 0.06462553372591408, + "language_loss": 0.77976739, + "learning_rate": 0.0006580005627733395, + "loss": 0.79118055, + "num_input_tokens_seen": 179949936, + "router_z_loss_mlp": 0.35913086, + "step": 2161, + "time_per_iteration": 2.6615841388702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152655, + "balance_loss_mlp": 1.11536634, + "epoch": 0.415929203539823, + "flos": 504956187648.0, + "grad_norm": 0.06713934338553489, + "language_loss": 0.82142949, + "learning_rate": 0.0006577049536090838, + "loss": 0.83295602, + "num_input_tokens_seen": 180023184, + "router_z_loss_mlp": 0.37280273, + "step": 2162, + "time_per_iteration": 2.7025601863861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114864, + "balance_loss_mlp": 1.11163712, + "epoch": 0.4161215852250866, + "flos": 582737794560.0, + "grad_norm": 0.06389110494138472, + "language_loss": 0.8567937, + "learning_rate": 0.000657409283218335, + "loss": 0.86828005, + "num_input_tokens_seen": 180091728, + "router_z_loss_mlp": 0.37011719, + "step": 2163, + "time_per_iteration": 2.6993329524993896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160123, + "balance_loss_mlp": 1.12352586, + "epoch": 0.4163139669103501, + "flos": 490697077248.0, + "grad_norm": 0.058431004936437055, + "language_loss": 0.81466627, + "learning_rate": 0.0006571135517158829, + "loss": 0.82626748, + "num_input_tokens_seen": 180162096, + "router_z_loss_mlp": 0.3659668, + "step": 2164, + "time_per_iteration": 2.6519243717193604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114432, + "balance_loss_mlp": 1.13316202, + "epoch": 0.4165063485956137, + "flos": 1288158474240.0, + "grad_norm": 0.04824937130362004, + "language_loss": 0.76764059, + "learning_rate": 0.0006568177592165404, + "loss": 0.77908379, + "num_input_tokens_seen": 180380912, + "router_z_loss_mlp": 0.11181641, + "step": 2165, + "time_per_iteration": 4.770123481750488 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155561, + "balance_loss_mlp": 1.11765289, + "epoch": 0.4166987302808773, + "flos": 495263268864.0, + "grad_norm": 0.07363984603082524, + "language_loss": 0.83210087, + "learning_rate": 0.0006565219058351444, + "loss": 0.84365654, + "num_input_tokens_seen": 180447424, + "router_z_loss_mlp": 0.37866211, + "step": 2166, + "time_per_iteration": 2.6601247787475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144571, + "balance_loss_mlp": 1.10470724, + "epoch": 0.4168911119661408, + "flos": 464071555584.0, + "grad_norm": 0.06568932383648114, + "language_loss": 0.83008349, + "learning_rate": 0.0006562259916865553, + "loss": 0.84152913, + "num_input_tokens_seen": 180516336, + "router_z_loss_mlp": 0.39868164, + "step": 2167, + "time_per_iteration": 2.5785412788391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137223, + "balance_loss_mlp": 1.0999341, + "epoch": 0.4170834936514044, + "flos": 536787730944.0, + "grad_norm": 0.06458514122378838, + "language_loss": 0.79966152, + "learning_rate": 0.0006559300168856573, + "loss": 0.81103373, + "num_input_tokens_seen": 180589824, + "router_z_loss_mlp": 0.37255859, + "step": 2168, + "time_per_iteration": 2.7237303256988525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140045, + "balance_loss_mlp": 1.10316169, + "epoch": 0.41727587533666793, + "flos": 550683795456.0, + "grad_norm": 0.050633821406227124, + "language_loss": 0.86603534, + "learning_rate": 0.0006556339815473577, + "loss": 0.8774358, + "num_input_tokens_seen": 180661296, + "router_z_loss_mlp": 0.36889648, + "step": 2169, + "time_per_iteration": 2.6403653621673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140714, + "balance_loss_mlp": 1.10254359, + "epoch": 0.4174682570219315, + "flos": 631111357440.0, + "grad_norm": 0.05999280354484277, + "language_loss": 0.86559451, + "learning_rate": 0.000655337885786588, + "loss": 0.87700164, + "num_input_tokens_seen": 180744896, + "router_z_loss_mlp": 0.3815918, + "step": 2170, + "time_per_iteration": 2.927175283432007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144977, + "balance_loss_mlp": 1.10737872, + "epoch": 0.41766063870719505, + "flos": 519751613952.0, + "grad_norm": 0.06541761902088469, + "language_loss": 0.85292417, + "learning_rate": 0.0006550417297183025, + "loss": 0.86437398, + "num_input_tokens_seen": 180813008, + "router_z_loss_mlp": 0.37597656, + "step": 2171, + "time_per_iteration": 2.617203950881958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139774, + "balance_loss_mlp": 1.10174668, + "epoch": 0.41785302039245864, + "flos": 557935589376.0, + "grad_norm": 0.06470887192105082, + "language_loss": 0.81668884, + "learning_rate": 0.0006547455134574793, + "loss": 0.82808661, + "num_input_tokens_seen": 180886480, + "router_z_loss_mlp": 0.37988281, + "step": 2172, + "time_per_iteration": 2.6800732612609863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145151, + "balance_loss_mlp": 1.10817289, + "epoch": 0.41804540207772223, + "flos": 788529821184.0, + "grad_norm": 0.06060457888036509, + "language_loss": 0.84434199, + "learning_rate": 0.0006544492371191198, + "loss": 0.85579354, + "num_input_tokens_seen": 180973776, + "router_z_loss_mlp": 0.36962891, + "step": 2173, + "time_per_iteration": 3.134876251220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140851, + "balance_loss_mlp": 1.10113096, + "epoch": 0.41823778376298576, + "flos": 904332418560.0, + "grad_norm": 0.09700819760133231, + "language_loss": 0.83721489, + "learning_rate": 0.0006541529008182485, + "loss": 0.84862345, + "num_input_tokens_seen": 181062768, + "router_z_loss_mlp": 0.39697266, + "step": 2174, + "time_per_iteration": 3.1724131107330322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113477, + "balance_loss_mlp": 1.09893537, + "epoch": 0.41843016544824935, + "flos": 511560811008.0, + "grad_norm": 0.060160949925642034, + "language_loss": 0.87700981, + "learning_rate": 0.0006538565046699136, + "loss": 0.88835752, + "num_input_tokens_seen": 181129872, + "router_z_loss_mlp": 0.3581543, + "step": 2175, + "time_per_iteration": 2.5730292797088623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133428, + "balance_loss_mlp": 1.09683084, + "epoch": 0.4186225471335129, + "flos": 653077085184.0, + "grad_norm": 0.06692113802371265, + "language_loss": 0.81824857, + "learning_rate": 0.0006535600487891862, + "loss": 0.82958287, + "num_input_tokens_seen": 181208112, + "router_z_loss_mlp": 0.3659668, + "step": 2176, + "time_per_iteration": 2.7692394256591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121709, + "balance_loss_mlp": 1.08651876, + "epoch": 0.41881492881877647, + "flos": 569158161408.0, + "grad_norm": 0.07459509047969586, + "language_loss": 0.89445305, + "learning_rate": 0.0006532635332911603, + "loss": 0.90567011, + "num_input_tokens_seen": 181278736, + "router_z_loss_mlp": 0.3515625, + "step": 2177, + "time_per_iteration": 2.668281078338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122309, + "balance_loss_mlp": 1.08449602, + "epoch": 0.41900731050404, + "flos": 911878248960.0, + "grad_norm": 0.054056674099833946, + "language_loss": 0.80669487, + "learning_rate": 0.0006529669582909541, + "loss": 0.81791794, + "num_input_tokens_seen": 181362512, + "router_z_loss_mlp": 0.37744141, + "step": 2178, + "time_per_iteration": 3.234210729598999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132134, + "balance_loss_mlp": 1.0946312, + "epoch": 0.4191996921893036, + "flos": 535755119616.0, + "grad_norm": 0.13706718234639897, + "language_loss": 0.85650241, + "learning_rate": 0.0006526703239037077, + "loss": 0.86782372, + "num_input_tokens_seen": 181432080, + "router_z_loss_mlp": 0.37475586, + "step": 2179, + "time_per_iteration": 2.6495871543884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129835, + "balance_loss_mlp": 1.09094954, + "epoch": 0.4193920738745671, + "flos": 582636478464.0, + "grad_norm": 0.09871097727336539, + "language_loss": 0.86649984, + "learning_rate": 0.0006523736302445851, + "loss": 0.8777982, + "num_input_tokens_seen": 181507296, + "router_z_loss_mlp": 0.38891602, + "step": 2180, + "time_per_iteration": 2.7558817863464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133221, + "balance_loss_mlp": 1.09390545, + "epoch": 0.4195844555598307, + "flos": 1335782472192.0, + "grad_norm": 0.05706426412838818, + "language_loss": 0.77595234, + "learning_rate": 0.0006520768774287728, + "loss": 0.78728461, + "num_input_tokens_seen": 181599408, + "router_z_loss_mlp": 0.39306641, + "step": 2181, + "time_per_iteration": 3.7205944061279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143371, + "balance_loss_mlp": 1.10436535, + "epoch": 0.4197768372450943, + "flos": 598783145472.0, + "grad_norm": 0.06053658357019196, + "language_loss": 0.85689628, + "learning_rate": 0.0006517800655714806, + "loss": 0.86832994, + "num_input_tokens_seen": 181674944, + "router_z_loss_mlp": 0.39013672, + "step": 2182, + "time_per_iteration": 2.8325769901275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140717, + "balance_loss_mlp": 1.10218823, + "epoch": 0.4199692189303578, + "flos": 735261525504.0, + "grad_norm": 0.07751994631636654, + "language_loss": 0.85342467, + "learning_rate": 0.0006514831947879407, + "loss": 0.86483186, + "num_input_tokens_seen": 181756704, + "router_z_loss_mlp": 0.38500977, + "step": 2183, + "time_per_iteration": 2.930466890335083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154155, + "balance_loss_mlp": 1.11531675, + "epoch": 0.4201616006156214, + "flos": 750214794240.0, + "grad_norm": 0.061313063449444025, + "language_loss": 0.78360265, + "learning_rate": 0.0006511862651934091, + "loss": 0.7951442, + "num_input_tokens_seen": 181837952, + "router_z_loss_mlp": 0.38842773, + "step": 2184, + "time_per_iteration": 3.0874462127685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168235, + "balance_loss_mlp": 1.1299212, + "epoch": 0.42035398230088494, + "flos": 547029448704.0, + "grad_norm": 0.07362784353092817, + "language_loss": 0.820894, + "learning_rate": 0.0006508892769031638, + "loss": 0.83257627, + "num_input_tokens_seen": 181906896, + "router_z_loss_mlp": 0.3828125, + "step": 2185, + "time_per_iteration": 2.6239352226257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152043, + "balance_loss_mlp": 1.11551726, + "epoch": 0.42054636398614853, + "flos": 616911717888.0, + "grad_norm": 0.06908564705964859, + "language_loss": 0.87278891, + "learning_rate": 0.000650592230032506, + "loss": 0.88430935, + "num_input_tokens_seen": 181974976, + "router_z_loss_mlp": 0.36523438, + "step": 2186, + "time_per_iteration": 2.7282140254974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149658, + "balance_loss_mlp": 1.11079597, + "epoch": 0.42073874567141206, + "flos": 640394242560.0, + "grad_norm": 0.0823679101553184, + "language_loss": 0.85327846, + "learning_rate": 0.0006502951246967595, + "loss": 0.86477506, + "num_input_tokens_seen": 182054704, + "router_z_loss_mlp": 0.38891602, + "step": 2187, + "time_per_iteration": 2.8729426860809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154068, + "balance_loss_mlp": 1.1164453, + "epoch": 0.42093112735667565, + "flos": 493783174656.0, + "grad_norm": 0.05336445965116177, + "language_loss": 0.86749196, + "learning_rate": 0.0006499979610112706, + "loss": 0.87903261, + "num_input_tokens_seen": 182129696, + "router_z_loss_mlp": 0.3762207, + "step": 2188, + "time_per_iteration": 2.7119579315185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151369, + "balance_loss_mlp": 1.1127454, + "epoch": 0.4211235090419392, + "flos": 542364512256.0, + "grad_norm": 0.055701229884667774, + "language_loss": 0.84561181, + "learning_rate": 0.000649700739091409, + "loss": 0.85712552, + "num_input_tokens_seen": 182203792, + "router_z_loss_mlp": 0.38623047, + "step": 2189, + "time_per_iteration": 2.7023189067840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108128, + "balance_loss_mlp": 1.07126629, + "epoch": 0.42131589072720277, + "flos": 1532149530624.0, + "grad_norm": 0.037864476589066096, + "language_loss": 0.73836273, + "learning_rate": 0.0006494034590525657, + "loss": 0.74917555, + "num_input_tokens_seen": 182432080, + "router_z_loss_mlp": 0.10009766, + "step": 2190, + "time_per_iteration": 4.808679103851318 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142515, + "balance_loss_mlp": 1.10751486, + "epoch": 0.42150827241246636, + "flos": 566852857344.0, + "grad_norm": 0.07155258064415941, + "language_loss": 0.85762346, + "learning_rate": 0.0006491061210101557, + "loss": 0.8690486, + "num_input_tokens_seen": 182500256, + "router_z_loss_mlp": 0.35009766, + "step": 2191, + "time_per_iteration": 2.7315032482147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146027, + "balance_loss_mlp": 1.10880995, + "epoch": 0.4217006540977299, + "flos": 707561174016.0, + "grad_norm": 0.05430057095490736, + "language_loss": 0.84269011, + "learning_rate": 0.0006488087250796157, + "loss": 0.85415035, + "num_input_tokens_seen": 182582912, + "router_z_loss_mlp": 0.37231445, + "step": 2192, + "time_per_iteration": 2.91867995262146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140476, + "balance_loss_mlp": 1.10154223, + "epoch": 0.4218930357829935, + "flos": 627291454464.0, + "grad_norm": 0.05336306174245454, + "language_loss": 0.81998622, + "learning_rate": 0.0006485112713764049, + "loss": 0.83139098, + "num_input_tokens_seen": 182670304, + "router_z_loss_mlp": 0.38916016, + "step": 2193, + "time_per_iteration": 2.954740047454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139123, + "balance_loss_mlp": 1.10178626, + "epoch": 0.422085417468257, + "flos": 460345628160.0, + "grad_norm": 0.05416843930927548, + "language_loss": 0.83712393, + "learning_rate": 0.0006482137600160051, + "loss": 0.84851515, + "num_input_tokens_seen": 182735024, + "router_z_loss_mlp": 0.3737793, + "step": 2194, + "time_per_iteration": 2.4989676475524902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144403, + "balance_loss_mlp": 1.10573113, + "epoch": 0.4222777991535206, + "flos": 474026577408.0, + "grad_norm": 0.05184002865736912, + "language_loss": 0.8501671, + "learning_rate": 0.0006479161911139206, + "loss": 0.86161113, + "num_input_tokens_seen": 182805024, + "router_z_loss_mlp": 0.38671875, + "step": 2195, + "time_per_iteration": 2.5739963054656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135341, + "balance_loss_mlp": 1.09721804, + "epoch": 0.4224701808387841, + "flos": 470886151680.0, + "grad_norm": 0.08063840338659255, + "language_loss": 0.85617948, + "learning_rate": 0.0006476185647856778, + "loss": 0.86753291, + "num_input_tokens_seen": 182871360, + "router_z_loss_mlp": 0.38134766, + "step": 2196, + "time_per_iteration": 2.578218936920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124656, + "balance_loss_mlp": 1.08808231, + "epoch": 0.4226625625240477, + "flos": 677525783040.0, + "grad_norm": 0.05804099842364966, + "language_loss": 0.82180464, + "learning_rate": 0.0006473208811468255, + "loss": 0.8330512, + "num_input_tokens_seen": 182952912, + "router_z_loss_mlp": 0.36547852, + "step": 2197, + "time_per_iteration": 2.8833000659942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123675, + "balance_loss_mlp": 1.08707809, + "epoch": 0.4228549442093113, + "flos": 503525652480.0, + "grad_norm": 0.058050592535879256, + "language_loss": 0.84475237, + "learning_rate": 0.0006470231403129347, + "loss": 0.8559891, + "num_input_tokens_seen": 183022016, + "router_z_loss_mlp": 0.36621094, + "step": 2198, + "time_per_iteration": 2.590959072113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124319, + "balance_loss_mlp": 1.08781683, + "epoch": 0.42304732589457483, + "flos": 611848857600.0, + "grad_norm": 0.05086119187590394, + "language_loss": 0.82252729, + "learning_rate": 0.0006467253423995988, + "loss": 0.83377045, + "num_input_tokens_seen": 183101776, + "router_z_loss_mlp": 0.36499023, + "step": 2199, + "time_per_iteration": 2.8386192321777344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128105, + "balance_loss_mlp": 1.0917697, + "epoch": 0.4232397075798384, + "flos": 515570863104.0, + "grad_norm": 0.06679650853448169, + "language_loss": 0.79627949, + "learning_rate": 0.000646427487522433, + "loss": 0.8075605, + "num_input_tokens_seen": 183171392, + "router_z_loss_mlp": 0.36352539, + "step": 2200, + "time_per_iteration": 2.635103464126587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120926, + "balance_loss_mlp": 1.08423305, + "epoch": 0.42343208926510195, + "flos": 589796868096.0, + "grad_norm": 0.0831390189187338, + "language_loss": 0.83172977, + "learning_rate": 0.0006461295757970749, + "loss": 0.84293896, + "num_input_tokens_seen": 183253936, + "router_z_loss_mlp": 0.36669922, + "step": 2201, + "time_per_iteration": 2.819474697113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118378, + "balance_loss_mlp": 1.07891917, + "epoch": 0.42362447095036554, + "flos": 640636521984.0, + "grad_norm": 0.062417347947693186, + "language_loss": 0.81792694, + "learning_rate": 0.0006458316073391839, + "loss": 0.82911074, + "num_input_tokens_seen": 183333744, + "router_z_loss_mlp": 0.39428711, + "step": 2202, + "time_per_iteration": 2.871166229248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122162, + "balance_loss_mlp": 1.0872103, + "epoch": 0.42381685263562907, + "flos": 512680057344.0, + "grad_norm": 0.05500893378921445, + "language_loss": 0.88072616, + "learning_rate": 0.0006455335822644422, + "loss": 0.89194781, + "num_input_tokens_seen": 183401904, + "router_z_loss_mlp": 0.34936523, + "step": 2203, + "time_per_iteration": 2.6111316680908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123624, + "balance_loss_mlp": 1.08683574, + "epoch": 0.42400923432089266, + "flos": 546782400000.0, + "grad_norm": 0.06843699867702463, + "language_loss": 0.78204858, + "learning_rate": 0.0006452355006885527, + "loss": 0.79328489, + "num_input_tokens_seen": 183471312, + "router_z_loss_mlp": 0.36791992, + "step": 2204, + "time_per_iteration": 2.6248953342437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119975, + "balance_loss_mlp": 1.08209014, + "epoch": 0.4242016160061562, + "flos": 622154815488.0, + "grad_norm": 0.07209183527246785, + "language_loss": 0.87310261, + "learning_rate": 0.0006449373627272412, + "loss": 0.88430238, + "num_input_tokens_seen": 183539184, + "router_z_loss_mlp": 0.37866211, + "step": 2205, + "time_per_iteration": 2.703838348388672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119116, + "balance_loss_mlp": 1.08197045, + "epoch": 0.4243939976914198, + "flos": 571913146368.0, + "grad_norm": 0.07114514004539872, + "language_loss": 0.82698691, + "learning_rate": 0.0006446391684962553, + "loss": 0.8381781, + "num_input_tokens_seen": 183607504, + "router_z_loss_mlp": 0.37158203, + "step": 2206, + "time_per_iteration": 2.6619176864624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115514, + "balance_loss_mlp": 1.08022797, + "epoch": 0.42458637937668336, + "flos": 448740186624.0, + "grad_norm": 0.05684297237550015, + "language_loss": 0.83361518, + "learning_rate": 0.000644340918111364, + "loss": 0.84477031, + "num_input_tokens_seen": 183674720, + "router_z_loss_mlp": 0.3527832, + "step": 2207, + "time_per_iteration": 2.5367140769958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126169, + "balance_loss_mlp": 1.09016824, + "epoch": 0.4247787610619469, + "flos": 435407602176.0, + "grad_norm": 0.07504639835111325, + "language_loss": 0.8513602, + "learning_rate": 0.0006440426116883585, + "loss": 0.8626219, + "num_input_tokens_seen": 183740448, + "router_z_loss_mlp": 0.36010742, + "step": 2208, + "time_per_iteration": 2.5879015922546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118301, + "balance_loss_mlp": 1.08129835, + "epoch": 0.4249711427472105, + "flos": 496078566912.0, + "grad_norm": 0.06421639244231503, + "language_loss": 0.86279738, + "learning_rate": 0.0006437442493430519, + "loss": 0.8739804, + "num_input_tokens_seen": 183812640, + "router_z_loss_mlp": 0.37011719, + "step": 2209, + "time_per_iteration": 2.6396701335906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114919, + "balance_loss_mlp": 1.07741535, + "epoch": 0.425163524432474, + "flos": 655819587072.0, + "grad_norm": 0.06478280605491378, + "language_loss": 0.87082028, + "learning_rate": 0.000643445831191278, + "loss": 0.88196945, + "num_input_tokens_seen": 183895312, + "router_z_loss_mlp": 0.37524414, + "step": 2210, + "time_per_iteration": 2.902726173400879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109225, + "balance_loss_mlp": 1.07265139, + "epoch": 0.4253559061177376, + "flos": 650608796160.0, + "grad_norm": 0.0604627940505335, + "language_loss": 0.81718135, + "learning_rate": 0.0006431473573488937, + "loss": 0.82827359, + "num_input_tokens_seen": 183966384, + "router_z_loss_mlp": 0.3659668, + "step": 2211, + "time_per_iteration": 2.756131887435913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101882, + "balance_loss_mlp": 1.06492758, + "epoch": 0.42554828780300114, + "flos": 554155333632.0, + "grad_norm": 0.0751061946408966, + "language_loss": 0.84961367, + "learning_rate": 0.0006428488279317765, + "loss": 0.86063254, + "num_input_tokens_seen": 184031728, + "router_z_loss_mlp": 0.36938477, + "step": 2212, + "time_per_iteration": 2.6532113552093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100031, + "balance_loss_mlp": 1.06541276, + "epoch": 0.4257406694882647, + "flos": 514407200256.0, + "grad_norm": 0.06889274289933833, + "language_loss": 0.87372804, + "learning_rate": 0.0006425502430558259, + "loss": 0.88472843, + "num_input_tokens_seen": 184096160, + "router_z_loss_mlp": 0.34619141, + "step": 2213, + "time_per_iteration": 2.6332669258117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105988, + "balance_loss_mlp": 1.06874728, + "epoch": 0.42593305117352825, + "flos": 515645015040.0, + "grad_norm": 0.08165118310272598, + "language_loss": 0.84992635, + "learning_rate": 0.0006422516028369628, + "loss": 0.86098623, + "num_input_tokens_seen": 184169664, + "router_z_loss_mlp": 0.37231445, + "step": 2214, + "time_per_iteration": 2.618557929992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098802, + "balance_loss_mlp": 1.06237185, + "epoch": 0.42612543285879184, + "flos": 588059813376.0, + "grad_norm": 0.05512742279801928, + "language_loss": 0.8369562, + "learning_rate": 0.0006419529073911296, + "loss": 0.84794426, + "num_input_tokens_seen": 184249152, + "router_z_loss_mlp": 0.36425781, + "step": 2215, + "time_per_iteration": 2.833543062210083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095611, + "balance_loss_mlp": 1.06166017, + "epoch": 0.42631781454405543, + "flos": 635472345600.0, + "grad_norm": 0.0818108199754697, + "language_loss": 0.85651129, + "learning_rate": 0.0006416541568342901, + "loss": 0.8674674, + "num_input_tokens_seen": 184326816, + "router_z_loss_mlp": 0.33935547, + "step": 2216, + "time_per_iteration": 2.8430728912353516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097148, + "balance_loss_mlp": 1.0622437, + "epoch": 0.42651019622931896, + "flos": 541161202176.0, + "grad_norm": 0.05864229124252446, + "language_loss": 0.84272695, + "learning_rate": 0.0006413553512824297, + "loss": 0.85369843, + "num_input_tokens_seen": 184404336, + "router_z_loss_mlp": 0.34912109, + "step": 2217, + "time_per_iteration": 2.7368276119232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095005, + "balance_loss_mlp": 1.05943322, + "epoch": 0.42670257791458255, + "flos": 558158045184.0, + "grad_norm": 0.06419705252846208, + "language_loss": 0.84589773, + "learning_rate": 0.0006410564908515549, + "loss": 0.85684776, + "num_input_tokens_seen": 184472320, + "router_z_loss_mlp": 0.35595703, + "step": 2218, + "time_per_iteration": 2.650841236114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096054, + "balance_loss_mlp": 1.06052935, + "epoch": 0.4268949595998461, + "flos": 621309782016.0, + "grad_norm": 0.06892642653628764, + "language_loss": 0.85406113, + "learning_rate": 0.0006407575756576935, + "loss": 0.86502165, + "num_input_tokens_seen": 184544704, + "router_z_loss_mlp": 0.35546875, + "step": 2219, + "time_per_iteration": 2.7199461460113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103421, + "balance_loss_mlp": 1.0681113, + "epoch": 0.42708734128510967, + "flos": 537919460352.0, + "grad_norm": 0.055123892223664483, + "language_loss": 0.88112384, + "learning_rate": 0.0006404586058168951, + "loss": 0.89215803, + "num_input_tokens_seen": 184622544, + "router_z_loss_mlp": 0.35327148, + "step": 2220, + "time_per_iteration": 2.7125062942504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106703, + "balance_loss_mlp": 1.07129836, + "epoch": 0.4272797229703732, + "flos": 502865998848.0, + "grad_norm": 0.06740071030395202, + "language_loss": 0.86848915, + "learning_rate": 0.0006401595814452296, + "loss": 0.87955624, + "num_input_tokens_seen": 184692544, + "router_z_loss_mlp": 0.35424805, + "step": 2221, + "time_per_iteration": 2.6037752628326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108589, + "balance_loss_mlp": 1.07349372, + "epoch": 0.4274721046556368, + "flos": 492453955584.0, + "grad_norm": 0.06763062065124635, + "language_loss": 0.81391692, + "learning_rate": 0.000639860502658789, + "loss": 0.82500279, + "num_input_tokens_seen": 184760480, + "router_z_loss_mlp": 0.35131836, + "step": 2222, + "time_per_iteration": 2.620530366897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110206, + "balance_loss_mlp": 1.07475281, + "epoch": 0.4276644863409004, + "flos": 568367456256.0, + "grad_norm": 0.07514934658842116, + "language_loss": 0.85168004, + "learning_rate": 0.0006395613695736853, + "loss": 0.86278212, + "num_input_tokens_seen": 184834080, + "router_z_loss_mlp": 0.35449219, + "step": 2223, + "time_per_iteration": 2.67494797706604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106777, + "balance_loss_mlp": 1.07015634, + "epoch": 0.4278568680261639, + "flos": 607436112384.0, + "grad_norm": 0.06258659729032073, + "language_loss": 0.81998539, + "learning_rate": 0.0006392621823060529, + "loss": 0.83105314, + "num_input_tokens_seen": 184905872, + "router_z_loss_mlp": 0.36621094, + "step": 2224, + "time_per_iteration": 2.729048490524292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107838, + "balance_loss_mlp": 1.07197976, + "epoch": 0.4280492497114275, + "flos": 560527589376.0, + "grad_norm": 0.07791132694448914, + "language_loss": 0.85259843, + "learning_rate": 0.0006389629409720465, + "loss": 0.86367679, + "num_input_tokens_seen": 184972320, + "router_z_loss_mlp": 0.35839844, + "step": 2225, + "time_per_iteration": 2.6461989879608154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102526, + "balance_loss_mlp": 1.06836081, + "epoch": 0.428241631396691, + "flos": 720646709760.0, + "grad_norm": 0.06694393428490365, + "language_loss": 0.88831687, + "learning_rate": 0.0006386636456878417, + "loss": 0.89934212, + "num_input_tokens_seen": 185051040, + "router_z_loss_mlp": 0.34155273, + "step": 2226, + "time_per_iteration": 2.8701326847076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106393, + "balance_loss_mlp": 1.07091641, + "epoch": 0.4284340130819546, + "flos": 429467774976.0, + "grad_norm": 0.07990341915486338, + "language_loss": 0.92087269, + "learning_rate": 0.0006383642965696353, + "loss": 0.93193656, + "num_input_tokens_seen": 185113552, + "router_z_loss_mlp": 0.35473633, + "step": 2227, + "time_per_iteration": 2.4640464782714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102589, + "balance_loss_mlp": 1.06544292, + "epoch": 0.42862639476721814, + "flos": 525016733184.0, + "grad_norm": 0.053395147694407376, + "language_loss": 0.82962096, + "learning_rate": 0.000638064893733645, + "loss": 0.84064686, + "num_input_tokens_seen": 185185056, + "router_z_loss_mlp": 0.37158203, + "step": 2228, + "time_per_iteration": 2.783597946166992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117505, + "balance_loss_mlp": 1.08198094, + "epoch": 0.42881877645248173, + "flos": 465346446336.0, + "grad_norm": 0.07356604001224937, + "language_loss": 0.89838171, + "learning_rate": 0.000637765437296109, + "loss": 0.90955675, + "num_input_tokens_seen": 185257248, + "router_z_loss_mlp": 0.35522461, + "step": 2229, + "time_per_iteration": 2.6639621257781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112327, + "balance_loss_mlp": 1.07644475, + "epoch": 0.42901115813774526, + "flos": 560297793024.0, + "grad_norm": 0.05563237387214821, + "language_loss": 0.85128897, + "learning_rate": 0.000637465927373287, + "loss": 0.86241227, + "num_input_tokens_seen": 185324800, + "router_z_loss_mlp": 0.35913086, + "step": 2230, + "time_per_iteration": 2.6883277893066406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107316, + "balance_loss_mlp": 1.07253075, + "epoch": 0.42920353982300885, + "flos": 561454115328.0, + "grad_norm": 0.06522010118943229, + "language_loss": 0.78980476, + "learning_rate": 0.000637166364081459, + "loss": 0.80087787, + "num_input_tokens_seen": 185393408, + "router_z_loss_mlp": 0.34790039, + "step": 2231, + "time_per_iteration": 2.711379051208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111807, + "balance_loss_mlp": 1.07814288, + "epoch": 0.42939592150827244, + "flos": 556248093696.0, + "grad_norm": 0.06512604260411947, + "language_loss": 0.84616333, + "learning_rate": 0.0006368667475369256, + "loss": 0.85728139, + "num_input_tokens_seen": 185467968, + "router_z_loss_mlp": 0.33666992, + "step": 2232, + "time_per_iteration": 2.7521519660949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083825, + "balance_loss_mlp": 1.07271492, + "epoch": 0.42958830319353597, + "flos": 1521623688192.0, + "grad_norm": 0.03367734377341464, + "language_loss": 0.78527778, + "learning_rate": 0.0006365670778560084, + "loss": 0.79611605, + "num_input_tokens_seen": 185705232, + "router_z_loss_mlp": 0.11132812, + "step": 2233, + "time_per_iteration": 4.941352605819702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106502, + "balance_loss_mlp": 1.05414832, + "epoch": 0.42978068487879956, + "flos": 1495813837824.0, + "grad_norm": 0.027928692850204096, + "language_loss": 0.78895426, + "learning_rate": 0.0006362673551550494, + "loss": 0.79960448, + "num_input_tokens_seen": 185932672, + "router_z_loss_mlp": 0.10888672, + "step": 2234, + "time_per_iteration": 4.825460910797119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117593, + "balance_loss_mlp": 1.08302259, + "epoch": 0.4299730665640631, + "flos": 546992372736.0, + "grad_norm": 0.05150642259295507, + "language_loss": 0.86345804, + "learning_rate": 0.0006359675795504112, + "loss": 0.87463403, + "num_input_tokens_seen": 186006288, + "router_z_loss_mlp": 0.34619141, + "step": 2235, + "time_per_iteration": 2.662977695465088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127167, + "balance_loss_mlp": 1.09099901, + "epoch": 0.4301654482493267, + "flos": 1129293342720.0, + "grad_norm": 0.07348370683515035, + "language_loss": 0.74711537, + "learning_rate": 0.0006356677511584775, + "loss": 0.75838703, + "num_input_tokens_seen": 186097168, + "router_z_loss_mlp": 0.36181641, + "step": 2236, + "time_per_iteration": 3.51220965385437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127234, + "balance_loss_mlp": 1.09337878, + "epoch": 0.4303578299345902, + "flos": 495750025728.0, + "grad_norm": 0.061045373266899905, + "language_loss": 0.86476523, + "learning_rate": 0.0006353678700956511, + "loss": 0.8760376, + "num_input_tokens_seen": 186163904, + "router_z_loss_mlp": 0.33886719, + "step": 2237, + "time_per_iteration": 2.60677170753479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125139, + "balance_loss_mlp": 1.09085464, + "epoch": 0.4305502116198538, + "flos": 615762736128.0, + "grad_norm": 0.06413862374233106, + "language_loss": 0.83745819, + "learning_rate": 0.0006350679364783569, + "loss": 0.84870958, + "num_input_tokens_seen": 186233888, + "router_z_loss_mlp": 0.34326172, + "step": 2238, + "time_per_iteration": 2.7771050930023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117438, + "balance_loss_mlp": 1.08212781, + "epoch": 0.4307425933051173, + "flos": 559260039168.0, + "grad_norm": 0.057478588831895126, + "language_loss": 0.85746336, + "learning_rate": 0.0006347679504230393, + "loss": 0.86863768, + "num_input_tokens_seen": 186301168, + "router_z_loss_mlp": 0.35351562, + "step": 2239, + "time_per_iteration": 2.6826984882354736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120648, + "balance_loss_mlp": 1.08405077, + "epoch": 0.4309349749903809, + "flos": 972166344192.0, + "grad_norm": 0.0566935574955873, + "language_loss": 0.76113296, + "learning_rate": 0.0006344679120461632, + "loss": 0.7723394, + "num_input_tokens_seen": 186392096, + "router_z_loss_mlp": 0.36621094, + "step": 2240, + "time_per_iteration": 3.3756330013275146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122418, + "balance_loss_mlp": 1.0843904, + "epoch": 0.4311273566756445, + "flos": 541924743168.0, + "grad_norm": 0.06383187448999712, + "language_loss": 0.80362582, + "learning_rate": 0.0006341678214642134, + "loss": 0.81484997, + "num_input_tokens_seen": 186458000, + "router_z_loss_mlp": 0.38037109, + "step": 2241, + "time_per_iteration": 2.6837639808654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121026, + "balance_loss_mlp": 1.08633661, + "epoch": 0.43131973836090803, + "flos": 761674503168.0, + "grad_norm": 0.06213603676301435, + "language_loss": 0.82894886, + "learning_rate": 0.0006338676787936963, + "loss": 0.84015912, + "num_input_tokens_seen": 186544992, + "router_z_loss_mlp": 0.34716797, + "step": 2242, + "time_per_iteration": 3.0835442543029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126247, + "balance_loss_mlp": 1.09019864, + "epoch": 0.4315121200461716, + "flos": 554530862592.0, + "grad_norm": 0.06026893794725229, + "language_loss": 0.83885002, + "learning_rate": 0.0006335674841511367, + "loss": 0.85011244, + "num_input_tokens_seen": 186614960, + "router_z_loss_mlp": 0.36035156, + "step": 2243, + "time_per_iteration": 2.6649861335754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054007, + "balance_loss_mlp": 1.04466057, + "epoch": 0.43170450173143515, + "flos": 1485334609920.0, + "grad_norm": 0.029651379922801115, + "language_loss": 0.7918117, + "learning_rate": 0.000633267237653081, + "loss": 0.80235171, + "num_input_tokens_seen": 186854288, + "router_z_loss_mlp": 0.09326172, + "step": 2244, + "time_per_iteration": 5.015843868255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043427, + "balance_loss_mlp": 1.03412855, + "epoch": 0.43189688341669874, + "flos": 1473697234944.0, + "grad_norm": 0.025217175998849217, + "language_loss": 0.77365553, + "learning_rate": 0.0006329669394160953, + "loss": 0.7840898, + "num_input_tokens_seen": 187090272, + "router_z_loss_mlp": 0.09277344, + "step": 2245, + "time_per_iteration": 4.923234939575195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118877, + "balance_loss_mlp": 1.08282828, + "epoch": 0.43208926510196227, + "flos": 492938141184.0, + "grad_norm": 0.05723795681410829, + "language_loss": 0.83027297, + "learning_rate": 0.0006326665895567652, + "loss": 0.84146178, + "num_input_tokens_seen": 187157584, + "router_z_loss_mlp": 0.3605957, + "step": 2246, + "time_per_iteration": 2.6065175533294678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112241, + "balance_loss_mlp": 1.08652771, + "epoch": 0.43228164678722586, + "flos": 520235799552.0, + "grad_norm": 0.06570844887047847, + "language_loss": 0.87358153, + "learning_rate": 0.0006323661881916976, + "loss": 0.88480568, + "num_input_tokens_seen": 187229408, + "router_z_loss_mlp": 0.35864258, + "step": 2247, + "time_per_iteration": 2.682924509048462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124487, + "balance_loss_mlp": 1.08996427, + "epoch": 0.4324740284724894, + "flos": 796056201216.0, + "grad_norm": 0.05864327339271887, + "language_loss": 0.8139447, + "learning_rate": 0.0006320657354375179, + "loss": 0.82518953, + "num_input_tokens_seen": 187304384, + "router_z_loss_mlp": 0.34521484, + "step": 2248, + "time_per_iteration": 2.9315433502197266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112819, + "balance_loss_mlp": 1.09125865, + "epoch": 0.432666410157753, + "flos": 482153140224.0, + "grad_norm": 0.05550733837968219, + "language_loss": 0.87244421, + "learning_rate": 0.0006317652314108726, + "loss": 0.88372612, + "num_input_tokens_seen": 187368064, + "router_z_loss_mlp": 0.36938477, + "step": 2249, + "time_per_iteration": 2.5357820987701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125911, + "balance_loss_mlp": 1.09186506, + "epoch": 0.43285879184301657, + "flos": 500212329984.0, + "grad_norm": 0.06944226399680122, + "language_loss": 0.91745955, + "learning_rate": 0.0006314646762284277, + "loss": 0.92871869, + "num_input_tokens_seen": 187436320, + "router_z_loss_mlp": 0.34057617, + "step": 2250, + "time_per_iteration": 2.650629997253418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010844, + "balance_loss_mlp": 1.00116396, + "epoch": 0.4330511735282801, + "flos": 1510448103936.0, + "grad_norm": 0.012503035455709091, + "language_loss": 0.75425828, + "learning_rate": 0.0006311640700068691, + "loss": 0.76436675, + "num_input_tokens_seen": 187670912, + "router_z_loss_mlp": 0.09667969, + "step": 2251, + "time_per_iteration": 4.895758867263794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118011, + "balance_loss_mlp": 1.08341658, + "epoch": 0.4332435552135437, + "flos": 699582915072.0, + "grad_norm": 0.05843208138947643, + "language_loss": 0.77784407, + "learning_rate": 0.0006308634128629022, + "loss": 0.78902417, + "num_input_tokens_seen": 187746432, + "router_z_loss_mlp": 0.34570312, + "step": 2252, + "time_per_iteration": 2.916623592376709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112901, + "balance_loss_mlp": 1.09289002, + "epoch": 0.4334359368988072, + "flos": 592292321280.0, + "grad_norm": 0.0729174620046303, + "language_loss": 0.87908506, + "learning_rate": 0.0006305627049132531, + "loss": 0.89037514, + "num_input_tokens_seen": 187820032, + "router_z_loss_mlp": 0.36132812, + "step": 2253, + "time_per_iteration": 2.741239070892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121412, + "balance_loss_mlp": 1.08660293, + "epoch": 0.4336283185840708, + "flos": 842806508544.0, + "grad_norm": 0.05583951255628595, + "language_loss": 0.8599245, + "learning_rate": 0.0006302619462746662, + "loss": 0.87113857, + "num_input_tokens_seen": 187904400, + "router_z_loss_mlp": 0.34814453, + "step": 2254, + "time_per_iteration": 3.1628546714782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123282, + "balance_loss_mlp": 1.08966494, + "epoch": 0.43382070026933434, + "flos": 626258843136.0, + "grad_norm": 0.05704174545577272, + "language_loss": 0.90291667, + "learning_rate": 0.0006299611370639069, + "loss": 0.91414952, + "num_input_tokens_seen": 187973264, + "router_z_loss_mlp": 0.33618164, + "step": 2255, + "time_per_iteration": 2.7106690406799316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125975, + "balance_loss_mlp": 1.09157157, + "epoch": 0.4340130819545979, + "flos": 591111406080.0, + "grad_norm": 0.06008787734976465, + "language_loss": 0.79589838, + "learning_rate": 0.0006296602773977593, + "loss": 0.80715805, + "num_input_tokens_seen": 188039984, + "router_z_loss_mlp": 0.34423828, + "step": 2256, + "time_per_iteration": 2.673064947128296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121887, + "balance_loss_mlp": 1.08652973, + "epoch": 0.4342054636398615, + "flos": 490889797632.0, + "grad_norm": 0.05906133720876415, + "language_loss": 0.87730187, + "learning_rate": 0.0006293593673930277, + "loss": 0.88852072, + "num_input_tokens_seen": 188113456, + "router_z_loss_mlp": 0.35400391, + "step": 2257, + "time_per_iteration": 2.6278131008148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115203, + "balance_loss_mlp": 1.08010745, + "epoch": 0.43439784532512504, + "flos": 698994842112.0, + "grad_norm": 0.07846710421999975, + "language_loss": 0.7888447, + "learning_rate": 0.0006290584071665358, + "loss": 0.79999673, + "num_input_tokens_seen": 188192480, + "router_z_loss_mlp": 0.35107422, + "step": 2258, + "time_per_iteration": 2.8708009719848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112425, + "balance_loss_mlp": 1.07709181, + "epoch": 0.43459022701038863, + "flos": 485824739328.0, + "grad_norm": 0.06520269446334741, + "language_loss": 0.82244253, + "learning_rate": 0.0006287573968351266, + "loss": 0.83356678, + "num_input_tokens_seen": 188258784, + "router_z_loss_mlp": 0.35351562, + "step": 2259, + "time_per_iteration": 2.5682222843170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113367, + "balance_loss_mlp": 1.07729471, + "epoch": 0.43478260869565216, + "flos": 643107382272.0, + "grad_norm": 0.07246583855612315, + "language_loss": 0.82777989, + "learning_rate": 0.0006284563365156626, + "loss": 0.83891356, + "num_input_tokens_seen": 188331312, + "router_z_loss_mlp": 0.3605957, + "step": 2260, + "time_per_iteration": 2.827087879180908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108747, + "balance_loss_mlp": 1.07148242, + "epoch": 0.43497499038091575, + "flos": 426097552896.0, + "grad_norm": 0.12125557864683041, + "language_loss": 0.87600839, + "learning_rate": 0.0006281552263250261, + "loss": 0.88709581, + "num_input_tokens_seen": 188393712, + "router_z_loss_mlp": 0.37255859, + "step": 2261, + "time_per_iteration": 2.479753017425537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01031263, + "balance_loss_mlp": 1.02072453, + "epoch": 0.4351673720661793, + "flos": 1538378625024.0, + "grad_norm": 0.029168664611412945, + "language_loss": 0.80691534, + "learning_rate": 0.000627854066380118, + "loss": 0.81722796, + "num_input_tokens_seen": 188621152, + "router_z_loss_mlp": 0.10546875, + "step": 2262, + "time_per_iteration": 4.812009334564209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106503, + "balance_loss_mlp": 1.07104969, + "epoch": 0.43535975375144287, + "flos": 749155018752.0, + "grad_norm": 0.06614620097740347, + "language_loss": 0.81361771, + "learning_rate": 0.0006275528567978593, + "loss": 0.82468277, + "num_input_tokens_seen": 188697120, + "router_z_loss_mlp": 0.35449219, + "step": 2263, + "time_per_iteration": 2.903029203414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115264, + "balance_loss_mlp": 1.07923913, + "epoch": 0.4355521354367064, + "flos": 861280874496.0, + "grad_norm": 0.07895665669601973, + "language_loss": 0.82951373, + "learning_rate": 0.0006272515976951898, + "loss": 0.84066635, + "num_input_tokens_seen": 188778480, + "router_z_loss_mlp": 0.3605957, + "step": 2264, + "time_per_iteration": 3.066096544265747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109411, + "balance_loss_mlp": 1.07300496, + "epoch": 0.43574451712197, + "flos": 734527719936.0, + "grad_norm": 0.06560373300441709, + "language_loss": 0.79299462, + "learning_rate": 0.0006269502891890687, + "loss": 0.80408877, + "num_input_tokens_seen": 188863616, + "router_z_loss_mlp": 0.36425781, + "step": 2265, + "time_per_iteration": 3.036302089691162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098467, + "balance_loss_mlp": 1.06504071, + "epoch": 0.4359368988072336, + "flos": 570578784768.0, + "grad_norm": 0.05296436812265497, + "language_loss": 0.88411891, + "learning_rate": 0.0006266489313964743, + "loss": 0.89510357, + "num_input_tokens_seen": 188933984, + "router_z_loss_mlp": 0.33447266, + "step": 2266, + "time_per_iteration": 2.766963481903076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105293, + "balance_loss_mlp": 1.06907725, + "epoch": 0.4361292804924971, + "flos": 555528969216.0, + "grad_norm": 0.057339134399699385, + "language_loss": 0.85443783, + "learning_rate": 0.0006263475244344041, + "loss": 0.86549073, + "num_input_tokens_seen": 189012976, + "router_z_loss_mlp": 0.36230469, + "step": 2267, + "time_per_iteration": 2.8397552967071533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104848, + "balance_loss_mlp": 1.0681076, + "epoch": 0.4363216621777607, + "flos": 557285847552.0, + "grad_norm": 0.06097162500725226, + "language_loss": 0.84725475, + "learning_rate": 0.0006260460684198746, + "loss": 0.85830331, + "num_input_tokens_seen": 189079664, + "router_z_loss_mlp": 0.36743164, + "step": 2268, + "time_per_iteration": 2.725037097930908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105446, + "balance_loss_mlp": 1.06901538, + "epoch": 0.4365140438630242, + "flos": 478222009344.0, + "grad_norm": 0.07238177879654556, + "language_loss": 0.84404624, + "learning_rate": 0.0006257445634699213, + "loss": 0.85510075, + "num_input_tokens_seen": 189144688, + "router_z_loss_mlp": 0.36425781, + "step": 2269, + "time_per_iteration": 2.623194456100464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105034, + "balance_loss_mlp": 1.06855631, + "epoch": 0.4367064255482878, + "flos": 578917891584.0, + "grad_norm": 0.060050482587473634, + "language_loss": 0.83212304, + "learning_rate": 0.0006254430097015993, + "loss": 0.84317344, + "num_input_tokens_seen": 189213984, + "router_z_loss_mlp": 0.36499023, + "step": 2270, + "time_per_iteration": 2.6570417881011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037679, + "balance_loss_mlp": 1.02752221, + "epoch": 0.43689880723355135, + "flos": 1458946225152.0, + "grad_norm": 0.021802814945167073, + "language_loss": 0.76479089, + "learning_rate": 0.0006251414072319815, + "loss": 0.7751677, + "num_input_tokens_seen": 189434416, + "router_z_loss_mlp": 0.1015625, + "step": 2271, + "time_per_iteration": 4.800662517547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109594, + "balance_loss_mlp": 1.07299662, + "epoch": 0.43709118891881493, + "flos": 667610408448.0, + "grad_norm": 0.08079345415457889, + "language_loss": 0.85730046, + "learning_rate": 0.0006248397561781609, + "loss": 0.8683964, + "num_input_tokens_seen": 189513248, + "router_z_loss_mlp": 0.3659668, + "step": 2272, + "time_per_iteration": 2.879779815673828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110506, + "balance_loss_mlp": 1.07312167, + "epoch": 0.43728357060407846, + "flos": 544872448512.0, + "grad_norm": 0.06456885574264018, + "language_loss": 0.86308181, + "learning_rate": 0.0006245380566572482, + "loss": 0.87418681, + "num_input_tokens_seen": 189585392, + "router_z_loss_mlp": 0.37402344, + "step": 2273, + "time_per_iteration": 2.671515703201294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108969, + "balance_loss_mlp": 1.07227635, + "epoch": 0.43747595228934205, + "flos": 746839802880.0, + "grad_norm": 0.07977356034675265, + "language_loss": 0.76295209, + "learning_rate": 0.0006242363087863744, + "loss": 0.77404177, + "num_input_tokens_seen": 189667552, + "router_z_loss_mlp": 0.36669922, + "step": 2274, + "time_per_iteration": 3.0036468505859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110424, + "balance_loss_mlp": 1.07430363, + "epoch": 0.43766833397460564, + "flos": 631353636864.0, + "grad_norm": 0.06387432282930158, + "language_loss": 0.86488557, + "learning_rate": 0.0006239345126826878, + "loss": 0.87598979, + "num_input_tokens_seen": 189742048, + "router_z_loss_mlp": 0.36132812, + "step": 2275, + "time_per_iteration": 2.8046963214874268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113113, + "balance_loss_mlp": 1.07551455, + "epoch": 0.43786071565986917, + "flos": 530986295808.0, + "grad_norm": 0.06304446482372832, + "language_loss": 0.84217036, + "learning_rate": 0.0006236326684633561, + "loss": 0.85330147, + "num_input_tokens_seen": 189817968, + "router_z_loss_mlp": 0.37597656, + "step": 2276, + "time_per_iteration": 2.8238136768341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113871, + "balance_loss_mlp": 1.07725024, + "epoch": 0.43805309734513276, + "flos": 538547180544.0, + "grad_norm": 0.07202298424456109, + "language_loss": 0.75335848, + "learning_rate": 0.0006233307762455658, + "loss": 0.76449716, + "num_input_tokens_seen": 189882608, + "router_z_loss_mlp": 0.36645508, + "step": 2277, + "time_per_iteration": 2.6191978454589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121381, + "balance_loss_mlp": 1.08576083, + "epoch": 0.4382454790303963, + "flos": 864542439936.0, + "grad_norm": 0.053405108271766075, + "language_loss": 0.8389169, + "learning_rate": 0.0006230288361465216, + "loss": 0.85013068, + "num_input_tokens_seen": 189960608, + "router_z_loss_mlp": 0.35644531, + "step": 2278, + "time_per_iteration": 3.0405595302581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113147, + "balance_loss_mlp": 1.09399056, + "epoch": 0.4384378607156599, + "flos": 765499548672.0, + "grad_norm": 0.06317085407877503, + "language_loss": 0.85187429, + "learning_rate": 0.0006227268482834473, + "loss": 0.86318898, + "num_input_tokens_seen": 190035472, + "router_z_loss_mlp": 0.37475586, + "step": 2279, + "time_per_iteration": 2.884791135787964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140517, + "balance_loss_mlp": 1.10272789, + "epoch": 0.4386302424009234, + "flos": 668566669824.0, + "grad_norm": 0.08374351035766264, + "language_loss": 0.87551039, + "learning_rate": 0.000622424812773585, + "loss": 0.88691556, + "num_input_tokens_seen": 190109312, + "router_z_loss_mlp": 0.37768555, + "step": 2280, + "time_per_iteration": 2.790846824645996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129266, + "balance_loss_mlp": 1.09150028, + "epoch": 0.438822624086187, + "flos": 485182338048.0, + "grad_norm": 0.07881944372222376, + "language_loss": 0.79747838, + "learning_rate": 0.000622122729734195, + "loss": 0.80877101, + "num_input_tokens_seen": 190174176, + "router_z_loss_mlp": 0.37744141, + "step": 2281, + "time_per_iteration": 2.5392401218414307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130236, + "balance_loss_mlp": 1.09404397, + "epoch": 0.4390150057714506, + "flos": 499218992640.0, + "grad_norm": 0.06512890224106707, + "language_loss": 0.87574816, + "learning_rate": 0.0006218205992825566, + "loss": 0.88705051, + "num_input_tokens_seen": 190243888, + "router_z_loss_mlp": 0.36206055, + "step": 2282, + "time_per_iteration": 2.6409003734588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130264, + "balance_loss_mlp": 1.09304714, + "epoch": 0.4392073874567141, + "flos": 558219714048.0, + "grad_norm": 0.058092029820517505, + "language_loss": 0.82094592, + "learning_rate": 0.0006215184215359671, + "loss": 0.83224851, + "num_input_tokens_seen": 190317504, + "router_z_loss_mlp": 0.37207031, + "step": 2283, + "time_per_iteration": 2.798405647277832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112171, + "balance_loss_mlp": 1.08506513, + "epoch": 0.4393997691419777, + "flos": 605306276352.0, + "grad_norm": 0.06799742884418125, + "language_loss": 0.86864793, + "learning_rate": 0.0006212161966117425, + "loss": 0.87986505, + "num_input_tokens_seen": 190390160, + "router_z_loss_mlp": 0.36669922, + "step": 2284, + "time_per_iteration": 2.7305543422698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120327, + "balance_loss_mlp": 1.0823704, + "epoch": 0.43959215082724123, + "flos": 804145688064.0, + "grad_norm": 0.0718064317498989, + "language_loss": 0.81899178, + "learning_rate": 0.0006209139246272164, + "loss": 0.83019507, + "num_input_tokens_seen": 190467600, + "router_z_loss_mlp": 0.37939453, + "step": 2285, + "time_per_iteration": 2.9496707916259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114672, + "balance_loss_mlp": 1.07569027, + "epoch": 0.4397845325125048, + "flos": 487643286528.0, + "grad_norm": 0.0666339573323591, + "language_loss": 0.81558084, + "learning_rate": 0.0006206116056997421, + "loss": 0.82672757, + "num_input_tokens_seen": 190534192, + "router_z_loss_mlp": 0.38964844, + "step": 2286, + "time_per_iteration": 2.56559681892395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112208, + "balance_loss_mlp": 1.08414793, + "epoch": 0.43997691419776835, + "flos": 480811438080.0, + "grad_norm": 0.07939984369379535, + "language_loss": 0.82495737, + "learning_rate": 0.0006203092399466892, + "loss": 0.83617818, + "num_input_tokens_seen": 190601440, + "router_z_loss_mlp": 0.37915039, + "step": 2287, + "time_per_iteration": 2.614211082458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119992, + "balance_loss_mlp": 1.08368051, + "epoch": 0.44016929588303194, + "flos": 483124082688.0, + "grad_norm": 0.05953237575059506, + "language_loss": 0.85318255, + "learning_rate": 0.0006200068274854473, + "loss": 0.86438239, + "num_input_tokens_seen": 190672528, + "router_z_loss_mlp": 0.36303711, + "step": 2288, + "time_per_iteration": 2.6718688011169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123012, + "balance_loss_mlp": 1.08679628, + "epoch": 0.4403616775682955, + "flos": 571853675520.0, + "grad_norm": 0.0828196201385275, + "language_loss": 0.86406159, + "learning_rate": 0.0006197043684334229, + "loss": 0.87529171, + "num_input_tokens_seen": 190750704, + "router_z_loss_mlp": 0.36230469, + "step": 2289, + "time_per_iteration": 2.7540907859802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128468, + "balance_loss_mlp": 1.09158421, + "epoch": 0.44055405925355906, + "flos": 630849627648.0, + "grad_norm": 0.11266642339430595, + "language_loss": 0.79650962, + "learning_rate": 0.0006194018629080411, + "loss": 0.80779433, + "num_input_tokens_seen": 190821664, + "router_z_loss_mlp": 0.36865234, + "step": 2290, + "time_per_iteration": 2.7200653553009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127999, + "balance_loss_mlp": 1.09099627, + "epoch": 0.44074644093882265, + "flos": 536782961664.0, + "grad_norm": 0.0658560511601545, + "language_loss": 0.81793892, + "learning_rate": 0.0006190993110267451, + "loss": 0.82921886, + "num_input_tokens_seen": 190893888, + "router_z_loss_mlp": 0.36987305, + "step": 2291, + "time_per_iteration": 2.709512233734131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130933, + "balance_loss_mlp": 1.09311938, + "epoch": 0.4409388226240862, + "flos": 463229093376.0, + "grad_norm": 0.0787223425712205, + "language_loss": 0.84518313, + "learning_rate": 0.0006187967129069958, + "loss": 0.85649246, + "num_input_tokens_seen": 190956800, + "router_z_loss_mlp": 0.37792969, + "step": 2292, + "time_per_iteration": 2.4924299716949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124784, + "balance_loss_mlp": 1.08935523, + "epoch": 0.44113120430934977, + "flos": 566005252608.0, + "grad_norm": 0.07162475848736369, + "language_loss": 0.87490463, + "learning_rate": 0.0006184940686662722, + "loss": 0.88615251, + "num_input_tokens_seen": 191032048, + "router_z_loss_mlp": 0.35449219, + "step": 2293, + "time_per_iteration": 2.751055955886841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119268, + "balance_loss_mlp": 1.08445859, + "epoch": 0.4413235859946133, + "flos": 543585074688.0, + "grad_norm": 0.06340812224100711, + "language_loss": 0.9041853, + "learning_rate": 0.0006181913784220714, + "loss": 0.91537791, + "num_input_tokens_seen": 191099952, + "router_z_loss_mlp": 0.34838867, + "step": 2294, + "time_per_iteration": 2.64821457862854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037423, + "balance_loss_mlp": 1.0290786, + "epoch": 0.4415159676798769, + "flos": 1569871342080.0, + "grad_norm": 0.025861242717412188, + "language_loss": 0.80553782, + "learning_rate": 0.0006178886422919078, + "loss": 0.81591213, + "num_input_tokens_seen": 191335968, + "router_z_loss_mlp": 0.08349609, + "step": 2295, + "time_per_iteration": 4.885660171508789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119684, + "balance_loss_mlp": 1.08537531, + "epoch": 0.4417083493651404, + "flos": 658740128256.0, + "grad_norm": 0.10155164806079009, + "language_loss": 0.80041152, + "learning_rate": 0.0006175858603933146, + "loss": 0.81160837, + "num_input_tokens_seen": 191410112, + "router_z_loss_mlp": 0.34326172, + "step": 2296, + "time_per_iteration": 2.881615400314331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129433, + "balance_loss_mlp": 1.09393275, + "epoch": 0.441900731050404, + "flos": 740457635328.0, + "grad_norm": 0.0685445546464461, + "language_loss": 0.81208229, + "learning_rate": 0.0006172830328438416, + "loss": 0.82337666, + "num_input_tokens_seen": 191491552, + "router_z_loss_mlp": 0.35498047, + "step": 2297, + "time_per_iteration": 2.940401315689087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123665, + "balance_loss_mlp": 1.08680558, + "epoch": 0.44209311273566754, + "flos": 539441399808.0, + "grad_norm": 0.09103818832157724, + "language_loss": 0.87286425, + "learning_rate": 0.0006169801597610572, + "loss": 0.88410091, + "num_input_tokens_seen": 191567872, + "router_z_loss_mlp": 0.36889648, + "step": 2298, + "time_per_iteration": 2.7739408016204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115616, + "balance_loss_mlp": 1.08195138, + "epoch": 0.4422854944209311, + "flos": 621613730304.0, + "grad_norm": 0.1052787532551667, + "language_loss": 0.9040001, + "learning_rate": 0.0006166772412625469, + "loss": 0.91515625, + "num_input_tokens_seen": 191638032, + "router_z_loss_mlp": 0.33666992, + "step": 2299, + "time_per_iteration": 2.734384298324585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112173, + "balance_loss_mlp": 1.07710147, + "epoch": 0.4424778761061947, + "flos": 658824192000.0, + "grad_norm": 0.07592361192988976, + "language_loss": 0.81779516, + "learning_rate": 0.0006163742774659141, + "loss": 0.82891691, + "num_input_tokens_seen": 191709104, + "router_z_loss_mlp": 0.35107422, + "step": 2300, + "time_per_iteration": 2.8436357975006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107602, + "balance_loss_mlp": 1.07410455, + "epoch": 0.44267025779145824, + "flos": 568577428992.0, + "grad_norm": 0.0790889900730028, + "language_loss": 0.86033177, + "learning_rate": 0.0006160712684887801, + "loss": 0.87140775, + "num_input_tokens_seen": 191787072, + "router_z_loss_mlp": 0.33496094, + "step": 2301, + "time_per_iteration": 2.816479206085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104306, + "balance_loss_mlp": 1.07118952, + "epoch": 0.44286263947672183, + "flos": 496738220544.0, + "grad_norm": 0.0554513610730849, + "language_loss": 0.82599401, + "learning_rate": 0.0006157682144487832, + "loss": 0.83703709, + "num_input_tokens_seen": 191863040, + "router_z_loss_mlp": 0.33129883, + "step": 2302, + "time_per_iteration": 2.7371127605438232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112573, + "balance_loss_mlp": 1.07776368, + "epoch": 0.44305502116198536, + "flos": 609397820928.0, + "grad_norm": 0.08617173815320239, + "language_loss": 0.83484352, + "learning_rate": 0.0006154651154635793, + "loss": 0.84596926, + "num_input_tokens_seen": 191940352, + "router_z_loss_mlp": 0.34838867, + "step": 2303, + "time_per_iteration": 2.822388172149658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122213, + "balance_loss_mlp": 1.08664048, + "epoch": 0.44324740284724895, + "flos": 470794747392.0, + "grad_norm": 0.06891313471916412, + "language_loss": 0.85087454, + "learning_rate": 0.0006151619716508421, + "loss": 0.86209667, + "num_input_tokens_seen": 192006896, + "router_z_loss_mlp": 0.35571289, + "step": 2304, + "time_per_iteration": 2.5669307708740234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113601, + "balance_loss_mlp": 1.07905424, + "epoch": 0.4434397845325125, + "flos": 578725171200.0, + "grad_norm": 0.0676174746334525, + "language_loss": 0.87354678, + "learning_rate": 0.0006148587831282625, + "loss": 0.88468277, + "num_input_tokens_seen": 192075312, + "router_z_loss_mlp": 0.34545898, + "step": 2305, + "time_per_iteration": 2.7296478748321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042108, + "balance_loss_mlp": 1.03257155, + "epoch": 0.44363216621777607, + "flos": 1496608939008.0, + "grad_norm": 0.03035679683037383, + "language_loss": 0.79176068, + "learning_rate": 0.0006145555500135483, + "loss": 0.80218178, + "num_input_tokens_seen": 192304816, + "router_z_loss_mlp": 0.09521484, + "step": 2306, + "time_per_iteration": 4.932115077972412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132739, + "balance_loss_mlp": 1.09490204, + "epoch": 0.44382454790303966, + "flos": 477322647552.0, + "grad_norm": 0.0708853860960667, + "language_loss": 0.87972111, + "learning_rate": 0.0006142522724244255, + "loss": 0.89104849, + "num_input_tokens_seen": 192369232, + "router_z_loss_mlp": 0.37817383, + "step": 2307, + "time_per_iteration": 2.5106770992279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027391, + "balance_loss_mlp": 1.01785433, + "epoch": 0.4440169295883032, + "flos": 1544115820032.0, + "grad_norm": 0.02287011405410123, + "language_loss": 0.76484716, + "learning_rate": 0.0006139489504786368, + "loss": 0.77512109, + "num_input_tokens_seen": 192600176, + "router_z_loss_mlp": 0.09521484, + "step": 2308, + "time_per_iteration": 4.842617034912109 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120011, + "balance_loss_mlp": 1.08405757, + "epoch": 0.4442093112735668, + "flos": 591089011200.0, + "grad_norm": 0.07624843376245131, + "language_loss": 0.77539825, + "learning_rate": 0.000613645584293942, + "loss": 0.78659838, + "num_input_tokens_seen": 192675424, + "router_z_loss_mlp": 0.35986328, + "step": 2309, + "time_per_iteration": 2.8661446571350098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125005, + "balance_loss_mlp": 1.08933806, + "epoch": 0.4444016929588303, + "flos": 530272313856.0, + "grad_norm": 0.0700550632478262, + "language_loss": 0.83505249, + "learning_rate": 0.0006133421739881185, + "loss": 0.84630251, + "num_input_tokens_seen": 192747552, + "router_z_loss_mlp": 0.35693359, + "step": 2310, + "time_per_iteration": 2.6644127368927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118668, + "balance_loss_mlp": 1.08319092, + "epoch": 0.4445940746440939, + "flos": 620234952192.0, + "grad_norm": 0.11928760190169391, + "language_loss": 0.83116257, + "learning_rate": 0.0006130387196789605, + "loss": 0.84234929, + "num_input_tokens_seen": 192819984, + "router_z_loss_mlp": 0.35522461, + "step": 2311, + "time_per_iteration": 2.7157018184661865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111828, + "balance_loss_mlp": 1.07699549, + "epoch": 0.4447864563293574, + "flos": 629100089856.0, + "grad_norm": 0.05741887786628051, + "language_loss": 0.84819949, + "learning_rate": 0.0006127352214842795, + "loss": 0.85931778, + "num_input_tokens_seen": 192906080, + "router_z_loss_mlp": 0.34838867, + "step": 2312, + "time_per_iteration": 2.9459052085876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118701, + "balance_loss_mlp": 1.08293796, + "epoch": 0.444978838014621, + "flos": 650838592512.0, + "grad_norm": 0.07350143541661519, + "language_loss": 0.85691726, + "learning_rate": 0.0006124316795219041, + "loss": 0.86810434, + "num_input_tokens_seen": 192972336, + "router_z_loss_mlp": 0.35742188, + "step": 2313, + "time_per_iteration": 2.772299289703369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131037, + "balance_loss_mlp": 1.0956552, + "epoch": 0.44517121969988455, + "flos": 612439501824.0, + "grad_norm": 0.06263706285199609, + "language_loss": 0.82505524, + "learning_rate": 0.0006121280939096794, + "loss": 0.83636558, + "num_input_tokens_seen": 193045744, + "router_z_loss_mlp": 0.35424805, + "step": 2314, + "time_per_iteration": 2.7951674461364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114668, + "balance_loss_mlp": 1.11020195, + "epoch": 0.44536360138514813, + "flos": 488735368704.0, + "grad_norm": 0.0720052818844606, + "language_loss": 0.88360798, + "learning_rate": 0.000611824464765468, + "loss": 0.89507478, + "num_input_tokens_seen": 193115248, + "router_z_loss_mlp": 0.36499023, + "step": 2315, + "time_per_iteration": 2.5895602703094482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067507, + "balance_loss_mlp": 1.05682635, + "epoch": 0.4455559830704117, + "flos": 1516148969472.0, + "grad_norm": 0.0344692196546668, + "language_loss": 0.78594941, + "learning_rate": 0.0006115207922071492, + "loss": 0.79662448, + "num_input_tokens_seen": 193330816, + "router_z_loss_mlp": 0.10693359, + "step": 2316, + "time_per_iteration": 4.6560447216033936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137929, + "balance_loss_mlp": 1.1022377, + "epoch": 0.44574836475567525, + "flos": 615614432256.0, + "grad_norm": 0.06826351361083724, + "language_loss": 0.85665047, + "learning_rate": 0.000611217076352619, + "loss": 0.86802971, + "num_input_tokens_seen": 193407616, + "router_z_loss_mlp": 0.35693359, + "step": 2317, + "time_per_iteration": 2.7965078353881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132041, + "balance_loss_mlp": 1.09835279, + "epoch": 0.44594074644093884, + "flos": 506342306304.0, + "grad_norm": 0.06652231411845559, + "language_loss": 0.83542907, + "learning_rate": 0.0006109133173197905, + "loss": 0.84674948, + "num_input_tokens_seen": 193482624, + "router_z_loss_mlp": 0.33691406, + "step": 2318, + "time_per_iteration": 2.678832769393921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124047, + "balance_loss_mlp": 1.08897519, + "epoch": 0.44613312812620237, + "flos": 726979318272.0, + "grad_norm": 0.06811942389724822, + "language_loss": 0.85992062, + "learning_rate": 0.0006106095152265935, + "loss": 0.8711611, + "num_input_tokens_seen": 193555952, + "router_z_loss_mlp": 0.35107422, + "step": 2319, + "time_per_iteration": 2.9018518924713135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111639, + "balance_loss_mlp": 1.08060324, + "epoch": 0.44632550981146596, + "flos": 635746558464.0, + "grad_norm": 0.06308491230142964, + "language_loss": 0.85126555, + "learning_rate": 0.0006103056701909739, + "loss": 0.8624295, + "num_input_tokens_seen": 193636672, + "router_z_loss_mlp": 0.3581543, + "step": 2320, + "time_per_iteration": 2.927619218826294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111434, + "balance_loss_mlp": 1.07869673, + "epoch": 0.4465178914967295, + "flos": 827074644480.0, + "grad_norm": 0.08034132862269446, + "language_loss": 0.83192152, + "learning_rate": 0.0006100017823308956, + "loss": 0.8430649, + "num_input_tokens_seen": 193721728, + "router_z_loss_mlp": 0.35644531, + "step": 2321, + "time_per_iteration": 3.1759355068206787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111575, + "balance_loss_mlp": 1.07645655, + "epoch": 0.4467102731819931, + "flos": 665831508480.0, + "grad_norm": 0.0688182521177716, + "language_loss": 0.79684091, + "learning_rate": 0.0006096978517643377, + "loss": 0.8079567, + "num_input_tokens_seen": 193795456, + "router_z_loss_mlp": 0.35131836, + "step": 2322, + "time_per_iteration": 2.791020154953003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109472, + "balance_loss_mlp": 1.07337499, + "epoch": 0.4469026548672566, + "flos": 512946929664.0, + "grad_norm": 0.08831218810897808, + "language_loss": 0.83671057, + "learning_rate": 0.0006093938786092968, + "loss": 0.84780538, + "num_input_tokens_seen": 193865520, + "router_z_loss_mlp": 0.36108398, + "step": 2323, + "time_per_iteration": 2.614248037338257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107948, + "balance_loss_mlp": 1.0734967, + "epoch": 0.4470950365525202, + "flos": 684076078080.0, + "grad_norm": 0.06554008035854059, + "language_loss": 0.90401232, + "learning_rate": 0.0006090898629837857, + "loss": 0.91509175, + "num_input_tokens_seen": 193935040, + "router_z_loss_mlp": 0.34448242, + "step": 2324, + "time_per_iteration": 2.7988476753234863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114598, + "balance_loss_mlp": 1.07950234, + "epoch": 0.4472874182377838, + "flos": 627321189888.0, + "grad_norm": 0.05596676685861875, + "language_loss": 0.87779921, + "learning_rate": 0.0006087858050058337, + "loss": 0.88894522, + "num_input_tokens_seen": 194009120, + "router_z_loss_mlp": 0.35083008, + "step": 2325, + "time_per_iteration": 2.8598742485046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106952, + "balance_loss_mlp": 1.07309675, + "epoch": 0.4474797999230473, + "flos": 547204916736.0, + "grad_norm": 0.08404177014968839, + "language_loss": 0.82489681, + "learning_rate": 0.0006084817047934866, + "loss": 0.83596623, + "num_input_tokens_seen": 194076672, + "router_z_loss_mlp": 0.33886719, + "step": 2326, + "time_per_iteration": 2.6458888053894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112181, + "balance_loss_mlp": 1.07780075, + "epoch": 0.4476721816083109, + "flos": 455819083776.0, + "grad_norm": 0.07155239810176077, + "language_loss": 0.89966661, + "learning_rate": 0.0006081775624648066, + "loss": 0.91078842, + "num_input_tokens_seen": 194142320, + "router_z_loss_mlp": 0.34399414, + "step": 2327, + "time_per_iteration": 2.580366373062134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120962, + "balance_loss_mlp": 1.08689189, + "epoch": 0.44786456329357444, + "flos": 481518079488.0, + "grad_norm": 0.06301539333332261, + "language_loss": 0.83119273, + "learning_rate": 0.0006078733781378721, + "loss": 0.8424024, + "num_input_tokens_seen": 194208560, + "router_z_loss_mlp": 0.34082031, + "step": 2328, + "time_per_iteration": 2.54127836227417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110302, + "balance_loss_mlp": 1.07594562, + "epoch": 0.448056944978838, + "flos": 552104418816.0, + "grad_norm": 0.057204005558127505, + "language_loss": 0.82213807, + "learning_rate": 0.0006075691519307781, + "loss": 0.83324105, + "num_input_tokens_seen": 194288080, + "router_z_loss_mlp": 0.34375, + "step": 2329, + "time_per_iteration": 2.8602964878082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117341, + "balance_loss_mlp": 1.08193612, + "epoch": 0.44824932666410156, + "flos": 550839439872.0, + "grad_norm": 0.055534005363494426, + "language_loss": 0.81606597, + "learning_rate": 0.0006072648839616356, + "loss": 0.82723939, + "num_input_tokens_seen": 194358464, + "router_z_loss_mlp": 0.35400391, + "step": 2330, + "time_per_iteration": 2.662810802459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119722, + "balance_loss_mlp": 1.08565211, + "epoch": 0.44844170834936514, + "flos": 988582454784.0, + "grad_norm": 0.050779766652796585, + "language_loss": 0.82901573, + "learning_rate": 0.0006069605743485718, + "loss": 0.84021294, + "num_input_tokens_seen": 194456112, + "router_z_loss_mlp": 0.34057617, + "step": 2331, + "time_per_iteration": 3.3678483963012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128598, + "balance_loss_mlp": 1.0950762, + "epoch": 0.44863409003462873, + "flos": 591321378816.0, + "grad_norm": 0.04918059080846435, + "language_loss": 0.83280981, + "learning_rate": 0.0006066562232097303, + "loss": 0.84409571, + "num_input_tokens_seen": 194526880, + "router_z_loss_mlp": 0.33544922, + "step": 2332, + "time_per_iteration": 2.7449440956115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123187, + "balance_loss_mlp": 1.08785367, + "epoch": 0.44882647171989226, + "flos": 724646850048.0, + "grad_norm": 0.052836841401222294, + "language_loss": 0.86161315, + "learning_rate": 0.0006063518306632708, + "loss": 0.87284505, + "num_input_tokens_seen": 194606800, + "router_z_loss_mlp": 0.35375977, + "step": 2333, + "time_per_iteration": 2.9690473079681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127444, + "balance_loss_mlp": 1.09220576, + "epoch": 0.44901885340515585, + "flos": 534927338496.0, + "grad_norm": 0.06707958703687776, + "language_loss": 0.82286978, + "learning_rate": 0.0006060473968273688, + "loss": 0.83414423, + "num_input_tokens_seen": 194679856, + "router_z_loss_mlp": 0.35229492, + "step": 2334, + "time_per_iteration": 2.665539026260376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142277, + "balance_loss_mlp": 1.13331211, + "epoch": 0.4492112350904194, + "flos": 1555300942848.0, + "grad_norm": 0.036352477885187424, + "language_loss": 0.77879542, + "learning_rate": 0.000605742921820216, + "loss": 0.79021817, + "num_input_tokens_seen": 194906320, + "router_z_loss_mlp": 0.08984375, + "step": 2335, + "time_per_iteration": 4.888899326324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115384, + "balance_loss_mlp": 1.10641909, + "epoch": 0.44940361677568297, + "flos": 1523358171648.0, + "grad_norm": 0.027581232823365703, + "language_loss": 0.81005216, + "learning_rate": 0.0006054384057600202, + "loss": 0.82120597, + "num_input_tokens_seen": 195129152, + "router_z_loss_mlp": 0.08984375, + "step": 2336, + "time_per_iteration": 4.835580348968506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126219, + "balance_loss_mlp": 1.09155297, + "epoch": 0.4495959984609465, + "flos": 382495011840.0, + "grad_norm": 0.06484007747623576, + "language_loss": 0.88115394, + "learning_rate": 0.0006051338487650047, + "loss": 0.89241612, + "num_input_tokens_seen": 195189792, + "router_z_loss_mlp": 0.34667969, + "step": 2337, + "time_per_iteration": 2.4327657222747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125299, + "balance_loss_mlp": 1.08846319, + "epoch": 0.4497883801462101, + "flos": 497879861760.0, + "grad_norm": 0.06762371666749806, + "language_loss": 0.82857472, + "learning_rate": 0.0006048292509534095, + "loss": 0.83982766, + "num_input_tokens_seen": 195258640, + "router_z_loss_mlp": 0.3684082, + "step": 2338, + "time_per_iteration": 2.583315372467041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122898, + "balance_loss_mlp": 1.08851767, + "epoch": 0.4499807618314736, + "flos": 614450769408.0, + "grad_norm": 0.06288042140328122, + "language_loss": 0.78114402, + "learning_rate": 0.0006045246124434895, + "loss": 0.792373, + "num_input_tokens_seen": 195327984, + "router_z_loss_mlp": 0.34350586, + "step": 2339, + "time_per_iteration": 2.718944787979126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111671, + "balance_loss_mlp": 1.08223438, + "epoch": 0.4501731435167372, + "flos": 1005510288384.0, + "grad_norm": 0.06455240115792976, + "language_loss": 0.86995041, + "learning_rate": 0.0006042199333535162, + "loss": 0.88111752, + "num_input_tokens_seen": 195409504, + "router_z_loss_mlp": 0.3449707, + "step": 2340, + "time_per_iteration": 3.280731439590454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120556, + "balance_loss_mlp": 1.08803582, + "epoch": 0.4503655252020008, + "flos": 820880428032.0, + "grad_norm": 0.06119421780994794, + "language_loss": 0.83960807, + "learning_rate": 0.0006039152138017763, + "loss": 0.85081363, + "num_input_tokens_seen": 195489424, + "router_z_loss_mlp": 0.32519531, + "step": 2341, + "time_per_iteration": 3.042808771133423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116997, + "balance_loss_mlp": 1.08285511, + "epoch": 0.4505579068872643, + "flos": 486373165056.0, + "grad_norm": 0.06181422787629511, + "language_loss": 0.83835328, + "learning_rate": 0.0006036104539065726, + "loss": 0.84952325, + "num_input_tokens_seen": 195562128, + "router_z_loss_mlp": 0.34155273, + "step": 2342, + "time_per_iteration": 2.671872138977051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117893, + "balance_loss_mlp": 1.08208227, + "epoch": 0.4507502885725279, + "flos": 884803046400.0, + "grad_norm": 0.05413998463628708, + "language_loss": 0.84596831, + "learning_rate": 0.000603305653786223, + "loss": 0.85714728, + "num_input_tokens_seen": 195646800, + "router_z_loss_mlp": 0.3581543, + "step": 2343, + "time_per_iteration": 3.153627395629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116847, + "balance_loss_mlp": 1.08182287, + "epoch": 0.45094267025779144, + "flos": 578339730432.0, + "grad_norm": 0.06019885466307642, + "language_loss": 0.84242773, + "learning_rate": 0.0006030008135590622, + "loss": 0.85359621, + "num_input_tokens_seen": 195719648, + "router_z_loss_mlp": 0.35058594, + "step": 2344, + "time_per_iteration": 2.724281072616577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109974, + "balance_loss_mlp": 1.07564187, + "epoch": 0.45113505194305503, + "flos": 525387492864.0, + "grad_norm": 0.06173385406680834, + "language_loss": 0.80783409, + "learning_rate": 0.0006026959333434387, + "loss": 0.81893378, + "num_input_tokens_seen": 195794800, + "router_z_loss_mlp": 0.34350586, + "step": 2345, + "time_per_iteration": 2.7752277851104736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107914, + "balance_loss_mlp": 1.07336736, + "epoch": 0.45132743362831856, + "flos": 502055470080.0, + "grad_norm": 0.04677974400708639, + "language_loss": 0.77811158, + "learning_rate": 0.0006023910132577181, + "loss": 0.78919077, + "num_input_tokens_seen": 195866848, + "router_z_loss_mlp": 0.34545898, + "step": 2346, + "time_per_iteration": 2.663447141647339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101638, + "balance_loss_mlp": 1.06802082, + "epoch": 0.45151981531358215, + "flos": 431918811648.0, + "grad_norm": 0.060558646022808645, + "language_loss": 0.85310882, + "learning_rate": 0.0006020860534202806, + "loss": 0.86412525, + "num_input_tokens_seen": 195930640, + "router_z_loss_mlp": 0.33618164, + "step": 2347, + "time_per_iteration": 2.480811595916748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108815, + "balance_loss_mlp": 1.07388651, + "epoch": 0.4517121969988457, + "flos": 712159299072.0, + "grad_norm": 0.06606096221098971, + "language_loss": 0.81316173, + "learning_rate": 0.0006017810539495224, + "loss": 0.82424992, + "num_input_tokens_seen": 196014240, + "router_z_loss_mlp": 0.34960938, + "step": 2348, + "time_per_iteration": 2.9476070404052734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098582, + "balance_loss_mlp": 1.06415427, + "epoch": 0.45190457868410927, + "flos": 579468888576.0, + "grad_norm": 0.0571113923067653, + "language_loss": 0.82774842, + "learning_rate": 0.0006014760149638547, + "loss": 0.83873427, + "num_input_tokens_seen": 196083296, + "router_z_loss_mlp": 0.34423828, + "step": 2349, + "time_per_iteration": 2.6655263900756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103718, + "balance_loss_mlp": 1.07005334, + "epoch": 0.45209696036937286, + "flos": 482657149440.0, + "grad_norm": 0.06475243948679671, + "language_loss": 0.88831103, + "learning_rate": 0.000601170936581704, + "loss": 0.89934826, + "num_input_tokens_seen": 196147840, + "router_z_loss_mlp": 0.33666992, + "step": 2350, + "time_per_iteration": 2.5269417762756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097788, + "balance_loss_mlp": 1.06343222, + "epoch": 0.4522893420546364, + "flos": 540207512064.0, + "grad_norm": 0.06432650174878703, + "language_loss": 0.84562814, + "learning_rate": 0.0006008658189215121, + "loss": 0.85660601, + "num_input_tokens_seen": 196219008, + "router_z_loss_mlp": 0.34399414, + "step": 2351, + "time_per_iteration": 2.621596097946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110179, + "balance_loss_mlp": 1.07267594, + "epoch": 0.4524817237399, + "flos": 496676551680.0, + "grad_norm": 0.3016755485520666, + "language_loss": 0.8046757, + "learning_rate": 0.0006005606621017366, + "loss": 0.81577748, + "num_input_tokens_seen": 196287792, + "router_z_loss_mlp": 0.375, + "step": 2352, + "time_per_iteration": 2.561138153076172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111286, + "balance_loss_mlp": 1.07564211, + "epoch": 0.4526741054251635, + "flos": 652550681088.0, + "grad_norm": 0.055264843638134026, + "language_loss": 0.80770934, + "learning_rate": 0.0006002554662408496, + "loss": 0.81882215, + "num_input_tokens_seen": 196371776, + "router_z_loss_mlp": 0.35644531, + "step": 2353, + "time_per_iteration": 2.87947940826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118454, + "balance_loss_mlp": 1.08180928, + "epoch": 0.4528664871104271, + "flos": 570939632640.0, + "grad_norm": 0.06003231312298175, + "language_loss": 0.91710508, + "learning_rate": 0.0005999502314573388, + "loss": 0.92828965, + "num_input_tokens_seen": 196441840, + "router_z_loss_mlp": 0.36645508, + "step": 2354, + "time_per_iteration": 2.703589916229248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127767, + "balance_loss_mlp": 1.09119391, + "epoch": 0.45305886879569063, + "flos": 458719801344.0, + "grad_norm": 0.06522748471040672, + "language_loss": 0.86741221, + "learning_rate": 0.0005996449578697066, + "loss": 0.87868989, + "num_input_tokens_seen": 196510464, + "router_z_loss_mlp": 0.36547852, + "step": 2355, + "time_per_iteration": 2.6407227516174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114254, + "balance_loss_mlp": 1.10627651, + "epoch": 0.4532512504809542, + "flos": 505178643456.0, + "grad_norm": 0.05645244306136207, + "language_loss": 0.81587362, + "learning_rate": 0.0005993396455964709, + "loss": 0.827299, + "num_input_tokens_seen": 196583888, + "router_z_loss_mlp": 0.36279297, + "step": 2356, + "time_per_iteration": 2.7260916233062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159041, + "balance_loss_mlp": 1.12263405, + "epoch": 0.4534436321662178, + "flos": 582213961728.0, + "grad_norm": 0.0574643396084849, + "language_loss": 0.81904489, + "learning_rate": 0.0005990342947561647, + "loss": 0.83063525, + "num_input_tokens_seen": 196652816, + "router_z_loss_mlp": 0.36401367, + "step": 2357, + "time_per_iteration": 2.763461112976074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158122, + "balance_loss_mlp": 1.12109542, + "epoch": 0.45363601385148133, + "flos": 549720193536.0, + "grad_norm": 0.06627350558163068, + "language_loss": 0.78124607, + "learning_rate": 0.0005987289054673351, + "loss": 0.79282725, + "num_input_tokens_seen": 196720208, + "router_z_loss_mlp": 0.37011719, + "step": 2358, + "time_per_iteration": 2.7317159175872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172658, + "balance_loss_mlp": 1.16121387, + "epoch": 0.4538283955367449, + "flos": 1474559520768.0, + "grad_norm": 0.05600708096364228, + "language_loss": 0.76575738, + "learning_rate": 0.0005984234778485451, + "loss": 0.77748394, + "num_input_tokens_seen": 196947696, + "router_z_loss_mlp": 0.11425781, + "step": 2359, + "time_per_iteration": 4.815205335617065 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168714, + "balance_loss_mlp": 1.13257003, + "epoch": 0.45402077722200845, + "flos": 584711986176.0, + "grad_norm": 0.0832511333205401, + "language_loss": 0.91429126, + "learning_rate": 0.0005981180120183722, + "loss": 0.92597842, + "num_input_tokens_seen": 197015712, + "router_z_loss_mlp": 0.36206055, + "step": 2360, + "time_per_iteration": 2.675994873046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154784, + "balance_loss_mlp": 1.11825836, + "epoch": 0.45421315890727204, + "flos": 531747265536.0, + "grad_norm": 0.06456101952662723, + "language_loss": 0.85450256, + "learning_rate": 0.0005978125080954089, + "loss": 0.86605042, + "num_input_tokens_seen": 197094880, + "router_z_loss_mlp": 0.36523438, + "step": 2361, + "time_per_iteration": 2.844592332839966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134938, + "balance_loss_mlp": 1.0997715, + "epoch": 0.4544055405925356, + "flos": 785221641216.0, + "grad_norm": 0.06943573222196867, + "language_loss": 0.77225572, + "learning_rate": 0.000597506966198262, + "loss": 0.7836051, + "num_input_tokens_seen": 197176448, + "router_z_loss_mlp": 0.35180664, + "step": 2362, + "time_per_iteration": 2.990652322769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127189, + "balance_loss_mlp": 1.09216547, + "epoch": 0.45459792227779916, + "flos": 518199939072.0, + "grad_norm": 0.07387250459530678, + "language_loss": 0.84014916, + "learning_rate": 0.0005972013864455536, + "loss": 0.85142106, + "num_input_tokens_seen": 197243520, + "router_z_loss_mlp": 0.3503418, + "step": 2363, + "time_per_iteration": 2.589594841003418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124933, + "balance_loss_mlp": 1.09141088, + "epoch": 0.4547903039630627, + "flos": 537563755008.0, + "grad_norm": 0.06451639193106218, + "language_loss": 0.85711533, + "learning_rate": 0.0005968957689559203, + "loss": 0.86836469, + "num_input_tokens_seen": 197311536, + "router_z_loss_mlp": 0.33544922, + "step": 2364, + "time_per_iteration": 2.6682167053222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119758, + "balance_loss_mlp": 1.08585453, + "epoch": 0.4549826856483263, + "flos": 528676222464.0, + "grad_norm": 0.06239355206550831, + "language_loss": 0.89508158, + "learning_rate": 0.0005965901138480131, + "loss": 0.90627909, + "num_input_tokens_seen": 197382752, + "router_z_loss_mlp": 0.33911133, + "step": 2365, + "time_per_iteration": 2.6365487575531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125783, + "balance_loss_mlp": 1.08816087, + "epoch": 0.45517506733358987, + "flos": 520915276800.0, + "grad_norm": 0.07086256306792152, + "language_loss": 0.87331104, + "learning_rate": 0.0005962844212404982, + "loss": 0.88456881, + "num_input_tokens_seen": 197456592, + "router_z_loss_mlp": 0.37597656, + "step": 2366, + "time_per_iteration": 2.6617612838745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123043, + "balance_loss_mlp": 1.08763838, + "epoch": 0.4553674490188534, + "flos": 451052831232.0, + "grad_norm": 0.05743086206543283, + "language_loss": 0.87604624, + "learning_rate": 0.0005959786912520558, + "loss": 0.88727665, + "num_input_tokens_seen": 197525408, + "router_z_loss_mlp": 0.35400391, + "step": 2367, + "time_per_iteration": 2.5842456817626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112429, + "balance_loss_mlp": 1.08878994, + "epoch": 0.455559830704117, + "flos": 546594448896.0, + "grad_norm": 0.05541530908978363, + "language_loss": 0.84261698, + "learning_rate": 0.0005956729240013806, + "loss": 0.8538599, + "num_input_tokens_seen": 197608480, + "router_z_loss_mlp": 0.35522461, + "step": 2368, + "time_per_iteration": 2.8338305950164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131752, + "balance_loss_mlp": 1.09880257, + "epoch": 0.4557522123893805, + "flos": 583765636608.0, + "grad_norm": 0.06117437276065272, + "language_loss": 0.91673207, + "learning_rate": 0.0005953671196071824, + "loss": 0.92804956, + "num_input_tokens_seen": 197678416, + "router_z_loss_mlp": 0.32958984, + "step": 2369, + "time_per_iteration": 2.6954920291900635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140492, + "balance_loss_mlp": 1.10089099, + "epoch": 0.4559445940746441, + "flos": 526415334912.0, + "grad_norm": 0.05874804832244865, + "language_loss": 0.80540514, + "learning_rate": 0.0005950612781881846, + "loss": 0.81681007, + "num_input_tokens_seen": 197753424, + "router_z_loss_mlp": 0.39575195, + "step": 2370, + "time_per_iteration": 2.695518732070923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133052, + "balance_loss_mlp": 1.09526241, + "epoch": 0.45613697575990764, + "flos": 652120823808.0, + "grad_norm": 0.054922750315337415, + "language_loss": 0.76194978, + "learning_rate": 0.0005947553998631259, + "loss": 0.77328038, + "num_input_tokens_seen": 197832080, + "router_z_loss_mlp": 0.37792969, + "step": 2371, + "time_per_iteration": 2.854757070541382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133988, + "balance_loss_mlp": 1.09777188, + "epoch": 0.4563293574451712, + "flos": 867119385600.0, + "grad_norm": 0.04850294692755014, + "language_loss": 0.79227567, + "learning_rate": 0.000594449484750758, + "loss": 0.80361551, + "num_input_tokens_seen": 197919536, + "router_z_loss_mlp": 0.36206055, + "step": 2372, + "time_per_iteration": 3.2277348041534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128775, + "balance_loss_mlp": 1.09263051, + "epoch": 0.45652173913043476, + "flos": 498079922688.0, + "grad_norm": 0.06286219474212958, + "language_loss": 0.83387208, + "learning_rate": 0.0005941435329698484, + "loss": 0.84515989, + "num_input_tokens_seen": 197991872, + "router_z_loss_mlp": 0.36132812, + "step": 2373, + "time_per_iteration": 2.676492929458618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126424, + "balance_loss_mlp": 1.09025562, + "epoch": 0.45671412081569834, + "flos": 560856130560.0, + "grad_norm": 0.05768590484176838, + "language_loss": 0.83615124, + "learning_rate": 0.0005938375446391778, + "loss": 0.84741557, + "num_input_tokens_seen": 198063392, + "router_z_loss_mlp": 0.36181641, + "step": 2374, + "time_per_iteration": 2.7465567588806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137671, + "balance_loss_mlp": 1.09969115, + "epoch": 0.45690650250096193, + "flos": 503122959360.0, + "grad_norm": 0.05745321957635053, + "language_loss": 0.89048398, + "learning_rate": 0.0005935315198775415, + "loss": 0.90186071, + "num_input_tokens_seen": 198131232, + "router_z_loss_mlp": 0.38012695, + "step": 2375, + "time_per_iteration": 2.6580095291137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128206, + "balance_loss_mlp": 1.09320593, + "epoch": 0.45709888418622546, + "flos": 430698249216.0, + "grad_norm": 0.06107240600749233, + "language_loss": 0.87175268, + "learning_rate": 0.0005932254588037486, + "loss": 0.88303471, + "num_input_tokens_seen": 198194944, + "router_z_loss_mlp": 0.35009766, + "step": 2376, + "time_per_iteration": 2.488588333129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121963, + "balance_loss_mlp": 1.08600903, + "epoch": 0.45729126587148905, + "flos": 525654365184.0, + "grad_norm": 0.05478508122440065, + "language_loss": 0.86331463, + "learning_rate": 0.000592919361536623, + "loss": 0.87453431, + "num_input_tokens_seen": 198265728, + "router_z_loss_mlp": 0.35961914, + "step": 2377, + "time_per_iteration": 2.644374132156372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127537, + "balance_loss_mlp": 1.09196472, + "epoch": 0.4574836475567526, + "flos": 638002676736.0, + "grad_norm": 0.05713052679154174, + "language_loss": 0.89246452, + "learning_rate": 0.0005926132281950017, + "loss": 0.90373993, + "num_input_tokens_seen": 198336640, + "router_z_loss_mlp": 0.35571289, + "step": 2378, + "time_per_iteration": 2.7563676834106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121403, + "balance_loss_mlp": 1.08406663, + "epoch": 0.45767602924201617, + "flos": 649588294656.0, + "grad_norm": 0.05503863795363348, + "language_loss": 0.85310149, + "learning_rate": 0.0005923070588977367, + "loss": 0.86431557, + "num_input_tokens_seen": 198413552, + "router_z_loss_mlp": 0.37329102, + "step": 2379, + "time_per_iteration": 2.8923282623291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123743, + "balance_loss_mlp": 1.087098, + "epoch": 0.4578684109272797, + "flos": 746676817920.0, + "grad_norm": 0.05441682742417314, + "language_loss": 0.86308765, + "learning_rate": 0.0005920008537636931, + "loss": 0.8743251, + "num_input_tokens_seen": 198490864, + "router_z_loss_mlp": 0.3659668, + "step": 2380, + "time_per_iteration": 2.8928191661834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121741, + "balance_loss_mlp": 1.0852387, + "epoch": 0.4580607926125433, + "flos": 641469072384.0, + "grad_norm": 0.0522540937039379, + "language_loss": 0.86756825, + "learning_rate": 0.0005916946129117504, + "loss": 0.87878567, + "num_input_tokens_seen": 198571200, + "router_z_loss_mlp": 0.36523438, + "step": 2381, + "time_per_iteration": 2.9031155109405518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129507, + "balance_loss_mlp": 1.09281409, + "epoch": 0.4582531742978069, + "flos": 801857636352.0, + "grad_norm": 0.055637229661903514, + "language_loss": 0.80852348, + "learning_rate": 0.0005913883364608017, + "loss": 0.8198185, + "num_input_tokens_seen": 198658624, + "router_z_loss_mlp": 0.36694336, + "step": 2382, + "time_per_iteration": 3.0779874324798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123213, + "balance_loss_mlp": 1.088094, + "epoch": 0.4584455559830704, + "flos": 684295962624.0, + "grad_norm": 0.05906328885450196, + "language_loss": 0.88737094, + "learning_rate": 0.0005910820245297542, + "loss": 0.89860308, + "num_input_tokens_seen": 198731312, + "router_z_loss_mlp": 0.35131836, + "step": 2383, + "time_per_iteration": 2.889805555343628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119246, + "balance_loss_mlp": 1.0824585, + "epoch": 0.458637937668334, + "flos": 518177544192.0, + "grad_norm": 0.06990697064707628, + "language_loss": 0.80825961, + "learning_rate": 0.000590775677237529, + "loss": 0.81945217, + "num_input_tokens_seen": 198805296, + "router_z_loss_mlp": 0.36791992, + "step": 2384, + "time_per_iteration": 2.7286477088928223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011127, + "balance_loss_mlp": 1.07562566, + "epoch": 0.4588303193535975, + "flos": 505499844096.0, + "grad_norm": 0.06044507930671915, + "language_loss": 0.80186594, + "learning_rate": 0.0005904692947030601, + "loss": 0.81299293, + "num_input_tokens_seen": 198872112, + "router_z_loss_mlp": 0.37084961, + "step": 2385, + "time_per_iteration": 2.6249661445617676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112584, + "balance_loss_mlp": 1.07446146, + "epoch": 0.4590227010388611, + "flos": 495905670144.0, + "grad_norm": 0.06266023003425206, + "language_loss": 0.89858609, + "learning_rate": 0.0005901628770452963, + "loss": 0.90971196, + "num_input_tokens_seen": 198938480, + "router_z_loss_mlp": 0.38110352, + "step": 2386, + "time_per_iteration": 2.628052234649658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106229, + "balance_loss_mlp": 1.06925035, + "epoch": 0.45921508272412465, + "flos": 493620189696.0, + "grad_norm": 0.05741151930163357, + "language_loss": 0.87304425, + "learning_rate": 0.000589856424383199, + "loss": 0.88410658, + "num_input_tokens_seen": 199008608, + "router_z_loss_mlp": 0.36987305, + "step": 2387, + "time_per_iteration": 2.6852517127990723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116091, + "balance_loss_mlp": 1.07863569, + "epoch": 0.45940746440938823, + "flos": 691394683392.0, + "grad_norm": 0.06606538590283985, + "language_loss": 0.83553612, + "learning_rate": 0.000589549936835744, + "loss": 0.84669703, + "num_input_tokens_seen": 199084592, + "router_z_loss_mlp": 0.37451172, + "step": 2388, + "time_per_iteration": 2.8861043453216553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106236, + "balance_loss_mlp": 1.07135534, + "epoch": 0.45959984609465176, + "flos": 503738196480.0, + "grad_norm": 0.06160096974470471, + "language_loss": 0.79523546, + "learning_rate": 0.0005892434145219202, + "loss": 0.80629778, + "num_input_tokens_seen": 199151504, + "router_z_loss_mlp": 0.34912109, + "step": 2389, + "time_per_iteration": 2.6016130447387695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097243, + "balance_loss_mlp": 1.06219506, + "epoch": 0.45979222777991535, + "flos": 676638904320.0, + "grad_norm": 0.07218042116864783, + "language_loss": 0.82768381, + "learning_rate": 0.0005889368575607303, + "loss": 0.83865625, + "num_input_tokens_seen": 199224528, + "router_z_loss_mlp": 0.35058594, + "step": 2390, + "time_per_iteration": 2.806382894515991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102796, + "balance_loss_mlp": 1.06791568, + "epoch": 0.45998460946517894, + "flos": 777653415936.0, + "grad_norm": 0.06321076421250729, + "language_loss": 0.78347147, + "learning_rate": 0.00058863026607119, + "loss": 0.7944994, + "num_input_tokens_seen": 199312512, + "router_z_loss_mlp": 0.34912109, + "step": 2391, + "time_per_iteration": 3.0679373741149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102101, + "balance_loss_mlp": 1.06800711, + "epoch": 0.46017699115044247, + "flos": 851461673472.0, + "grad_norm": 0.07981135891264553, + "language_loss": 0.80153728, + "learning_rate": 0.0005883236401723287, + "loss": 0.81255829, + "num_input_tokens_seen": 199397216, + "router_z_loss_mlp": 0.34130859, + "step": 2392, + "time_per_iteration": 3.178016185760498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102518, + "balance_loss_mlp": 1.06830466, + "epoch": 0.46036937283570606, + "flos": 575878781952.0, + "grad_norm": 0.05809686694512272, + "language_loss": 0.8436439, + "learning_rate": 0.0005880169799831893, + "loss": 0.85466909, + "num_input_tokens_seen": 199464288, + "router_z_loss_mlp": 0.34204102, + "step": 2393, + "time_per_iteration": 2.7394168376922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099974, + "balance_loss_mlp": 1.06537914, + "epoch": 0.4605617545209696, + "flos": 611866109952.0, + "grad_norm": 0.05496993027151255, + "language_loss": 0.81652063, + "learning_rate": 0.0005877102856228278, + "loss": 0.82752037, + "num_input_tokens_seen": 199538096, + "router_z_loss_mlp": 0.34594727, + "step": 2394, + "time_per_iteration": 2.857044219970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107902, + "balance_loss_mlp": 1.07225823, + "epoch": 0.4607541362062332, + "flos": 533138526720.0, + "grad_norm": 0.0685378240754912, + "language_loss": 0.84987622, + "learning_rate": 0.0005874035572103133, + "loss": 0.86095524, + "num_input_tokens_seen": 199609504, + "router_z_loss_mlp": 0.35644531, + "step": 2395, + "time_per_iteration": 2.6805660724639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102808, + "balance_loss_mlp": 1.06699777, + "epoch": 0.4609465178914967, + "flos": 647312726016.0, + "grad_norm": 0.07818612590839771, + "language_loss": 0.82504952, + "learning_rate": 0.0005870967948647288, + "loss": 0.83607757, + "num_input_tokens_seen": 199678960, + "router_z_loss_mlp": 0.35839844, + "step": 2396, + "time_per_iteration": 2.7740094661712646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114998, + "balance_loss_mlp": 1.13801181, + "epoch": 0.4611388995767603, + "flos": 1466287225344.0, + "grad_norm": 0.06620078890509219, + "language_loss": 0.743083, + "learning_rate": 0.0005867899987051693, + "loss": 0.75458288, + "num_input_tokens_seen": 199903568, + "router_z_loss_mlp": 0.11962891, + "step": 2397, + "time_per_iteration": 5.407956838607788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106991, + "balance_loss_mlp": 1.07158542, + "epoch": 0.46133128126202383, + "flos": 723112427520.0, + "grad_norm": 0.05578291602549768, + "language_loss": 0.85959148, + "learning_rate": 0.0005864831688507443, + "loss": 0.87066138, + "num_input_tokens_seen": 199988672, + "router_z_loss_mlp": 0.35424805, + "step": 2398, + "time_per_iteration": 3.000498056411743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108167, + "balance_loss_mlp": 1.07342887, + "epoch": 0.4615236629472874, + "flos": 548010302976.0, + "grad_norm": 0.0567470157783756, + "language_loss": 0.7555595, + "learning_rate": 0.0005861763054205754, + "loss": 0.7666412, + "num_input_tokens_seen": 200062304, + "router_z_loss_mlp": 0.34765625, + "step": 2399, + "time_per_iteration": 2.7206692695617676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108701, + "balance_loss_mlp": 1.07303381, + "epoch": 0.461716044632551, + "flos": 602244771840.0, + "grad_norm": 0.054446102099669776, + "language_loss": 0.80056608, + "learning_rate": 0.0005858694085337976, + "loss": 0.81165302, + "num_input_tokens_seen": 200138464, + "router_z_loss_mlp": 0.35668945, + "step": 2400, + "time_per_iteration": 2.8272197246551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107355, + "balance_loss_mlp": 1.07090116, + "epoch": 0.46190842631781454, + "flos": 474476258304.0, + "grad_norm": 0.06783884534527172, + "language_loss": 0.83774948, + "learning_rate": 0.0005855624783095589, + "loss": 0.84882307, + "num_input_tokens_seen": 200205728, + "router_z_loss_mlp": 0.36425781, + "step": 2401, + "time_per_iteration": 2.6019625663757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102889, + "balance_loss_mlp": 1.06812799, + "epoch": 0.4621008080030781, + "flos": 437483109888.0, + "grad_norm": 0.05559222161472476, + "language_loss": 0.8541491, + "learning_rate": 0.00058525551486702, + "loss": 0.86517805, + "num_input_tokens_seen": 200269824, + "router_z_loss_mlp": 0.34790039, + "step": 2402, + "time_per_iteration": 2.5166754722595215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106757, + "balance_loss_mlp": 1.07058895, + "epoch": 0.46229318968834165, + "flos": 525461644800.0, + "grad_norm": 0.07030933336499708, + "language_loss": 0.80856764, + "learning_rate": 0.0005849485183253548, + "loss": 0.81963521, + "num_input_tokens_seen": 200341264, + "router_z_loss_mlp": 0.36206055, + "step": 2403, + "time_per_iteration": 2.6906049251556396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110586, + "balance_loss_mlp": 1.07090759, + "epoch": 0.46248557137360524, + "flos": 439622857728.0, + "grad_norm": 0.057304610397081915, + "language_loss": 0.87811077, + "learning_rate": 0.0005846414888037501, + "loss": 0.88916934, + "num_input_tokens_seen": 200405632, + "router_z_loss_mlp": 0.34960938, + "step": 2404, + "time_per_iteration": 2.488797426223755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102205, + "balance_loss_mlp": 1.06899309, + "epoch": 0.4626779530588688, + "flos": 617608447488.0, + "grad_norm": 0.05034114049250231, + "language_loss": 0.82261539, + "learning_rate": 0.0005843344264214049, + "loss": 0.83363742, + "num_input_tokens_seen": 200479312, + "router_z_loss_mlp": 0.33203125, + "step": 2405, + "time_per_iteration": 2.746372938156128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110347, + "balance_loss_mlp": 1.07068777, + "epoch": 0.46287033474413236, + "flos": 670108432896.0, + "grad_norm": 0.10060755415937467, + "language_loss": 0.85092008, + "learning_rate": 0.0005840273312975317, + "loss": 0.86195481, + "num_input_tokens_seen": 200552976, + "router_z_loss_mlp": 0.32788086, + "step": 2406, + "time_per_iteration": 2.834230661392212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112626, + "balance_loss_mlp": 1.07829416, + "epoch": 0.46306271642939595, + "flos": 480233276928.0, + "grad_norm": 0.06610522075480575, + "language_loss": 0.90376371, + "learning_rate": 0.0005837202035513555, + "loss": 0.91489005, + "num_input_tokens_seen": 200621088, + "router_z_loss_mlp": 0.34326172, + "step": 2407, + "time_per_iteration": 2.577099084854126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112101, + "balance_loss_mlp": 1.07693422, + "epoch": 0.4632550981146595, + "flos": 580686879744.0, + "grad_norm": 0.06799718927162632, + "language_loss": 0.81987119, + "learning_rate": 0.0005834130433021136, + "loss": 0.83099222, + "num_input_tokens_seen": 200698400, + "router_z_loss_mlp": 0.3515625, + "step": 2408, + "time_per_iteration": 2.751481771469116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111259, + "balance_loss_mlp": 1.07537687, + "epoch": 0.46344747979992307, + "flos": 523964298240.0, + "grad_norm": 0.07576984187058394, + "language_loss": 0.73707795, + "learning_rate": 0.0005831058506690563, + "loss": 0.74819058, + "num_input_tokens_seen": 200767264, + "router_z_loss_mlp": 0.359375, + "step": 2409, + "time_per_iteration": 2.6351587772369385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104428, + "balance_loss_mlp": 1.0719074, + "epoch": 0.4636398614851866, + "flos": 746501349888.0, + "grad_norm": 0.06066453040155937, + "language_loss": 0.86246306, + "learning_rate": 0.0005827986257714464, + "loss": 0.87350732, + "num_input_tokens_seen": 200841440, + "router_z_loss_mlp": 0.32519531, + "step": 2410, + "time_per_iteration": 2.9171712398529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106385, + "balance_loss_mlp": 1.07334006, + "epoch": 0.4638322431704502, + "flos": 596547224064.0, + "grad_norm": 0.05632663018450853, + "language_loss": 0.8897202, + "learning_rate": 0.0005824913687285591, + "loss": 0.90078408, + "num_input_tokens_seen": 200911296, + "router_z_loss_mlp": 0.33032227, + "step": 2411, + "time_per_iteration": 2.6863625049591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104253, + "balance_loss_mlp": 1.07056427, + "epoch": 0.4640246248557137, + "flos": 539443971072.0, + "grad_norm": 0.09102731097831396, + "language_loss": 0.81903768, + "learning_rate": 0.0005821840796596821, + "loss": 0.83008015, + "num_input_tokens_seen": 200981920, + "router_z_loss_mlp": 0.3371582, + "step": 2412, + "time_per_iteration": 2.658602714538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108642, + "balance_loss_mlp": 1.07605052, + "epoch": 0.4642170065409773, + "flos": 562625118720.0, + "grad_norm": 0.04905521047169809, + "language_loss": 0.8043226, + "learning_rate": 0.0005818767586841158, + "loss": 0.81540906, + "num_input_tokens_seen": 201059392, + "router_z_loss_mlp": 0.32592773, + "step": 2413, + "time_per_iteration": 2.7577285766601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108976, + "balance_loss_mlp": 1.07655096, + "epoch": 0.46440938822624084, + "flos": 530959131648.0, + "grad_norm": 0.06302213894221746, + "language_loss": 0.865412, + "learning_rate": 0.0005815694059211726, + "loss": 0.8765018, + "num_input_tokens_seen": 201130192, + "router_z_loss_mlp": 0.32421875, + "step": 2414, + "time_per_iteration": 2.6655328273773193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174358, + "balance_loss_mlp": 1.16362953, + "epoch": 0.4646017699115044, + "flos": 1526325700608.0, + "grad_norm": 0.06384975588330166, + "language_loss": 0.80873632, + "learning_rate": 0.0005812620214901778, + "loss": 0.82047987, + "num_input_tokens_seen": 201354720, + "router_z_loss_mlp": 0.10742188, + "step": 2415, + "time_per_iteration": 4.795905828475952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101653, + "balance_loss_mlp": 1.09135294, + "epoch": 0.464794151596768, + "flos": 1540831859712.0, + "grad_norm": 0.035706806463564576, + "language_loss": 0.7694506, + "learning_rate": 0.000580954605510468, + "loss": 0.78046715, + "num_input_tokens_seen": 201592096, + "router_z_loss_mlp": 0.10302734, + "step": 2416, + "time_per_iteration": 4.964730978012085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100793, + "balance_loss_mlp": 1.06910706, + "epoch": 0.46498653328203154, + "flos": 501467397120.0, + "grad_norm": 0.054161288123553565, + "language_loss": 0.8669647, + "learning_rate": 0.0005806471581013931, + "loss": 0.8779726, + "num_input_tokens_seen": 201666160, + "router_z_loss_mlp": 0.31640625, + "step": 2417, + "time_per_iteration": 2.7034828662872314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106596, + "balance_loss_mlp": 1.07221591, + "epoch": 0.46517891496729513, + "flos": 676144806912.0, + "grad_norm": 0.05684649238509572, + "language_loss": 0.78830767, + "learning_rate": 0.0005803396793823146, + "loss": 0.79937363, + "num_input_tokens_seen": 201733552, + "router_z_loss_mlp": 0.34375, + "step": 2418, + "time_per_iteration": 2.810929536819458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112917, + "balance_loss_mlp": 1.07848907, + "epoch": 0.46537129665255866, + "flos": 585351816192.0, + "grad_norm": 0.07858966703970842, + "language_loss": 0.86256903, + "learning_rate": 0.0005800321694726065, + "loss": 0.87369823, + "num_input_tokens_seen": 201806128, + "router_z_loss_mlp": 0.34423828, + "step": 2419, + "time_per_iteration": 2.797091484069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113087, + "balance_loss_mlp": 1.07880187, + "epoch": 0.46556367833782225, + "flos": 587704108032.0, + "grad_norm": 0.06627504844203173, + "language_loss": 0.86954433, + "learning_rate": 0.0005797246284916545, + "loss": 0.8806752, + "num_input_tokens_seen": 201874224, + "router_z_loss_mlp": 0.34277344, + "step": 2420, + "time_per_iteration": 2.689190149307251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103612, + "balance_loss_mlp": 1.09355068, + "epoch": 0.4657560600230858, + "flos": 1485453551616.0, + "grad_norm": 0.047662019725998206, + "language_loss": 0.77505189, + "learning_rate": 0.0005794170565588569, + "loss": 0.78608793, + "num_input_tokens_seen": 202111648, + "router_z_loss_mlp": 0.10058594, + "step": 2421, + "time_per_iteration": 6.38897705078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112649, + "balance_loss_mlp": 1.09318316, + "epoch": 0.46594844170834937, + "flos": 580247110656.0, + "grad_norm": 0.06710074217369558, + "language_loss": 0.88096154, + "learning_rate": 0.0005791094537936233, + "loss": 0.8922264, + "num_input_tokens_seen": 202183344, + "router_z_loss_mlp": 0.33325195, + "step": 2422, + "time_per_iteration": 4.209144353866577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126187, + "balance_loss_mlp": 1.09340453, + "epoch": 0.4661408233936129, + "flos": 512571400704.0, + "grad_norm": 0.0626199173608307, + "language_loss": 0.82125473, + "learning_rate": 0.0005788018203153762, + "loss": 0.83251661, + "num_input_tokens_seen": 202252512, + "router_z_loss_mlp": 0.32788086, + "step": 2423, + "time_per_iteration": 2.583918333053589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138161, + "balance_loss_mlp": 1.10540235, + "epoch": 0.4663332050788765, + "flos": 491077748736.0, + "grad_norm": 0.07666207610831233, + "language_loss": 0.85944337, + "learning_rate": 0.000578494156243549, + "loss": 0.87082505, + "num_input_tokens_seen": 202320096, + "router_z_loss_mlp": 0.32763672, + "step": 2424, + "time_per_iteration": 2.582838296890259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142028, + "balance_loss_mlp": 1.10779119, + "epoch": 0.4665255867641401, + "flos": 512623157760.0, + "grad_norm": 0.11745148991984863, + "language_loss": 0.89446878, + "learning_rate": 0.0005781864616975878, + "loss": 0.90588903, + "num_input_tokens_seen": 202391552, + "router_z_loss_mlp": 0.3425293, + "step": 2425, + "time_per_iteration": 2.6464650630950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135149, + "balance_loss_mlp": 1.10081649, + "epoch": 0.4667179684494036, + "flos": 424812750336.0, + "grad_norm": 0.07242740344873133, + "language_loss": 0.84278369, + "learning_rate": 0.0005778787367969502, + "loss": 0.85413516, + "num_input_tokens_seen": 202457328, + "router_z_loss_mlp": 0.34375, + "step": 2426, + "time_per_iteration": 2.5785605907440186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131581, + "balance_loss_mlp": 1.09822595, + "epoch": 0.4669103501346672, + "flos": 707956526592.0, + "grad_norm": 0.06251358549871673, + "language_loss": 0.81181312, + "learning_rate": 0.0005775709816611053, + "loss": 0.82312894, + "num_input_tokens_seen": 202535888, + "router_z_loss_mlp": 0.33374023, + "step": 2427, + "time_per_iteration": 2.9622879028320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125428, + "balance_loss_mlp": 1.09100056, + "epoch": 0.4671027318199307, + "flos": 554832239616.0, + "grad_norm": 0.06013841542134278, + "language_loss": 0.83607411, + "learning_rate": 0.0005772631964095346, + "loss": 0.84732836, + "num_input_tokens_seen": 202608400, + "router_z_loss_mlp": 0.34448242, + "step": 2428, + "time_per_iteration": 2.681161403656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123616, + "balance_loss_mlp": 1.08990407, + "epoch": 0.4672951135051943, + "flos": 567109817856.0, + "grad_norm": 0.05815575913312505, + "language_loss": 0.85975552, + "learning_rate": 0.000576955381161731, + "loss": 0.87099165, + "num_input_tokens_seen": 202677712, + "router_z_loss_mlp": 0.3371582, + "step": 2429, + "time_per_iteration": 2.670814275741577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122337, + "balance_loss_mlp": 1.08843446, + "epoch": 0.46748749519045785, + "flos": 424518713856.0, + "grad_norm": 0.07250877112671852, + "language_loss": 0.86541677, + "learning_rate": 0.0005766475360371985, + "loss": 0.8766402, + "num_input_tokens_seen": 202743824, + "router_z_loss_mlp": 0.33935547, + "step": 2430, + "time_per_iteration": 2.5907814502716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118824, + "balance_loss_mlp": 1.08368063, + "epoch": 0.46767987687572143, + "flos": 538344548352.0, + "grad_norm": 0.0946745942266809, + "language_loss": 0.84659714, + "learning_rate": 0.0005763396611554536, + "loss": 0.85778534, + "num_input_tokens_seen": 202813072, + "router_z_loss_mlp": 0.3515625, + "step": 2431, + "time_per_iteration": 2.679352045059204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123862, + "balance_loss_mlp": 1.0890286, + "epoch": 0.467872258560985, + "flos": 823702224384.0, + "grad_norm": 0.06880905442669231, + "language_loss": 0.80567783, + "learning_rate": 0.0005760317566360237, + "loss": 0.81691647, + "num_input_tokens_seen": 202886576, + "router_z_loss_mlp": 0.34838867, + "step": 2432, + "time_per_iteration": 3.0134341716766357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116239, + "balance_loss_mlp": 1.08090591, + "epoch": 0.46806464024624855, + "flos": 661663240704.0, + "grad_norm": 0.09211359876128772, + "language_loss": 0.85498667, + "learning_rate": 0.000575723822598448, + "loss": 0.86614907, + "num_input_tokens_seen": 202956736, + "router_z_loss_mlp": 0.35375977, + "step": 2433, + "time_per_iteration": 2.807387351989746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113775, + "balance_loss_mlp": 1.07882285, + "epoch": 0.46825702193151214, + "flos": 755700171264.0, + "grad_norm": 0.07984033993726149, + "language_loss": 0.81515086, + "learning_rate": 0.0005754158591622773, + "loss": 0.82628858, + "num_input_tokens_seen": 203036432, + "router_z_loss_mlp": 0.35009766, + "step": 2434, + "time_per_iteration": 2.9610190391540527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108856, + "balance_loss_mlp": 1.07335579, + "epoch": 0.4684494036167757, + "flos": 439393061376.0, + "grad_norm": 0.08173781127815187, + "language_loss": 0.83058012, + "learning_rate": 0.0005751078664470732, + "loss": 0.84166867, + "num_input_tokens_seen": 203101904, + "router_z_loss_mlp": 0.35522461, + "step": 2435, + "time_per_iteration": 2.5381393432617188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105873, + "balance_loss_mlp": 1.07125473, + "epoch": 0.46864178530203926, + "flos": 532706098176.0, + "grad_norm": 0.06625067078188727, + "language_loss": 0.86156499, + "learning_rate": 0.0005747998445724094, + "loss": 0.87262368, + "num_input_tokens_seen": 203170272, + "router_z_loss_mlp": 0.34643555, + "step": 2436, + "time_per_iteration": 2.5991244316101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110631, + "balance_loss_mlp": 1.0730263, + "epoch": 0.4688341669873028, + "flos": 576627268608.0, + "grad_norm": 0.06922366477490534, + "language_loss": 0.8967731, + "learning_rate": 0.0005744917936578707, + "loss": 0.90783614, + "num_input_tokens_seen": 203243920, + "router_z_loss_mlp": 0.33276367, + "step": 2437, + "time_per_iteration": 2.7876076698303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110478, + "balance_loss_mlp": 1.07087731, + "epoch": 0.4690265486725664, + "flos": 539579791872.0, + "grad_norm": 0.05346939801811538, + "language_loss": 0.83987176, + "learning_rate": 0.0005741837138230526, + "loss": 0.8509196, + "num_input_tokens_seen": 203321760, + "router_z_loss_mlp": 0.33911133, + "step": 2438, + "time_per_iteration": 2.7089829444885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110533, + "balance_loss_mlp": 1.07063985, + "epoch": 0.4692189303578299, + "flos": 770510278656.0, + "grad_norm": 0.06113216144822436, + "language_loss": 0.8632471, + "learning_rate": 0.0005738756051875627, + "loss": 0.87430036, + "num_input_tokens_seen": 203409088, + "router_z_loss_mlp": 0.34692383, + "step": 2439, + "time_per_iteration": 3.10072922706604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106031, + "balance_loss_mlp": 1.07031631, + "epoch": 0.4694113120430935, + "flos": 571396654080.0, + "grad_norm": 0.054040954813727636, + "language_loss": 0.83196378, + "learning_rate": 0.0005735674678710192, + "loss": 0.84302408, + "num_input_tokens_seen": 203481680, + "router_z_loss_mlp": 0.35668945, + "step": 2440, + "time_per_iteration": 2.6844449043273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101989, + "balance_loss_mlp": 1.06644058, + "epoch": 0.4696036937283571, + "flos": 748816565760.0, + "grad_norm": 0.06378034204188901, + "language_loss": 0.81315678, + "learning_rate": 0.0005732593019930517, + "loss": 0.82417667, + "num_input_tokens_seen": 203554848, + "router_z_loss_mlp": 0.35571289, + "step": 2441, + "time_per_iteration": 2.8945391178131104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113581, + "balance_loss_mlp": 1.0766257, + "epoch": 0.4697960754136206, + "flos": 493454633472.0, + "grad_norm": 0.0589509513637404, + "language_loss": 0.88047123, + "learning_rate": 0.0005729511076733008, + "loss": 0.89160711, + "num_input_tokens_seen": 203624816, + "router_z_loss_mlp": 0.36962891, + "step": 2442, + "time_per_iteration": 2.6688244342803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119163, + "balance_loss_mlp": 1.08199334, + "epoch": 0.4699884570988842, + "flos": 725118925824.0, + "grad_norm": 0.06849073497169517, + "language_loss": 0.84747314, + "learning_rate": 0.000572642885031418, + "loss": 0.85866475, + "num_input_tokens_seen": 203698256, + "router_z_loss_mlp": 0.37207031, + "step": 2443, + "time_per_iteration": 2.9179134368896484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108245, + "balance_loss_mlp": 1.07276881, + "epoch": 0.47018083878414774, + "flos": 555427653120.0, + "grad_norm": 0.0584848920178353, + "language_loss": 0.80748844, + "learning_rate": 0.0005723346341870662, + "loss": 0.81857085, + "num_input_tokens_seen": 203772672, + "router_z_loss_mlp": 0.35522461, + "step": 2444, + "time_per_iteration": 2.701399087905884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129757, + "balance_loss_mlp": 1.09277797, + "epoch": 0.4703732204694113, + "flos": 424069032960.0, + "grad_norm": 0.11865712100152984, + "language_loss": 0.86692929, + "learning_rate": 0.0005720263552599188, + "loss": 0.87822688, + "num_input_tokens_seen": 203835904, + "router_z_loss_mlp": 0.36962891, + "step": 2445, + "time_per_iteration": 2.4486730098724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121458, + "balance_loss_mlp": 1.08500421, + "epoch": 0.47056560215467486, + "flos": 703494222336.0, + "grad_norm": 0.08366602087356424, + "language_loss": 0.79955238, + "learning_rate": 0.0005717180483696604, + "loss": 0.81076699, + "num_input_tokens_seen": 203914704, + "router_z_loss_mlp": 0.36499023, + "step": 2446, + "time_per_iteration": 2.8785839080810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120985, + "balance_loss_mlp": 1.08486462, + "epoch": 0.47075798383993844, + "flos": 554963291136.0, + "grad_norm": 0.0682417361382486, + "language_loss": 0.83352333, + "learning_rate": 0.0005714097136359862, + "loss": 0.84473318, + "num_input_tokens_seen": 203985072, + "router_z_loss_mlp": 0.36157227, + "step": 2447, + "time_per_iteration": 2.6363351345062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118201, + "balance_loss_mlp": 1.08296275, + "epoch": 0.470950365525202, + "flos": 564305273856.0, + "grad_norm": 0.051381339811927676, + "language_loss": 0.86498094, + "learning_rate": 0.0005711013511786027, + "loss": 0.87616301, + "num_input_tokens_seen": 204061904, + "router_z_loss_mlp": 0.35253906, + "step": 2448, + "time_per_iteration": 2.762845993041992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111685, + "balance_loss_mlp": 1.08170676, + "epoch": 0.47114274721046556, + "flos": 534450493440.0, + "grad_norm": 0.058854412729412026, + "language_loss": 0.84082228, + "learning_rate": 0.0005707929611172263, + "loss": 0.85199082, + "num_input_tokens_seen": 204137392, + "router_z_loss_mlp": 0.3515625, + "step": 2449, + "time_per_iteration": 2.7246243953704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115864, + "balance_loss_mlp": 1.08007717, + "epoch": 0.47133512889572915, + "flos": 473117303808.0, + "grad_norm": 0.11039935923903105, + "language_loss": 0.84227139, + "learning_rate": 0.000570484543571585, + "loss": 0.85343003, + "num_input_tokens_seen": 204202752, + "router_z_loss_mlp": 0.35791016, + "step": 2450, + "time_per_iteration": 2.610919237136841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113904, + "balance_loss_mlp": 1.0777123, + "epoch": 0.4715275105809927, + "flos": 459013837824.0, + "grad_norm": 0.0667594391398321, + "language_loss": 0.82813287, + "learning_rate": 0.0005701760986614171, + "loss": 0.8392719, + "num_input_tokens_seen": 204266960, + "router_z_loss_mlp": 0.36181641, + "step": 2451, + "time_per_iteration": 2.5151522159576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120641, + "balance_loss_mlp": 1.08590317, + "epoch": 0.47171989226625627, + "flos": 422003437056.0, + "grad_norm": 0.0603467987943219, + "language_loss": 0.87650943, + "learning_rate": 0.0005698676265064714, + "loss": 0.88771582, + "num_input_tokens_seen": 204331216, + "router_z_loss_mlp": 0.34765625, + "step": 2452, + "time_per_iteration": 2.5722150802612305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114487, + "balance_loss_mlp": 1.07920074, + "epoch": 0.4719122739515198, + "flos": 457434998784.0, + "grad_norm": 0.07549274937771847, + "language_loss": 0.89053345, + "learning_rate": 0.0005695591272265074, + "loss": 0.90167832, + "num_input_tokens_seen": 204397216, + "router_z_loss_mlp": 0.35327148, + "step": 2453, + "time_per_iteration": 2.5431923866271973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109778, + "balance_loss_mlp": 1.07384801, + "epoch": 0.4721046556367834, + "flos": 514975449600.0, + "grad_norm": 0.05406998074400625, + "language_loss": 0.82143486, + "learning_rate": 0.0005692506009412954, + "loss": 0.83253264, + "num_input_tokens_seen": 204469952, + "router_z_loss_mlp": 0.359375, + "step": 2454, + "time_per_iteration": 2.6976101398468018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176153, + "balance_loss_mlp": 1.16375494, + "epoch": 0.4722970373220469, + "flos": 1572258138624.0, + "grad_norm": 0.047894752053778404, + "language_loss": 0.7755127, + "learning_rate": 0.0005689420477706156, + "loss": 0.78727424, + "num_input_tokens_seen": 204701152, + "router_z_loss_mlp": 0.12402344, + "step": 2455, + "time_per_iteration": 5.006427049636841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103739, + "balance_loss_mlp": 1.07000232, + "epoch": 0.4724894190073105, + "flos": 586214102016.0, + "grad_norm": 0.07748007609747588, + "language_loss": 0.89838475, + "learning_rate": 0.0005686334678342593, + "loss": 0.90942216, + "num_input_tokens_seen": 204778144, + "router_z_loss_mlp": 0.3371582, + "step": 2456, + "time_per_iteration": 2.88089919090271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110083, + "balance_loss_mlp": 1.07586968, + "epoch": 0.4726818006925741, + "flos": 867645789696.0, + "grad_norm": 0.053591450648947214, + "language_loss": 0.81747675, + "learning_rate": 0.0005683248612520274, + "loss": 0.82857764, + "num_input_tokens_seen": 204853376, + "router_z_loss_mlp": 0.34204102, + "step": 2457, + "time_per_iteration": 3.0411272048950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111421, + "balance_loss_mlp": 1.07811391, + "epoch": 0.4728741823778376, + "flos": 752967581184.0, + "grad_norm": 0.10239407628225645, + "language_loss": 0.84273934, + "learning_rate": 0.0005680162281437321, + "loss": 0.85388148, + "num_input_tokens_seen": 204925280, + "router_z_loss_mlp": 0.36083984, + "step": 2458, + "time_per_iteration": 2.8898301124572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120752, + "balance_loss_mlp": 1.08608592, + "epoch": 0.4730665640631012, + "flos": 538571773440.0, + "grad_norm": 0.0555075738071769, + "language_loss": 0.85104299, + "learning_rate": 0.000567707568629195, + "loss": 0.86225057, + "num_input_tokens_seen": 205000592, + "router_z_loss_mlp": 0.34692383, + "step": 2459, + "time_per_iteration": 2.706040143966675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122742, + "balance_loss_mlp": 1.08778977, + "epoch": 0.47325894574836475, + "flos": 491653338624.0, + "grad_norm": 0.06127780861136823, + "language_loss": 0.82619834, + "learning_rate": 0.0005673988828282486, + "loss": 0.83742571, + "num_input_tokens_seen": 205073968, + "router_z_loss_mlp": 0.34985352, + "step": 2460, + "time_per_iteration": 2.674525499343872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111568, + "balance_loss_mlp": 1.07668757, + "epoch": 0.47345132743362833, + "flos": 764459223552.0, + "grad_norm": 0.05574274236604154, + "language_loss": 0.81308633, + "learning_rate": 0.0005670901708607352, + "loss": 0.82420194, + "num_input_tokens_seen": 205153536, + "router_z_loss_mlp": 0.34912109, + "step": 2461, + "time_per_iteration": 2.982827663421631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109707, + "balance_loss_mlp": 1.0753746, + "epoch": 0.47364370911889186, + "flos": 540173007360.0, + "grad_norm": 0.15434207723638854, + "language_loss": 0.84411561, + "learning_rate": 0.0005667814328465076, + "loss": 0.85521269, + "num_input_tokens_seen": 205220944, + "router_z_loss_mlp": 0.34350586, + "step": 2462, + "time_per_iteration": 2.639051914215088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106499, + "balance_loss_mlp": 1.07245243, + "epoch": 0.47383609080415545, + "flos": 406219815936.0, + "grad_norm": 0.07072772635633937, + "language_loss": 0.81988347, + "learning_rate": 0.0005664726689054285, + "loss": 0.83094847, + "num_input_tokens_seen": 205282688, + "router_z_loss_mlp": 0.34033203, + "step": 2463, + "time_per_iteration": 2.4655356407165527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112663, + "balance_loss_mlp": 1.07973766, + "epoch": 0.474028472489419, + "flos": 453476703744.0, + "grad_norm": 0.06987107232693553, + "language_loss": 0.8107388, + "learning_rate": 0.0005661638791573704, + "loss": 0.82186544, + "num_input_tokens_seen": 205357360, + "router_z_loss_mlp": 0.32958984, + "step": 2464, + "time_per_iteration": 2.7433135509490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111686, + "balance_loss_mlp": 1.07742512, + "epoch": 0.47422085417468257, + "flos": 492177171456.0, + "grad_norm": 0.060845328276789123, + "language_loss": 0.87247777, + "learning_rate": 0.0005658550637222164, + "loss": 0.88359463, + "num_input_tokens_seen": 205424352, + "router_z_loss_mlp": 0.34277344, + "step": 2465, + "time_per_iteration": 2.615755558013916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113074, + "balance_loss_mlp": 1.07762074, + "epoch": 0.47441323585994616, + "flos": 738854203392.0, + "grad_norm": 0.05153784391367151, + "language_loss": 0.82349539, + "learning_rate": 0.0005655462227198592, + "loss": 0.83462608, + "num_input_tokens_seen": 205502912, + "router_z_loss_mlp": 0.35473633, + "step": 2466, + "time_per_iteration": 2.91003680229187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109891, + "balance_loss_mlp": 1.07460487, + "epoch": 0.4746056175452097, + "flos": 484685669376.0, + "grad_norm": 0.055186067432112955, + "language_loss": 0.84493053, + "learning_rate": 0.0005652373562702016, + "loss": 0.85602945, + "num_input_tokens_seen": 205571168, + "router_z_loss_mlp": 0.3527832, + "step": 2467, + "time_per_iteration": 2.6209630966186523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117567, + "balance_loss_mlp": 1.07982516, + "epoch": 0.4747979992304733, + "flos": 461052269568.0, + "grad_norm": 0.06952200013405305, + "language_loss": 0.88642848, + "learning_rate": 0.000564928464493156, + "loss": 0.89760423, + "num_input_tokens_seen": 205639648, + "router_z_loss_mlp": 0.37744141, + "step": 2468, + "time_per_iteration": 2.609154224395752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117938, + "balance_loss_mlp": 1.0807451, + "epoch": 0.4749903809157368, + "flos": 864431212032.0, + "grad_norm": 0.05705018138682977, + "language_loss": 0.81856024, + "learning_rate": 0.000564619547508645, + "loss": 0.82973957, + "num_input_tokens_seen": 205721536, + "router_z_loss_mlp": 0.37158203, + "step": 2469, + "time_per_iteration": 3.041351556777954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117314, + "balance_loss_mlp": 1.07849944, + "epoch": 0.4751827626010004, + "flos": 505546831872.0, + "grad_norm": 0.08036472839994792, + "language_loss": 0.83256048, + "learning_rate": 0.0005643106054366008, + "loss": 0.84373355, + "num_input_tokens_seen": 205788512, + "router_z_loss_mlp": 0.38818359, + "step": 2470, + "time_per_iteration": 2.5631182193756104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108966, + "balance_loss_mlp": 1.07258332, + "epoch": 0.47537514428626393, + "flos": 559388519424.0, + "grad_norm": 0.05805518051262763, + "language_loss": 0.79916292, + "learning_rate": 0.000564001638396965, + "loss": 0.81025255, + "num_input_tokens_seen": 205863104, + "router_z_loss_mlp": 0.36376953, + "step": 2471, + "time_per_iteration": 2.7381579875946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110653, + "balance_loss_mlp": 1.0717926, + "epoch": 0.4755675259715275, + "flos": 834260000256.0, + "grad_norm": 0.0665112346682766, + "language_loss": 0.82313401, + "learning_rate": 0.0005636926465096897, + "loss": 0.83419931, + "num_input_tokens_seen": 205940688, + "router_z_loss_mlp": 0.34741211, + "step": 2472, + "time_per_iteration": 3.0346837043762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111103, + "balance_loss_mlp": 1.07622218, + "epoch": 0.47575990765679105, + "flos": 508237576704.0, + "grad_norm": 0.06532220540392095, + "language_loss": 0.87808621, + "learning_rate": 0.0005633836298947363, + "loss": 0.88919711, + "num_input_tokens_seen": 206008352, + "router_z_loss_mlp": 0.34912109, + "step": 2473, + "time_per_iteration": 2.587581157684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122307, + "balance_loss_mlp": 1.08716393, + "epoch": 0.47595228934205464, + "flos": 591845211648.0, + "grad_norm": 0.09099011339346055, + "language_loss": 0.70947754, + "learning_rate": 0.000563074588672075, + "loss": 0.72070062, + "num_input_tokens_seen": 206078240, + "router_z_loss_mlp": 0.3515625, + "step": 2474, + "time_per_iteration": 2.7112982273101807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125507, + "balance_loss_mlp": 1.09012604, + "epoch": 0.4761446710273182, + "flos": 580607958528.0, + "grad_norm": 0.06360669353624634, + "language_loss": 0.85420531, + "learning_rate": 0.0005627655229616868, + "loss": 0.8654604, + "num_input_tokens_seen": 206148896, + "router_z_loss_mlp": 0.35400391, + "step": 2475, + "time_per_iteration": 2.7166192531585693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131445, + "balance_loss_mlp": 1.09532499, + "epoch": 0.47633705271258175, + "flos": 672893153280.0, + "grad_norm": 0.05566651470752815, + "language_loss": 0.90219474, + "learning_rate": 0.0005624564328835616, + "loss": 0.91350919, + "num_input_tokens_seen": 206223792, + "router_z_loss_mlp": 0.36132812, + "step": 2476, + "time_per_iteration": 2.8342158794403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119281, + "balance_loss_mlp": 1.08339906, + "epoch": 0.47652943439784534, + "flos": 541857931776.0, + "grad_norm": 0.06751222051526788, + "language_loss": 0.8450973, + "learning_rate": 0.0005621473185576986, + "loss": 0.85629016, + "num_input_tokens_seen": 206299376, + "router_z_loss_mlp": 0.35913086, + "step": 2477, + "time_per_iteration": 2.727320432662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126891, + "balance_loss_mlp": 1.0915097, + "epoch": 0.4767218160831089, + "flos": 524819243520.0, + "grad_norm": 0.06498777385437565, + "language_loss": 0.87181318, + "learning_rate": 0.0005618381801041068, + "loss": 0.88308215, + "num_input_tokens_seen": 206367936, + "router_z_loss_mlp": 0.35400391, + "step": 2478, + "time_per_iteration": 2.622197389602661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136775, + "balance_loss_mlp": 1.09965336, + "epoch": 0.47691419776837246, + "flos": 568056167424.0, + "grad_norm": 0.0693017023966873, + "language_loss": 0.83176625, + "learning_rate": 0.0005615290176428044, + "loss": 0.84313405, + "num_input_tokens_seen": 206438864, + "router_z_loss_mlp": 0.37084961, + "step": 2479, + "time_per_iteration": 2.6874895095825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137347, + "balance_loss_mlp": 1.10275292, + "epoch": 0.477106579453636, + "flos": 530931967488.0, + "grad_norm": 0.06633902685884922, + "language_loss": 0.85015559, + "learning_rate": 0.0005612198312938187, + "loss": 0.86152905, + "num_input_tokens_seen": 206516656, + "router_z_loss_mlp": 0.34619141, + "step": 2480, + "time_per_iteration": 2.7283356189727783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143197, + "balance_loss_mlp": 1.10717165, + "epoch": 0.4772989611388996, + "flos": 594283765248.0, + "grad_norm": 0.08700724997250119, + "language_loss": 0.79558903, + "learning_rate": 0.0005609106211771868, + "loss": 0.80702102, + "num_input_tokens_seen": 206595040, + "router_z_loss_mlp": 0.36035156, + "step": 2481, + "time_per_iteration": 2.8008668422698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155105, + "balance_loss_mlp": 1.11857891, + "epoch": 0.4774913428241631, + "flos": 544622828544.0, + "grad_norm": 0.07115217474866456, + "language_loss": 0.89249581, + "learning_rate": 0.0005606013874129543, + "loss": 0.90404689, + "num_input_tokens_seen": 206670192, + "router_z_loss_mlp": 0.36523438, + "step": 2482, + "time_per_iteration": 2.746906280517578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146439, + "balance_loss_mlp": 1.11027122, + "epoch": 0.4776837245094267, + "flos": 540079031808.0, + "grad_norm": 0.052135079835272054, + "language_loss": 0.80459106, + "learning_rate": 0.0005602921301211768, + "loss": 0.81605548, + "num_input_tokens_seen": 206746992, + "router_z_loss_mlp": 0.36181641, + "step": 2483, + "time_per_iteration": 2.760091543197632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133668, + "balance_loss_mlp": 1.09895456, + "epoch": 0.4778761061946903, + "flos": 471785513472.0, + "grad_norm": 0.06775745953777351, + "language_loss": 0.82220864, + "learning_rate": 0.0005599828494219185, + "loss": 0.83354533, + "num_input_tokens_seen": 206813584, + "router_z_loss_mlp": 0.34716797, + "step": 2484, + "time_per_iteration": 2.5458662509918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113545, + "balance_loss_mlp": 1.10004473, + "epoch": 0.4780684878799538, + "flos": 726082527744.0, + "grad_norm": 0.08200141457856946, + "language_loss": 0.89550984, + "learning_rate": 0.0005596735454352527, + "loss": 0.90686429, + "num_input_tokens_seen": 206885840, + "router_z_loss_mlp": 0.35424805, + "step": 2485, + "time_per_iteration": 2.8570785522460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143886, + "balance_loss_mlp": 1.1075511, + "epoch": 0.4782608695652174, + "flos": 548922147840.0, + "grad_norm": 0.07792091337932193, + "language_loss": 0.85635722, + "learning_rate": 0.0005593642182812619, + "loss": 0.86779606, + "num_input_tokens_seen": 206955104, + "router_z_loss_mlp": 0.36352539, + "step": 2486, + "time_per_iteration": 2.630213975906372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139867, + "balance_loss_mlp": 1.10369921, + "epoch": 0.47845325125048094, + "flos": 829923604992.0, + "grad_norm": 0.06102595686098437, + "language_loss": 0.83692348, + "learning_rate": 0.0005590548680800378, + "loss": 0.84832209, + "num_input_tokens_seen": 207039792, + "router_z_loss_mlp": 0.36206055, + "step": 2487, + "time_per_iteration": 3.1342179775238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139371, + "balance_loss_mlp": 1.10389483, + "epoch": 0.4786456329357445, + "flos": 514164920832.0, + "grad_norm": 0.0657277256500081, + "language_loss": 0.76383913, + "learning_rate": 0.0005587454949516804, + "loss": 0.77523285, + "num_input_tokens_seen": 207115632, + "router_z_loss_mlp": 0.35498047, + "step": 2488, + "time_per_iteration": 2.6958112716674805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145548, + "balance_loss_mlp": 1.10833097, + "epoch": 0.47883801462100806, + "flos": 564658781184.0, + "grad_norm": 0.061160167550216256, + "language_loss": 0.88185161, + "learning_rate": 0.0005584360990162993, + "loss": 0.89330709, + "num_input_tokens_seen": 207184336, + "router_z_loss_mlp": 0.37255859, + "step": 2489, + "time_per_iteration": 2.61667537689209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133811, + "balance_loss_mlp": 1.09881115, + "epoch": 0.47903039630627164, + "flos": 579577545216.0, + "grad_norm": 0.0507120714137282, + "language_loss": 0.85551566, + "learning_rate": 0.0005581266803940124, + "loss": 0.86685371, + "num_input_tokens_seen": 207258720, + "router_z_loss_mlp": 0.35009766, + "step": 2490, + "time_per_iteration": 2.7139766216278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133471, + "balance_loss_mlp": 1.09649253, + "epoch": 0.47922277799153523, + "flos": 618950149632.0, + "grad_norm": 0.0583035541715914, + "language_loss": 0.87154239, + "learning_rate": 0.0005578172392049471, + "loss": 0.88287711, + "num_input_tokens_seen": 207329216, + "router_z_loss_mlp": 0.36987305, + "step": 2491, + "time_per_iteration": 2.7481577396392822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134521, + "balance_loss_mlp": 1.09918737, + "epoch": 0.47941515967679876, + "flos": 639653096448.0, + "grad_norm": 0.08141144255217014, + "language_loss": 0.84311044, + "learning_rate": 0.0005575077755692386, + "loss": 0.85445559, + "num_input_tokens_seen": 207403712, + "router_z_loss_mlp": 0.35351562, + "step": 2492, + "time_per_iteration": 2.7962934970855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132576, + "balance_loss_mlp": 1.09793389, + "epoch": 0.47960754136206235, + "flos": 519823194624.0, + "grad_norm": 0.053456927876726165, + "language_loss": 0.86199152, + "learning_rate": 0.0005571982896070316, + "loss": 0.87331724, + "num_input_tokens_seen": 207477120, + "router_z_loss_mlp": 0.34692383, + "step": 2493, + "time_per_iteration": 2.6755988597869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131371, + "balance_loss_mlp": 1.09534633, + "epoch": 0.4797999230473259, + "flos": 475044507648.0, + "grad_norm": 0.059320296473078654, + "language_loss": 0.89793247, + "learning_rate": 0.0005568887814384792, + "loss": 0.90924621, + "num_input_tokens_seen": 207544592, + "router_z_loss_mlp": 0.36035156, + "step": 2494, + "time_per_iteration": 2.5790021419525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139931, + "balance_loss_mlp": 1.1042639, + "epoch": 0.47999230473258947, + "flos": 532026620928.0, + "grad_norm": 0.061123462827233396, + "language_loss": 0.87048668, + "learning_rate": 0.000556579251183743, + "loss": 0.88188601, + "num_input_tokens_seen": 207613808, + "router_z_loss_mlp": 0.35693359, + "step": 2495, + "time_per_iteration": 2.6916205883026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134229, + "balance_loss_mlp": 1.0992769, + "epoch": 0.480184686417853, + "flos": 601486373376.0, + "grad_norm": 0.05789705924573782, + "language_loss": 0.80256224, + "learning_rate": 0.0005562696989629936, + "loss": 0.81390452, + "num_input_tokens_seen": 207684464, + "router_z_loss_mlp": 0.34960938, + "step": 2496, + "time_per_iteration": 2.690638542175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133544, + "balance_loss_mlp": 1.0990684, + "epoch": 0.4803770681031166, + "flos": 528196806144.0, + "grad_norm": 0.06114364023526716, + "language_loss": 0.82642174, + "learning_rate": 0.0005559601248964095, + "loss": 0.83775711, + "num_input_tokens_seen": 207754016, + "router_z_loss_mlp": 0.34521484, + "step": 2497, + "time_per_iteration": 2.6249618530273438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135944, + "balance_loss_mlp": 1.10249412, + "epoch": 0.4805694497883801, + "flos": 511192622592.0, + "grad_norm": 0.06899971908711858, + "language_loss": 0.85956562, + "learning_rate": 0.0005556505291041783, + "loss": 0.87092507, + "num_input_tokens_seen": 207827104, + "router_z_loss_mlp": 0.33447266, + "step": 2498, + "time_per_iteration": 2.7098748683929443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135161, + "balance_loss_mlp": 1.10097158, + "epoch": 0.4807618314736437, + "flos": 600342160896.0, + "grad_norm": 0.055207166893370456, + "language_loss": 0.84689957, + "learning_rate": 0.0005553409117064954, + "loss": 0.85825121, + "num_input_tokens_seen": 207907824, + "router_z_loss_mlp": 0.34228516, + "step": 2499, + "time_per_iteration": 2.8708267211914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136874, + "balance_loss_mlp": 1.10242295, + "epoch": 0.4809542131589073, + "flos": 568965441024.0, + "grad_norm": 0.06687134527330599, + "language_loss": 0.8476308, + "learning_rate": 0.0005550312728235654, + "loss": 0.85899949, + "num_input_tokens_seen": 207975632, + "router_z_loss_mlp": 0.34448242, + "step": 2500, + "time_per_iteration": 2.6980721950531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128863, + "balance_loss_mlp": 1.09500802, + "epoch": 0.4811465948441708, + "flos": 575994779136.0, + "grad_norm": 0.07829313389837793, + "language_loss": 0.83860761, + "learning_rate": 0.0005547216125756003, + "loss": 0.84989619, + "num_input_tokens_seen": 208048000, + "router_z_loss_mlp": 0.33862305, + "step": 2501, + "time_per_iteration": 2.737539291381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140174, + "balance_loss_mlp": 1.10729611, + "epoch": 0.4813389765294344, + "flos": 823865209344.0, + "grad_norm": 0.06644553638954338, + "language_loss": 0.82266629, + "learning_rate": 0.0005544119310828211, + "loss": 0.83406806, + "num_input_tokens_seen": 208132592, + "router_z_loss_mlp": 0.32885742, + "step": 2502, + "time_per_iteration": 3.082392930984497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125836, + "balance_loss_mlp": 1.09245706, + "epoch": 0.48153135821469795, + "flos": 635531816448.0, + "grad_norm": 0.061244964440333595, + "language_loss": 0.85365945, + "learning_rate": 0.0005541022284654568, + "loss": 0.86491781, + "num_input_tokens_seen": 208215824, + "router_z_loss_mlp": 0.33398438, + "step": 2503, + "time_per_iteration": 2.9372761249542236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125034, + "balance_loss_mlp": 1.09189391, + "epoch": 0.48172373989996153, + "flos": 503701120512.0, + "grad_norm": 0.06168262746563105, + "language_loss": 0.84156048, + "learning_rate": 0.0005537925048437446, + "loss": 0.8528108, + "num_input_tokens_seen": 208284304, + "router_z_loss_mlp": 0.33154297, + "step": 2504, + "time_per_iteration": 2.589538097381592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052506, + "balance_loss_mlp": 1.04296899, + "epoch": 0.48191612158522507, + "flos": 1532362074624.0, + "grad_norm": 0.02361726537833674, + "language_loss": 0.75751472, + "learning_rate": 0.00055348276033793, + "loss": 0.7680397, + "num_input_tokens_seen": 208510224, + "router_z_loss_mlp": 0.09521484, + "step": 2505, + "time_per_iteration": 4.908772230148315 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111219, + "balance_loss_mlp": 1.07819104, + "epoch": 0.48210850327048865, + "flos": 702424161792.0, + "grad_norm": 0.056356974386017084, + "language_loss": 0.88423991, + "learning_rate": 0.0005531729950682664, + "loss": 0.89536178, + "num_input_tokens_seen": 208596816, + "router_z_loss_mlp": 0.34008789, + "step": 2506, + "time_per_iteration": 3.003096580505371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108061, + "balance_loss_mlp": 1.0739913, + "epoch": 0.4823008849557522, + "flos": 439778502144.0, + "grad_norm": 0.08388532833554185, + "language_loss": 0.85083711, + "learning_rate": 0.000552863209155015, + "loss": 0.86191773, + "num_input_tokens_seen": 208659616, + "router_z_loss_mlp": 0.34082031, + "step": 2507, + "time_per_iteration": 2.511463165283203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106022, + "balance_loss_mlp": 1.07331145, + "epoch": 0.48249326664101577, + "flos": 471859665408.0, + "grad_norm": 0.05856414722035687, + "language_loss": 0.82348502, + "learning_rate": 0.0005525534027184461, + "loss": 0.83454525, + "num_input_tokens_seen": 208728080, + "router_z_loss_mlp": 0.32714844, + "step": 2508, + "time_per_iteration": 2.6477487087249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102119, + "balance_loss_mlp": 1.06993294, + "epoch": 0.48268564832627936, + "flos": 563225674752.0, + "grad_norm": 0.054304228935087996, + "language_loss": 0.83357495, + "learning_rate": 0.0005522435758788365, + "loss": 0.84459615, + "num_input_tokens_seen": 208803376, + "router_z_loss_mlp": 0.32177734, + "step": 2509, + "time_per_iteration": 2.715082883834839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102008, + "balance_loss_mlp": 1.06741309, + "epoch": 0.4828780300115429, + "flos": 629606670336.0, + "grad_norm": 0.07316920081788965, + "language_loss": 0.80354846, + "learning_rate": 0.0005519337287564721, + "loss": 0.81456852, + "num_input_tokens_seen": 208876656, + "router_z_loss_mlp": 0.34594727, + "step": 2510, + "time_per_iteration": 2.8216280937194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103913, + "balance_loss_mlp": 1.07225132, + "epoch": 0.4830704116968065, + "flos": 631850305536.0, + "grad_norm": 0.07052632360826482, + "language_loss": 0.83703697, + "learning_rate": 0.000551623861471646, + "loss": 0.84807611, + "num_input_tokens_seen": 208950224, + "router_z_loss_mlp": 0.31640625, + "step": 2511, + "time_per_iteration": 2.7521867752075195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028829, + "balance_loss_mlp": 1.01886296, + "epoch": 0.48326279338207, + "flos": 1569268588032.0, + "grad_norm": 0.02307493847576384, + "language_loss": 0.78818834, + "learning_rate": 0.0005513139741446594, + "loss": 0.79847658, + "num_input_tokens_seen": 209173984, + "router_z_loss_mlp": 0.09960938, + "step": 2512, + "time_per_iteration": 4.850410461425781 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110192, + "balance_loss_mlp": 1.06877947, + "epoch": 0.4834551750673336, + "flos": 509238254592.0, + "grad_norm": 0.060960940408773784, + "language_loss": 0.86943817, + "learning_rate": 0.0005510040668958211, + "loss": 0.88045734, + "num_input_tokens_seen": 209242832, + "router_z_loss_mlp": 0.33154297, + "step": 2513, + "time_per_iteration": 2.5581674575805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011471, + "balance_loss_mlp": 1.00145698, + "epoch": 0.48364755675259713, + "flos": 1528663311360.0, + "grad_norm": 0.01573295897448314, + "language_loss": 0.77760583, + "learning_rate": 0.0005506941398454483, + "loss": 0.78772056, + "num_input_tokens_seen": 209473520, + "router_z_loss_mlp": 0.10009766, + "step": 2514, + "time_per_iteration": 4.821207523345947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101449, + "balance_loss_mlp": 1.06876206, + "epoch": 0.4838399384378607, + "flos": 564989893632.0, + "grad_norm": 0.06635931409503217, + "language_loss": 0.8316704, + "learning_rate": 0.0005503841931138645, + "loss": 0.84268492, + "num_input_tokens_seen": 209544208, + "router_z_loss_mlp": 0.3269043, + "step": 2515, + "time_per_iteration": 2.6826930046081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109492, + "balance_loss_mlp": 1.06247151, + "epoch": 0.4840323201231243, + "flos": 387691121664.0, + "grad_norm": 0.07963111819885421, + "language_loss": 0.81975293, + "learning_rate": 0.0005500742268214025, + "loss": 0.83070219, + "num_input_tokens_seen": 209607408, + "router_z_loss_mlp": 0.32446289, + "step": 2516, + "time_per_iteration": 2.4913811683654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109084, + "balance_loss_mlp": 1.07763672, + "epoch": 0.48422470180838784, + "flos": 630995360256.0, + "grad_norm": 0.057140457991015275, + "language_loss": 0.85559756, + "learning_rate": 0.0005497642410884014, + "loss": 0.86668837, + "num_input_tokens_seen": 209683392, + "router_z_loss_mlp": 0.31420898, + "step": 2517, + "time_per_iteration": 2.7807135581970215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101598, + "balance_loss_mlp": 1.06855321, + "epoch": 0.4844170834936514, + "flos": 499226333184.0, + "grad_norm": 0.05176470538316484, + "language_loss": 0.85257566, + "learning_rate": 0.0005494542360352085, + "loss": 0.86359167, + "num_input_tokens_seen": 209753184, + "router_z_loss_mlp": 0.33056641, + "step": 2518, + "time_per_iteration": 2.653507947921753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115114, + "balance_loss_mlp": 1.08285642, + "epoch": 0.48460946517891496, + "flos": 551076576768.0, + "grad_norm": 0.0599313447084905, + "language_loss": 0.85512084, + "learning_rate": 0.0005491442117821783, + "loss": 0.86627203, + "num_input_tokens_seen": 209829568, + "router_z_loss_mlp": 0.32226562, + "step": 2519, + "time_per_iteration": 2.717984676361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116939, + "balance_loss_mlp": 1.08325005, + "epoch": 0.48480184686417854, + "flos": 529390204416.0, + "grad_norm": 0.0649010079315795, + "language_loss": 0.87622237, + "learning_rate": 0.0005488341684496732, + "loss": 0.88739175, + "num_input_tokens_seen": 209902176, + "router_z_loss_mlp": 0.33691406, + "step": 2520, + "time_per_iteration": 2.652135133743286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108566, + "balance_loss_mlp": 1.07606971, + "epoch": 0.4849942285494421, + "flos": 531912821760.0, + "grad_norm": 0.06559854904132026, + "language_loss": 0.92200404, + "learning_rate": 0.0005485241061580624, + "loss": 0.93308973, + "num_input_tokens_seen": 209969168, + "router_z_loss_mlp": 0.32495117, + "step": 2521, + "time_per_iteration": 2.7108826637268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102755, + "balance_loss_mlp": 1.07037747, + "epoch": 0.48518661023470566, + "flos": 722578682880.0, + "grad_norm": 0.055876909605250345, + "language_loss": 0.84836948, + "learning_rate": 0.0005482140250277228, + "loss": 0.85939705, + "num_input_tokens_seen": 210049616, + "router_z_loss_mlp": 0.32373047, + "step": 2522, + "time_per_iteration": 2.997586965560913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105629, + "balance_loss_mlp": 1.07408667, + "epoch": 0.4853789919199692, + "flos": 506105169408.0, + "grad_norm": 0.07027884549034326, + "language_loss": 0.87641776, + "learning_rate": 0.0005479039251790387, + "loss": 0.88747412, + "num_input_tokens_seen": 210118512, + "router_z_loss_mlp": 0.31518555, + "step": 2523, + "time_per_iteration": 2.611929416656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096851, + "balance_loss_mlp": 1.06478369, + "epoch": 0.4855713736052328, + "flos": 660487094784.0, + "grad_norm": 0.061509725516535926, + "language_loss": 0.8502717, + "learning_rate": 0.0005475938067324014, + "loss": 0.86124021, + "num_input_tokens_seen": 210193728, + "router_z_loss_mlp": 0.32055664, + "step": 2524, + "time_per_iteration": 2.8200628757476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105888, + "balance_loss_mlp": 1.07339168, + "epoch": 0.48576375529049637, + "flos": 436959277056.0, + "grad_norm": 0.064836171654712, + "language_loss": 0.83736813, + "learning_rate": 0.0005472836698082098, + "loss": 0.84842694, + "num_input_tokens_seen": 210258832, + "router_z_loss_mlp": 0.32495117, + "step": 2525, + "time_per_iteration": 2.4986329078674316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100608, + "balance_loss_mlp": 1.06763458, + "epoch": 0.4859561369757599, + "flos": 581707381248.0, + "grad_norm": 0.05406459595211624, + "language_loss": 0.8394289, + "learning_rate": 0.0005469735145268694, + "loss": 0.8504349, + "num_input_tokens_seen": 210335280, + "router_z_loss_mlp": 0.32983398, + "step": 2526, + "time_per_iteration": 2.7246296405792236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107934, + "balance_loss_mlp": 1.07455492, + "epoch": 0.4861485186610235, + "flos": 487964487168.0, + "grad_norm": 0.0623071528474554, + "language_loss": 0.8099308, + "learning_rate": 0.0005466633410087933, + "loss": 0.82101017, + "num_input_tokens_seen": 210407072, + "router_z_loss_mlp": 0.33398438, + "step": 2527, + "time_per_iteration": 2.660274028778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049375, + "balance_loss_mlp": 1.03955197, + "epoch": 0.486340900346287, + "flos": 1557734727168.0, + "grad_norm": 0.029762737629489368, + "language_loss": 0.77260822, + "learning_rate": 0.0005463531493744017, + "loss": 0.78310198, + "num_input_tokens_seen": 210644544, + "router_z_loss_mlp": 0.09814453, + "step": 2528, + "time_per_iteration": 4.886114835739136 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098205, + "balance_loss_mlp": 1.06663859, + "epoch": 0.4865332820315506, + "flos": 483005514240.0, + "grad_norm": 0.05348067523581763, + "language_loss": 0.88341582, + "learning_rate": 0.0005460429397441214, + "loss": 0.89439785, + "num_input_tokens_seen": 210711760, + "router_z_loss_mlp": 0.31542969, + "step": 2529, + "time_per_iteration": 2.556168794631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096764, + "balance_loss_mlp": 1.06572175, + "epoch": 0.48672566371681414, + "flos": 535809447936.0, + "grad_norm": 0.07361694113297405, + "language_loss": 0.86787206, + "learning_rate": 0.0005457327122383866, + "loss": 0.87883973, + "num_input_tokens_seen": 210783040, + "router_z_loss_mlp": 0.31030273, + "step": 2530, + "time_per_iteration": 2.6101198196411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032017, + "balance_loss_mlp": 1.02248013, + "epoch": 0.4869180454020777, + "flos": 1412665422336.0, + "grad_norm": 0.016416545513431694, + "language_loss": 0.74636483, + "learning_rate": 0.0005454224669776385, + "loss": 0.75668502, + "num_input_tokens_seen": 211002128, + "router_z_loss_mlp": 0.09521484, + "step": 2531, + "time_per_iteration": 4.807017803192139 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102878, + "balance_loss_mlp": 1.071383, + "epoch": 0.48711042708734126, + "flos": 573113885184.0, + "grad_norm": 0.061169122564006716, + "language_loss": 0.75803703, + "learning_rate": 0.0005451122040823244, + "loss": 0.7690658, + "num_input_tokens_seen": 211080080, + "router_z_loss_mlp": 0.31469727, + "step": 2532, + "time_per_iteration": 2.778230667114258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110046, + "balance_loss_mlp": 1.07611895, + "epoch": 0.48730280877260485, + "flos": 626547737088.0, + "grad_norm": 0.05283553568044795, + "language_loss": 0.77404439, + "learning_rate": 0.0005448019236728997, + "loss": 0.78514493, + "num_input_tokens_seen": 211162944, + "router_z_loss_mlp": 0.33959961, + "step": 2533, + "time_per_iteration": 2.8531336784362793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106047, + "balance_loss_mlp": 1.07521987, + "epoch": 0.48749519045786843, + "flos": 512479996416.0, + "grad_norm": 0.06480756266699016, + "language_loss": 0.84952033, + "learning_rate": 0.0005444916258698255, + "loss": 0.8605808, + "num_input_tokens_seen": 211230448, + "router_z_loss_mlp": 0.30810547, + "step": 2534, + "time_per_iteration": 2.5989930629730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108181, + "balance_loss_mlp": 1.07701969, + "epoch": 0.48768757214313196, + "flos": 525414657024.0, + "grad_norm": 0.058540646847924545, + "language_loss": 0.8623631, + "learning_rate": 0.0005441813107935704, + "loss": 0.87344491, + "num_input_tokens_seen": 211301248, + "router_z_loss_mlp": 0.31152344, + "step": 2535, + "time_per_iteration": 2.6970572471618652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115883, + "balance_loss_mlp": 1.0836966, + "epoch": 0.48787995382839555, + "flos": 505032910848.0, + "grad_norm": 0.06249509461195645, + "language_loss": 0.85908329, + "learning_rate": 0.0005438709785646091, + "loss": 0.87024212, + "num_input_tokens_seen": 211369888, + "router_z_loss_mlp": 0.32177734, + "step": 2536, + "time_per_iteration": 2.5461835861206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109072, + "balance_loss_mlp": 1.07688498, + "epoch": 0.4880723355136591, + "flos": 575172140544.0, + "grad_norm": 0.06859245202813889, + "language_loss": 0.87149572, + "learning_rate": 0.0005435606293034234, + "loss": 0.88258648, + "num_input_tokens_seen": 211441808, + "router_z_loss_mlp": 0.32177734, + "step": 2537, + "time_per_iteration": 2.6585540771484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100316, + "balance_loss_mlp": 1.07018018, + "epoch": 0.48826471719892267, + "flos": 561444203520.0, + "grad_norm": 0.07107602922960535, + "language_loss": 0.84916604, + "learning_rate": 0.0005432502631305016, + "loss": 0.86016917, + "num_input_tokens_seen": 211511216, + "router_z_loss_mlp": 0.30126953, + "step": 2538, + "time_per_iteration": 2.6976583003997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103363, + "balance_loss_mlp": 1.07055688, + "epoch": 0.4884570988841862, + "flos": 726188613120.0, + "grad_norm": 0.04961852862161645, + "language_loss": 0.83663356, + "learning_rate": 0.0005429398801663386, + "loss": 0.84766722, + "num_input_tokens_seen": 211589264, + "router_z_loss_mlp": 0.32788086, + "step": 2539, + "time_per_iteration": 2.9294815063476562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101134, + "balance_loss_mlp": 1.06916165, + "epoch": 0.4886494805694498, + "flos": 431019449856.0, + "grad_norm": 0.06193008336457455, + "language_loss": 0.83023834, + "learning_rate": 0.0005426294805314355, + "loss": 0.84124964, + "num_input_tokens_seen": 211652928, + "router_z_loss_mlp": 0.31958008, + "step": 2540, + "time_per_iteration": 2.5207223892211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099167, + "balance_loss_mlp": 1.06593108, + "epoch": 0.4888418622547134, + "flos": 673006579200.0, + "grad_norm": 0.0603925409034683, + "language_loss": 0.80357647, + "learning_rate": 0.0005423190643463003, + "loss": 0.8145681, + "num_input_tokens_seen": 211741664, + "router_z_loss_mlp": 0.33251953, + "step": 2541, + "time_per_iteration": 3.0720365047454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101012, + "balance_loss_mlp": 1.06915879, + "epoch": 0.4890342439399769, + "flos": 541897579008.0, + "grad_norm": 0.0609118032347285, + "language_loss": 0.83149743, + "learning_rate": 0.0005420086317314473, + "loss": 0.84250748, + "num_input_tokens_seen": 211809136, + "router_z_loss_mlp": 0.31835938, + "step": 2542, + "time_per_iteration": 2.7291080951690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098222, + "balance_loss_mlp": 1.06470084, + "epoch": 0.4892266256252405, + "flos": 590676406272.0, + "grad_norm": 0.056070719415307675, + "language_loss": 0.81426919, + "learning_rate": 0.0005416981828073971, + "loss": 0.8252514, + "num_input_tokens_seen": 211883136, + "router_z_loss_mlp": 0.33544922, + "step": 2543, + "time_per_iteration": 2.7784368991851807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033858, + "balance_loss_mlp": 1.02441669, + "epoch": 0.48941900731050403, + "flos": 1516296526848.0, + "grad_norm": 0.02316516352555082, + "language_loss": 0.77115011, + "learning_rate": 0.0005413877176946765, + "loss": 0.78148878, + "num_input_tokens_seen": 212117488, + "router_z_loss_mlp": 0.09423828, + "step": 2544, + "time_per_iteration": 4.838131666183472 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102685, + "balance_loss_mlp": 1.07023609, + "epoch": 0.4896113889957676, + "flos": 470564951040.0, + "grad_norm": 0.07449943721079477, + "language_loss": 0.85317016, + "learning_rate": 0.000541077236513819, + "loss": 0.86419702, + "num_input_tokens_seen": 212181952, + "router_z_loss_mlp": 0.32446289, + "step": 2545, + "time_per_iteration": 2.5264503955841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101803, + "balance_loss_mlp": 1.07071328, + "epoch": 0.48980377068103115, + "flos": 496557983232.0, + "grad_norm": 0.056060473734182076, + "language_loss": 0.82499588, + "learning_rate": 0.0005407667393853638, + "loss": 0.83601391, + "num_input_tokens_seen": 212252608, + "router_z_loss_mlp": 0.31054688, + "step": 2546, + "time_per_iteration": 2.66180157661438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099582, + "balance_loss_mlp": 1.06699038, + "epoch": 0.48999615236629473, + "flos": 692852382720.0, + "grad_norm": 0.06590134685442105, + "language_loss": 0.83337891, + "learning_rate": 0.0005404562264298569, + "loss": 0.84437472, + "num_input_tokens_seen": 212328560, + "router_z_loss_mlp": 0.32592773, + "step": 2547, + "time_per_iteration": 2.8525304794311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098759, + "balance_loss_mlp": 1.06390238, + "epoch": 0.49018853405155827, + "flos": 541694946816.0, + "grad_norm": 0.05425762766620139, + "language_loss": 0.83855128, + "learning_rate": 0.0005401456977678498, + "loss": 0.84953886, + "num_input_tokens_seen": 212399616, + "router_z_loss_mlp": 0.34838867, + "step": 2548, + "time_per_iteration": 2.6519198417663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098656, + "balance_loss_mlp": 1.06561112, + "epoch": 0.49038091573682185, + "flos": 695663894016.0, + "grad_norm": 0.06384769679028596, + "language_loss": 0.77718782, + "learning_rate": 0.0005398351535199008, + "loss": 0.78817439, + "num_input_tokens_seen": 212482352, + "router_z_loss_mlp": 0.33056641, + "step": 2549, + "time_per_iteration": 3.0877339839935303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096351, + "balance_loss_mlp": 1.06499887, + "epoch": 0.49057329742208544, + "flos": 596902929408.0, + "grad_norm": 0.053089286344054805, + "language_loss": 0.83930391, + "learning_rate": 0.0005395245938065735, + "loss": 0.85026741, + "num_input_tokens_seen": 212559504, + "router_z_loss_mlp": 0.31347656, + "step": 2550, + "time_per_iteration": 2.8241264820098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099597, + "balance_loss_mlp": 1.06669557, + "epoch": 0.490765679107349, + "flos": 513406522368.0, + "grad_norm": 0.0641036113016549, + "language_loss": 0.82636213, + "learning_rate": 0.0005392140187484379, + "loss": 0.83735812, + "num_input_tokens_seen": 212625664, + "router_z_loss_mlp": 0.32885742, + "step": 2551, + "time_per_iteration": 2.593710422515869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105531, + "balance_loss_mlp": 1.07332087, + "epoch": 0.49095806079261256, + "flos": 629606670336.0, + "grad_norm": 0.06156906510059403, + "language_loss": 0.89866853, + "learning_rate": 0.0005389034284660701, + "loss": 0.90972388, + "num_input_tokens_seen": 212702000, + "router_z_loss_mlp": 0.32202148, + "step": 2552, + "time_per_iteration": 2.8167800903320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112217, + "balance_loss_mlp": 1.07957709, + "epoch": 0.4911504424778761, + "flos": 915307941888.0, + "grad_norm": 0.06543971253041776, + "language_loss": 0.82440078, + "learning_rate": 0.000538592823080052, + "loss": 0.83552289, + "num_input_tokens_seen": 212785376, + "router_z_loss_mlp": 0.32641602, + "step": 2553, + "time_per_iteration": 3.190459966659546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110985, + "balance_loss_mlp": 1.07817876, + "epoch": 0.4913428241631397, + "flos": 438943380480.0, + "grad_norm": 0.061393832790464745, + "language_loss": 0.85407627, + "learning_rate": 0.000538282202710971, + "loss": 0.8651861, + "num_input_tokens_seen": 212848176, + "router_z_loss_mlp": 0.328125, + "step": 2554, + "time_per_iteration": 2.5911953449249268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111409, + "balance_loss_mlp": 1.07907963, + "epoch": 0.4915352058484032, + "flos": 636092725248.0, + "grad_norm": 0.06886607309109279, + "language_loss": 0.82350785, + "learning_rate": 0.000537971567479421, + "loss": 0.83462197, + "num_input_tokens_seen": 212917888, + "router_z_loss_mlp": 0.32324219, + "step": 2555, + "time_per_iteration": 2.7882654666900635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110188, + "balance_loss_mlp": 1.07783484, + "epoch": 0.4917275875336668, + "flos": 504518989824.0, + "grad_norm": 0.07781814230506547, + "language_loss": 0.87956369, + "learning_rate": 0.0005376609175060011, + "loss": 0.89066565, + "num_input_tokens_seen": 212986288, + "router_z_loss_mlp": 0.32348633, + "step": 2556, + "time_per_iteration": 2.6131739616394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121597, + "balance_loss_mlp": 1.08850408, + "epoch": 0.49191996921893033, + "flos": 654547267584.0, + "grad_norm": 0.07736545907619681, + "language_loss": 0.80871999, + "learning_rate": 0.0005373502529113162, + "loss": 0.81993598, + "num_input_tokens_seen": 213059504, + "router_z_loss_mlp": 0.33105469, + "step": 2557, + "time_per_iteration": 2.8115434646606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125925, + "balance_loss_mlp": 1.09154499, + "epoch": 0.4921123509041939, + "flos": 492359980032.0, + "grad_norm": 0.06369400741363575, + "language_loss": 0.81534445, + "learning_rate": 0.0005370395738159773, + "loss": 0.82660365, + "num_input_tokens_seen": 213129984, + "router_z_loss_mlp": 0.34375, + "step": 2558, + "time_per_iteration": 2.645482063293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134081, + "balance_loss_mlp": 1.10120285, + "epoch": 0.4923047325894575, + "flos": 546167162880.0, + "grad_norm": 0.06840745530844954, + "language_loss": 0.83582544, + "learning_rate": 0.0005367288803406003, + "loss": 0.84716624, + "num_input_tokens_seen": 213199184, + "router_z_loss_mlp": 0.32885742, + "step": 2559, + "time_per_iteration": 2.6290056705474854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113348, + "balance_loss_mlp": 1.09895754, + "epoch": 0.49249711427472104, + "flos": 596473072128.0, + "grad_norm": 0.06026988921967747, + "language_loss": 0.81393933, + "learning_rate": 0.0005364181726058073, + "loss": 0.82527417, + "num_input_tokens_seen": 213272480, + "router_z_loss_mlp": 0.34545898, + "step": 2560, + "time_per_iteration": 2.683072805404663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113164, + "balance_loss_mlp": 1.09771323, + "epoch": 0.4926894959599846, + "flos": 497825533440.0, + "grad_norm": 0.10364093826622443, + "language_loss": 0.8257041, + "learning_rate": 0.0005361074507322261, + "loss": 0.83702052, + "num_input_tokens_seen": 213338704, + "router_z_loss_mlp": 0.33935547, + "step": 2561, + "time_per_iteration": 2.5988388061523438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127176, + "balance_loss_mlp": 1.09420276, + "epoch": 0.49288187764524816, + "flos": 536130648576.0, + "grad_norm": 0.08607714934124724, + "language_loss": 0.81995922, + "learning_rate": 0.000535796714840489, + "loss": 0.831231, + "num_input_tokens_seen": 213406016, + "router_z_loss_mlp": 0.32983398, + "step": 2562, + "time_per_iteration": 2.617560625076294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124157, + "balance_loss_mlp": 1.09099317, + "epoch": 0.49307425933051174, + "flos": 641555707392.0, + "grad_norm": 0.06602924000575079, + "language_loss": 0.84137893, + "learning_rate": 0.0005354859650512348, + "loss": 0.85262048, + "num_input_tokens_seen": 213474016, + "router_z_loss_mlp": 0.33154297, + "step": 2563, + "time_per_iteration": 2.7547245025634766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118883, + "balance_loss_mlp": 1.08707833, + "epoch": 0.4932666410157753, + "flos": 516252911616.0, + "grad_norm": 0.060127327089604984, + "language_loss": 0.87529951, + "learning_rate": 0.0005351752014851074, + "loss": 0.88648832, + "num_input_tokens_seen": 213539696, + "router_z_loss_mlp": 0.31787109, + "step": 2564, + "time_per_iteration": 2.5543923377990723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115017, + "balance_loss_mlp": 1.08199644, + "epoch": 0.49345902270103886, + "flos": 601503625728.0, + "grad_norm": 0.057267908508465526, + "language_loss": 0.83867848, + "learning_rate": 0.0005348644242627553, + "loss": 0.84982872, + "num_input_tokens_seen": 213609504, + "router_z_loss_mlp": 0.33032227, + "step": 2565, + "time_per_iteration": 2.7361738681793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074248, + "balance_loss_mlp": 1.06585574, + "epoch": 0.49365140438630245, + "flos": 1493673716736.0, + "grad_norm": 0.028047824457769776, + "language_loss": 0.75286627, + "learning_rate": 0.0005345536335048336, + "loss": 0.76360869, + "num_input_tokens_seen": 213846064, + "router_z_loss_mlp": 0.08398438, + "step": 2566, + "time_per_iteration": 4.955476760864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126385, + "balance_loss_mlp": 1.09605825, + "epoch": 0.493843786071566, + "flos": 629599329792.0, + "grad_norm": 0.0818104780923525, + "language_loss": 0.81442422, + "learning_rate": 0.0005342428293320013, + "loss": 0.825688, + "num_input_tokens_seen": 213923216, + "router_z_loss_mlp": 0.30297852, + "step": 2567, + "time_per_iteration": 2.7417242527008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133289, + "balance_loss_mlp": 1.10160363, + "epoch": 0.49403616775682957, + "flos": 617564030976.0, + "grad_norm": 0.06602747501781048, + "language_loss": 0.83786738, + "learning_rate": 0.0005339320118649238, + "loss": 0.84920025, + "num_input_tokens_seen": 213994096, + "router_z_loss_mlp": 0.31665039, + "step": 2568, + "time_per_iteration": 2.6943705081939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141867, + "balance_loss_mlp": 1.11111128, + "epoch": 0.4942285494420931, + "flos": 577647770112.0, + "grad_norm": 0.08080827100230976, + "language_loss": 0.86562729, + "learning_rate": 0.000533621181224271, + "loss": 0.87704599, + "num_input_tokens_seen": 214069104, + "router_z_loss_mlp": 0.30737305, + "step": 2569, + "time_per_iteration": 2.7706520557403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140818, + "balance_loss_mlp": 1.10748696, + "epoch": 0.4944209311273567, + "flos": 630211995648.0, + "grad_norm": 0.0686138609954652, + "language_loss": 0.81810164, + "learning_rate": 0.0005333103375307182, + "loss": 0.82950985, + "num_input_tokens_seen": 214150368, + "router_z_loss_mlp": 0.33349609, + "step": 2570, + "time_per_iteration": 2.86440372467041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114456, + "balance_loss_mlp": 1.11196864, + "epoch": 0.4946133128126202, + "flos": 587612703744.0, + "grad_norm": 0.06689740684779927, + "language_loss": 0.86211395, + "learning_rate": 0.0005329994809049451, + "loss": 0.87355959, + "num_input_tokens_seen": 214220112, + "router_z_loss_mlp": 0.32592773, + "step": 2571, + "time_per_iteration": 2.693652629852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137339, + "balance_loss_mlp": 1.10243487, + "epoch": 0.4948056944978838, + "flos": 583718648832.0, + "grad_norm": 0.10119095251173513, + "language_loss": 0.87867194, + "learning_rate": 0.0005326886114676375, + "loss": 0.89004534, + "num_input_tokens_seen": 214294480, + "router_z_loss_mlp": 0.34936523, + "step": 2572, + "time_per_iteration": 2.7414114475250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122524, + "balance_loss_mlp": 1.09017086, + "epoch": 0.49499807618314734, + "flos": 481822027776.0, + "grad_norm": 0.06560191593845013, + "language_loss": 0.8820219, + "learning_rate": 0.0005323777293394854, + "loss": 0.89324713, + "num_input_tokens_seen": 214359568, + "router_z_loss_mlp": 0.32348633, + "step": 2573, + "time_per_iteration": 2.5354294776916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120926, + "balance_loss_mlp": 1.08838177, + "epoch": 0.4951904578684109, + "flos": 518978161152.0, + "grad_norm": 0.057507807941180766, + "language_loss": 0.8235743, + "learning_rate": 0.000532066834641184, + "loss": 0.83478361, + "num_input_tokens_seen": 214432032, + "router_z_loss_mlp": 0.32543945, + "step": 2574, + "time_per_iteration": 2.6555819511413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110618, + "balance_loss_mlp": 1.07401729, + "epoch": 0.4953828395536745, + "flos": 535505499648.0, + "grad_norm": 0.06325814646706406, + "language_loss": 0.85261214, + "learning_rate": 0.0005317559274934334, + "loss": 0.86367393, + "num_input_tokens_seen": 214504096, + "router_z_loss_mlp": 0.3215332, + "step": 2575, + "time_per_iteration": 2.7056500911712646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109208, + "balance_loss_mlp": 1.07559085, + "epoch": 0.49557522123893805, + "flos": 528564994560.0, + "grad_norm": 0.06593319291759459, + "language_loss": 0.81090045, + "learning_rate": 0.0005314450080169382, + "loss": 0.82199252, + "num_input_tokens_seen": 214575920, + "router_z_loss_mlp": 0.33642578, + "step": 2576, + "time_per_iteration": 2.6029012203216553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096802, + "balance_loss_mlp": 1.06544995, + "epoch": 0.49576760292420163, + "flos": 428007504384.0, + "grad_norm": 0.07692863745295915, + "language_loss": 0.80917549, + "learning_rate": 0.0005311340763324083, + "loss": 0.82014352, + "num_input_tokens_seen": 214641664, + "router_z_loss_mlp": 0.31323242, + "step": 2577, + "time_per_iteration": 2.5615715980529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092477, + "balance_loss_mlp": 1.06081462, + "epoch": 0.49595998460946517, + "flos": 565236942336.0, + "grad_norm": 0.06627487899009786, + "language_loss": 0.82433712, + "learning_rate": 0.0005308231325605578, + "loss": 0.83526182, + "num_input_tokens_seen": 214711744, + "router_z_loss_mlp": 0.31665039, + "step": 2578, + "time_per_iteration": 2.7060065269470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096329, + "balance_loss_mlp": 1.06473827, + "epoch": 0.49615236629472875, + "flos": 702490973184.0, + "grad_norm": 0.053568999050238396, + "language_loss": 0.77453893, + "learning_rate": 0.0005305121768221061, + "loss": 0.7855022, + "num_input_tokens_seen": 214802256, + "router_z_loss_mlp": 0.31542969, + "step": 2579, + "time_per_iteration": 3.0817010402679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008957, + "balance_loss_mlp": 1.00046897, + "epoch": 0.4963447479799923, + "flos": 1441665630720.0, + "grad_norm": 0.016247003132607515, + "language_loss": 0.75038326, + "learning_rate": 0.000530201209237777, + "loss": 0.76047277, + "num_input_tokens_seen": 215023648, + "router_z_loss_mlp": 0.08496094, + "step": 2580, + "time_per_iteration": 4.813999176025391 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082898, + "balance_loss_mlp": 1.05099821, + "epoch": 0.49653712966525587, + "flos": 537627995136.0, + "grad_norm": 0.06693938938040958, + "language_loss": 0.92087269, + "learning_rate": 0.0005298902299282984, + "loss": 0.93170166, + "num_input_tokens_seen": 215094080, + "router_z_loss_mlp": 0.3190918, + "step": 2581, + "time_per_iteration": 2.622823715209961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096273, + "balance_loss_mlp": 1.0638243, + "epoch": 0.4967295113505194, + "flos": 607280467968.0, + "grad_norm": 0.06032323910602905, + "language_loss": 0.84543586, + "learning_rate": 0.0005295792390144033, + "loss": 0.85639858, + "num_input_tokens_seen": 215165456, + "router_z_loss_mlp": 0.32446289, + "step": 2582, + "time_per_iteration": 2.68511962890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110589, + "balance_loss_mlp": 1.07236862, + "epoch": 0.496921893035783, + "flos": 474577574400.0, + "grad_norm": 0.06277392630469315, + "language_loss": 0.84023589, + "learning_rate": 0.0005292682366168294, + "loss": 0.85129476, + "num_input_tokens_seen": 215229344, + "router_z_loss_mlp": 0.33544922, + "step": 2583, + "time_per_iteration": 2.5309059619903564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095632, + "balance_loss_mlp": 1.06256378, + "epoch": 0.4971142747210466, + "flos": 597463838208.0, + "grad_norm": 0.06727867389441711, + "language_loss": 0.79973817, + "learning_rate": 0.0005289572228563181, + "loss": 0.81069446, + "num_input_tokens_seen": 215305616, + "router_z_loss_mlp": 0.33081055, + "step": 2584, + "time_per_iteration": 4.178269386291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095977, + "balance_loss_mlp": 1.06362402, + "epoch": 0.4973066564063101, + "flos": 599603586048.0, + "grad_norm": 0.05530053735156927, + "language_loss": 0.83410299, + "learning_rate": 0.000528646197853616, + "loss": 0.84506273, + "num_input_tokens_seen": 215378128, + "router_z_loss_mlp": 0.32373047, + "step": 2585, + "time_per_iteration": 2.706878900527954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101894, + "balance_loss_mlp": 1.07032776, + "epoch": 0.4974990380915737, + "flos": 649474495488.0, + "grad_norm": 0.05706454291548468, + "language_loss": 0.86111611, + "learning_rate": 0.0005283351617294735, + "loss": 0.87213504, + "num_input_tokens_seen": 215453536, + "router_z_loss_mlp": 0.31567383, + "step": 2586, + "time_per_iteration": 2.9042582511901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017241, + "balance_loss_mlp": 1.00732255, + "epoch": 0.49769141977683723, + "flos": 1529278548480.0, + "grad_norm": 0.020630801148902787, + "language_loss": 0.7663666, + "learning_rate": 0.0005280241146046456, + "loss": 0.77653909, + "num_input_tokens_seen": 215689440, + "router_z_loss_mlp": 0.09912109, + "step": 2587, + "time_per_iteration": 4.9974682331085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099928, + "balance_loss_mlp": 1.06676388, + "epoch": 0.4978838014621008, + "flos": 536370356736.0, + "grad_norm": 0.07253805127360792, + "language_loss": 0.86542678, + "learning_rate": 0.0005277130565998916, + "loss": 0.87642598, + "num_input_tokens_seen": 215759600, + "router_z_loss_mlp": 0.33178711, + "step": 2588, + "time_per_iteration": 2.7639453411102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092536, + "balance_loss_mlp": 1.06144667, + "epoch": 0.49807618314736435, + "flos": 539616867840.0, + "grad_norm": 0.05247127963424023, + "language_loss": 0.82351577, + "learning_rate": 0.0005274019878359748, + "loss": 0.83444113, + "num_input_tokens_seen": 215833920, + "router_z_loss_mlp": 0.31054688, + "step": 2589, + "time_per_iteration": 2.706843137741089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109391, + "balance_loss_mlp": 1.05943429, + "epoch": 0.49826856483262794, + "flos": 542475740160.0, + "grad_norm": 0.06499700543891603, + "language_loss": 0.87299156, + "learning_rate": 0.0005270909084336628, + "loss": 0.88393074, + "num_input_tokens_seen": 215903616, + "router_z_loss_mlp": 0.34472656, + "step": 2590, + "time_per_iteration": 2.627092123031616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095191, + "balance_loss_mlp": 1.06174052, + "epoch": 0.4984609465178915, + "flos": 522321219072.0, + "grad_norm": 0.06358626343280155, + "language_loss": 0.89192379, + "learning_rate": 0.0005267798185137276, + "loss": 0.90287566, + "num_input_tokens_seen": 215974832, + "router_z_loss_mlp": 0.33447266, + "step": 2591, + "time_per_iteration": 2.6053519248962402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098275, + "balance_loss_mlp": 1.06434834, + "epoch": 0.49865332820315506, + "flos": 574544420352.0, + "grad_norm": 0.06851868017892651, + "language_loss": 0.89230084, + "learning_rate": 0.0005264687181969444, + "loss": 0.9032836, + "num_input_tokens_seen": 216045024, + "router_z_loss_mlp": 0.33959961, + "step": 2592, + "time_per_iteration": 2.7227771282196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097456, + "balance_loss_mlp": 1.06255198, + "epoch": 0.49884570988841864, + "flos": 1013607115776.0, + "grad_norm": 0.06920907227035335, + "language_loss": 0.75419706, + "learning_rate": 0.0005261576076040937, + "loss": 0.76517165, + "num_input_tokens_seen": 216129024, + "router_z_loss_mlp": 0.34936523, + "step": 2593, + "time_per_iteration": 3.2559545040130615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096802, + "balance_loss_mlp": 1.06430554, + "epoch": 0.4990380915736822, + "flos": 559581239808.0, + "grad_norm": 0.06727068797895107, + "language_loss": 0.84462249, + "learning_rate": 0.0005258464868559591, + "loss": 0.85559052, + "num_input_tokens_seen": 216197648, + "router_z_loss_mlp": 0.32519531, + "step": 2594, + "time_per_iteration": 2.6402478218078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096341, + "balance_loss_mlp": 1.06432104, + "epoch": 0.49923047325894576, + "flos": 498954691584.0, + "grad_norm": 0.05920105575352037, + "language_loss": 0.88943779, + "learning_rate": 0.0005255353560733284, + "loss": 0.90040118, + "num_input_tokens_seen": 216263904, + "router_z_loss_mlp": 0.32006836, + "step": 2595, + "time_per_iteration": 2.5696520805358887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038056, + "balance_loss_mlp": 1.02894819, + "epoch": 0.4994228549442093, + "flos": 1496636476416.0, + "grad_norm": 0.021649763717819466, + "language_loss": 0.75578642, + "learning_rate": 0.0005252242153769931, + "loss": 0.76616704, + "num_input_tokens_seen": 216493152, + "router_z_loss_mlp": 0.09130859, + "step": 2596, + "time_per_iteration": 4.785402059555054 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096305, + "balance_loss_mlp": 1.06354642, + "epoch": 0.4996152366294729, + "flos": 557374680576.0, + "grad_norm": 0.055871474183400556, + "language_loss": 0.83429074, + "learning_rate": 0.0005249130648877492, + "loss": 0.84525383, + "num_input_tokens_seen": 216567216, + "router_z_loss_mlp": 0.32763672, + "step": 2597, + "time_per_iteration": 2.768077850341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096166, + "balance_loss_mlp": 1.0628823, + "epoch": 0.4998076183147364, + "flos": 415594105344.0, + "grad_norm": 0.06479225622172138, + "language_loss": 0.85305572, + "learning_rate": 0.0005246019047263953, + "loss": 0.86401737, + "num_input_tokens_seen": 216630624, + "router_z_loss_mlp": 0.33300781, + "step": 2598, + "time_per_iteration": 2.4575202465057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109757, + "balance_loss_mlp": 1.06471562, + "epoch": 0.5, + "flos": 467350373376.0, + "grad_norm": 0.06552285864087816, + "language_loss": 0.82716942, + "learning_rate": 0.0005242907350137353, + "loss": 0.83814514, + "num_input_tokens_seen": 216696576, + "router_z_loss_mlp": 0.32836914, + "step": 2599, + "time_per_iteration": 2.545402765274048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109995, + "balance_loss_mlp": 1.06773996, + "epoch": 0.5001923816852636, + "flos": 482718818304.0, + "grad_norm": 0.060184934170799446, + "language_loss": 0.79316103, + "learning_rate": 0.0005239795558705754, + "loss": 0.80416048, + "num_input_tokens_seen": 216767584, + "router_z_loss_mlp": 0.32202148, + "step": 2600, + "time_per_iteration": 2.6259560585021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094505, + "balance_loss_mlp": 1.06279588, + "epoch": 0.5003847633705272, + "flos": 533798180352.0, + "grad_norm": 0.07180292739942261, + "language_loss": 0.89506614, + "learning_rate": 0.0005236683674177264, + "loss": 0.90601116, + "num_input_tokens_seen": 216834320, + "router_z_loss_mlp": 0.31713867, + "step": 2601, + "time_per_iteration": 2.6216633319854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098766, + "balance_loss_mlp": 1.06531632, + "epoch": 0.5005771450557907, + "flos": 737789285376.0, + "grad_norm": 0.05820446715743302, + "language_loss": 0.82377899, + "learning_rate": 0.0005233571697760021, + "loss": 0.83476663, + "num_input_tokens_seen": 216907312, + "router_z_loss_mlp": 0.3347168, + "step": 2602, + "time_per_iteration": 2.8286540508270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107785, + "balance_loss_mlp": 1.07447851, + "epoch": 0.5007695267410542, + "flos": 778977865728.0, + "grad_norm": 0.06262770013006937, + "language_loss": 0.83391154, + "learning_rate": 0.0005230459630662203, + "loss": 0.84498942, + "num_input_tokens_seen": 216979872, + "router_z_loss_mlp": 0.33325195, + "step": 2603, + "time_per_iteration": 2.9667811393737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107928, + "balance_loss_mlp": 1.07562184, + "epoch": 0.5009619084263178, + "flos": 623476694016.0, + "grad_norm": 0.06520686758548196, + "language_loss": 0.81425881, + "learning_rate": 0.0005227347474092022, + "loss": 0.82533813, + "num_input_tokens_seen": 217054000, + "router_z_loss_mlp": 0.32250977, + "step": 2604, + "time_per_iteration": 2.7840375900268555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109186, + "balance_loss_mlp": 1.07616544, + "epoch": 0.5011542901115814, + "flos": 531087611904.0, + "grad_norm": 0.04693444517106987, + "language_loss": 0.83613992, + "learning_rate": 0.0005224235229257724, + "loss": 0.84723175, + "num_input_tokens_seen": 217126784, + "router_z_loss_mlp": 0.33032227, + "step": 2605, + "time_per_iteration": 2.6730735301971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101481, + "balance_loss_mlp": 1.06970012, + "epoch": 0.5013466717968449, + "flos": 527534581248.0, + "grad_norm": 0.05305580167320912, + "language_loss": 0.87095463, + "learning_rate": 0.0005221122897367589, + "loss": 0.88196945, + "num_input_tokens_seen": 217203056, + "router_z_loss_mlp": 0.31762695, + "step": 2606, + "time_per_iteration": 2.804161310195923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106275, + "balance_loss_mlp": 1.07384968, + "epoch": 0.5015390534821085, + "flos": 566017735680.0, + "grad_norm": 0.07402045106641765, + "language_loss": 0.81512845, + "learning_rate": 0.0005218010479629932, + "loss": 0.82619125, + "num_input_tokens_seen": 217273280, + "router_z_loss_mlp": 0.32421875, + "step": 2607, + "time_per_iteration": 2.6673223972320557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111463, + "balance_loss_mlp": 1.0777508, + "epoch": 0.5017314351673721, + "flos": 566697212928.0, + "grad_norm": 0.06695708261577327, + "language_loss": 0.82331049, + "learning_rate": 0.0005214897977253102, + "loss": 0.83442515, + "num_input_tokens_seen": 217345568, + "router_z_loss_mlp": 0.33740234, + "step": 2608, + "time_per_iteration": 2.641615390777588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109683, + "balance_loss_mlp": 1.06538224, + "epoch": 0.5019238168526357, + "flos": 522291483648.0, + "grad_norm": 0.057424183285493445, + "language_loss": 0.84299719, + "learning_rate": 0.0005211785391445473, + "loss": 0.85396552, + "num_input_tokens_seen": 217422848, + "router_z_loss_mlp": 0.31445312, + "step": 2609, + "time_per_iteration": 2.736565589904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098049, + "balance_loss_mlp": 1.06381226, + "epoch": 0.5021161985378992, + "flos": 641434567680.0, + "grad_norm": 0.15505754048194495, + "language_loss": 0.79028511, + "learning_rate": 0.0005208672723415467, + "loss": 0.8012656, + "num_input_tokens_seen": 217502896, + "router_z_loss_mlp": 0.3425293, + "step": 2610, + "time_per_iteration": 2.7740700244903564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098403, + "balance_loss_mlp": 1.06371355, + "epoch": 0.5023085802231627, + "flos": 591284302848.0, + "grad_norm": 0.06293902841757802, + "language_loss": 0.79232705, + "learning_rate": 0.0005205559974371525, + "loss": 0.80331105, + "num_input_tokens_seen": 217575072, + "router_z_loss_mlp": 0.34716797, + "step": 2611, + "time_per_iteration": 2.7674527168273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096957, + "balance_loss_mlp": 1.06455564, + "epoch": 0.5025009619084263, + "flos": 472373586432.0, + "grad_norm": 0.06270244311506845, + "language_loss": 0.82445353, + "learning_rate": 0.0005202447145522123, + "loss": 0.83542311, + "num_input_tokens_seen": 217644976, + "router_z_loss_mlp": 0.32397461, + "step": 2612, + "time_per_iteration": 2.6602847576141357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100141, + "balance_loss_mlp": 1.06700087, + "epoch": 0.5026933435936899, + "flos": 455139606528.0, + "grad_norm": 0.1708463718003921, + "language_loss": 0.79453385, + "learning_rate": 0.0005199334238075769, + "loss": 0.80553526, + "num_input_tokens_seen": 217712816, + "router_z_loss_mlp": 0.33154297, + "step": 2613, + "time_per_iteration": 2.5568900108337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101548, + "balance_loss_mlp": 1.06802678, + "epoch": 0.5028857252789535, + "flos": 491747314176.0, + "grad_norm": 0.0528689770317124, + "language_loss": 0.92217171, + "learning_rate": 0.0005196221253241, + "loss": 0.93318725, + "num_input_tokens_seen": 217780256, + "router_z_loss_mlp": 0.3347168, + "step": 2614, + "time_per_iteration": 2.6126556396484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099044, + "balance_loss_mlp": 1.06490254, + "epoch": 0.503078106964217, + "flos": 625569454080.0, + "grad_norm": 0.060608661488991786, + "language_loss": 0.83149332, + "learning_rate": 0.0005193108192226383, + "loss": 0.84248376, + "num_input_tokens_seen": 217848496, + "router_z_loss_mlp": 0.34155273, + "step": 2615, + "time_per_iteration": 2.74265456199646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099568, + "balance_loss_mlp": 1.06599879, + "epoch": 0.5032704886494805, + "flos": 579046371840.0, + "grad_norm": 0.05036532075051116, + "language_loss": 0.87427437, + "learning_rate": 0.000518999505624052, + "loss": 0.88527, + "num_input_tokens_seen": 217919216, + "router_z_loss_mlp": 0.33569336, + "step": 2616, + "time_per_iteration": 2.6870973110198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098357, + "balance_loss_mlp": 1.06483543, + "epoch": 0.5034628703347441, + "flos": 471753206784.0, + "grad_norm": 0.047696485592571475, + "language_loss": 0.83320528, + "learning_rate": 0.000518688184649203, + "loss": 0.84418881, + "num_input_tokens_seen": 217996096, + "router_z_loss_mlp": 0.33544922, + "step": 2617, + "time_per_iteration": 2.8016743659973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097701, + "balance_loss_mlp": 1.06434643, + "epoch": 0.5036552520200077, + "flos": 489837362688.0, + "grad_norm": 0.046578345586746416, + "language_loss": 0.83902323, + "learning_rate": 0.0005183768564189577, + "loss": 0.85000026, + "num_input_tokens_seen": 218063072, + "router_z_loss_mlp": 0.33374023, + "step": 2618, + "time_per_iteration": 2.5473384857177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103149, + "balance_loss_mlp": 1.07158208, + "epoch": 0.5038476337052713, + "flos": 494235426816.0, + "grad_norm": 0.06435350107251939, + "language_loss": 0.81610096, + "learning_rate": 0.0005180655210541838, + "loss": 0.82713246, + "num_input_tokens_seen": 218131056, + "router_z_loss_mlp": 0.31542969, + "step": 2619, + "time_per_iteration": 2.6063601970672607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109319, + "balance_loss_mlp": 1.07362747, + "epoch": 0.5040400153905348, + "flos": 600604263936.0, + "grad_norm": 0.07554849641883571, + "language_loss": 0.83428431, + "learning_rate": 0.0005177541786757527, + "loss": 0.8453775, + "num_input_tokens_seen": 218203536, + "router_z_loss_mlp": 0.35693359, + "step": 2620, + "time_per_iteration": 2.7651278972625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109868, + "balance_loss_mlp": 1.07589293, + "epoch": 0.5042323970757984, + "flos": 811525962240.0, + "grad_norm": 0.07801269652965341, + "language_loss": 0.8344717, + "learning_rate": 0.000517442829404538, + "loss": 0.84557039, + "num_input_tokens_seen": 218283008, + "router_z_loss_mlp": 0.33959961, + "step": 2621, + "time_per_iteration": 2.991288661956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110723, + "balance_loss_mlp": 1.07401848, + "epoch": 0.504424778761062, + "flos": 627308706816.0, + "grad_norm": 0.07509105999805234, + "language_loss": 0.87522292, + "learning_rate": 0.0005171314733614166, + "loss": 0.8862952, + "num_input_tokens_seen": 218362096, + "router_z_loss_mlp": 0.33227539, + "step": 2622, + "time_per_iteration": 2.8980941772460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107621, + "balance_loss_mlp": 1.07357442, + "epoch": 0.5046171604463255, + "flos": 515911887360.0, + "grad_norm": 0.05402993794527385, + "language_loss": 0.78464615, + "learning_rate": 0.0005168201106672671, + "loss": 0.79572237, + "num_input_tokens_seen": 218439440, + "router_z_loss_mlp": 0.34057617, + "step": 2623, + "time_per_iteration": 2.7572929859161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106537, + "balance_loss_mlp": 1.07394505, + "epoch": 0.504809542131589, + "flos": 527831188992.0, + "grad_norm": 0.0666138467605724, + "language_loss": 0.85413206, + "learning_rate": 0.0005165087414429717, + "loss": 0.86519742, + "num_input_tokens_seen": 218505936, + "router_z_loss_mlp": 0.32592773, + "step": 2624, + "time_per_iteration": 2.6197690963745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104325, + "balance_loss_mlp": 1.07178128, + "epoch": 0.5050019238168526, + "flos": 554118257664.0, + "grad_norm": 0.0890371890087988, + "language_loss": 0.83553296, + "learning_rate": 0.0005161973658094144, + "loss": 0.84657621, + "num_input_tokens_seen": 218573824, + "router_z_loss_mlp": 0.32543945, + "step": 2625, + "time_per_iteration": 2.688664436340332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114503, + "balance_loss_mlp": 1.08188796, + "epoch": 0.5051943055021162, + "flos": 574774216704.0, + "grad_norm": 0.10293664596100507, + "language_loss": 0.82534152, + "learning_rate": 0.000515885983887482, + "loss": 0.83648658, + "num_input_tokens_seen": 218648016, + "router_z_loss_mlp": 0.32592773, + "step": 2626, + "time_per_iteration": 2.7382290363311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117287, + "balance_loss_mlp": 1.08467126, + "epoch": 0.5053866871873798, + "flos": 496686463488.0, + "grad_norm": 0.06112991005583596, + "language_loss": 0.84654796, + "learning_rate": 0.0005155745957980636, + "loss": 0.85772085, + "num_input_tokens_seen": 218714128, + "router_z_loss_mlp": 0.32617188, + "step": 2627, + "time_per_iteration": 2.5833873748779297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117288, + "balance_loss_mlp": 1.0852921, + "epoch": 0.5055790688726434, + "flos": 502213685760.0, + "grad_norm": 0.05493055898841465, + "language_loss": 0.88454115, + "learning_rate": 0.000515263201662051, + "loss": 0.89571404, + "num_input_tokens_seen": 218784800, + "router_z_loss_mlp": 0.31982422, + "step": 2628, + "time_per_iteration": 2.6362485885620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112533, + "balance_loss_mlp": 1.09264278, + "epoch": 0.5057714505579068, + "flos": 845227809792.0, + "grad_norm": 0.05313724215790835, + "language_loss": 0.8271699, + "learning_rate": 0.0005149518016003378, + "loss": 0.83842319, + "num_input_tokens_seen": 218868256, + "router_z_loss_mlp": 0.3269043, + "step": 2629, + "time_per_iteration": 3.1579666137695312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121904, + "balance_loss_mlp": 1.09109998, + "epoch": 0.5059638322431704, + "flos": 497825533440.0, + "grad_norm": 0.05858406869857789, + "language_loss": 0.82627785, + "learning_rate": 0.0005146403957338206, + "loss": 0.83749688, + "num_input_tokens_seen": 218932496, + "router_z_loss_mlp": 0.30786133, + "step": 2630, + "time_per_iteration": 2.5554275512695312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128543, + "balance_loss_mlp": 1.09664297, + "epoch": 0.506156213928434, + "flos": 617843013120.0, + "grad_norm": 0.05139775445636508, + "language_loss": 0.82087231, + "learning_rate": 0.0005143289841833975, + "loss": 0.83215779, + "num_input_tokens_seen": 219010672, + "router_z_loss_mlp": 0.31884766, + "step": 2631, + "time_per_iteration": 2.866208076477051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136223, + "balance_loss_mlp": 1.10332084, + "epoch": 0.5063485956136976, + "flos": 424857166848.0, + "grad_norm": 0.07049680225310351, + "language_loss": 0.82485932, + "learning_rate": 0.0005140175670699696, + "loss": 0.83622158, + "num_input_tokens_seen": 219077104, + "router_z_loss_mlp": 0.32885742, + "step": 2632, + "time_per_iteration": 2.589662551879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136954, + "balance_loss_mlp": 1.10464883, + "epoch": 0.5065409772989612, + "flos": 569926471680.0, + "grad_norm": 0.04937719013853961, + "language_loss": 0.83023763, + "learning_rate": 0.0005137061445144395, + "loss": 0.84160721, + "num_input_tokens_seen": 219164880, + "router_z_loss_mlp": 0.32299805, + "step": 2633, + "time_per_iteration": 2.907914161682129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145225, + "balance_loss_mlp": 1.11308646, + "epoch": 0.5067333589842247, + "flos": 628801284096.0, + "grad_norm": 0.06298038708728138, + "language_loss": 0.87351924, + "learning_rate": 0.000513394716637712, + "loss": 0.8849715, + "num_input_tokens_seen": 219237376, + "router_z_loss_mlp": 0.32128906, + "step": 2634, + "time_per_iteration": 2.7392778396606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064259, + "balance_loss_mlp": 1.05677319, + "epoch": 0.5069257406694883, + "flos": 1447867187712.0, + "grad_norm": 0.03015814476855984, + "language_loss": 0.79191709, + "learning_rate": 0.0005130832835606946, + "loss": 0.80255967, + "num_input_tokens_seen": 219467632, + "router_z_loss_mlp": 0.07470703, + "step": 2635, + "time_per_iteration": 4.8476762771606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138418, + "balance_loss_mlp": 1.10549188, + "epoch": 0.5071181223547518, + "flos": 638835227136.0, + "grad_norm": 0.0660835824728649, + "language_loss": 0.80952996, + "learning_rate": 0.0005127718454042958, + "loss": 0.82091409, + "num_input_tokens_seen": 219545392, + "router_z_loss_mlp": 0.3293457, + "step": 2636, + "time_per_iteration": 2.801945447921753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122948, + "balance_loss_mlp": 1.09083319, + "epoch": 0.5073105040400154, + "flos": 713565241344.0, + "grad_norm": 0.06804864770708682, + "language_loss": 0.8454951, + "learning_rate": 0.0005124604022894269, + "loss": 0.85672456, + "num_input_tokens_seen": 219623104, + "router_z_loss_mlp": 0.32104492, + "step": 2637, + "time_per_iteration": 2.9412965774536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039176, + "balance_loss_mlp": 1.0316422, + "epoch": 0.5075028857252789, + "flos": 1436447126016.0, + "grad_norm": 0.020904454547095577, + "language_loss": 0.77188224, + "learning_rate": 0.000512148954337001, + "loss": 0.78227401, + "num_input_tokens_seen": 219853328, + "router_z_loss_mlp": 0.07519531, + "step": 2638, + "time_per_iteration": 4.857941389083862 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127594, + "balance_loss_mlp": 1.09507418, + "epoch": 0.5076952674105425, + "flos": 571147034112.0, + "grad_norm": 0.058859738864391845, + "language_loss": 0.83504963, + "learning_rate": 0.0005118375016679325, + "loss": 0.84632552, + "num_input_tokens_seen": 219925024, + "router_z_loss_mlp": 0.32495117, + "step": 2639, + "time_per_iteration": 2.7467126846313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115219, + "balance_loss_mlp": 1.08169687, + "epoch": 0.5078876490958061, + "flos": 516712504320.0, + "grad_norm": 0.06748446003243579, + "language_loss": 0.80393875, + "learning_rate": 0.0005115260444031382, + "loss": 0.81509095, + "num_input_tokens_seen": 219992752, + "router_z_loss_mlp": 0.33544922, + "step": 2640, + "time_per_iteration": 2.5831897258758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021123, + "balance_loss_mlp": 1.01354098, + "epoch": 0.5080800307810697, + "flos": 1584224428032.0, + "grad_norm": 0.011909310640322752, + "language_loss": 0.78731823, + "learning_rate": 0.000511214582663537, + "loss": 0.79752946, + "num_input_tokens_seen": 220224160, + "router_z_loss_mlp": 0.07568359, + "step": 2641, + "time_per_iteration": 4.96182656288147 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118257, + "balance_loss_mlp": 1.08506942, + "epoch": 0.5082724124663333, + "flos": 485209502208.0, + "grad_norm": 0.06566448453374539, + "language_loss": 0.87279713, + "learning_rate": 0.0005109031165700483, + "loss": 0.88397968, + "num_input_tokens_seen": 220289504, + "router_z_loss_mlp": 0.33178711, + "step": 2642, + "time_per_iteration": 2.5608396530151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114089, + "balance_loss_mlp": 1.08228409, + "epoch": 0.5084647941515967, + "flos": 682230366720.0, + "grad_norm": 0.07470030174236865, + "language_loss": 0.83423924, + "learning_rate": 0.0005105916462435945, + "loss": 0.84538019, + "num_input_tokens_seen": 220361376, + "router_z_loss_mlp": 0.31787109, + "step": 2643, + "time_per_iteration": 2.840092420578003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114248, + "balance_loss_mlp": 1.08272934, + "epoch": 0.5086571758368603, + "flos": 548736768000.0, + "grad_norm": 0.0540496938056118, + "language_loss": 0.8565858, + "learning_rate": 0.0005102801718050989, + "loss": 0.86772823, + "num_input_tokens_seen": 220434720, + "router_z_loss_mlp": 0.31494141, + "step": 2644, + "time_per_iteration": 2.687993288040161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111024, + "balance_loss_mlp": 1.08024383, + "epoch": 0.5088495575221239, + "flos": 564016379904.0, + "grad_norm": 0.0657522571772089, + "language_loss": 0.89181781, + "learning_rate": 0.0005099686933754867, + "loss": 0.90292799, + "num_input_tokens_seen": 220506208, + "router_z_loss_mlp": 0.30737305, + "step": 2645, + "time_per_iteration": 2.676555633544922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110963, + "balance_loss_mlp": 1.07589364, + "epoch": 0.5090419392073875, + "flos": 551407689216.0, + "grad_norm": 0.06525501329559952, + "language_loss": 0.84646904, + "learning_rate": 0.0005096572110756845, + "loss": 0.85756534, + "num_input_tokens_seen": 220577456, + "router_z_loss_mlp": 0.33740234, + "step": 2646, + "time_per_iteration": 2.722046136856079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098497, + "balance_loss_mlp": 1.06502318, + "epoch": 0.509234320892651, + "flos": 567779383296.0, + "grad_norm": 0.055343813231999515, + "language_loss": 0.85733652, + "learning_rate": 0.0005093457250266205, + "loss": 0.86832154, + "num_input_tokens_seen": 220649648, + "router_z_loss_mlp": 0.33496094, + "step": 2647, + "time_per_iteration": 2.726637363433838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105884, + "balance_loss_mlp": 1.07260132, + "epoch": 0.5094267025779146, + "flos": 582609314304.0, + "grad_norm": 0.07566246752622155, + "language_loss": 0.83174831, + "learning_rate": 0.000509034235349224, + "loss": 0.84280717, + "num_input_tokens_seen": 220721168, + "router_z_loss_mlp": 0.33276367, + "step": 2648, + "time_per_iteration": 2.7163400650024414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098588, + "balance_loss_mlp": 1.06480372, + "epoch": 0.5096190842631781, + "flos": 591990944256.0, + "grad_norm": 0.05726246002698667, + "language_loss": 0.81403017, + "learning_rate": 0.0005087227421644266, + "loss": 0.82501602, + "num_input_tokens_seen": 220796464, + "router_z_loss_mlp": 0.33813477, + "step": 2649, + "time_per_iteration": 2.753593683242798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090769, + "balance_loss_mlp": 1.05836821, + "epoch": 0.5098114659484417, + "flos": 513562166784.0, + "grad_norm": 0.062073163804743356, + "language_loss": 0.86567879, + "learning_rate": 0.0005084112455931602, + "loss": 0.87658644, + "num_input_tokens_seen": 220862976, + "router_z_loss_mlp": 0.32397461, + "step": 2650, + "time_per_iteration": 2.6115548610687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109227, + "balance_loss_mlp": 1.05986929, + "epoch": 0.5100038476337053, + "flos": 484631341056.0, + "grad_norm": 0.07224314043681272, + "language_loss": 0.85185993, + "learning_rate": 0.0005080997457563586, + "loss": 0.8627826, + "num_input_tokens_seen": 220926432, + "router_z_loss_mlp": 0.32397461, + "step": 2651, + "time_per_iteration": 2.562626361846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091424, + "balance_loss_mlp": 1.05797434, + "epoch": 0.5101962293189688, + "flos": 461603266560.0, + "grad_norm": 0.12059659360832554, + "language_loss": 0.79420835, + "learning_rate": 0.0005077882427749569, + "loss": 0.80512255, + "num_input_tokens_seen": 220993008, + "router_z_loss_mlp": 0.3347168, + "step": 2652, + "time_per_iteration": 2.532801866531372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094958, + "balance_loss_mlp": 1.06072092, + "epoch": 0.5103886110042324, + "flos": 587034542592.0, + "grad_norm": 0.09167141678281196, + "language_loss": 0.85065627, + "learning_rate": 0.0005074767367698913, + "loss": 0.86160588, + "num_input_tokens_seen": 221059248, + "router_z_loss_mlp": 0.34277344, + "step": 2653, + "time_per_iteration": 2.718952178955078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094006, + "balance_loss_mlp": 1.06184387, + "epoch": 0.510580992689496, + "flos": 845260116480.0, + "grad_norm": 0.05265423612140712, + "language_loss": 0.83726275, + "learning_rate": 0.0005071652278620988, + "loss": 0.84820282, + "num_input_tokens_seen": 221133712, + "router_z_loss_mlp": 0.3215332, + "step": 2654, + "time_per_iteration": 3.0578973293304443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093541, + "balance_loss_mlp": 1.06082976, + "epoch": 0.5107733743747596, + "flos": 658624131072.0, + "grad_norm": 0.057781922950613636, + "language_loss": 0.8368457, + "learning_rate": 0.0005068537161725186, + "loss": 0.84778106, + "num_input_tokens_seen": 221202192, + "router_z_loss_mlp": 0.32714844, + "step": 2655, + "time_per_iteration": 2.763050079345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109333, + "balance_loss_mlp": 1.06035662, + "epoch": 0.510965756060023, + "flos": 701732574720.0, + "grad_norm": 0.06748478853261292, + "language_loss": 0.84411526, + "learning_rate": 0.0005065422018220893, + "loss": 0.85504854, + "num_input_tokens_seen": 221277104, + "router_z_loss_mlp": 0.32983398, + "step": 2656, + "time_per_iteration": 2.8346335887908936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099653, + "balance_loss_mlp": 1.06744266, + "epoch": 0.5111581377452866, + "flos": 559731741696.0, + "grad_norm": 0.05948045399752535, + "language_loss": 0.80220234, + "learning_rate": 0.0005062306849317521, + "loss": 0.8131988, + "num_input_tokens_seen": 221352320, + "router_z_loss_mlp": 0.32226562, + "step": 2657, + "time_per_iteration": 2.8443868160247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011092, + "balance_loss_mlp": 1.07832527, + "epoch": 0.5113505194305502, + "flos": 609024863232.0, + "grad_norm": 0.06625791562361402, + "language_loss": 0.83381897, + "learning_rate": 0.0005059191656224487, + "loss": 0.84491098, + "num_input_tokens_seen": 221421056, + "router_z_loss_mlp": 0.30859375, + "step": 2658, + "time_per_iteration": 2.7093002796173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110636, + "balance_loss_mlp": 1.07883072, + "epoch": 0.5115429011158138, + "flos": 534477657600.0, + "grad_norm": 0.06672155578926672, + "language_loss": 0.88962573, + "learning_rate": 0.0005056076440151212, + "loss": 0.90073204, + "num_input_tokens_seen": 221492064, + "router_z_loss_mlp": 0.31787109, + "step": 2659, + "time_per_iteration": 2.6441903114318848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072549, + "balance_loss_mlp": 1.06272602, + "epoch": 0.5117352828010774, + "flos": 1362213780480.0, + "grad_norm": 0.032966871601824974, + "language_loss": 0.76288116, + "learning_rate": 0.0005052961202307133, + "loss": 0.77360666, + "num_input_tokens_seen": 221724672, + "router_z_loss_mlp": 0.09814453, + "step": 2660, + "time_per_iteration": 4.922346353530884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124135, + "balance_loss_mlp": 1.09111381, + "epoch": 0.5119276644863409, + "flos": 633740433408.0, + "grad_norm": 0.06875691586697516, + "language_loss": 0.87086922, + "learning_rate": 0.0005049845943901691, + "loss": 0.8821106, + "num_input_tokens_seen": 221800144, + "router_z_loss_mlp": 0.33032227, + "step": 2661, + "time_per_iteration": 2.8344130516052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107104, + "balance_loss_mlp": 1.07703924, + "epoch": 0.5121200461716044, + "flos": 585598864896.0, + "grad_norm": 0.06167047048505293, + "language_loss": 0.86829108, + "learning_rate": 0.0005046730666144338, + "loss": 0.87936211, + "num_input_tokens_seen": 221877168, + "router_z_loss_mlp": 0.30078125, + "step": 2662, + "time_per_iteration": 2.7832746505737305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110467, + "balance_loss_mlp": 1.07780349, + "epoch": 0.512312427856868, + "flos": 1032508767744.0, + "grad_norm": 0.05618387348962469, + "language_loss": 0.8811537, + "learning_rate": 0.0005043615370244532, + "loss": 0.89225835, + "num_input_tokens_seen": 221964208, + "router_z_loss_mlp": 0.32666016, + "step": 2663, + "time_per_iteration": 3.3585264682769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035614, + "balance_loss_mlp": 1.02664995, + "epoch": 0.5125048095421316, + "flos": 1537983645696.0, + "grad_norm": 0.02051261915929333, + "language_loss": 0.78244388, + "learning_rate": 0.0005040500057411736, + "loss": 0.79279995, + "num_input_tokens_seen": 222179264, + "router_z_loss_mlp": 0.08984375, + "step": 2664, + "time_per_iteration": 4.639116048812866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106628, + "balance_loss_mlp": 1.07670689, + "epoch": 0.5126971912273951, + "flos": 591116175360.0, + "grad_norm": 0.057959232824292994, + "language_loss": 0.85514903, + "learning_rate": 0.0005037384728855425, + "loss": 0.86621535, + "num_input_tokens_seen": 222259504, + "router_z_loss_mlp": 0.29882812, + "step": 2665, + "time_per_iteration": 2.7972493171691895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106635, + "balance_loss_mlp": 1.07456732, + "epoch": 0.5128895729126587, + "flos": 551657309184.0, + "grad_norm": 0.08985416920229425, + "language_loss": 0.84974313, + "learning_rate": 0.0005034269385785075, + "loss": 0.86080956, + "num_input_tokens_seen": 222330512, + "router_z_loss_mlp": 0.3203125, + "step": 2666, + "time_per_iteration": 2.6164255142211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113422, + "balance_loss_mlp": 1.08135498, + "epoch": 0.5130819545979223, + "flos": 481271030784.0, + "grad_norm": 0.09072509808708462, + "language_loss": 0.85031348, + "learning_rate": 0.0005031154029410168, + "loss": 0.86144769, + "num_input_tokens_seen": 222394000, + "router_z_loss_mlp": 0.32055664, + "step": 2667, + "time_per_iteration": 2.5188395977020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112022, + "balance_loss_mlp": 1.07873833, + "epoch": 0.5132743362831859, + "flos": 475798136832.0, + "grad_norm": 0.08345403251216076, + "language_loss": 0.86623496, + "learning_rate": 0.0005028038660940197, + "loss": 0.87735522, + "num_input_tokens_seen": 222459344, + "router_z_loss_mlp": 0.33300781, + "step": 2668, + "time_per_iteration": 2.5099217891693115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104597, + "balance_loss_mlp": 1.07360303, + "epoch": 0.5134667179684494, + "flos": 503827029504.0, + "grad_norm": 0.051835294009996306, + "language_loss": 0.8459934, + "learning_rate": 0.0005024923281584648, + "loss": 0.85703939, + "num_input_tokens_seen": 222528912, + "router_z_loss_mlp": 0.30981445, + "step": 2669, + "time_per_iteration": 2.6409177780151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113407, + "balance_loss_mlp": 1.08103013, + "epoch": 0.5136590996537129, + "flos": 503918433792.0, + "grad_norm": 0.05618222104131465, + "language_loss": 0.82660598, + "learning_rate": 0.0005021807892553026, + "loss": 0.83774006, + "num_input_tokens_seen": 222604704, + "router_z_loss_mlp": 0.32397461, + "step": 2670, + "time_per_iteration": 2.7168080806732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105439, + "balance_loss_mlp": 1.07458735, + "epoch": 0.5138514813389765, + "flos": 624623104512.0, + "grad_norm": 0.052268384876698444, + "language_loss": 0.84909296, + "learning_rate": 0.0005018692495054828, + "loss": 0.86014736, + "num_input_tokens_seen": 222677888, + "router_z_loss_mlp": 0.30834961, + "step": 2671, + "time_per_iteration": 2.769845485687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100504, + "balance_loss_mlp": 1.07063007, + "epoch": 0.5140438630242401, + "flos": 583545752064.0, + "grad_norm": 0.059994941655344296, + "language_loss": 0.80935681, + "learning_rate": 0.0005015577090299561, + "loss": 0.82036185, + "num_input_tokens_seen": 222751936, + "router_z_loss_mlp": 0.29833984, + "step": 2672, + "time_per_iteration": 2.681316375732422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110216, + "balance_loss_mlp": 1.07245326, + "epoch": 0.5142362447095037, + "flos": 487927411200.0, + "grad_norm": 0.05683100055240327, + "language_loss": 0.86631596, + "learning_rate": 0.0005012461679496729, + "loss": 0.87733757, + "num_input_tokens_seen": 222819616, + "router_z_loss_mlp": 0.29711914, + "step": 2673, + "time_per_iteration": 2.5961544513702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100162, + "balance_loss_mlp": 1.06883335, + "epoch": 0.5144286263947672, + "flos": 526857675264.0, + "grad_norm": 0.05638845856922635, + "language_loss": 0.88303345, + "learning_rate": 0.0005009346263855848, + "loss": 0.8940351, + "num_input_tokens_seen": 222888448, + "router_z_loss_mlp": 0.31323242, + "step": 2674, + "time_per_iteration": 2.607531785964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100725, + "balance_loss_mlp": 1.06903887, + "epoch": 0.5146210080800308, + "flos": 486518897664.0, + "grad_norm": 0.05523698149533188, + "language_loss": 0.84251857, + "learning_rate": 0.0005006230844586422, + "loss": 0.85352582, + "num_input_tokens_seen": 222964736, + "router_z_loss_mlp": 0.31665039, + "step": 2675, + "time_per_iteration": 2.766676664352417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106245, + "balance_loss_mlp": 1.07384396, + "epoch": 0.5148133897652943, + "flos": 515892063744.0, + "grad_norm": 0.054179282011379754, + "language_loss": 0.79421759, + "learning_rate": 0.0005003115422897968, + "loss": 0.80528009, + "num_input_tokens_seen": 223040944, + "router_z_loss_mlp": 0.32397461, + "step": 2676, + "time_per_iteration": 2.7511518001556396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101702, + "balance_loss_mlp": 1.0696342, + "epoch": 0.5150057714505579, + "flos": 511212446208.0, + "grad_norm": 0.06371145669365144, + "language_loss": 0.86998433, + "learning_rate": 0.0005, + "loss": 0.88100135, + "num_input_tokens_seen": 223109632, + "router_z_loss_mlp": 0.32055664, + "step": 2677, + "time_per_iteration": 2.6361911296844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096983, + "balance_loss_mlp": 1.06508231, + "epoch": 0.5151981531358215, + "flos": 910909877760.0, + "grad_norm": 0.06720272484805691, + "language_loss": 0.79773581, + "learning_rate": 0.0004996884577102033, + "loss": 0.80870569, + "num_input_tokens_seen": 223191648, + "router_z_loss_mlp": 0.3190918, + "step": 2678, + "time_per_iteration": 3.078381299972534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101165, + "balance_loss_mlp": 1.06726193, + "epoch": 0.515390534821085, + "flos": 471864434688.0, + "grad_norm": 0.05338815308435362, + "language_loss": 0.84963048, + "learning_rate": 0.000499376915541358, + "loss": 0.86064208, + "num_input_tokens_seen": 223265920, + "router_z_loss_mlp": 0.33911133, + "step": 2679, + "time_per_iteration": 2.6979198455810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096582, + "balance_loss_mlp": 1.06506324, + "epoch": 0.5155829165063486, + "flos": 650119468032.0, + "grad_norm": 0.0530977146452018, + "language_loss": 0.8140825, + "learning_rate": 0.0004990653736144155, + "loss": 0.82504833, + "num_input_tokens_seen": 223340688, + "router_z_loss_mlp": 0.31494141, + "step": 2680, + "time_per_iteration": 2.8514578342437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098157, + "balance_loss_mlp": 1.06547022, + "epoch": 0.5157752981916122, + "flos": 414262315008.0, + "grad_norm": 0.091547983778046, + "language_loss": 0.86229038, + "learning_rate": 0.0004987538320503271, + "loss": 0.87327194, + "num_input_tokens_seen": 223404064, + "router_z_loss_mlp": 0.3269043, + "step": 2681, + "time_per_iteration": 2.478638172149658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100486, + "balance_loss_mlp": 1.06798983, + "epoch": 0.5159676798768758, + "flos": 553841473536.0, + "grad_norm": 0.07018643811750969, + "language_loss": 0.83312553, + "learning_rate": 0.0004984422909700442, + "loss": 0.8441304, + "num_input_tokens_seen": 223476784, + "router_z_loss_mlp": 0.32495117, + "step": 2682, + "time_per_iteration": 2.6546084880828857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099092, + "balance_loss_mlp": 1.06783557, + "epoch": 0.5161600615621393, + "flos": 586510709760.0, + "grad_norm": 0.15069020701750013, + "language_loss": 0.84435642, + "learning_rate": 0.0004981307504945173, + "loss": 0.85534728, + "num_input_tokens_seen": 223542832, + "router_z_loss_mlp": 0.31225586, + "step": 2683, + "time_per_iteration": 2.71260929107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110147, + "balance_loss_mlp": 1.06914032, + "epoch": 0.5163524432474028, + "flos": 588843177984.0, + "grad_norm": 0.0559262102608404, + "language_loss": 0.89665949, + "learning_rate": 0.0004978192107446976, + "loss": 0.90767419, + "num_input_tokens_seen": 223617968, + "router_z_loss_mlp": 0.32324219, + "step": 2684, + "time_per_iteration": 2.767662763595581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097385, + "balance_loss_mlp": 1.06650972, + "epoch": 0.5165448249326664, + "flos": 503893840896.0, + "grad_norm": 0.06901755479732997, + "language_loss": 0.87345654, + "learning_rate": 0.0004975076718415353, + "loss": 0.88443041, + "num_input_tokens_seen": 223689504, + "router_z_loss_mlp": 0.30834961, + "step": 2685, + "time_per_iteration": 2.6287574768066406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110088, + "balance_loss_mlp": 1.06988525, + "epoch": 0.51673720661793, + "flos": 416760339456.0, + "grad_norm": 0.05502113672837593, + "language_loss": 0.91023147, + "learning_rate": 0.0004971961339059806, + "loss": 0.92124021, + "num_input_tokens_seen": 223752288, + "router_z_loss_mlp": 0.30957031, + "step": 2686, + "time_per_iteration": 2.4755725860595703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105178, + "balance_loss_mlp": 1.07256198, + "epoch": 0.5169295883031936, + "flos": 598971096576.0, + "grad_norm": 0.06476684801011888, + "language_loss": 0.84195554, + "learning_rate": 0.0004968845970589832, + "loss": 0.85300732, + "num_input_tokens_seen": 223822304, + "router_z_loss_mlp": 0.32617188, + "step": 2687, + "time_per_iteration": 2.6715877056121826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102414, + "balance_loss_mlp": 1.06896389, + "epoch": 0.517121969988457, + "flos": 556816343040.0, + "grad_norm": 0.0648303600022088, + "language_loss": 0.84613401, + "learning_rate": 0.0004965730614214926, + "loss": 0.85715812, + "num_input_tokens_seen": 223888592, + "router_z_loss_mlp": 0.3347168, + "step": 2688, + "time_per_iteration": 2.6734025478363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099959, + "balance_loss_mlp": 1.06720066, + "epoch": 0.5173143516737206, + "flos": 469445704704.0, + "grad_norm": 0.05675548235902804, + "language_loss": 0.85410345, + "learning_rate": 0.0004962615271144576, + "loss": 0.86510307, + "num_input_tokens_seen": 223952880, + "router_z_loss_mlp": 0.32739258, + "step": 2689, + "time_per_iteration": 2.4930050373077393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101842, + "balance_loss_mlp": 1.0703702, + "epoch": 0.5175067333589842, + "flos": 720065977344.0, + "grad_norm": 0.06418610502647971, + "language_loss": 0.82956815, + "learning_rate": 0.0004959499942588264, + "loss": 0.8405866, + "num_input_tokens_seen": 224030000, + "router_z_loss_mlp": 0.31469727, + "step": 2690, + "time_per_iteration": 2.904674768447876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078772, + "balance_loss_mlp": 1.070189, + "epoch": 0.5176991150442478, + "flos": 1466188480512.0, + "grad_norm": 0.04799778536167862, + "language_loss": 0.78200024, + "learning_rate": 0.0004956384629755469, + "loss": 0.79278797, + "num_input_tokens_seen": 224252384, + "router_z_loss_mlp": 0.0859375, + "step": 2691, + "time_per_iteration": 4.761531591415405 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105601, + "balance_loss_mlp": 1.07255602, + "epoch": 0.5178914967295114, + "flos": 612632222208.0, + "grad_norm": 0.051278898576550616, + "language_loss": 0.85877872, + "learning_rate": 0.0004953269333855661, + "loss": 0.86983472, + "num_input_tokens_seen": 224324640, + "router_z_loss_mlp": 0.33032227, + "step": 2692, + "time_per_iteration": 2.729318857192993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104257, + "balance_loss_mlp": 1.07328665, + "epoch": 0.5180838784147749, + "flos": 500926311936.0, + "grad_norm": 0.05911618517599564, + "language_loss": 0.84474307, + "learning_rate": 0.0004950154056098309, + "loss": 0.85578561, + "num_input_tokens_seen": 224398368, + "router_z_loss_mlp": 0.30932617, + "step": 2693, + "time_per_iteration": 2.6833436489105225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124437, + "balance_loss_mlp": 1.09158325, + "epoch": 0.5182762601000385, + "flos": 688832418816.0, + "grad_norm": 0.059128614865360495, + "language_loss": 0.83972096, + "learning_rate": 0.0004947038797692867, + "loss": 0.85096538, + "num_input_tokens_seen": 224465456, + "router_z_loss_mlp": 0.32861328, + "step": 2694, + "time_per_iteration": 2.82362961769104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119523, + "balance_loss_mlp": 1.08635902, + "epoch": 0.518468641785302, + "flos": 665611623936.0, + "grad_norm": 0.05692767589933962, + "language_loss": 0.77609885, + "learning_rate": 0.0004943923559848789, + "loss": 0.78729415, + "num_input_tokens_seen": 224540960, + "router_z_loss_mlp": 0.33178711, + "step": 2695, + "time_per_iteration": 2.7919468879699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123482, + "balance_loss_mlp": 1.09112859, + "epoch": 0.5186610234705656, + "flos": 566714465280.0, + "grad_norm": 0.06299979408052762, + "language_loss": 0.90267843, + "learning_rate": 0.0004940808343775515, + "loss": 0.91391325, + "num_input_tokens_seen": 224613200, + "router_z_loss_mlp": 0.32348633, + "step": 2696, + "time_per_iteration": 2.6863224506378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112015, + "balance_loss_mlp": 1.08748627, + "epoch": 0.5188534051558291, + "flos": 428879702016.0, + "grad_norm": 0.06289973384355804, + "language_loss": 0.82184958, + "learning_rate": 0.0004937693150682479, + "loss": 0.83305109, + "num_input_tokens_seen": 224677456, + "router_z_loss_mlp": 0.32666016, + "step": 2697, + "time_per_iteration": 2.5169589519500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124428, + "balance_loss_mlp": 1.09109747, + "epoch": 0.5190457868410927, + "flos": 546349971456.0, + "grad_norm": 0.0748090565006246, + "language_loss": 0.76575571, + "learning_rate": 0.0004934577981779107, + "loss": 0.77699995, + "num_input_tokens_seen": 224745600, + "router_z_loss_mlp": 0.33325195, + "step": 2698, + "time_per_iteration": 2.65891432762146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111403, + "balance_loss_mlp": 1.08103275, + "epoch": 0.5192381685263563, + "flos": 548605716480.0, + "grad_norm": 0.34709701447359415, + "language_loss": 0.81575179, + "learning_rate": 0.0004931462838274817, + "loss": 0.82689214, + "num_input_tokens_seen": 224826944, + "router_z_loss_mlp": 0.33007812, + "step": 2699, + "time_per_iteration": 2.829094648361206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113032, + "balance_loss_mlp": 1.09694147, + "epoch": 0.5194305502116199, + "flos": 575263544832.0, + "grad_norm": 0.06002024337813523, + "language_loss": 0.84538823, + "learning_rate": 0.0004928347721379011, + "loss": 0.85669148, + "num_input_tokens_seen": 224895280, + "router_z_loss_mlp": 0.33398438, + "step": 2700, + "time_per_iteration": 2.685887098312378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128853, + "balance_loss_mlp": 1.09499812, + "epoch": 0.5196229318968835, + "flos": 434258620416.0, + "grad_norm": 0.07280089907997458, + "language_loss": 0.82063133, + "learning_rate": 0.0004925232632301089, + "loss": 0.83191985, + "num_input_tokens_seen": 224961632, + "router_z_loss_mlp": 0.33886719, + "step": 2701, + "time_per_iteration": 2.5586745738983154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139592, + "balance_loss_mlp": 1.10711944, + "epoch": 0.5198153135821469, + "flos": 558881938944.0, + "grad_norm": 0.05869071142497867, + "language_loss": 0.7981168, + "learning_rate": 0.0004922117572250431, + "loss": 0.80951279, + "num_input_tokens_seen": 225032816, + "router_z_loss_mlp": 0.32495117, + "step": 2702, + "time_per_iteration": 2.652883768081665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154601, + "balance_loss_mlp": 1.12041199, + "epoch": 0.5200076952674105, + "flos": 565684051968.0, + "grad_norm": 0.08372395695209851, + "language_loss": 0.80792272, + "learning_rate": 0.0004919002542436414, + "loss": 0.8194688, + "num_input_tokens_seen": 225112736, + "router_z_loss_mlp": 0.34155273, + "step": 2703, + "time_per_iteration": 2.8069591522216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156311, + "balance_loss_mlp": 1.12131107, + "epoch": 0.5202000769526741, + "flos": 571186681344.0, + "grad_norm": 0.06918407740604555, + "language_loss": 0.81692028, + "learning_rate": 0.0004915887544068399, + "loss": 0.82848334, + "num_input_tokens_seen": 225182672, + "router_z_loss_mlp": 0.35009766, + "step": 2704, + "time_per_iteration": 2.6484997272491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159384, + "balance_loss_mlp": 1.12228656, + "epoch": 0.5203924586379377, + "flos": 694211337216.0, + "grad_norm": 0.0754612517988151, + "language_loss": 0.78528553, + "learning_rate": 0.0004912772578355736, + "loss": 0.79687935, + "num_input_tokens_seen": 225260272, + "router_z_loss_mlp": 0.37084961, + "step": 2705, + "time_per_iteration": 2.889177083969116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115407, + "balance_loss_mlp": 1.11825967, + "epoch": 0.5205848403232012, + "flos": 566509261824.0, + "grad_norm": 0.06509959827239385, + "language_loss": 0.83146906, + "learning_rate": 0.000490965764650776, + "loss": 0.84300983, + "num_input_tokens_seen": 225337120, + "router_z_loss_mlp": 0.3581543, + "step": 2706, + "time_per_iteration": 2.885923385620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115916, + "balance_loss_mlp": 1.12346911, + "epoch": 0.5207772220084648, + "flos": 1214259932160.0, + "grad_norm": 0.06296986612889613, + "language_loss": 0.82775491, + "learning_rate": 0.0004906542749733798, + "loss": 0.83934653, + "num_input_tokens_seen": 225433984, + "router_z_loss_mlp": 0.35693359, + "step": 2707, + "time_per_iteration": 3.6151185035705566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152032, + "balance_loss_mlp": 1.11653161, + "epoch": 0.5209696036937284, + "flos": 592843318272.0, + "grad_norm": 0.046885737032271585, + "language_loss": 0.85312223, + "learning_rate": 0.0004903427889243156, + "loss": 0.86464256, + "num_input_tokens_seen": 225512112, + "router_z_loss_mlp": 0.35498047, + "step": 2708, + "time_per_iteration": 2.8592212200164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169903, + "balance_loss_mlp": 1.13335371, + "epoch": 0.5211619853789919, + "flos": 522889468416.0, + "grad_norm": 0.07702072033180815, + "language_loss": 0.85470927, + "learning_rate": 0.0004900313066245134, + "loss": 0.86640829, + "num_input_tokens_seen": 225586944, + "router_z_loss_mlp": 0.36547852, + "step": 2709, + "time_per_iteration": 2.7046992778778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155719, + "balance_loss_mlp": 1.12145817, + "epoch": 0.5213543670642555, + "flos": 502799187456.0, + "grad_norm": 0.049948125939344834, + "language_loss": 0.80970949, + "learning_rate": 0.0004897198281949012, + "loss": 0.82126665, + "num_input_tokens_seen": 225657184, + "router_z_loss_mlp": 0.34277344, + "step": 2710, + "time_per_iteration": 2.728750228881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164738, + "balance_loss_mlp": 1.12837923, + "epoch": 0.521546748749519, + "flos": 585959712768.0, + "grad_norm": 0.06520397862885238, + "language_loss": 0.77818954, + "learning_rate": 0.0004894083537564057, + "loss": 0.78983688, + "num_input_tokens_seen": 225729968, + "router_z_loss_mlp": 0.36352539, + "step": 2711, + "time_per_iteration": 2.7362277507781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163972, + "balance_loss_mlp": 1.12913883, + "epoch": 0.5217391304347826, + "flos": 570119192064.0, + "grad_norm": 0.051241123094768644, + "language_loss": 0.81174654, + "learning_rate": 0.0004890968834299519, + "loss": 0.82338625, + "num_input_tokens_seen": 225801808, + "router_z_loss_mlp": 0.34838867, + "step": 2712, + "time_per_iteration": 2.768146514892578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156146, + "balance_loss_mlp": 1.12026405, + "epoch": 0.5219315121200462, + "flos": 542784457728.0, + "grad_norm": 0.05945211160457726, + "language_loss": 0.78877795, + "learning_rate": 0.0004887854173364633, + "loss": 0.80033934, + "num_input_tokens_seen": 225878576, + "router_z_loss_mlp": 0.35913086, + "step": 2713, + "time_per_iteration": 2.8356804847717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149792, + "balance_loss_mlp": 1.1157217, + "epoch": 0.5221238938053098, + "flos": 550310464512.0, + "grad_norm": 0.05274159181021226, + "language_loss": 0.81621301, + "learning_rate": 0.0004884739555968617, + "loss": 0.82771093, + "num_input_tokens_seen": 225960096, + "router_z_loss_mlp": 0.34057617, + "step": 2714, + "time_per_iteration": 2.831137180328369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102187, + "balance_loss_mlp": 1.09369898, + "epoch": 0.5223162754905732, + "flos": 1355174157312.0, + "grad_norm": 0.02923312307597506, + "language_loss": 0.78977054, + "learning_rate": 0.0004881624983320676, + "loss": 0.8007924, + "num_input_tokens_seen": 226184960, + "router_z_loss_mlp": 0.08496094, + "step": 2715, + "time_per_iteration": 4.95891547203064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149677, + "balance_loss_mlp": 1.11534512, + "epoch": 0.5225086571758368, + "flos": 567747076608.0, + "grad_norm": 0.06614932878153669, + "language_loss": 0.86865598, + "learning_rate": 0.0004878510456629992, + "loss": 0.88015276, + "num_input_tokens_seen": 226271328, + "router_z_loss_mlp": 0.34326172, + "step": 2716, + "time_per_iteration": 2.968658924102783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145792, + "balance_loss_mlp": 1.1120801, + "epoch": 0.5227010388611004, + "flos": 500158001664.0, + "grad_norm": 0.05224698034347332, + "language_loss": 0.8526777, + "learning_rate": 0.00048753959771057314, + "loss": 0.86413562, + "num_input_tokens_seen": 226340080, + "router_z_loss_mlp": 0.33740234, + "step": 2717, + "time_per_iteration": 2.6395833492279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140286, + "balance_loss_mlp": 1.10736012, + "epoch": 0.522893420546364, + "flos": 597656558592.0, + "grad_norm": 0.0584811227693513, + "language_loss": 0.83152837, + "learning_rate": 0.0004872281545957044, + "loss": 0.84293115, + "num_input_tokens_seen": 226415120, + "router_z_loss_mlp": 0.3293457, + "step": 2718, + "time_per_iteration": 2.7039849758148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135383, + "balance_loss_mlp": 1.10069275, + "epoch": 0.5230858022316276, + "flos": 664605803520.0, + "grad_norm": 0.050310473622198856, + "language_loss": 0.85946554, + "learning_rate": 0.0004869167164393055, + "loss": 0.87081933, + "num_input_tokens_seen": 226501200, + "router_z_loss_mlp": 0.34692383, + "step": 2719, + "time_per_iteration": 2.91475510597229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132518, + "balance_loss_mlp": 1.10028338, + "epoch": 0.5232781839168911, + "flos": 603843434496.0, + "grad_norm": 0.0697291023285212, + "language_loss": 0.89398658, + "learning_rate": 0.00048660528336228793, + "loss": 0.90531176, + "num_input_tokens_seen": 226582064, + "router_z_loss_mlp": 0.32226562, + "step": 2720, + "time_per_iteration": 2.792276620864868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124032, + "balance_loss_mlp": 1.09115386, + "epoch": 0.5234705656021547, + "flos": 550718300160.0, + "grad_norm": 0.05026677719306565, + "language_loss": 0.90367562, + "learning_rate": 0.0004862938554855606, + "loss": 0.91491592, + "num_input_tokens_seen": 226656448, + "router_z_loss_mlp": 0.32885742, + "step": 2721, + "time_per_iteration": 2.7964749336242676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129388, + "balance_loss_mlp": 1.09643817, + "epoch": 0.5236629472874182, + "flos": 504279281664.0, + "grad_norm": 0.0663768296863652, + "language_loss": 0.86310339, + "learning_rate": 0.0004859824329300304, + "loss": 0.87439728, + "num_input_tokens_seen": 226725568, + "router_z_loss_mlp": 0.32958984, + "step": 2722, + "time_per_iteration": 2.6039419174194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128053, + "balance_loss_mlp": 1.09403062, + "epoch": 0.5238553289726818, + "flos": 547654597632.0, + "grad_norm": 0.0581387375581185, + "language_loss": 0.84092689, + "learning_rate": 0.00048567101581660244, + "loss": 0.85220736, + "num_input_tokens_seen": 226795728, + "router_z_loss_mlp": 0.34033203, + "step": 2723, + "time_per_iteration": 2.5987517833709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125005, + "balance_loss_mlp": 1.09227037, + "epoch": 0.5240477106579453, + "flos": 531962380800.0, + "grad_norm": 0.06184026942262611, + "language_loss": 0.87479013, + "learning_rate": 0.00048535960426617956, + "loss": 0.88604021, + "num_input_tokens_seen": 226865344, + "router_z_loss_mlp": 0.32739258, + "step": 2724, + "time_per_iteration": 2.6038565635681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121549, + "balance_loss_mlp": 1.08724082, + "epoch": 0.5242400923432089, + "flos": 617939559936.0, + "grad_norm": 0.05825945935903347, + "language_loss": 0.81925243, + "learning_rate": 0.0004850481983996621, + "loss": 0.83046794, + "num_input_tokens_seen": 226936800, + "router_z_loss_mlp": 0.34350586, + "step": 2725, + "time_per_iteration": 2.7633490562438965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122144, + "balance_loss_mlp": 1.08907521, + "epoch": 0.5244324740284725, + "flos": 416686187520.0, + "grad_norm": 0.06367267368201004, + "language_loss": 0.88101065, + "learning_rate": 0.0004847367983379492, + "loss": 0.89223206, + "num_input_tokens_seen": 226998448, + "router_z_loss_mlp": 0.33081055, + "step": 2726, + "time_per_iteration": 2.520050287246704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119938, + "balance_loss_mlp": 1.08837104, + "epoch": 0.5246248557137361, + "flos": 626436509184.0, + "grad_norm": 0.059069616726974465, + "language_loss": 0.79169118, + "learning_rate": 0.00048442540420193643, + "loss": 0.80289054, + "num_input_tokens_seen": 227081872, + "router_z_loss_mlp": 0.31567383, + "step": 2727, + "time_per_iteration": 2.9363925457000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125304, + "balance_loss_mlp": 1.09278345, + "epoch": 0.5248172373989997, + "flos": 1248463590912.0, + "grad_norm": 0.06091521023817234, + "language_loss": 0.7936945, + "learning_rate": 0.0004841140161125182, + "loss": 0.8049475, + "num_input_tokens_seen": 227167744, + "router_z_loss_mlp": 0.32543945, + "step": 2728, + "time_per_iteration": 3.5786640644073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127178, + "balance_loss_mlp": 1.09666038, + "epoch": 0.5250096190842631, + "flos": 506868710400.0, + "grad_norm": 0.054648351094499156, + "language_loss": 0.85262787, + "learning_rate": 0.0004838026341905857, + "loss": 0.86389971, + "num_input_tokens_seen": 227239136, + "router_z_loss_mlp": 0.30517578, + "step": 2729, + "time_per_iteration": 2.7021641731262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113035, + "balance_loss_mlp": 1.09909368, + "epoch": 0.5252020007695267, + "flos": 611317684224.0, + "grad_norm": 0.06068419443661206, + "language_loss": 0.85131037, + "learning_rate": 0.00048349125855702844, + "loss": 0.8626138, + "num_input_tokens_seen": 227311968, + "router_z_loss_mlp": 0.3125, + "step": 2730, + "time_per_iteration": 2.794691562652588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129298, + "balance_loss_mlp": 1.09754109, + "epoch": 0.5253943824547903, + "flos": 539233998336.0, + "grad_norm": 0.0500444792759443, + "language_loss": 0.81508827, + "learning_rate": 0.00048317988933273287, + "loss": 0.82638121, + "num_input_tokens_seen": 227385248, + "router_z_loss_mlp": 0.31738281, + "step": 2731, + "time_per_iteration": 2.7251734733581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124341, + "balance_loss_mlp": 1.09291768, + "epoch": 0.5255867641400539, + "flos": 698038580736.0, + "grad_norm": 0.06596294225314246, + "language_loss": 0.82520533, + "learning_rate": 0.00048286852663858367, + "loss": 0.83644867, + "num_input_tokens_seen": 227464640, + "router_z_loss_mlp": 0.31420898, + "step": 2732, + "time_per_iteration": 2.972963571548462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120516, + "balance_loss_mlp": 1.0889498, + "epoch": 0.5257791458253175, + "flos": 667289207808.0, + "grad_norm": 0.055500139325311094, + "language_loss": 0.84107697, + "learning_rate": 0.000482557170595462, + "loss": 0.85228211, + "num_input_tokens_seen": 227542192, + "router_z_loss_mlp": 0.31542969, + "step": 2733, + "time_per_iteration": 2.858245849609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112503, + "balance_loss_mlp": 1.09401202, + "epoch": 0.525971527510581, + "flos": 483620751360.0, + "grad_norm": 0.13743136293517658, + "language_loss": 0.87933344, + "learning_rate": 0.0004822458213242475, + "loss": 0.89058375, + "num_input_tokens_seen": 227606096, + "router_z_loss_mlp": 0.31005859, + "step": 2734, + "time_per_iteration": 2.522383213043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112386, + "balance_loss_mlp": 1.08115363, + "epoch": 0.5261639091958445, + "flos": 829916264448.0, + "grad_norm": 0.05651199089550523, + "language_loss": 0.86197513, + "learning_rate": 0.00048193447894581627, + "loss": 0.87309897, + "num_input_tokens_seen": 227689552, + "router_z_loss_mlp": 0.31201172, + "step": 2735, + "time_per_iteration": 3.0866682529449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111368, + "balance_loss_mlp": 1.08235216, + "epoch": 0.5263562908811081, + "flos": 520715215872.0, + "grad_norm": 0.06879211849592783, + "language_loss": 0.88187921, + "learning_rate": 0.00048162314358104243, + "loss": 0.89301598, + "num_input_tokens_seen": 227760784, + "router_z_loss_mlp": 0.31298828, + "step": 2736, + "time_per_iteration": 2.5985138416290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108841, + "balance_loss_mlp": 1.07713127, + "epoch": 0.5265486725663717, + "flos": 574996672512.0, + "grad_norm": 0.05778047820427569, + "language_loss": 0.83687961, + "learning_rate": 0.0004813118153507969, + "loss": 0.84796798, + "num_input_tokens_seen": 227834304, + "router_z_loss_mlp": 0.31713867, + "step": 2737, + "time_per_iteration": 2.73371958732605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022846, + "balance_loss_mlp": 1.01416731, + "epoch": 0.5267410542516352, + "flos": 1547261015040.0, + "grad_norm": 0.01810308130118829, + "language_loss": 0.82447124, + "learning_rate": 0.0004810004943759482, + "loss": 0.83469975, + "num_input_tokens_seen": 228057232, + "router_z_loss_mlp": 0.08691406, + "step": 2738, + "time_per_iteration": 4.790890216827393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097772, + "balance_loss_mlp": 1.06670594, + "epoch": 0.5269334359368988, + "flos": 929952493056.0, + "grad_norm": 0.05745954748436515, + "language_loss": 0.83672923, + "learning_rate": 0.00048068918077736163, + "loss": 0.84770691, + "num_input_tokens_seen": 228140816, + "router_z_loss_mlp": 0.31030273, + "step": 2739, + "time_per_iteration": 3.239821195602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094061, + "balance_loss_mlp": 1.06309009, + "epoch": 0.5271258176221624, + "flos": 655389729792.0, + "grad_norm": 0.06477195420820829, + "language_loss": 0.81728363, + "learning_rate": 0.0004803778746759001, + "loss": 0.82822424, + "num_input_tokens_seen": 228216208, + "router_z_loss_mlp": 0.30932617, + "step": 2740, + "time_per_iteration": 2.942760944366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096968, + "balance_loss_mlp": 1.06614065, + "epoch": 0.527318199307426, + "flos": 543036648960.0, + "grad_norm": 0.05799868370730736, + "language_loss": 0.81935298, + "learning_rate": 0.00048006657619242317, + "loss": 0.83032262, + "num_input_tokens_seen": 228283184, + "router_z_loss_mlp": 0.30810547, + "step": 2741, + "time_per_iteration": 2.6397995948791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098549, + "balance_loss_mlp": 1.06550419, + "epoch": 0.5275105809926895, + "flos": 447882670080.0, + "grad_norm": 0.07558439368734231, + "language_loss": 0.78591353, + "learning_rate": 0.00047975528544778775, + "loss": 0.79689896, + "num_input_tokens_seen": 228351328, + "router_z_loss_mlp": 0.33056641, + "step": 2742, + "time_per_iteration": 2.694324493408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091173, + "balance_loss_mlp": 1.06058371, + "epoch": 0.527702962677953, + "flos": 578935143936.0, + "grad_norm": 0.06405052151098177, + "language_loss": 0.88749677, + "learning_rate": 0.00047944400256284754, + "loss": 0.89840853, + "num_input_tokens_seen": 228423632, + "router_z_loss_mlp": 0.30566406, + "step": 2743, + "time_per_iteration": 2.6816787719726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098065, + "balance_loss_mlp": 1.06809616, + "epoch": 0.5278953443632166, + "flos": 652773136896.0, + "grad_norm": 0.07088810283562207, + "language_loss": 0.80461031, + "learning_rate": 0.0004791327276584532, + "loss": 0.81559092, + "num_input_tokens_seen": 228498736, + "router_z_loss_mlp": 0.29956055, + "step": 2744, + "time_per_iteration": 2.8708317279815674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098246, + "balance_loss_mlp": 1.06596446, + "epoch": 0.5280877260484802, + "flos": 514001935872.0, + "grad_norm": 0.06685009455993486, + "language_loss": 0.8087393, + "learning_rate": 0.00047882146085545264, + "loss": 0.81972182, + "num_input_tokens_seen": 228569056, + "router_z_loss_mlp": 0.32250977, + "step": 2745, + "time_per_iteration": 2.610027551651001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021006, + "balance_loss_mlp": 1.01204121, + "epoch": 0.5282801077337438, + "flos": 1445460567552.0, + "grad_norm": 0.008936429220158798, + "language_loss": 0.75402379, + "learning_rate": 0.00047851020227469, + "loss": 0.76423383, + "num_input_tokens_seen": 228800560, + "router_z_loss_mlp": 0.08984375, + "step": 2746, + "time_per_iteration": 5.000555038452148 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097643, + "balance_loss_mlp": 1.06767416, + "epoch": 0.5284724894190073, + "flos": 604856595456.0, + "grad_norm": 0.06348628312729114, + "language_loss": 0.79553157, + "learning_rate": 0.00047819895203700684, + "loss": 0.806508, + "num_input_tokens_seen": 228869216, + "router_z_loss_mlp": 0.29907227, + "step": 2747, + "time_per_iteration": 2.7115635871887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017614, + "balance_loss_mlp": 1.0085541, + "epoch": 0.5286648711042709, + "flos": 1494956321280.0, + "grad_norm": 0.007776557121409109, + "language_loss": 0.75512433, + "learning_rate": 0.0004778877102632412, + "loss": 0.76530045, + "num_input_tokens_seen": 229085520, + "router_z_loss_mlp": 0.09082031, + "step": 2748, + "time_per_iteration": 4.672155141830444 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092605, + "balance_loss_mlp": 1.06263614, + "epoch": 0.5288572527895344, + "flos": 597616911360.0, + "grad_norm": 0.06781776650114792, + "language_loss": 0.8852309, + "learning_rate": 0.0004775764770742277, + "loss": 0.89615691, + "num_input_tokens_seen": 229160912, + "router_z_loss_mlp": 0.29931641, + "step": 2749, + "time_per_iteration": 2.8029801845550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097703, + "balance_loss_mlp": 1.06542146, + "epoch": 0.529049634474798, + "flos": 557320352256.0, + "grad_norm": 0.07126893850665976, + "language_loss": 0.86776084, + "learning_rate": 0.00047726525259079777, + "loss": 0.87873781, + "num_input_tokens_seen": 229235792, + "router_z_loss_mlp": 0.32299805, + "step": 2750, + "time_per_iteration": 2.7803709506988525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097184, + "balance_loss_mlp": 1.06568849, + "epoch": 0.5292420161600616, + "flos": 581274952704.0, + "grad_norm": 0.07487878206236488, + "language_loss": 0.88641649, + "learning_rate": 0.0004769540369337798, + "loss": 0.89738834, + "num_input_tokens_seen": 229309984, + "router_z_loss_mlp": 0.31469727, + "step": 2751, + "time_per_iteration": 2.7477662563323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103352, + "balance_loss_mlp": 1.07166588, + "epoch": 0.5294343978453251, + "flos": 608303167488.0, + "grad_norm": 0.06303354467879724, + "language_loss": 0.86111081, + "learning_rate": 0.00047664283022399794, + "loss": 0.87214434, + "num_input_tokens_seen": 229394000, + "router_z_loss_mlp": 0.31665039, + "step": 2752, + "time_per_iteration": 2.8321616649627686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111513, + "balance_loss_mlp": 1.08142424, + "epoch": 0.5296267795305887, + "flos": 646522020864.0, + "grad_norm": 0.1009265551294561, + "language_loss": 0.81372654, + "learning_rate": 0.00047633163258227376, + "loss": 0.82484162, + "num_input_tokens_seen": 229474320, + "router_z_loss_mlp": 0.30053711, + "step": 2753, + "time_per_iteration": 2.866710662841797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107072, + "balance_loss_mlp": 1.07536244, + "epoch": 0.5298191612158523, + "flos": 559746796032.0, + "grad_norm": 0.06597410250171662, + "language_loss": 0.85720521, + "learning_rate": 0.0004760204441294247, + "loss": 0.86827588, + "num_input_tokens_seen": 229543072, + "router_z_loss_mlp": 0.31689453, + "step": 2754, + "time_per_iteration": 2.635411500930786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123028, + "balance_loss_mlp": 1.09172344, + "epoch": 0.5300115429011159, + "flos": 514046352384.0, + "grad_norm": 0.06814428712155127, + "language_loss": 0.86859232, + "learning_rate": 0.00047570926498626486, + "loss": 0.87982261, + "num_input_tokens_seen": 229615296, + "router_z_loss_mlp": 0.31274414, + "step": 2755, + "time_per_iteration": 2.678027629852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139797, + "balance_loss_mlp": 1.10846841, + "epoch": 0.5302039245863793, + "flos": 672789265920.0, + "grad_norm": 0.05259166917973927, + "language_loss": 0.8179211, + "learning_rate": 0.00047539809527360474, + "loss": 0.82931906, + "num_input_tokens_seen": 229693728, + "router_z_loss_mlp": 0.31298828, + "step": 2756, + "time_per_iteration": 2.8505630493164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139868, + "balance_loss_mlp": 1.1087544, + "epoch": 0.5303963062716429, + "flos": 730836297216.0, + "grad_norm": 0.23589307030508885, + "language_loss": 0.82282543, + "learning_rate": 0.0004750869351122511, + "loss": 0.83422416, + "num_input_tokens_seen": 229772144, + "router_z_loss_mlp": 0.31079102, + "step": 2757, + "time_per_iteration": 3.007599353790283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114789, + "balance_loss_mlp": 1.11598992, + "epoch": 0.5305886879569065, + "flos": 573435085824.0, + "grad_norm": 0.06932827369161218, + "language_loss": 0.81883401, + "learning_rate": 0.00047477578462300685, + "loss": 0.83031291, + "num_input_tokens_seen": 229847024, + "router_z_loss_mlp": 0.31884766, + "step": 2758, + "time_per_iteration": 2.7112765312194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144814, + "balance_loss_mlp": 1.11215043, + "epoch": 0.5307810696421701, + "flos": 695335352832.0, + "grad_norm": 0.060390901611552056, + "language_loss": 0.79751188, + "learning_rate": 0.0004744646439266718, + "loss": 0.80895996, + "num_input_tokens_seen": 229932416, + "router_z_loss_mlp": 0.32641602, + "step": 2759, + "time_per_iteration": 2.9956624507904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141525, + "balance_loss_mlp": 1.10905194, + "epoch": 0.5309734513274337, + "flos": 648943322112.0, + "grad_norm": 0.0692957942514688, + "language_loss": 0.92371601, + "learning_rate": 0.000474153513144041, + "loss": 0.93513119, + "num_input_tokens_seen": 230010976, + "router_z_loss_mlp": 0.32470703, + "step": 2760, + "time_per_iteration": 2.902304172515869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114025, + "balance_loss_mlp": 1.10756326, + "epoch": 0.5311658330126972, + "flos": 604824288768.0, + "grad_norm": 0.06953135792158749, + "language_loss": 0.87208283, + "learning_rate": 0.00047384239239590633, + "loss": 0.88348538, + "num_input_tokens_seen": 230093344, + "router_z_loss_mlp": 0.3269043, + "step": 2761, + "time_per_iteration": 2.9197542667388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127002, + "balance_loss_mlp": 1.09414792, + "epoch": 0.5313582146979607, + "flos": 558259361280.0, + "grad_norm": 0.06520154041113266, + "language_loss": 0.89041948, + "learning_rate": 0.0004735312818030556, + "loss": 0.90168953, + "num_input_tokens_seen": 230165520, + "router_z_loss_mlp": 0.32861328, + "step": 2762, + "time_per_iteration": 2.699882745742798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128123, + "balance_loss_mlp": 1.0964613, + "epoch": 0.5315505963832243, + "flos": 508410473472.0, + "grad_norm": 0.0963196289257929, + "language_loss": 0.83125454, + "learning_rate": 0.0004732201814862727, + "loss": 0.84253573, + "num_input_tokens_seen": 230237808, + "router_z_loss_mlp": 0.31640625, + "step": 2763, + "time_per_iteration": 2.7439024448394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113884, + "balance_loss_mlp": 1.08155453, + "epoch": 0.5317429780684879, + "flos": 626439080448.0, + "grad_norm": 0.058489246415432364, + "language_loss": 0.81845987, + "learning_rate": 0.0004729090915663373, + "loss": 0.82959872, + "num_input_tokens_seen": 230321568, + "router_z_loss_mlp": 0.32324219, + "step": 2764, + "time_per_iteration": 2.880218029022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112044, + "balance_loss_mlp": 1.07930923, + "epoch": 0.5319353597537514, + "flos": 476744486400.0, + "grad_norm": 0.08176902294326427, + "language_loss": 0.85593212, + "learning_rate": 0.00047259801216402534, + "loss": 0.86705256, + "num_input_tokens_seen": 230385376, + "router_z_loss_mlp": 0.32739258, + "step": 2765, + "time_per_iteration": 2.5215423107147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113516, + "balance_loss_mlp": 1.0809716, + "epoch": 0.532127741439015, + "flos": 501635524608.0, + "grad_norm": 0.0984419544464696, + "language_loss": 0.86589384, + "learning_rate": 0.00047228694340010845, + "loss": 0.87702894, + "num_input_tokens_seen": 230449760, + "router_z_loss_mlp": 0.32543945, + "step": 2766, + "time_per_iteration": 2.615323781967163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106832, + "balance_loss_mlp": 1.07288122, + "epoch": 0.5323201231242786, + "flos": 1164586512384.0, + "grad_norm": 0.06857994992356635, + "language_loss": 0.85894436, + "learning_rate": 0.0004719758853953544, + "loss": 0.87001264, + "num_input_tokens_seen": 230536592, + "router_z_loss_mlp": 0.33984375, + "step": 2767, + "time_per_iteration": 3.580965042114258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109799, + "balance_loss_mlp": 1.07475162, + "epoch": 0.5325125048095422, + "flos": 378702273024.0, + "grad_norm": 0.07966941077078553, + "language_loss": 0.84403044, + "learning_rate": 0.00047166483827052645, + "loss": 0.85512847, + "num_input_tokens_seen": 230596688, + "router_z_loss_mlp": 0.35083008, + "step": 2768, + "time_per_iteration": 2.3937976360321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112761, + "balance_loss_mlp": 1.09797895, + "epoch": 0.5327048864948057, + "flos": 1541353121280.0, + "grad_norm": 0.05218838233145069, + "language_loss": 0.77078491, + "learning_rate": 0.00047135380214638413, + "loss": 0.78191251, + "num_input_tokens_seen": 230829408, + "router_z_loss_mlp": 0.14746094, + "step": 2769, + "time_per_iteration": 4.980372905731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112083, + "balance_loss_mlp": 1.07910919, + "epoch": 0.5328972681800692, + "flos": 911272923648.0, + "grad_norm": 0.05422451751257763, + "language_loss": 0.8393681, + "learning_rate": 0.000471042777143682, + "loss": 0.8504889, + "num_input_tokens_seen": 230912528, + "router_z_loss_mlp": 0.32958984, + "step": 2770, + "time_per_iteration": 3.2559990882873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109219, + "balance_loss_mlp": 1.07576907, + "epoch": 0.5330896498653328, + "flos": 473898097152.0, + "grad_norm": 0.05619534531580183, + "language_loss": 0.79500479, + "learning_rate": 0.0004707317633831707, + "loss": 0.80609697, + "num_input_tokens_seen": 230979424, + "router_z_loss_mlp": 0.3347168, + "step": 2771, + "time_per_iteration": 2.580369472503662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113861, + "balance_loss_mlp": 1.07976723, + "epoch": 0.5332820315505964, + "flos": 501635524608.0, + "grad_norm": 0.07426752742264173, + "language_loss": 0.78140616, + "learning_rate": 0.00047042076098559673, + "loss": 0.79254484, + "num_input_tokens_seen": 231046416, + "router_z_loss_mlp": 0.34130859, + "step": 2772, + "time_per_iteration": 2.656357765197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115026, + "balance_loss_mlp": 1.08131373, + "epoch": 0.53347441323586, + "flos": 924439951872.0, + "grad_norm": 0.07148667655520102, + "language_loss": 0.74185407, + "learning_rate": 0.00047010977007170174, + "loss": 0.75300431, + "num_input_tokens_seen": 231136064, + "router_z_loss_mlp": 0.3371582, + "step": 2773, + "time_per_iteration": 3.2167580127716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103553, + "balance_loss_mlp": 1.07079434, + "epoch": 0.5336667949211235, + "flos": 574455587328.0, + "grad_norm": 0.05649801417476766, + "language_loss": 0.82702589, + "learning_rate": 0.00046979879076222334, + "loss": 0.83806139, + "num_input_tokens_seen": 231203616, + "router_z_loss_mlp": 0.32763672, + "step": 2774, + "time_per_iteration": 2.6618025302886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109156, + "balance_loss_mlp": 1.07689798, + "epoch": 0.533859176606387, + "flos": 1064664082944.0, + "grad_norm": 0.05944272870619304, + "language_loss": 0.85247773, + "learning_rate": 0.0004694878231778939, + "loss": 0.86356932, + "num_input_tokens_seen": 231287008, + "router_z_loss_mlp": 0.32250977, + "step": 2775, + "time_per_iteration": 3.381577968597412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105801, + "balance_loss_mlp": 1.07459164, + "epoch": 0.5340515582916506, + "flos": 746602665984.0, + "grad_norm": 0.05869389504796052, + "language_loss": 0.84721255, + "learning_rate": 0.0004691768674394423, + "loss": 0.85827059, + "num_input_tokens_seen": 231365296, + "router_z_loss_mlp": 0.31176758, + "step": 2776, + "time_per_iteration": 2.9549882411956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041745, + "balance_loss_mlp": 1.03230345, + "epoch": 0.5342439399769142, + "flos": 1445685594624.0, + "grad_norm": 0.020468065913813137, + "language_loss": 0.84484011, + "learning_rate": 0.0004688659236675918, + "loss": 0.85525757, + "num_input_tokens_seen": 231579040, + "router_z_loss_mlp": 0.09423828, + "step": 2777, + "time_per_iteration": 4.780264139175415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039247, + "balance_loss_mlp": 1.03013933, + "epoch": 0.5344363216621778, + "flos": 1427569505280.0, + "grad_norm": 0.02045845897293101, + "language_loss": 0.76653534, + "learning_rate": 0.00046855499198306187, + "loss": 0.77692783, + "num_input_tokens_seen": 231812736, + "router_z_loss_mlp": 0.09130859, + "step": 2778, + "time_per_iteration": 5.030272960662842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101269, + "balance_loss_mlp": 1.06870127, + "epoch": 0.5346287033474413, + "flos": 527618644992.0, + "grad_norm": 0.06089610481991967, + "language_loss": 0.7961477, + "learning_rate": 0.00046824407250656676, + "loss": 0.80716044, + "num_input_tokens_seen": 231883840, + "router_z_loss_mlp": 0.32568359, + "step": 2779, + "time_per_iteration": 2.6681063175201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096395, + "balance_loss_mlp": 1.06537652, + "epoch": 0.5348210850327049, + "flos": 510762765312.0, + "grad_norm": 0.04990324067280663, + "language_loss": 0.83819127, + "learning_rate": 0.0004679331653588161, + "loss": 0.84915525, + "num_input_tokens_seen": 231955360, + "router_z_loss_mlp": 0.30981445, + "step": 2780, + "time_per_iteration": 2.635774612426758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092346, + "balance_loss_mlp": 1.05999231, + "epoch": 0.5350134667179685, + "flos": 462668184576.0, + "grad_norm": 0.06684745885443293, + "language_loss": 0.85806221, + "learning_rate": 0.0004676222706605147, + "loss": 0.86898565, + "num_input_tokens_seen": 232027088, + "router_z_loss_mlp": 0.32348633, + "step": 2781, + "time_per_iteration": 2.6137733459472656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092057, + "balance_loss_mlp": 1.05886936, + "epoch": 0.535205848403232, + "flos": 708875712000.0, + "grad_norm": 0.08426708268962642, + "language_loss": 0.85464495, + "learning_rate": 0.0004673113885323626, + "loss": 0.86556554, + "num_input_tokens_seen": 232099472, + "router_z_loss_mlp": 0.33203125, + "step": 2782, + "time_per_iteration": 2.861581802368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083804, + "balance_loss_mlp": 1.05083072, + "epoch": 0.5353982300884956, + "flos": 894241575936.0, + "grad_norm": 0.060311716473253056, + "language_loss": 0.78792584, + "learning_rate": 0.00046700051909505494, + "loss": 0.79876387, + "num_input_tokens_seen": 232182528, + "router_z_loss_mlp": 0.32983398, + "step": 2783, + "time_per_iteration": 3.182298183441162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089723, + "balance_loss_mlp": 1.05407953, + "epoch": 0.5355906117737591, + "flos": 535965092352.0, + "grad_norm": 0.06678042842361867, + "language_loss": 0.84239137, + "learning_rate": 0.000466689662469282, + "loss": 0.85328859, + "num_input_tokens_seen": 232253344, + "router_z_loss_mlp": 0.35644531, + "step": 2784, + "time_per_iteration": 2.6519503593444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082012, + "balance_loss_mlp": 1.04891968, + "epoch": 0.5357829934590227, + "flos": 868846528512.0, + "grad_norm": 0.06002174049054728, + "language_loss": 0.83905756, + "learning_rate": 0.00046637881877572917, + "loss": 0.84987772, + "num_input_tokens_seen": 232337232, + "router_z_loss_mlp": 0.33105469, + "step": 2785, + "time_per_iteration": 3.1058127880096436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084703, + "balance_loss_mlp": 1.051754, + "epoch": 0.5359753751442863, + "flos": 553287905280.0, + "grad_norm": 0.0580195679012457, + "language_loss": 0.8490684, + "learning_rate": 0.0004660679881350764, + "loss": 0.85991538, + "num_input_tokens_seen": 232412864, + "router_z_loss_mlp": 0.32958984, + "step": 2786, + "time_per_iteration": 2.77021861076355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053559, + "balance_loss_mlp": 1.0447371, + "epoch": 0.5361677568295499, + "flos": 1480499347968.0, + "grad_norm": 0.032864625150969516, + "language_loss": 0.75608146, + "learning_rate": 0.0004657571706679988, + "loss": 0.76661706, + "num_input_tokens_seen": 232639888, + "router_z_loss_mlp": 0.08837891, + "step": 2787, + "time_per_iteration": 5.029211044311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087215, + "balance_loss_mlp": 1.05335903, + "epoch": 0.5363601385148133, + "flos": 806255700480.0, + "grad_norm": 0.07679411484967892, + "language_loss": 0.77928644, + "learning_rate": 0.0004654463664951667, + "loss": 0.79015857, + "num_input_tokens_seen": 232719248, + "router_z_loss_mlp": 0.33886719, + "step": 2788, + "time_per_iteration": 2.9762089252471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088019, + "balance_loss_mlp": 1.05464029, + "epoch": 0.5365525202000769, + "flos": 507879300096.0, + "grad_norm": 0.06025701653165108, + "language_loss": 0.83150423, + "learning_rate": 0.0004651355757372447, + "loss": 0.84238434, + "num_input_tokens_seen": 232788464, + "router_z_loss_mlp": 0.33398438, + "step": 2789, + "time_per_iteration": 2.5971946716308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089252, + "balance_loss_mlp": 1.05604005, + "epoch": 0.5367449018853405, + "flos": 528930611712.0, + "grad_norm": 0.08338964083328992, + "language_loss": 0.8607617, + "learning_rate": 0.00046482479851489274, + "loss": 0.87165421, + "num_input_tokens_seen": 232859792, + "router_z_loss_mlp": 0.33227539, + "step": 2790, + "time_per_iteration": 2.6431193351745605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109405, + "balance_loss_mlp": 1.06119633, + "epoch": 0.5369372835706041, + "flos": 649934088192.0, + "grad_norm": 0.07763218475438792, + "language_loss": 0.77609432, + "learning_rate": 0.00046451403494876525, + "loss": 0.78703481, + "num_input_tokens_seen": 232941472, + "router_z_loss_mlp": 0.32861328, + "step": 2791, + "time_per_iteration": 2.860164165496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092942, + "balance_loss_mlp": 1.05918157, + "epoch": 0.5371296652558677, + "flos": 584489530368.0, + "grad_norm": 0.06279789357775317, + "language_loss": 0.84532517, + "learning_rate": 0.0004642032851595111, + "loss": 0.85625458, + "num_input_tokens_seen": 233017120, + "router_z_loss_mlp": 0.33789062, + "step": 2792, + "time_per_iteration": 2.7511003017425537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106074, + "balance_loss_mlp": 1.07081246, + "epoch": 0.5373220469411312, + "flos": 595872516096.0, + "grad_norm": 0.05029896863334896, + "language_loss": 0.85103881, + "learning_rate": 0.00046389254926777404, + "loss": 0.86209953, + "num_input_tokens_seen": 233095408, + "router_z_loss_mlp": 0.35253906, + "step": 2793, + "time_per_iteration": 2.7946324348449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105034, + "balance_loss_mlp": 1.07229924, + "epoch": 0.5375144286263948, + "flos": 1114426335744.0, + "grad_norm": 0.05473465194283574, + "language_loss": 0.78127646, + "learning_rate": 0.0004635818273941926, + "loss": 0.79232681, + "num_input_tokens_seen": 233191056, + "router_z_loss_mlp": 0.32739258, + "step": 2794, + "time_per_iteration": 3.5742921829223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109863, + "balance_loss_mlp": 1.07641304, + "epoch": 0.5377068103116583, + "flos": 595608215040.0, + "grad_norm": 0.07615315185796866, + "language_loss": 0.82079315, + "learning_rate": 0.0004632711196593997, + "loss": 0.83189178, + "num_input_tokens_seen": 233265536, + "router_z_loss_mlp": 0.3347168, + "step": 2795, + "time_per_iteration": 2.7694544792175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110741, + "balance_loss_mlp": 1.07907939, + "epoch": 0.5378991919969219, + "flos": 884200292352.0, + "grad_norm": 0.07020702036152926, + "language_loss": 0.85457337, + "learning_rate": 0.00046296042618402297, + "loss": 0.86568069, + "num_input_tokens_seen": 233348224, + "router_z_loss_mlp": 0.31640625, + "step": 2796, + "time_per_iteration": 3.0587034225463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109738, + "balance_loss_mlp": 1.07883883, + "epoch": 0.5380915736821854, + "flos": 710664523776.0, + "grad_norm": 0.06759922925686453, + "language_loss": 0.7969842, + "learning_rate": 0.0004626497470886839, + "loss": 0.80808163, + "num_input_tokens_seen": 233429344, + "router_z_loss_mlp": 0.30883789, + "step": 2797, + "time_per_iteration": 3.002824068069458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105945, + "balance_loss_mlp": 1.07299602, + "epoch": 0.538283955367449, + "flos": 556999151616.0, + "grad_norm": 0.07466819588637175, + "language_loss": 0.82158947, + "learning_rate": 0.00046233908249399897, + "loss": 0.83264899, + "num_input_tokens_seen": 233504944, + "router_z_loss_mlp": 0.32958984, + "step": 2798, + "time_per_iteration": 2.7746241092681885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097876, + "balance_loss_mlp": 1.06559372, + "epoch": 0.5384763370527126, + "flos": 513470762496.0, + "grad_norm": 0.05453000178981586, + "language_loss": 0.78238356, + "learning_rate": 0.00046202843252057905, + "loss": 0.79336226, + "num_input_tokens_seen": 233573072, + "router_z_loss_mlp": 0.32275391, + "step": 2799, + "time_per_iteration": 2.581350803375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097308, + "balance_loss_mlp": 1.06478727, + "epoch": 0.5386687187379762, + "flos": 489736046592.0, + "grad_norm": 0.06584834464031906, + "language_loss": 0.84020996, + "learning_rate": 0.00046171779728902896, + "loss": 0.85118306, + "num_input_tokens_seen": 233640896, + "router_z_loss_mlp": 0.32495117, + "step": 2800, + "time_per_iteration": 2.577760934829712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092142, + "balance_loss_mlp": 1.05988431, + "epoch": 0.5388611004232398, + "flos": 482657149440.0, + "grad_norm": 0.0769580423168035, + "language_loss": 0.85918987, + "learning_rate": 0.000461407176919948, + "loss": 0.87011129, + "num_input_tokens_seen": 233703904, + "router_z_loss_mlp": 0.32250977, + "step": 2801, + "time_per_iteration": 2.5490942001342773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093913, + "balance_loss_mlp": 1.06189322, + "epoch": 0.5390534821085032, + "flos": 560984610816.0, + "grad_norm": 0.05361052263899676, + "language_loss": 0.85168314, + "learning_rate": 0.00046109657153392997, + "loss": 0.86262226, + "num_input_tokens_seen": 233779248, + "router_z_loss_mlp": 0.32006836, + "step": 2802, + "time_per_iteration": 2.7699196338653564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095158, + "balance_loss_mlp": 1.06132686, + "epoch": 0.5392458637937668, + "flos": 488377092096.0, + "grad_norm": 0.07003946535384918, + "language_loss": 0.82877356, + "learning_rate": 0.0004607859812515622, + "loss": 0.83972514, + "num_input_tokens_seen": 233847520, + "router_z_loss_mlp": 0.33862305, + "step": 2803, + "time_per_iteration": 2.6007485389709473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093716, + "balance_loss_mlp": 1.06198251, + "epoch": 0.5394382454790304, + "flos": 512057479680.0, + "grad_norm": 0.06322278970979951, + "language_loss": 0.88066649, + "learning_rate": 0.00046047540619342667, + "loss": 0.89160359, + "num_input_tokens_seen": 233911328, + "router_z_loss_mlp": 0.31713867, + "step": 2804, + "time_per_iteration": 2.5943124294281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092603, + "balance_loss_mlp": 1.06163239, + "epoch": 0.539630627164294, + "flos": 567586662912.0, + "grad_norm": 0.060964528711389604, + "language_loss": 0.80115181, + "learning_rate": 0.00046016484648009933, + "loss": 0.81207782, + "num_input_tokens_seen": 233987104, + "router_z_loss_mlp": 0.30957031, + "step": 2805, + "time_per_iteration": 2.707387924194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096878, + "balance_loss_mlp": 1.0659312, + "epoch": 0.5398230088495575, + "flos": 526462322688.0, + "grad_norm": 0.05960799154457967, + "language_loss": 0.80838758, + "learning_rate": 0.0004598543022321501, + "loss": 0.81935638, + "num_input_tokens_seen": 234057216, + "router_z_loss_mlp": 0.30908203, + "step": 2806, + "time_per_iteration": 2.606360673904419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103257, + "balance_loss_mlp": 1.07080865, + "epoch": 0.5400153905348211, + "flos": 538764493824.0, + "grad_norm": 0.059370042319646085, + "language_loss": 0.80030453, + "learning_rate": 0.0004595437735701433, + "loss": 0.81133705, + "num_input_tokens_seen": 234129984, + "router_z_loss_mlp": 0.32446289, + "step": 2807, + "time_per_iteration": 2.674914836883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096509, + "balance_loss_mlp": 1.06448901, + "epoch": 0.5402077722200846, + "flos": 513539771904.0, + "grad_norm": 0.07129928038264445, + "language_loss": 0.83467567, + "learning_rate": 0.00045923326061463623, + "loss": 0.84564078, + "num_input_tokens_seen": 234203920, + "router_z_loss_mlp": 0.32006836, + "step": 2808, + "time_per_iteration": 2.7732136249542236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093728, + "balance_loss_mlp": 1.0615654, + "epoch": 0.5404001539053482, + "flos": 676258232832.0, + "grad_norm": 0.061183409959599915, + "language_loss": 0.81861985, + "learning_rate": 0.00045892276348618113, + "loss": 0.82955706, + "num_input_tokens_seen": 234285440, + "router_z_loss_mlp": 0.3215332, + "step": 2809, + "time_per_iteration": 2.9496963024139404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041005, + "balance_loss_mlp": 1.03318524, + "epoch": 0.5405925355906118, + "flos": 1554834009600.0, + "grad_norm": 0.03295349175272743, + "language_loss": 0.78260827, + "learning_rate": 0.0004586122823053235, + "loss": 0.79301834, + "num_input_tokens_seen": 234521424, + "router_z_loss_mlp": 0.078125, + "step": 2810, + "time_per_iteration": 4.980771064758301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095175, + "balance_loss_mlp": 1.06375122, + "epoch": 0.5407849172758753, + "flos": 647310154752.0, + "grad_norm": 0.048089637178950914, + "language_loss": 0.80807102, + "learning_rate": 0.000458301817192603, + "loss": 0.81902277, + "num_input_tokens_seen": 234601632, + "router_z_loss_mlp": 0.31396484, + "step": 2811, + "time_per_iteration": 2.819394111633301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014174, + "balance_loss_mlp": 1.00659227, + "epoch": 0.5409772989611389, + "flos": 1407407643648.0, + "grad_norm": 0.018125943247431338, + "language_loss": 0.8084178, + "learning_rate": 0.00045799136826855263, + "loss": 0.81855953, + "num_input_tokens_seen": 234825776, + "router_z_loss_mlp": 0.07568359, + "step": 2812, + "time_per_iteration": 4.830869197845459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094625, + "balance_loss_mlp": 1.06312966, + "epoch": 0.5411696806464025, + "flos": 554389899264.0, + "grad_norm": 0.07142535441885249, + "language_loss": 0.8774603, + "learning_rate": 0.00045768093565369983, + "loss": 0.88840652, + "num_input_tokens_seen": 234901504, + "router_z_loss_mlp": 0.31494141, + "step": 2813, + "time_per_iteration": 2.7351324558258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101251, + "balance_loss_mlp": 1.06911242, + "epoch": 0.5413620623316661, + "flos": 528122654208.0, + "grad_norm": 0.0566212514723048, + "language_loss": 0.82215679, + "learning_rate": 0.0004573705194685646, + "loss": 0.83316934, + "num_input_tokens_seen": 234970288, + "router_z_loss_mlp": 0.32128906, + "step": 2814, + "time_per_iteration": 2.6945576667785645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100716, + "balance_loss_mlp": 1.06860089, + "epoch": 0.5415544440169295, + "flos": 598741300224.0, + "grad_norm": 0.06333436634677812, + "language_loss": 0.85428321, + "learning_rate": 0.00045706011983366157, + "loss": 0.86529034, + "num_input_tokens_seen": 235039984, + "router_z_loss_mlp": 0.32080078, + "step": 2815, + "time_per_iteration": 2.681619882583618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108066, + "balance_loss_mlp": 1.07623768, + "epoch": 0.5417468257021931, + "flos": 470757671424.0, + "grad_norm": 0.068256039366798, + "language_loss": 0.8269453, + "learning_rate": 0.00045674973686949847, + "loss": 0.83802599, + "num_input_tokens_seen": 235105232, + "router_z_loss_mlp": 0.31835938, + "step": 2816, + "time_per_iteration": 2.5405073165893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109346, + "balance_loss_mlp": 1.07830381, + "epoch": 0.5419392073874567, + "flos": 680819281920.0, + "grad_norm": 0.0555657817841838, + "language_loss": 0.85590029, + "learning_rate": 0.0004564393706965766, + "loss": 0.86699367, + "num_input_tokens_seen": 235192560, + "router_z_loss_mlp": 0.31005859, + "step": 2817, + "time_per_iteration": 2.9834089279174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102481, + "balance_loss_mlp": 1.07079506, + "epoch": 0.5421315890727203, + "flos": 462374148096.0, + "grad_norm": 0.052731051534337416, + "language_loss": 0.81111342, + "learning_rate": 0.00045612902143539116, + "loss": 0.82213825, + "num_input_tokens_seen": 235258448, + "router_z_loss_mlp": 0.31665039, + "step": 2818, + "time_per_iteration": 2.5867249965667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099209, + "balance_loss_mlp": 1.06935942, + "epoch": 0.5423239707579839, + "flos": 436959277056.0, + "grad_norm": 0.08027643777933474, + "language_loss": 0.82169372, + "learning_rate": 0.00045581868920642986, + "loss": 0.83268583, + "num_input_tokens_seen": 235322176, + "router_z_loss_mlp": 0.29833984, + "step": 2819, + "time_per_iteration": 2.538219928741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100605, + "balance_loss_mlp": 1.06968212, + "epoch": 0.5425163524432474, + "flos": 458314536960.0, + "grad_norm": 0.056746529630016036, + "language_loss": 0.79290533, + "learning_rate": 0.00045550837413017457, + "loss": 0.80391139, + "num_input_tokens_seen": 235390960, + "router_z_loss_mlp": 0.30883789, + "step": 2820, + "time_per_iteration": 2.6461877822875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100089, + "balance_loss_mlp": 1.06995249, + "epoch": 0.542708734128511, + "flos": 419495500800.0, + "grad_norm": 0.06471497165860861, + "language_loss": 0.85196662, + "learning_rate": 0.0004551980763271005, + "loss": 0.86296749, + "num_input_tokens_seen": 235460976, + "router_z_loss_mlp": 0.30102539, + "step": 2821, + "time_per_iteration": 2.6883745193481445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102076, + "balance_loss_mlp": 1.07015133, + "epoch": 0.5429011158137745, + "flos": 678454880256.0, + "grad_norm": 0.058885459141671155, + "language_loss": 0.84080005, + "learning_rate": 0.0004548877959176756, + "loss": 0.85182083, + "num_input_tokens_seen": 235540912, + "router_z_loss_mlp": 0.3190918, + "step": 2822, + "time_per_iteration": 2.861867666244507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096714, + "balance_loss_mlp": 1.06595802, + "epoch": 0.5430934974990381, + "flos": 540924065280.0, + "grad_norm": 0.06589540393120931, + "language_loss": 0.86233151, + "learning_rate": 0.00045457753302236166, + "loss": 0.87329865, + "num_input_tokens_seen": 235608736, + "router_z_loss_mlp": 0.30737305, + "step": 2823, + "time_per_iteration": 2.687164068222046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097063, + "balance_loss_mlp": 1.06685555, + "epoch": 0.5432858791843016, + "flos": 658468486656.0, + "grad_norm": 0.07425338305054356, + "language_loss": 0.87034917, + "learning_rate": 0.00045426728776161353, + "loss": 0.88131976, + "num_input_tokens_seen": 235678720, + "router_z_loss_mlp": 0.30175781, + "step": 2824, + "time_per_iteration": 2.7938835620880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104052, + "balance_loss_mlp": 1.07224679, + "epoch": 0.5434782608695652, + "flos": 531935216640.0, + "grad_norm": 0.05711338707468448, + "language_loss": 0.81608665, + "learning_rate": 0.00045395706025587863, + "loss": 0.82712722, + "num_input_tokens_seen": 235748704, + "router_z_loss_mlp": 0.31787109, + "step": 2825, + "time_per_iteration": 2.6212074756622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099159, + "balance_loss_mlp": 1.06907105, + "epoch": 0.5436706425548288, + "flos": 608501030400.0, + "grad_norm": 0.07865669635555295, + "language_loss": 0.8299852, + "learning_rate": 0.00045364685062559843, + "loss": 0.84097683, + "num_input_tokens_seen": 235828224, + "router_z_loss_mlp": 0.30078125, + "step": 2826, + "time_per_iteration": 2.8868184089660645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104022, + "balance_loss_mlp": 1.07505381, + "epoch": 0.5438630242400924, + "flos": 705418854912.0, + "grad_norm": 0.06023434626032839, + "language_loss": 0.91765273, + "learning_rate": 0.0004533366589912067, + "loss": 0.92869294, + "num_input_tokens_seen": 235909392, + "router_z_loss_mlp": 0.28955078, + "step": 2827, + "time_per_iteration": 2.9981062412261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105445, + "balance_loss_mlp": 1.07557106, + "epoch": 0.544055405925356, + "flos": 856425788928.0, + "grad_norm": 0.06990968055660145, + "language_loss": 0.78070033, + "learning_rate": 0.0004530264854731306, + "loss": 0.79175478, + "num_input_tokens_seen": 235983888, + "router_z_loss_mlp": 0.29858398, + "step": 2828, + "time_per_iteration": 3.0054330825805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107215, + "balance_loss_mlp": 1.07605386, + "epoch": 0.5442477876106194, + "flos": 571779523584.0, + "grad_norm": 0.05020371190449787, + "language_loss": 0.84383601, + "learning_rate": 0.00045271633019179034, + "loss": 0.85490811, + "num_input_tokens_seen": 236063056, + "router_z_loss_mlp": 0.3112793, + "step": 2829, + "time_per_iteration": 2.775956630706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107068, + "balance_loss_mlp": 1.07605028, + "epoch": 0.544440169295883, + "flos": 625556971008.0, + "grad_norm": 0.05805566098722391, + "language_loss": 0.88203323, + "learning_rate": 0.0004524061932675986, + "loss": 0.8931039, + "num_input_tokens_seen": 236141104, + "router_z_loss_mlp": 0.30981445, + "step": 2830, + "time_per_iteration": 2.8221793174743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106595, + "balance_loss_mlp": 1.07555294, + "epoch": 0.5446325509811466, + "flos": 836244103680.0, + "grad_norm": 0.0740029895366448, + "language_loss": 0.87459874, + "learning_rate": 0.00045209607482096125, + "loss": 0.8856647, + "num_input_tokens_seen": 236220320, + "router_z_loss_mlp": 0.31005859, + "step": 2831, + "time_per_iteration": 3.0393142700195312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102187, + "balance_loss_mlp": 1.0710969, + "epoch": 0.5448249326664102, + "flos": 483381043200.0, + "grad_norm": 0.08209208283258153, + "language_loss": 0.84651136, + "learning_rate": 0.0004517859749722772, + "loss": 0.85753322, + "num_input_tokens_seen": 236288208, + "router_z_loss_mlp": 0.31054688, + "step": 2832, + "time_per_iteration": 2.6821095943450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105232, + "balance_loss_mlp": 1.07368898, + "epoch": 0.5450173143516738, + "flos": 561107948544.0, + "grad_norm": 0.07359331276456935, + "language_loss": 0.79572821, + "learning_rate": 0.0004514758938419376, + "loss": 0.80678058, + "num_input_tokens_seen": 236366864, + "router_z_loss_mlp": 0.31518555, + "step": 2833, + "time_per_iteration": 2.8375093936920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080375, + "balance_loss_mlp": 1.07288861, + "epoch": 0.5452096960369373, + "flos": 1470420988416.0, + "grad_norm": 0.03314547284214794, + "language_loss": 0.76920587, + "learning_rate": 0.0004511658315503268, + "loss": 0.78000963, + "num_input_tokens_seen": 236597120, + "router_z_loss_mlp": 0.07470703, + "step": 2834, + "time_per_iteration": 4.963228225708008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099746, + "balance_loss_mlp": 1.06930006, + "epoch": 0.5454020777222008, + "flos": 465064892928.0, + "grad_norm": 0.057187543491433894, + "language_loss": 0.83827722, + "learning_rate": 0.00045085578821782175, + "loss": 0.84927469, + "num_input_tokens_seen": 236664192, + "router_z_loss_mlp": 0.3046875, + "step": 2835, + "time_per_iteration": 2.5562217235565186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054355, + "balance_loss_mlp": 1.04696393, + "epoch": 0.5455944594074644, + "flos": 1469657820672.0, + "grad_norm": 0.02358753311446476, + "language_loss": 0.76134741, + "learning_rate": 0.0004505457639647917, + "loss": 0.77189088, + "num_input_tokens_seen": 236888784, + "router_z_loss_mlp": 0.07373047, + "step": 2836, + "time_per_iteration": 4.959676742553711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100094, + "balance_loss_mlp": 1.06983829, + "epoch": 0.545786841092728, + "flos": 533180371968.0, + "grad_norm": 0.0408398110042356, + "language_loss": 0.80949795, + "learning_rate": 0.00045023575891159866, + "loss": 0.82049894, + "num_input_tokens_seen": 236962528, + "router_z_loss_mlp": 0.30200195, + "step": 2837, + "time_per_iteration": 2.74700665473938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102638, + "balance_loss_mlp": 1.01894093, + "epoch": 0.5459792227779915, + "flos": 1352389810176.0, + "grad_norm": 0.01524116386105569, + "language_loss": 0.74763811, + "learning_rate": 0.00044992577317859764, + "loss": 0.75790191, + "num_input_tokens_seen": 237179360, + "router_z_loss_mlp": 0.07421875, + "step": 2838, + "time_per_iteration": 4.9733850955963135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103445, + "balance_loss_mlp": 1.07366681, + "epoch": 0.5461716044632551, + "flos": 637881537024.0, + "grad_norm": 0.05292635351535042, + "language_loss": 0.78244042, + "learning_rate": 0.0004496158068861354, + "loss": 0.79347491, + "num_input_tokens_seen": 237256240, + "router_z_loss_mlp": 0.29760742, + "step": 2839, + "time_per_iteration": 2.8023805618286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110962, + "balance_loss_mlp": 1.08010423, + "epoch": 0.5463639861485187, + "flos": 602751352320.0, + "grad_norm": 0.0535580092110964, + "language_loss": 0.80844593, + "learning_rate": 0.00044930586015455207, + "loss": 0.81954211, + "num_input_tokens_seen": 237334272, + "router_z_loss_mlp": 0.29492188, + "step": 2840, + "time_per_iteration": 2.816567897796631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118684, + "balance_loss_mlp": 1.08804703, + "epoch": 0.5465563678337823, + "flos": 642516738048.0, + "grad_norm": 0.06541969342762931, + "language_loss": 0.89212978, + "learning_rate": 0.000448995933104179, + "loss": 0.90331668, + "num_input_tokens_seen": 237415408, + "router_z_loss_mlp": 0.3059082, + "step": 2841, + "time_per_iteration": 2.903371810913086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115887, + "balance_loss_mlp": 1.08601356, + "epoch": 0.5467487495190458, + "flos": 614154161664.0, + "grad_norm": 0.06848140377985366, + "language_loss": 0.80388117, + "learning_rate": 0.00044868602585534077, + "loss": 0.81504011, + "num_input_tokens_seen": 237493232, + "router_z_loss_mlp": 0.29833984, + "step": 2842, + "time_per_iteration": 2.870833396911621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104882, + "balance_loss_mlp": 1.07519853, + "epoch": 0.5469411312043093, + "flos": 461190661632.0, + "grad_norm": 0.06871095275450309, + "language_loss": 0.89058006, + "learning_rate": 0.0004483761385283541, + "loss": 0.90162885, + "num_input_tokens_seen": 237556624, + "router_z_loss_mlp": 0.29663086, + "step": 2843, + "time_per_iteration": 2.5367324352264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099916, + "balance_loss_mlp": 1.06863523, + "epoch": 0.5471335128895729, + "flos": 561197154816.0, + "grad_norm": 0.05633892340966096, + "language_loss": 0.81610817, + "learning_rate": 0.0004480662712435281, + "loss": 0.82710731, + "num_input_tokens_seen": 237632048, + "router_z_loss_mlp": 0.3125, + "step": 2844, + "time_per_iteration": 2.8301496505737305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092142, + "balance_loss_mlp": 1.0627687, + "epoch": 0.5473258945748365, + "flos": 518686695936.0, + "grad_norm": 0.05986468354955699, + "language_loss": 0.88694894, + "learning_rate": 0.0004477564241211635, + "loss": 0.89787042, + "num_input_tokens_seen": 237699840, + "router_z_loss_mlp": 0.2935791, + "step": 2845, + "time_per_iteration": 2.5813820362091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086916, + "balance_loss_mlp": 1.05787718, + "epoch": 0.5475182762601001, + "flos": 433828763136.0, + "grad_norm": 0.059098326299960216, + "language_loss": 0.87329561, + "learning_rate": 0.0004474465972815541, + "loss": 0.88416475, + "num_input_tokens_seen": 237762560, + "router_z_loss_mlp": 0.2902832, + "step": 2846, + "time_per_iteration": 2.494866132736206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085322, + "balance_loss_mlp": 1.05730796, + "epoch": 0.5477106579453636, + "flos": 511560811008.0, + "grad_norm": 0.05595262091427783, + "language_loss": 0.87812984, + "learning_rate": 0.000447136790844985, + "loss": 0.88898313, + "num_input_tokens_seen": 237837152, + "router_z_loss_mlp": 0.28027344, + "step": 2847, + "time_per_iteration": 2.698451042175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086514, + "balance_loss_mlp": 1.05726016, + "epoch": 0.5479030396306271, + "flos": 675912439296.0, + "grad_norm": 0.06538513229207209, + "language_loss": 0.81294727, + "learning_rate": 0.00044682700493173385, + "loss": 0.82381248, + "num_input_tokens_seen": 237909488, + "router_z_loss_mlp": 0.29223633, + "step": 2848, + "time_per_iteration": 2.8252742290496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085079, + "balance_loss_mlp": 1.05441868, + "epoch": 0.5480954213158907, + "flos": 876090981888.0, + "grad_norm": 0.06259253721450928, + "language_loss": 0.80796725, + "learning_rate": 0.00044651723966207004, + "loss": 0.81881809, + "num_input_tokens_seen": 237991056, + "router_z_loss_mlp": 0.30639648, + "step": 2849, + "time_per_iteration": 3.093806505203247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083876, + "balance_loss_mlp": 1.05424023, + "epoch": 0.5482878030011543, + "flos": 622006511616.0, + "grad_norm": 0.05680096345280931, + "language_loss": 0.78538483, + "learning_rate": 0.00044620749515625536, + "loss": 0.79622364, + "num_input_tokens_seen": 238064576, + "router_z_loss_mlp": 0.29614258, + "step": 2850, + "time_per_iteration": 2.759477376937866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083552, + "balance_loss_mlp": 1.0532248, + "epoch": 0.5484801846864179, + "flos": 497207725056.0, + "grad_norm": 0.054672764420471885, + "language_loss": 0.85281622, + "learning_rate": 0.00044589777153454334, + "loss": 0.86365175, + "num_input_tokens_seen": 238136464, + "router_z_loss_mlp": 0.30297852, + "step": 2851, + "time_per_iteration": 2.7247886657714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082527, + "balance_loss_mlp": 1.0519855, + "epoch": 0.5486725663716814, + "flos": 442432171008.0, + "grad_norm": 0.05927586181396917, + "language_loss": 0.83792317, + "learning_rate": 0.00044558806891717895, + "loss": 0.84874845, + "num_input_tokens_seen": 238198912, + "router_z_loss_mlp": 0.30493164, + "step": 2852, + "time_per_iteration": 2.480499267578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078563, + "balance_loss_mlp": 1.04847419, + "epoch": 0.548864948056945, + "flos": 655162504704.0, + "grad_norm": 0.06995220773511122, + "language_loss": 0.79820019, + "learning_rate": 0.0004452783874243998, + "loss": 0.80898583, + "num_input_tokens_seen": 238275184, + "router_z_loss_mlp": 0.30053711, + "step": 2853, + "time_per_iteration": 2.815159559249878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083773, + "balance_loss_mlp": 1.05354142, + "epoch": 0.5490573297422086, + "flos": 546036111360.0, + "grad_norm": 0.0871194319773747, + "language_loss": 0.8473509, + "learning_rate": 0.00044496872717643475, + "loss": 0.85818863, + "num_input_tokens_seen": 238348496, + "router_z_loss_mlp": 0.30200195, + "step": 2854, + "time_per_iteration": 2.671760320663452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01029437, + "balance_loss_mlp": 1.02099681, + "epoch": 0.5492497114274721, + "flos": 1590309987840.0, + "grad_norm": 0.022692984636718958, + "language_loss": 0.77089292, + "learning_rate": 0.00044465908829350453, + "loss": 0.7811873, + "num_input_tokens_seen": 238578464, + "router_z_loss_mlp": 0.08447266, + "step": 2855, + "time_per_iteration": 4.943760633468628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080585, + "balance_loss_mlp": 1.05152166, + "epoch": 0.5494420931127356, + "flos": 750906754560.0, + "grad_norm": 0.08481580298187671, + "language_loss": 0.82385266, + "learning_rate": 0.0004443494708958217, + "loss": 0.83465844, + "num_input_tokens_seen": 238660256, + "router_z_loss_mlp": 0.2902832, + "step": 2856, + "time_per_iteration": 2.9592692852020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081194, + "balance_loss_mlp": 1.05131996, + "epoch": 0.5496344747979992, + "flos": 626023904256.0, + "grad_norm": 0.054737825906261944, + "language_loss": 0.81019336, + "learning_rate": 0.0004440398751035906, + "loss": 0.82100528, + "num_input_tokens_seen": 238745856, + "router_z_loss_mlp": 0.29858398, + "step": 2857, + "time_per_iteration": 2.8660449981689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086545, + "balance_loss_mlp": 1.05612314, + "epoch": 0.5498268564832628, + "flos": 523111924224.0, + "grad_norm": 0.07506614425197558, + "language_loss": 0.84203708, + "learning_rate": 0.00044373030103700645, + "loss": 0.85290253, + "num_input_tokens_seen": 238813888, + "router_z_loss_mlp": 0.30395508, + "step": 2858, + "time_per_iteration": 2.589571475982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086143, + "balance_loss_mlp": 1.05769968, + "epoch": 0.5500192381685264, + "flos": 604587151872.0, + "grad_norm": 0.06400511299844665, + "language_loss": 0.80211353, + "learning_rate": 0.000443420748816257, + "loss": 0.81297493, + "num_input_tokens_seen": 238885440, + "router_z_loss_mlp": 0.28442383, + "step": 2859, + "time_per_iteration": 2.775573492050171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089751, + "balance_loss_mlp": 1.05894732, + "epoch": 0.55021161985379, + "flos": 520527264768.0, + "grad_norm": 0.05990515883462961, + "language_loss": 0.78525764, + "learning_rate": 0.0004431112185615208, + "loss": 0.7961551, + "num_input_tokens_seen": 238960944, + "router_z_loss_mlp": 0.30786133, + "step": 2860, + "time_per_iteration": 2.79428768157959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099065, + "balance_loss_mlp": 1.06942964, + "epoch": 0.5504040015390534, + "flos": 489671806464.0, + "grad_norm": 0.08012396807897051, + "language_loss": 0.80142951, + "learning_rate": 0.00044280171039296845, + "loss": 0.81242013, + "num_input_tokens_seen": 239030592, + "router_z_loss_mlp": 0.29589844, + "step": 2861, + "time_per_iteration": 2.6075713634490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097414, + "balance_loss_mlp": 1.06808829, + "epoch": 0.550596383224317, + "flos": 575787377664.0, + "grad_norm": 0.055527438655266555, + "language_loss": 0.88317382, + "learning_rate": 0.0004424922244307616, + "loss": 0.89414799, + "num_input_tokens_seen": 239097440, + "router_z_loss_mlp": 0.29321289, + "step": 2862, + "time_per_iteration": 2.6453704833984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093253, + "balance_loss_mlp": 1.06306958, + "epoch": 0.5507887649095806, + "flos": 642445157376.0, + "grad_norm": 0.0988044596240084, + "language_loss": 0.82273299, + "learning_rate": 0.00044218276079505315, + "loss": 0.83366549, + "num_input_tokens_seen": 239179872, + "router_z_loss_mlp": 0.30151367, + "step": 2863, + "time_per_iteration": 2.8583548069000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093494, + "balance_loss_mlp": 1.0636915, + "epoch": 0.5509811465948442, + "flos": 531843812352.0, + "grad_norm": 0.15366013773450377, + "language_loss": 0.74783754, + "learning_rate": 0.0004418733196059876, + "loss": 0.75877243, + "num_input_tokens_seen": 239251264, + "router_z_loss_mlp": 0.29760742, + "step": 2864, + "time_per_iteration": 2.6593546867370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092739, + "balance_loss_mlp": 1.06398571, + "epoch": 0.5511735282801077, + "flos": 654747328512.0, + "grad_norm": 0.05392307081741782, + "language_loss": 0.80104017, + "learning_rate": 0.0004415639009837008, + "loss": 0.81196761, + "num_input_tokens_seen": 239326688, + "router_z_loss_mlp": 0.28759766, + "step": 2865, + "time_per_iteration": 2.8184585571289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096337, + "balance_loss_mlp": 1.06660628, + "epoch": 0.5513659099653713, + "flos": 529498861056.0, + "grad_norm": 0.0621710106813525, + "language_loss": 0.8235333, + "learning_rate": 0.00044125450504831955, + "loss": 0.83449662, + "num_input_tokens_seen": 239401248, + "router_z_loss_mlp": 0.29711914, + "step": 2866, + "time_per_iteration": 2.734349489212036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086542, + "balance_loss_mlp": 1.05592918, + "epoch": 0.5515582916506349, + "flos": 554869315584.0, + "grad_norm": 0.06271512147953057, + "language_loss": 0.82752901, + "learning_rate": 0.0004409451319199622, + "loss": 0.83839446, + "num_input_tokens_seen": 239471600, + "router_z_loss_mlp": 0.30566406, + "step": 2867, + "time_per_iteration": 2.683742046356201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095264, + "balance_loss_mlp": 1.06417394, + "epoch": 0.5517506733358984, + "flos": 735407258112.0, + "grad_norm": 0.07258101504897169, + "language_loss": 0.84457368, + "learning_rate": 0.0004406357817187381, + "loss": 0.85552633, + "num_input_tokens_seen": 239548592, + "router_z_loss_mlp": 0.31054688, + "step": 2868, + "time_per_iteration": 3.0147883892059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103312, + "balance_loss_mlp": 1.07379591, + "epoch": 0.551943055021162, + "flos": 1115325697536.0, + "grad_norm": 0.05164398294731223, + "language_loss": 0.81673765, + "learning_rate": 0.0004403264545647474, + "loss": 0.82777071, + "num_input_tokens_seen": 239644432, + "router_z_loss_mlp": 0.29492188, + "step": 2869, + "time_per_iteration": 3.5095975399017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107006, + "balance_loss_mlp": 1.07603574, + "epoch": 0.5521354367064255, + "flos": 544373208576.0, + "grad_norm": 0.04919714399659635, + "language_loss": 0.85006267, + "learning_rate": 0.00044001715057808154, + "loss": 0.86113274, + "num_input_tokens_seen": 239723392, + "router_z_loss_mlp": 0.30932617, + "step": 2870, + "time_per_iteration": 2.759791851043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114514, + "balance_loss_mlp": 1.08330536, + "epoch": 0.5523278183916891, + "flos": 936285101568.0, + "grad_norm": 0.06727866309699267, + "language_loss": 0.81942332, + "learning_rate": 0.0004397078698788232, + "loss": 0.83056843, + "num_input_tokens_seen": 239806896, + "router_z_loss_mlp": 0.31176758, + "step": 2871, + "time_per_iteration": 3.21431040763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104908, + "balance_loss_mlp": 1.09441757, + "epoch": 0.5525202000769527, + "flos": 1465911696384.0, + "grad_norm": 0.04310408533027141, + "language_loss": 0.80442369, + "learning_rate": 0.0004393986125870456, + "loss": 0.81547272, + "num_input_tokens_seen": 240037824, + "router_z_loss_mlp": 0.10498047, + "step": 2872, + "time_per_iteration": 4.941087484359741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114234, + "balance_loss_mlp": 1.082739, + "epoch": 0.5527125817622163, + "flos": 489800286720.0, + "grad_norm": 0.05898932962157328, + "language_loss": 0.78340954, + "learning_rate": 0.00043908937882281343, + "loss": 0.79455185, + "num_input_tokens_seen": 240107952, + "router_z_loss_mlp": 0.31469727, + "step": 2873, + "time_per_iteration": 2.577866554260254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116939, + "balance_loss_mlp": 1.08501506, + "epoch": 0.5529049634474797, + "flos": 634914008064.0, + "grad_norm": 0.05969066171006231, + "language_loss": 0.82846034, + "learning_rate": 0.0004387801687061814, + "loss": 0.83962971, + "num_input_tokens_seen": 240183824, + "router_z_loss_mlp": 0.3190918, + "step": 2874, + "time_per_iteration": 2.8184196949005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117603, + "balance_loss_mlp": 1.08489251, + "epoch": 0.5530973451327433, + "flos": 581274952704.0, + "grad_norm": 0.05481480847886404, + "language_loss": 0.80685902, + "learning_rate": 0.0004384709823571958, + "loss": 0.81803501, + "num_input_tokens_seen": 240259296, + "router_z_loss_mlp": 0.32714844, + "step": 2875, + "time_per_iteration": 2.7496426105499268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105595, + "balance_loss_mlp": 1.07519674, + "epoch": 0.5532897268180069, + "flos": 1122488658432.0, + "grad_norm": 0.0703745986604158, + "language_loss": 0.83230788, + "learning_rate": 0.0004381618198958932, + "loss": 0.84336388, + "num_input_tokens_seen": 240346768, + "router_z_loss_mlp": 0.30371094, + "step": 2876, + "time_per_iteration": 3.4905495643615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110669, + "balance_loss_mlp": 1.07662511, + "epoch": 0.5534821085032705, + "flos": 637273640448.0, + "grad_norm": 0.06448913307816859, + "language_loss": 0.84021735, + "learning_rate": 0.00043785268144230137, + "loss": 0.85128427, + "num_input_tokens_seen": 240429344, + "router_z_loss_mlp": 0.30029297, + "step": 2877, + "time_per_iteration": 2.907133102416992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102032, + "balance_loss_mlp": 1.07203865, + "epoch": 0.5536744901885341, + "flos": 571112529408.0, + "grad_norm": 0.0731230974557418, + "language_loss": 0.82496381, + "learning_rate": 0.00043754356711643837, + "loss": 0.83598411, + "num_input_tokens_seen": 240497008, + "router_z_loss_mlp": 0.29980469, + "step": 2878, + "time_per_iteration": 2.715023994445801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097061, + "balance_loss_mlp": 1.06609011, + "epoch": 0.5538668718737976, + "flos": 595716871680.0, + "grad_norm": 0.0760081782140183, + "language_loss": 0.83909559, + "learning_rate": 0.0004372344770383132, + "loss": 0.85006618, + "num_input_tokens_seen": 240578432, + "router_z_loss_mlp": 0.30932617, + "step": 2879, + "time_per_iteration": 2.822368621826172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097203, + "balance_loss_mlp": 1.06756735, + "epoch": 0.5540592535590612, + "flos": 532602210816.0, + "grad_norm": 0.06372253861737541, + "language_loss": 0.83305293, + "learning_rate": 0.00043692541132792507, + "loss": 0.84402496, + "num_input_tokens_seen": 240649136, + "router_z_loss_mlp": 0.29614258, + "step": 2880, + "time_per_iteration": 2.7154414653778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093507, + "balance_loss_mlp": 1.06349051, + "epoch": 0.5542516352443247, + "flos": 412619235840.0, + "grad_norm": 0.057885594640944824, + "language_loss": 0.83464789, + "learning_rate": 0.00043661637010526384, + "loss": 0.84558296, + "num_input_tokens_seen": 240714240, + "router_z_loss_mlp": 0.30004883, + "step": 2881, + "time_per_iteration": 2.507059097290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092859, + "balance_loss_mlp": 1.06255555, + "epoch": 0.5544440169295883, + "flos": 547607609856.0, + "grad_norm": 0.08329174894233551, + "language_loss": 0.83249325, + "learning_rate": 0.00043630735349031025, + "loss": 0.84342188, + "num_input_tokens_seen": 240786928, + "router_z_loss_mlp": 0.30273438, + "step": 2882, + "time_per_iteration": 2.644418478012085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090659, + "balance_loss_mlp": 1.06216836, + "epoch": 0.5546363986148518, + "flos": 621821131776.0, + "grad_norm": 0.047753182436236, + "language_loss": 0.81861913, + "learning_rate": 0.00043599836160303495, + "loss": 0.82952571, + "num_input_tokens_seen": 240865328, + "router_z_loss_mlp": 0.28491211, + "step": 2883, + "time_per_iteration": 2.8971407413482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090292, + "balance_loss_mlp": 1.06160986, + "epoch": 0.5548287803001154, + "flos": 705292945920.0, + "grad_norm": 0.057456379562134556, + "language_loss": 0.77759755, + "learning_rate": 0.0004356893945633995, + "loss": 0.78850043, + "num_input_tokens_seen": 240945680, + "router_z_loss_mlp": 0.28649902, + "step": 2884, + "time_per_iteration": 2.937133312225342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094576, + "balance_loss_mlp": 1.06620383, + "epoch": 0.555021161985379, + "flos": 504197789184.0, + "grad_norm": 0.05754228747661135, + "language_loss": 0.81617516, + "learning_rate": 0.0004353804524913551, + "loss": 0.82712096, + "num_input_tokens_seen": 241010800, + "router_z_loss_mlp": 0.28344727, + "step": 2885, + "time_per_iteration": 2.579535722732544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109656, + "balance_loss_mlp": 1.08028293, + "epoch": 0.5552135436706426, + "flos": 616066684416.0, + "grad_norm": 0.06485446309889223, + "language_loss": 0.81926423, + "learning_rate": 0.0004350715355068441, + "loss": 0.83036083, + "num_input_tokens_seen": 241085328, + "router_z_loss_mlp": 0.29345703, + "step": 2886, + "time_per_iteration": 2.709717273712158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111013, + "balance_loss_mlp": 1.08142567, + "epoch": 0.5554059253559062, + "flos": 463871494656.0, + "grad_norm": 0.066893347852213, + "language_loss": 0.7961694, + "learning_rate": 0.00043476264372979847, + "loss": 0.80727959, + "num_input_tokens_seen": 241149600, + "router_z_loss_mlp": 0.2956543, + "step": 2887, + "time_per_iteration": 2.5216078758239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113, + "balance_loss_mlp": 1.08441329, + "epoch": 0.5555983070411696, + "flos": 1562512384512.0, + "grad_norm": 0.0640996529430707, + "language_loss": 0.78604692, + "learning_rate": 0.0004344537772801408, + "loss": 0.7971769, + "num_input_tokens_seen": 241244832, + "router_z_loss_mlp": 0.28540039, + "step": 2888, + "time_per_iteration": 3.8132436275482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065874, + "balance_loss_mlp": 1.05838752, + "epoch": 0.5557906887264332, + "flos": 1467917821440.0, + "grad_norm": 0.028482200170008867, + "language_loss": 0.73422456, + "learning_rate": 0.0004341449362777836, + "loss": 0.7448833, + "num_input_tokens_seen": 241479728, + "router_z_loss_mlp": 0.07470703, + "step": 2889, + "time_per_iteration": 4.947216987609863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117814, + "balance_loss_mlp": 1.08801198, + "epoch": 0.5559830704116968, + "flos": 529832544768.0, + "grad_norm": 0.06792095354006551, + "language_loss": 0.83771884, + "learning_rate": 0.0004338361208426298, + "loss": 0.84889698, + "num_input_tokens_seen": 241545616, + "router_z_loss_mlp": 0.29760742, + "step": 2890, + "time_per_iteration": 2.631476879119873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113406, + "balance_loss_mlp": 1.08350825, + "epoch": 0.5561754520969604, + "flos": 651218890752.0, + "grad_norm": 0.05967481099781226, + "language_loss": 0.81602627, + "learning_rate": 0.00043352733109457164, + "loss": 0.82716036, + "num_input_tokens_seen": 241629040, + "router_z_loss_mlp": 0.29858398, + "step": 2891, + "time_per_iteration": 2.907500743865967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111722, + "balance_loss_mlp": 1.08194315, + "epoch": 0.556367833782224, + "flos": 734297923584.0, + "grad_norm": 0.04670195587242621, + "language_loss": 0.84789026, + "learning_rate": 0.00043321856715349244, + "loss": 0.85900748, + "num_input_tokens_seen": 241706272, + "router_z_loss_mlp": 0.29760742, + "step": 2892, + "time_per_iteration": 2.9401984214782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110696, + "balance_loss_mlp": 1.0810132, + "epoch": 0.5565602154674875, + "flos": 672423648768.0, + "grad_norm": 0.05439165995621742, + "language_loss": 0.80422115, + "learning_rate": 0.00043290982913926466, + "loss": 0.81532812, + "num_input_tokens_seen": 241782304, + "router_z_loss_mlp": 0.29614258, + "step": 2893, + "time_per_iteration": 2.7956430912017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113402, + "balance_loss_mlp": 1.08312285, + "epoch": 0.556752597152751, + "flos": 586228783104.0, + "grad_norm": 0.09922355360532673, + "language_loss": 0.8448714, + "learning_rate": 0.0004326011171717514, + "loss": 0.85600543, + "num_input_tokens_seen": 241868576, + "router_z_loss_mlp": 0.30297852, + "step": 2894, + "time_per_iteration": 2.8997769355773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108454, + "balance_loss_mlp": 1.07676816, + "epoch": 0.5569449788380146, + "flos": 437777146368.0, + "grad_norm": 0.06224988402754836, + "language_loss": 0.81240308, + "learning_rate": 0.0004322924313708051, + "loss": 0.82348764, + "num_input_tokens_seen": 241933696, + "router_z_loss_mlp": 0.31689453, + "step": 2895, + "time_per_iteration": 2.511643648147583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107604, + "balance_loss_mlp": 1.07825518, + "epoch": 0.5571373605232782, + "flos": 502250761728.0, + "grad_norm": 0.0621054083596477, + "language_loss": 0.84500259, + "learning_rate": 0.0004319837718562681, + "loss": 0.85607862, + "num_input_tokens_seen": 242003056, + "router_z_loss_mlp": 0.29321289, + "step": 2896, + "time_per_iteration": 2.580003023147583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106887, + "balance_loss_mlp": 1.07667959, + "epoch": 0.5573297422085417, + "flos": 577417973760.0, + "grad_norm": 0.05844968671659234, + "language_loss": 0.83570629, + "learning_rate": 0.0004316751387479726, + "loss": 0.84677517, + "num_input_tokens_seen": 242076368, + "router_z_loss_mlp": 0.30175781, + "step": 2897, + "time_per_iteration": 2.7676987648010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122549, + "balance_loss_mlp": 1.0925082, + "epoch": 0.5575221238938053, + "flos": 1344037515264.0, + "grad_norm": 0.06543352873326957, + "language_loss": 0.82800293, + "learning_rate": 0.0004313665321657409, + "loss": 0.83922845, + "num_input_tokens_seen": 242161600, + "router_z_loss_mlp": 0.30004883, + "step": 2898, + "time_per_iteration": 3.7584402561187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120576, + "balance_loss_mlp": 1.08917618, + "epoch": 0.5577145055790689, + "flos": 601963218432.0, + "grad_norm": 0.06787906742385669, + "language_loss": 0.80272007, + "learning_rate": 0.00043105795222938436, + "loss": 0.81392586, + "num_input_tokens_seen": 242237904, + "router_z_loss_mlp": 0.31396484, + "step": 2899, + "time_per_iteration": 2.718045711517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109661, + "balance_loss_mlp": 1.07795143, + "epoch": 0.5579068872643325, + "flos": 562620349440.0, + "grad_norm": 0.06698960298708169, + "language_loss": 0.78827435, + "learning_rate": 0.00043074939905870467, + "loss": 0.79937094, + "num_input_tokens_seen": 242306736, + "router_z_loss_mlp": 0.31713867, + "step": 2900, + "time_per_iteration": 2.639775514602661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111441, + "balance_loss_mlp": 1.08230579, + "epoch": 0.558099268949596, + "flos": 544551247872.0, + "grad_norm": 0.09759490745534659, + "language_loss": 0.80356312, + "learning_rate": 0.0004304408727734927, + "loss": 0.81467754, + "num_input_tokens_seen": 242376000, + "router_z_loss_mlp": 0.29125977, + "step": 2901, + "time_per_iteration": 2.6272940635681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107959, + "balance_loss_mlp": 1.07889545, + "epoch": 0.5582916506348595, + "flos": 552786467328.0, + "grad_norm": 0.06875313821095587, + "language_loss": 0.89200485, + "learning_rate": 0.0004301323734935288, + "loss": 0.9030844, + "num_input_tokens_seen": 242447056, + "router_z_loss_mlp": 0.29052734, + "step": 2902, + "time_per_iteration": 2.652219533920288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100657, + "balance_loss_mlp": 1.07164121, + "epoch": 0.5584840323201231, + "flos": 543385013760.0, + "grad_norm": 0.05706751216847301, + "language_loss": 0.87298477, + "learning_rate": 0.000429823901338583, + "loss": 0.8839913, + "num_input_tokens_seen": 242514400, + "router_z_loss_mlp": 0.2902832, + "step": 2903, + "time_per_iteration": 2.611798048019409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099623, + "balance_loss_mlp": 1.06872356, + "epoch": 0.5586764140053867, + "flos": 815573090304.0, + "grad_norm": 0.053536411753063035, + "language_loss": 0.87032712, + "learning_rate": 0.00042951545642841513, + "loss": 0.88132328, + "num_input_tokens_seen": 242601616, + "router_z_loss_mlp": 0.30883789, + "step": 2904, + "time_per_iteration": 3.067237377166748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099073, + "balance_loss_mlp": 1.06979561, + "epoch": 0.5588687956906503, + "flos": 486439976448.0, + "grad_norm": 0.04987560026618122, + "language_loss": 0.86746645, + "learning_rate": 0.0004292070388827737, + "loss": 0.87845719, + "num_input_tokens_seen": 242669648, + "router_z_loss_mlp": 0.29272461, + "step": 2905, + "time_per_iteration": 2.5981948375701904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093183, + "balance_loss_mlp": 1.06426287, + "epoch": 0.5590611773759138, + "flos": 452060849664.0, + "grad_norm": 0.06265536693897518, + "language_loss": 0.81292248, + "learning_rate": 0.00042889864882139753, + "loss": 0.82385433, + "num_input_tokens_seen": 242737456, + "router_z_loss_mlp": 0.2890625, + "step": 2906, + "time_per_iteration": 2.581113338470459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107989, + "balance_loss_mlp": 1.07811511, + "epoch": 0.5592535590611774, + "flos": 520945012224.0, + "grad_norm": 0.06493240221059006, + "language_loss": 0.81897962, + "learning_rate": 0.0004285902863640139, + "loss": 0.83005953, + "num_input_tokens_seen": 242807008, + "router_z_loss_mlp": 0.29858398, + "step": 2907, + "time_per_iteration": 2.6115305423736572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109856, + "balance_loss_mlp": 1.06973481, + "epoch": 0.5594459407464409, + "flos": 552519595008.0, + "grad_norm": 0.07849480564056018, + "language_loss": 0.8626982, + "learning_rate": 0.00042828195163033966, + "loss": 0.87368375, + "num_input_tokens_seen": 242877328, + "router_z_loss_mlp": 0.28833008, + "step": 2908, + "time_per_iteration": 2.6564390659332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110723, + "balance_loss_mlp": 1.07654572, + "epoch": 0.5596383224317045, + "flos": 484833973248.0, + "grad_norm": 0.07707498056388652, + "language_loss": 0.79454792, + "learning_rate": 0.0004279736447400812, + "loss": 0.80562025, + "num_input_tokens_seen": 242943152, + "router_z_loss_mlp": 0.30664062, + "step": 2909, + "time_per_iteration": 2.580448627471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102668, + "balance_loss_mlp": 1.07343817, + "epoch": 0.5598307041169681, + "flos": 611256015360.0, + "grad_norm": 0.055339920225342294, + "language_loss": 0.78979003, + "learning_rate": 0.00042766536581293385, + "loss": 0.80081677, + "num_input_tokens_seen": 243014656, + "router_z_loss_mlp": 0.29223633, + "step": 2910, + "time_per_iteration": 2.714306116104126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112106, + "balance_loss_mlp": 1.09004188, + "epoch": 0.5600230858022316, + "flos": 488851365888.0, + "grad_norm": 0.06660982321180627, + "language_loss": 0.79863673, + "learning_rate": 0.0004273571149685819, + "loss": 0.80984735, + "num_input_tokens_seen": 243089040, + "router_z_loss_mlp": 0.30981445, + "step": 2911, + "time_per_iteration": 2.738189220428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117749, + "balance_loss_mlp": 1.08794653, + "epoch": 0.5602154674874952, + "flos": 598869780480.0, + "grad_norm": 0.07453286241806684, + "language_loss": 0.83875954, + "learning_rate": 0.00042704889232669937, + "loss": 0.84993702, + "num_input_tokens_seen": 243162480, + "router_z_loss_mlp": 0.29785156, + "step": 2912, + "time_per_iteration": 2.7153878211975098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119265, + "balance_loss_mlp": 1.09003508, + "epoch": 0.5604078491727588, + "flos": 585969624576.0, + "grad_norm": 0.06505374842280261, + "language_loss": 0.85808718, + "learning_rate": 0.0004267406980069484, + "loss": 0.8692798, + "num_input_tokens_seen": 243232880, + "router_z_loss_mlp": 0.29248047, + "step": 2913, + "time_per_iteration": 2.7438042163848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105259, + "balance_loss_mlp": 1.07490873, + "epoch": 0.5606002308580224, + "flos": 541205618688.0, + "grad_norm": 0.045730944132966495, + "language_loss": 0.79707301, + "learning_rate": 0.0004264325321289808, + "loss": 0.80812562, + "num_input_tokens_seen": 243309168, + "router_z_loss_mlp": 0.30322266, + "step": 2914, + "time_per_iteration": 2.787429094314575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101375, + "balance_loss_mlp": 1.07131052, + "epoch": 0.5607926125432858, + "flos": 583938533376.0, + "grad_norm": 0.05941371213730478, + "language_loss": 0.8624413, + "learning_rate": 0.00042612439481243736, + "loss": 0.87345505, + "num_input_tokens_seen": 243382064, + "router_z_loss_mlp": 0.30078125, + "step": 2915, + "time_per_iteration": 2.7993295192718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090837, + "balance_loss_mlp": 1.06113064, + "epoch": 0.5609849942285494, + "flos": 627489317376.0, + "grad_norm": 0.06435914288601326, + "language_loss": 0.90124059, + "learning_rate": 0.00042581628617694735, + "loss": 0.91214895, + "num_input_tokens_seen": 243452064, + "router_z_loss_mlp": 0.296875, + "step": 2916, + "time_per_iteration": 2.744046449661255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089958, + "balance_loss_mlp": 1.06032228, + "epoch": 0.561177375913813, + "flos": 588366332928.0, + "grad_norm": 0.05771140503361017, + "language_loss": 0.81953394, + "learning_rate": 0.0004255082063421296, + "loss": 0.83043355, + "num_input_tokens_seen": 243525600, + "router_z_loss_mlp": 0.29638672, + "step": 2917, + "time_per_iteration": 2.705963134765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095529, + "balance_loss_mlp": 1.0655117, + "epoch": 0.5613697575990766, + "flos": 527047824384.0, + "grad_norm": 0.0764674514791775, + "language_loss": 0.84947777, + "learning_rate": 0.00042520015542759065, + "loss": 0.86043298, + "num_input_tokens_seen": 243605536, + "router_z_loss_mlp": 0.29980469, + "step": 2918, + "time_per_iteration": 2.9078075885772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085954, + "balance_loss_mlp": 1.05662882, + "epoch": 0.5615621392843402, + "flos": 642655130112.0, + "grad_norm": 0.049198929687541054, + "language_loss": 0.88353539, + "learning_rate": 0.00042489213355292687, + "loss": 0.89439487, + "num_input_tokens_seen": 243684208, + "router_z_loss_mlp": 0.29296875, + "step": 2919, + "time_per_iteration": 2.862194776535034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093265, + "balance_loss_mlp": 1.06300998, + "epoch": 0.5617545209696037, + "flos": 427750543872.0, + "grad_norm": 0.0619251317266344, + "language_loss": 0.81301886, + "learning_rate": 0.00042458414083772276, + "loss": 0.82395148, + "num_input_tokens_seen": 243749376, + "router_z_loss_mlp": 0.30224609, + "step": 2920, + "time_per_iteration": 2.5329933166503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095136, + "balance_loss_mlp": 1.0651195, + "epoch": 0.5619469026548672, + "flos": 568429125120.0, + "grad_norm": 0.05517349890350355, + "language_loss": 0.8525691, + "learning_rate": 0.000424276177401552, + "loss": 0.86352038, + "num_input_tokens_seen": 243828096, + "router_z_loss_mlp": 0.29956055, + "step": 2921, + "time_per_iteration": 2.787318468093872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092355, + "balance_loss_mlp": 1.06200445, + "epoch": 0.5621392843401308, + "flos": 505205807616.0, + "grad_norm": 0.06500569481536145, + "language_loss": 0.85831988, + "learning_rate": 0.0004239682433639763, + "loss": 0.86924338, + "num_input_tokens_seen": 243896752, + "router_z_loss_mlp": 0.3034668, + "step": 2922, + "time_per_iteration": 2.697091817855835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093283, + "balance_loss_mlp": 1.06386256, + "epoch": 0.5623316660253944, + "flos": 516996628992.0, + "grad_norm": 0.08309086608315261, + "language_loss": 0.85596514, + "learning_rate": 0.0004236603388445467, + "loss": 0.86689794, + "num_input_tokens_seen": 243964592, + "router_z_loss_mlp": 0.29394531, + "step": 2923, + "time_per_iteration": 2.5720105171203613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097102, + "balance_loss_mlp": 1.0683012, + "epoch": 0.5625240477106579, + "flos": 606012917760.0, + "grad_norm": 0.07246274776201297, + "language_loss": 0.82229364, + "learning_rate": 0.00042335246396280166, + "loss": 0.83326471, + "num_input_tokens_seen": 244036656, + "router_z_loss_mlp": 0.28808594, + "step": 2924, + "time_per_iteration": 2.7669975757598877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093178, + "balance_loss_mlp": 1.06320906, + "epoch": 0.5627164293959215, + "flos": 450430253568.0, + "grad_norm": 0.06414999121973448, + "language_loss": 0.90646857, + "learning_rate": 0.0004230446188382693, + "loss": 0.91740036, + "num_input_tokens_seen": 244102704, + "router_z_loss_mlp": 0.29956055, + "step": 2925, + "time_per_iteration": 2.5662741661071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088889, + "balance_loss_mlp": 1.0595876, + "epoch": 0.5629088110811851, + "flos": 742073550336.0, + "grad_norm": 0.05389869275215176, + "language_loss": 0.80918074, + "learning_rate": 0.0004227368035904654, + "loss": 0.82006967, + "num_input_tokens_seen": 244186640, + "router_z_loss_mlp": 0.29296875, + "step": 2926, + "time_per_iteration": 2.964599370956421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092675, + "balance_loss_mlp": 1.06249142, + "epoch": 0.5631011927664487, + "flos": 496970588160.0, + "grad_norm": 0.06261422618617216, + "language_loss": 0.82895541, + "learning_rate": 0.00042242901833889474, + "loss": 0.83988214, + "num_input_tokens_seen": 244257680, + "router_z_loss_mlp": 0.30151367, + "step": 2927, + "time_per_iteration": 2.6312665939331055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093424, + "balance_loss_mlp": 1.06376481, + "epoch": 0.5632935744517122, + "flos": 886137408000.0, + "grad_norm": 0.06041695665754469, + "language_loss": 0.86030155, + "learning_rate": 0.0004221212632030501, + "loss": 0.87123579, + "num_input_tokens_seen": 244331248, + "router_z_loss_mlp": 0.29614258, + "step": 2928, + "time_per_iteration": 3.0977063179016113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094061, + "balance_loss_mlp": 1.06351972, + "epoch": 0.5634859561369757, + "flos": 604792355328.0, + "grad_norm": 0.06366283736150324, + "language_loss": 0.80857551, + "learning_rate": 0.0004218135383024124, + "loss": 0.81951618, + "num_input_tokens_seen": 244403920, + "router_z_loss_mlp": 0.30541992, + "step": 2929, + "time_per_iteration": 2.749244213104248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088519, + "balance_loss_mlp": 1.0590266, + "epoch": 0.5636783378222393, + "flos": 453916472832.0, + "grad_norm": 0.12143433952472552, + "language_loss": 0.85715157, + "learning_rate": 0.0004215058437564511, + "loss": 0.86803675, + "num_input_tokens_seen": 244470464, + "router_z_loss_mlp": 0.29467773, + "step": 2930, + "time_per_iteration": 2.593238115310669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084288, + "balance_loss_mlp": 1.05512953, + "epoch": 0.5638707195075029, + "flos": 518456899584.0, + "grad_norm": 0.056033125460513485, + "language_loss": 0.82132083, + "learning_rate": 0.00042119817968462397, + "loss": 0.83216375, + "num_input_tokens_seen": 244536864, + "router_z_loss_mlp": 0.29125977, + "step": 2931, + "time_per_iteration": 2.591958522796631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092676, + "balance_loss_mlp": 1.06275427, + "epoch": 0.5640631011927665, + "flos": 564873896448.0, + "grad_norm": 0.07812059497351068, + "language_loss": 0.87152535, + "learning_rate": 0.0004208905462063766, + "loss": 0.88245207, + "num_input_tokens_seen": 244603344, + "router_z_loss_mlp": 0.29907227, + "step": 2932, + "time_per_iteration": 2.6288535594940186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086522, + "balance_loss_mlp": 1.0571723, + "epoch": 0.56425548287803, + "flos": 517033704960.0, + "grad_norm": 0.06389283518633071, + "language_loss": 0.84869772, + "learning_rate": 0.00042058294344114315, + "loss": 0.85956293, + "num_input_tokens_seen": 244671984, + "router_z_loss_mlp": 0.29345703, + "step": 2933, + "time_per_iteration": 2.6064674854278564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086595, + "balance_loss_mlp": 1.05672109, + "epoch": 0.5644478645632935, + "flos": 854258876928.0, + "grad_norm": 0.05718901807458546, + "language_loss": 0.77749109, + "learning_rate": 0.0004202753715083456, + "loss": 0.78835702, + "num_input_tokens_seen": 244754000, + "router_z_loss_mlp": 0.29858398, + "step": 2934, + "time_per_iteration": 3.075186014175415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093891, + "balance_loss_mlp": 1.0630157, + "epoch": 0.5646402462485571, + "flos": 553438780416.0, + "grad_norm": 0.07168087831316133, + "language_loss": 0.81911719, + "learning_rate": 0.0004199678305273936, + "loss": 0.83005607, + "num_input_tokens_seen": 244820896, + "router_z_loss_mlp": 0.30883789, + "step": 2935, + "time_per_iteration": 2.6289923191070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091967, + "balance_loss_mlp": 1.06316626, + "epoch": 0.5648326279338207, + "flos": 685990798848.0, + "grad_norm": 0.0664481148229904, + "language_loss": 0.81315005, + "learning_rate": 0.0004196603206176854, + "loss": 0.82406974, + "num_input_tokens_seen": 244904464, + "router_z_loss_mlp": 0.28808594, + "step": 2936, + "time_per_iteration": 2.941150426864624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093274, + "balance_loss_mlp": 1.06404424, + "epoch": 0.5650250096190843, + "flos": 803327818752.0, + "grad_norm": 0.07427925135014142, + "language_loss": 0.83779049, + "learning_rate": 0.000419352841898607, + "loss": 0.84872323, + "num_input_tokens_seen": 244983760, + "router_z_loss_mlp": 0.29199219, + "step": 2937, + "time_per_iteration": 2.977189302444458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092016, + "balance_loss_mlp": 1.06273842, + "epoch": 0.5652173913043478, + "flos": 582058317312.0, + "grad_norm": 0.061049572757767595, + "language_loss": 0.77780819, + "learning_rate": 0.000419045394489532, + "loss": 0.78872836, + "num_input_tokens_seen": 245053184, + "router_z_loss_mlp": 0.29296875, + "step": 2938, + "time_per_iteration": 2.6722819805145264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086194, + "balance_loss_mlp": 1.05622458, + "epoch": 0.5654097729896114, + "flos": 820648060416.0, + "grad_norm": 0.05727154642915785, + "language_loss": 0.77326584, + "learning_rate": 0.0004187379785098224, + "loss": 0.78412783, + "num_input_tokens_seen": 245137408, + "router_z_loss_mlp": 0.29931641, + "step": 2939, + "time_per_iteration": 3.100283622741699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086461, + "balance_loss_mlp": 1.05665886, + "epoch": 0.565602154674875, + "flos": 784156723200.0, + "grad_norm": 0.06949350877551969, + "language_loss": 0.83849806, + "learning_rate": 0.00041843059407882744, + "loss": 0.84936267, + "num_input_tokens_seen": 245215504, + "router_z_loss_mlp": 0.29785156, + "step": 2940, + "time_per_iteration": 2.9837162494659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082258, + "balance_loss_mlp": 1.05257499, + "epoch": 0.5657945363601385, + "flos": 549683117568.0, + "grad_norm": 0.068553917777786, + "language_loss": 0.82768112, + "learning_rate": 0.0004181232413158842, + "loss": 0.83850372, + "num_input_tokens_seen": 245286032, + "router_z_loss_mlp": 0.29638672, + "step": 2941, + "time_per_iteration": 2.636819839477539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083664, + "balance_loss_mlp": 1.05371857, + "epoch": 0.5659869180454021, + "flos": 668126900736.0, + "grad_norm": 0.06960240931548377, + "language_loss": 0.82932127, + "learning_rate": 0.0004178159203403179, + "loss": 0.84015793, + "num_input_tokens_seen": 245359040, + "router_z_loss_mlp": 0.29931641, + "step": 2942, + "time_per_iteration": 2.822134494781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083227, + "balance_loss_mlp": 1.0547837, + "epoch": 0.5661792997306656, + "flos": 499955369472.0, + "grad_norm": 0.05318601865014104, + "language_loss": 0.81807715, + "learning_rate": 0.0004175086312714409, + "loss": 0.8289094, + "num_input_tokens_seen": 245426384, + "router_z_loss_mlp": 0.28442383, + "step": 2943, + "time_per_iteration": 2.571985960006714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086509, + "balance_loss_mlp": 1.05625343, + "epoch": 0.5663716814159292, + "flos": 601209589248.0, + "grad_norm": 0.05713331418457596, + "language_loss": 0.84120524, + "learning_rate": 0.00041720137422855366, + "loss": 0.85207033, + "num_input_tokens_seen": 245501216, + "router_z_loss_mlp": 0.30224609, + "step": 2944, + "time_per_iteration": 2.7213711738586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086525, + "balance_loss_mlp": 1.05758142, + "epoch": 0.5665640631011928, + "flos": 540988305408.0, + "grad_norm": 0.1661240742061477, + "language_loss": 0.79230917, + "learning_rate": 0.00041689414933094383, + "loss": 0.80317438, + "num_input_tokens_seen": 245571600, + "router_z_loss_mlp": 0.28930664, + "step": 2945, + "time_per_iteration": 2.628525733947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088063, + "balance_loss_mlp": 1.05954862, + "epoch": 0.5667564447864564, + "flos": 601936054272.0, + "grad_norm": 0.06338169436240754, + "language_loss": 0.81427538, + "learning_rate": 0.00041658695669788653, + "loss": 0.82515597, + "num_input_tokens_seen": 245645632, + "router_z_loss_mlp": 0.28515625, + "step": 2946, + "time_per_iteration": 2.736955404281616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084859, + "balance_loss_mlp": 1.0541029, + "epoch": 0.5669488264717198, + "flos": 659523492864.0, + "grad_norm": 0.0612697940531113, + "language_loss": 0.81293368, + "learning_rate": 0.00041627979644864453, + "loss": 0.82378221, + "num_input_tokens_seen": 245715776, + "router_z_loss_mlp": 0.30712891, + "step": 2947, + "time_per_iteration": 2.780796766281128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083023, + "balance_loss_mlp": 1.05436563, + "epoch": 0.5671412081569834, + "flos": 485402222592.0, + "grad_norm": 0.06710047446863547, + "language_loss": 0.81410027, + "learning_rate": 0.0004159726687024683, + "loss": 0.82493049, + "num_input_tokens_seen": 245785328, + "router_z_loss_mlp": 0.28662109, + "step": 2948, + "time_per_iteration": 2.6072115898132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108621, + "balance_loss_mlp": 1.05757558, + "epoch": 0.567333589842247, + "flos": 729801114624.0, + "grad_norm": 0.06141378811636639, + "language_loss": 0.79485345, + "learning_rate": 0.00041566557357859506, + "loss": 0.80571556, + "num_input_tokens_seen": 245858000, + "router_z_loss_mlp": 0.28613281, + "step": 2949, + "time_per_iteration": 2.911865234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085023, + "balance_loss_mlp": 1.05443358, + "epoch": 0.5675259715275106, + "flos": 968887526400.0, + "grad_norm": 0.052257384193144164, + "language_loss": 0.79611081, + "learning_rate": 0.0004153585111962502, + "loss": 0.806961, + "num_input_tokens_seen": 245950640, + "router_z_loss_mlp": 0.30566406, + "step": 2950, + "time_per_iteration": 3.2808187007904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086724, + "balance_loss_mlp": 1.05606341, + "epoch": 0.5677183532127742, + "flos": 565145538048.0, + "grad_norm": 0.06672147261233864, + "language_loss": 0.84739614, + "learning_rate": 0.0004150514816746453, + "loss": 0.85826337, + "num_input_tokens_seen": 246019568, + "router_z_loss_mlp": 0.30639648, + "step": 2951, + "time_per_iteration": 2.680326461791992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089698, + "balance_loss_mlp": 1.0602051, + "epoch": 0.5679107348980377, + "flos": 551694385152.0, + "grad_norm": 0.06944116544696582, + "language_loss": 0.85944223, + "learning_rate": 0.0004147444851329802, + "loss": 0.87033927, + "num_input_tokens_seen": 246089520, + "router_z_loss_mlp": 0.29443359, + "step": 2952, + "time_per_iteration": 2.6477670669555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086586, + "balance_loss_mlp": 1.05680704, + "epoch": 0.5681031165833013, + "flos": 819459804672.0, + "grad_norm": 0.054427499920313586, + "language_loss": 0.86026949, + "learning_rate": 0.00041443752169044126, + "loss": 0.87113535, + "num_input_tokens_seen": 246165920, + "router_z_loss_mlp": 0.29736328, + "step": 2953, + "time_per_iteration": 2.997781276702881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092153, + "balance_loss_mlp": 1.061993, + "epoch": 0.5682954982685648, + "flos": 618013711872.0, + "grad_norm": 0.055407826164880256, + "language_loss": 0.84948021, + "learning_rate": 0.0004141305914662025, + "loss": 0.86040175, + "num_input_tokens_seen": 246238672, + "router_z_loss_mlp": 0.30126953, + "step": 2954, + "time_per_iteration": 2.704019069671631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087398, + "balance_loss_mlp": 1.05688024, + "epoch": 0.5684878799538284, + "flos": 647949984768.0, + "grad_norm": 0.0673052072270573, + "language_loss": 0.80911326, + "learning_rate": 0.0004138236945794246, + "loss": 0.81998718, + "num_input_tokens_seen": 246320208, + "router_z_loss_mlp": 0.30493164, + "step": 2955, + "time_per_iteration": 2.88403058052063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108918, + "balance_loss_mlp": 1.05911565, + "epoch": 0.5686802616390919, + "flos": 805961664000.0, + "grad_norm": 0.06799730214965168, + "language_loss": 0.8379457, + "learning_rate": 0.00041351683114925576, + "loss": 0.84883749, + "num_input_tokens_seen": 246406464, + "router_z_loss_mlp": 0.30053711, + "step": 2956, + "time_per_iteration": 3.0439462661743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087169, + "balance_loss_mlp": 1.0562458, + "epoch": 0.5688726433243555, + "flos": 547140676608.0, + "grad_norm": 0.06948923214023794, + "language_loss": 0.86469889, + "learning_rate": 0.0004132100012948308, + "loss": 0.87557054, + "num_input_tokens_seen": 246477456, + "router_z_loss_mlp": 0.30883789, + "step": 2957, + "time_per_iteration": 2.6431198120117188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090494, + "balance_loss_mlp": 1.05959463, + "epoch": 0.5690650250096191, + "flos": 486568456704.0, + "grad_norm": 0.0655655158566539, + "language_loss": 0.84699452, + "learning_rate": 0.00041290320513527145, + "loss": 0.85789943, + "num_input_tokens_seen": 246541744, + "router_z_loss_mlp": 0.30883789, + "step": 2958, + "time_per_iteration": 2.5978519916534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085083, + "balance_loss_mlp": 1.05528057, + "epoch": 0.5692574066948827, + "flos": 577457620992.0, + "grad_norm": 0.05333030562061355, + "language_loss": 0.8519215, + "learning_rate": 0.0004125964427896867, + "loss": 0.86277229, + "num_input_tokens_seen": 246611440, + "router_z_loss_mlp": 0.29760742, + "step": 2959, + "time_per_iteration": 2.671543836593628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084341, + "balance_loss_mlp": 1.05468178, + "epoch": 0.5694497883801463, + "flos": 454247585280.0, + "grad_norm": 0.06459683266000829, + "language_loss": 0.79222417, + "learning_rate": 0.0004122897143771723, + "loss": 0.80306756, + "num_input_tokens_seen": 246676496, + "router_z_loss_mlp": 0.29663086, + "step": 2960, + "time_per_iteration": 2.5457372665405273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087464, + "balance_loss_mlp": 1.05713725, + "epoch": 0.5696421700654097, + "flos": 559516999680.0, + "grad_norm": 0.057309213891239566, + "language_loss": 0.81961918, + "learning_rate": 0.0004119830200168109, + "loss": 0.83049381, + "num_input_tokens_seen": 246746464, + "router_z_loss_mlp": 0.30297852, + "step": 2961, + "time_per_iteration": 2.66658091545105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080369, + "balance_loss_mlp": 1.05180621, + "epoch": 0.5698345517506733, + "flos": 465551649792.0, + "grad_norm": 0.0578679247611712, + "language_loss": 0.88614476, + "learning_rate": 0.0004116763598276714, + "loss": 0.89694846, + "num_input_tokens_seen": 246811808, + "router_z_loss_mlp": 0.28564453, + "step": 2962, + "time_per_iteration": 2.5355417728424072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083269, + "balance_loss_mlp": 1.05394387, + "epoch": 0.5700269334359369, + "flos": 605953446912.0, + "grad_norm": 0.05524318032555551, + "language_loss": 0.81030452, + "learning_rate": 0.00041136973392881017, + "loss": 0.82113719, + "num_input_tokens_seen": 246890432, + "router_z_loss_mlp": 0.29345703, + "step": 2963, + "time_per_iteration": 2.8497612476348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085542, + "balance_loss_mlp": 1.05540633, + "epoch": 0.5702193151212005, + "flos": 562709182464.0, + "grad_norm": 0.06477225886122127, + "language_loss": 0.82179135, + "learning_rate": 0.00041106314243926983, + "loss": 0.83264679, + "num_input_tokens_seen": 246959616, + "router_z_loss_mlp": 0.30102539, + "step": 2964, + "time_per_iteration": 2.735269069671631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080389, + "balance_loss_mlp": 1.05103993, + "epoch": 0.570411696806464, + "flos": 523247745024.0, + "grad_norm": 0.05516182837620622, + "language_loss": 0.87329233, + "learning_rate": 0.0004107565854780798, + "loss": 0.88409621, + "num_input_tokens_seen": 247030656, + "router_z_loss_mlp": 0.29296875, + "step": 2965, + "time_per_iteration": 2.6157355308532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085364, + "balance_loss_mlp": 1.05596685, + "epoch": 0.5706040784917276, + "flos": 718222837248.0, + "grad_norm": 0.07414316825555053, + "language_loss": 0.81466991, + "learning_rate": 0.000410450063164256, + "loss": 0.82552361, + "num_input_tokens_seen": 247105872, + "router_z_loss_mlp": 0.29370117, + "step": 2966, + "time_per_iteration": 2.8378820419311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083362, + "balance_loss_mlp": 1.05291581, + "epoch": 0.5707964601769911, + "flos": 476707410432.0, + "grad_norm": 0.06746080357230834, + "language_loss": 0.82004952, + "learning_rate": 0.00041014357561680115, + "loss": 0.83088315, + "num_input_tokens_seen": 247170448, + "router_z_loss_mlp": 0.30395508, + "step": 2967, + "time_per_iteration": 2.51119065284729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085478, + "balance_loss_mlp": 1.05519855, + "epoch": 0.5709888418622547, + "flos": 580101378048.0, + "grad_norm": 0.053142332405834165, + "language_loss": 0.86128843, + "learning_rate": 0.0004098371229547039, + "loss": 0.87214315, + "num_input_tokens_seen": 247240400, + "router_z_loss_mlp": 0.30249023, + "step": 2968, + "time_per_iteration": 2.6994621753692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022253, + "balance_loss_mlp": 1.01390862, + "epoch": 0.5711812235475183, + "flos": 1579922910720.0, + "grad_norm": 0.025900339106917806, + "language_loss": 0.80010808, + "learning_rate": 0.0004095307052969399, + "loss": 0.81033063, + "num_input_tokens_seen": 247469136, + "router_z_loss_mlp": 0.08349609, + "step": 2969, + "time_per_iteration": 4.718291997909546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092241, + "balance_loss_mlp": 1.06179523, + "epoch": 0.5713736052327818, + "flos": 468506695680.0, + "grad_norm": 0.05366083523781242, + "language_loss": 0.80585647, + "learning_rate": 0.00040922432276247107, + "loss": 0.8167789, + "num_input_tokens_seen": 247537712, + "router_z_loss_mlp": 0.30419922, + "step": 2970, + "time_per_iteration": 2.55259108543396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091948, + "balance_loss_mlp": 1.0609777, + "epoch": 0.5715659869180454, + "flos": 537662499840.0, + "grad_norm": 0.049420251796361614, + "language_loss": 0.84874177, + "learning_rate": 0.0004089179754702457, + "loss": 0.85966122, + "num_input_tokens_seen": 247613872, + "router_z_loss_mlp": 0.30932617, + "step": 2971, + "time_per_iteration": 2.771068572998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109211, + "balance_loss_mlp": 1.06090152, + "epoch": 0.571758368603309, + "flos": 656071778304.0, + "grad_norm": 0.06283275659801735, + "language_loss": 0.7981565, + "learning_rate": 0.00040861166353919843, + "loss": 0.80907762, + "num_input_tokens_seen": 247686064, + "router_z_loss_mlp": 0.31176758, + "step": 2972, + "time_per_iteration": 2.7827725410461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091117, + "balance_loss_mlp": 1.06069493, + "epoch": 0.5719507502885726, + "flos": 667907016192.0, + "grad_norm": 0.06507135137823726, + "language_loss": 0.818784, + "learning_rate": 0.00040830538708824983, + "loss": 0.82969517, + "num_input_tokens_seen": 247760384, + "router_z_loss_mlp": 0.30395508, + "step": 2973, + "time_per_iteration": 2.845456600189209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108959, + "balance_loss_mlp": 1.05966854, + "epoch": 0.572143131973836, + "flos": 476321969664.0, + "grad_norm": 0.07493195148818688, + "language_loss": 0.81968939, + "learning_rate": 0.000407999146236307, + "loss": 0.8305853, + "num_input_tokens_seen": 247824768, + "router_z_loss_mlp": 0.29882812, + "step": 2974, + "time_per_iteration": 2.531430244445801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093173, + "balance_loss_mlp": 1.06284618, + "epoch": 0.5723355136590996, + "flos": 539510782464.0, + "grad_norm": 0.06121365308687838, + "language_loss": 0.8362776, + "learning_rate": 0.0004076929411022634, + "loss": 0.84720927, + "num_input_tokens_seen": 247894448, + "router_z_loss_mlp": 0.30322266, + "step": 2975, + "time_per_iteration": 2.645341634750366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096437, + "balance_loss_mlp": 1.06591964, + "epoch": 0.5725278953443632, + "flos": 824156674560.0, + "grad_norm": 0.05509159729755976, + "language_loss": 0.79606473, + "learning_rate": 0.0004073867718049982, + "loss": 0.80702913, + "num_input_tokens_seen": 247976432, + "router_z_loss_mlp": 0.30493164, + "step": 2976, + "time_per_iteration": 3.085145950317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102446, + "balance_loss_mlp": 1.07137978, + "epoch": 0.5727202770296268, + "flos": 587437235712.0, + "grad_norm": 0.06232705756749319, + "language_loss": 0.82691067, + "learning_rate": 0.00040708063846337704, + "loss": 0.83793509, + "num_input_tokens_seen": 248048800, + "router_z_loss_mlp": 0.31054688, + "step": 2977, + "time_per_iteration": 2.738443613052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099181, + "balance_loss_mlp": 1.06813931, + "epoch": 0.5729126587148904, + "flos": 446966055936.0, + "grad_norm": 0.061703964741206326, + "language_loss": 0.81214464, + "learning_rate": 0.00040677454119625143, + "loss": 0.82313639, + "num_input_tokens_seen": 248116496, + "router_z_loss_mlp": 0.31005859, + "step": 2978, + "time_per_iteration": 2.6232175827026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108887, + "balance_loss_mlp": 1.07758296, + "epoch": 0.5731050404001539, + "flos": 519457577472.0, + "grad_norm": 0.07073355508195153, + "language_loss": 0.83247018, + "learning_rate": 0.0004064684801224587, + "loss": 0.84355903, + "num_input_tokens_seen": 248184960, + "router_z_loss_mlp": 0.31274414, + "step": 2979, + "time_per_iteration": 2.577918767929077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101163, + "balance_loss_mlp": 1.07085991, + "epoch": 0.5732974220854175, + "flos": 504775950336.0, + "grad_norm": 0.05699497583041508, + "language_loss": 0.80492741, + "learning_rate": 0.00040616245536082224, + "loss": 0.81593907, + "num_input_tokens_seen": 248252208, + "router_z_loss_mlp": 0.30273438, + "step": 2980, + "time_per_iteration": 2.6298904418945312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101523, + "balance_loss_mlp": 1.07167256, + "epoch": 0.573489803770681, + "flos": 592485041664.0, + "grad_norm": 0.04979780559516064, + "language_loss": 0.81357765, + "learning_rate": 0.00040585646703015165, + "loss": 0.82459289, + "num_input_tokens_seen": 248333312, + "router_z_loss_mlp": 0.29833984, + "step": 2981, + "time_per_iteration": 2.8170647621154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102826, + "balance_loss_mlp": 1.07118809, + "epoch": 0.5736821854559446, + "flos": 489911514624.0, + "grad_norm": 0.07486213422343042, + "language_loss": 0.78689104, + "learning_rate": 0.0004055505152492419, + "loss": 0.79791927, + "num_input_tokens_seen": 248403808, + "router_z_loss_mlp": 0.31616211, + "step": 2982, + "time_per_iteration": 2.6379241943359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100032, + "balance_loss_mlp": 1.06825066, + "epoch": 0.5738745671412081, + "flos": 458156321280.0, + "grad_norm": 0.05681665302183781, + "language_loss": 0.74231875, + "learning_rate": 0.00040524460013687425, + "loss": 0.75331908, + "num_input_tokens_seen": 248477184, + "router_z_loss_mlp": 0.31762695, + "step": 2983, + "time_per_iteration": 2.7545318603515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097699, + "balance_loss_mlp": 1.0663712, + "epoch": 0.5740669488264717, + "flos": 580333372416.0, + "grad_norm": 0.04476617807489617, + "language_loss": 0.81250238, + "learning_rate": 0.0004049387218118155, + "loss": 0.82347941, + "num_input_tokens_seen": 248565552, + "router_z_loss_mlp": 0.31298828, + "step": 2984, + "time_per_iteration": 2.9756665229797363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108902, + "balance_loss_mlp": 1.05816841, + "epoch": 0.5742593305117353, + "flos": 524438572032.0, + "grad_norm": 0.07928255171477795, + "language_loss": 0.85245347, + "learning_rate": 0.00040463288039281777, + "loss": 0.8633436, + "num_input_tokens_seen": 248635456, + "router_z_loss_mlp": 0.30810547, + "step": 2985, + "time_per_iteration": 2.706669807434082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034493, + "balance_loss_mlp": 1.02681565, + "epoch": 0.5744517121969989, + "flos": 1553877748224.0, + "grad_norm": 0.02538869827055974, + "language_loss": 0.77876419, + "learning_rate": 0.0004043270759986194, + "loss": 0.78910911, + "num_input_tokens_seen": 248870160, + "router_z_loss_mlp": 0.07666016, + "step": 2986, + "time_per_iteration": 4.949368953704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108476, + "balance_loss_mlp": 1.05462396, + "epoch": 0.5746440938822625, + "flos": 751919915520.0, + "grad_norm": 0.060127228305881374, + "language_loss": 0.82366645, + "learning_rate": 0.0004040213087479444, + "loss": 0.83451408, + "num_input_tokens_seen": 248946960, + "router_z_loss_mlp": 0.30102539, + "step": 2987, + "time_per_iteration": 2.9205455780029297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086178, + "balance_loss_mlp": 1.05723405, + "epoch": 0.5748364755675259, + "flos": 501865320960.0, + "grad_norm": 0.05965622667733625, + "language_loss": 0.85328299, + "learning_rate": 0.0004037155787595018, + "loss": 0.86414474, + "num_input_tokens_seen": 249014128, + "router_z_loss_mlp": 0.2890625, + "step": 2988, + "time_per_iteration": 2.574509859085083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088474, + "balance_loss_mlp": 1.0593158, + "epoch": 0.5750288572527895, + "flos": 504044342784.0, + "grad_norm": 0.05784717048255493, + "language_loss": 0.80869853, + "learning_rate": 0.000403409886151987, + "loss": 0.8195833, + "num_input_tokens_seen": 249090016, + "router_z_loss_mlp": 0.29125977, + "step": 2989, + "time_per_iteration": 2.945080041885376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016432, + "balance_loss_mlp": 1.00894582, + "epoch": 0.5752212389380531, + "flos": 1541365604352.0, + "grad_norm": 0.009946927491071988, + "language_loss": 0.81999105, + "learning_rate": 0.0004031042310440799, + "loss": 0.83015537, + "num_input_tokens_seen": 249305552, + "router_z_loss_mlp": 0.07470703, + "step": 2990, + "time_per_iteration": 4.807205677032471 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015586, + "balance_loss_mlp": 1.00809932, + "epoch": 0.5754136206233167, + "flos": 1567331472384.0, + "grad_norm": 0.009078458393910433, + "language_loss": 0.781986, + "learning_rate": 0.00040279861355444656, + "loss": 0.79214191, + "num_input_tokens_seen": 249523408, + "router_z_loss_mlp": 0.07470703, + "step": 2991, + "time_per_iteration": 4.805190563201904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084133, + "balance_loss_mlp": 1.05380619, + "epoch": 0.5756060023085803, + "flos": 798156301824.0, + "grad_norm": 0.05637441563568418, + "language_loss": 0.76644433, + "learning_rate": 0.00040249303380173807, + "loss": 0.77728564, + "num_input_tokens_seen": 249616624, + "router_z_loss_mlp": 0.30322266, + "step": 2992, + "time_per_iteration": 3.049729108810425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090631, + "balance_loss_mlp": 1.05780125, + "epoch": 0.5757983839938438, + "flos": 587877004800.0, + "grad_norm": 0.06616333205601678, + "language_loss": 0.79290402, + "learning_rate": 0.00040218749190459126, + "loss": 0.80381036, + "num_input_tokens_seen": 249689936, + "router_z_loss_mlp": 0.32836914, + "step": 2993, + "time_per_iteration": 2.7314000129699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087925, + "balance_loss_mlp": 1.05795622, + "epoch": 0.5759907656791073, + "flos": 516831072768.0, + "grad_norm": 0.06422497492556134, + "language_loss": 0.82827115, + "learning_rate": 0.00040188198798162775, + "loss": 0.83915043, + "num_input_tokens_seen": 249759984, + "router_z_loss_mlp": 0.29956055, + "step": 2994, + "time_per_iteration": 2.605794668197632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089955, + "balance_loss_mlp": 1.06022453, + "epoch": 0.5761831473643709, + "flos": 587133287424.0, + "grad_norm": 0.05264744908201922, + "language_loss": 0.85358101, + "learning_rate": 0.000401576522151455, + "loss": 0.8644805, + "num_input_tokens_seen": 249837888, + "router_z_loss_mlp": 0.29711914, + "step": 2995, + "time_per_iteration": 2.8504650592803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085925, + "balance_loss_mlp": 1.05664682, + "epoch": 0.5763755290496345, + "flos": 543896363520.0, + "grad_norm": 0.05051873290535222, + "language_loss": 0.83133811, + "learning_rate": 0.0004012710945326651, + "loss": 0.8421973, + "num_input_tokens_seen": 249913584, + "router_z_loss_mlp": 0.29248047, + "step": 2996, + "time_per_iteration": 2.7823193073272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094348, + "balance_loss_mlp": 1.06545174, + "epoch": 0.576567910734898, + "flos": 626229107712.0, + "grad_norm": 0.0711371625716349, + "language_loss": 0.81093514, + "learning_rate": 0.0004009657052438355, + "loss": 0.82187867, + "num_input_tokens_seen": 249992144, + "router_z_loss_mlp": 0.28881836, + "step": 2997, + "time_per_iteration": 2.7743020057678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091289, + "balance_loss_mlp": 1.06184435, + "epoch": 0.5767602924201616, + "flos": 538243232256.0, + "grad_norm": 0.06367440987852575, + "language_loss": 0.85650682, + "learning_rate": 0.00040066035440352904, + "loss": 0.86741972, + "num_input_tokens_seen": 250060736, + "router_z_loss_mlp": 0.29418945, + "step": 2998, + "time_per_iteration": 2.6359331607818604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014946, + "balance_loss_mlp": 1.0071255, + "epoch": 0.5769526741054252, + "flos": 1559778301440.0, + "grad_norm": 0.01828635150904939, + "language_loss": 0.79293132, + "learning_rate": 0.0004003550421302934, + "loss": 0.8030808, + "num_input_tokens_seen": 250296864, + "router_z_loss_mlp": 0.078125, + "step": 2999, + "time_per_iteration": 4.881432056427002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104047, + "balance_loss_mlp": 1.07417345, + "epoch": 0.5771450557906888, + "flos": 468185495040.0, + "grad_norm": 0.0709915390631299, + "language_loss": 0.76451176, + "learning_rate": 0.00040004976854266145, + "loss": 0.77555221, + "num_input_tokens_seen": 250362528, + "router_z_loss_mlp": 0.2980957, + "step": 3000, + "time_per_iteration": 2.5374131202697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101005, + "balance_loss_mlp": 1.07017779, + "epoch": 0.5773374374759523, + "flos": 574556903424.0, + "grad_norm": 0.051209677129469174, + "language_loss": 0.81337965, + "learning_rate": 0.0003997445337591505, + "loss": 0.8243897, + "num_input_tokens_seen": 250432768, + "router_z_loss_mlp": 0.30810547, + "step": 3001, + "time_per_iteration": 2.647610902786255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102438, + "balance_loss_mlp": 1.07351804, + "epoch": 0.5775298191612158, + "flos": 528473590272.0, + "grad_norm": 0.0611265357111255, + "language_loss": 0.74261576, + "learning_rate": 0.0003994393378982635, + "loss": 0.75364017, + "num_input_tokens_seen": 250501504, + "router_z_loss_mlp": 0.28979492, + "step": 3002, + "time_per_iteration": 2.602245330810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013935, + "balance_loss_mlp": 1.00611448, + "epoch": 0.5777222008464794, + "flos": 1303919700480.0, + "grad_norm": 0.01032263408282017, + "language_loss": 0.79538, + "learning_rate": 0.00039913418107848786, + "loss": 0.80551934, + "num_input_tokens_seen": 250733632, + "router_z_loss_mlp": 0.078125, + "step": 3003, + "time_per_iteration": 4.818480968475342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104089, + "balance_loss_mlp": 1.07304692, + "epoch": 0.577914582531743, + "flos": 603633461760.0, + "grad_norm": 0.0604287320481862, + "language_loss": 0.88041145, + "learning_rate": 0.0003988290634182961, + "loss": 0.89145231, + "num_input_tokens_seen": 250809152, + "router_z_loss_mlp": 0.31005859, + "step": 3004, + "time_per_iteration": 2.7484169006347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102413, + "balance_loss_mlp": 1.07284904, + "epoch": 0.5781069642170066, + "flos": 486795681792.0, + "grad_norm": 0.06655998832299866, + "language_loss": 0.80592918, + "learning_rate": 0.0003985239850361453, + "loss": 0.81695324, + "num_input_tokens_seen": 250879152, + "router_z_loss_mlp": 0.29541016, + "step": 3005, + "time_per_iteration": 2.6148018836975098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102049, + "balance_loss_mlp": 1.07281876, + "epoch": 0.5782993459022701, + "flos": 506295318528.0, + "grad_norm": 0.0659443256400084, + "language_loss": 0.84734911, + "learning_rate": 0.0003982189460504777, + "loss": 0.85836959, + "num_input_tokens_seen": 250949904, + "router_z_loss_mlp": 0.29199219, + "step": 3006, + "time_per_iteration": 2.7011501789093018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105808, + "balance_loss_mlp": 1.07540917, + "epoch": 0.5784917275875336, + "flos": 602155938816.0, + "grad_norm": 0.06531961229333205, + "language_loss": 0.7939682, + "learning_rate": 0.00039791394657971935, + "loss": 0.80502629, + "num_input_tokens_seen": 251020976, + "router_z_loss_mlp": 0.30371094, + "step": 3007, + "time_per_iteration": 2.7082760334014893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102193, + "balance_loss_mlp": 1.07234263, + "epoch": 0.5786841092727972, + "flos": 521540425728.0, + "grad_norm": 0.06476760562978502, + "language_loss": 0.8421638, + "learning_rate": 0.00039760898674228205, + "loss": 0.85318571, + "num_input_tokens_seen": 251093280, + "router_z_loss_mlp": 0.29858398, + "step": 3008, + "time_per_iteration": 2.650878429412842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105056, + "balance_loss_mlp": 1.07475293, + "epoch": 0.5788764909580608, + "flos": 767404357632.0, + "grad_norm": 0.05525540739637584, + "language_loss": 0.80765337, + "learning_rate": 0.0003973040666565613, + "loss": 0.81870395, + "num_input_tokens_seen": 251181376, + "router_z_loss_mlp": 0.30273438, + "step": 3009, + "time_per_iteration": 3.1226985454559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100227, + "balance_loss_mlp": 1.07030547, + "epoch": 0.5790688726433244, + "flos": 599094434304.0, + "grad_norm": 0.06024611276807751, + "language_loss": 0.82195163, + "learning_rate": 0.000396999186440938, + "loss": 0.83295393, + "num_input_tokens_seen": 251256176, + "router_z_loss_mlp": 0.29882812, + "step": 3010, + "time_per_iteration": 2.844270944595337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096543, + "balance_loss_mlp": 1.06533396, + "epoch": 0.5792612543285879, + "flos": 523064936448.0, + "grad_norm": 0.06262665363935188, + "language_loss": 0.85208702, + "learning_rate": 0.000396694346213777, + "loss": 0.86305249, + "num_input_tokens_seen": 251325344, + "router_z_loss_mlp": 0.31176758, + "step": 3011, + "time_per_iteration": 2.613032817840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109368, + "balance_loss_mlp": 1.06492627, + "epoch": 0.5794536360138515, + "flos": 876557915136.0, + "grad_norm": 0.05937601459412264, + "language_loss": 0.83947617, + "learning_rate": 0.0003963895460934276, + "loss": 0.85041296, + "num_input_tokens_seen": 251406656, + "router_z_loss_mlp": 0.28735352, + "step": 3012, + "time_per_iteration": 3.124514102935791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091421, + "balance_loss_mlp": 1.05992579, + "epoch": 0.5796460176991151, + "flos": 401436311040.0, + "grad_norm": 0.07020347624432877, + "language_loss": 0.8493948, + "learning_rate": 0.00039608478619822376, + "loss": 0.86030906, + "num_input_tokens_seen": 251467760, + "router_z_loss_mlp": 0.31494141, + "step": 3013, + "time_per_iteration": 2.411346912384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084005, + "balance_loss_mlp": 1.05544281, + "epoch": 0.5798383993843786, + "flos": 618517721088.0, + "grad_norm": 0.05715104374994747, + "language_loss": 0.826662, + "learning_rate": 0.00039578006664648394, + "loss": 0.83750206, + "num_input_tokens_seen": 251542272, + "router_z_loss_mlp": 0.28564453, + "step": 3014, + "time_per_iteration": 2.7363553047180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082695, + "balance_loss_mlp": 1.05310702, + "epoch": 0.5800307810696421, + "flos": 844331019264.0, + "grad_norm": 0.06684904609650524, + "language_loss": 0.81588256, + "learning_rate": 0.0003954753875565105, + "loss": 0.82670951, + "num_input_tokens_seen": 251625584, + "router_z_loss_mlp": 0.2956543, + "step": 3015, + "time_per_iteration": 3.089769124984741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107747, + "balance_loss_mlp": 1.04890752, + "epoch": 0.5802231627549057, + "flos": 569276729856.0, + "grad_norm": 0.06478579772376787, + "language_loss": 0.82758343, + "learning_rate": 0.00039517074904659057, + "loss": 0.83835804, + "num_input_tokens_seen": 251696704, + "router_z_loss_mlp": 0.28564453, + "step": 3016, + "time_per_iteration": 2.7099101543426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084798, + "balance_loss_mlp": 1.05454302, + "epoch": 0.5804155444401693, + "flos": 660459930624.0, + "grad_norm": 0.05410468367994604, + "language_loss": 0.84939837, + "learning_rate": 0.00039486615123499535, + "loss": 0.8602463, + "num_input_tokens_seen": 251774784, + "router_z_loss_mlp": 0.30224609, + "step": 3017, + "time_per_iteration": 2.8504526615142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085654, + "balance_loss_mlp": 1.05532694, + "epoch": 0.5806079261254329, + "flos": 513992024064.0, + "grad_norm": 0.05526317953318916, + "language_loss": 0.85137427, + "learning_rate": 0.00039456159423997996, + "loss": 0.86223084, + "num_input_tokens_seen": 251844768, + "router_z_loss_mlp": 0.30297852, + "step": 3018, + "time_per_iteration": 2.633484363555908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082353, + "balance_loss_mlp": 1.0523833, + "epoch": 0.5808003078106965, + "flos": 528646487040.0, + "grad_norm": 0.07104600615119407, + "language_loss": 0.8999185, + "learning_rate": 0.00039425707817978406, + "loss": 0.91074204, + "num_input_tokens_seen": 251912736, + "router_z_loss_mlp": 0.29956055, + "step": 3019, + "time_per_iteration": 2.6299033164978027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082814, + "balance_loss_mlp": 1.05241609, + "epoch": 0.58099268949596, + "flos": 477028611072.0, + "grad_norm": 0.05724387536038855, + "language_loss": 0.83951199, + "learning_rate": 0.00039395260317263124, + "loss": 0.85034013, + "num_input_tokens_seen": 251979328, + "router_z_loss_mlp": 0.30395508, + "step": 3020, + "time_per_iteration": 2.5456759929656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080319, + "balance_loss_mlp": 1.04996824, + "epoch": 0.5811850711812235, + "flos": 517609294848.0, + "grad_norm": 0.07612516842687451, + "language_loss": 0.85048491, + "learning_rate": 0.0003936481693367291, + "loss": 0.86128807, + "num_input_tokens_seen": 252050928, + "router_z_loss_mlp": 0.3034668, + "step": 3021, + "time_per_iteration": 2.7192864418029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094567, + "balance_loss_mlp": 1.06259549, + "epoch": 0.5813774528664871, + "flos": 616422389760.0, + "grad_norm": 0.08707963459833061, + "language_loss": 0.882092, + "learning_rate": 0.0003933437767902697, + "loss": 0.89303768, + "num_input_tokens_seen": 252126496, + "router_z_loss_mlp": 0.31958008, + "step": 3022, + "time_per_iteration": 2.7938294410705566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088206, + "balance_loss_mlp": 1.05792677, + "epoch": 0.5815698345517507, + "flos": 567475435008.0, + "grad_norm": 0.07541432505918821, + "language_loss": 0.7834546, + "learning_rate": 0.00039303942565142825, + "loss": 0.79433668, + "num_input_tokens_seen": 252203008, + "router_z_loss_mlp": 0.30249023, + "step": 3023, + "time_per_iteration": 2.7417471408843994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091198, + "balance_loss_mlp": 1.06089532, + "epoch": 0.5817622162370142, + "flos": 563168775168.0, + "grad_norm": 0.05482425315239383, + "language_loss": 0.76731157, + "learning_rate": 0.0003927351160383644, + "loss": 0.77822357, + "num_input_tokens_seen": 252283440, + "router_z_loss_mlp": 0.30249023, + "step": 3024, + "time_per_iteration": 2.804474353790283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091546, + "balance_loss_mlp": 1.06193483, + "epoch": 0.5819545979222778, + "flos": 459216470016.0, + "grad_norm": 0.05202928961884776, + "language_loss": 0.77983212, + "learning_rate": 0.000392430848069222, + "loss": 0.79074758, + "num_input_tokens_seen": 252351760, + "router_z_loss_mlp": 0.29589844, + "step": 3025, + "time_per_iteration": 2.530200958251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097141, + "balance_loss_mlp": 1.06814933, + "epoch": 0.5821469796075414, + "flos": 541475062272.0, + "grad_norm": 0.058580785743037773, + "language_loss": 0.82503867, + "learning_rate": 0.00039212662186212795, + "loss": 0.8360101, + "num_input_tokens_seen": 252418480, + "router_z_loss_mlp": 0.28979492, + "step": 3026, + "time_per_iteration": 2.592423677444458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094902, + "balance_loss_mlp": 1.06676841, + "epoch": 0.582339361292805, + "flos": 552262634496.0, + "grad_norm": 0.04855017878997747, + "language_loss": 0.7719928, + "learning_rate": 0.0003918224375351934, + "loss": 0.78294182, + "num_input_tokens_seen": 252493712, + "router_z_loss_mlp": 0.28149414, + "step": 3027, + "time_per_iteration": 2.7347710132598877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101546, + "balance_loss_mlp": 1.0716958, + "epoch": 0.5825317429780685, + "flos": 496399767552.0, + "grad_norm": 0.05175541468331668, + "language_loss": 0.7881335, + "learning_rate": 0.0003915182952065135, + "loss": 0.79914892, + "num_input_tokens_seen": 252566096, + "router_z_loss_mlp": 0.29858398, + "step": 3028, + "time_per_iteration": 2.698678493499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105793, + "balance_loss_mlp": 1.07684946, + "epoch": 0.582724124663332, + "flos": 564162112512.0, + "grad_norm": 0.051679899573834884, + "language_loss": 0.87814313, + "learning_rate": 0.0003912141949941664, + "loss": 0.88920105, + "num_input_tokens_seen": 252639424, + "router_z_loss_mlp": 0.2890625, + "step": 3029, + "time_per_iteration": 2.703824520111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107968, + "balance_loss_mlp": 1.07675922, + "epoch": 0.5829165063485956, + "flos": 492132754944.0, + "grad_norm": 0.07311487113166662, + "language_loss": 0.82985795, + "learning_rate": 0.0003909101370162143, + "loss": 0.84093761, + "num_input_tokens_seen": 252706672, + "router_z_loss_mlp": 0.31201172, + "step": 3030, + "time_per_iteration": 2.601590633392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101355, + "balance_loss_mlp": 1.00611103, + "epoch": 0.5831088880338592, + "flos": 1528880997888.0, + "grad_norm": 0.01566462127280147, + "language_loss": 0.72433889, + "learning_rate": 0.00039060612139070326, + "loss": 0.73447442, + "num_input_tokens_seen": 252932464, + "router_z_loss_mlp": 0.07421875, + "step": 3031, + "time_per_iteration": 4.907916307449341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103812, + "balance_loss_mlp": 1.07403314, + "epoch": 0.5833012697191228, + "flos": 618011140608.0, + "grad_norm": 0.05748921462157389, + "language_loss": 0.8307178, + "learning_rate": 0.0003903021482356622, + "loss": 0.84175599, + "num_input_tokens_seen": 253011920, + "router_z_loss_mlp": 0.29760742, + "step": 3032, + "time_per_iteration": 2.8251240253448486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104484, + "balance_loss_mlp": 1.07525432, + "epoch": 0.5834936514043862, + "flos": 767920849920.0, + "grad_norm": 0.054780146703337314, + "language_loss": 0.82722723, + "learning_rate": 0.00038999821766910465, + "loss": 0.83827209, + "num_input_tokens_seen": 253091552, + "router_z_loss_mlp": 0.29248047, + "step": 3033, + "time_per_iteration": 2.9882729053497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108478, + "balance_loss_mlp": 1.07996285, + "epoch": 0.5836860330896498, + "flos": 458371436544.0, + "grad_norm": 0.08031037628307693, + "language_loss": 0.86154497, + "learning_rate": 0.00038969432980902606, + "loss": 0.87262976, + "num_input_tokens_seen": 253158608, + "router_z_loss_mlp": 0.28540039, + "step": 3034, + "time_per_iteration": 2.597313642501831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018692, + "balance_loss_mlp": 1.01149189, + "epoch": 0.5838784147749134, + "flos": 1361225585664.0, + "grad_norm": 0.013503469394203483, + "language_loss": 0.79784501, + "learning_rate": 0.0003893904847734068, + "loss": 0.80803192, + "num_input_tokens_seen": 253381184, + "router_z_loss_mlp": 0.07177734, + "step": 3035, + "time_per_iteration": 4.801652669906616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113026, + "balance_loss_mlp": 1.08374798, + "epoch": 0.584070796460177, + "flos": 567211133952.0, + "grad_norm": 0.0646542819028206, + "language_loss": 0.82506442, + "learning_rate": 0.00038908668268020953, + "loss": 0.83619463, + "num_input_tokens_seen": 253452880, + "router_z_loss_mlp": 0.29223633, + "step": 3036, + "time_per_iteration": 2.6857457160949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112518, + "balance_loss_mlp": 1.08381224, + "epoch": 0.5842631781454406, + "flos": 611483240448.0, + "grad_norm": 0.21422512196310703, + "language_loss": 0.85166728, + "learning_rate": 0.00038878292364738097, + "loss": 0.86279243, + "num_input_tokens_seen": 253530000, + "router_z_loss_mlp": 0.28662109, + "step": 3037, + "time_per_iteration": 2.776686191558838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106641, + "balance_loss_mlp": 1.07726789, + "epoch": 0.5844555598307041, + "flos": 463384737792.0, + "grad_norm": 0.0719771880124652, + "language_loss": 0.87355781, + "learning_rate": 0.0003884792077928508, + "loss": 0.88462424, + "num_input_tokens_seen": 253593504, + "router_z_loss_mlp": 0.29345703, + "step": 3038, + "time_per_iteration": 2.5682616233825684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102228, + "balance_loss_mlp": 1.07304573, + "epoch": 0.5846479415159677, + "flos": 410215186944.0, + "grad_norm": 0.06153670645771429, + "language_loss": 0.7661767, + "learning_rate": 0.0003881755352345322, + "loss": 0.77719897, + "num_input_tokens_seen": 253657904, + "router_z_loss_mlp": 0.29174805, + "step": 3039, + "time_per_iteration": 2.5531814098358154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104927, + "balance_loss_mlp": 1.07560194, + "epoch": 0.5848403232012312, + "flos": 491297633280.0, + "grad_norm": 0.05739173880603102, + "language_loss": 0.86896229, + "learning_rate": 0.0003878719060903207, + "loss": 0.88001162, + "num_input_tokens_seen": 253725280, + "router_z_loss_mlp": 0.29296875, + "step": 3040, + "time_per_iteration": 2.593386650085449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098868, + "balance_loss_mlp": 1.06908977, + "epoch": 0.5850327048864948, + "flos": 584417949696.0, + "grad_norm": 0.068924296543817, + "language_loss": 0.84256113, + "learning_rate": 0.0003875683204780961, + "loss": 0.85354984, + "num_input_tokens_seen": 253795040, + "router_z_loss_mlp": 0.29785156, + "step": 3041, + "time_per_iteration": 2.6921916007995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100828, + "balance_loss_mlp": 1.07145464, + "epoch": 0.5852250865717584, + "flos": 651545233920.0, + "grad_norm": 0.07404975426077917, + "language_loss": 0.85055083, + "learning_rate": 0.00038726477851572043, + "loss": 0.86155903, + "num_input_tokens_seen": 253866384, + "router_z_loss_mlp": 0.29394531, + "step": 3042, + "time_per_iteration": 2.76772403717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090937, + "balance_loss_mlp": 1.06249356, + "epoch": 0.5854174682570219, + "flos": 534588885504.0, + "grad_norm": 0.06423863125550561, + "language_loss": 0.80573255, + "learning_rate": 0.0003869612803210395, + "loss": 0.81664193, + "num_input_tokens_seen": 253935712, + "router_z_loss_mlp": 0.28442383, + "step": 3043, + "time_per_iteration": 2.6271820068359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092493, + "balance_loss_mlp": 1.06314421, + "epoch": 0.5856098499422855, + "flos": 509752175616.0, + "grad_norm": 0.07232729129784332, + "language_loss": 0.83455092, + "learning_rate": 0.0003866578260118817, + "loss": 0.84547591, + "num_input_tokens_seen": 254003152, + "router_z_loss_mlp": 0.29345703, + "step": 3044, + "time_per_iteration": 2.583698272705078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084561, + "balance_loss_mlp": 1.05616593, + "epoch": 0.5858022316275491, + "flos": 593893555200.0, + "grad_norm": 0.059856611418728146, + "language_loss": 0.83175647, + "learning_rate": 0.0003863544157060581, + "loss": 0.84260201, + "num_input_tokens_seen": 254072816, + "router_z_loss_mlp": 0.28369141, + "step": 3045, + "time_per_iteration": 2.656282663345337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090685, + "balance_loss_mlp": 1.06166923, + "epoch": 0.5859946133128127, + "flos": 559126416384.0, + "grad_norm": 0.05199684229497746, + "language_loss": 0.82254589, + "learning_rate": 0.0003860510495213634, + "loss": 0.8334527, + "num_input_tokens_seen": 254152800, + "router_z_loss_mlp": 0.28979492, + "step": 3046, + "time_per_iteration": 2.7998342514038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090034, + "balance_loss_mlp": 1.05946922, + "epoch": 0.5861869949980761, + "flos": 553695740928.0, + "grad_norm": 0.08208062967584176, + "language_loss": 0.78349328, + "learning_rate": 0.0003857477275755746, + "loss": 0.7943936, + "num_input_tokens_seen": 254224384, + "router_z_loss_mlp": 0.30517578, + "step": 3047, + "time_per_iteration": 2.6120448112487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088733, + "balance_loss_mlp": 1.05940795, + "epoch": 0.5863793766833397, + "flos": 718667375616.0, + "grad_norm": 0.0525859268526321, + "language_loss": 0.83988523, + "learning_rate": 0.00038544444998645167, + "loss": 0.8507725, + "num_input_tokens_seen": 254310960, + "router_z_loss_mlp": 0.29296875, + "step": 3048, + "time_per_iteration": 2.9847609996795654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085173, + "balance_loss_mlp": 1.0563724, + "epoch": 0.5865717583686033, + "flos": 472289522688.0, + "grad_norm": 0.06739730522499447, + "language_loss": 0.82059789, + "learning_rate": 0.00038514121687173767, + "loss": 0.83144969, + "num_input_tokens_seen": 254378336, + "router_z_loss_mlp": 0.28808594, + "step": 3049, + "time_per_iteration": 2.619170904159546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081984, + "balance_loss_mlp": 1.0529443, + "epoch": 0.5867641400538669, + "flos": 813482901504.0, + "grad_norm": 0.07072588382777995, + "language_loss": 0.82076973, + "learning_rate": 0.00038483802834915807, + "loss": 0.83158958, + "num_input_tokens_seen": 254454352, + "router_z_loss_mlp": 0.29003906, + "step": 3050, + "time_per_iteration": 2.9947521686553955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076603, + "balance_loss_mlp": 1.04742062, + "epoch": 0.5869565217391305, + "flos": 486531380736.0, + "grad_norm": 0.0556694240307722, + "language_loss": 0.7980268, + "learning_rate": 0.00038453488453642074, + "loss": 0.80879277, + "num_input_tokens_seen": 254526352, + "router_z_loss_mlp": 0.29174805, + "step": 3051, + "time_per_iteration": 2.659647226333618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081993, + "balance_loss_mlp": 1.05133235, + "epoch": 0.587148903424394, + "flos": 569385386496.0, + "grad_norm": 0.055022006168623364, + "language_loss": 0.8682425, + "learning_rate": 0.00038423178555121697, + "loss": 0.87906241, + "num_input_tokens_seen": 254598720, + "router_z_loss_mlp": 0.30664062, + "step": 3052, + "time_per_iteration": 2.682971954345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078016, + "balance_loss_mlp": 1.0489769, + "epoch": 0.5873412851096576, + "flos": 747296824320.0, + "grad_norm": 0.05776598371070369, + "language_loss": 0.85701603, + "learning_rate": 0.00038392873151121994, + "loss": 0.86779618, + "num_input_tokens_seen": 254683664, + "router_z_loss_mlp": 0.29052734, + "step": 3053, + "time_per_iteration": 3.060055732727051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077537, + "balance_loss_mlp": 1.04883146, + "epoch": 0.5875336667949211, + "flos": 528142477824.0, + "grad_norm": 0.06401371867882108, + "language_loss": 0.83262593, + "learning_rate": 0.0003836257225340859, + "loss": 0.84340131, + "num_input_tokens_seen": 254754688, + "router_z_loss_mlp": 0.28686523, + "step": 3054, + "time_per_iteration": 2.680649995803833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079048, + "balance_loss_mlp": 1.04853082, + "epoch": 0.5877260484801847, + "flos": 824166586368.0, + "grad_norm": 0.058869654242756926, + "language_loss": 0.82344568, + "learning_rate": 0.00038332275873745336, + "loss": 0.83423615, + "num_input_tokens_seen": 254838976, + "router_z_loss_mlp": 0.3046875, + "step": 3055, + "time_per_iteration": 3.036266565322876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108387, + "balance_loss_mlp": 1.05485463, + "epoch": 0.5879184301654482, + "flos": 591598162944.0, + "grad_norm": 0.05256953045681507, + "language_loss": 0.83349717, + "learning_rate": 0.0003830198402389431, + "loss": 0.84433585, + "num_input_tokens_seen": 254912912, + "router_z_loss_mlp": 0.2902832, + "step": 3056, + "time_per_iteration": 2.68835711479187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069074, + "balance_loss_mlp": 1.06163549, + "epoch": 0.5881108118507118, + "flos": 1545805513728.0, + "grad_norm": 0.04626706953255302, + "language_loss": 0.77348936, + "learning_rate": 0.0003827169671561585, + "loss": 0.78418016, + "num_input_tokens_seen": 255151488, + "router_z_loss_mlp": 0.07421875, + "step": 3057, + "time_per_iteration": 4.978636026382446 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082198, + "balance_loss_mlp": 1.05349255, + "epoch": 0.5883031935359754, + "flos": 489597654528.0, + "grad_norm": 0.07448060145489646, + "language_loss": 0.83136308, + "learning_rate": 0.0003824141396066855, + "loss": 0.84218502, + "num_input_tokens_seen": 255218896, + "router_z_loss_mlp": 0.28710938, + "step": 3058, + "time_per_iteration": 2.5531108379364014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088619, + "balance_loss_mlp": 1.05910254, + "epoch": 0.588495575221239, + "flos": 582836539392.0, + "grad_norm": 0.059082946906010764, + "language_loss": 0.82999164, + "learning_rate": 0.000382111357708092, + "loss": 0.84087777, + "num_input_tokens_seen": 255287408, + "router_z_loss_mlp": 0.29541016, + "step": 3059, + "time_per_iteration": 2.699920654296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088385, + "balance_loss_mlp": 1.05917883, + "epoch": 0.5886879569065026, + "flos": 661048003584.0, + "grad_norm": 0.071653907002528, + "language_loss": 0.84021831, + "learning_rate": 0.00038180862157792864, + "loss": 0.85110211, + "num_input_tokens_seen": 255358432, + "router_z_loss_mlp": 0.29174805, + "step": 3060, + "time_per_iteration": 2.8073549270629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084579, + "balance_loss_mlp": 1.05642152, + "epoch": 0.588880338591766, + "flos": 562657425408.0, + "grad_norm": 0.05679216094879844, + "language_loss": 0.82328987, + "learning_rate": 0.0003815059313337279, + "loss": 0.83413565, + "num_input_tokens_seen": 255425744, + "router_z_loss_mlp": 0.28198242, + "step": 3061, + "time_per_iteration": 2.6649534702301025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086963, + "balance_loss_mlp": 1.05906773, + "epoch": 0.5890727202770296, + "flos": 554730923520.0, + "grad_norm": 0.07322136366051005, + "language_loss": 0.78155029, + "learning_rate": 0.00038120328709300436, + "loss": 0.79241997, + "num_input_tokens_seen": 255505808, + "router_z_loss_mlp": 0.27905273, + "step": 3062, + "time_per_iteration": 2.9070422649383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091769, + "balance_loss_mlp": 1.06191885, + "epoch": 0.5892651019622932, + "flos": 655520781312.0, + "grad_norm": 0.07246450050077374, + "language_loss": 0.83913672, + "learning_rate": 0.0003809006889732549, + "loss": 0.85005438, + "num_input_tokens_seen": 255580160, + "router_z_loss_mlp": 0.29833984, + "step": 3063, + "time_per_iteration": 2.803724527359009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092624, + "balance_loss_mlp": 1.06420445, + "epoch": 0.5894574836475568, + "flos": 453202490880.0, + "grad_norm": 0.05969034427320992, + "language_loss": 0.88370293, + "learning_rate": 0.0003805981370919589, + "loss": 0.89462918, + "num_input_tokens_seen": 255644016, + "router_z_loss_mlp": 0.28442383, + "step": 3064, + "time_per_iteration": 2.495248556137085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086784, + "balance_loss_mlp": 1.05877018, + "epoch": 0.5896498653328203, + "flos": 519032489472.0, + "grad_norm": 0.05081424319280643, + "language_loss": 0.83982229, + "learning_rate": 0.0003802956315665771, + "loss": 0.85069013, + "num_input_tokens_seen": 255718192, + "router_z_loss_mlp": 0.28027344, + "step": 3065, + "time_per_iteration": 2.6511592864990234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091365, + "balance_loss_mlp": 1.06182539, + "epoch": 0.5898422470180839, + "flos": 549050628096.0, + "grad_norm": 0.06728201091458674, + "language_loss": 0.81791949, + "learning_rate": 0.0003799931725145529, + "loss": 0.8288331, + "num_input_tokens_seen": 255787696, + "router_z_loss_mlp": 0.29516602, + "step": 3066, + "time_per_iteration": 2.6066951751708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095665, + "balance_loss_mlp": 1.06729341, + "epoch": 0.5900346287033474, + "flos": 524312663040.0, + "grad_norm": 0.05193283223246739, + "language_loss": 0.86020327, + "learning_rate": 0.00037969076005331083, + "loss": 0.87115991, + "num_input_tokens_seen": 255862992, + "router_z_loss_mlp": 0.28369141, + "step": 3067, + "time_per_iteration": 2.763853073120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096344, + "balance_loss_mlp": 1.06713736, + "epoch": 0.590227010388611, + "flos": 567156805632.0, + "grad_norm": 0.05663918686290471, + "language_loss": 0.88129491, + "learning_rate": 0.00037938839430025817, + "loss": 0.89225829, + "num_input_tokens_seen": 255931872, + "router_z_loss_mlp": 0.29248047, + "step": 3068, + "time_per_iteration": 2.6258280277252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089417, + "balance_loss_mlp": 1.06092644, + "epoch": 0.5904193920738746, + "flos": 583333208064.0, + "grad_norm": 0.05275324094783275, + "language_loss": 0.85889924, + "learning_rate": 0.0003790860753727835, + "loss": 0.86979342, + "num_input_tokens_seen": 256004656, + "router_z_loss_mlp": 0.28491211, + "step": 3069, + "time_per_iteration": 2.7926387786865234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086892, + "balance_loss_mlp": 1.05799568, + "epoch": 0.5906117737591381, + "flos": 529701493248.0, + "grad_norm": 0.0573953914976859, + "language_loss": 0.8280952, + "learning_rate": 0.00037878380338825766, + "loss": 0.83896416, + "num_input_tokens_seen": 256076944, + "router_z_loss_mlp": 0.28881836, + "step": 3070, + "time_per_iteration": 2.6791534423828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089394, + "balance_loss_mlp": 1.06209493, + "epoch": 0.5908041554444017, + "flos": 684229151232.0, + "grad_norm": 0.054269754776710775, + "language_loss": 0.81082213, + "learning_rate": 0.00037848157846403287, + "loss": 0.82171613, + "num_input_tokens_seen": 256154768, + "router_z_loss_mlp": 0.2734375, + "step": 3071, + "time_per_iteration": 2.897139549255371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095364, + "balance_loss_mlp": 1.06792235, + "epoch": 0.5909965371296653, + "flos": 550001746944.0, + "grad_norm": 0.0725138562855444, + "language_loss": 0.83259237, + "learning_rate": 0.0003781794007174435, + "loss": 0.84354603, + "num_input_tokens_seen": 256230896, + "router_z_loss_mlp": 0.2746582, + "step": 3072, + "time_per_iteration": 2.724810838699341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0103656, + "balance_loss_mlp": 1.02988398, + "epoch": 0.5911889188149289, + "flos": 1492361750016.0, + "grad_norm": 0.01939748854391394, + "language_loss": 0.74074531, + "learning_rate": 0.0003778772702658051, + "loss": 0.75111091, + "num_input_tokens_seen": 256462336, + "router_z_loss_mlp": 0.06689453, + "step": 3073, + "time_per_iteration": 4.9330198764801025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090512, + "balance_loss_mlp": 1.06285512, + "epoch": 0.5913813005001923, + "flos": 487880423424.0, + "grad_norm": 0.048822002095482486, + "language_loss": 0.81208611, + "learning_rate": 0.0003775751872264152, + "loss": 0.82299125, + "num_input_tokens_seen": 256539376, + "router_z_loss_mlp": 0.27661133, + "step": 3074, + "time_per_iteration": 2.7631497383117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084721, + "balance_loss_mlp": 1.05599189, + "epoch": 0.5915736821854559, + "flos": 573331198464.0, + "grad_norm": 0.06348444489710649, + "language_loss": 0.86787391, + "learning_rate": 0.0003772731517165527, + "loss": 0.87872112, + "num_input_tokens_seen": 256617728, + "router_z_loss_mlp": 0.28710938, + "step": 3075, + "time_per_iteration": 2.7517099380493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089134, + "balance_loss_mlp": 1.06069052, + "epoch": 0.5917660638707195, + "flos": 789518389248.0, + "grad_norm": 0.059695821747375526, + "language_loss": 0.83545357, + "learning_rate": 0.0003769711638534784, + "loss": 0.84634489, + "num_input_tokens_seen": 256696032, + "router_z_loss_mlp": 0.28466797, + "step": 3076, + "time_per_iteration": 2.9352333545684814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090964, + "balance_loss_mlp": 1.06209183, + "epoch": 0.5919584455559831, + "flos": 528740462592.0, + "grad_norm": 0.08879190082108672, + "language_loss": 0.79118001, + "learning_rate": 0.00037666922375443446, + "loss": 0.80208963, + "num_input_tokens_seen": 256767360, + "router_z_loss_mlp": 0.28857422, + "step": 3077, + "time_per_iteration": 2.5947184562683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093967, + "balance_loss_mlp": 1.06578577, + "epoch": 0.5921508272412467, + "flos": 560606510592.0, + "grad_norm": 0.06374349472109522, + "language_loss": 0.81828058, + "learning_rate": 0.00037636733153664396, + "loss": 0.82922018, + "num_input_tokens_seen": 256844848, + "router_z_loss_mlp": 0.28149414, + "step": 3078, + "time_per_iteration": 2.8191051483154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109303, + "balance_loss_mlp": 1.0645864, + "epoch": 0.5923432089265102, + "flos": 563272662528.0, + "grad_norm": 0.06406278721713668, + "language_loss": 0.80298102, + "learning_rate": 0.0003760654873173124, + "loss": 0.81391132, + "num_input_tokens_seen": 256916688, + "router_z_loss_mlp": 0.28466797, + "step": 3079, + "time_per_iteration": 2.656822919845581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089541, + "balance_loss_mlp": 1.06081128, + "epoch": 0.5925355906117737, + "flos": 495740113920.0, + "grad_norm": 0.04854482848269962, + "language_loss": 0.82530022, + "learning_rate": 0.00037576369121362566, + "loss": 0.83619559, + "num_input_tokens_seen": 256985520, + "router_z_loss_mlp": 0.28759766, + "step": 3080, + "time_per_iteration": 2.589050531387329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097703, + "balance_loss_mlp": 1.06840181, + "epoch": 0.5927279722970373, + "flos": 566249730048.0, + "grad_norm": 0.05673956944694001, + "language_loss": 0.82090509, + "learning_rate": 0.0003754619433427516, + "loss": 0.83188212, + "num_input_tokens_seen": 257067552, + "router_z_loss_mlp": 0.29272461, + "step": 3081, + "time_per_iteration": 2.8826987743377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086639, + "balance_loss_mlp": 1.05845797, + "epoch": 0.5929203539823009, + "flos": 666970578432.0, + "grad_norm": 0.06493823771045844, + "language_loss": 0.78039849, + "learning_rate": 0.0003751602438218392, + "loss": 0.79126489, + "num_input_tokens_seen": 257138896, + "router_z_loss_mlp": 0.28222656, + "step": 3082, + "time_per_iteration": 2.815852642059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087731, + "balance_loss_mlp": 1.05952644, + "epoch": 0.5931127356675644, + "flos": 555744084480.0, + "grad_norm": 0.08102695368832301, + "language_loss": 0.83818078, + "learning_rate": 0.0003748585927680186, + "loss": 0.84905803, + "num_input_tokens_seen": 257210592, + "router_z_loss_mlp": 0.28198242, + "step": 3083, + "time_per_iteration": 2.6566061973571777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084121, + "balance_loss_mlp": 1.05651248, + "epoch": 0.593305117352828, + "flos": 535194210816.0, + "grad_norm": 0.0619003043193751, + "language_loss": 0.8314001, + "learning_rate": 0.00037455699029840086, + "loss": 0.84224129, + "num_input_tokens_seen": 257276208, + "router_z_loss_mlp": 0.27612305, + "step": 3084, + "time_per_iteration": 2.609382152557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081588, + "balance_loss_mlp": 1.05436099, + "epoch": 0.5934974990380916, + "flos": 593957795328.0, + "grad_norm": 0.05433571826648474, + "language_loss": 0.84684891, + "learning_rate": 0.0003742554365300787, + "loss": 0.85766476, + "num_input_tokens_seen": 257351920, + "router_z_loss_mlp": 0.27270508, + "step": 3085, + "time_per_iteration": 2.725409746170044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086595, + "balance_loss_mlp": 1.05927253, + "epoch": 0.5936898807233552, + "flos": 712673220096.0, + "grad_norm": 0.05832989485618193, + "language_loss": 0.79031849, + "learning_rate": 0.0003739539315801255, + "loss": 0.80118442, + "num_input_tokens_seen": 257430016, + "router_z_loss_mlp": 0.27331543, + "step": 3086, + "time_per_iteration": 2.9751360416412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092425, + "balance_loss_mlp": 1.06493533, + "epoch": 0.5938822624086187, + "flos": 391896465408.0, + "grad_norm": 0.05988774460659005, + "language_loss": 0.9182803, + "learning_rate": 0.000373652475565596, + "loss": 0.92920458, + "num_input_tokens_seen": 257492224, + "router_z_loss_mlp": 0.27490234, + "step": 3087, + "time_per_iteration": 2.535181999206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090266, + "balance_loss_mlp": 1.06144142, + "epoch": 0.5940746440938822, + "flos": 480285033984.0, + "grad_norm": 0.07303028521355714, + "language_loss": 0.81608456, + "learning_rate": 0.00037335106860352587, + "loss": 0.82698727, + "num_input_tokens_seen": 257567824, + "router_z_loss_mlp": 0.28808594, + "step": 3088, + "time_per_iteration": 2.6671407222747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094636, + "balance_loss_mlp": 1.06545377, + "epoch": 0.5942670257791458, + "flos": 483336626688.0, + "grad_norm": 0.0577260245362681, + "language_loss": 0.83174306, + "learning_rate": 0.00037304971081093146, + "loss": 0.84268945, + "num_input_tokens_seen": 257635488, + "router_z_loss_mlp": 0.29199219, + "step": 3089, + "time_per_iteration": 2.5568172931671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091399, + "balance_loss_mlp": 1.06479192, + "epoch": 0.5944594074644094, + "flos": 547936151040.0, + "grad_norm": 0.05440667028717182, + "language_loss": 0.80792761, + "learning_rate": 0.00037274840230481024, + "loss": 0.81884158, + "num_input_tokens_seen": 257709552, + "router_z_loss_mlp": 0.26635742, + "step": 3090, + "time_per_iteration": 2.7040512561798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089877, + "balance_loss_mlp": 1.06152868, + "epoch": 0.594651789149673, + "flos": 449179955712.0, + "grad_norm": 0.07197994401815008, + "language_loss": 0.79483205, + "learning_rate": 0.00037244714320214077, + "loss": 0.80573082, + "num_input_tokens_seen": 257775520, + "router_z_loss_mlp": 0.28369141, + "step": 3091, + "time_per_iteration": 2.527803659439087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091312, + "balance_loss_mlp": 1.06317902, + "epoch": 0.5948441708349365, + "flos": 596267868672.0, + "grad_norm": 0.06270949928795992, + "language_loss": 0.83166003, + "learning_rate": 0.000372145933619882, + "loss": 0.84257317, + "num_input_tokens_seen": 257858560, + "router_z_loss_mlp": 0.28137207, + "step": 3092, + "time_per_iteration": 2.869267225265503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092404, + "balance_loss_mlp": 1.06455636, + "epoch": 0.5950365525202, + "flos": 548516883456.0, + "grad_norm": 0.059066436199884755, + "language_loss": 0.82841283, + "learning_rate": 0.000371844773674974, + "loss": 0.83933693, + "num_input_tokens_seen": 257928048, + "router_z_loss_mlp": 0.27856445, + "step": 3093, + "time_per_iteration": 2.6301257610321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097042, + "balance_loss_mlp": 1.06793106, + "epoch": 0.5952289342054636, + "flos": 654700340736.0, + "grad_norm": 0.06442112613973276, + "language_loss": 0.82118666, + "learning_rate": 0.0003715436634843375, + "loss": 0.83215708, + "num_input_tokens_seen": 258003088, + "router_z_loss_mlp": 0.29101562, + "step": 3094, + "time_per_iteration": 2.8569583892822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091347, + "balance_loss_mlp": 1.06466842, + "epoch": 0.5954213158907272, + "flos": 603364018176.0, + "grad_norm": 0.04641072419683149, + "language_loss": 0.80758119, + "learning_rate": 0.00037124260316487355, + "loss": 0.81849468, + "num_input_tokens_seen": 258084880, + "router_z_loss_mlp": 0.26708984, + "step": 3095, + "time_per_iteration": 2.8417470455169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095419, + "balance_loss_mlp": 1.06838274, + "epoch": 0.5956136975759908, + "flos": 486331319808.0, + "grad_norm": 0.05475651988922655, + "language_loss": 0.89790189, + "learning_rate": 0.0003709415928334643, + "loss": 0.90885603, + "num_input_tokens_seen": 258152032, + "router_z_loss_mlp": 0.27075195, + "step": 3096, + "time_per_iteration": 2.5519328117370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092318, + "balance_loss_mlp": 1.06382728, + "epoch": 0.5958060792612543, + "flos": 658777204224.0, + "grad_norm": 0.09831894239475095, + "language_loss": 0.80721879, + "learning_rate": 0.00037064063260697233, + "loss": 0.818142, + "num_input_tokens_seen": 258228896, + "router_z_loss_mlp": 0.28491211, + "step": 3097, + "time_per_iteration": 2.8612656593322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099184, + "balance_loss_mlp": 1.07157493, + "epoch": 0.5959984609465179, + "flos": 723559537152.0, + "grad_norm": 0.058836420710008684, + "language_loss": 0.78798771, + "learning_rate": 0.0003703397226022407, + "loss": 0.79897952, + "num_input_tokens_seen": 258311152, + "router_z_loss_mlp": 0.27612305, + "step": 3098, + "time_per_iteration": 3.069542169570923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039258, + "balance_loss_mlp": 1.03243947, + "epoch": 0.5961908426317815, + "flos": 1519849557504.0, + "grad_norm": 0.024027627375554906, + "language_loss": 0.75499874, + "learning_rate": 0.00037003886293609335, + "loss": 0.76539135, + "num_input_tokens_seen": 258540656, + "router_z_loss_mlp": 0.06835938, + "step": 3099, + "time_per_iteration": 4.940065860748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109756, + "balance_loss_mlp": 1.06966519, + "epoch": 0.596383224317045, + "flos": 532614693888.0, + "grad_norm": 0.059128365336986094, + "language_loss": 0.83247489, + "learning_rate": 0.0003697380537253339, + "loss": 0.84345049, + "num_input_tokens_seen": 258608960, + "router_z_loss_mlp": 0.27929688, + "step": 3100, + "time_per_iteration": 2.638352632522583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098347, + "balance_loss_mlp": 1.06973624, + "epoch": 0.5965756060023086, + "flos": 591210150912.0, + "grad_norm": 0.05513129923941457, + "language_loss": 0.82084006, + "learning_rate": 0.0003694372950867471, + "loss": 0.83182353, + "num_input_tokens_seen": 258684304, + "router_z_loss_mlp": 0.28637695, + "step": 3101, + "time_per_iteration": 2.7355875968933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101199, + "balance_loss_mlp": 1.07282722, + "epoch": 0.5967679876875721, + "flos": 862054327296.0, + "grad_norm": 0.05863829677079808, + "language_loss": 0.77766848, + "learning_rate": 0.0003691365871370976, + "loss": 0.78868043, + "num_input_tokens_seen": 258769472, + "router_z_loss_mlp": 0.28393555, + "step": 3102, + "time_per_iteration": 3.0227084159851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110316, + "balance_loss_mlp": 1.07533622, + "epoch": 0.5969603693728357, + "flos": 553834132992.0, + "grad_norm": 0.06404713166930852, + "language_loss": 0.85323572, + "learning_rate": 0.00036883592999313093, + "loss": 0.86426735, + "num_input_tokens_seen": 258841696, + "router_z_loss_mlp": 0.27832031, + "step": 3103, + "time_per_iteration": 2.659637689590454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097572, + "balance_loss_mlp": 1.0700587, + "epoch": 0.5971527510580993, + "flos": 718662606336.0, + "grad_norm": 0.05340010645713243, + "language_loss": 0.79008019, + "learning_rate": 0.0003685353237715722, + "loss": 0.80105591, + "num_input_tokens_seen": 258915616, + "router_z_loss_mlp": 0.27563477, + "step": 3104, + "time_per_iteration": 2.9019625186920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109062, + "balance_loss_mlp": 1.06272471, + "epoch": 0.5973451327433629, + "flos": 647631355392.0, + "grad_norm": 0.053396202956180965, + "language_loss": 0.81746447, + "learning_rate": 0.0003682347685891274, + "loss": 0.82837057, + "num_input_tokens_seen": 258994080, + "router_z_loss_mlp": 0.27893066, + "step": 3105, + "time_per_iteration": 2.8479247093200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093319, + "balance_loss_mlp": 1.06535256, + "epoch": 0.5975375144286263, + "flos": 721716397056.0, + "grad_norm": 0.061940030050424234, + "language_loss": 0.80626607, + "learning_rate": 0.0003679342645624822, + "loss": 0.81719923, + "num_input_tokens_seen": 259075968, + "router_z_loss_mlp": 0.2800293, + "step": 3106, + "time_per_iteration": 2.988600015640259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088181, + "balance_loss_mlp": 1.06088209, + "epoch": 0.5977298961138899, + "flos": 750961082880.0, + "grad_norm": 0.06552701347411696, + "language_loss": 0.82154477, + "learning_rate": 0.0003676338118083025, + "loss": 0.83242655, + "num_input_tokens_seen": 259162512, + "router_z_loss_mlp": 0.2734375, + "step": 3107, + "time_per_iteration": 3.0211057662963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091455, + "balance_loss_mlp": 1.06372714, + "epoch": 0.5979222777991535, + "flos": 530961702912.0, + "grad_norm": 0.05808577111452716, + "language_loss": 0.79585344, + "learning_rate": 0.0003673334104432347, + "loss": 0.806768, + "num_input_tokens_seen": 259228752, + "router_z_loss_mlp": 0.27758789, + "step": 3108, + "time_per_iteration": 2.6277918815612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109043, + "balance_loss_mlp": 1.06255877, + "epoch": 0.5981146594844171, + "flos": 621749551104.0, + "grad_norm": 0.05782699460566696, + "language_loss": 0.83817154, + "learning_rate": 0.0003670330605839048, + "loss": 0.84907585, + "num_input_tokens_seen": 259303440, + "router_z_loss_mlp": 0.27856445, + "step": 3109, + "time_per_iteration": 2.786181926727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094155, + "balance_loss_mlp": 1.06685627, + "epoch": 0.5983070411696807, + "flos": 603589045248.0, + "grad_norm": 0.06234839208499282, + "language_loss": 0.76878405, + "learning_rate": 0.0003667327623469191, + "loss": 0.77972555, + "num_input_tokens_seen": 259378752, + "router_z_loss_mlp": 0.27319336, + "step": 3110, + "time_per_iteration": 2.731876850128174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089583, + "balance_loss_mlp": 1.0621767, + "epoch": 0.5984994228549442, + "flos": 633483472896.0, + "grad_norm": 0.06451414709321307, + "language_loss": 0.78028917, + "learning_rate": 0.00036643251584886333, + "loss": 0.79118496, + "num_input_tokens_seen": 259454336, + "router_z_loss_mlp": 0.27429199, + "step": 3111, + "time_per_iteration": 2.796886682510376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088313, + "balance_loss_mlp": 1.06072783, + "epoch": 0.5986918045402078, + "flos": 525278836224.0, + "grad_norm": 0.06854484980093518, + "language_loss": 0.82222939, + "learning_rate": 0.00036613232120630393, + "loss": 0.83311254, + "num_input_tokens_seen": 259518960, + "router_z_loss_mlp": 0.27587891, + "step": 3112, + "time_per_iteration": 2.6065847873687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084107, + "balance_loss_mlp": 1.05594933, + "epoch": 0.5988841862254713, + "flos": 483180982272.0, + "grad_norm": 0.06819300023171558, + "language_loss": 0.80318254, + "learning_rate": 0.00036583217853578643, + "loss": 0.81402361, + "num_input_tokens_seen": 259584352, + "router_z_loss_mlp": 0.28173828, + "step": 3113, + "time_per_iteration": 2.5723838806152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088563, + "balance_loss_mlp": 1.06200337, + "epoch": 0.5990765679107349, + "flos": 1140149924352.0, + "grad_norm": 0.05495468357602656, + "language_loss": 0.77783948, + "learning_rate": 0.000365532087953837, + "loss": 0.78872508, + "num_input_tokens_seen": 259693152, + "router_z_loss_mlp": 0.26586914, + "step": 3114, + "time_per_iteration": 3.622190475463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081565, + "balance_loss_mlp": 1.05359864, + "epoch": 0.5992689495959984, + "flos": 516986717184.0, + "grad_norm": 0.07841874273871757, + "language_loss": 0.89431345, + "learning_rate": 0.00036523204957696065, + "loss": 0.90512908, + "num_input_tokens_seen": 259762048, + "router_z_loss_mlp": 0.27978516, + "step": 3115, + "time_per_iteration": 2.6414806842803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084485, + "balance_loss_mlp": 1.05627978, + "epoch": 0.599461331281262, + "flos": 744618562560.0, + "grad_norm": 0.0586823821525485, + "language_loss": 0.80958188, + "learning_rate": 0.00036493206352164324, + "loss": 0.8204267, + "num_input_tokens_seen": 259843184, + "router_z_loss_mlp": 0.28222656, + "step": 3116, + "time_per_iteration": 2.896613121032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080524, + "balance_loss_mlp": 1.05184269, + "epoch": 0.5996537129665256, + "flos": 592359132672.0, + "grad_norm": 0.05558165654665051, + "language_loss": 0.85426074, + "learning_rate": 0.000364632129904349, + "loss": 0.86506593, + "num_input_tokens_seen": 259912720, + "router_z_loss_mlp": 0.28662109, + "step": 3117, + "time_per_iteration": 2.7053070068359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079719, + "balance_loss_mlp": 1.05215788, + "epoch": 0.5998460946517892, + "flos": 559010419200.0, + "grad_norm": 0.05806752486487043, + "language_loss": 0.78326154, + "learning_rate": 0.00036433224884152283, + "loss": 0.79405868, + "num_input_tokens_seen": 259985472, + "router_z_loss_mlp": 0.27587891, + "step": 3118, + "time_per_iteration": 2.6854429244995117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083264, + "balance_loss_mlp": 1.0547967, + "epoch": 0.6000384763370528, + "flos": 484567100928.0, + "grad_norm": 0.06710995797512392, + "language_loss": 0.78089821, + "learning_rate": 0.00036403242044958875, + "loss": 0.79173082, + "num_input_tokens_seen": 260050336, + "router_z_loss_mlp": 0.28466797, + "step": 3119, + "time_per_iteration": 2.53751540184021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077727, + "balance_loss_mlp": 1.04949808, + "epoch": 0.6002308580223162, + "flos": 596767108608.0, + "grad_norm": 0.059219046094812676, + "language_loss": 0.91922826, + "learning_rate": 0.0003637326448449507, + "loss": 0.93000555, + "num_input_tokens_seen": 260120304, + "router_z_loss_mlp": 0.28222656, + "step": 3120, + "time_per_iteration": 2.7070553302764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075191, + "balance_loss_mlp": 1.04855967, + "epoch": 0.6004232397075798, + "flos": 545146661376.0, + "grad_norm": 0.05784920643643932, + "language_loss": 0.86244148, + "learning_rate": 0.00036343292214399177, + "loss": 0.87319338, + "num_input_tokens_seen": 260198304, + "router_z_loss_mlp": 0.2668457, + "step": 3121, + "time_per_iteration": 2.790273904800415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108187, + "balance_loss_mlp": 1.05368924, + "epoch": 0.6006156213928434, + "flos": 629947694592.0, + "grad_norm": 0.061762558273937264, + "language_loss": 0.77535498, + "learning_rate": 0.00036313325246307456, + "loss": 0.78617358, + "num_input_tokens_seen": 260277664, + "router_z_loss_mlp": 0.28149414, + "step": 3122, + "time_per_iteration": 2.8160674571990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085554, + "balance_loss_mlp": 1.05804014, + "epoch": 0.600808003078107, + "flos": 582315277824.0, + "grad_norm": 0.06096373394010022, + "language_loss": 0.8757152, + "learning_rate": 0.0003628336359185411, + "loss": 0.88657075, + "num_input_tokens_seen": 260350096, + "router_z_loss_mlp": 0.27539062, + "step": 3123, + "time_per_iteration": 2.6819381713867188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083737, + "balance_loss_mlp": 1.05708146, + "epoch": 0.6010003847633705, + "flos": 635274855936.0, + "grad_norm": 0.07022869927973763, + "language_loss": 0.75776213, + "learning_rate": 0.000362534072626713, + "loss": 0.76859951, + "num_input_tokens_seen": 260421888, + "router_z_loss_mlp": 0.2668457, + "step": 3124, + "time_per_iteration": 2.740907907485962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083476, + "balance_loss_mlp": 1.05572367, + "epoch": 0.6011927664486341, + "flos": 718763922432.0, + "grad_norm": 0.05823250121923288, + "language_loss": 0.81532884, + "learning_rate": 0.00036223456270389093, + "loss": 0.82616365, + "num_input_tokens_seen": 260499616, + "router_z_loss_mlp": 0.27758789, + "step": 3125, + "time_per_iteration": 2.9345879554748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090578, + "balance_loss_mlp": 1.06254041, + "epoch": 0.6013851481338977, + "flos": 499036184064.0, + "grad_norm": 0.05438265607227417, + "language_loss": 0.81106913, + "learning_rate": 0.00036193510626635517, + "loss": 0.82197487, + "num_input_tokens_seen": 260572048, + "router_z_loss_mlp": 0.28076172, + "step": 3126, + "time_per_iteration": 2.719505786895752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092388, + "balance_loss_mlp": 1.06581664, + "epoch": 0.6015775298191612, + "flos": 749587447296.0, + "grad_norm": 0.06352965026992909, + "language_loss": 0.8166849, + "learning_rate": 0.0003616357034303649, + "loss": 0.82760876, + "num_input_tokens_seen": 260644720, + "router_z_loss_mlp": 0.26623535, + "step": 3127, + "time_per_iteration": 2.917137861251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101882, + "balance_loss_mlp": 1.0748688, + "epoch": 0.6017699115044248, + "flos": 593063202816.0, + "grad_norm": 0.06152140222119449, + "language_loss": 0.7902928, + "learning_rate": 0.0003613363543121584, + "loss": 0.80131161, + "num_input_tokens_seen": 260724864, + "router_z_loss_mlp": 0.27050781, + "step": 3128, + "time_per_iteration": 2.8336853981018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098033, + "balance_loss_mlp": 1.07082987, + "epoch": 0.6019622931896883, + "flos": 515111270400.0, + "grad_norm": 0.1105531777946672, + "language_loss": 0.85000741, + "learning_rate": 0.00036103705902795357, + "loss": 0.86098778, + "num_input_tokens_seen": 260800896, + "router_z_loss_mlp": 0.2722168, + "step": 3129, + "time_per_iteration": 2.6958324909210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107326, + "balance_loss_mlp": 1.07933569, + "epoch": 0.6021546748749519, + "flos": 490469852160.0, + "grad_norm": 0.08057315277867966, + "language_loss": 0.79751796, + "learning_rate": 0.0003607378176939471, + "loss": 0.80859125, + "num_input_tokens_seen": 260872736, + "router_z_loss_mlp": 0.2800293, + "step": 3130, + "time_per_iteration": 2.609400510787964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109547, + "balance_loss_mlp": 1.06817079, + "epoch": 0.6023470565602155, + "flos": 541032721920.0, + "grad_norm": 0.0756423011045038, + "language_loss": 0.82227194, + "learning_rate": 0.00036043863042631465, + "loss": 0.83322662, + "num_input_tokens_seen": 260943264, + "router_z_loss_mlp": 0.2734375, + "step": 3131, + "time_per_iteration": 2.6571097373962402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090865, + "balance_loss_mlp": 1.06409097, + "epoch": 0.6025394382454791, + "flos": 845020408320.0, + "grad_norm": 0.07645469837417121, + "language_loss": 0.76662207, + "learning_rate": 0.00036013949734121133, + "loss": 0.77753073, + "num_input_tokens_seen": 261030064, + "router_z_loss_mlp": 0.26782227, + "step": 3132, + "time_per_iteration": 3.118265390396118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096725, + "balance_loss_mlp": 1.06995106, + "epoch": 0.6027318199307425, + "flos": 577173496320.0, + "grad_norm": 0.0687931043319398, + "language_loss": 0.82291925, + "learning_rate": 0.00035984041855477043, + "loss": 0.83388644, + "num_input_tokens_seen": 261106496, + "router_z_loss_mlp": 0.26794434, + "step": 3133, + "time_per_iteration": 2.777459144592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019209, + "balance_loss_mlp": 1.01186562, + "epoch": 0.6029242016160061, + "flos": 1470976754688.0, + "grad_norm": 0.01616325084905853, + "language_loss": 0.78709894, + "learning_rate": 0.00035954139418310495, + "loss": 0.79729104, + "num_input_tokens_seen": 261343248, + "router_z_loss_mlp": 0.07324219, + "step": 3134, + "time_per_iteration": 4.925475597381592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083762, + "balance_loss_mlp": 1.05736887, + "epoch": 0.6031165833012697, + "flos": 480744626688.0, + "grad_norm": 0.06318710690497562, + "language_loss": 0.79746044, + "learning_rate": 0.00035924242434230637, + "loss": 0.80829811, + "num_input_tokens_seen": 261416704, + "router_z_loss_mlp": 0.2644043, + "step": 3135, + "time_per_iteration": 2.7011537551879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085192, + "balance_loss_mlp": 1.05767858, + "epoch": 0.6033089649865333, + "flos": 499468612608.0, + "grad_norm": 0.07716145908862651, + "language_loss": 0.79063201, + "learning_rate": 0.00035894350914844516, + "loss": 0.80148399, + "num_input_tokens_seen": 261486688, + "router_z_loss_mlp": 0.27514648, + "step": 3136, + "time_per_iteration": 2.6126935482025146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088376, + "balance_loss_mlp": 1.05995679, + "epoch": 0.6035013466717969, + "flos": 556613710848.0, + "grad_norm": 0.06075860838457364, + "language_loss": 0.827613, + "learning_rate": 0.0003586446487175703, + "loss": 0.83849669, + "num_input_tokens_seen": 261557344, + "router_z_loss_mlp": 0.28417969, + "step": 3137, + "time_per_iteration": 2.675171375274658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088716, + "balance_loss_mlp": 1.06041527, + "epoch": 0.6036937283570604, + "flos": 594827421696.0, + "grad_norm": 0.0544690611172434, + "language_loss": 0.85478795, + "learning_rate": 0.0003583458431657099, + "loss": 0.86567509, + "num_input_tokens_seen": 261626240, + "router_z_loss_mlp": 0.28320312, + "step": 3138, + "time_per_iteration": 2.7620253562927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089567, + "balance_loss_mlp": 1.06336451, + "epoch": 0.603886110042324, + "flos": 540958569984.0, + "grad_norm": 0.07515995216766168, + "language_loss": 0.83139801, + "learning_rate": 0.00035804709260887056, + "loss": 0.84229362, + "num_input_tokens_seen": 261696368, + "router_z_loss_mlp": 0.26220703, + "step": 3139, + "time_per_iteration": 2.6879465579986572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087318, + "balance_loss_mlp": 1.05985248, + "epoch": 0.6040784917275875, + "flos": 518582808576.0, + "grad_norm": 0.052915045266918946, + "language_loss": 0.89835536, + "learning_rate": 0.0003577483971630373, + "loss": 0.90922856, + "num_input_tokens_seen": 261769104, + "router_z_loss_mlp": 0.27514648, + "step": 3140, + "time_per_iteration": 2.6586039066314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091399, + "balance_loss_mlp": 1.06398129, + "epoch": 0.6042708734128511, + "flos": 660751395840.0, + "grad_norm": 0.045195992370632855, + "language_loss": 0.85010505, + "learning_rate": 0.00035744975694417414, + "loss": 0.86101902, + "num_input_tokens_seen": 261844880, + "router_z_loss_mlp": 0.27416992, + "step": 3141, + "time_per_iteration": 2.8448941707611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084418, + "balance_loss_mlp": 1.05757236, + "epoch": 0.6044632550981146, + "flos": 572330520576.0, + "grad_norm": 0.07912966064455412, + "language_loss": 0.82233572, + "learning_rate": 0.00035715117206822344, + "loss": 0.83317983, + "num_input_tokens_seen": 261923280, + "router_z_loss_mlp": 0.26867676, + "step": 3142, + "time_per_iteration": 2.7542483806610107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087563, + "balance_loss_mlp": 1.06026399, + "epoch": 0.6046556367833782, + "flos": 546681083904.0, + "grad_norm": 0.0701313453845953, + "language_loss": 0.80890429, + "learning_rate": 0.0003568526426511065, + "loss": 0.81977987, + "num_input_tokens_seen": 261990832, + "router_z_loss_mlp": 0.27331543, + "step": 3143, + "time_per_iteration": 2.6046767234802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081818, + "balance_loss_mlp": 1.05658114, + "epoch": 0.6048480184686418, + "flos": 776838117888.0, + "grad_norm": 0.07379330487049819, + "language_loss": 0.83015585, + "learning_rate": 0.000356554168808722, + "loss": 0.84097409, + "num_input_tokens_seen": 262063760, + "router_z_loss_mlp": 0.25244141, + "step": 3144, + "time_per_iteration": 2.9466705322265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087357, + "balance_loss_mlp": 1.06141686, + "epoch": 0.6050404001539054, + "flos": 657144036864.0, + "grad_norm": 0.06250797721947925, + "language_loss": 0.84944713, + "learning_rate": 0.00035625575065694837, + "loss": 0.86032069, + "num_input_tokens_seen": 262137968, + "router_z_loss_mlp": 0.25952148, + "step": 3145, + "time_per_iteration": 2.9049606323242188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083147, + "balance_loss_mlp": 1.05501366, + "epoch": 0.605232781839169, + "flos": 548983816704.0, + "grad_norm": 0.05947586112106144, + "language_loss": 0.77504069, + "learning_rate": 0.0003559573883116415, + "loss": 0.78587222, + "num_input_tokens_seen": 262211264, + "router_z_loss_mlp": 0.28125, + "step": 3146, + "time_per_iteration": 2.70141339302063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095869, + "balance_loss_mlp": 1.06964314, + "epoch": 0.6054251635244324, + "flos": 605402449920.0, + "grad_norm": 0.050725714839995426, + "language_loss": 0.85750544, + "learning_rate": 0.00035565908188863604, + "loss": 0.86846411, + "num_input_tokens_seen": 262289648, + "router_z_loss_mlp": 0.26269531, + "step": 3147, + "time_per_iteration": 2.822096586227417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097988, + "balance_loss_mlp": 1.07181001, + "epoch": 0.605617545209696, + "flos": 613679887872.0, + "grad_norm": 0.06536005907217222, + "language_loss": 0.79801714, + "learning_rate": 0.00035536083150374464, + "loss": 0.80899704, + "num_input_tokens_seen": 262362704, + "router_z_loss_mlp": 0.26220703, + "step": 3148, + "time_per_iteration": 2.883934736251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027241, + "balance_loss_mlp": 1.01980209, + "epoch": 0.6058099268949596, + "flos": 1498301577216.0, + "grad_norm": 0.01728788780398527, + "language_loss": 0.74747956, + "learning_rate": 0.00035506263727275893, + "loss": 0.75775194, + "num_input_tokens_seen": 262596864, + "router_z_loss_mlp": 0.07421875, + "step": 3149, + "time_per_iteration": 4.850924015045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105096, + "balance_loss_mlp": 1.07784474, + "epoch": 0.6060023085802232, + "flos": 670476621312.0, + "grad_norm": 0.06213460160212929, + "language_loss": 0.85822916, + "learning_rate": 0.0003547644993114475, + "loss": 0.8692801, + "num_input_tokens_seen": 262671088, + "router_z_loss_mlp": 0.27246094, + "step": 3150, + "time_per_iteration": 2.8107762336730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102124, + "balance_loss_mlp": 1.0744915, + "epoch": 0.6061946902654868, + "flos": 606168562176.0, + "grad_norm": 0.06674612399311457, + "language_loss": 0.79958618, + "learning_rate": 0.00035446641773555806, + "loss": 0.81060743, + "num_input_tokens_seen": 262743888, + "router_z_loss_mlp": 0.27636719, + "step": 3151, + "time_per_iteration": 2.7216579914093018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101102, + "balance_loss_mlp": 1.07332611, + "epoch": 0.6063870719507503, + "flos": 557844185088.0, + "grad_norm": 0.052040510091589255, + "language_loss": 0.87343258, + "learning_rate": 0.000354168392660816, + "loss": 0.88444364, + "num_input_tokens_seen": 262819616, + "router_z_loss_mlp": 0.27758789, + "step": 3152, + "time_per_iteration": 2.726529836654663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091719, + "balance_loss_mlp": 1.06484938, + "epoch": 0.6065794536360138, + "flos": 557154796032.0, + "grad_norm": 0.05990276634138019, + "language_loss": 0.82825845, + "learning_rate": 0.0003538704242029252, + "loss": 0.83917564, + "num_input_tokens_seen": 262893984, + "router_z_loss_mlp": 0.26879883, + "step": 3153, + "time_per_iteration": 2.695416212081909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109717, + "balance_loss_mlp": 1.06884539, + "epoch": 0.6067718353212774, + "flos": 690144385536.0, + "grad_norm": 0.07600523103772844, + "language_loss": 0.77972901, + "learning_rate": 0.0003535725124775672, + "loss": 0.79070067, + "num_input_tokens_seen": 262969648, + "router_z_loss_mlp": 0.28320312, + "step": 3154, + "time_per_iteration": 2.8397514820098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094038, + "balance_loss_mlp": 1.0649513, + "epoch": 0.606964217006541, + "flos": 521804726784.0, + "grad_norm": 0.058609076283542554, + "language_loss": 0.86659074, + "learning_rate": 0.00035327465760040126, + "loss": 0.87753117, + "num_input_tokens_seen": 263042048, + "router_z_loss_mlp": 0.29077148, + "step": 3155, + "time_per_iteration": 2.6624228954315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083273, + "balance_loss_mlp": 1.05640316, + "epoch": 0.6071565986918045, + "flos": 641555707392.0, + "grad_norm": 0.09292554424112281, + "language_loss": 0.84951353, + "learning_rate": 0.00035297685968706526, + "loss": 0.8603462, + "num_input_tokens_seen": 263108032, + "router_z_loss_mlp": 0.26867676, + "step": 3156, + "time_per_iteration": 2.7303812503814697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084205, + "balance_loss_mlp": 1.05590463, + "epoch": 0.6073489803770681, + "flos": 560581917696.0, + "grad_norm": 0.06445223486110697, + "language_loss": 0.83064741, + "learning_rate": 0.00035267911885317454, + "loss": 0.84148943, + "num_input_tokens_seen": 263175184, + "router_z_loss_mlp": 0.28271484, + "step": 3157, + "time_per_iteration": 2.6405463218688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088001, + "balance_loss_mlp": 1.06032109, + "epoch": 0.6075413620623317, + "flos": 586088193024.0, + "grad_norm": 0.05575059306658705, + "language_loss": 0.81712598, + "learning_rate": 0.0003523814352143222, + "loss": 0.82800603, + "num_input_tokens_seen": 263252768, + "router_z_loss_mlp": 0.27709961, + "step": 3158, + "time_per_iteration": 2.830343723297119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093502, + "balance_loss_mlp": 1.06400919, + "epoch": 0.6077337437475953, + "flos": 630812551680.0, + "grad_norm": 0.06682067437398732, + "language_loss": 0.91622639, + "learning_rate": 0.00035208380888607937, + "loss": 0.9271614, + "num_input_tokens_seen": 263328720, + "router_z_loss_mlp": 0.29455566, + "step": 3159, + "time_per_iteration": 2.796640634536743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026691, + "balance_loss_mlp": 1.01944304, + "epoch": 0.6079261254328588, + "flos": 1468503696384.0, + "grad_norm": 0.020734540297120695, + "language_loss": 0.79461986, + "learning_rate": 0.000351786239983995, + "loss": 0.80488676, + "num_input_tokens_seen": 263554656, + "router_z_loss_mlp": 0.07226562, + "step": 3160, + "time_per_iteration": 4.843371391296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021138, + "balance_loss_mlp": 1.01393795, + "epoch": 0.6081185071181223, + "flos": 1523024861184.0, + "grad_norm": 0.018390389893633168, + "language_loss": 0.7569223, + "learning_rate": 0.00035148872862359517, + "loss": 0.76713371, + "num_input_tokens_seen": 263791600, + "router_z_loss_mlp": 0.07177734, + "step": 3161, + "time_per_iteration": 5.065373659133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093496, + "balance_loss_mlp": 1.06481421, + "epoch": 0.6083108888033859, + "flos": 556319674368.0, + "grad_norm": 0.06046903146728731, + "language_loss": 0.81903481, + "learning_rate": 0.00035119127492038446, + "loss": 0.82996982, + "num_input_tokens_seen": 263869744, + "router_z_loss_mlp": 0.28637695, + "step": 3162, + "time_per_iteration": 2.7967278957366943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083192, + "balance_loss_mlp": 1.0550108, + "epoch": 0.6085032704886495, + "flos": 841166000640.0, + "grad_norm": 0.05880363430465999, + "language_loss": 0.82486427, + "learning_rate": 0.00035089387898984436, + "loss": 0.83569616, + "num_input_tokens_seen": 263946624, + "router_z_loss_mlp": 0.28198242, + "step": 3163, + "time_per_iteration": 3.0665948390960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089596, + "balance_loss_mlp": 1.06079483, + "epoch": 0.6086956521739131, + "flos": 684792631296.0, + "grad_norm": 0.064612412597244, + "language_loss": 0.8164137, + "learning_rate": 0.0003505965409474343, + "loss": 0.82730967, + "num_input_tokens_seen": 264022064, + "router_z_loss_mlp": 0.28808594, + "step": 3164, + "time_per_iteration": 2.9265527725219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078635, + "balance_loss_mlp": 1.05164599, + "epoch": 0.6088880338591766, + "flos": 535799536128.0, + "grad_norm": 0.0535577439830692, + "language_loss": 0.86276996, + "learning_rate": 0.0003502992609085913, + "loss": 0.87355632, + "num_input_tokens_seen": 264089520, + "router_z_loss_mlp": 0.27001953, + "step": 3165, + "time_per_iteration": 2.6794493198394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082317, + "balance_loss_mlp": 1.05463672, + "epoch": 0.6090804155444401, + "flos": 731533026816.0, + "grad_norm": 0.05150827346349905, + "language_loss": 0.82492924, + "learning_rate": 0.00035000203898872954, + "loss": 0.83575243, + "num_input_tokens_seen": 264173056, + "router_z_loss_mlp": 0.27734375, + "step": 3166, + "time_per_iteration": 2.9775314331054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081161, + "balance_loss_mlp": 1.0533855, + "epoch": 0.6092727972297037, + "flos": 699014665728.0, + "grad_norm": 0.0631204311292361, + "language_loss": 0.84789312, + "learning_rate": 0.0003497048753032406, + "loss": 0.85870469, + "num_input_tokens_seen": 264250912, + "router_z_loss_mlp": 0.27783203, + "step": 3167, + "time_per_iteration": 2.8659260272979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082152, + "balance_loss_mlp": 1.05567539, + "epoch": 0.6094651789149673, + "flos": 1051946735616.0, + "grad_norm": 0.05504676322369481, + "language_loss": 0.80827415, + "learning_rate": 0.000349407769967494, + "loss": 0.81909573, + "num_input_tokens_seen": 264342800, + "router_z_loss_mlp": 0.26525879, + "step": 3168, + "time_per_iteration": 3.3787014484405518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081057, + "balance_loss_mlp": 1.05447292, + "epoch": 0.6096575606002309, + "flos": 503085883392.0, + "grad_norm": 0.05919699008213893, + "language_loss": 0.84490019, + "learning_rate": 0.0003491107230968361, + "loss": 0.85571074, + "num_input_tokens_seen": 264413664, + "router_z_loss_mlp": 0.26611328, + "step": 3169, + "time_per_iteration": 2.6599555015563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078954, + "balance_loss_mlp": 1.05251288, + "epoch": 0.6098499422854944, + "flos": 585643281408.0, + "grad_norm": 0.05367575554300243, + "language_loss": 0.81929743, + "learning_rate": 0.00034881373480659085, + "loss": 0.83008707, + "num_input_tokens_seen": 264494944, + "router_z_loss_mlp": 0.26489258, + "step": 3170, + "time_per_iteration": 2.828599214553833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089157, + "balance_loss_mlp": 1.06092811, + "epoch": 0.610042323970758, + "flos": 469205996544.0, + "grad_norm": 0.07372507054287164, + "language_loss": 0.77562344, + "learning_rate": 0.0003485168052120594, + "loss": 0.78651506, + "num_input_tokens_seen": 264561664, + "router_z_loss_mlp": 0.28198242, + "step": 3171, + "time_per_iteration": 2.55070161819458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092713, + "balance_loss_mlp": 1.06579578, + "epoch": 0.6102347056560216, + "flos": 514177403904.0, + "grad_norm": 0.06238864549227849, + "language_loss": 0.80073476, + "learning_rate": 0.00034821993442851973, + "loss": 0.81166196, + "num_input_tokens_seen": 264626256, + "router_z_loss_mlp": 0.26940918, + "step": 3172, + "time_per_iteration": 2.585115909576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092343, + "balance_loss_mlp": 1.06593776, + "epoch": 0.6104270873412851, + "flos": 469013276160.0, + "grad_norm": 0.07089619767063425, + "language_loss": 0.82434714, + "learning_rate": 0.00034792312257122735, + "loss": 0.83527064, + "num_input_tokens_seen": 264692768, + "router_z_loss_mlp": 0.26428223, + "step": 3173, + "time_per_iteration": 2.6483352184295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109217, + "balance_loss_mlp": 1.06535971, + "epoch": 0.6106194690265486, + "flos": 549875837952.0, + "grad_norm": 0.06191738998776062, + "language_loss": 0.8083055, + "learning_rate": 0.00034762636975541506, + "loss": 0.81922722, + "num_input_tokens_seen": 264764816, + "router_z_loss_mlp": 0.26843262, + "step": 3174, + "time_per_iteration": 2.661529779434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097622, + "balance_loss_mlp": 1.07096648, + "epoch": 0.6108118507118122, + "flos": 472857772032.0, + "grad_norm": 0.07934443203127389, + "language_loss": 0.81213707, + "learning_rate": 0.0003473296760962923, + "loss": 0.82311332, + "num_input_tokens_seen": 264837968, + "router_z_loss_mlp": 0.2668457, + "step": 3175, + "time_per_iteration": 2.730571746826172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105783, + "balance_loss_mlp": 1.05005765, + "epoch": 0.6110042323970758, + "flos": 1445166904320.0, + "grad_norm": 0.03785121855584389, + "language_loss": 0.78533739, + "learning_rate": 0.00034703304170904617, + "loss": 0.79591566, + "num_input_tokens_seen": 265058336, + "router_z_loss_mlp": 0.07763672, + "step": 3176, + "time_per_iteration": 4.720510959625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094316, + "balance_loss_mlp": 1.06782722, + "epoch": 0.6111966140823394, + "flos": 794153590272.0, + "grad_norm": 0.05949259191251309, + "language_loss": 0.81672812, + "learning_rate": 0.00034673646670883976, + "loss": 0.82767129, + "num_input_tokens_seen": 265135920, + "router_z_loss_mlp": 0.26538086, + "step": 3177, + "time_per_iteration": 3.025146722793579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035513, + "balance_loss_mlp": 1.02812171, + "epoch": 0.611388995767603, + "flos": 1557650663424.0, + "grad_norm": 0.027049018431207196, + "language_loss": 0.75715023, + "learning_rate": 0.0003464399512108141, + "loss": 0.76750535, + "num_input_tokens_seen": 265374464, + "router_z_loss_mlp": 0.07373047, + "step": 3178, + "time_per_iteration": 5.000125885009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085963, + "balance_loss_mlp": 1.05978417, + "epoch": 0.6115813774528664, + "flos": 712169210880.0, + "grad_norm": 0.07013069416081287, + "language_loss": 0.81980824, + "learning_rate": 0.0003461434953300865, + "loss": 0.83066785, + "num_input_tokens_seen": 265450112, + "router_z_loss_mlp": 0.26220703, + "step": 3179, + "time_per_iteration": 2.922963857650757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081362, + "balance_loss_mlp": 1.05501699, + "epoch": 0.61177375913813, + "flos": 684308072448.0, + "grad_norm": 0.06339313471396843, + "language_loss": 0.81228697, + "learning_rate": 0.0003458470991817515, + "loss": 0.82310063, + "num_input_tokens_seen": 265534336, + "router_z_loss_mlp": 0.2635498, + "step": 3180, + "time_per_iteration": 2.9837453365325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088437, + "balance_loss_mlp": 1.06111443, + "epoch": 0.6119661408233936, + "flos": 511662127104.0, + "grad_norm": 0.05673755911203457, + "language_loss": 0.84994721, + "learning_rate": 0.0003455507628808802, + "loss": 0.86083156, + "num_input_tokens_seen": 265604480, + "router_z_loss_mlp": 0.27319336, + "step": 3181, + "time_per_iteration": 2.6381750106811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088853, + "balance_loss_mlp": 1.06133974, + "epoch": 0.6121585225086572, + "flos": 556809002496.0, + "grad_norm": 0.08338525943087875, + "language_loss": 0.85169065, + "learning_rate": 0.00034525448654252076, + "loss": 0.86257923, + "num_input_tokens_seen": 265670848, + "router_z_loss_mlp": 0.27539062, + "step": 3182, + "time_per_iteration": 2.6688461303710938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089639, + "balance_loss_mlp": 1.06263769, + "epoch": 0.6123509041939207, + "flos": 561849467904.0, + "grad_norm": 0.09017686395034887, + "language_loss": 0.83182716, + "learning_rate": 0.0003449582702816976, + "loss": 0.84272361, + "num_input_tokens_seen": 265739584, + "router_z_loss_mlp": 0.2701416, + "step": 3183, + "time_per_iteration": 2.6863620281219482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091522, + "balance_loss_mlp": 1.06479537, + "epoch": 0.6125432858791843, + "flos": 558056729088.0, + "grad_norm": 0.0548908554977987, + "language_loss": 0.82581168, + "learning_rate": 0.0003446621142134122, + "loss": 0.8367269, + "num_input_tokens_seen": 265810368, + "router_z_loss_mlp": 0.26757812, + "step": 3184, + "time_per_iteration": 2.673337459564209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093536, + "balance_loss_mlp": 1.06649971, + "epoch": 0.6127356675644479, + "flos": 415015944192.0, + "grad_norm": 0.06227229540090399, + "language_loss": 0.84346098, + "learning_rate": 0.0003443660184526424, + "loss": 0.85439634, + "num_input_tokens_seen": 265871616, + "router_z_loss_mlp": 0.27050781, + "step": 3185, + "time_per_iteration": 2.4706175327301025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092866, + "balance_loss_mlp": 1.06506586, + "epoch": 0.6129280492497114, + "flos": 603843434496.0, + "grad_norm": 0.05120570826610392, + "language_loss": 0.86619818, + "learning_rate": 0.0003440699831143429, + "loss": 0.87712687, + "num_input_tokens_seen": 265946672, + "router_z_loss_mlp": 0.27832031, + "step": 3186, + "time_per_iteration": 2.778033971786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095882, + "balance_loss_mlp": 1.06884551, + "epoch": 0.613120430934975, + "flos": 519766295040.0, + "grad_norm": 0.05392794478467523, + "language_loss": 0.82370943, + "learning_rate": 0.0003437740083134449, + "loss": 0.83466822, + "num_input_tokens_seen": 266020640, + "router_z_loss_mlp": 0.27050781, + "step": 3187, + "time_per_iteration": 2.6768150329589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091695, + "balance_loss_mlp": 1.06453919, + "epoch": 0.6133128126202385, + "flos": 511083965952.0, + "grad_norm": 0.07437759552236513, + "language_loss": 0.8353374, + "learning_rate": 0.00034347809416485574, + "loss": 0.84625435, + "num_input_tokens_seen": 266085776, + "router_z_loss_mlp": 0.27197266, + "step": 3188, + "time_per_iteration": 2.6008822917938232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085287, + "balance_loss_mlp": 1.05835748, + "epoch": 0.6135051943055021, + "flos": 607562021376.0, + "grad_norm": 0.053009634337547046, + "language_loss": 0.81880438, + "learning_rate": 0.0003431822407834597, + "loss": 0.82965726, + "num_input_tokens_seen": 266157104, + "router_z_loss_mlp": 0.26940918, + "step": 3189, + "time_per_iteration": 2.8121964931488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090882, + "balance_loss_mlp": 1.06315422, + "epoch": 0.6136975759907657, + "flos": 1160200931328.0, + "grad_norm": 0.06178667045305147, + "language_loss": 0.84739751, + "learning_rate": 0.00034288644828411706, + "loss": 0.85830629, + "num_input_tokens_seen": 266244144, + "router_z_loss_mlp": 0.27758789, + "step": 3190, + "time_per_iteration": 3.4740066528320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087763, + "balance_loss_mlp": 1.06052327, + "epoch": 0.6138899576760293, + "flos": 706938596352.0, + "grad_norm": 0.08089532706522883, + "language_loss": 0.75991279, + "learning_rate": 0.0003425907167816649, + "loss": 0.77079034, + "num_input_tokens_seen": 266319040, + "router_z_loss_mlp": 0.27258301, + "step": 3191, + "time_per_iteration": 2.8420307636260986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084533, + "balance_loss_mlp": 1.05866492, + "epoch": 0.6140823393612928, + "flos": 586443898368.0, + "grad_norm": 0.06652830958672214, + "language_loss": 0.84765488, + "learning_rate": 0.00034229504639091623, + "loss": 0.85850024, + "num_input_tokens_seen": 266390784, + "router_z_loss_mlp": 0.2590332, + "step": 3192, + "time_per_iteration": 2.805717945098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079151, + "balance_loss_mlp": 1.05240059, + "epoch": 0.6142747210465563, + "flos": 804130633728.0, + "grad_norm": 0.06825133592780937, + "language_loss": 0.80015457, + "learning_rate": 0.0003419994372266606, + "loss": 0.81094611, + "num_input_tokens_seen": 266483216, + "router_z_loss_mlp": 0.26782227, + "step": 3193, + "time_per_iteration": 3.0882303714752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084925, + "balance_loss_mlp": 1.05800796, + "epoch": 0.6144671027318199, + "flos": 529434620928.0, + "grad_norm": 0.061422659425354964, + "language_loss": 0.82002676, + "learning_rate": 0.00034170388940366335, + "loss": 0.83087599, + "num_input_tokens_seen": 266557344, + "router_z_loss_mlp": 0.26953125, + "step": 3194, + "time_per_iteration": 2.68253755569458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085529, + "balance_loss_mlp": 1.0581584, + "epoch": 0.6146594844170835, + "flos": 805425348096.0, + "grad_norm": 0.0574427380686639, + "language_loss": 0.801368, + "learning_rate": 0.0003414084030366667, + "loss": 0.81222332, + "num_input_tokens_seen": 266639488, + "router_z_loss_mlp": 0.27380371, + "step": 3195, + "time_per_iteration": 3.079050302505493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081131, + "balance_loss_mlp": 1.05421329, + "epoch": 0.6148518661023471, + "flos": 501697193472.0, + "grad_norm": 0.05079595978056556, + "language_loss": 0.83029908, + "learning_rate": 0.0003411129782403883, + "loss": 0.84111041, + "num_input_tokens_seen": 266711168, + "router_z_loss_mlp": 0.26953125, + "step": 3196, + "time_per_iteration": 2.632840871810913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086159, + "balance_loss_mlp": 1.05881214, + "epoch": 0.6150442477876106, + "flos": 510688613376.0, + "grad_norm": 0.06298738141067967, + "language_loss": 0.85384542, + "learning_rate": 0.0003408176151295225, + "loss": 0.86470699, + "num_input_tokens_seen": 266777632, + "router_z_loss_mlp": 0.27392578, + "step": 3197, + "time_per_iteration": 2.5977203845977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079258, + "balance_loss_mlp": 1.05186343, + "epoch": 0.6152366294728742, + "flos": 527005979136.0, + "grad_norm": 0.07239010053944613, + "language_loss": 0.77357507, + "learning_rate": 0.00034052231381873944, + "loss": 0.78436762, + "num_input_tokens_seen": 266842880, + "router_z_loss_mlp": 0.2746582, + "step": 3198, + "time_per_iteration": 2.604996919631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078982, + "balance_loss_mlp": 1.05149233, + "epoch": 0.6154290111581378, + "flos": 473300112384.0, + "grad_norm": 0.060831146063345755, + "language_loss": 0.85285568, + "learning_rate": 0.00034022707442268494, + "loss": 0.86364555, + "num_input_tokens_seen": 266909504, + "router_z_loss_mlp": 0.27514648, + "step": 3199, + "time_per_iteration": 2.6032421588897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079605, + "balance_loss_mlp": 1.05297375, + "epoch": 0.6156213928434013, + "flos": 550819616256.0, + "grad_norm": 0.04692312170218308, + "language_loss": 0.82051641, + "learning_rate": 0.0003399318970559813, + "loss": 0.83131248, + "num_input_tokens_seen": 266988880, + "router_z_loss_mlp": 0.26660156, + "step": 3200, + "time_per_iteration": 2.8085906505584717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074239, + "balance_loss_mlp": 1.0479418, + "epoch": 0.6158137745286649, + "flos": 750941259264.0, + "grad_norm": 0.057177124175777416, + "language_loss": 0.8485775, + "learning_rate": 0.00033963678183322656, + "loss": 0.85931993, + "num_input_tokens_seen": 267074512, + "router_z_loss_mlp": 0.26330566, + "step": 3201, + "time_per_iteration": 3.032761335372925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084727, + "balance_loss_mlp": 1.05809593, + "epoch": 0.6160061562139284, + "flos": 555815665152.0, + "grad_norm": 0.053866229864627496, + "language_loss": 0.82829523, + "learning_rate": 0.0003393417288689945, + "loss": 0.8391425, + "num_input_tokens_seen": 267147952, + "router_z_loss_mlp": 0.26623535, + "step": 3202, + "time_per_iteration": 2.6627390384674072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084716, + "balance_loss_mlp": 1.05858481, + "epoch": 0.616198537899192, + "flos": 742177437696.0, + "grad_norm": 0.08436696100910436, + "language_loss": 0.76289904, + "learning_rate": 0.00033904673827783504, + "loss": 0.77374619, + "num_input_tokens_seen": 267224368, + "router_z_loss_mlp": 0.26171875, + "step": 3203, + "time_per_iteration": 2.914370059967041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082648, + "balance_loss_mlp": 1.05615926, + "epoch": 0.6163909195844556, + "flos": 478810082304.0, + "grad_norm": 0.06562773431598554, + "language_loss": 0.81864727, + "learning_rate": 0.00033875181017427357, + "loss": 0.82947373, + "num_input_tokens_seen": 267292688, + "router_z_loss_mlp": 0.26501465, + "step": 3204, + "time_per_iteration": 2.5992236137390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078319, + "balance_loss_mlp": 1.05155659, + "epoch": 0.6165833012697192, + "flos": 531517469184.0, + "grad_norm": 0.05911238695185789, + "language_loss": 0.8101759, + "learning_rate": 0.00033845694467281133, + "loss": 0.82095909, + "num_input_tokens_seen": 267371888, + "router_z_loss_mlp": 0.26782227, + "step": 3205, + "time_per_iteration": 2.857226848602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079906, + "balance_loss_mlp": 1.05366778, + "epoch": 0.6167756829549826, + "flos": 807765156864.0, + "grad_norm": 0.056333384320929165, + "language_loss": 0.83590877, + "learning_rate": 0.00033816214188792516, + "loss": 0.84670782, + "num_input_tokens_seen": 267458784, + "router_z_loss_mlp": 0.26281738, + "step": 3206, + "time_per_iteration": 3.133683443069458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108227, + "balance_loss_mlp": 1.05523372, + "epoch": 0.6169680646402462, + "flos": 488928089088.0, + "grad_norm": 0.06870835299002895, + "language_loss": 0.85362953, + "learning_rate": 0.00033786740193406784, + "loss": 0.86445218, + "num_input_tokens_seen": 267528528, + "router_z_loss_mlp": 0.27050781, + "step": 3207, + "time_per_iteration": 2.5766866207122803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083932, + "balance_loss_mlp": 1.05775416, + "epoch": 0.6171604463255098, + "flos": 618954918912.0, + "grad_norm": 0.16525433487855157, + "language_loss": 0.81557286, + "learning_rate": 0.00033757272492566736, + "loss": 0.82641208, + "num_input_tokens_seen": 267611152, + "router_z_loss_mlp": 0.26184082, + "step": 3208, + "time_per_iteration": 2.8717997074127197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081434, + "balance_loss_mlp": 1.05363393, + "epoch": 0.6173528280107734, + "flos": 528859031040.0, + "grad_norm": 0.050446978523455026, + "language_loss": 0.8752228, + "learning_rate": 0.0003372781109771278, + "loss": 0.88603711, + "num_input_tokens_seen": 267681520, + "router_z_loss_mlp": 0.27832031, + "step": 3209, + "time_per_iteration": 2.740673303604126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087371, + "balance_loss_mlp": 1.05973852, + "epoch": 0.617545209696037, + "flos": 596581728768.0, + "grad_norm": 0.060596341147957054, + "language_loss": 0.76554525, + "learning_rate": 0.0003369835602028281, + "loss": 0.77641892, + "num_input_tokens_seen": 267758768, + "router_z_loss_mlp": 0.27661133, + "step": 3210, + "time_per_iteration": 2.813253164291382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078855, + "balance_loss_mlp": 1.05222404, + "epoch": 0.6177375913813005, + "flos": 475098835968.0, + "grad_norm": 0.060877692494739295, + "language_loss": 0.7966795, + "learning_rate": 0.0003366890727171232, + "loss": 0.80746806, + "num_input_tokens_seen": 267831056, + "router_z_loss_mlp": 0.26647949, + "step": 3211, + "time_per_iteration": 2.7572054862976074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083411, + "balance_loss_mlp": 1.05717349, + "epoch": 0.617929973066564, + "flos": 529812721152.0, + "grad_norm": 0.07113437774281188, + "language_loss": 0.78650188, + "learning_rate": 0.00033639464863434313, + "loss": 0.79733604, + "num_input_tokens_seen": 267898416, + "router_z_loss_mlp": 0.26257324, + "step": 3212, + "time_per_iteration": 2.616605520248413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035231, + "balance_loss_mlp": 1.0275538, + "epoch": 0.6181223547518276, + "flos": 1420053783552.0, + "grad_norm": 0.020977694975075144, + "language_loss": 0.78442466, + "learning_rate": 0.00033610028806879363, + "loss": 0.79477704, + "num_input_tokens_seen": 268112864, + "router_z_loss_mlp": 0.07666016, + "step": 3213, + "time_per_iteration": 4.706260919570923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077401, + "balance_loss_mlp": 1.05035281, + "epoch": 0.6183147364370912, + "flos": 740319243264.0, + "grad_norm": 0.055780003903401355, + "language_loss": 0.79908657, + "learning_rate": 0.00033580599113475543, + "loss": 0.80986065, + "num_input_tokens_seen": 268198368, + "router_z_loss_mlp": 0.27087402, + "step": 3214, + "time_per_iteration": 2.976040840148926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068298, + "balance_loss_mlp": 1.04207242, + "epoch": 0.6185071181223547, + "flos": 381649978368.0, + "grad_norm": 0.06538485262612419, + "language_loss": 0.86450571, + "learning_rate": 0.00033551175794648507, + "loss": 0.87518871, + "num_input_tokens_seen": 268260704, + "router_z_loss_mlp": 0.2623291, + "step": 3215, + "time_per_iteration": 2.5857200622558594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074795, + "balance_loss_mlp": 1.0478301, + "epoch": 0.6186994998076183, + "flos": 463347661824.0, + "grad_norm": 0.05115792818317019, + "language_loss": 0.81974953, + "learning_rate": 0.00033521758861821365, + "loss": 0.8304975, + "num_input_tokens_seen": 268328256, + "router_z_loss_mlp": 0.27001953, + "step": 3216, + "time_per_iteration": 2.6541965007781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070889, + "balance_loss_mlp": 1.04368544, + "epoch": 0.6188918814928819, + "flos": 485273742336.0, + "grad_norm": 0.053233870679950265, + "language_loss": 0.89132476, + "learning_rate": 0.0003349234832641479, + "loss": 0.90203357, + "num_input_tokens_seen": 268394016, + "router_z_loss_mlp": 0.27246094, + "step": 3217, + "time_per_iteration": 2.5898375511169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072387, + "balance_loss_mlp": 1.04567194, + "epoch": 0.6190842631781455, + "flos": 657307021824.0, + "grad_norm": 0.06188675281587152, + "language_loss": 0.81109393, + "learning_rate": 0.00033462944199846975, + "loss": 0.82181776, + "num_input_tokens_seen": 268478512, + "router_z_loss_mlp": 0.26721191, + "step": 3218, + "time_per_iteration": 3.049302101135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068592, + "balance_loss_mlp": 1.04186571, + "epoch": 0.619276644863409, + "flos": 403603223040.0, + "grad_norm": 0.07980114958498462, + "language_loss": 0.86682892, + "learning_rate": 0.00033433546493533606, + "loss": 0.87751484, + "num_input_tokens_seen": 268540304, + "router_z_loss_mlp": 0.26757812, + "step": 3219, + "time_per_iteration": 2.4988718032836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072803, + "balance_loss_mlp": 1.04562318, + "epoch": 0.6194690265486725, + "flos": 583093499904.0, + "grad_norm": 0.06437622918216304, + "language_loss": 0.84503907, + "learning_rate": 0.00033404155218887897, + "loss": 0.85576707, + "num_input_tokens_seen": 268611136, + "router_z_loss_mlp": 0.27246094, + "step": 3220, + "time_per_iteration": 2.755687952041626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069297, + "balance_loss_mlp": 1.04323733, + "epoch": 0.6196614082339361, + "flos": 504246974976.0, + "grad_norm": 0.054816937967161604, + "language_loss": 0.87677366, + "learning_rate": 0.00033374770387320534, + "loss": 0.88746661, + "num_input_tokens_seen": 268684992, + "router_z_loss_mlp": 0.26074219, + "step": 3221, + "time_per_iteration": 2.806687831878662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073987, + "balance_loss_mlp": 1.0476656, + "epoch": 0.6198537899191997, + "flos": 575409277440.0, + "grad_norm": 0.05319951203525016, + "language_loss": 0.85096419, + "learning_rate": 0.00033345392010239737, + "loss": 0.86170411, + "num_input_tokens_seen": 268758096, + "router_z_loss_mlp": 0.2635498, + "step": 3222, + "time_per_iteration": 2.726924419403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078737, + "balance_loss_mlp": 1.05248737, + "epoch": 0.6200461716044633, + "flos": 593157178368.0, + "grad_norm": 0.06204822794999188, + "language_loss": 0.82752097, + "learning_rate": 0.0003331602009905118, + "loss": 0.83830827, + "num_input_tokens_seen": 268834432, + "router_z_loss_mlp": 0.26245117, + "step": 3223, + "time_per_iteration": 2.8067080974578857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074324, + "balance_loss_mlp": 1.04770494, + "epoch": 0.6202385532897268, + "flos": 666093238272.0, + "grad_norm": 0.06248384558092708, + "language_loss": 0.83894855, + "learning_rate": 0.00033286654665158085, + "loss": 0.84969175, + "num_input_tokens_seen": 268921168, + "router_z_loss_mlp": 0.26635742, + "step": 3224, + "time_per_iteration": 2.973839044570923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071798, + "balance_loss_mlp": 1.04578674, + "epoch": 0.6204309349749904, + "flos": 484952541696.0, + "grad_norm": 0.058715923927156195, + "language_loss": 0.87385452, + "learning_rate": 0.0003325729571996109, + "loss": 0.88457251, + "num_input_tokens_seen": 268991440, + "router_z_loss_mlp": 0.26037598, + "step": 3225, + "time_per_iteration": 2.6299448013305664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079949, + "balance_loss_mlp": 1.05295992, + "epoch": 0.6206233166602539, + "flos": 584057101824.0, + "grad_norm": 0.05622680554800681, + "language_loss": 0.84078681, + "learning_rate": 0.000332279432748584, + "loss": 0.85158628, + "num_input_tokens_seen": 269061024, + "router_z_loss_mlp": 0.27001953, + "step": 3226, + "time_per_iteration": 2.713651180267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079454, + "balance_loss_mlp": 1.05334759, + "epoch": 0.6208156983455175, + "flos": 476917383168.0, + "grad_norm": 0.05334260963219639, + "language_loss": 0.8767364, + "learning_rate": 0.00033198597341245576, + "loss": 0.88753092, + "num_input_tokens_seen": 269130560, + "router_z_loss_mlp": 0.26147461, + "step": 3227, + "time_per_iteration": 2.5617635250091553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078643, + "balance_loss_mlp": 1.05208337, + "epoch": 0.6210080800307811, + "flos": 789066137088.0, + "grad_norm": 0.05016111197588362, + "language_loss": 0.82129073, + "learning_rate": 0.00033169257930515763, + "loss": 0.83207709, + "num_input_tokens_seen": 269213280, + "router_z_loss_mlp": 0.26611328, + "step": 3228, + "time_per_iteration": 3.025502920150757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080729, + "balance_loss_mlp": 1.0543834, + "epoch": 0.6212004617160446, + "flos": 607794388992.0, + "grad_norm": 0.08161989388785439, + "language_loss": 0.82274306, + "learning_rate": 0.0003313992505405951, + "loss": 0.83355033, + "num_input_tokens_seen": 269286384, + "router_z_loss_mlp": 0.26367188, + "step": 3229, + "time_per_iteration": 2.705948829650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083196, + "balance_loss_mlp": 1.05582547, + "epoch": 0.6213928434013082, + "flos": 586520621568.0, + "grad_norm": 0.06417417083544033, + "language_loss": 0.81270546, + "learning_rate": 0.0003311059872326487, + "loss": 0.82353741, + "num_input_tokens_seen": 269353296, + "router_z_loss_mlp": 0.27368164, + "step": 3230, + "time_per_iteration": 2.6827783584594727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080325, + "balance_loss_mlp": 1.05426574, + "epoch": 0.6215852250865718, + "flos": 536076320256.0, + "grad_norm": 0.060558133954529886, + "language_loss": 0.79513329, + "learning_rate": 0.0003308127894951734, + "loss": 0.80593657, + "num_input_tokens_seen": 269422304, + "router_z_loss_mlp": 0.26074219, + "step": 3231, + "time_per_iteration": 2.621156692504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079972, + "balance_loss_mlp": 1.05295873, + "epoch": 0.6217776067718354, + "flos": 618169356288.0, + "grad_norm": 0.05872122895707264, + "language_loss": 0.86601388, + "learning_rate": 0.00033051965744199834, + "loss": 0.87681365, + "num_input_tokens_seen": 269498784, + "router_z_loss_mlp": 0.27075195, + "step": 3232, + "time_per_iteration": 2.7616896629333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089938, + "balance_loss_mlp": 1.06414127, + "epoch": 0.6219699884570988, + "flos": 545875324416.0, + "grad_norm": 0.05765951293021458, + "language_loss": 0.90613365, + "learning_rate": 0.0003302265911869276, + "loss": 0.91703308, + "num_input_tokens_seen": 269581264, + "router_z_loss_mlp": 0.25830078, + "step": 3233, + "time_per_iteration": 2.911309242248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107614, + "balance_loss_mlp": 1.04950833, + "epoch": 0.6221623701423624, + "flos": 481149891072.0, + "grad_norm": 0.0568406918617455, + "language_loss": 0.84234416, + "learning_rate": 0.0003299335908437397, + "loss": 0.8531056, + "num_input_tokens_seen": 269649408, + "router_z_loss_mlp": 0.26660156, + "step": 3234, + "time_per_iteration": 2.5690464973449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083439, + "balance_loss_mlp": 1.05614042, + "epoch": 0.622354751827626, + "flos": 380024151552.0, + "grad_norm": 0.08458123774573062, + "language_loss": 0.79892743, + "learning_rate": 0.0003296406565261873, + "loss": 0.80976182, + "num_input_tokens_seen": 269711648, + "router_z_loss_mlp": 0.27294922, + "step": 3235, + "time_per_iteration": 2.519242763519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082513, + "balance_loss_mlp": 1.05619192, + "epoch": 0.6225471335128896, + "flos": 667869940224.0, + "grad_norm": 0.04986850206195379, + "language_loss": 0.85312378, + "learning_rate": 0.0003293477883479978, + "loss": 0.86394894, + "num_input_tokens_seen": 269787376, + "router_z_loss_mlp": 0.26367188, + "step": 3236, + "time_per_iteration": 2.8095037937164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083699, + "balance_loss_mlp": 1.05704379, + "epoch": 0.6227395151981532, + "flos": 771320807424.0, + "grad_norm": 0.105420899843356, + "language_loss": 0.79857445, + "learning_rate": 0.0003290549864228727, + "loss": 0.80941153, + "num_input_tokens_seen": 269863008, + "router_z_loss_mlp": 0.2668457, + "step": 3237, + "time_per_iteration": 2.9599437713623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092317, + "balance_loss_mlp": 1.0648514, + "epoch": 0.6229318968834167, + "flos": 484354556928.0, + "grad_norm": 0.05485346042827634, + "language_loss": 0.86677277, + "learning_rate": 0.0003287622508644875, + "loss": 0.87769592, + "num_input_tokens_seen": 269939552, + "router_z_loss_mlp": 0.27514648, + "step": 3238, + "time_per_iteration": 2.7735140323638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108562, + "balance_loss_mlp": 1.05971575, + "epoch": 0.6231242785686802, + "flos": 462935056896.0, + "grad_norm": 0.06697855581394702, + "language_loss": 0.86312807, + "learning_rate": 0.0003284695817864923, + "loss": 0.87398434, + "num_input_tokens_seen": 270002752, + "router_z_loss_mlp": 0.25939941, + "step": 3239, + "time_per_iteration": 2.5213680267333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086258, + "balance_loss_mlp": 1.05822039, + "epoch": 0.6233166602539438, + "flos": 609089103360.0, + "grad_norm": 0.0670229685198235, + "language_loss": 0.84466362, + "learning_rate": 0.0003281769793025116, + "loss": 0.85552621, + "num_input_tokens_seen": 270075696, + "router_z_loss_mlp": 0.28051758, + "step": 3240, + "time_per_iteration": 2.7121944427490234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084136, + "balance_loss_mlp": 1.05725467, + "epoch": 0.6235090419392074, + "flos": 439200340992.0, + "grad_norm": 0.0702959592195009, + "language_loss": 0.89746368, + "learning_rate": 0.00032788444352614346, + "loss": 0.90830505, + "num_input_tokens_seen": 270139872, + "router_z_loss_mlp": 0.2689209, + "step": 3241, + "time_per_iteration": 2.5015249252319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082571, + "balance_loss_mlp": 1.05672646, + "epoch": 0.6237014236244709, + "flos": 504904430592.0, + "grad_norm": 0.06846716492041297, + "language_loss": 0.80880868, + "learning_rate": 0.0003275919745709606, + "loss": 0.81963438, + "num_input_tokens_seen": 270206752, + "router_z_loss_mlp": 0.25793457, + "step": 3242, + "time_per_iteration": 2.5576865673065186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108516, + "balance_loss_mlp": 1.05925632, + "epoch": 0.6238938053097345, + "flos": 512917194240.0, + "grad_norm": 0.07943089105939449, + "language_loss": 0.82206035, + "learning_rate": 0.00032729957255050936, + "loss": 0.83291197, + "num_input_tokens_seen": 270275472, + "router_z_loss_mlp": 0.25939941, + "step": 3243, + "time_per_iteration": 2.6432876586914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088614, + "balance_loss_mlp": 1.06160164, + "epoch": 0.6240861869949981, + "flos": 736751531520.0, + "grad_norm": 0.05697537119999913, + "language_loss": 0.81798017, + "learning_rate": 0.0003270072375783102, + "loss": 0.82886636, + "num_input_tokens_seen": 270348336, + "router_z_loss_mlp": 0.2701416, + "step": 3244, + "time_per_iteration": 2.8988003730773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087163, + "balance_loss_mlp": 1.06048417, + "epoch": 0.6242785686802617, + "flos": 494712271872.0, + "grad_norm": 0.06396151885165319, + "language_loss": 0.79621661, + "learning_rate": 0.00032671496976785774, + "loss": 0.80708826, + "num_input_tokens_seen": 270416496, + "router_z_loss_mlp": 0.2668457, + "step": 3245, + "time_per_iteration": 2.619020938873291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079657, + "balance_loss_mlp": 1.054075, + "epoch": 0.6244709503655252, + "flos": 745846465536.0, + "grad_norm": 0.06315966353761295, + "language_loss": 0.75718981, + "learning_rate": 0.0003264227692326205, + "loss": 0.76798642, + "num_input_tokens_seen": 270501680, + "router_z_loss_mlp": 0.25610352, + "step": 3246, + "time_per_iteration": 3.0977470874786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092736, + "balance_loss_mlp": 1.0656991, + "epoch": 0.6246633320507887, + "flos": 492602259456.0, + "grad_norm": 0.05900529395790117, + "language_loss": 0.86342973, + "learning_rate": 0.00032613063608604055, + "loss": 0.8743571, + "num_input_tokens_seen": 270568656, + "router_z_loss_mlp": 0.27075195, + "step": 3247, + "time_per_iteration": 2.535694122314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088316, + "balance_loss_mlp": 1.06239939, + "epoch": 0.6248557137360523, + "flos": 517391981568.0, + "grad_norm": 0.06304682930858534, + "language_loss": 0.83798397, + "learning_rate": 0.0003258385704415343, + "loss": 0.84886706, + "num_input_tokens_seen": 270636160, + "router_z_loss_mlp": 0.25952148, + "step": 3248, + "time_per_iteration": 2.5745623111724854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108485, + "balance_loss_mlp": 1.05835032, + "epoch": 0.6250480954213159, + "flos": 519363601920.0, + "grad_norm": 0.05590667245526839, + "language_loss": 0.83388865, + "learning_rate": 0.0003255465724124915, + "loss": 0.84473717, + "num_input_tokens_seen": 270708816, + "router_z_loss_mlp": 0.26550293, + "step": 3249, + "time_per_iteration": 2.6889073848724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088994, + "balance_loss_mlp": 1.06236219, + "epoch": 0.6252404771065795, + "flos": 516060191232.0, + "grad_norm": 0.05421846052548684, + "language_loss": 0.83201844, + "learning_rate": 0.00032525464211227587, + "loss": 0.84290838, + "num_input_tokens_seen": 270778016, + "router_z_loss_mlp": 0.2668457, + "step": 3250, + "time_per_iteration": 2.579265594482422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089648, + "balance_loss_mlp": 1.0646019, + "epoch": 0.6254328587918431, + "flos": 576916535808.0, + "grad_norm": 0.05949618394649944, + "language_loss": 0.85687059, + "learning_rate": 0.0003249627796542249, + "loss": 0.8677671, + "num_input_tokens_seen": 270847072, + "router_z_loss_mlp": 0.25048828, + "step": 3251, + "time_per_iteration": 2.657060384750366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086593, + "balance_loss_mlp": 1.06070042, + "epoch": 0.6256252404771065, + "flos": 597930771456.0, + "grad_norm": 0.06427979506588448, + "language_loss": 0.84404004, + "learning_rate": 0.00032467098515164943, + "loss": 0.85490596, + "num_input_tokens_seen": 270926320, + "router_z_loss_mlp": 0.25927734, + "step": 3252, + "time_per_iteration": 2.849217414855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095705, + "balance_loss_mlp": 1.06928802, + "epoch": 0.6258176221623701, + "flos": 508299245568.0, + "grad_norm": 0.07156536364550367, + "language_loss": 0.8424539, + "learning_rate": 0.00032437925871783456, + "loss": 0.85341096, + "num_input_tokens_seen": 270997904, + "router_z_loss_mlp": 0.26428223, + "step": 3253, + "time_per_iteration": 2.6556756496429443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089818, + "balance_loss_mlp": 1.06392598, + "epoch": 0.6260100038476337, + "flos": 639645755904.0, + "grad_norm": 0.06713167353527402, + "language_loss": 0.84369826, + "learning_rate": 0.00032408760046603803, + "loss": 0.85459638, + "num_input_tokens_seen": 271074256, + "router_z_loss_mlp": 0.25915527, + "step": 3254, + "time_per_iteration": 2.8115572929382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088289, + "balance_loss_mlp": 1.06096649, + "epoch": 0.6262023855328973, + "flos": 841007784960.0, + "grad_norm": 0.057744790831179095, + "language_loss": 0.77781522, + "learning_rate": 0.00032379601050949193, + "loss": 0.78869808, + "num_input_tokens_seen": 271155152, + "router_z_loss_mlp": 0.27319336, + "step": 3255, + "time_per_iteration": 3.076742649078369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086608, + "balance_loss_mlp": 1.06120479, + "epoch": 0.6263947672181608, + "flos": 522138410496.0, + "grad_norm": 0.07189629851165658, + "language_loss": 0.88155556, + "learning_rate": 0.0003235044889614013, + "loss": 0.8924216, + "num_input_tokens_seen": 271224784, + "router_z_loss_mlp": 0.25390625, + "step": 3256, + "time_per_iteration": 2.5873968601226807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089869, + "balance_loss_mlp": 1.06373787, + "epoch": 0.6265871489034244, + "flos": 607055440896.0, + "grad_norm": 0.05771783178096878, + "language_loss": 0.83524096, + "learning_rate": 0.0003232130359349451, + "loss": 0.84613967, + "num_input_tokens_seen": 271303584, + "router_z_loss_mlp": 0.26147461, + "step": 3257, + "time_per_iteration": 2.819540500640869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079515, + "balance_loss_mlp": 1.05381322, + "epoch": 0.626779530588688, + "flos": 588484901376.0, + "grad_norm": 0.06862538521016108, + "language_loss": 0.81873524, + "learning_rate": 0.0003229216515432751, + "loss": 0.82953036, + "num_input_tokens_seen": 271379632, + "router_z_loss_mlp": 0.25732422, + "step": 3258, + "time_per_iteration": 2.7515103816986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081926, + "balance_loss_mlp": 1.05611742, + "epoch": 0.6269719122739515, + "flos": 438612268032.0, + "grad_norm": 0.0620904280551254, + "language_loss": 0.79984063, + "learning_rate": 0.0003226303358995174, + "loss": 0.81065989, + "num_input_tokens_seen": 271447808, + "router_z_loss_mlp": 0.25805664, + "step": 3259, + "time_per_iteration": 2.601327896118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108309, + "balance_loss_mlp": 1.05641103, + "epoch": 0.6271642939592151, + "flos": 562874738688.0, + "grad_norm": 0.06264498249495759, + "language_loss": 0.88746321, + "learning_rate": 0.00032233908911677, + "loss": 0.89829409, + "num_input_tokens_seen": 271526768, + "router_z_loss_mlp": 0.26672363, + "step": 3260, + "time_per_iteration": 2.8746490478515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108161, + "balance_loss_mlp": 1.0554074, + "epoch": 0.6273566756444786, + "flos": 514560273408.0, + "grad_norm": 0.05524835690099731, + "language_loss": 0.81054789, + "learning_rate": 0.0003220479113081053, + "loss": 0.82136405, + "num_input_tokens_seen": 271597840, + "router_z_loss_mlp": 0.26245117, + "step": 3261, + "time_per_iteration": 2.7250542640686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086528, + "balance_loss_mlp": 1.06051612, + "epoch": 0.6275490573297422, + "flos": 585472955904.0, + "grad_norm": 0.07333417495650836, + "language_loss": 0.79077941, + "learning_rate": 0.00032175680258656836, + "loss": 0.80164468, + "num_input_tokens_seen": 271668352, + "router_z_loss_mlp": 0.26049805, + "step": 3262, + "time_per_iteration": 2.7318856716156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084833, + "balance_loss_mlp": 1.0588572, + "epoch": 0.6277414390150058, + "flos": 559423024128.0, + "grad_norm": 0.054494688128012655, + "language_loss": 0.80530143, + "learning_rate": 0.00032146576306517794, + "loss": 0.81614971, + "num_input_tokens_seen": 271743936, + "router_z_loss_mlp": 0.26000977, + "step": 3263, + "time_per_iteration": 2.7811925411224365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080063, + "balance_loss_mlp": 1.05290699, + "epoch": 0.6279338207002694, + "flos": 612706374144.0, + "grad_norm": 0.056666242848552414, + "language_loss": 0.81309682, + "learning_rate": 0.0003211747928569255, + "loss": 0.82389748, + "num_input_tokens_seen": 271817008, + "router_z_loss_mlp": 0.27197266, + "step": 3264, + "time_per_iteration": 2.7700881958007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109074, + "balance_loss_mlp": 1.06416845, + "epoch": 0.6281262023855329, + "flos": 625685451264.0, + "grad_norm": 0.05464471596038141, + "language_loss": 0.82094646, + "learning_rate": 0.0003208838920747754, + "loss": 0.83185387, + "num_input_tokens_seen": 271896960, + "router_z_loss_mlp": 0.26599121, + "step": 3265, + "time_per_iteration": 2.8446507453918457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090884, + "balance_loss_mlp": 1.06463385, + "epoch": 0.6283185840707964, + "flos": 1123600564224.0, + "grad_norm": 0.056349937520850824, + "language_loss": 0.77076876, + "learning_rate": 0.0003205930608316656, + "loss": 0.7816776, + "num_input_tokens_seen": 271985008, + "router_z_loss_mlp": 0.26269531, + "step": 3266, + "time_per_iteration": 3.491666555404663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010984, + "balance_loss_mlp": 1.07074392, + "epoch": 0.62851096575606, + "flos": 515239750656.0, + "grad_norm": 0.06651261940051902, + "language_loss": 0.84897095, + "learning_rate": 0.00032030229924050673, + "loss": 0.85995495, + "num_input_tokens_seen": 272056368, + "router_z_loss_mlp": 0.27661133, + "step": 3267, + "time_per_iteration": 2.647298812866211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089439, + "balance_loss_mlp": 1.06264114, + "epoch": 0.6287033474413236, + "flos": 404171472384.0, + "grad_norm": 0.055917272399638666, + "language_loss": 0.8022815, + "learning_rate": 0.00032001160741418247, + "loss": 0.81317586, + "num_input_tokens_seen": 272123424, + "router_z_loss_mlp": 0.26843262, + "step": 3268, + "time_per_iteration": 2.652388334274292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094844, + "balance_loss_mlp": 1.06809378, + "epoch": 0.6288957291265872, + "flos": 525718605312.0, + "grad_norm": 0.059838942630291256, + "language_loss": 0.82543945, + "learning_rate": 0.0003197209854655494, + "loss": 0.83638787, + "num_input_tokens_seen": 272193008, + "router_z_loss_mlp": 0.26757812, + "step": 3269, + "time_per_iteration": 2.6375179290771484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099916, + "balance_loss_mlp": 1.07439375, + "epoch": 0.6290881108118507, + "flos": 603722294784.0, + "grad_norm": 0.061513094819297384, + "language_loss": 0.74642974, + "learning_rate": 0.0003194304335074371, + "loss": 0.75742888, + "num_input_tokens_seen": 272275328, + "router_z_loss_mlp": 0.25537109, + "step": 3270, + "time_per_iteration": 2.8767266273498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093664, + "balance_loss_mlp": 1.06736612, + "epoch": 0.6292804924971143, + "flos": 437675830272.0, + "grad_norm": 0.08816092137491774, + "language_loss": 0.8863402, + "learning_rate": 0.0003191399516526475, + "loss": 0.89727688, + "num_input_tokens_seen": 272339328, + "router_z_loss_mlp": 0.26342773, + "step": 3271, + "time_per_iteration": 2.4882290363311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103501, + "balance_loss_mlp": 1.07775187, + "epoch": 0.6294728741823779, + "flos": 606662659584.0, + "grad_norm": 0.05301391071022918, + "language_loss": 0.80040759, + "learning_rate": 0.0003188495400139559, + "loss": 0.81144261, + "num_input_tokens_seen": 272416336, + "router_z_loss_mlp": 0.25732422, + "step": 3272, + "time_per_iteration": 2.755535364151001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109861, + "balance_loss_mlp": 1.0714066, + "epoch": 0.6296652558676414, + "flos": 701529942528.0, + "grad_norm": 0.06865914840158399, + "language_loss": 0.84610647, + "learning_rate": 0.00031855919870411013, + "loss": 0.85709262, + "num_input_tokens_seen": 272490368, + "router_z_loss_mlp": 0.27246094, + "step": 3273, + "time_per_iteration": 2.8569116592407227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093778, + "balance_loss_mlp": 1.06794524, + "epoch": 0.6298576375529049, + "flos": 523909969920.0, + "grad_norm": 0.05727346843797417, + "language_loss": 0.84962982, + "learning_rate": 0.0003182689278358305, + "loss": 0.86056757, + "num_input_tokens_seen": 272562992, + "router_z_loss_mlp": 0.25866699, + "step": 3274, + "time_per_iteration": 2.690037727355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104729, + "balance_loss_mlp": 1.07783532, + "epoch": 0.6300500192381685, + "flos": 475963693056.0, + "grad_norm": 0.06020653166460469, + "language_loss": 0.80404747, + "learning_rate": 0.0003179787275218105, + "loss": 0.81509471, + "num_input_tokens_seen": 272629456, + "router_z_loss_mlp": 0.26928711, + "step": 3275, + "time_per_iteration": 2.5266408920288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100504, + "balance_loss_mlp": 1.07448089, + "epoch": 0.6302424009234321, + "flos": 520880772096.0, + "grad_norm": 0.052538589715391014, + "language_loss": 0.84480745, + "learning_rate": 0.0003176885978747155, + "loss": 0.85581249, + "num_input_tokens_seen": 272697440, + "router_z_loss_mlp": 0.26037598, + "step": 3276, + "time_per_iteration": 2.639855146408081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097356, + "balance_loss_mlp": 1.07041466, + "epoch": 0.6304347826086957, + "flos": 694596777984.0, + "grad_norm": 0.060305073155881905, + "language_loss": 0.82594693, + "learning_rate": 0.0003173985390071839, + "loss": 0.83692044, + "num_input_tokens_seen": 272774080, + "router_z_loss_mlp": 0.26977539, + "step": 3277, + "time_per_iteration": 2.860373020172119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033944, + "balance_loss_mlp": 1.02755451, + "epoch": 0.6306271642939593, + "flos": 1466858045952.0, + "grad_norm": 0.022211191249075446, + "language_loss": 0.77900457, + "learning_rate": 0.00031710855103182675, + "loss": 0.78934395, + "num_input_tokens_seen": 272998512, + "router_z_loss_mlp": 0.06396484, + "step": 3278, + "time_per_iteration": 4.8053810596466064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109346, + "balance_loss_mlp": 1.06688833, + "epoch": 0.6308195459792227, + "flos": 601740762624.0, + "grad_norm": 0.06392036419346926, + "language_loss": 0.8159709, + "learning_rate": 0.00031681863406122704, + "loss": 0.82690549, + "num_input_tokens_seen": 273074672, + "router_z_loss_mlp": 0.26574707, + "step": 3279, + "time_per_iteration": 2.7899298667907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090056, + "balance_loss_mlp": 1.06425917, + "epoch": 0.6310119276644863, + "flos": 726858178560.0, + "grad_norm": 0.08623088614213353, + "language_loss": 0.85931206, + "learning_rate": 0.00031652878820794087, + "loss": 0.87021261, + "num_input_tokens_seen": 273157904, + "router_z_loss_mlp": 0.25817871, + "step": 3280, + "time_per_iteration": 2.9887900352478027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099524, + "balance_loss_mlp": 1.07296467, + "epoch": 0.6312043093497499, + "flos": 519749042688.0, + "grad_norm": 0.06411033205686746, + "language_loss": 0.85853314, + "learning_rate": 0.00031623901358449627, + "loss": 0.86952841, + "num_input_tokens_seen": 273228160, + "router_z_loss_mlp": 0.26574707, + "step": 3281, + "time_per_iteration": 2.638303756713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088871, + "balance_loss_mlp": 1.06183434, + "epoch": 0.6313966910350135, + "flos": 531191499264.0, + "grad_norm": 0.058317756156366925, + "language_loss": 0.88884354, + "learning_rate": 0.0003159493103033936, + "loss": 0.89973223, + "num_input_tokens_seen": 273295872, + "router_z_loss_mlp": 0.27038574, + "step": 3282, + "time_per_iteration": 2.577678918838501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021333, + "balance_loss_mlp": 1.01494348, + "epoch": 0.631589072720277, + "flos": 1379887529472.0, + "grad_norm": 0.01678733827998209, + "language_loss": 0.79919052, + "learning_rate": 0.00031565967847710564, + "loss": 0.8094039, + "num_input_tokens_seen": 273524320, + "router_z_loss_mlp": 0.06396484, + "step": 3283, + "time_per_iteration": 4.869993209838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086941, + "balance_loss_mlp": 1.06035781, + "epoch": 0.6317814544055406, + "flos": 624677432832.0, + "grad_norm": 0.060116799925982296, + "language_loss": 0.82495177, + "learning_rate": 0.0003153701182180776, + "loss": 0.83582127, + "num_input_tokens_seen": 273598544, + "router_z_loss_mlp": 0.26611328, + "step": 3284, + "time_per_iteration": 2.792370319366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108947, + "balance_loss_mlp": 1.06271982, + "epoch": 0.6319738360908042, + "flos": 498119569920.0, + "grad_norm": 0.05700688218578944, + "language_loss": 0.81939638, + "learning_rate": 0.00031508062963872655, + "loss": 0.83029103, + "num_input_tokens_seen": 273666000, + "router_z_loss_mlp": 0.26757812, + "step": 3285, + "time_per_iteration": 2.5983989238739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080002, + "balance_loss_mlp": 1.05334699, + "epoch": 0.6321662177760677, + "flos": 579760353792.0, + "grad_norm": 0.06791630533273198, + "language_loss": 0.79373753, + "learning_rate": 0.0003147912128514423, + "loss": 0.80453753, + "num_input_tokens_seen": 273742672, + "router_z_loss_mlp": 0.2668457, + "step": 3286, + "time_per_iteration": 2.7027578353881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085262, + "balance_loss_mlp": 1.05848765, + "epoch": 0.6323585994613313, + "flos": 601486373376.0, + "grad_norm": 0.061011344504073056, + "language_loss": 0.87480241, + "learning_rate": 0.0003145018679685859, + "loss": 0.88565505, + "num_input_tokens_seen": 273813984, + "router_z_loss_mlp": 0.26831055, + "step": 3287, + "time_per_iteration": 2.7283802032470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081421, + "balance_loss_mlp": 1.05552864, + "epoch": 0.6325509811465948, + "flos": 528535259136.0, + "grad_norm": 0.05025789787573444, + "language_loss": 0.87796986, + "learning_rate": 0.00031421259510249134, + "loss": 0.88878405, + "num_input_tokens_seen": 273892848, + "router_z_loss_mlp": 0.25927734, + "step": 3288, + "time_per_iteration": 2.879518985748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089164, + "balance_loss_mlp": 1.06193662, + "epoch": 0.6327433628318584, + "flos": 574262866944.0, + "grad_norm": 0.06343698340560998, + "language_loss": 0.81597275, + "learning_rate": 0.00031392339436546414, + "loss": 0.82686442, + "num_input_tokens_seen": 273971696, + "router_z_loss_mlp": 0.27246094, + "step": 3289, + "time_per_iteration": 2.8542826175689697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083038, + "balance_loss_mlp": 1.05521417, + "epoch": 0.632935744517122, + "flos": 517088033280.0, + "grad_norm": 0.06408220142950623, + "language_loss": 0.83260751, + "learning_rate": 0.00031363426586978205, + "loss": 0.84343785, + "num_input_tokens_seen": 274048096, + "router_z_loss_mlp": 0.27832031, + "step": 3290, + "time_per_iteration": 2.79167103767395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075181, + "balance_loss_mlp": 1.04847813, + "epoch": 0.6331281262023856, + "flos": 617462714880.0, + "grad_norm": 0.05376353557308444, + "language_loss": 0.84848088, + "learning_rate": 0.0003133452097276947, + "loss": 0.85923266, + "num_input_tokens_seen": 274122848, + "router_z_loss_mlp": 0.26708984, + "step": 3291, + "time_per_iteration": 2.751204252243042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108266, + "balance_loss_mlp": 1.05583799, + "epoch": 0.633320507887649, + "flos": 592954546176.0, + "grad_norm": 0.07438043439458697, + "language_loss": 0.84737223, + "learning_rate": 0.0003130562260514238, + "loss": 0.85819882, + "num_input_tokens_seen": 274198320, + "router_z_loss_mlp": 0.26831055, + "step": 3292, + "time_per_iteration": 2.7716188430786133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083518, + "balance_loss_mlp": 1.05695808, + "epoch": 0.6335128895729126, + "flos": 582349782528.0, + "grad_norm": 0.050395454055006096, + "language_loss": 0.81929183, + "learning_rate": 0.0003127673149531626, + "loss": 0.830127, + "num_input_tokens_seen": 274274944, + "router_z_loss_mlp": 0.26550293, + "step": 3293, + "time_per_iteration": 2.7863051891326904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084421, + "balance_loss_mlp": 1.05757475, + "epoch": 0.6337052712581762, + "flos": 453036934656.0, + "grad_norm": 0.05747867938132279, + "language_loss": 0.8319236, + "learning_rate": 0.0003124784765450762, + "loss": 0.84276778, + "num_input_tokens_seen": 274342384, + "router_z_loss_mlp": 0.26867676, + "step": 3294, + "time_per_iteration": 2.551786184310913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109352, + "balance_loss_mlp": 1.0665071, + "epoch": 0.6338976529434398, + "flos": 573407921664.0, + "grad_norm": 0.0638400369710873, + "language_loss": 0.80339384, + "learning_rate": 0.0003121897109393017, + "loss": 0.81432903, + "num_input_tokens_seen": 274417568, + "router_z_loss_mlp": 0.27050781, + "step": 3295, + "time_per_iteration": 2.7408554553985596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010907, + "balance_loss_mlp": 1.06406879, + "epoch": 0.6340900346287034, + "flos": 508758838272.0, + "grad_norm": 0.05476078823788279, + "language_loss": 0.89262557, + "learning_rate": 0.0003119010182479481, + "loss": 0.90353251, + "num_input_tokens_seen": 274488960, + "router_z_loss_mlp": 0.26623535, + "step": 3296, + "time_per_iteration": 2.658127784729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088039, + "balance_loss_mlp": 1.06214714, + "epoch": 0.6342824163139669, + "flos": 479746520064.0, + "grad_norm": 0.062377346698915814, + "language_loss": 0.82762587, + "learning_rate": 0.00031161239858309563, + "loss": 0.83850628, + "num_input_tokens_seen": 274556880, + "router_z_loss_mlp": 0.25915527, + "step": 3297, + "time_per_iteration": 2.5747482776641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092867, + "balance_loss_mlp": 1.06669998, + "epoch": 0.6344747979992305, + "flos": 572031714816.0, + "grad_norm": 0.0650323770737515, + "language_loss": 0.83421898, + "learning_rate": 0.0003113238520567964, + "loss": 0.84514761, + "num_input_tokens_seen": 274624944, + "router_z_loss_mlp": 0.26208496, + "step": 3298, + "time_per_iteration": 2.6627304553985596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089642, + "balance_loss_mlp": 1.06351149, + "epoch": 0.634667179684494, + "flos": 605911601664.0, + "grad_norm": 0.06322814562663621, + "language_loss": 0.81827015, + "learning_rate": 0.00031103537878107403, + "loss": 0.82916659, + "num_input_tokens_seen": 274695152, + "router_z_loss_mlp": 0.26147461, + "step": 3299, + "time_per_iteration": 2.7386014461517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091865, + "balance_loss_mlp": 1.06578207, + "epoch": 0.6348595613697576, + "flos": 646944537600.0, + "grad_norm": 0.11045697323578996, + "language_loss": 0.80332845, + "learning_rate": 0.0003107469788679238, + "loss": 0.81424707, + "num_input_tokens_seen": 274767840, + "router_z_loss_mlp": 0.26086426, + "step": 3300, + "time_per_iteration": 2.7655692100524902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084258, + "balance_loss_mlp": 1.05724525, + "epoch": 0.6350519430550212, + "flos": 639074935296.0, + "grad_norm": 0.06273525226286222, + "language_loss": 0.8685059, + "learning_rate": 0.00031045865242931267, + "loss": 0.8793484, + "num_input_tokens_seen": 274839312, + "router_z_loss_mlp": 0.27026367, + "step": 3301, + "time_per_iteration": 2.8187057971954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092787, + "balance_loss_mlp": 1.06582153, + "epoch": 0.6352443247402847, + "flos": 686437908480.0, + "grad_norm": 0.06022790921544637, + "language_loss": 0.82959229, + "learning_rate": 0.00031017039957717877, + "loss": 0.84052014, + "num_input_tokens_seen": 274922704, + "router_z_loss_mlp": 0.27001953, + "step": 3302, + "time_per_iteration": 2.994527578353882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088849, + "balance_loss_mlp": 1.0623126, + "epoch": 0.6354367064255483, + "flos": 559442847744.0, + "grad_norm": 0.2662546903518702, + "language_loss": 0.8874619, + "learning_rate": 0.0003098822204234318, + "loss": 0.89835036, + "num_input_tokens_seen": 274992848, + "router_z_loss_mlp": 0.265625, + "step": 3303, + "time_per_iteration": 2.6759462356567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086338, + "balance_loss_mlp": 1.06104219, + "epoch": 0.6356290881108119, + "flos": 979487520768.0, + "grad_norm": 0.06306835331817585, + "language_loss": 0.87812388, + "learning_rate": 0.00030959411507995273, + "loss": 0.88898724, + "num_input_tokens_seen": 275071456, + "router_z_loss_mlp": 0.25317383, + "step": 3304, + "time_per_iteration": 3.2179057598114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089067, + "balance_loss_mlp": 1.06150627, + "epoch": 0.6358214697960755, + "flos": 528278298624.0, + "grad_norm": 0.09855035049223494, + "language_loss": 0.81458223, + "learning_rate": 0.00030930608365859407, + "loss": 0.82547283, + "num_input_tokens_seen": 275140512, + "router_z_loss_mlp": 0.27563477, + "step": 3305, + "time_per_iteration": 2.743131399154663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093486, + "balance_loss_mlp": 1.06724787, + "epoch": 0.6360138514813389, + "flos": 516811249152.0, + "grad_norm": 0.08448546978670643, + "language_loss": 0.87924969, + "learning_rate": 0.00030901812627117943, + "loss": 0.89018464, + "num_input_tokens_seen": 275210896, + "router_z_loss_mlp": 0.26257324, + "step": 3306, + "time_per_iteration": 2.6397995948791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090032, + "balance_loss_mlp": 1.06258953, + "epoch": 0.6362062331666025, + "flos": 466525163520.0, + "grad_norm": 0.06217165595181868, + "language_loss": 0.85291284, + "learning_rate": 0.000308730243029504, + "loss": 0.86381316, + "num_input_tokens_seen": 275279888, + "router_z_loss_mlp": 0.27416992, + "step": 3307, + "time_per_iteration": 2.604104995727539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091578, + "balance_loss_mlp": 1.06420732, + "epoch": 0.6363986148518661, + "flos": 549720193536.0, + "grad_norm": 0.05998324584382658, + "language_loss": 0.79413563, + "learning_rate": 0.0003084424340453339, + "loss": 0.80505145, + "num_input_tokens_seen": 275357056, + "router_z_loss_mlp": 0.27392578, + "step": 3308, + "time_per_iteration": 2.808955192565918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093812, + "balance_loss_mlp": 1.06555986, + "epoch": 0.6365909965371297, + "flos": 583049083392.0, + "grad_norm": 0.06232682903729, + "language_loss": 0.82260096, + "learning_rate": 0.0003081546994304064, + "loss": 0.83353913, + "num_input_tokens_seen": 275428240, + "router_z_loss_mlp": 0.28222656, + "step": 3309, + "time_per_iteration": 2.786863327026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090344, + "balance_loss_mlp": 1.06326008, + "epoch": 0.6367833782223933, + "flos": 531255739392.0, + "grad_norm": 0.059865329496528966, + "language_loss": 0.82539266, + "learning_rate": 0.0003078670392964298, + "loss": 0.83629608, + "num_input_tokens_seen": 275497568, + "router_z_loss_mlp": 0.27148438, + "step": 3310, + "time_per_iteration": 2.5970752239227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096243, + "balance_loss_mlp": 1.06832409, + "epoch": 0.6369757599076568, + "flos": 569506526208.0, + "grad_norm": 0.060559947779739796, + "language_loss": 0.82883835, + "learning_rate": 0.00030757945375508406, + "loss": 0.83980078, + "num_input_tokens_seen": 275569616, + "router_z_loss_mlp": 0.27929688, + "step": 3311, + "time_per_iteration": 2.6342813968658447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102084, + "balance_loss_mlp": 1.07375956, + "epoch": 0.6371681415929203, + "flos": 539957892096.0, + "grad_norm": 0.06259292774484726, + "language_loss": 0.81409383, + "learning_rate": 0.00030729194291801944, + "loss": 0.82511473, + "num_input_tokens_seen": 275641408, + "router_z_loss_mlp": 0.28283691, + "step": 3312, + "time_per_iteration": 2.6879191398620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102141, + "balance_loss_mlp": 1.07455623, + "epoch": 0.6373605232781839, + "flos": 483566423040.0, + "grad_norm": 0.07257562907286343, + "language_loss": 0.77341402, + "learning_rate": 0.00030700450689685787, + "loss": 0.78443545, + "num_input_tokens_seen": 275706608, + "router_z_loss_mlp": 0.27636719, + "step": 3313, + "time_per_iteration": 2.5379741191864014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093269, + "balance_loss_mlp": 1.06732869, + "epoch": 0.6375529049634475, + "flos": 578581636608.0, + "grad_norm": 0.05810286920956277, + "language_loss": 0.85484838, + "learning_rate": 0.00030671714580319186, + "loss": 0.86578107, + "num_input_tokens_seen": 275785952, + "router_z_loss_mlp": 0.25952148, + "step": 3314, + "time_per_iteration": 2.800306797027588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095409, + "balance_loss_mlp": 1.06806278, + "epoch": 0.637745286648711, + "flos": 682257530880.0, + "grad_norm": 0.07119187429341393, + "language_loss": 0.83300906, + "learning_rate": 0.0003064298597485846, + "loss": 0.84396315, + "num_input_tokens_seen": 275866240, + "router_z_loss_mlp": 0.27392578, + "step": 3315, + "time_per_iteration": 2.822500467300415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089294, + "balance_loss_mlp": 1.06213832, + "epoch": 0.6379376683339746, + "flos": 504637558272.0, + "grad_norm": 0.07085511575360878, + "language_loss": 0.84058923, + "learning_rate": 0.00030614264884457054, + "loss": 0.85148215, + "num_input_tokens_seen": 275936176, + "router_z_loss_mlp": 0.27197266, + "step": 3316, + "time_per_iteration": 2.670797348022461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090907, + "balance_loss_mlp": 1.06443071, + "epoch": 0.6381300500192382, + "flos": 502020965376.0, + "grad_norm": 0.0775113841286029, + "language_loss": 0.77307498, + "learning_rate": 0.000305855513202655, + "loss": 0.78398407, + "num_input_tokens_seen": 276004608, + "router_z_loss_mlp": 0.26477051, + "step": 3317, + "time_per_iteration": 2.585374355316162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088816, + "balance_loss_mlp": 1.06235111, + "epoch": 0.6383224317045018, + "flos": 400489961472.0, + "grad_norm": 0.06790961033266373, + "language_loss": 0.77846622, + "learning_rate": 0.0003055684529343138, + "loss": 0.78935432, + "num_input_tokens_seen": 276066688, + "router_z_loss_mlp": 0.26501465, + "step": 3318, + "time_per_iteration": 2.4445385932922363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085331, + "balance_loss_mlp": 1.0597012, + "epoch": 0.6385148133897653, + "flos": 499377208320.0, + "grad_norm": 0.06232442900596772, + "language_loss": 0.78594166, + "learning_rate": 0.00030528146815099374, + "loss": 0.79679501, + "num_input_tokens_seen": 276140000, + "router_z_loss_mlp": 0.25634766, + "step": 3319, + "time_per_iteration": 2.654273509979248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085377, + "balance_loss_mlp": 1.06078434, + "epoch": 0.6387071950750288, + "flos": 527665632768.0, + "grad_norm": 0.06473309855040241, + "language_loss": 0.72311449, + "learning_rate": 0.00030499455896411203, + "loss": 0.73396826, + "num_input_tokens_seen": 276209840, + "router_z_loss_mlp": 0.24597168, + "step": 3320, + "time_per_iteration": 2.60524320602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043431, + "balance_loss_mlp": 1.03561127, + "epoch": 0.6388995767602924, + "flos": 1455979069440.0, + "grad_norm": 0.03712674177895302, + "language_loss": 0.76300812, + "learning_rate": 0.0003047077254850568, + "loss": 0.77344245, + "num_input_tokens_seen": 276444784, + "router_z_loss_mlp": 0.078125, + "step": 3321, + "time_per_iteration": 4.941630601882935 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091368, + "balance_loss_mlp": 1.06535614, + "epoch": 0.639091958445556, + "flos": 603895191552.0, + "grad_norm": 0.06705543469002004, + "language_loss": 0.7662977, + "learning_rate": 0.0003044209678251865, + "loss": 0.77721143, + "num_input_tokens_seen": 276522768, + "router_z_loss_mlp": 0.26013184, + "step": 3322, + "time_per_iteration": 2.877448320388794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091614, + "balance_loss_mlp": 1.06602025, + "epoch": 0.6392843401308196, + "flos": 584516694528.0, + "grad_norm": 0.07788084148223126, + "language_loss": 0.84920502, + "learning_rate": 0.0003041342860958306, + "loss": 0.86012113, + "num_input_tokens_seen": 276597104, + "router_z_loss_mlp": 0.25610352, + "step": 3323, + "time_per_iteration": 2.8169727325439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093622, + "balance_loss_mlp": 1.06809974, + "epoch": 0.6394767218160831, + "flos": 514681413120.0, + "grad_norm": 0.09386152906491808, + "language_loss": 0.91524851, + "learning_rate": 0.00030384768040828857, + "loss": 0.92618477, + "num_input_tokens_seen": 276670256, + "router_z_loss_mlp": 0.25537109, + "step": 3324, + "time_per_iteration": 2.6935789585113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087009, + "balance_loss_mlp": 1.06294096, + "epoch": 0.6396691035013466, + "flos": 541732022784.0, + "grad_norm": 0.06024043560940697, + "language_loss": 0.85838866, + "learning_rate": 0.00030356115087383094, + "loss": 0.86925876, + "num_input_tokens_seen": 276737680, + "router_z_loss_mlp": 0.24047852, + "step": 3325, + "time_per_iteration": 2.645054340362549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108735, + "balance_loss_mlp": 1.06102872, + "epoch": 0.6398614851866102, + "flos": 525535796736.0, + "grad_norm": 0.054064191473810044, + "language_loss": 0.84931785, + "learning_rate": 0.00030327469760369803, + "loss": 0.86019135, + "num_input_tokens_seen": 276803808, + "router_z_loss_mlp": 0.26367188, + "step": 3326, + "time_per_iteration": 2.563873767852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085961, + "balance_loss_mlp": 1.05992579, + "epoch": 0.6400538668718738, + "flos": 622989937152.0, + "grad_norm": 0.06028685713056784, + "language_loss": 0.85342407, + "learning_rate": 0.0003029883207091009, + "loss": 0.86428368, + "num_input_tokens_seen": 276874752, + "router_z_loss_mlp": 0.26074219, + "step": 3327, + "time_per_iteration": 2.705343723297119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079972, + "balance_loss_mlp": 1.05283976, + "epoch": 0.6402462485571374, + "flos": 503367436800.0, + "grad_norm": 0.06637165202459654, + "language_loss": 0.78691089, + "learning_rate": 0.00030270202030122095, + "loss": 0.7977106, + "num_input_tokens_seen": 276947200, + "router_z_loss_mlp": 0.27172852, + "step": 3328, + "time_per_iteration": 2.708845853805542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081887, + "balance_loss_mlp": 1.05516016, + "epoch": 0.6404386302424009, + "flos": 819247260672.0, + "grad_norm": 0.06780948867889915, + "language_loss": 0.86619353, + "learning_rate": 0.00030241579649121, + "loss": 0.87701237, + "num_input_tokens_seen": 277025712, + "router_z_loss_mlp": 0.26782227, + "step": 3329, + "time_per_iteration": 2.9923856258392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084819, + "balance_loss_mlp": 1.05859339, + "epoch": 0.6406310119276645, + "flos": 471812677632.0, + "grad_norm": 0.052278869794255514, + "language_loss": 0.79563975, + "learning_rate": 0.00030212964939018994, + "loss": 0.80648792, + "num_input_tokens_seen": 277091264, + "router_z_loss_mlp": 0.26220703, + "step": 3330, + "time_per_iteration": 2.5270252227783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091276, + "balance_loss_mlp": 1.06483507, + "epoch": 0.6408233936129281, + "flos": 425583631872.0, + "grad_norm": 0.06193541615368343, + "language_loss": 0.85849935, + "learning_rate": 0.0003018435791092527, + "loss": 0.86941212, + "num_input_tokens_seen": 277154608, + "router_z_loss_mlp": 0.26489258, + "step": 3331, + "time_per_iteration": 2.4754018783569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081779, + "balance_loss_mlp": 1.05531454, + "epoch": 0.6410157752981916, + "flos": 549784433664.0, + "grad_norm": 0.08536903731672153, + "language_loss": 0.81342864, + "learning_rate": 0.00030155758575946083, + "loss": 0.82424641, + "num_input_tokens_seen": 277222176, + "router_z_loss_mlp": 0.26489258, + "step": 3332, + "time_per_iteration": 2.626554489135742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087465, + "balance_loss_mlp": 1.06135845, + "epoch": 0.6412081569834551, + "flos": 475899452928.0, + "grad_norm": 0.05880203513690982, + "language_loss": 0.83905303, + "learning_rate": 0.0003012716694518467, + "loss": 0.84992766, + "num_input_tokens_seen": 277289600, + "router_z_loss_mlp": 0.26135254, + "step": 3333, + "time_per_iteration": 2.563870906829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088918, + "balance_loss_mlp": 1.06233454, + "epoch": 0.6414005386687187, + "flos": 540921494016.0, + "grad_norm": 0.060655550998304664, + "language_loss": 0.85066408, + "learning_rate": 0.000300985830297413, + "loss": 0.86155331, + "num_input_tokens_seen": 277362784, + "router_z_loss_mlp": 0.26635742, + "step": 3334, + "time_per_iteration": 2.720207691192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085977, + "balance_loss_mlp": 1.05846334, + "epoch": 0.6415929203539823, + "flos": 1041317379072.0, + "grad_norm": 0.0660382374422698, + "language_loss": 0.87618732, + "learning_rate": 0.00030070006840713205, + "loss": 0.88704705, + "num_input_tokens_seen": 277449728, + "router_z_loss_mlp": 0.27563477, + "step": 3335, + "time_per_iteration": 3.3882405757904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086202, + "balance_loss_mlp": 1.06003511, + "epoch": 0.6417853020392459, + "flos": 648337996800.0, + "grad_norm": 0.05326551396050189, + "language_loss": 0.738437, + "learning_rate": 0.000300414383891947, + "loss": 0.74929905, + "num_input_tokens_seen": 277527552, + "router_z_loss_mlp": 0.26184082, + "step": 3336, + "time_per_iteration": 2.841377019882202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089325, + "balance_loss_mlp": 1.06317008, + "epoch": 0.6419776837245095, + "flos": 500899147776.0, + "grad_norm": 0.05652358101135248, + "language_loss": 0.88883501, + "learning_rate": 0.00030012877686276973, + "loss": 0.89972824, + "num_input_tokens_seen": 277603568, + "router_z_loss_mlp": 0.26196289, + "step": 3337, + "time_per_iteration": 2.729287624359131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109448, + "balance_loss_mlp": 1.06825364, + "epoch": 0.642170065409773, + "flos": 620620392960.0, + "grad_norm": 0.05602708574683237, + "language_loss": 0.87052727, + "learning_rate": 0.0002998432474304832, + "loss": 0.88147211, + "num_input_tokens_seen": 277679696, + "router_z_loss_mlp": 0.26269531, + "step": 3338, + "time_per_iteration": 2.763936996459961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033085, + "balance_loss_mlp": 1.02664769, + "epoch": 0.6423624470950365, + "flos": 1423539629568.0, + "grad_norm": 0.022262190661506177, + "language_loss": 0.79237342, + "learning_rate": 0.0002995577957059395, + "loss": 0.80270433, + "num_input_tokens_seen": 277913056, + "router_z_loss_mlp": 0.06445312, + "step": 3339, + "time_per_iteration": 4.899634838104248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085698, + "balance_loss_mlp": 1.06067634, + "epoch": 0.6425548287803001, + "flos": 562353477120.0, + "grad_norm": 0.05329063171196326, + "language_loss": 0.88739842, + "learning_rate": 0.00029927242179996107, + "loss": 0.89825541, + "num_input_tokens_seen": 277983168, + "router_z_loss_mlp": 0.25036621, + "step": 3340, + "time_per_iteration": 2.6731433868408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084705, + "balance_loss_mlp": 1.05887282, + "epoch": 0.6427472104655637, + "flos": 585443220480.0, + "grad_norm": 0.05323225781899137, + "language_loss": 0.83480287, + "learning_rate": 0.0002989871258233398, + "loss": 0.84564984, + "num_input_tokens_seen": 278057600, + "router_z_loss_mlp": 0.25830078, + "step": 3341, + "time_per_iteration": 2.7728755474090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092018, + "balance_loss_mlp": 1.06558967, + "epoch": 0.6429395921508272, + "flos": 404282700288.0, + "grad_norm": 0.07706425942828801, + "language_loss": 0.82514536, + "learning_rate": 0.0002987019078868373, + "loss": 0.83606553, + "num_input_tokens_seen": 278119232, + "router_z_loss_mlp": 0.26477051, + "step": 3342, + "time_per_iteration": 2.4401304721832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087083, + "balance_loss_mlp": 1.06178701, + "epoch": 0.6431319738360908, + "flos": 548783755776.0, + "grad_norm": 0.05844856820656981, + "language_loss": 0.81969512, + "learning_rate": 0.00029841676810118484, + "loss": 0.83056593, + "num_input_tokens_seen": 278187456, + "router_z_loss_mlp": 0.25317383, + "step": 3343, + "time_per_iteration": 2.662538766860962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081725, + "balance_loss_mlp": 1.05664337, + "epoch": 0.6433243555213544, + "flos": 793375368192.0, + "grad_norm": 0.059827459557400715, + "language_loss": 0.8744089, + "learning_rate": 0.0002981317065770839, + "loss": 0.88522613, + "num_input_tokens_seen": 278262176, + "router_z_loss_mlp": 0.25097656, + "step": 3344, + "time_per_iteration": 3.0547289848327637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084317, + "balance_loss_mlp": 1.05733991, + "epoch": 0.643516737206618, + "flos": 583031831040.0, + "grad_norm": 0.06660327590373825, + "language_loss": 0.80995148, + "learning_rate": 0.00029784672342520493, + "loss": 0.8207947, + "num_input_tokens_seen": 278328816, + "router_z_loss_mlp": 0.2701416, + "step": 3345, + "time_per_iteration": 2.665701389312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086547, + "balance_loss_mlp": 1.05967772, + "epoch": 0.6437091188918815, + "flos": 518750936064.0, + "grad_norm": 0.06646117456198827, + "language_loss": 0.83675933, + "learning_rate": 0.00029756181875618834, + "loss": 0.84762478, + "num_input_tokens_seen": 278395824, + "router_z_loss_mlp": 0.26904297, + "step": 3346, + "time_per_iteration": 2.5859789848327637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087095, + "balance_loss_mlp": 1.06036818, + "epoch": 0.643901500577145, + "flos": 384946048512.0, + "grad_norm": 0.0635179791741207, + "language_loss": 0.83513415, + "learning_rate": 0.0002972769926806439, + "loss": 0.84600508, + "num_input_tokens_seen": 278457696, + "router_z_loss_mlp": 0.26757812, + "step": 3347, + "time_per_iteration": 2.4656190872192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087894, + "balance_loss_mlp": 1.06159616, + "epoch": 0.6440938822624086, + "flos": 483722067456.0, + "grad_norm": 0.0627778117475219, + "language_loss": 0.89043599, + "learning_rate": 0.0002969922453091508, + "loss": 0.90131485, + "num_input_tokens_seen": 278526992, + "router_z_loss_mlp": 0.26342773, + "step": 3348, + "time_per_iteration": 2.5913443565368652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084136, + "balance_loss_mlp": 1.05721855, + "epoch": 0.6442862639476722, + "flos": 540469241856.0, + "grad_norm": 0.05415378993081624, + "language_loss": 0.85013533, + "learning_rate": 0.00029670757675225777, + "loss": 0.8609767, + "num_input_tokens_seen": 278601120, + "router_z_loss_mlp": 0.26953125, + "step": 3349, + "time_per_iteration": 2.739000082015991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085102, + "balance_loss_mlp": 1.05906665, + "epoch": 0.6444786456329358, + "flos": 526912003584.0, + "grad_norm": 0.06396799690402781, + "language_loss": 0.79375887, + "learning_rate": 0.0002964229871204831, + "loss": 0.80460995, + "num_input_tokens_seen": 278668208, + "router_z_loss_mlp": 0.26049805, + "step": 3350, + "time_per_iteration": 2.6291356086730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079691, + "balance_loss_mlp": 1.0546335, + "epoch": 0.6446710273181993, + "flos": 697892848128.0, + "grad_norm": 0.05949012862270097, + "language_loss": 0.83774936, + "learning_rate": 0.00029613847652431403, + "loss": 0.84854627, + "num_input_tokens_seen": 278742832, + "router_z_loss_mlp": 0.25073242, + "step": 3351, + "time_per_iteration": 2.839716672897339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077701, + "balance_loss_mlp": 1.05226183, + "epoch": 0.6448634090034628, + "flos": 625023226368.0, + "grad_norm": 0.056904070769954795, + "language_loss": 0.79438174, + "learning_rate": 0.0002958540450742078, + "loss": 0.80515873, + "num_input_tokens_seen": 278829744, + "router_z_loss_mlp": 0.2545166, + "step": 3352, + "time_per_iteration": 2.913639545440674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077873, + "balance_loss_mlp": 1.05242181, + "epoch": 0.6450557906887264, + "flos": 600950057472.0, + "grad_norm": 0.058859243742432434, + "language_loss": 0.77210569, + "learning_rate": 0.0002955696928805901, + "loss": 0.78288442, + "num_input_tokens_seen": 278908592, + "router_z_loss_mlp": 0.2545166, + "step": 3353, + "time_per_iteration": 2.923433780670166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081125, + "balance_loss_mlp": 1.05607951, + "epoch": 0.64524817237399, + "flos": 646200820224.0, + "grad_norm": 0.061599648682054316, + "language_loss": 0.8637355, + "learning_rate": 0.0002952854200538563, + "loss": 0.87454677, + "num_input_tokens_seen": 278986960, + "router_z_loss_mlp": 0.25061035, + "step": 3354, + "time_per_iteration": 2.8201682567596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082059, + "balance_loss_mlp": 1.05513, + "epoch": 0.6454405540592536, + "flos": 473411340288.0, + "grad_norm": 0.055453256805671876, + "language_loss": 0.82204401, + "learning_rate": 0.000295001226704371, + "loss": 0.83286464, + "num_input_tokens_seen": 279054896, + "router_z_loss_mlp": 0.26965332, + "step": 3355, + "time_per_iteration": 2.555814743041992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073686, + "balance_loss_mlp": 1.04755521, + "epoch": 0.6456329357445171, + "flos": 611841517056.0, + "grad_norm": 0.07998397222578815, + "language_loss": 0.83098918, + "learning_rate": 0.00029471711294246783, + "loss": 0.84172606, + "num_input_tokens_seen": 279126816, + "router_z_loss_mlp": 0.26171875, + "step": 3356, + "time_per_iteration": 2.7683768272399902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010734, + "balance_loss_mlp": 1.04853272, + "epoch": 0.6458253174297807, + "flos": 731683901952.0, + "grad_norm": 0.06636958548337468, + "language_loss": 0.82041395, + "learning_rate": 0.0002944330788784494, + "loss": 0.83114803, + "num_input_tokens_seen": 279197552, + "router_z_loss_mlp": 0.24865723, + "step": 3357, + "time_per_iteration": 2.9053289890289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070626, + "balance_loss_mlp": 1.04476953, + "epoch": 0.6460176991150443, + "flos": 570413228544.0, + "grad_norm": 0.05791600567825564, + "language_loss": 0.84893548, + "learning_rate": 0.00029414912462258786, + "loss": 0.85964179, + "num_input_tokens_seen": 279275440, + "router_z_loss_mlp": 0.25878906, + "step": 3358, + "time_per_iteration": 2.811368227005005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074683, + "balance_loss_mlp": 1.04814672, + "epoch": 0.6462100808003078, + "flos": 583160311296.0, + "grad_norm": 0.06332395198444683, + "language_loss": 0.81536913, + "learning_rate": 0.00029386525028512366, + "loss": 0.82611591, + "num_input_tokens_seen": 279349168, + "router_z_loss_mlp": 0.265625, + "step": 3359, + "time_per_iteration": 2.7373340129852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074991, + "balance_loss_mlp": 1.04820502, + "epoch": 0.6464024624855714, + "flos": 483919557120.0, + "grad_norm": 0.06353277324280042, + "language_loss": 0.87003738, + "learning_rate": 0.0002935814559762666, + "loss": 0.88078725, + "num_input_tokens_seen": 279427600, + "router_z_loss_mlp": 0.26794434, + "step": 3360, + "time_per_iteration": 2.7775824069976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071101, + "balance_loss_mlp": 1.04590034, + "epoch": 0.6465948441708349, + "flos": 527774289408.0, + "grad_norm": 0.05137930427454231, + "language_loss": 0.79679728, + "learning_rate": 0.0002932977418061957, + "loss": 0.80750829, + "num_input_tokens_seen": 279496608, + "router_z_loss_mlp": 0.25183105, + "step": 3361, + "time_per_iteration": 2.6293880939483643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073227, + "balance_loss_mlp": 1.04677427, + "epoch": 0.6467872258560985, + "flos": 669421615104.0, + "grad_norm": 0.06432809284202623, + "language_loss": 0.80709672, + "learning_rate": 0.00029301410788505833, + "loss": 0.81782901, + "num_input_tokens_seen": 279568448, + "router_z_loss_mlp": 0.26489258, + "step": 3362, + "time_per_iteration": 2.772700071334839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073962, + "balance_loss_mlp": 1.04715228, + "epoch": 0.6469796075413621, + "flos": 432101620224.0, + "grad_norm": 0.06908164950227988, + "language_loss": 0.81278014, + "learning_rate": 0.00029273055432297126, + "loss": 0.82351977, + "num_input_tokens_seen": 279631952, + "router_z_loss_mlp": 0.26782227, + "step": 3363, + "time_per_iteration": 2.479120969772339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068037, + "balance_loss_mlp": 1.04115558, + "epoch": 0.6471719892266257, + "flos": 803750335488.0, + "grad_norm": 0.06524076988807553, + "language_loss": 0.80934191, + "learning_rate": 0.00029244708123001917, + "loss": 0.82002234, + "num_input_tokens_seen": 279706880, + "router_z_loss_mlp": 0.26916504, + "step": 3364, + "time_per_iteration": 2.9441330432891846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068068, + "balance_loss_mlp": 1.04217577, + "epoch": 0.6473643709118891, + "flos": 577208001024.0, + "grad_norm": 0.06372584124812569, + "language_loss": 0.84562182, + "learning_rate": 0.0002921636887162565, + "loss": 0.8563025, + "num_input_tokens_seen": 279778864, + "router_z_loss_mlp": 0.25927734, + "step": 3365, + "time_per_iteration": 2.732980489730835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067865, + "balance_loss_mlp": 1.04277182, + "epoch": 0.6475567525971527, + "flos": 761420113920.0, + "grad_norm": 0.0749500155659675, + "language_loss": 0.83798963, + "learning_rate": 0.00029188037689170595, + "loss": 0.84866834, + "num_input_tokens_seen": 279853328, + "router_z_loss_mlp": 0.25109863, + "step": 3366, + "time_per_iteration": 2.9474096298217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068033, + "balance_loss_mlp": 1.04130602, + "epoch": 0.6477491342824163, + "flos": 843103116288.0, + "grad_norm": 0.06502471406083535, + "language_loss": 0.84043062, + "learning_rate": 0.0002915971458663586, + "loss": 0.85111088, + "num_input_tokens_seen": 279928464, + "router_z_loss_mlp": 0.26782227, + "step": 3367, + "time_per_iteration": 3.0719544887542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069519, + "balance_loss_mlp": 1.04331708, + "epoch": 0.6479415159676799, + "flos": 884820298752.0, + "grad_norm": 0.05257796695915082, + "language_loss": 0.81762713, + "learning_rate": 0.00029131399575017494, + "loss": 0.82832229, + "num_input_tokens_seen": 280015680, + "router_z_loss_mlp": 0.26245117, + "step": 3368, + "time_per_iteration": 3.195772171020508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071807, + "balance_loss_mlp": 1.04481828, + "epoch": 0.6481338976529435, + "flos": 615513116160.0, + "grad_norm": 0.05387315925396133, + "language_loss": 0.86003518, + "learning_rate": 0.0002910309266530836, + "loss": 0.87075323, + "num_input_tokens_seen": 280093904, + "router_z_loss_mlp": 0.27026367, + "step": 3369, + "time_per_iteration": 2.790093421936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075463, + "balance_loss_mlp": 1.04854584, + "epoch": 0.648326279338207, + "flos": 510009136128.0, + "grad_norm": 0.057981542205969905, + "language_loss": 0.85403055, + "learning_rate": 0.0002907479386849814, + "loss": 0.86478519, + "num_input_tokens_seen": 280161584, + "router_z_loss_mlp": 0.26977539, + "step": 3370, + "time_per_iteration": 2.628838062286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074904, + "balance_loss_mlp": 1.04926252, + "epoch": 0.6485186610234706, + "flos": 702498313728.0, + "grad_norm": 0.05712160703015161, + "language_loss": 0.80363882, + "learning_rate": 0.0002904650319557339, + "loss": 0.81438786, + "num_input_tokens_seen": 280248016, + "router_z_loss_mlp": 0.2565918, + "step": 3371, + "time_per_iteration": 2.9755005836486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073278, + "balance_loss_mlp": 1.04574049, + "epoch": 0.6487110427087341, + "flos": 560683233792.0, + "grad_norm": 0.07266117839515142, + "language_loss": 0.81511021, + "learning_rate": 0.0002901822065751758, + "loss": 0.82584298, + "num_input_tokens_seen": 280319024, + "router_z_loss_mlp": 0.27539062, + "step": 3372, + "time_per_iteration": 2.646740198135376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079407, + "balance_loss_mlp": 1.05310917, + "epoch": 0.6489034243939977, + "flos": 680100530688.0, + "grad_norm": 0.060084455172548096, + "language_loss": 0.85821176, + "learning_rate": 0.0002898994626531093, + "loss": 0.8690058, + "num_input_tokens_seen": 280393200, + "router_z_loss_mlp": 0.26318359, + "step": 3373, + "time_per_iteration": 2.8307554721832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079841, + "balance_loss_mlp": 1.05368662, + "epoch": 0.6490958060792612, + "flos": 474412018176.0, + "grad_norm": 0.06412256257505489, + "language_loss": 0.88422716, + "learning_rate": 0.00028961680029930526, + "loss": 0.89502561, + "num_input_tokens_seen": 280456944, + "router_z_loss_mlp": 0.26196289, + "step": 3374, + "time_per_iteration": 2.5427072048187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078278, + "balance_loss_mlp": 1.05246949, + "epoch": 0.6492881877645248, + "flos": 588850518528.0, + "grad_norm": 0.05984187516424017, + "language_loss": 0.77025837, + "learning_rate": 0.00028933421962350317, + "loss": 0.78104115, + "num_input_tokens_seen": 280534352, + "router_z_loss_mlp": 0.25830078, + "step": 3375, + "time_per_iteration": 2.732372999191284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076768, + "balance_loss_mlp": 1.05101824, + "epoch": 0.6494805694497884, + "flos": 642427905024.0, + "grad_norm": 0.06098588343511283, + "language_loss": 0.8395189, + "learning_rate": 0.0002890517207354104, + "loss": 0.8502866, + "num_input_tokens_seen": 280608912, + "router_z_loss_mlp": 0.2578125, + "step": 3376, + "time_per_iteration": 2.8559377193450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108488, + "balance_loss_mlp": 1.05872583, + "epoch": 0.649672951135052, + "flos": 531806736384.0, + "grad_norm": 0.061051185041057866, + "language_loss": 0.81743991, + "learning_rate": 0.0002887693037447029, + "loss": 0.82828867, + "num_input_tokens_seen": 280678848, + "router_z_loss_mlp": 0.26196289, + "step": 3377, + "time_per_iteration": 2.5842373371124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081133, + "balance_loss_mlp": 1.0550499, + "epoch": 0.6498653328203156, + "flos": 547387725312.0, + "grad_norm": 0.06328579672333946, + "language_loss": 0.82031405, + "learning_rate": 0.00028848696876102443, + "loss": 0.83112538, + "num_input_tokens_seen": 280750224, + "router_z_loss_mlp": 0.26086426, + "step": 3378, + "time_per_iteration": 2.6148552894592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085126, + "balance_loss_mlp": 1.05910289, + "epoch": 0.650057714505579, + "flos": 462228415488.0, + "grad_norm": 0.06296964534395977, + "language_loss": 0.83665496, + "learning_rate": 0.00028820471589398723, + "loss": 0.84750628, + "num_input_tokens_seen": 280817488, + "router_z_loss_mlp": 0.26062012, + "step": 3379, + "time_per_iteration": 2.5984256267547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087258, + "balance_loss_mlp": 1.06153309, + "epoch": 0.6502500961908426, + "flos": 510172121088.0, + "grad_norm": 0.06986995614305117, + "language_loss": 0.77549016, + "learning_rate": 0.00028792254525317196, + "loss": 0.78636277, + "num_input_tokens_seen": 280887440, + "router_z_loss_mlp": 0.25732422, + "step": 3380, + "time_per_iteration": 2.6670660972595215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091352, + "balance_loss_mlp": 1.06519723, + "epoch": 0.6504424778761062, + "flos": 579827165184.0, + "grad_norm": 0.07163565487878029, + "language_loss": 0.81605381, + "learning_rate": 0.00028764045694812645, + "loss": 0.82696736, + "num_input_tokens_seen": 280959072, + "router_z_loss_mlp": 0.26159668, + "step": 3381, + "time_per_iteration": 2.7534923553466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108911, + "balance_loss_mlp": 1.06213295, + "epoch": 0.6506348595613698, + "flos": 519457577472.0, + "grad_norm": 0.07829383117608732, + "language_loss": 0.76753044, + "learning_rate": 0.0002873584510883671, + "loss": 0.77842152, + "num_input_tokens_seen": 281025376, + "router_z_loss_mlp": 0.26989746, + "step": 3382, + "time_per_iteration": 2.5738234519958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089393, + "balance_loss_mlp": 1.0616889, + "epoch": 0.6508272412466333, + "flos": 510310513152.0, + "grad_norm": 0.05561178380226362, + "language_loss": 0.86494762, + "learning_rate": 0.0002870765277833788, + "loss": 0.87584156, + "num_input_tokens_seen": 281097616, + "router_z_loss_mlp": 0.27709961, + "step": 3383, + "time_per_iteration": 2.6669375896453857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080554, + "balance_loss_mlp": 1.05552006, + "epoch": 0.6510196229318969, + "flos": 625623782400.0, + "grad_norm": 0.06569130604090773, + "language_loss": 0.80749148, + "learning_rate": 0.00028679468714261347, + "loss": 0.81829703, + "num_input_tokens_seen": 281170192, + "router_z_loss_mlp": 0.25048828, + "step": 3384, + "time_per_iteration": 2.7443134784698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078779, + "balance_loss_mlp": 1.05354261, + "epoch": 0.6512120046171604, + "flos": 474696142848.0, + "grad_norm": 0.06683297733149338, + "language_loss": 0.76978695, + "learning_rate": 0.0002865129292754918, + "loss": 0.7805748, + "num_input_tokens_seen": 281238832, + "router_z_loss_mlp": 0.25256348, + "step": 3385, + "time_per_iteration": 2.553633213043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077167, + "balance_loss_mlp": 1.05206108, + "epoch": 0.651404386302424, + "flos": 551854798848.0, + "grad_norm": 0.07067523232573529, + "language_loss": 0.81812489, + "learning_rate": 0.00028623125429140105, + "loss": 0.82889658, + "num_input_tokens_seen": 281319472, + "router_z_loss_mlp": 0.25097656, + "step": 3386, + "time_per_iteration": 2.8174142837524414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081783, + "balance_loss_mlp": 1.05555665, + "epoch": 0.6515967679876876, + "flos": 523311985152.0, + "grad_norm": 0.06558978791095729, + "language_loss": 0.8706044, + "learning_rate": 0.00028594966229969785, + "loss": 0.88142228, + "num_input_tokens_seen": 281391168, + "router_z_loss_mlp": 0.2623291, + "step": 3387, + "time_per_iteration": 2.680281639099121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078521, + "balance_loss_mlp": 1.05267668, + "epoch": 0.6517891496729511, + "flos": 573874854912.0, + "grad_norm": 0.06492635522068706, + "language_loss": 0.81586945, + "learning_rate": 0.00028566815340970577, + "loss": 0.82665467, + "num_input_tokens_seen": 281465664, + "router_z_loss_mlp": 0.25878906, + "step": 3388, + "time_per_iteration": 2.732487916946411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075271, + "balance_loss_mlp": 1.05048704, + "epoch": 0.6519815313582147, + "flos": 555926893056.0, + "grad_norm": 0.06387919000258871, + "language_loss": 0.81219792, + "learning_rate": 0.0002853867277307162, + "loss": 0.8229506, + "num_input_tokens_seen": 281532928, + "router_z_loss_mlp": 0.2479248, + "step": 3389, + "time_per_iteration": 2.6404130458831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081113, + "balance_loss_mlp": 1.05424297, + "epoch": 0.6521739130434783, + "flos": 480487666176.0, + "grad_norm": 0.06082372499882378, + "language_loss": 0.82760382, + "learning_rate": 0.00028510538537198824, + "loss": 0.83841497, + "num_input_tokens_seen": 281601680, + "router_z_loss_mlp": 0.26928711, + "step": 3390, + "time_per_iteration": 2.5929770469665527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079433, + "balance_loss_mlp": 1.05408919, + "epoch": 0.6523662947287419, + "flos": 665707797504.0, + "grad_norm": 0.055684590981588886, + "language_loss": 0.86515808, + "learning_rate": 0.00028482412644274867, + "loss": 0.87595236, + "num_input_tokens_seen": 281679488, + "router_z_loss_mlp": 0.25366211, + "step": 3391, + "time_per_iteration": 2.9085311889648438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074445, + "balance_loss_mlp": 1.04809964, + "epoch": 0.6525586764140053, + "flos": 548655275520.0, + "grad_norm": 0.061522898278110257, + "language_loss": 0.74154258, + "learning_rate": 0.00028454295105219207, + "loss": 0.75228703, + "num_input_tokens_seen": 281751056, + "router_z_loss_mlp": 0.26367188, + "step": 3392, + "time_per_iteration": 2.604851245880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075224, + "balance_loss_mlp": 1.05011857, + "epoch": 0.6527510580992689, + "flos": 802900159488.0, + "grad_norm": 0.04678981860923424, + "language_loss": 0.79518068, + "learning_rate": 0.0002842618593094802, + "loss": 0.805933, + "num_input_tokens_seen": 281841008, + "router_z_loss_mlp": 0.25134277, + "step": 3393, + "time_per_iteration": 3.0968527793884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073633, + "balance_loss_mlp": 1.04734683, + "epoch": 0.6529434397845325, + "flos": 671166010368.0, + "grad_norm": 0.08516397934584916, + "language_loss": 0.80839396, + "learning_rate": 0.00028398085132374243, + "loss": 0.8191303, + "num_input_tokens_seen": 281908016, + "router_z_loss_mlp": 0.26306152, + "step": 3394, + "time_per_iteration": 2.802588701248169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071874, + "balance_loss_mlp": 1.04662573, + "epoch": 0.6531358214697961, + "flos": 828409006080.0, + "grad_norm": 0.059849085460161155, + "language_loss": 0.84382617, + "learning_rate": 0.0002836999272040761, + "loss": 0.85454488, + "num_input_tokens_seen": 281989072, + "router_z_loss_mlp": 0.25268555, + "step": 3395, + "time_per_iteration": 3.1209001541137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073974, + "balance_loss_mlp": 1.04781914, + "epoch": 0.6533282031550597, + "flos": 487403578368.0, + "grad_norm": 0.07079508853897194, + "language_loss": 0.84454936, + "learning_rate": 0.00028341908705954575, + "loss": 0.8552891, + "num_input_tokens_seen": 282053152, + "router_z_loss_mlp": 0.26196289, + "step": 3396, + "time_per_iteration": 2.5430474281311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014992, + "balance_loss_mlp": 1.00736308, + "epoch": 0.6535205848403232, + "flos": 1557744638976.0, + "grad_norm": 0.020137853963587818, + "language_loss": 0.81761813, + "learning_rate": 0.00028313833099918265, + "loss": 0.82776797, + "num_input_tokens_seen": 282283984, + "router_z_loss_mlp": 0.07617188, + "step": 3397, + "time_per_iteration": 4.857236862182617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073644, + "balance_loss_mlp": 1.04739439, + "epoch": 0.6537129665255867, + "flos": 493711593984.0, + "grad_norm": 0.05698390619804648, + "language_loss": 0.78328836, + "learning_rate": 0.00028285765913198604, + "loss": 0.79402483, + "num_input_tokens_seen": 282353008, + "router_z_loss_mlp": 0.26269531, + "step": 3398, + "time_per_iteration": 2.542471408843994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076746, + "balance_loss_mlp": 1.05030537, + "epoch": 0.6539053482108503, + "flos": 605002328064.0, + "grad_norm": 0.05420440820194427, + "language_loss": 0.821926, + "learning_rate": 0.0002825770715669227, + "loss": 0.83269352, + "num_input_tokens_seen": 282427648, + "router_z_loss_mlp": 0.26489258, + "step": 3399, + "time_per_iteration": 2.718555450439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106936, + "balance_loss_mlp": 1.04285991, + "epoch": 0.6540977298961139, + "flos": 577778821632.0, + "grad_norm": 0.06072932855544304, + "language_loss": 0.81462443, + "learning_rate": 0.00028229656841292634, + "loss": 0.82531804, + "num_input_tokens_seen": 282502128, + "router_z_loss_mlp": 0.26525879, + "step": 3400, + "time_per_iteration": 2.6755053997039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074211, + "balance_loss_mlp": 1.04766357, + "epoch": 0.6542901115813774, + "flos": 511753531392.0, + "grad_norm": 0.06986785605378391, + "language_loss": 0.762591, + "learning_rate": 0.0002820161497788979, + "loss": 0.77333307, + "num_input_tokens_seen": 282569360, + "router_z_loss_mlp": 0.265625, + "step": 3401, + "time_per_iteration": 2.56740140914917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076961, + "balance_loss_mlp": 1.05193925, + "epoch": 0.654482493266641, + "flos": 625495302144.0, + "grad_norm": 0.05855571008796804, + "language_loss": 0.87057543, + "learning_rate": 0.00028173581577370545, + "loss": 0.88134497, + "num_input_tokens_seen": 282645472, + "router_z_loss_mlp": 0.25036621, + "step": 3402, + "time_per_iteration": 2.7579104900360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074514, + "balance_loss_mlp": 1.04957581, + "epoch": 0.6546748749519046, + "flos": 523981550592.0, + "grad_norm": 0.05140393354142716, + "language_loss": 0.79220372, + "learning_rate": 0.0002814555665061844, + "loss": 0.80294883, + "num_input_tokens_seen": 282717568, + "router_z_loss_mlp": 0.24938965, + "step": 3403, + "time_per_iteration": 2.7005770206451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078214, + "balance_loss_mlp": 1.05273879, + "epoch": 0.6548672566371682, + "flos": 479210204160.0, + "grad_norm": 0.06470448826772422, + "language_loss": 0.77704948, + "learning_rate": 0.00028117540208513715, + "loss": 0.7878316, + "num_input_tokens_seen": 282791408, + "router_z_loss_mlp": 0.25476074, + "step": 3404, + "time_per_iteration": 2.6598384380340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078927, + "balance_loss_mlp": 1.05403566, + "epoch": 0.6550596383224317, + "flos": 616012356096.0, + "grad_norm": 0.06510521460794984, + "language_loss": 0.84932673, + "learning_rate": 0.00028089532261933313, + "loss": 0.860116, + "num_input_tokens_seen": 282862992, + "router_z_loss_mlp": 0.24890137, + "step": 3405, + "time_per_iteration": 2.693470001220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107722, + "balance_loss_mlp": 1.05179238, + "epoch": 0.6552520200076952, + "flos": 488836684800.0, + "grad_norm": 0.06574306959894075, + "language_loss": 0.85646415, + "learning_rate": 0.0002806153282175087, + "loss": 0.86723638, + "num_input_tokens_seen": 282930448, + "router_z_loss_mlp": 0.25439453, + "step": 3406, + "time_per_iteration": 2.5597920417785645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079604, + "balance_loss_mlp": 1.05415273, + "epoch": 0.6554444016929588, + "flos": 687619196928.0, + "grad_norm": 0.06979692390704297, + "language_loss": 0.83091819, + "learning_rate": 0.0002803354189883679, + "loss": 0.84171414, + "num_input_tokens_seen": 283010864, + "router_z_loss_mlp": 0.2545166, + "step": 3407, + "time_per_iteration": 2.8204212188720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078804, + "balance_loss_mlp": 1.05349612, + "epoch": 0.6556367833782224, + "flos": 543051330048.0, + "grad_norm": 0.05468628056475838, + "language_loss": 0.85987842, + "learning_rate": 0.00028005559504058053, + "loss": 0.8706665, + "num_input_tokens_seen": 283082240, + "router_z_loss_mlp": 0.2532959, + "step": 3408, + "time_per_iteration": 2.693559408187866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076206, + "balance_loss_mlp": 1.05038548, + "epoch": 0.655829165063486, + "flos": 673535554560.0, + "grad_norm": 0.07417771883494789, + "language_loss": 0.7684713, + "learning_rate": 0.0002797758564827838, + "loss": 0.77923334, + "num_input_tokens_seen": 283156656, + "router_z_loss_mlp": 0.25842285, + "step": 3409, + "time_per_iteration": 2.802828788757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084728, + "balance_loss_mlp": 1.05920529, + "epoch": 0.6560215467487496, + "flos": 531806736384.0, + "grad_norm": 0.06335346821862926, + "language_loss": 0.83560646, + "learning_rate": 0.0002794962034235824, + "loss": 0.84645367, + "num_input_tokens_seen": 283223584, + "router_z_loss_mlp": 0.25537109, + "step": 3410, + "time_per_iteration": 2.6147637367248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108148, + "balance_loss_mlp": 1.05519438, + "epoch": 0.656213928434013, + "flos": 591311467008.0, + "grad_norm": 0.06069626440640027, + "language_loss": 0.74793261, + "learning_rate": 0.00027921663597154695, + "loss": 0.7587474, + "num_input_tokens_seen": 283297680, + "router_z_loss_mlp": 0.26281738, + "step": 3411, + "time_per_iteration": 2.7347841262817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081407, + "balance_loss_mlp": 1.05633736, + "epoch": 0.6564063101192766, + "flos": 415786825728.0, + "grad_norm": 0.07186540610549816, + "language_loss": 0.81030178, + "learning_rate": 0.00027893715423521525, + "loss": 0.82111579, + "num_input_tokens_seen": 283359744, + "router_z_loss_mlp": 0.25085449, + "step": 3412, + "time_per_iteration": 2.4426064491271973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090629, + "balance_loss_mlp": 1.06429613, + "epoch": 0.6565986918045402, + "flos": 453321059328.0, + "grad_norm": 0.057164257181416274, + "language_loss": 0.83953196, + "learning_rate": 0.00027865775832309163, + "loss": 0.85043824, + "num_input_tokens_seen": 283430688, + "router_z_loss_mlp": 0.26379395, + "step": 3413, + "time_per_iteration": 2.661008358001709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089453, + "balance_loss_mlp": 1.06320286, + "epoch": 0.6567910734898038, + "flos": 547746001920.0, + "grad_norm": 0.059355909745470246, + "language_loss": 0.86547339, + "learning_rate": 0.00027837844834364733, + "loss": 0.87636793, + "num_input_tokens_seen": 283498048, + "router_z_loss_mlp": 0.26269531, + "step": 3414, + "time_per_iteration": 2.6107146739959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108986, + "balance_loss_mlp": 1.06451583, + "epoch": 0.6569834551750673, + "flos": 655518210048.0, + "grad_norm": 0.058864717061538036, + "language_loss": 0.86578488, + "learning_rate": 0.00027809922440532, + "loss": 0.87668347, + "num_input_tokens_seen": 283573040, + "router_z_loss_mlp": 0.25366211, + "step": 3415, + "time_per_iteration": 2.8214099407196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085792, + "balance_loss_mlp": 1.05929208, + "epoch": 0.6571758368603309, + "flos": 539681107968.0, + "grad_norm": 0.06707421916858435, + "language_loss": 0.80825239, + "learning_rate": 0.00027782008661651406, + "loss": 0.81911027, + "num_input_tokens_seen": 283651696, + "router_z_loss_mlp": 0.26513672, + "step": 3416, + "time_per_iteration": 2.772441864013672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087765, + "balance_loss_mlp": 1.06200361, + "epoch": 0.6573682185455945, + "flos": 497346117120.0, + "grad_norm": 0.054600094461814935, + "language_loss": 0.87535822, + "learning_rate": 0.00027754103508560013, + "loss": 0.88623583, + "num_input_tokens_seen": 283721824, + "router_z_loss_mlp": 0.25769043, + "step": 3417, + "time_per_iteration": 2.5883491039276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108732, + "balance_loss_mlp": 1.06016374, + "epoch": 0.657560600230858, + "flos": 447465295872.0, + "grad_norm": 0.057346286211937464, + "language_loss": 0.83059859, + "learning_rate": 0.0002772620699209163, + "loss": 0.84147179, + "num_input_tokens_seen": 283786960, + "router_z_loss_mlp": 0.27197266, + "step": 3418, + "time_per_iteration": 2.560173988342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080511, + "balance_loss_mlp": 1.05552435, + "epoch": 0.6577529819161216, + "flos": 481940596224.0, + "grad_norm": 0.07342594011001312, + "language_loss": 0.80011356, + "learning_rate": 0.0002769831912307658, + "loss": 0.81091869, + "num_input_tokens_seen": 283853808, + "router_z_loss_mlp": 0.24987793, + "step": 3419, + "time_per_iteration": 2.5090081691741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077943, + "balance_loss_mlp": 1.05116832, + "epoch": 0.6579453636013851, + "flos": 530843134464.0, + "grad_norm": 0.15397597060543888, + "language_loss": 0.80397606, + "learning_rate": 0.00027670439912341917, + "loss": 0.81475556, + "num_input_tokens_seen": 283920960, + "router_z_loss_mlp": 0.26782227, + "step": 3420, + "time_per_iteration": 2.6002025604248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107857, + "balance_loss_mlp": 1.05198634, + "epoch": 0.6581377452866487, + "flos": 628037743104.0, + "grad_norm": 0.05399899267227409, + "language_loss": 0.83793807, + "learning_rate": 0.0002764256937071129, + "loss": 0.84872377, + "num_input_tokens_seen": 283992416, + "router_z_loss_mlp": 0.26611328, + "step": 3421, + "time_per_iteration": 2.7873942852020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074524, + "balance_loss_mlp": 1.04920375, + "epoch": 0.6583301269719123, + "flos": 548618199552.0, + "grad_norm": 0.0598445160882451, + "language_loss": 0.87503046, + "learning_rate": 0.00027614707509005036, + "loss": 0.88577569, + "num_input_tokens_seen": 284061760, + "router_z_loss_mlp": 0.25341797, + "step": 3422, + "time_per_iteration": 2.659196615219116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079222, + "balance_loss_mlp": 1.05353248, + "epoch": 0.6585225086571759, + "flos": 427493583360.0, + "grad_norm": 0.05796849455801806, + "language_loss": 0.79051846, + "learning_rate": 0.0002758685433804008, + "loss": 0.80131066, + "num_input_tokens_seen": 284124848, + "router_z_loss_mlp": 0.25695801, + "step": 3423, + "time_per_iteration": 2.5024282932281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074298, + "balance_loss_mlp": 1.04835773, + "epoch": 0.6587148903424394, + "flos": 859620542976.0, + "grad_norm": 0.06008115307148776, + "language_loss": 0.79408616, + "learning_rate": 0.00027559009868630005, + "loss": 0.80482912, + "num_input_tokens_seen": 284206272, + "router_z_loss_mlp": 0.25964355, + "step": 3424, + "time_per_iteration": 3.0929386615753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073444, + "balance_loss_mlp": 1.0477066, + "epoch": 0.6589072720277029, + "flos": 805630551552.0, + "grad_norm": 0.05902981727550509, + "language_loss": 0.80511308, + "learning_rate": 0.0002753117411158491, + "loss": 0.81584746, + "num_input_tokens_seen": 284293696, + "router_z_loss_mlp": 0.25744629, + "step": 3425, + "time_per_iteration": 3.0452723503112793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083204, + "balance_loss_mlp": 1.05611944, + "epoch": 0.6590996537129665, + "flos": 548618199552.0, + "grad_norm": 0.053958914804285704, + "language_loss": 0.8972351, + "learning_rate": 0.0002750334707771168, + "loss": 0.90806711, + "num_input_tokens_seen": 284360192, + "router_z_loss_mlp": 0.27124023, + "step": 3426, + "time_per_iteration": 2.626776695251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108887, + "balance_loss_mlp": 1.06247699, + "epoch": 0.6592920353982301, + "flos": 454166092800.0, + "grad_norm": 0.06696403596262077, + "language_loss": 0.81474262, + "learning_rate": 0.0002747552877781369, + "loss": 0.82563138, + "num_input_tokens_seen": 284423680, + "router_z_loss_mlp": 0.26367188, + "step": 3427, + "time_per_iteration": 2.49870228767395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082041, + "balance_loss_mlp": 1.05622029, + "epoch": 0.6594844170834937, + "flos": 567174057984.0, + "grad_norm": 0.056641096462852314, + "language_loss": 0.82350707, + "learning_rate": 0.0002744771922269097, + "loss": 0.83432746, + "num_input_tokens_seen": 284495712, + "router_z_loss_mlp": 0.25805664, + "step": 3428, + "time_per_iteration": 2.76737117767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083165, + "balance_loss_mlp": 1.05777287, + "epoch": 0.6596767987687572, + "flos": 1187911194624.0, + "grad_norm": 0.05792922212348718, + "language_loss": 0.82232559, + "learning_rate": 0.0002741991842314015, + "loss": 0.83315718, + "num_input_tokens_seen": 284583440, + "router_z_loss_mlp": 0.25415039, + "step": 3429, + "time_per_iteration": 3.4959795475006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082387, + "balance_loss_mlp": 1.05617321, + "epoch": 0.6598691804540208, + "flos": 503491147776.0, + "grad_norm": 0.05913775342689391, + "language_loss": 0.86208242, + "learning_rate": 0.0002739212638995445, + "loss": 0.87290633, + "num_input_tokens_seen": 284649168, + "router_z_loss_mlp": 0.26220703, + "step": 3430, + "time_per_iteration": 2.552647113800049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091567, + "balance_loss_mlp": 1.06441104, + "epoch": 0.6600615621392844, + "flos": 531337231872.0, + "grad_norm": 0.06592703083279383, + "language_loss": 0.83386678, + "learning_rate": 0.00027364343133923696, + "loss": 0.84478247, + "num_input_tokens_seen": 284723136, + "router_z_loss_mlp": 0.27172852, + "step": 3431, + "time_per_iteration": 2.639110565185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080701, + "balance_loss_mlp": 1.05480886, + "epoch": 0.6602539438245479, + "flos": 565446915072.0, + "grad_norm": 0.06195217340834915, + "language_loss": 0.8308382, + "learning_rate": 0.0002733656866583431, + "loss": 0.84164518, + "num_input_tokens_seen": 284792752, + "router_z_loss_mlp": 0.25927734, + "step": 3432, + "time_per_iteration": 2.6898815631866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091031, + "balance_loss_mlp": 1.0637325, + "epoch": 0.6604463255098114, + "flos": 857159594496.0, + "grad_norm": 0.07646297806496907, + "language_loss": 0.83208609, + "learning_rate": 0.0002730880299646927, + "loss": 0.84299648, + "num_input_tokens_seen": 284871008, + "router_z_loss_mlp": 0.27307129, + "step": 3433, + "time_per_iteration": 3.0324153900146484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086317, + "balance_loss_mlp": 1.06028199, + "epoch": 0.660638707195075, + "flos": 674462080512.0, + "grad_norm": 0.09642118703773885, + "language_loss": 0.85385412, + "learning_rate": 0.0002728104613660821, + "loss": 0.8647173, + "num_input_tokens_seen": 284945184, + "router_z_loss_mlp": 0.26074219, + "step": 3434, + "time_per_iteration": 2.8242013454437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082794, + "balance_loss_mlp": 1.0578196, + "epoch": 0.6608310888803386, + "flos": 888961402368.0, + "grad_norm": 0.06046346369252319, + "language_loss": 0.83065814, + "learning_rate": 0.0002725329809702729, + "loss": 0.8414861, + "num_input_tokens_seen": 285029296, + "router_z_loss_mlp": 0.25012207, + "step": 3435, + "time_per_iteration": 3.208373546600342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086877, + "balance_loss_mlp": 1.06015027, + "epoch": 0.6610234705656022, + "flos": 1136347646976.0, + "grad_norm": 0.06729202687842574, + "language_loss": 0.76439357, + "learning_rate": 0.0002722555888849921, + "loss": 0.77526236, + "num_input_tokens_seen": 285124720, + "router_z_loss_mlp": 0.26757812, + "step": 3436, + "time_per_iteration": 3.455219030380249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108327, + "balance_loss_mlp": 1.05748534, + "epoch": 0.6612158522508658, + "flos": 468012598272.0, + "grad_norm": 0.06326694519745679, + "language_loss": 0.80694687, + "learning_rate": 0.00027197828521793334, + "loss": 0.8177796, + "num_input_tokens_seen": 285191360, + "router_z_loss_mlp": 0.25793457, + "step": 3437, + "time_per_iteration": 2.500117301940918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086414, + "balance_loss_mlp": 1.06089163, + "epoch": 0.6614082339361292, + "flos": 571653614592.0, + "grad_norm": 0.06352548474841713, + "language_loss": 0.84948212, + "learning_rate": 0.0002717010700767552, + "loss": 0.86034626, + "num_input_tokens_seen": 285262624, + "router_z_loss_mlp": 0.25549316, + "step": 3438, + "time_per_iteration": 2.7301025390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088348, + "balance_loss_mlp": 1.06205106, + "epoch": 0.6616006156213928, + "flos": 498467934720.0, + "grad_norm": 0.06533223637533662, + "language_loss": 0.75988388, + "learning_rate": 0.00027142394356908226, + "loss": 0.77076733, + "num_input_tokens_seen": 285328512, + "router_z_loss_mlp": 0.26318359, + "step": 3439, + "time_per_iteration": 2.5677285194396973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086811, + "balance_loss_mlp": 1.06116903, + "epoch": 0.6617929973066564, + "flos": 602420239872.0, + "grad_norm": 0.0569940621311471, + "language_loss": 0.85089839, + "learning_rate": 0.00027114690580250456, + "loss": 0.86176658, + "num_input_tokens_seen": 285406128, + "router_z_loss_mlp": 0.25646973, + "step": 3440, + "time_per_iteration": 2.738121509552002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093124, + "balance_loss_mlp": 1.06724405, + "epoch": 0.66198537899192, + "flos": 522983443968.0, + "grad_norm": 0.05472871432656112, + "language_loss": 0.86912161, + "learning_rate": 0.0002708699568845776, + "loss": 0.88005286, + "num_input_tokens_seen": 285474704, + "router_z_loss_mlp": 0.25891113, + "step": 3441, + "time_per_iteration": 2.611889600753784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041326, + "balance_loss_mlp": 1.03374481, + "epoch": 0.6621777606771835, + "flos": 1566256642560.0, + "grad_norm": 0.021890830835033067, + "language_loss": 0.79287779, + "learning_rate": 0.00027059309692282265, + "loss": 0.80329108, + "num_input_tokens_seen": 285698704, + "router_z_loss_mlp": 0.07568359, + "step": 3442, + "time_per_iteration": 4.8971052169799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090068, + "balance_loss_mlp": 1.06495047, + "epoch": 0.6623701423624471, + "flos": 526664954880.0, + "grad_norm": 0.064050238945667, + "language_loss": 0.83170366, + "learning_rate": 0.0002703163260247261, + "loss": 0.8426044, + "num_input_tokens_seen": 285767936, + "router_z_loss_mlp": 0.25134277, + "step": 3443, + "time_per_iteration": 2.5994081497192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109146, + "balance_loss_mlp": 1.06699824, + "epoch": 0.6625625240477107, + "flos": 528179553792.0, + "grad_norm": 0.06534456788919288, + "language_loss": 0.81938642, + "learning_rate": 0.0002700396442977399, + "loss": 0.83030105, + "num_input_tokens_seen": 285839456, + "router_z_loss_mlp": 0.24462891, + "step": 3444, + "time_per_iteration": 2.6017937660217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091879, + "balance_loss_mlp": 1.06627333, + "epoch": 0.6627549057329742, + "flos": 473122073088.0, + "grad_norm": 0.06451262067496133, + "language_loss": 0.84422678, + "learning_rate": 0.0002697630518492817, + "loss": 0.85514563, + "num_input_tokens_seen": 285905904, + "router_z_loss_mlp": 0.25634766, + "step": 3445, + "time_per_iteration": 2.628159523010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094697, + "balance_loss_mlp": 1.06956816, + "epoch": 0.6629472874182378, + "flos": 527996745216.0, + "grad_norm": 0.05416253097531709, + "language_loss": 0.85508287, + "learning_rate": 0.0002694865487867343, + "loss": 0.8660298, + "num_input_tokens_seen": 285975520, + "router_z_loss_mlp": 0.25134277, + "step": 3446, + "time_per_iteration": 2.604813814163208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088071, + "balance_loss_mlp": 1.06316853, + "epoch": 0.6631396691035013, + "flos": 613200471552.0, + "grad_norm": 0.052847331110623744, + "language_loss": 0.84946668, + "learning_rate": 0.0002692101352174453, + "loss": 0.86034739, + "num_input_tokens_seen": 286050320, + "router_z_loss_mlp": 0.24914551, + "step": 3447, + "time_per_iteration": 2.768223285675049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109622, + "balance_loss_mlp": 1.06981492, + "epoch": 0.6633320507887649, + "flos": 609318899712.0, + "grad_norm": 0.058874726069321814, + "language_loss": 0.8497262, + "learning_rate": 0.00026893381124872787, + "loss": 0.86068839, + "num_input_tokens_seen": 286120672, + "router_z_loss_mlp": 0.26452637, + "step": 3448, + "time_per_iteration": 2.6762025356292725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090166, + "balance_loss_mlp": 1.06560886, + "epoch": 0.6635244324740285, + "flos": 749700873216.0, + "grad_norm": 0.057817999010546496, + "language_loss": 0.80621779, + "learning_rate": 0.00026865757698786097, + "loss": 0.81711942, + "num_input_tokens_seen": 286201152, + "router_z_loss_mlp": 0.24584961, + "step": 3449, + "time_per_iteration": 3.0353593826293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088537, + "balance_loss_mlp": 1.06256163, + "epoch": 0.6637168141592921, + "flos": 664526882304.0, + "grad_norm": 0.06325061387502293, + "language_loss": 0.81828356, + "learning_rate": 0.000268381432542088, + "loss": 0.82916903, + "num_input_tokens_seen": 286274512, + "router_z_loss_mlp": 0.26000977, + "step": 3450, + "time_per_iteration": 2.8381845951080322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085731, + "balance_loss_mlp": 1.05967212, + "epoch": 0.6639091958445555, + "flos": 606783799296.0, + "grad_norm": 0.06107082028806233, + "language_loss": 0.80140352, + "learning_rate": 0.00026810537801861807, + "loss": 0.81226087, + "num_input_tokens_seen": 286349808, + "router_z_loss_mlp": 0.26074219, + "step": 3451, + "time_per_iteration": 2.755697727203369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091691, + "balance_loss_mlp": 1.06455863, + "epoch": 0.6641015775298191, + "flos": 476697498624.0, + "grad_norm": 0.05182534623872074, + "language_loss": 0.8148368, + "learning_rate": 0.0002678294135246243, + "loss": 0.82575375, + "num_input_tokens_seen": 286422912, + "router_z_loss_mlp": 0.27087402, + "step": 3452, + "time_per_iteration": 2.7235701084136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077401, + "balance_loss_mlp": 1.05224776, + "epoch": 0.6642939592150827, + "flos": 904115105280.0, + "grad_norm": 0.07490988727173932, + "language_loss": 0.86671561, + "learning_rate": 0.0002675535391672463, + "loss": 0.87748969, + "num_input_tokens_seen": 286501072, + "router_z_loss_mlp": 0.25170898, + "step": 3453, + "time_per_iteration": 3.0891692638397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080209, + "balance_loss_mlp": 1.05430508, + "epoch": 0.6644863409003463, + "flos": 581808697344.0, + "grad_norm": 0.05695144440492774, + "language_loss": 0.86551011, + "learning_rate": 0.0002672777550535877, + "loss": 0.8763122, + "num_input_tokens_seen": 286580480, + "router_z_loss_mlp": 0.25939941, + "step": 3454, + "time_per_iteration": 2.7647364139556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078097, + "balance_loss_mlp": 1.05288386, + "epoch": 0.6646787225856099, + "flos": 479002802688.0, + "grad_norm": 0.06003914399103326, + "language_loss": 0.85505843, + "learning_rate": 0.00026700206129071747, + "loss": 0.86583936, + "num_input_tokens_seen": 286646208, + "router_z_loss_mlp": 0.25231934, + "step": 3455, + "time_per_iteration": 2.5821306705474854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078808, + "balance_loss_mlp": 1.05316663, + "epoch": 0.6648711042708734, + "flos": 449906420736.0, + "grad_norm": 0.06471391174754697, + "language_loss": 0.88815629, + "learning_rate": 0.00026672645798566925, + "loss": 0.89894438, + "num_input_tokens_seen": 286710624, + "router_z_loss_mlp": 0.25671387, + "step": 3456, + "time_per_iteration": 2.536905288696289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073004, + "balance_loss_mlp": 1.04708791, + "epoch": 0.665063485956137, + "flos": 858960516096.0, + "grad_norm": 0.06322098786419635, + "language_loss": 0.79450369, + "learning_rate": 0.00026645094524544225, + "loss": 0.80523372, + "num_input_tokens_seen": 286799472, + "router_z_loss_mlp": 0.25927734, + "step": 3457, + "time_per_iteration": 3.346942663192749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080341, + "balance_loss_mlp": 1.05416238, + "epoch": 0.6652558676414005, + "flos": 604312939008.0, + "grad_norm": 0.07380509782774128, + "language_loss": 0.75270724, + "learning_rate": 0.00026617552317699945, + "loss": 0.76351058, + "num_input_tokens_seen": 286874752, + "router_z_loss_mlp": 0.26220703, + "step": 3458, + "time_per_iteration": 2.8174753189086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075769, + "balance_loss_mlp": 1.05087817, + "epoch": 0.6654482493266641, + "flos": 510394576896.0, + "grad_norm": 0.06466167118068906, + "language_loss": 0.87317026, + "learning_rate": 0.0002659001918872693, + "loss": 0.88392794, + "num_input_tokens_seen": 286943312, + "router_z_loss_mlp": 0.24890137, + "step": 3459, + "time_per_iteration": 2.620330810546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078289, + "balance_loss_mlp": 1.0529331, + "epoch": 0.6656406310119277, + "flos": 565605130752.0, + "grad_norm": 0.06328415655418428, + "language_loss": 0.81001127, + "learning_rate": 0.0002656249514831449, + "loss": 0.82079417, + "num_input_tokens_seen": 287010000, + "router_z_loss_mlp": 0.25378418, + "step": 3460, + "time_per_iteration": 2.6549599170684814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079023, + "balance_loss_mlp": 1.05377483, + "epoch": 0.6658330126971912, + "flos": 1024298141184.0, + "grad_norm": 0.054463111692168976, + "language_loss": 0.86972237, + "learning_rate": 0.00026534980207148416, + "loss": 0.8805126, + "num_input_tokens_seen": 287101456, + "router_z_loss_mlp": 0.25256348, + "step": 3461, + "time_per_iteration": 3.424241065979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086135, + "balance_loss_mlp": 1.05996895, + "epoch": 0.6660253943824548, + "flos": 816823388160.0, + "grad_norm": 0.06786500256083805, + "language_loss": 0.7389307, + "learning_rate": 0.0002650747437591097, + "loss": 0.7497921, + "num_input_tokens_seen": 287182848, + "router_z_loss_mlp": 0.26208496, + "step": 3462, + "time_per_iteration": 3.037792921066284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020491, + "balance_loss_mlp": 1.01310015, + "epoch": 0.6662177760677184, + "flos": 1496169169920.0, + "grad_norm": 0.010691660665593496, + "language_loss": 0.8187958, + "learning_rate": 0.00026479977665280806, + "loss": 0.82900071, + "num_input_tokens_seen": 287417920, + "router_z_loss_mlp": 0.07373047, + "step": 3463, + "time_per_iteration": 5.019932985305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077653, + "balance_loss_mlp": 1.05172443, + "epoch": 0.666410157752982, + "flos": 500120925696.0, + "grad_norm": 0.0677151355970307, + "language_loss": 0.86401796, + "learning_rate": 0.00026452490085933155, + "loss": 0.87479448, + "num_input_tokens_seen": 287483776, + "router_z_loss_mlp": 0.25952148, + "step": 3464, + "time_per_iteration": 2.577608346939087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010794, + "balance_loss_mlp": 1.05381727, + "epoch": 0.6666025394382454, + "flos": 481169714688.0, + "grad_norm": 0.06950705493870243, + "language_loss": 0.90135396, + "learning_rate": 0.00026425011648539614, + "loss": 0.91214788, + "num_input_tokens_seen": 287548176, + "router_z_loss_mlp": 0.25622559, + "step": 3465, + "time_per_iteration": 2.5207860469818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078468, + "balance_loss_mlp": 1.0527184, + "epoch": 0.666794921123509, + "flos": 546653919744.0, + "grad_norm": 0.06360289256438866, + "language_loss": 0.83105028, + "learning_rate": 0.00026397542363768267, + "loss": 0.84183496, + "num_input_tokens_seen": 287618496, + "router_z_loss_mlp": 0.25769043, + "step": 3466, + "time_per_iteration": 2.662781238555908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081527, + "balance_loss_mlp": 1.05476463, + "epoch": 0.6669873028087726, + "flos": 471988145664.0, + "grad_norm": 0.11778132677194894, + "language_loss": 0.8209849, + "learning_rate": 0.0002637008224228362, + "loss": 0.83180016, + "num_input_tokens_seen": 287684032, + "router_z_loss_mlp": 0.26794434, + "step": 3467, + "time_per_iteration": 2.5543577671051025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084486, + "balance_loss_mlp": 1.05868888, + "epoch": 0.6671796844940362, + "flos": 547395065856.0, + "grad_norm": 0.04775421920110858, + "language_loss": 0.8469578, + "learning_rate": 0.00026342631294746653, + "loss": 0.85780263, + "num_input_tokens_seen": 287757680, + "router_z_loss_mlp": 0.25842285, + "step": 3468, + "time_per_iteration": 2.7040185928344727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086094, + "balance_loss_mlp": 1.06041682, + "epoch": 0.6673720661792998, + "flos": 1070317214208.0, + "grad_norm": 0.049080807880720057, + "language_loss": 0.81080979, + "learning_rate": 0.0002631518953181476, + "loss": 0.82167077, + "num_input_tokens_seen": 287848992, + "router_z_loss_mlp": 0.25671387, + "step": 3469, + "time_per_iteration": 3.493414878845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011784, + "balance_loss_mlp": 1.00391626, + "epoch": 0.6675644478645633, + "flos": 1523790600192.0, + "grad_norm": 0.010939757170187329, + "language_loss": 0.76325285, + "learning_rate": 0.000262877569641418, + "loss": 0.77337068, + "num_input_tokens_seen": 288085680, + "router_z_loss_mlp": 0.07861328, + "step": 3470, + "time_per_iteration": 4.9387853145599365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087371, + "balance_loss_mlp": 1.06110907, + "epoch": 0.6677568295498268, + "flos": 579696113664.0, + "grad_norm": 0.0606952460981544, + "language_loss": 0.80340272, + "learning_rate": 0.00026260333602377985, + "loss": 0.81427646, + "num_input_tokens_seen": 288161568, + "router_z_loss_mlp": 0.26281738, + "step": 3471, + "time_per_iteration": 2.838916063308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109892, + "balance_loss_mlp": 1.0729208, + "epoch": 0.6679492112350904, + "flos": 383935458816.0, + "grad_norm": 0.06496239585891986, + "language_loss": 0.87351251, + "learning_rate": 0.0002623291945717007, + "loss": 0.88450176, + "num_input_tokens_seen": 288224032, + "router_z_loss_mlp": 0.26000977, + "step": 3472, + "time_per_iteration": 2.4870412349700928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097292, + "balance_loss_mlp": 1.07054186, + "epoch": 0.668141592920354, + "flos": 1150759830528.0, + "grad_norm": 0.04982364311813806, + "language_loss": 0.84127951, + "learning_rate": 0.00026205514539161175, + "loss": 0.85225236, + "num_input_tokens_seen": 288312912, + "router_z_loss_mlp": 0.26782227, + "step": 3473, + "time_per_iteration": 3.565732479095459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102422, + "balance_loss_mlp": 1.07651806, + "epoch": 0.6683339746056175, + "flos": 561100608000.0, + "grad_norm": 0.06841158179572154, + "language_loss": 0.84113353, + "learning_rate": 0.00026178118858990773, + "loss": 0.85215771, + "num_input_tokens_seen": 288394224, + "router_z_loss_mlp": 0.2590332, + "step": 3474, + "time_per_iteration": 2.8573057651519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087697, + "balance_loss_mlp": 1.0619719, + "epoch": 0.6685263562908811, + "flos": 514305884160.0, + "grad_norm": 0.07905158602596217, + "language_loss": 0.84220064, + "learning_rate": 0.0002615073242729483, + "loss": 0.85307765, + "num_input_tokens_seen": 288462976, + "router_z_loss_mlp": 0.25732422, + "step": 3475, + "time_per_iteration": 2.6173481941223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090195, + "balance_loss_mlp": 1.06363511, + "epoch": 0.6687187379761447, + "flos": 629772226560.0, + "grad_norm": 0.04794889281343623, + "language_loss": 0.84776723, + "learning_rate": 0.0002612335525470573, + "loss": 0.85866916, + "num_input_tokens_seen": 288542032, + "router_z_loss_mlp": 0.26586914, + "step": 3476, + "time_per_iteration": 2.819981575012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108965, + "balance_loss_mlp": 1.06361461, + "epoch": 0.6689111196614083, + "flos": 535586992128.0, + "grad_norm": 0.06414606112589924, + "language_loss": 0.7840637, + "learning_rate": 0.0002609598735185221, + "loss": 0.79496014, + "num_input_tokens_seen": 288610704, + "router_z_loss_mlp": 0.26062012, + "step": 3477, + "time_per_iteration": 2.6392619609832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085761, + "balance_loss_mlp": 1.0593915, + "epoch": 0.6691035013466718, + "flos": 603038048256.0, + "grad_norm": 0.054041595090679226, + "language_loss": 0.83408946, + "learning_rate": 0.00026068628729359445, + "loss": 0.8449471, + "num_input_tokens_seen": 288686080, + "router_z_loss_mlp": 0.26379395, + "step": 3478, + "time_per_iteration": 2.766197919845581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108263, + "balance_loss_mlp": 1.05621278, + "epoch": 0.6692958830319353, + "flos": 632855752704.0, + "grad_norm": 0.059772967228533376, + "language_loss": 0.76451987, + "learning_rate": 0.00026041279397848996, + "loss": 0.77534616, + "num_input_tokens_seen": 288764944, + "router_z_loss_mlp": 0.2644043, + "step": 3479, + "time_per_iteration": 2.8584389686584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077924, + "balance_loss_mlp": 1.05261552, + "epoch": 0.6694882647171989, + "flos": 645471783936.0, + "grad_norm": 0.051702403588613846, + "language_loss": 0.82616276, + "learning_rate": 0.00026013939367938797, + "loss": 0.83694196, + "num_input_tokens_seen": 288847856, + "router_z_loss_mlp": 0.25317383, + "step": 3480, + "time_per_iteration": 2.891376495361328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074221, + "balance_loss_mlp": 1.04828119, + "epoch": 0.6696806464024625, + "flos": 569585447424.0, + "grad_norm": 0.05419828241435922, + "language_loss": 0.81335235, + "learning_rate": 0.00025986608650243204, + "loss": 0.82409453, + "num_input_tokens_seen": 288929360, + "router_z_loss_mlp": 0.25952148, + "step": 3481, + "time_per_iteration": 2.77876353263855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073761, + "balance_loss_mlp": 1.04680765, + "epoch": 0.6698730280877261, + "flos": 622700669952.0, + "grad_norm": 0.051697162904794, + "language_loss": 0.79773414, + "learning_rate": 0.0002595928725537293, + "loss": 0.8084718, + "num_input_tokens_seen": 289010160, + "router_z_loss_mlp": 0.26965332, + "step": 3482, + "time_per_iteration": 2.8413639068603516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073841, + "balance_loss_mlp": 1.04836571, + "epoch": 0.6700654097729896, + "flos": 502507722240.0, + "grad_norm": 0.05767199414491062, + "language_loss": 0.88867986, + "learning_rate": 0.0002593197519393509, + "loss": 0.89941823, + "num_input_tokens_seen": 289077392, + "router_z_loss_mlp": 0.25500488, + "step": 3483, + "time_per_iteration": 2.603405475616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069253, + "balance_loss_mlp": 1.04446936, + "epoch": 0.6702577914582531, + "flos": 623876815872.0, + "grad_norm": 0.06697980614257329, + "language_loss": 0.79532218, + "learning_rate": 0.00025904672476533165, + "loss": 0.80601466, + "num_input_tokens_seen": 289157248, + "router_z_loss_mlp": 0.2479248, + "step": 3484, + "time_per_iteration": 2.84698224067688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070985, + "balance_loss_mlp": 1.04459202, + "epoch": 0.6704501731435167, + "flos": 456268764672.0, + "grad_norm": 0.05331322450394034, + "language_loss": 0.82924032, + "learning_rate": 0.0002587737911376704, + "loss": 0.83995014, + "num_input_tokens_seen": 289224864, + "router_z_loss_mlp": 0.26416016, + "step": 3485, + "time_per_iteration": 2.585921049118042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074192, + "balance_loss_mlp": 1.04729843, + "epoch": 0.6706425548287803, + "flos": 543229369344.0, + "grad_norm": 0.06756987561009595, + "language_loss": 0.84183806, + "learning_rate": 0.00025850095116232885, + "loss": 0.85257995, + "num_input_tokens_seen": 289293488, + "router_z_loss_mlp": 0.26953125, + "step": 3486, + "time_per_iteration": 2.7065019607543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075208, + "balance_loss_mlp": 1.04840994, + "epoch": 0.6708349365140439, + "flos": 633940494336.0, + "grad_norm": 0.05801175058434062, + "language_loss": 0.77675766, + "learning_rate": 0.000258228204945233, + "loss": 0.7875098, + "num_input_tokens_seen": 289370560, + "router_z_loss_mlp": 0.2677002, + "step": 3487, + "time_per_iteration": 2.8951704502105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071909, + "balance_loss_mlp": 1.04588532, + "epoch": 0.6710273181993074, + "flos": 640747749888.0, + "grad_norm": 0.05899101310847367, + "language_loss": 0.84739226, + "learning_rate": 0.00025795555259227254, + "loss": 0.85811132, + "num_input_tokens_seen": 289440096, + "router_z_loss_mlp": 0.26062012, + "step": 3488, + "time_per_iteration": 2.777141571044922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072765, + "balance_loss_mlp": 1.04677725, + "epoch": 0.671219699884571, + "flos": 553942789632.0, + "grad_norm": 0.0454202058547125, + "language_loss": 0.84104466, + "learning_rate": 0.00025768299420930046, + "loss": 0.85177231, + "num_input_tokens_seen": 289515808, + "router_z_loss_mlp": 0.2598877, + "step": 3489, + "time_per_iteration": 2.720435857772827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073548, + "balance_loss_mlp": 1.04736936, + "epoch": 0.6714120815698346, + "flos": 731508433920.0, + "grad_norm": 0.052981388085366045, + "language_loss": 0.83523858, + "learning_rate": 0.0002574105299021332, + "loss": 0.84597409, + "num_input_tokens_seen": 289591344, + "router_z_loss_mlp": 0.26220703, + "step": 3490, + "time_per_iteration": 2.874335289001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072546, + "balance_loss_mlp": 1.04605818, + "epoch": 0.6716044632550981, + "flos": 688664291328.0, + "grad_norm": 0.05653925915184199, + "language_loss": 0.84515595, + "learning_rate": 0.00025713815977655084, + "loss": 0.85588139, + "num_input_tokens_seen": 289672032, + "router_z_loss_mlp": 0.26501465, + "step": 3491, + "time_per_iteration": 2.857795000076294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107473, + "balance_loss_mlp": 1.04900455, + "epoch": 0.6717968449403616, + "flos": 460629752832.0, + "grad_norm": 0.0648375250519464, + "language_loss": 0.84809422, + "learning_rate": 0.0002568658839382969, + "loss": 0.85884148, + "num_input_tokens_seen": 289738304, + "router_z_loss_mlp": 0.25683594, + "step": 3492, + "time_per_iteration": 2.5480034351348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072741, + "balance_loss_mlp": 1.04731405, + "epoch": 0.6719892266256252, + "flos": 501608360448.0, + "grad_norm": 0.06366513295568171, + "language_loss": 0.84661782, + "learning_rate": 0.00025659370249307814, + "loss": 0.85734528, + "num_input_tokens_seen": 289804304, + "router_z_loss_mlp": 0.25439453, + "step": 3493, + "time_per_iteration": 2.602646589279175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072937, + "balance_loss_mlp": 1.04722357, + "epoch": 0.6721816083108888, + "flos": 683525081088.0, + "grad_norm": 0.05297099433671679, + "language_loss": 0.85274851, + "learning_rate": 0.00025632161554656473, + "loss": 0.86347795, + "num_input_tokens_seen": 289877696, + "router_z_loss_mlp": 0.25732422, + "step": 3494, + "time_per_iteration": 2.867612838745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071204, + "balance_loss_mlp": 1.04509759, + "epoch": 0.6723739899961524, + "flos": 585813980160.0, + "grad_norm": 0.05583877885035688, + "language_loss": 0.81951666, + "learning_rate": 0.00025604962320439017, + "loss": 0.83022875, + "num_input_tokens_seen": 289947296, + "router_z_loss_mlp": 0.26147461, + "step": 3495, + "time_per_iteration": 2.7493131160736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107172, + "balance_loss_mlp": 1.04625738, + "epoch": 0.672566371681416, + "flos": 506616519168.0, + "grad_norm": 0.056464737234764244, + "language_loss": 0.82197464, + "learning_rate": 0.0002557777255721516, + "loss": 0.83269185, + "num_input_tokens_seen": 290020080, + "router_z_loss_mlp": 0.2545166, + "step": 3496, + "time_per_iteration": 2.712113857269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068431, + "balance_loss_mlp": 1.04230046, + "epoch": 0.6727587533666795, + "flos": 535671055872.0, + "grad_norm": 0.0673285818829442, + "language_loss": 0.80758643, + "learning_rate": 0.0002555059227554087, + "loss": 0.8182708, + "num_input_tokens_seen": 290094544, + "router_z_loss_mlp": 0.26171875, + "step": 3497, + "time_per_iteration": 2.6871681213378906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107222, + "balance_loss_mlp": 1.04655433, + "epoch": 0.672951135051943, + "flos": 602832844800.0, + "grad_norm": 0.05408032607546607, + "language_loss": 0.7786265, + "learning_rate": 0.00025523421485968453, + "loss": 0.78934866, + "num_input_tokens_seen": 290173520, + "router_z_loss_mlp": 0.25695801, + "step": 3498, + "time_per_iteration": 2.822655439376831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072016, + "balance_loss_mlp": 1.04613543, + "epoch": 0.6731435167372066, + "flos": 811315989504.0, + "grad_norm": 0.05805760425239871, + "language_loss": 0.85567248, + "learning_rate": 0.00025496260199046585, + "loss": 0.86639267, + "num_input_tokens_seen": 290248240, + "router_z_loss_mlp": 0.25891113, + "step": 3499, + "time_per_iteration": 2.9368207454681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073883, + "balance_loss_mlp": 1.04759765, + "epoch": 0.6733358984224702, + "flos": 611594468352.0, + "grad_norm": 0.05807897060622665, + "language_loss": 0.84593326, + "learning_rate": 0.000254691084253202, + "loss": 0.85667205, + "num_input_tokens_seen": 290326288, + "router_z_loss_mlp": 0.26293945, + "step": 3500, + "time_per_iteration": 2.812175750732422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069637, + "balance_loss_mlp": 1.04343474, + "epoch": 0.6735282801077337, + "flos": 558901762560.0, + "grad_norm": 0.06730887087818041, + "language_loss": 0.77490008, + "learning_rate": 0.00025441966175330567, + "loss": 0.78559649, + "num_input_tokens_seen": 290395984, + "router_z_loss_mlp": 0.2623291, + "step": 3501, + "time_per_iteration": 2.6858127117156982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074737, + "balance_loss_mlp": 1.04904723, + "epoch": 0.6737206617929973, + "flos": 672433560576.0, + "grad_norm": 0.05973627548594562, + "language_loss": 0.7990756, + "learning_rate": 0.00025414833459615183, + "loss": 0.80982292, + "num_input_tokens_seen": 290470224, + "router_z_loss_mlp": 0.2565918, + "step": 3502, + "time_per_iteration": 2.792283296585083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079245, + "balance_loss_mlp": 1.05329359, + "epoch": 0.6739130434782609, + "flos": 633446396928.0, + "grad_norm": 0.054401429492937234, + "language_loss": 0.80582958, + "learning_rate": 0.0002538771028870796, + "loss": 0.81662202, + "num_input_tokens_seen": 290542864, + "router_z_loss_mlp": 0.2598877, + "step": 3503, + "time_per_iteration": 2.7585413455963135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073955, + "balance_loss_mlp": 1.04783654, + "epoch": 0.6741054251635245, + "flos": 531445888512.0, + "grad_norm": 0.064846065362636, + "language_loss": 0.81689268, + "learning_rate": 0.0002536059667313903, + "loss": 0.82763219, + "num_input_tokens_seen": 290617248, + "router_z_loss_mlp": 0.2611084, + "step": 3504, + "time_per_iteration": 2.71769118309021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074293, + "balance_loss_mlp": 1.04768562, + "epoch": 0.674297806848788, + "flos": 542604220416.0, + "grad_norm": 0.06348051765573881, + "language_loss": 0.89503717, + "learning_rate": 0.0002533349262343483, + "loss": 0.90578014, + "num_input_tokens_seen": 290690112, + "router_z_loss_mlp": 0.26635742, + "step": 3505, + "time_per_iteration": 2.660651445388794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079263, + "balance_loss_mlp": 1.05396676, + "epoch": 0.6744901885340515, + "flos": 463523129856.0, + "grad_norm": 0.07580313985305334, + "language_loss": 0.81963527, + "learning_rate": 0.0002530639815011807, + "loss": 0.83042789, + "num_input_tokens_seen": 290756352, + "router_z_loss_mlp": 0.25317383, + "step": 3506, + "time_per_iteration": 2.4884142875671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107741, + "balance_loss_mlp": 1.05192339, + "epoch": 0.6746825702193151, + "flos": 631830481920.0, + "grad_norm": 0.07059145793354948, + "language_loss": 0.85113943, + "learning_rate": 0.0002527931326370781, + "loss": 0.86191356, + "num_input_tokens_seen": 290829776, + "router_z_loss_mlp": 0.25512695, + "step": 3507, + "time_per_iteration": 2.7946653366088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078834, + "balance_loss_mlp": 1.05275106, + "epoch": 0.6748749519045787, + "flos": 671146186752.0, + "grad_norm": 0.06075343684572694, + "language_loss": 0.83284092, + "learning_rate": 0.00025252237974719276, + "loss": 0.84362924, + "num_input_tokens_seen": 290900736, + "router_z_loss_mlp": 0.26098633, + "step": 3508, + "time_per_iteration": 2.8548471927642822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108079, + "balance_loss_mlp": 1.05530286, + "epoch": 0.6750673335898423, + "flos": 767102980608.0, + "grad_norm": 0.06110839735898087, + "language_loss": 0.80529547, + "learning_rate": 0.00025225172293664056, + "loss": 0.81610334, + "num_input_tokens_seen": 290981696, + "router_z_loss_mlp": 0.25500488, + "step": 3509, + "time_per_iteration": 3.0396220684051514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013373, + "balance_loss_mlp": 1.00583911, + "epoch": 0.6752597152751059, + "flos": 1512607675392.0, + "grad_norm": 0.007570597102939453, + "language_loss": 0.76933134, + "learning_rate": 0.00025198116231049954, + "loss": 0.77946508, + "num_input_tokens_seen": 291217888, + "router_z_loss_mlp": 0.07519531, + "step": 3510, + "time_per_iteration": 4.9238317012786865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081964, + "balance_loss_mlp": 1.05588078, + "epoch": 0.6754520969603693, + "flos": 687297996288.0, + "grad_norm": 0.06266149701009033, + "language_loss": 0.85147846, + "learning_rate": 0.00025171069797381106, + "loss": 0.86229801, + "num_input_tokens_seen": 291287856, + "router_z_loss_mlp": 0.26123047, + "step": 3511, + "time_per_iteration": 2.842026948928833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107574, + "balance_loss_mlp": 1.05036068, + "epoch": 0.6756444786456329, + "flos": 500577947136.0, + "grad_norm": 0.05295851129049709, + "language_loss": 0.82071269, + "learning_rate": 0.00025144033003157864, + "loss": 0.83147007, + "num_input_tokens_seen": 291354912, + "router_z_loss_mlp": 0.25402832, + "step": 3512, + "time_per_iteration": 2.5853493213653564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087572, + "balance_loss_mlp": 1.06216824, + "epoch": 0.6758368603308965, + "flos": 492616940544.0, + "grad_norm": 0.10878048166540129, + "language_loss": 0.78940082, + "learning_rate": 0.00025117005858876806, + "loss": 0.80027652, + "num_input_tokens_seen": 291426816, + "router_z_loss_mlp": 0.25402832, + "step": 3513, + "time_per_iteration": 2.683076858520508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081602, + "balance_loss_mlp": 1.05658007, + "epoch": 0.6760292420161601, + "flos": 555934233600.0, + "grad_norm": 0.062477123984618736, + "language_loss": 0.8580628, + "learning_rate": 0.000250899883750308, + "loss": 0.86887884, + "num_input_tokens_seen": 291497648, + "router_z_loss_mlp": 0.25036621, + "step": 3514, + "time_per_iteration": 2.7132656574249268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081057, + "balance_loss_mlp": 1.05621386, + "epoch": 0.6762216237014236, + "flos": 607601668608.0, + "grad_norm": 0.06208222280166845, + "language_loss": 0.82150948, + "learning_rate": 0.00025062980562109006, + "loss": 0.83232003, + "num_input_tokens_seen": 291568080, + "router_z_loss_mlp": 0.24841309, + "step": 3515, + "time_per_iteration": 4.169267177581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080203, + "balance_loss_mlp": 1.0545373, + "epoch": 0.6764140053866872, + "flos": 533785697280.0, + "grad_norm": 0.06255733263360135, + "language_loss": 0.83099926, + "learning_rate": 0.0002503598243059677, + "loss": 0.84180129, + "num_input_tokens_seen": 291644896, + "router_z_loss_mlp": 0.25683594, + "step": 3516, + "time_per_iteration": 2.7749977111816406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085607, + "balance_loss_mlp": 1.05966699, + "epoch": 0.6766063870719508, + "flos": 504810455040.0, + "grad_norm": 0.06025675944988047, + "language_loss": 0.8034898, + "learning_rate": 0.0002500899399097568, + "loss": 0.8143459, + "num_input_tokens_seen": 291716864, + "router_z_loss_mlp": 0.25976562, + "step": 3517, + "time_per_iteration": 2.638920307159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087679, + "balance_loss_mlp": 1.06179833, + "epoch": 0.6767987687572143, + "flos": 513176726016.0, + "grad_norm": 0.06061041390288269, + "language_loss": 0.85528451, + "learning_rate": 0.0002498201525372359, + "loss": 0.86616129, + "num_input_tokens_seen": 291786000, + "router_z_loss_mlp": 0.25915527, + "step": 3518, + "time_per_iteration": 2.6280837059020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090365, + "balance_loss_mlp": 1.06465113, + "epoch": 0.6769911504424779, + "flos": 525039128064.0, + "grad_norm": 0.05678341042479038, + "language_loss": 0.83314502, + "learning_rate": 0.00024955046229314584, + "loss": 0.84404874, + "num_input_tokens_seen": 291854768, + "router_z_loss_mlp": 0.25732422, + "step": 3519, + "time_per_iteration": 2.598114013671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090498, + "balance_loss_mlp": 1.06486833, + "epoch": 0.6771835321277414, + "flos": 449896508928.0, + "grad_norm": 0.06076117053087645, + "language_loss": 0.87566268, + "learning_rate": 0.00024928086928218947, + "loss": 0.88656765, + "num_input_tokens_seen": 291918096, + "router_z_loss_mlp": 0.25646973, + "step": 3520, + "time_per_iteration": 2.4903347492218018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088735, + "balance_loss_mlp": 1.06373692, + "epoch": 0.677375913813005, + "flos": 709349985792.0, + "grad_norm": 0.07287675105407085, + "language_loss": 0.76298815, + "learning_rate": 0.00024901137360903216, + "loss": 0.77387547, + "num_input_tokens_seen": 291998752, + "router_z_loss_mlp": 0.25012207, + "step": 3521, + "time_per_iteration": 2.957127332687378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095619, + "balance_loss_mlp": 1.07063317, + "epoch": 0.6775682954982686, + "flos": 428420109312.0, + "grad_norm": 0.06312793336661301, + "language_loss": 0.80923325, + "learning_rate": 0.00024874197537830115, + "loss": 0.82018942, + "num_input_tokens_seen": 292065056, + "router_z_loss_mlp": 0.25, + "step": 3522, + "time_per_iteration": 2.5331904888153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088383, + "balance_loss_mlp": 1.06340837, + "epoch": 0.6777606771835322, + "flos": 437905626624.0, + "grad_norm": 0.06755999829243825, + "language_loss": 0.83245486, + "learning_rate": 0.00024847267469458684, + "loss": 0.84333861, + "num_input_tokens_seen": 292129248, + "router_z_loss_mlp": 0.24987793, + "step": 3523, + "time_per_iteration": 2.525132417678833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087699, + "balance_loss_mlp": 1.06227136, + "epoch": 0.6779530588687956, + "flos": 775442087424.0, + "grad_norm": 0.06413222868120108, + "language_loss": 0.7768755, + "learning_rate": 0.00024820347166244034, + "loss": 0.78775245, + "num_input_tokens_seen": 292206080, + "router_z_loss_mlp": 0.2545166, + "step": 3524, + "time_per_iteration": 2.981156826019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086699, + "balance_loss_mlp": 1.06202292, + "epoch": 0.6781454405540592, + "flos": 571782094848.0, + "grad_norm": 0.05504268755714505, + "language_loss": 0.85045242, + "learning_rate": 0.0002479343663863755, + "loss": 0.86131942, + "num_input_tokens_seen": 292280192, + "router_z_loss_mlp": 0.24682617, + "step": 3525, + "time_per_iteration": 2.8227763175964355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085391, + "balance_loss_mlp": 1.05880737, + "epoch": 0.6783378222393228, + "flos": 485026693632.0, + "grad_norm": 0.05863991257689852, + "language_loss": 0.76910073, + "learning_rate": 0.00024766535897086876, + "loss": 0.77995467, + "num_input_tokens_seen": 292347792, + "router_z_loss_mlp": 0.26623535, + "step": 3526, + "time_per_iteration": 2.5773653984069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088289, + "balance_loss_mlp": 1.06144333, + "epoch": 0.6785302039245864, + "flos": 482839958016.0, + "grad_norm": 0.09784293796140163, + "language_loss": 0.78738832, + "learning_rate": 0.0002473964495203578, + "loss": 0.79827124, + "num_input_tokens_seen": 292420032, + "router_z_loss_mlp": 0.26879883, + "step": 3527, + "time_per_iteration": 2.6880078315734863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084335, + "balance_loss_mlp": 1.0582881, + "epoch": 0.67872258560985, + "flos": 524732608512.0, + "grad_norm": 0.057535616480669176, + "language_loss": 0.85700953, + "learning_rate": 0.0002471276381392425, + "loss": 0.86785293, + "num_input_tokens_seen": 292497792, + "router_z_loss_mlp": 0.26062012, + "step": 3528, + "time_per_iteration": 2.7601518630981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028211, + "balance_loss_mlp": 1.02067733, + "epoch": 0.6789149672951135, + "flos": 1552605428736.0, + "grad_norm": 0.014996437557936866, + "language_loss": 0.78188634, + "learning_rate": 0.0002468589249318848, + "loss": 0.79216838, + "num_input_tokens_seen": 292726704, + "router_z_loss_mlp": 0.07519531, + "step": 3529, + "time_per_iteration": 4.95120096206665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088756, + "balance_loss_mlp": 1.06375766, + "epoch": 0.6791073489803771, + "flos": 741406556160.0, + "grad_norm": 0.06208247419260481, + "language_loss": 0.84420717, + "learning_rate": 0.00024659031000260826, + "loss": 0.85509473, + "num_input_tokens_seen": 292802320, + "router_z_loss_mlp": 0.25, + "step": 3530, + "time_per_iteration": 2.8619091510772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085263, + "balance_loss_mlp": 1.05816674, + "epoch": 0.6792997306656406, + "flos": 576365538816.0, + "grad_norm": 0.0739213834175869, + "language_loss": 0.80927098, + "learning_rate": 0.0002463217934556985, + "loss": 0.82012367, + "num_input_tokens_seen": 292870480, + "router_z_loss_mlp": 0.27111816, + "step": 3531, + "time_per_iteration": 2.6372668743133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015203, + "balance_loss_mlp": 1.00790787, + "epoch": 0.6794921123509042, + "flos": 1503337273344.0, + "grad_norm": 0.011067583088495437, + "language_loss": 0.7653209, + "learning_rate": 0.000246053375395403, + "loss": 0.77547294, + "num_input_tokens_seen": 293100752, + "router_z_loss_mlp": 0.07275391, + "step": 3532, + "time_per_iteration": 4.7275402545928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089408, + "balance_loss_mlp": 1.06364703, + "epoch": 0.6796844940361677, + "flos": 698923261440.0, + "grad_norm": 0.07509562800064129, + "language_loss": 0.83719718, + "learning_rate": 0.0002457850559259306, + "loss": 0.84809136, + "num_input_tokens_seen": 293178192, + "router_z_loss_mlp": 0.25769043, + "step": 3533, + "time_per_iteration": 2.9546730518341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082617, + "balance_loss_mlp": 1.05801249, + "epoch": 0.6798768757214313, + "flos": 552759303168.0, + "grad_norm": 0.058098360832657354, + "language_loss": 0.82016122, + "learning_rate": 0.00024551683515145275, + "loss": 0.83098733, + "num_input_tokens_seen": 293246368, + "router_z_loss_mlp": 0.24597168, + "step": 3534, + "time_per_iteration": 2.675198793411255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080796, + "balance_loss_mlp": 1.05546427, + "epoch": 0.6800692574066949, + "flos": 522936456192.0, + "grad_norm": 0.05760567747955486, + "language_loss": 0.8703866, + "learning_rate": 0.0002452487131761014, + "loss": 0.88119459, + "num_input_tokens_seen": 293320656, + "router_z_loss_mlp": 0.25354004, + "step": 3535, + "time_per_iteration": 2.7560551166534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080116, + "balance_loss_mlp": 1.0540328, + "epoch": 0.6802616390919585, + "flos": 574023158784.0, + "grad_norm": 0.06295067828117173, + "language_loss": 0.80308378, + "learning_rate": 0.00024498069010397093, + "loss": 0.81388497, + "num_input_tokens_seen": 293388592, + "router_z_loss_mlp": 0.26123047, + "step": 3536, + "time_per_iteration": 2.7834858894348145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081714, + "balance_loss_mlp": 1.05659688, + "epoch": 0.6804540207772221, + "flos": 488157207552.0, + "grad_norm": 0.05311413665555526, + "language_loss": 0.85467112, + "learning_rate": 0.00024471276603911697, + "loss": 0.86548829, + "num_input_tokens_seen": 293453936, + "router_z_loss_mlp": 0.2512207, + "step": 3537, + "time_per_iteration": 2.644085645675659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086182, + "balance_loss_mlp": 1.06095743, + "epoch": 0.6806464024624855, + "flos": 578594119680.0, + "grad_norm": 0.0668547033198753, + "language_loss": 0.79341853, + "learning_rate": 0.0002444449410855572, + "loss": 0.80428034, + "num_input_tokens_seen": 293527664, + "router_z_loss_mlp": 0.25231934, + "step": 3538, + "time_per_iteration": 2.790034532546997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083109, + "balance_loss_mlp": 1.0583849, + "epoch": 0.6808387841477491, + "flos": 553722905088.0, + "grad_norm": 0.056899188287429556, + "language_loss": 0.84389639, + "learning_rate": 0.00024417721534727033, + "loss": 0.85472751, + "num_input_tokens_seen": 293599344, + "router_z_loss_mlp": 0.24731445, + "step": 3539, + "time_per_iteration": 2.703143358230591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081155, + "balance_loss_mlp": 1.0562042, + "epoch": 0.6810311658330127, + "flos": 426841270272.0, + "grad_norm": 0.06562679569248508, + "language_loss": 0.83345222, + "learning_rate": 0.00024390958892819687, + "loss": 0.84426379, + "num_input_tokens_seen": 293663088, + "router_z_loss_mlp": 0.24938965, + "step": 3540, + "time_per_iteration": 2.5123190879821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083792, + "balance_loss_mlp": 1.0574708, + "epoch": 0.6812235475182763, + "flos": 572256368640.0, + "grad_norm": 0.0704351751694786, + "language_loss": 0.80845803, + "learning_rate": 0.0002436420619322381, + "loss": 0.81929594, + "num_input_tokens_seen": 293741296, + "router_z_loss_mlp": 0.26367188, + "step": 3541, + "time_per_iteration": 2.8810999393463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080954, + "balance_loss_mlp": 1.05532384, + "epoch": 0.6814159292035398, + "flos": 501917078016.0, + "grad_norm": 0.05740970422005706, + "language_loss": 0.82921457, + "learning_rate": 0.0002433746344632577, + "loss": 0.84002411, + "num_input_tokens_seen": 293815840, + "router_z_loss_mlp": 0.25634766, + "step": 3542, + "time_per_iteration": 2.7135009765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085031, + "balance_loss_mlp": 1.0591507, + "epoch": 0.6816083108888034, + "flos": 765531482112.0, + "grad_norm": 0.09305819117462581, + "language_loss": 0.80352092, + "learning_rate": 0.00024310730662508006, + "loss": 0.81437123, + "num_input_tokens_seen": 293896368, + "router_z_loss_mlp": 0.25891113, + "step": 3543, + "time_per_iteration": 3.061795949935913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080318, + "balance_loss_mlp": 1.05509281, + "epoch": 0.681800692574067, + "flos": 479459824128.0, + "grad_norm": 0.05668741815102704, + "language_loss": 0.87538439, + "learning_rate": 0.0002428400785214911, + "loss": 0.88618755, + "num_input_tokens_seen": 293963344, + "router_z_loss_mlp": 0.25231934, + "step": 3544, + "time_per_iteration": 2.600311279296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077375, + "balance_loss_mlp": 1.05138755, + "epoch": 0.6819930742593305, + "flos": 691604656128.0, + "grad_norm": 0.05461889595804736, + "language_loss": 0.8282584, + "learning_rate": 0.00024257295025623794, + "loss": 0.83903217, + "num_input_tokens_seen": 294035440, + "router_z_loss_mlp": 0.26025391, + "step": 3545, + "time_per_iteration": 2.9303810596466064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076909, + "balance_loss_mlp": 1.05181503, + "epoch": 0.6821854559445941, + "flos": 678096603648.0, + "grad_norm": 0.05463357395047058, + "language_loss": 0.80816013, + "learning_rate": 0.00024230592193302892, + "loss": 0.8189292, + "num_input_tokens_seen": 294116944, + "router_z_loss_mlp": 0.25085449, + "step": 3546, + "time_per_iteration": 3.0259780883789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108338, + "balance_loss_mlp": 1.05730915, + "epoch": 0.6823778376298576, + "flos": 462191339520.0, + "grad_norm": 0.061332624341889866, + "language_loss": 0.84813237, + "learning_rate": 0.00024203899365553372, + "loss": 0.85896623, + "num_input_tokens_seen": 294178976, + "router_z_loss_mlp": 0.2611084, + "step": 3547, + "time_per_iteration": 2.5990257263183594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101826, + "balance_loss_mlp": 1.01120329, + "epoch": 0.6825702193151212, + "flos": 1475298842112.0, + "grad_norm": 0.024302183931920462, + "language_loss": 0.76734358, + "learning_rate": 0.00024177216552738302, + "loss": 0.7775262, + "num_input_tokens_seen": 294384960, + "router_z_loss_mlp": 0.07080078, + "step": 3548, + "time_per_iteration": 4.529210090637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082512, + "balance_loss_mlp": 1.05796695, + "epoch": 0.6827626010003848, + "flos": 723114998784.0, + "grad_norm": 0.06743291659407481, + "language_loss": 0.83211255, + "learning_rate": 0.00024150543765216848, + "loss": 0.84293771, + "num_input_tokens_seen": 294461408, + "router_z_loss_mlp": 0.2454834, + "step": 3549, + "time_per_iteration": 2.9848315715789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079547, + "balance_loss_mlp": 1.05444109, + "epoch": 0.6829549826856484, + "flos": 558864686592.0, + "grad_norm": 0.06339760092568236, + "language_loss": 0.83246768, + "learning_rate": 0.00024123881013344352, + "loss": 0.84326315, + "num_input_tokens_seen": 294530624, + "router_z_loss_mlp": 0.25109863, + "step": 3550, + "time_per_iteration": 2.757277011871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078185, + "balance_loss_mlp": 1.05236471, + "epoch": 0.6831473643709118, + "flos": 624934393344.0, + "grad_norm": 0.05786884638385198, + "language_loss": 0.79739892, + "learning_rate": 0.00024097228307472202, + "loss": 0.80818081, + "num_input_tokens_seen": 294606784, + "router_z_loss_mlp": 0.25854492, + "step": 3551, + "time_per_iteration": 2.8328561782836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078138, + "balance_loss_mlp": 1.0525794, + "epoch": 0.6833397460561754, + "flos": 713861849088.0, + "grad_norm": 0.06566140613628157, + "language_loss": 0.81969666, + "learning_rate": 0.00024070585657947846, + "loss": 0.83047807, + "num_input_tokens_seen": 294686960, + "router_z_loss_mlp": 0.25585938, + "step": 3552, + "time_per_iteration": 2.962819814682007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081884, + "balance_loss_mlp": 1.05676627, + "epoch": 0.683532127741439, + "flos": 464704045056.0, + "grad_norm": 0.0534920389978937, + "language_loss": 0.8565321, + "learning_rate": 0.00024043953075114934, + "loss": 0.86735094, + "num_input_tokens_seen": 294759712, + "router_z_loss_mlp": 0.2512207, + "step": 3553, + "time_per_iteration": 2.638843059539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075297, + "balance_loss_mlp": 1.04947591, + "epoch": 0.6837245094267026, + "flos": 582251037696.0, + "grad_norm": 0.05485764052076591, + "language_loss": 0.88990396, + "learning_rate": 0.00024017330569313128, + "loss": 0.90065694, + "num_input_tokens_seen": 294830592, + "router_z_loss_mlp": 0.25842285, + "step": 3554, + "time_per_iteration": 2.7616748809814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078527, + "balance_loss_mlp": 1.05215812, + "epoch": 0.6839168911119662, + "flos": 794173413888.0, + "grad_norm": 0.07669249148194994, + "language_loss": 0.75058365, + "learning_rate": 0.0002399071815087821, + "loss": 0.76136887, + "num_input_tokens_seen": 294907504, + "router_z_loss_mlp": 0.26391602, + "step": 3555, + "time_per_iteration": 3.047292470932007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080363, + "balance_loss_mlp": 1.05511451, + "epoch": 0.6841092727972297, + "flos": 580009973760.0, + "grad_norm": 0.0595534971161133, + "language_loss": 0.84028351, + "learning_rate": 0.00023964115830142025, + "loss": 0.85108721, + "num_input_tokens_seen": 294977600, + "router_z_loss_mlp": 0.25256348, + "step": 3556, + "time_per_iteration": 2.708983898162842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074295, + "balance_loss_mlp": 1.05035782, + "epoch": 0.6843016544824932, + "flos": 383742738432.0, + "grad_norm": 0.0757977451950182, + "language_loss": 0.88028133, + "learning_rate": 0.00023937523617432522, + "loss": 0.89102429, + "num_input_tokens_seen": 295039408, + "router_z_loss_mlp": 0.23950195, + "step": 3557, + "time_per_iteration": 2.454397201538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077509, + "balance_loss_mlp": 1.05258226, + "epoch": 0.6844940361677568, + "flos": 1439035476480.0, + "grad_norm": 0.08760866739293877, + "language_loss": 0.87423909, + "learning_rate": 0.00023910941523073705, + "loss": 0.88501424, + "num_input_tokens_seen": 295142928, + "router_z_loss_mlp": 0.24938965, + "step": 3558, + "time_per_iteration": 3.9113569259643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083945, + "balance_loss_mlp": 1.05796981, + "epoch": 0.6846864178530204, + "flos": 520870860288.0, + "grad_norm": 0.053991545736228864, + "language_loss": 0.86934376, + "learning_rate": 0.0002388436955738566, + "loss": 0.88018322, + "num_input_tokens_seen": 295215504, + "router_z_loss_mlp": 0.2598877, + "step": 3559, + "time_per_iteration": 2.837038040161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080181, + "balance_loss_mlp": 1.05512345, + "epoch": 0.6848787995382839, + "flos": 717946053120.0, + "grad_norm": 0.06078167941241102, + "language_loss": 0.81248534, + "learning_rate": 0.00023857807730684523, + "loss": 0.82328713, + "num_input_tokens_seen": 295291024, + "router_z_loss_mlp": 0.25061035, + "step": 3560, + "time_per_iteration": 2.892477035522461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084991, + "balance_loss_mlp": 1.05795407, + "epoch": 0.6850711812235475, + "flos": 511061571072.0, + "grad_norm": 0.06645470458229728, + "language_loss": 0.82908154, + "learning_rate": 0.00023831256053282547, + "loss": 0.83993149, + "num_input_tokens_seen": 295363248, + "router_z_loss_mlp": 0.27050781, + "step": 3561, + "time_per_iteration": 2.724573850631714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081036, + "balance_loss_mlp": 1.05547762, + "epoch": 0.6852635629088111, + "flos": 668151493632.0, + "grad_norm": 0.06597218498580906, + "language_loss": 0.78399622, + "learning_rate": 0.00023804714535488003, + "loss": 0.7948066, + "num_input_tokens_seen": 295442032, + "router_z_loss_mlp": 0.25561523, + "step": 3562, + "time_per_iteration": 2.95060133934021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019571, + "balance_loss_mlp": 1.01251411, + "epoch": 0.6854559445940747, + "flos": 1522980071424.0, + "grad_norm": 0.015166594487017694, + "language_loss": 0.7980963, + "learning_rate": 0.0002377818318760519, + "loss": 0.80829203, + "num_input_tokens_seen": 295680560, + "router_z_loss_mlp": 0.07080078, + "step": 3563, + "time_per_iteration": 4.933622360229492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087969, + "balance_loss_mlp": 1.0631851, + "epoch": 0.6856483262793382, + "flos": 454203168768.0, + "grad_norm": 0.058645114783078524, + "language_loss": 0.81150877, + "learning_rate": 0.00023751662019934488, + "loss": 0.82238841, + "num_input_tokens_seen": 295745712, + "router_z_loss_mlp": 0.2479248, + "step": 3564, + "time_per_iteration": 2.551375150680542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080841, + "balance_loss_mlp": 1.05612862, + "epoch": 0.6858407079646017, + "flos": 615552763392.0, + "grad_norm": 0.05683958550718021, + "language_loss": 0.79323113, + "learning_rate": 0.00023725151042772364, + "loss": 0.80403948, + "num_input_tokens_seen": 295815104, + "router_z_loss_mlp": 0.24719238, + "step": 3565, + "time_per_iteration": 2.8488030433654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081595, + "balance_loss_mlp": 1.05563116, + "epoch": 0.6860330896498653, + "flos": 466053087744.0, + "grad_norm": 0.06643768922422526, + "language_loss": 0.83425218, + "learning_rate": 0.00023698650266411276, + "loss": 0.84506816, + "num_input_tokens_seen": 295882928, + "router_z_loss_mlp": 0.26000977, + "step": 3566, + "time_per_iteration": 2.704754590988159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079538, + "balance_loss_mlp": 1.05554175, + "epoch": 0.6862254713351289, + "flos": 864270425088.0, + "grad_norm": 0.06089372321072988, + "language_loss": 0.83402336, + "learning_rate": 0.00023672159701139755, + "loss": 0.84481871, + "num_input_tokens_seen": 295970960, + "router_z_loss_mlp": 0.23986816, + "step": 3567, + "time_per_iteration": 3.2112581729888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084436, + "balance_loss_mlp": 1.05952144, + "epoch": 0.6864178530203925, + "flos": 447141523968.0, + "grad_norm": 0.06475688467901158, + "language_loss": 0.86233699, + "learning_rate": 0.00023645679357242296, + "loss": 0.87318128, + "num_input_tokens_seen": 296036128, + "router_z_loss_mlp": 0.24890137, + "step": 3568, + "time_per_iteration": 2.618299961090088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077978, + "balance_loss_mlp": 1.05325365, + "epoch": 0.6866102347056561, + "flos": 424269093888.0, + "grad_norm": 0.06930985258360142, + "language_loss": 0.84079957, + "learning_rate": 0.00023619209244999534, + "loss": 0.85157931, + "num_input_tokens_seen": 296101440, + "router_z_loss_mlp": 0.24694824, + "step": 3569, + "time_per_iteration": 2.5762784481048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082729, + "balance_loss_mlp": 1.05775487, + "epoch": 0.6868026163909196, + "flos": 472373586432.0, + "grad_norm": 0.07239946064246126, + "language_loss": 0.84962302, + "learning_rate": 0.0002359274937468806, + "loss": 0.86045027, + "num_input_tokens_seen": 296165504, + "router_z_loss_mlp": 0.24975586, + "step": 3570, + "time_per_iteration": 2.507097005844116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080008, + "balance_loss_mlp": 1.0555582, + "epoch": 0.6869949980761831, + "flos": 464190124032.0, + "grad_norm": 0.052246818326421945, + "language_loss": 0.78233075, + "learning_rate": 0.00023566299756580512, + "loss": 0.79313087, + "num_input_tokens_seen": 296236880, + "router_z_loss_mlp": 0.2442627, + "step": 3571, + "time_per_iteration": 2.6490540504455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081651, + "balance_loss_mlp": 1.05523372, + "epoch": 0.6871873797614467, + "flos": 426235944960.0, + "grad_norm": 0.06589873086425142, + "language_loss": 0.78497767, + "learning_rate": 0.0002353986040094551, + "loss": 0.79579425, + "num_input_tokens_seen": 296299776, + "router_z_loss_mlp": 0.2644043, + "step": 3572, + "time_per_iteration": 2.525590419769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079016, + "balance_loss_mlp": 1.05405378, + "epoch": 0.6873797614467103, + "flos": 443625569280.0, + "grad_norm": 0.058453848630334905, + "language_loss": 0.79833031, + "learning_rate": 0.00023513431318047796, + "loss": 0.80912042, + "num_input_tokens_seen": 296365408, + "router_z_loss_mlp": 0.24975586, + "step": 3573, + "time_per_iteration": 2.5652148723602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081007, + "balance_loss_mlp": 1.0563786, + "epoch": 0.6875721431319738, + "flos": 992323436544.0, + "grad_norm": 0.12934714491167457, + "language_loss": 0.77343333, + "learning_rate": 0.00023487012518147977, + "loss": 0.78424335, + "num_input_tokens_seen": 296445488, + "router_z_loss_mlp": 0.24621582, + "step": 3574, + "time_per_iteration": 3.2728779315948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082155, + "balance_loss_mlp": 1.05660903, + "epoch": 0.6877645248172374, + "flos": 1285513638912.0, + "grad_norm": 0.06788347581923994, + "language_loss": 0.8458752, + "learning_rate": 0.00023460604011502772, + "loss": 0.85669678, + "num_input_tokens_seen": 296529936, + "router_z_loss_mlp": 0.25549316, + "step": 3575, + "time_per_iteration": 3.650050163269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071019, + "balance_loss_mlp": 1.04640222, + "epoch": 0.687956906502501, + "flos": 876733383168.0, + "grad_norm": 0.06594699265094836, + "language_loss": 0.85666633, + "learning_rate": 0.00023434205808364845, + "loss": 0.86737645, + "num_input_tokens_seen": 296607488, + "router_z_loss_mlp": 0.24621582, + "step": 3576, + "time_per_iteration": 3.2174363136291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081926, + "balance_loss_mlp": 1.05646336, + "epoch": 0.6881492881877646, + "flos": 563324419584.0, + "grad_norm": 0.073624827285274, + "language_loss": 0.85645866, + "learning_rate": 0.00023407817918982932, + "loss": 0.86727792, + "num_input_tokens_seen": 296678672, + "router_z_loss_mlp": 0.25488281, + "step": 3577, + "time_per_iteration": 2.8009090423583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088022, + "balance_loss_mlp": 1.06271362, + "epoch": 0.6883416698730281, + "flos": 795127104000.0, + "grad_norm": 0.06549349473125507, + "language_loss": 0.79113662, + "learning_rate": 0.00023381440353601718, + "loss": 0.80201685, + "num_input_tokens_seen": 296758896, + "router_z_loss_mlp": 0.2532959, + "step": 3578, + "time_per_iteration": 3.023149251937866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080761, + "balance_loss_mlp": 1.05627584, + "epoch": 0.6885340515582916, + "flos": 723621579264.0, + "grad_norm": 0.05959315999492073, + "language_loss": 0.86070436, + "learning_rate": 0.00023355073122461822, + "loss": 0.87151194, + "num_input_tokens_seen": 296830736, + "router_z_loss_mlp": 0.24487305, + "step": 3579, + "time_per_iteration": 2.9520890712738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084384, + "balance_loss_mlp": 1.05880141, + "epoch": 0.6887264332435552, + "flos": 1010926282752.0, + "grad_norm": 0.06355756593191678, + "language_loss": 0.82827502, + "learning_rate": 0.00023328716235799973, + "loss": 0.83911884, + "num_input_tokens_seen": 296911504, + "router_z_loss_mlp": 0.25598145, + "step": 3580, + "time_per_iteration": 3.351285219192505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080812, + "balance_loss_mlp": 1.05680299, + "epoch": 0.6889188149288188, + "flos": 585262983168.0, + "grad_norm": 0.05871142943590934, + "language_loss": 0.84103072, + "learning_rate": 0.00023302369703848803, + "loss": 0.85183883, + "num_input_tokens_seen": 296981488, + "router_z_loss_mlp": 0.24023438, + "step": 3581, + "time_per_iteration": 2.7034530639648438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088103, + "balance_loss_mlp": 1.06281841, + "epoch": 0.6891111966140824, + "flos": 636119889408.0, + "grad_norm": 0.05872811421519248, + "language_loss": 0.80432433, + "learning_rate": 0.00023276033536836937, + "loss": 0.81520534, + "num_input_tokens_seen": 297054896, + "router_z_loss_mlp": 0.25305176, + "step": 3582, + "time_per_iteration": 2.933551073074341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077987, + "balance_loss_mlp": 1.05369234, + "epoch": 0.6893035782993459, + "flos": 495270609408.0, + "grad_norm": 0.06546577273757126, + "language_loss": 0.84750611, + "learning_rate": 0.00023249707744988984, + "loss": 0.85828596, + "num_input_tokens_seen": 297128224, + "router_z_loss_mlp": 0.24279785, + "step": 3583, + "time_per_iteration": 2.694974184036255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083842, + "balance_loss_mlp": 1.05804539, + "epoch": 0.6894959599846094, + "flos": 458215792128.0, + "grad_norm": 0.07473355522869814, + "language_loss": 0.82210362, + "learning_rate": 0.00023223392338525529, + "loss": 0.83294201, + "num_input_tokens_seen": 297191312, + "router_z_loss_mlp": 0.25830078, + "step": 3584, + "time_per_iteration": 2.5522758960723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078352, + "balance_loss_mlp": 1.05215001, + "epoch": 0.689688341669873, + "flos": 505003175424.0, + "grad_norm": 0.05831544334966422, + "language_loss": 0.78814328, + "learning_rate": 0.00023197087327663107, + "loss": 0.79892683, + "num_input_tokens_seen": 297261904, + "router_z_loss_mlp": 0.26208496, + "step": 3585, + "time_per_iteration": 2.6880340576171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083733, + "balance_loss_mlp": 1.05843663, + "epoch": 0.6898807233551366, + "flos": 763910797824.0, + "grad_norm": 0.6312762348239643, + "language_loss": 0.81380439, + "learning_rate": 0.00023170792722614243, + "loss": 0.8246417, + "num_input_tokens_seen": 297338352, + "router_z_loss_mlp": 0.25317383, + "step": 3586, + "time_per_iteration": 3.0318641662597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079757, + "balance_loss_mlp": 1.05460346, + "epoch": 0.6900731050404002, + "flos": 583337977344.0, + "grad_norm": 0.05006567848129158, + "language_loss": 0.83709162, + "learning_rate": 0.00023144508533587377, + "loss": 0.84788913, + "num_input_tokens_seen": 297416688, + "router_z_loss_mlp": 0.25170898, + "step": 3587, + "time_per_iteration": 2.8474464416503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090758, + "balance_loss_mlp": 1.06386399, + "epoch": 0.6902654867256637, + "flos": 711865262592.0, + "grad_norm": 0.06762785817059219, + "language_loss": 0.79246032, + "learning_rate": 0.0002311823477078698, + "loss": 0.80336785, + "num_input_tokens_seen": 297499968, + "router_z_loss_mlp": 0.26928711, + "step": 3588, + "time_per_iteration": 2.9889235496520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097034, + "balance_loss_mlp": 1.0714879, + "epoch": 0.6904578684109273, + "flos": 597112902144.0, + "grad_norm": 0.09415937130110832, + "language_loss": 0.85614562, + "learning_rate": 0.00023091971444413428, + "loss": 0.86711591, + "num_input_tokens_seen": 297574480, + "router_z_loss_mlp": 0.2557373, + "step": 3589, + "time_per_iteration": 2.809373378753662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101509, + "balance_loss_mlp": 1.07424605, + "epoch": 0.6906502500961909, + "flos": 585040527360.0, + "grad_norm": 0.05794959755729282, + "language_loss": 0.82868153, + "learning_rate": 0.00023065718564663012, + "loss": 0.83969659, + "num_input_tokens_seen": 297645360, + "router_z_loss_mlp": 0.27307129, + "step": 3590, + "time_per_iteration": 2.7731661796569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074688, + "balance_loss_mlp": 1.06705844, + "epoch": 0.6908426317814544, + "flos": 1587827017728.0, + "grad_norm": 0.02655452112357536, + "language_loss": 0.73911589, + "learning_rate": 0.00023039476141728011, + "loss": 0.74986279, + "num_input_tokens_seen": 297879472, + "router_z_loss_mlp": 0.07617188, + "step": 3591, + "time_per_iteration": 4.988200664520264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110259, + "balance_loss_mlp": 1.07704329, + "epoch": 0.6910350134667179, + "flos": 500780579328.0, + "grad_norm": 0.05972599436202674, + "language_loss": 0.81043237, + "learning_rate": 0.0002301324418579666, + "loss": 0.82145822, + "num_input_tokens_seen": 297950672, + "router_z_loss_mlp": 0.2557373, + "step": 3592, + "time_per_iteration": 2.6833419799804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010908, + "balance_loss_mlp": 1.0828371, + "epoch": 0.6912273951519815, + "flos": 1409194257408.0, + "grad_norm": 0.028191154698104088, + "language_loss": 0.78688473, + "learning_rate": 0.00022987022707053107, + "loss": 0.79779273, + "num_input_tokens_seen": 298171728, + "router_z_loss_mlp": 0.07958984, + "step": 3593, + "time_per_iteration": 4.760195732116699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108367, + "balance_loss_mlp": 1.08173525, + "epoch": 0.6914197768372451, + "flos": 635279625216.0, + "grad_norm": 0.065561015733832, + "language_loss": 0.809973, + "learning_rate": 0.00022960811715677415, + "loss": 0.82105672, + "num_input_tokens_seen": 298250304, + "router_z_loss_mlp": 0.26660156, + "step": 3594, + "time_per_iteration": 2.897792339324951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117, + "balance_loss_mlp": 1.08976054, + "epoch": 0.6916121585225087, + "flos": 558044246016.0, + "grad_norm": 0.0669935961338165, + "language_loss": 0.81794119, + "learning_rate": 0.00022934611221845608, + "loss": 0.82911116, + "num_input_tokens_seen": 298328000, + "router_z_loss_mlp": 0.27258301, + "step": 3595, + "time_per_iteration": 2.8457274436950684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112139, + "balance_loss_mlp": 1.08326638, + "epoch": 0.6918045402077723, + "flos": 529167748608.0, + "grad_norm": 0.05592882614281094, + "language_loss": 0.78289419, + "learning_rate": 0.00022908421235729609, + "loss": 0.79401559, + "num_input_tokens_seen": 298406832, + "router_z_loss_mlp": 0.28881836, + "step": 3596, + "time_per_iteration": 2.7383065223693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108605, + "balance_loss_mlp": 1.08028126, + "epoch": 0.6919969218930357, + "flos": 570351559680.0, + "grad_norm": 0.10609288335258749, + "language_loss": 0.85567772, + "learning_rate": 0.0002288224176749728, + "loss": 0.86676377, + "num_input_tokens_seen": 298477584, + "router_z_loss_mlp": 0.28320312, + "step": 3597, + "time_per_iteration": 2.716928720474243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102474, + "balance_loss_mlp": 1.07592607, + "epoch": 0.6921893035782993, + "flos": 683305196544.0, + "grad_norm": 0.07082611334178894, + "language_loss": 0.78666878, + "learning_rate": 0.00022856072827312385, + "loss": 0.79769349, + "num_input_tokens_seen": 298551872, + "router_z_loss_mlp": 0.26525879, + "step": 3598, + "time_per_iteration": 2.9266068935394287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102937, + "balance_loss_mlp": 1.07671118, + "epoch": 0.6923816852635629, + "flos": 546745324032.0, + "grad_norm": 0.06087087584265889, + "language_loss": 0.77196717, + "learning_rate": 0.00022829914425334598, + "loss": 0.78299654, + "num_input_tokens_seen": 298619680, + "router_z_loss_mlp": 0.26269531, + "step": 3599, + "time_per_iteration": 2.654209852218628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099143, + "balance_loss_mlp": 1.07294059, + "epoch": 0.6925740669488265, + "flos": 510036300288.0, + "grad_norm": 0.0619495663998332, + "language_loss": 0.80632389, + "learning_rate": 0.0002280376657171956, + "loss": 0.81731534, + "num_input_tokens_seen": 298690080, + "router_z_loss_mlp": 0.26245117, + "step": 3600, + "time_per_iteration": 2.699690818786621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110091, + "balance_loss_mlp": 1.07408822, + "epoch": 0.69276644863409, + "flos": 869424689664.0, + "grad_norm": 0.061197826149154644, + "language_loss": 0.76475906, + "learning_rate": 0.00022777629276618706, + "loss": 0.77576816, + "num_input_tokens_seen": 298777712, + "router_z_loss_mlp": 0.26855469, + "step": 3601, + "time_per_iteration": 3.2044432163238525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105999, + "balance_loss_mlp": 1.07805634, + "epoch": 0.6929588303193536, + "flos": 625772086272.0, + "grad_norm": 0.05964780177227117, + "language_loss": 0.77385223, + "learning_rate": 0.0002275150255017947, + "loss": 0.78491223, + "num_input_tokens_seen": 298854368, + "router_z_loss_mlp": 0.2800293, + "step": 3602, + "time_per_iteration": 2.7982289791107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073503, + "balance_loss_mlp": 1.06525421, + "epoch": 0.6931512120046172, + "flos": 1545382996992.0, + "grad_norm": 0.02252774148051873, + "language_loss": 0.75732672, + "learning_rate": 0.0002272538640254511, + "loss": 0.76806176, + "num_input_tokens_seen": 299091664, + "router_z_loss_mlp": 0.08251953, + "step": 3603, + "time_per_iteration": 5.054601192474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072901, + "balance_loss_mlp": 1.06465173, + "epoch": 0.6933435936898807, + "flos": 1448230606848.0, + "grad_norm": 0.023702106563631756, + "language_loss": 0.75127101, + "learning_rate": 0.0002269928084385487, + "loss": 0.76200008, + "num_input_tokens_seen": 299312656, + "router_z_loss_mlp": 0.08251953, + "step": 3604, + "time_per_iteration": 4.7977614402771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116592, + "balance_loss_mlp": 1.08946013, + "epoch": 0.6935359753751443, + "flos": 540896901120.0, + "grad_norm": 0.06388542496956687, + "language_loss": 0.85052603, + "learning_rate": 0.0002267318588424379, + "loss": 0.86169201, + "num_input_tokens_seen": 299381136, + "router_z_loss_mlp": 0.2713623, + "step": 3605, + "time_per_iteration": 2.654792308807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110494, + "balance_loss_mlp": 1.08425605, + "epoch": 0.6937283570604078, + "flos": 719396411904.0, + "grad_norm": 0.06333584007687255, + "language_loss": 0.87824345, + "learning_rate": 0.00022647101533842845, + "loss": 0.88934839, + "num_input_tokens_seen": 299455216, + "router_z_loss_mlp": 0.26257324, + "step": 3606, + "time_per_iteration": 2.8975396156311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109552, + "balance_loss_mlp": 1.08295608, + "epoch": 0.6939207387456714, + "flos": 522165574656.0, + "grad_norm": 0.1091990827020025, + "language_loss": 0.76831424, + "learning_rate": 0.00022621027802778872, + "loss": 0.77940977, + "num_input_tokens_seen": 299524352, + "router_z_loss_mlp": 0.26623535, + "step": 3607, + "time_per_iteration": 2.63248348236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108534, + "balance_loss_mlp": 1.08149719, + "epoch": 0.694113120430935, + "flos": 535359767040.0, + "grad_norm": 0.059104440190076296, + "language_loss": 0.78716248, + "learning_rate": 0.00022594964701174586, + "loss": 0.79824781, + "num_input_tokens_seen": 299594960, + "router_z_loss_mlp": 0.27075195, + "step": 3608, + "time_per_iteration": 2.681976079940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111794, + "balance_loss_mlp": 1.08559155, + "epoch": 0.6943055021161986, + "flos": 523358972928.0, + "grad_norm": 0.07590462116844392, + "language_loss": 0.84867048, + "learning_rate": 0.00022568912239148586, + "loss": 0.85978842, + "num_input_tokens_seen": 299662560, + "router_z_loss_mlp": 0.26245117, + "step": 3609, + "time_per_iteration": 2.6417384147644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101276, + "balance_loss_mlp": 1.07528806, + "epoch": 0.694497883801462, + "flos": 484902982656.0, + "grad_norm": 0.058005826874071686, + "language_loss": 0.81464773, + "learning_rate": 0.00022542870426815344, + "loss": 0.82566053, + "num_input_tokens_seen": 299734896, + "router_z_loss_mlp": 0.26000977, + "step": 3610, + "time_per_iteration": 2.7006101608276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109994, + "balance_loss_mlp": 1.08157444, + "epoch": 0.6946902654867256, + "flos": 461474786304.0, + "grad_norm": 0.056828094701861585, + "language_loss": 0.86496603, + "learning_rate": 0.00022516839274285173, + "loss": 0.87606597, + "num_input_tokens_seen": 299799424, + "router_z_loss_mlp": 0.28442383, + "step": 3611, + "time_per_iteration": 2.5740535259246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094552, + "balance_loss_mlp": 1.06758666, + "epoch": 0.6948826471719892, + "flos": 512855525376.0, + "grad_norm": 0.08027595543675893, + "language_loss": 0.74892008, + "learning_rate": 0.00022490818791664265, + "loss": 0.75986564, + "num_input_tokens_seen": 299868272, + "router_z_loss_mlp": 0.26977539, + "step": 3612, + "time_per_iteration": 2.608222007751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098839, + "balance_loss_mlp": 1.072685, + "epoch": 0.6950750288572528, + "flos": 557184531456.0, + "grad_norm": 0.059039400605863955, + "language_loss": 0.85845947, + "learning_rate": 0.00022464808989054676, + "loss": 0.86944789, + "num_input_tokens_seen": 299939136, + "router_z_loss_mlp": 0.26171875, + "step": 3613, + "time_per_iteration": 2.676614999771118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108263, + "balance_loss_mlp": 1.08036768, + "epoch": 0.6952674105425164, + "flos": 542475740160.0, + "grad_norm": 0.062091067173502004, + "language_loss": 0.76033241, + "learning_rate": 0.00022438809876554284, + "loss": 0.77141511, + "num_input_tokens_seen": 300009472, + "router_z_loss_mlp": 0.27905273, + "step": 3614, + "time_per_iteration": 2.6178860664367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104393, + "balance_loss_mlp": 1.07635498, + "epoch": 0.6954597922277799, + "flos": 546742752768.0, + "grad_norm": 0.07239671718654846, + "language_loss": 0.80618018, + "learning_rate": 0.00022412821464256873, + "loss": 0.81722414, + "num_input_tokens_seen": 300081008, + "router_z_loss_mlp": 0.28051758, + "step": 3615, + "time_per_iteration": 2.690284252166748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094366, + "balance_loss_mlp": 1.06802058, + "epoch": 0.6956521739130435, + "flos": 519511905792.0, + "grad_norm": 0.05319621307951733, + "language_loss": 0.82896477, + "learning_rate": 0.00022386843762252023, + "loss": 0.83990836, + "num_input_tokens_seen": 300149856, + "router_z_loss_mlp": 0.2635498, + "step": 3616, + "time_per_iteration": 2.600942611694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102453, + "balance_loss_mlp": 1.07486832, + "epoch": 0.695844555598307, + "flos": 466275543552.0, + "grad_norm": 0.06580678033513349, + "language_loss": 0.79979908, + "learning_rate": 0.00022360876780625193, + "loss": 0.81082356, + "num_input_tokens_seen": 300217344, + "router_z_loss_mlp": 0.27587891, + "step": 3617, + "time_per_iteration": 2.645925998687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095024, + "balance_loss_mlp": 1.06762934, + "epoch": 0.6960369372835706, + "flos": 600663361536.0, + "grad_norm": 0.0499393728898112, + "language_loss": 0.8003695, + "learning_rate": 0.00022334920529457604, + "loss": 0.81131971, + "num_input_tokens_seen": 300305584, + "router_z_loss_mlp": 0.27441406, + "step": 3618, + "time_per_iteration": 2.899454116821289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089532, + "balance_loss_mlp": 1.06254315, + "epoch": 0.6962293189688342, + "flos": 644233969152.0, + "grad_norm": 0.05309035379190974, + "language_loss": 0.87337005, + "learning_rate": 0.00022308975018826423, + "loss": 0.88426542, + "num_input_tokens_seen": 300386480, + "router_z_loss_mlp": 0.27026367, + "step": 3619, + "time_per_iteration": 2.912917375564575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095664, + "balance_loss_mlp": 1.06719649, + "epoch": 0.6964217006540977, + "flos": 638810634240.0, + "grad_norm": 0.06796578820751965, + "language_loss": 0.84640574, + "learning_rate": 0.00022283040258804564, + "loss": 0.85736233, + "num_input_tokens_seen": 300461840, + "router_z_loss_mlp": 0.28466797, + "step": 3620, + "time_per_iteration": 2.8118083477020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094233, + "balance_loss_mlp": 1.06671929, + "epoch": 0.6966140823393613, + "flos": 652167811584.0, + "grad_norm": 0.05989374057808202, + "language_loss": 0.8382861, + "learning_rate": 0.00022257116259460802, + "loss": 0.84922838, + "num_input_tokens_seen": 300540400, + "router_z_loss_mlp": 0.27539062, + "step": 3621, + "time_per_iteration": 2.8895604610443115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087446, + "balance_loss_mlp": 1.06112456, + "epoch": 0.6968064640246249, + "flos": 704492328960.0, + "grad_norm": 0.08406713908768157, + "language_loss": 0.81423789, + "learning_rate": 0.00022231203030859725, + "loss": 0.82511234, + "num_input_tokens_seen": 300624240, + "router_z_loss_mlp": 0.26367188, + "step": 3622, + "time_per_iteration": 2.971266269683838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094655, + "balance_loss_mlp": 1.06714153, + "epoch": 0.6969988457098885, + "flos": 492555271680.0, + "grad_norm": 0.06551084245575202, + "language_loss": 0.83678401, + "learning_rate": 0.00022205300583061737, + "loss": 0.84773052, + "num_input_tokens_seen": 300689728, + "router_z_loss_mlp": 0.27539062, + "step": 3623, + "time_per_iteration": 2.585472822189331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108403, + "balance_loss_mlp": 1.07649624, + "epoch": 0.6971912273951519, + "flos": 1352592442368.0, + "grad_norm": 0.033083333186048725, + "language_loss": 0.82838202, + "learning_rate": 0.00022179408926123063, + "loss": 0.83922231, + "num_input_tokens_seen": 300913152, + "router_z_loss_mlp": 0.07519531, + "step": 3624, + "time_per_iteration": 4.895202159881592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090164, + "balance_loss_mlp": 1.06300831, + "epoch": 0.6973836090804155, + "flos": 602459887104.0, + "grad_norm": 0.0660307641542608, + "language_loss": 0.77727789, + "learning_rate": 0.00022153528070095735, + "loss": 0.78817952, + "num_input_tokens_seen": 300985824, + "router_z_loss_mlp": 0.27197266, + "step": 3625, + "time_per_iteration": 2.701016902923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085494, + "balance_loss_mlp": 1.05917203, + "epoch": 0.6975759907656791, + "flos": 524065614336.0, + "grad_norm": 0.07343943993793525, + "language_loss": 0.88176632, + "learning_rate": 0.00022127658025027568, + "loss": 0.89262128, + "num_input_tokens_seen": 301058048, + "router_z_loss_mlp": 0.26330566, + "step": 3626, + "time_per_iteration": 2.66186261177063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087139, + "balance_loss_mlp": 1.0592438, + "epoch": 0.6977683724509427, + "flos": 480912754176.0, + "grad_norm": 0.05867128849985362, + "language_loss": 0.85380179, + "learning_rate": 0.00022101798800962258, + "loss": 0.86467314, + "num_input_tokens_seen": 301127472, + "router_z_loss_mlp": 0.27905273, + "step": 3627, + "time_per_iteration": 2.61289119720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088464, + "balance_loss_mlp": 1.06195211, + "epoch": 0.6979607541362063, + "flos": 522625167360.0, + "grad_norm": 0.06919874176804652, + "language_loss": 0.78915298, + "learning_rate": 0.00022075950407939227, + "loss": 0.80003762, + "num_input_tokens_seen": 301193920, + "router_z_loss_mlp": 0.26550293, + "step": 3628, + "time_per_iteration": 2.6066434383392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082826, + "balance_loss_mlp": 1.05665994, + "epoch": 0.6981531358214698, + "flos": 548077114368.0, + "grad_norm": 0.06455342757001964, + "language_loss": 0.83102697, + "learning_rate": 0.0002205011285599367, + "loss": 0.84185529, + "num_input_tokens_seen": 301264256, + "router_z_loss_mlp": 0.26208496, + "step": 3629, + "time_per_iteration": 2.627265691757202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084671, + "balance_loss_mlp": 1.05813527, + "epoch": 0.6983455175067333, + "flos": 700052419584.0, + "grad_norm": 0.05600849207785957, + "language_loss": 0.80451405, + "learning_rate": 0.00022024286155156658, + "loss": 0.81536078, + "num_input_tokens_seen": 301337696, + "router_z_loss_mlp": 0.26586914, + "step": 3630, + "time_per_iteration": 2.8945116996765137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080924, + "balance_loss_mlp": 1.05462611, + "epoch": 0.6985378991919969, + "flos": 485078450688.0, + "grad_norm": 0.05471727557268105, + "language_loss": 0.86118478, + "learning_rate": 0.00021998470315454994, + "loss": 0.87199402, + "num_input_tokens_seen": 301407776, + "router_z_loss_mlp": 0.26306152, + "step": 3631, + "time_per_iteration": 2.711768627166748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108459, + "balance_loss_mlp": 1.05755305, + "epoch": 0.6987302808772605, + "flos": 558780622848.0, + "grad_norm": 0.05720000052164256, + "language_loss": 0.8692646, + "learning_rate": 0.00021972665346911275, + "loss": 0.8801105, + "num_input_tokens_seen": 301475120, + "router_z_loss_mlp": 0.27050781, + "step": 3632, + "time_per_iteration": 2.7766430377960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086616, + "balance_loss_mlp": 1.0609858, + "epoch": 0.698922662562524, + "flos": 483593587200.0, + "grad_norm": 0.0722224306004379, + "language_loss": 0.79897952, + "learning_rate": 0.00021946871259543877, + "loss": 0.80984569, + "num_input_tokens_seen": 301542416, + "router_z_loss_mlp": 0.2565918, + "step": 3633, + "time_per_iteration": 2.600034713745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079015, + "balance_loss_mlp": 1.05383754, + "epoch": 0.6991150442477876, + "flos": 718909655040.0, + "grad_norm": 0.0639524243068684, + "language_loss": 0.83284152, + "learning_rate": 0.00021921088063366957, + "loss": 0.84363163, + "num_input_tokens_seen": 301620672, + "router_z_loss_mlp": 0.25183105, + "step": 3634, + "time_per_iteration": 2.956197738647461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085501, + "balance_loss_mlp": 1.0596205, + "epoch": 0.6993074259330512, + "flos": 489128150016.0, + "grad_norm": 0.058476095641480985, + "language_loss": 0.81960422, + "learning_rate": 0.00021895315768390435, + "loss": 0.83045918, + "num_input_tokens_seen": 301688016, + "router_z_loss_mlp": 0.2590332, + "step": 3635, + "time_per_iteration": 2.5913336277008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083019, + "balance_loss_mlp": 1.05759156, + "epoch": 0.6994998076183148, + "flos": 718089214464.0, + "grad_norm": 0.04531341451753373, + "language_loss": 0.87785435, + "learning_rate": 0.00021869554384619999, + "loss": 0.88868463, + "num_input_tokens_seen": 301771184, + "router_z_loss_mlp": 0.25415039, + "step": 3636, + "time_per_iteration": 2.9603588581085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089766, + "balance_loss_mlp": 1.06315875, + "epoch": 0.6996921893035783, + "flos": 579016636416.0, + "grad_norm": 0.21159082474566934, + "language_loss": 0.80919135, + "learning_rate": 0.00021843803922057115, + "loss": 0.82008898, + "num_input_tokens_seen": 301844528, + "router_z_loss_mlp": 0.26660156, + "step": 3637, + "time_per_iteration": 2.708937406539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087312, + "balance_loss_mlp": 1.0621587, + "epoch": 0.6998845709888418, + "flos": 518629796352.0, + "grad_norm": 0.060159968094543256, + "language_loss": 0.82011575, + "learning_rate": 0.00021818064390698977, + "loss": 0.83098888, + "num_input_tokens_seen": 301914960, + "router_z_loss_mlp": 0.25170898, + "step": 3638, + "time_per_iteration": 2.605764389038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086471, + "balance_loss_mlp": 1.06113935, + "epoch": 0.7000769526741054, + "flos": 620951505408.0, + "grad_norm": 0.06371626432210087, + "language_loss": 0.87017298, + "learning_rate": 0.0002179233580053861, + "loss": 0.88103765, + "num_input_tokens_seen": 301986352, + "router_z_loss_mlp": 0.25354004, + "step": 3639, + "time_per_iteration": 2.7112109661102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083368, + "balance_loss_mlp": 1.0573926, + "epoch": 0.700269334359369, + "flos": 559946856960.0, + "grad_norm": 0.058687026763644914, + "language_loss": 0.86069989, + "learning_rate": 0.00021766618161564688, + "loss": 0.87153351, + "num_input_tokens_seen": 302060544, + "router_z_loss_mlp": 0.26013184, + "step": 3640, + "time_per_iteration": 2.6974241733551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082795, + "balance_loss_mlp": 1.05666459, + "epoch": 0.7004617160446326, + "flos": 483343967232.0, + "grad_norm": 0.05259786469009478, + "language_loss": 0.87277496, + "learning_rate": 0.00021740911483761677, + "loss": 0.88360298, + "num_input_tokens_seen": 302127232, + "router_z_loss_mlp": 0.26123047, + "step": 3641, + "time_per_iteration": 2.5836639404296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089745, + "balance_loss_mlp": 1.06368566, + "epoch": 0.7006540977298961, + "flos": 696981003264.0, + "grad_norm": 0.04971665087061583, + "language_loss": 0.9236384, + "learning_rate": 0.00021715215777109837, + "loss": 0.93453586, + "num_input_tokens_seen": 302207056, + "router_z_loss_mlp": 0.26074219, + "step": 3642, + "time_per_iteration": 2.974407911300659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085504, + "balance_loss_mlp": 1.06024349, + "epoch": 0.7008464794151597, + "flos": 504775950336.0, + "grad_norm": 0.05973771415141703, + "language_loss": 0.84664541, + "learning_rate": 0.00021689531051585103, + "loss": 0.85750043, + "num_input_tokens_seen": 302275632, + "router_z_loss_mlp": 0.25280762, + "step": 3643, + "time_per_iteration": 2.577305316925049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089117, + "balance_loss_mlp": 1.06186557, + "epoch": 0.7010388611004232, + "flos": 537242554368.0, + "grad_norm": 0.062367103447564735, + "language_loss": 0.80804634, + "learning_rate": 0.00021663857317159196, + "loss": 0.81893754, + "num_input_tokens_seen": 302343600, + "router_z_loss_mlp": 0.27294922, + "step": 3644, + "time_per_iteration": 2.640782356262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085412, + "balance_loss_mlp": 1.05996037, + "epoch": 0.7012312427856868, + "flos": 547259245056.0, + "grad_norm": 0.10933947779444686, + "language_loss": 0.82007676, + "learning_rate": 0.00021638194583799487, + "loss": 0.83093089, + "num_input_tokens_seen": 302414656, + "router_z_loss_mlp": 0.25476074, + "step": 3645, + "time_per_iteration": 2.660571813583374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080249, + "balance_loss_mlp": 1.05334401, + "epoch": 0.7014236244709504, + "flos": 941409630720.0, + "grad_norm": 0.0653990594073395, + "language_loss": 0.82918119, + "learning_rate": 0.00021612542861469176, + "loss": 0.83998358, + "num_input_tokens_seen": 302495120, + "router_z_loss_mlp": 0.26916504, + "step": 3646, + "time_per_iteration": 3.1750996112823486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082853, + "balance_loss_mlp": 1.05595946, + "epoch": 0.7016160061562139, + "flos": 525167608320.0, + "grad_norm": 0.060469177257194674, + "language_loss": 0.82402915, + "learning_rate": 0.00021586902160127135, + "loss": 0.8348577, + "num_input_tokens_seen": 302563024, + "router_z_loss_mlp": 0.26928711, + "step": 3647, + "time_per_iteration": 2.60231614112854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083469, + "balance_loss_mlp": 1.05743361, + "epoch": 0.7018083878414775, + "flos": 373385023488.0, + "grad_norm": 0.10102975915851765, + "language_loss": 0.74238408, + "learning_rate": 0.00021561272489727974, + "loss": 0.75321877, + "num_input_tokens_seen": 302624544, + "router_z_loss_mlp": 0.26062012, + "step": 3648, + "time_per_iteration": 2.455183744430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083725, + "balance_loss_mlp": 1.0581665, + "epoch": 0.7020007695267411, + "flos": 527784201216.0, + "grad_norm": 0.05896874636911686, + "language_loss": 0.80454385, + "learning_rate": 0.0002153565386022199, + "loss": 0.81538105, + "num_input_tokens_seen": 302697856, + "router_z_loss_mlp": 0.25585938, + "step": 3649, + "time_per_iteration": 2.6365654468536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090223, + "balance_loss_mlp": 1.0643425, + "epoch": 0.7021931512120047, + "flos": 690154297344.0, + "grad_norm": 0.0684708856776036, + "language_loss": 0.82569027, + "learning_rate": 0.00021510046281555262, + "loss": 0.83659256, + "num_input_tokens_seen": 302771984, + "router_z_loss_mlp": 0.25915527, + "step": 3650, + "time_per_iteration": 2.8082711696624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088794, + "balance_loss_mlp": 1.06316423, + "epoch": 0.7023855328972681, + "flos": 639784147968.0, + "grad_norm": 0.06759336316034399, + "language_loss": 0.81458813, + "learning_rate": 0.0002148444976366949, + "loss": 0.82547605, + "num_input_tokens_seen": 302838832, + "router_z_loss_mlp": 0.2565918, + "step": 3651, + "time_per_iteration": 2.753706455230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086485, + "balance_loss_mlp": 1.06129622, + "epoch": 0.7025779145825317, + "flos": 560940194304.0, + "grad_norm": 0.05344717871766575, + "language_loss": 0.82698804, + "learning_rate": 0.00021458864316502136, + "loss": 0.8378529, + "num_input_tokens_seen": 302909952, + "router_z_loss_mlp": 0.25183105, + "step": 3652, + "time_per_iteration": 2.737903594970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086912, + "balance_loss_mlp": 1.06264138, + "epoch": 0.7027702962677953, + "flos": 447445472256.0, + "grad_norm": 0.05962835254673255, + "language_loss": 0.87223494, + "learning_rate": 0.0002143328994998634, + "loss": 0.88310409, + "num_input_tokens_seen": 302973056, + "router_z_loss_mlp": 0.24267578, + "step": 3653, + "time_per_iteration": 2.504406213760376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089816, + "balance_loss_mlp": 1.06336296, + "epoch": 0.7029626779530589, + "flos": 622500609024.0, + "grad_norm": 0.060478723540627326, + "language_loss": 0.78619695, + "learning_rate": 0.00021407726674050982, + "loss": 0.79709506, + "num_input_tokens_seen": 303054656, + "router_z_loss_mlp": 0.26477051, + "step": 3654, + "time_per_iteration": 2.8486123085021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094096, + "balance_loss_mlp": 1.06856155, + "epoch": 0.7031550596383225, + "flos": 629591989248.0, + "grad_norm": 0.050916885962277426, + "language_loss": 0.87187326, + "learning_rate": 0.0002138217449862061, + "loss": 0.88281423, + "num_input_tokens_seen": 303124256, + "router_z_loss_mlp": 0.25549316, + "step": 3655, + "time_per_iteration": 2.7588388919830322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108901, + "balance_loss_mlp": 1.06326032, + "epoch": 0.703347441323586, + "flos": 530843134464.0, + "grad_norm": 0.05276360585412431, + "language_loss": 0.78396368, + "learning_rate": 0.00021356633433615403, + "loss": 0.79485381, + "num_input_tokens_seen": 303192720, + "router_z_loss_mlp": 0.25744629, + "step": 3656, + "time_per_iteration": 2.6218318939208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079447, + "balance_loss_mlp": 1.05436552, + "epoch": 0.7035398230088495, + "flos": 693593528832.0, + "grad_norm": 0.048722851637787626, + "language_loss": 0.83386952, + "learning_rate": 0.0002133110348895133, + "loss": 0.84466398, + "num_input_tokens_seen": 303275968, + "router_z_loss_mlp": 0.25061035, + "step": 3657, + "time_per_iteration": 2.9466397762298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086119, + "balance_loss_mlp": 1.06054902, + "epoch": 0.7037322046941131, + "flos": 968035152384.0, + "grad_norm": 0.10765454833188913, + "language_loss": 0.85102618, + "learning_rate": 0.0002130558467453999, + "loss": 0.86188745, + "num_input_tokens_seen": 303367296, + "router_z_loss_mlp": 0.25585938, + "step": 3658, + "time_per_iteration": 3.3578195571899414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085624, + "balance_loss_mlp": 1.05956531, + "epoch": 0.7039245863793767, + "flos": 502863427584.0, + "grad_norm": 0.06250625204972131, + "language_loss": 0.84476495, + "learning_rate": 0.0002128007700028865, + "loss": 0.85562122, + "num_input_tokens_seen": 303442768, + "router_z_loss_mlp": 0.26086426, + "step": 3659, + "time_per_iteration": 2.716048002243042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079667, + "balance_loss_mlp": 1.05420375, + "epoch": 0.7041169680646402, + "flos": 465954342912.0, + "grad_norm": 0.07665519307089459, + "language_loss": 0.845348, + "learning_rate": 0.00021254580476100276, + "loss": 0.85614467, + "num_input_tokens_seen": 303508304, + "router_z_loss_mlp": 0.25476074, + "step": 3660, + "time_per_iteration": 2.5458219051361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082749, + "balance_loss_mlp": 1.05685711, + "epoch": 0.7043093497499038, + "flos": 632181417984.0, + "grad_norm": 0.058748946938806695, + "language_loss": 0.7943238, + "learning_rate": 0.00021229095111873497, + "loss": 0.80515134, + "num_input_tokens_seen": 303579312, + "router_z_loss_mlp": 0.25927734, + "step": 3661, + "time_per_iteration": 2.775683641433716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078944, + "balance_loss_mlp": 1.05252695, + "epoch": 0.7045017314351674, + "flos": 542930190336.0, + "grad_norm": 0.051479556836423725, + "language_loss": 0.86013281, + "learning_rate": 0.0002120362091750261, + "loss": 0.87092221, + "num_input_tokens_seen": 303658384, + "router_z_loss_mlp": 0.26452637, + "step": 3662, + "time_per_iteration": 2.835092782974243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076634, + "balance_loss_mlp": 1.04978824, + "epoch": 0.704694113120431, + "flos": 428237300736.0, + "grad_norm": 0.060876931500520017, + "language_loss": 0.86844277, + "learning_rate": 0.00021178157902877566, + "loss": 0.87920904, + "num_input_tokens_seen": 303721136, + "router_z_loss_mlp": 0.26879883, + "step": 3663, + "time_per_iteration": 2.440558910369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081373, + "balance_loss_mlp": 1.0555284, + "epoch": 0.7048864948056945, + "flos": 650544556032.0, + "grad_norm": 0.061135120384029226, + "language_loss": 0.87179941, + "learning_rate": 0.0002115270607788397, + "loss": 0.88261312, + "num_input_tokens_seen": 303792368, + "router_z_loss_mlp": 0.25866699, + "step": 3664, + "time_per_iteration": 2.7565457820892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107708, + "balance_loss_mlp": 1.05143833, + "epoch": 0.705078876490958, + "flos": 412562336256.0, + "grad_norm": 0.0582225162514945, + "language_loss": 0.85968196, + "learning_rate": 0.00021127265452403133, + "loss": 0.87045276, + "num_input_tokens_seen": 303856336, + "router_z_loss_mlp": 0.25671387, + "step": 3665, + "time_per_iteration": 2.545664072036743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032205, + "balance_loss_mlp": 1.02552938, + "epoch": 0.7052712581762216, + "flos": 1420040927232.0, + "grad_norm": 0.013425187729100906, + "language_loss": 0.84091628, + "learning_rate": 0.0002110183603631199, + "loss": 0.85123837, + "num_input_tokens_seen": 304089856, + "router_z_loss_mlp": 0.06689453, + "step": 3666, + "time_per_iteration": 4.894615888595581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076922, + "balance_loss_mlp": 1.04990888, + "epoch": 0.7054636398614852, + "flos": 493049369088.0, + "grad_norm": 0.05971260757424555, + "language_loss": 0.82980728, + "learning_rate": 0.00021076417839483065, + "loss": 0.84057647, + "num_input_tokens_seen": 304164752, + "router_z_loss_mlp": 0.27026367, + "step": 3667, + "time_per_iteration": 2.776766300201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107347, + "balance_loss_mlp": 1.04667187, + "epoch": 0.7056560215467488, + "flos": 450457417728.0, + "grad_norm": 0.06375812283048922, + "language_loss": 0.8522588, + "learning_rate": 0.00021051010871784589, + "loss": 0.86299354, + "num_input_tokens_seen": 304229568, + "router_z_loss_mlp": 0.26855469, + "step": 3668, + "time_per_iteration": 2.5415139198303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069942, + "balance_loss_mlp": 1.04501557, + "epoch": 0.7058484032320124, + "flos": 565703875584.0, + "grad_norm": 0.055214127492262476, + "language_loss": 0.79052877, + "learning_rate": 0.0002102561514308045, + "loss": 0.80122823, + "num_input_tokens_seen": 304299408, + "router_z_loss_mlp": 0.24926758, + "step": 3669, + "time_per_iteration": 2.716742753982544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072356, + "balance_loss_mlp": 1.04667854, + "epoch": 0.7060407849172758, + "flos": 567008501760.0, + "grad_norm": 0.07306534316954115, + "language_loss": 0.82677996, + "learning_rate": 0.00021000230663230135, + "loss": 0.83750349, + "num_input_tokens_seen": 304367936, + "router_z_loss_mlp": 0.25708008, + "step": 3670, + "time_per_iteration": 2.6818981170654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074835, + "balance_loss_mlp": 1.04937172, + "epoch": 0.7062331666025394, + "flos": 468746403840.0, + "grad_norm": 0.06539460490463701, + "language_loss": 0.83441806, + "learning_rate": 0.00020974857442088762, + "loss": 0.84516644, + "num_input_tokens_seen": 304438368, + "router_z_loss_mlp": 0.25476074, + "step": 3671, + "time_per_iteration": 2.608067512512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075045, + "balance_loss_mlp": 1.04928422, + "epoch": 0.706425548287803, + "flos": 595316749824.0, + "grad_norm": 0.05848649704443167, + "language_loss": 0.88856924, + "learning_rate": 0.00020949495489507104, + "loss": 0.89931971, + "num_input_tokens_seen": 304508720, + "router_z_loss_mlp": 0.25769043, + "step": 3672, + "time_per_iteration": 2.6813790798187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076712, + "balance_loss_mlp": 1.050367, + "epoch": 0.7066179299730666, + "flos": 475815389184.0, + "grad_norm": 0.06054837689365347, + "language_loss": 0.84767634, + "learning_rate": 0.00020924144815331525, + "loss": 0.8584435, + "num_input_tokens_seen": 304576128, + "router_z_loss_mlp": 0.26367188, + "step": 3673, + "time_per_iteration": 2.542840003967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076685, + "balance_loss_mlp": 1.05078053, + "epoch": 0.7068103116583301, + "flos": 506409117696.0, + "grad_norm": 0.05390499311408587, + "language_loss": 0.83514738, + "learning_rate": 0.00020898805429404044, + "loss": 0.84591424, + "num_input_tokens_seen": 304642416, + "router_z_loss_mlp": 0.25927734, + "step": 3674, + "time_per_iteration": 2.6225385665893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079471, + "balance_loss_mlp": 1.05372167, + "epoch": 0.7070026933435937, + "flos": 679336989696.0, + "grad_norm": 0.06276037819785552, + "language_loss": 0.78933322, + "learning_rate": 0.0002087347734156228, + "loss": 0.80012792, + "num_input_tokens_seen": 304719312, + "router_z_loss_mlp": 0.2578125, + "step": 3675, + "time_per_iteration": 2.855715751647949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078668, + "balance_loss_mlp": 1.05318117, + "epoch": 0.7071950750288573, + "flos": 472217942016.0, + "grad_norm": 0.06320503796682253, + "language_loss": 0.79648715, + "learning_rate": 0.00020848160561639452, + "loss": 0.80727386, + "num_input_tokens_seen": 304789296, + "router_z_loss_mlp": 0.25512695, + "step": 3676, + "time_per_iteration": 2.647651433944702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079385, + "balance_loss_mlp": 1.05396986, + "epoch": 0.7073874567141208, + "flos": 473742452736.0, + "grad_norm": 0.05839132735303564, + "language_loss": 0.86102867, + "learning_rate": 0.0002082285509946445, + "loss": 0.8718226, + "num_input_tokens_seen": 304854320, + "router_z_loss_mlp": 0.25415039, + "step": 3677, + "time_per_iteration": 2.5633320808410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081369, + "balance_loss_mlp": 1.05606055, + "epoch": 0.7075798383993844, + "flos": 545877895680.0, + "grad_norm": 0.05152517094969974, + "language_loss": 0.8344785, + "learning_rate": 0.00020797560964861683, + "loss": 0.84529221, + "num_input_tokens_seen": 304932784, + "router_z_loss_mlp": 0.25341797, + "step": 3678, + "time_per_iteration": 2.7661099433898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074676, + "balance_loss_mlp": 1.05028617, + "epoch": 0.7077722200846479, + "flos": 662090526720.0, + "grad_norm": 0.06274913334452144, + "language_loss": 0.80699748, + "learning_rate": 0.0002077227816765122, + "loss": 0.81774426, + "num_input_tokens_seen": 305018080, + "router_z_loss_mlp": 0.24401855, + "step": 3679, + "time_per_iteration": 3.065239191055298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024318, + "balance_loss_mlp": 1.01730835, + "epoch": 0.7079646017699115, + "flos": 1529960223744.0, + "grad_norm": 0.014391592464441782, + "language_loss": 0.76447725, + "learning_rate": 0.0002074700671764869, + "loss": 0.77472043, + "num_input_tokens_seen": 305241216, + "router_z_loss_mlp": 0.0703125, + "step": 3680, + "time_per_iteration": 4.8172595500946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073441, + "balance_loss_mlp": 1.04958761, + "epoch": 0.7081569834551751, + "flos": 621502502400.0, + "grad_norm": 0.05034113841233223, + "language_loss": 0.79209405, + "learning_rate": 0.00020721746624665383, + "loss": 0.80282843, + "num_input_tokens_seen": 305311376, + "router_z_loss_mlp": 0.23852539, + "step": 3681, + "time_per_iteration": 2.7298145294189453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083466, + "balance_loss_mlp": 1.05822945, + "epoch": 0.7083493651404387, + "flos": 794630435328.0, + "grad_norm": 0.059799820942850454, + "language_loss": 0.80445623, + "learning_rate": 0.00020696497898508114, + "loss": 0.81529093, + "num_input_tokens_seen": 305392736, + "router_z_loss_mlp": 0.25268555, + "step": 3682, + "time_per_iteration": 2.9937915802001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075641, + "balance_loss_mlp": 1.05092919, + "epoch": 0.7085417468257021, + "flos": 813747202560.0, + "grad_norm": 0.06191150286406427, + "language_loss": 0.77959311, + "learning_rate": 0.00020671260548979316, + "loss": 0.79034948, + "num_input_tokens_seen": 305470896, + "router_z_loss_mlp": 0.24719238, + "step": 3683, + "time_per_iteration": 3.0161404609680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081595, + "balance_loss_mlp": 1.05558372, + "epoch": 0.7087341285109657, + "flos": 700566340608.0, + "grad_norm": 0.05521829943560005, + "language_loss": 0.85212427, + "learning_rate": 0.00020646034585876982, + "loss": 0.86294019, + "num_input_tokens_seen": 305547072, + "router_z_loss_mlp": 0.26037598, + "step": 3684, + "time_per_iteration": 2.8698270320892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073657, + "balance_loss_mlp": 1.04850388, + "epoch": 0.7089265101962293, + "flos": 596514917376.0, + "grad_norm": 0.04944753850163826, + "language_loss": 0.84324521, + "learning_rate": 0.00020620820018994718, + "loss": 0.85398173, + "num_input_tokens_seen": 305624512, + "router_z_loss_mlp": 0.25170898, + "step": 3685, + "time_per_iteration": 2.801947832107544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079387, + "balance_loss_mlp": 1.0536145, + "epoch": 0.7091188918814929, + "flos": 487106970624.0, + "grad_norm": 0.07519073749771547, + "language_loss": 0.83086288, + "learning_rate": 0.00020595616858121675, + "loss": 0.84165674, + "num_input_tokens_seen": 305695088, + "router_z_loss_mlp": 0.2578125, + "step": 3686, + "time_per_iteration": 2.7280051708221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070479, + "balance_loss_mlp": 1.04551697, + "epoch": 0.7093112735667565, + "flos": 600117507072.0, + "grad_norm": 0.05447903108557543, + "language_loss": 0.80602473, + "learning_rate": 0.00020570425113042586, + "loss": 0.81672955, + "num_input_tokens_seen": 305763680, + "router_z_loss_mlp": 0.24963379, + "step": 3687, + "time_per_iteration": 2.8146443367004395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080641, + "balance_loss_mlp": 1.05596519, + "epoch": 0.70950365525202, + "flos": 505830956544.0, + "grad_norm": 0.06579545138102952, + "language_loss": 0.85866553, + "learning_rate": 0.0002054524479353776, + "loss": 0.86947191, + "num_input_tokens_seen": 305835008, + "router_z_loss_mlp": 0.24682617, + "step": 3688, + "time_per_iteration": 2.6602835655212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073952, + "balance_loss_mlp": 1.04767823, + "epoch": 0.7096960369372836, + "flos": 732160747008.0, + "grad_norm": 0.07679676176766496, + "language_loss": 0.81976587, + "learning_rate": 0.00020520075909383063, + "loss": 0.83050537, + "num_input_tokens_seen": 305909072, + "router_z_loss_mlp": 0.26306152, + "step": 3689, + "time_per_iteration": 2.866727590560913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074803, + "balance_loss_mlp": 1.04981625, + "epoch": 0.7098884186225471, + "flos": 972077511168.0, + "grad_norm": 0.05660248987472117, + "language_loss": 0.81022668, + "learning_rate": 0.00020494918470349916, + "loss": 0.82097471, + "num_input_tokens_seen": 305994752, + "router_z_loss_mlp": 0.25, + "step": 3690, + "time_per_iteration": 3.272037982940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107649, + "balance_loss_mlp": 1.04971516, + "epoch": 0.7100808003078107, + "flos": 504252117504.0, + "grad_norm": 0.08247583019648676, + "language_loss": 0.85683942, + "learning_rate": 0.00020469772486205297, + "loss": 0.86760426, + "num_input_tokens_seen": 306062960, + "router_z_loss_mlp": 0.26794434, + "step": 3691, + "time_per_iteration": 2.677762269973755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079524, + "balance_loss_mlp": 1.05322635, + "epoch": 0.7102731819930742, + "flos": 540335992320.0, + "grad_norm": 0.06411942158990899, + "language_loss": 0.81443423, + "learning_rate": 0.0002044463796671177, + "loss": 0.82522947, + "num_input_tokens_seen": 306134224, + "router_z_loss_mlp": 0.26330566, + "step": 3692, + "time_per_iteration": 2.6739578247070312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077922, + "balance_loss_mlp": 1.0519464, + "epoch": 0.7104655636783378, + "flos": 620378113536.0, + "grad_norm": 0.06149610751956677, + "language_loss": 0.80325758, + "learning_rate": 0.00020419514921627408, + "loss": 0.81403679, + "num_input_tokens_seen": 306214512, + "router_z_loss_mlp": 0.2598877, + "step": 3693, + "time_per_iteration": 2.8510119915008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076383, + "balance_loss_mlp": 1.05039525, + "epoch": 0.7106579453636014, + "flos": 557322923520.0, + "grad_norm": 0.05808850805852677, + "language_loss": 0.77474564, + "learning_rate": 0.00020394403360705855, + "loss": 0.78550947, + "num_input_tokens_seen": 306283232, + "router_z_loss_mlp": 0.26025391, + "step": 3694, + "time_per_iteration": 2.719911813735962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086948, + "balance_loss_mlp": 1.06034029, + "epoch": 0.710850327048865, + "flos": 513048245760.0, + "grad_norm": 0.059410233197540796, + "language_loss": 0.87816525, + "learning_rate": 0.00020369303293696228, + "loss": 0.88903475, + "num_input_tokens_seen": 306351536, + "router_z_loss_mlp": 0.26635742, + "step": 3695, + "time_per_iteration": 2.657715082168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079685, + "balance_loss_mlp": 1.05571198, + "epoch": 0.7110427087341286, + "flos": 423619352064.0, + "grad_norm": 0.06517545508220793, + "language_loss": 0.7842719, + "learning_rate": 0.00020344214730343304, + "loss": 0.79506874, + "num_input_tokens_seen": 306419040, + "router_z_loss_mlp": 0.23962402, + "step": 3696, + "time_per_iteration": 2.6142332553863525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078178, + "balance_loss_mlp": 1.05308461, + "epoch": 0.711235090419392, + "flos": 577415402496.0, + "grad_norm": 0.05470571931894002, + "language_loss": 0.79182768, + "learning_rate": 0.00020319137680387296, + "loss": 0.80260944, + "num_input_tokens_seen": 306503248, + "router_z_loss_mlp": 0.25109863, + "step": 3697, + "time_per_iteration": 2.915419578552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107666, + "balance_loss_mlp": 1.05055368, + "epoch": 0.7114274721046556, + "flos": 448060709376.0, + "grad_norm": 0.06661588329403122, + "language_loss": 0.80553949, + "learning_rate": 0.0002029407215356398, + "loss": 0.81630599, + "num_input_tokens_seen": 306566288, + "router_z_loss_mlp": 0.26123047, + "step": 3698, + "time_per_iteration": 2.5700740814208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108156, + "balance_loss_mlp": 1.05670524, + "epoch": 0.7116198537899192, + "flos": 621962095104.0, + "grad_norm": 0.06665507382105876, + "language_loss": 0.83601737, + "learning_rate": 0.00020269018159604663, + "loss": 0.84683299, + "num_input_tokens_seen": 306633344, + "router_z_loss_mlp": 0.24841309, + "step": 3699, + "time_per_iteration": 2.7208173274993896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077091, + "balance_loss_mlp": 1.05197358, + "epoch": 0.7118122354751828, + "flos": 498724895232.0, + "grad_norm": 0.05024967484992462, + "language_loss": 0.82184601, + "learning_rate": 0.00020243975708236162, + "loss": 0.83261693, + "num_input_tokens_seen": 306701328, + "router_z_loss_mlp": 0.25146484, + "step": 3700, + "time_per_iteration": 2.6433067321777344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108194, + "balance_loss_mlp": 1.05664349, + "epoch": 0.7120046171604463, + "flos": 572718532608.0, + "grad_norm": 0.07883365908247705, + "language_loss": 0.86320221, + "learning_rate": 0.00020218944809180818, + "loss": 0.87402165, + "num_input_tokens_seen": 306773168, + "router_z_loss_mlp": 0.25305176, + "step": 3701, + "time_per_iteration": 2.705932855606079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080401, + "balance_loss_mlp": 1.05541444, + "epoch": 0.7121969988457099, + "flos": 572664204288.0, + "grad_norm": 0.048190263761871716, + "language_loss": 0.84987295, + "learning_rate": 0.00020193925472156493, + "loss": 0.86067688, + "num_input_tokens_seen": 306845312, + "router_z_loss_mlp": 0.25, + "step": 3702, + "time_per_iteration": 2.6893904209136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040363, + "balance_loss_mlp": 1.03368771, + "epoch": 0.7123893805309734, + "flos": 1523429752320.0, + "grad_norm": 0.023975764530948636, + "language_loss": 0.74289167, + "learning_rate": 0.00020168917706876537, + "loss": 0.7532953, + "num_input_tokens_seen": 307079216, + "router_z_loss_mlp": 0.06689453, + "step": 3703, + "time_per_iteration": 4.881204843521118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078432, + "balance_loss_mlp": 1.05408931, + "epoch": 0.712581762216237, + "flos": 615105280512.0, + "grad_norm": 0.04896517905072385, + "language_loss": 0.83809257, + "learning_rate": 0.00020143921523049863, + "loss": 0.84887689, + "num_input_tokens_seen": 307163568, + "router_z_loss_mlp": 0.24316406, + "step": 3704, + "time_per_iteration": 2.9580681324005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075128, + "balance_loss_mlp": 1.04962897, + "epoch": 0.7127741439015006, + "flos": 597777698304.0, + "grad_norm": 0.05872916530123236, + "language_loss": 0.84084362, + "learning_rate": 0.00020118936930380837, + "loss": 0.85159492, + "num_input_tokens_seen": 307232800, + "router_z_loss_mlp": 0.25512695, + "step": 3705, + "time_per_iteration": 2.76068377494812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078368, + "balance_loss_mlp": 1.05290496, + "epoch": 0.7129665255867641, + "flos": 537398198784.0, + "grad_norm": 0.05789936228630773, + "language_loss": 0.81465518, + "learning_rate": 0.0002009396393856932, + "loss": 0.82543886, + "num_input_tokens_seen": 307307216, + "router_z_loss_mlp": 0.25463867, + "step": 3706, + "time_per_iteration": 2.664915084838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074744, + "balance_loss_mlp": 1.04923296, + "epoch": 0.7131589072720277, + "flos": 526442499072.0, + "grad_norm": 0.06297371189153962, + "language_loss": 0.8270002, + "learning_rate": 0.00020069002557310673, + "loss": 0.83774769, + "num_input_tokens_seen": 307377472, + "router_z_loss_mlp": 0.25512695, + "step": 3707, + "time_per_iteration": 2.658581495285034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075214, + "balance_loss_mlp": 1.04976273, + "epoch": 0.7133512889572913, + "flos": 530919484416.0, + "grad_norm": 0.06876092007107866, + "language_loss": 0.77463377, + "learning_rate": 0.00020044052796295807, + "loss": 0.78538585, + "num_input_tokens_seen": 307456880, + "router_z_loss_mlp": 0.25476074, + "step": 3708, + "time_per_iteration": 2.7701447010040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073382, + "balance_loss_mlp": 1.04729891, + "epoch": 0.7135436706425549, + "flos": 503535564288.0, + "grad_norm": 0.058576923733569305, + "language_loss": 0.82293993, + "learning_rate": 0.00020019114665211063, + "loss": 0.83367372, + "num_input_tokens_seen": 307524784, + "router_z_loss_mlp": 0.2611084, + "step": 3709, + "time_per_iteration": 2.584200143814087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071543, + "balance_loss_mlp": 1.04671192, + "epoch": 0.7137360523278183, + "flos": 515968786944.0, + "grad_norm": 0.05922999044905372, + "language_loss": 0.81765306, + "learning_rate": 0.00019994188173738276, + "loss": 0.82836854, + "num_input_tokens_seen": 307591408, + "router_z_loss_mlp": 0.24829102, + "step": 3710, + "time_per_iteration": 2.551407814025879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072134, + "balance_loss_mlp": 1.04628921, + "epoch": 0.7139284340130819, + "flos": 510389434368.0, + "grad_norm": 0.06343816758833129, + "language_loss": 0.80772817, + "learning_rate": 0.0001996927333155477, + "loss": 0.8184495, + "num_input_tokens_seen": 307662912, + "router_z_loss_mlp": 0.25878906, + "step": 3711, + "time_per_iteration": 2.748868227005005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075658, + "balance_loss_mlp": 1.04955149, + "epoch": 0.7141208156983455, + "flos": 890275940352.0, + "grad_norm": 0.06552359252627656, + "language_loss": 0.8595196, + "learning_rate": 0.00019944370148333346, + "loss": 0.87027609, + "num_input_tokens_seen": 307752256, + "router_z_loss_mlp": 0.26123047, + "step": 3712, + "time_per_iteration": 3.166109800338745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072351, + "balance_loss_mlp": 1.04660141, + "epoch": 0.7143131973836091, + "flos": 535779712512.0, + "grad_norm": 0.05387618778038521, + "language_loss": 0.80135339, + "learning_rate": 0.00019919478633742278, + "loss": 0.81207693, + "num_input_tokens_seen": 307821504, + "router_z_loss_mlp": 0.2578125, + "step": 3713, + "time_per_iteration": 2.683401107788086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075332, + "balance_loss_mlp": 1.04877234, + "epoch": 0.7145055790688727, + "flos": 473668300800.0, + "grad_norm": 0.058133564140499, + "language_loss": 0.85435075, + "learning_rate": 0.00019894598797445302, + "loss": 0.86510408, + "num_input_tokens_seen": 307886464, + "router_z_loss_mlp": 0.265625, + "step": 3714, + "time_per_iteration": 2.570040225982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074334, + "balance_loss_mlp": 1.04846525, + "epoch": 0.7146979607541362, + "flos": 570521885184.0, + "grad_norm": 0.050277092127782926, + "language_loss": 0.81853724, + "learning_rate": 0.00019869730649101615, + "loss": 0.82928061, + "num_input_tokens_seen": 307962736, + "router_z_loss_mlp": 0.25878906, + "step": 3715, + "time_per_iteration": 2.811513662338257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071824, + "balance_loss_mlp": 1.04564583, + "epoch": 0.7148903424393998, + "flos": 839666082816.0, + "grad_norm": 0.06869941272731987, + "language_loss": 0.72641587, + "learning_rate": 0.00019844874198365943, + "loss": 0.73713416, + "num_input_tokens_seen": 308046592, + "router_z_loss_mlp": 0.26220703, + "step": 3716, + "time_per_iteration": 3.1328516006469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068844, + "balance_loss_mlp": 1.04348803, + "epoch": 0.7150827241246633, + "flos": 541823427072.0, + "grad_norm": 0.061640340400288096, + "language_loss": 0.84182858, + "learning_rate": 0.00019820029454888362, + "loss": 0.85251707, + "num_input_tokens_seen": 308119920, + "router_z_loss_mlp": 0.25378418, + "step": 3717, + "time_per_iteration": 2.7154488563537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014076, + "balance_loss_mlp": 1.00725687, + "epoch": 0.7152751058099269, + "flos": 1583678200320.0, + "grad_norm": 0.019699659470436708, + "language_loss": 0.74521267, + "learning_rate": 0.00019795196428314455, + "loss": 0.75535345, + "num_input_tokens_seen": 308361024, + "router_z_loss_mlp": 0.06835938, + "step": 3718, + "time_per_iteration": 5.046099424362183 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072793, + "balance_loss_mlp": 1.04632878, + "epoch": 0.7154674874951905, + "flos": 517419145728.0, + "grad_norm": 0.06720182925313008, + "language_loss": 0.80157018, + "learning_rate": 0.0001977037512828529, + "loss": 0.81229812, + "num_input_tokens_seen": 308429808, + "router_z_loss_mlp": 0.26489258, + "step": 3719, + "time_per_iteration": 2.5823724269866943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067552, + "balance_loss_mlp": 1.04183865, + "epoch": 0.715659869180454, + "flos": 602524127232.0, + "grad_norm": 0.06101106638891309, + "language_loss": 0.86410248, + "learning_rate": 0.0001974556556443734, + "loss": 0.87477803, + "num_input_tokens_seen": 308501888, + "router_z_loss_mlp": 0.25708008, + "step": 3720, + "time_per_iteration": 2.6981611251831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069959, + "balance_loss_mlp": 1.04529428, + "epoch": 0.7158522508657176, + "flos": 531675684864.0, + "grad_norm": 0.05855660874159423, + "language_loss": 0.88533628, + "learning_rate": 0.00019720767746402547, + "loss": 0.89603585, + "num_input_tokens_seen": 308576368, + "router_z_loss_mlp": 0.24658203, + "step": 3721, + "time_per_iteration": 2.7615206241607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071162, + "balance_loss_mlp": 1.04597294, + "epoch": 0.7160446325509812, + "flos": 557569972224.0, + "grad_norm": 0.062366353751096386, + "language_loss": 0.8018384, + "learning_rate": 0.00019695981683808222, + "loss": 0.81254995, + "num_input_tokens_seen": 308651936, + "router_z_loss_mlp": 0.2520752, + "step": 3722, + "time_per_iteration": 2.7723004817962646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079066, + "balance_loss_mlp": 1.05452061, + "epoch": 0.7162370142362448, + "flos": 690986847744.0, + "grad_norm": 0.061040751408566865, + "language_loss": 0.85407031, + "learning_rate": 0.00019671207386277225, + "loss": 0.86486095, + "num_input_tokens_seen": 308737264, + "router_z_loss_mlp": 0.24536133, + "step": 3723, + "time_per_iteration": 2.929828643798828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074994, + "balance_loss_mlp": 1.0494113, + "epoch": 0.7164293959215082, + "flos": 794109173760.0, + "grad_norm": 0.060904147533300125, + "language_loss": 0.78436089, + "learning_rate": 0.0001964644486342777, + "loss": 0.79511088, + "num_input_tokens_seen": 308811776, + "router_z_loss_mlp": 0.25610352, + "step": 3724, + "time_per_iteration": 2.945258617401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072735, + "balance_loss_mlp": 1.04702103, + "epoch": 0.7166217776067718, + "flos": 494178527232.0, + "grad_norm": 0.06414027483355057, + "language_loss": 0.87113518, + "learning_rate": 0.00019621694124873524, + "loss": 0.88186252, + "num_input_tokens_seen": 308886704, + "router_z_loss_mlp": 0.25732422, + "step": 3725, + "time_per_iteration": 2.6636407375335693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010246, + "balance_loss_mlp": 1.00323606, + "epoch": 0.7168141592920354, + "flos": 1401060354048.0, + "grad_norm": 0.005035081365633862, + "language_loss": 0.76540077, + "learning_rate": 0.00019596955180223557, + "loss": 0.77550328, + "num_input_tokens_seen": 309113456, + "router_z_loss_mlp": 0.0703125, + "step": 3726, + "time_per_iteration": 4.901204347610474 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074333, + "balance_loss_mlp": 1.04913247, + "epoch": 0.717006540977299, + "flos": 793150341120.0, + "grad_norm": 0.05913508438980992, + "language_loss": 0.77430266, + "learning_rate": 0.00019572228039082428, + "loss": 0.78504598, + "num_input_tokens_seen": 309198768, + "router_z_loss_mlp": 0.2520752, + "step": 3727, + "time_per_iteration": 3.088613986968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078544, + "balance_loss_mlp": 1.05268764, + "epoch": 0.7171989226625626, + "flos": 554812416000.0, + "grad_norm": 0.05372970057922247, + "language_loss": 0.83879149, + "learning_rate": 0.0001954751271105002, + "loss": 0.84957701, + "num_input_tokens_seen": 309279680, + "router_z_loss_mlp": 0.25866699, + "step": 3728, + "time_per_iteration": 2.8328897953033447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079816, + "balance_loss_mlp": 1.05423403, + "epoch": 0.717391304347826, + "flos": 555914409984.0, + "grad_norm": 0.054514017613719934, + "language_loss": 0.80957007, + "learning_rate": 0.00019522809205721687, + "loss": 0.82036829, + "num_input_tokens_seen": 309359152, + "router_z_loss_mlp": 0.25598145, + "step": 3729, + "time_per_iteration": 2.763596534729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076432, + "balance_loss_mlp": 1.05167198, + "epoch": 0.7175836860330896, + "flos": 538855898112.0, + "grad_norm": 0.06077062039876485, + "language_loss": 0.82796627, + "learning_rate": 0.0001949811753268816, + "loss": 0.83873057, + "num_input_tokens_seen": 309432800, + "router_z_loss_mlp": 0.24768066, + "step": 3730, + "time_per_iteration": 2.6999707221984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107353, + "balance_loss_mlp": 1.04911554, + "epoch": 0.7177760677183532, + "flos": 515637674496.0, + "grad_norm": 0.06199825755801458, + "language_loss": 0.82858533, + "learning_rate": 0.00019473437701535634, + "loss": 0.83932066, + "num_input_tokens_seen": 309499456, + "router_z_loss_mlp": 0.2442627, + "step": 3731, + "time_per_iteration": 2.6672961711883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073343, + "balance_loss_mlp": 1.04839206, + "epoch": 0.7179684494036168, + "flos": 674719041024.0, + "grad_norm": 0.05911673909192475, + "language_loss": 0.89378715, + "learning_rate": 0.00019448769721845677, + "loss": 0.90452051, + "num_input_tokens_seen": 309571056, + "router_z_loss_mlp": 0.24975586, + "step": 3732, + "time_per_iteration": 2.8097128868103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077958, + "balance_loss_mlp": 1.0521369, + "epoch": 0.7181608310888803, + "flos": 469912637952.0, + "grad_norm": 0.0968125790866447, + "language_loss": 0.85677779, + "learning_rate": 0.00019424113603195203, + "loss": 0.86755735, + "num_input_tokens_seen": 309635040, + "router_z_loss_mlp": 0.25854492, + "step": 3733, + "time_per_iteration": 2.5098788738250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076731, + "balance_loss_mlp": 1.05124426, + "epoch": 0.7183532127741439, + "flos": 593952652800.0, + "grad_norm": 0.06289800168130656, + "language_loss": 0.80150187, + "learning_rate": 0.0001939946935515657, + "loss": 0.81226921, + "num_input_tokens_seen": 309713696, + "router_z_loss_mlp": 0.25512695, + "step": 3734, + "time_per_iteration": 2.8232650756835938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075815, + "balance_loss_mlp": 1.05049455, + "epoch": 0.7185455944594075, + "flos": 498917615616.0, + "grad_norm": 0.06894576786718996, + "language_loss": 0.80948031, + "learning_rate": 0.0001937483698729755, + "loss": 0.82023847, + "num_input_tokens_seen": 309782864, + "router_z_loss_mlp": 0.25341797, + "step": 3735, + "time_per_iteration": 2.583744525909424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086547, + "balance_loss_mlp": 1.06058323, + "epoch": 0.718737976144671, + "flos": 814933260288.0, + "grad_norm": 0.05171464240859849, + "language_loss": 0.82055521, + "learning_rate": 0.0001935021650918128, + "loss": 0.83142066, + "num_input_tokens_seen": 309867056, + "router_z_loss_mlp": 0.25976562, + "step": 3736, + "time_per_iteration": 3.018035411834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075054, + "balance_loss_mlp": 1.05029404, + "epoch": 0.7189303578299346, + "flos": 438328143360.0, + "grad_norm": 0.06470560317481229, + "language_loss": 0.87265974, + "learning_rate": 0.0001932560793036625, + "loss": 0.88341027, + "num_input_tokens_seen": 309929744, + "router_z_loss_mlp": 0.24755859, + "step": 3737, + "time_per_iteration": 2.5036935806274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080158, + "balance_loss_mlp": 1.05338335, + "epoch": 0.7191227395151981, + "flos": 549398992896.0, + "grad_norm": 0.06672658192386556, + "language_loss": 0.8673166, + "learning_rate": 0.00019301011260406382, + "loss": 0.87811816, + "num_input_tokens_seen": 309998128, + "router_z_loss_mlp": 0.26794434, + "step": 3738, + "time_per_iteration": 2.651357412338257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075221, + "balance_loss_mlp": 1.050843, + "epoch": 0.7193151212004617, + "flos": 626938320384.0, + "grad_norm": 0.054290518405139924, + "language_loss": 0.80049711, + "learning_rate": 0.00019276426508850936, + "loss": 0.81124938, + "num_input_tokens_seen": 310065472, + "router_z_loss_mlp": 0.24377441, + "step": 3739, + "time_per_iteration": 2.7231712341308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070443, + "balance_loss_mlp": 1.04517078, + "epoch": 0.7195075028857253, + "flos": 741062960640.0, + "grad_norm": 0.061140917990422254, + "language_loss": 0.80563027, + "learning_rate": 0.00019251853685244564, + "loss": 0.81633466, + "num_input_tokens_seen": 310152960, + "router_z_loss_mlp": 0.25292969, + "step": 3740, + "time_per_iteration": 3.0039608478546143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071133, + "balance_loss_mlp": 1.0455265, + "epoch": 0.7196998845709889, + "flos": 802875566592.0, + "grad_norm": 0.05993968121683736, + "language_loss": 0.80916333, + "learning_rate": 0.00019227292799127283, + "loss": 0.81987464, + "num_input_tokens_seen": 310234080, + "router_z_loss_mlp": 0.25622559, + "step": 3741, + "time_per_iteration": 3.011082172393799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073929, + "balance_loss_mlp": 1.04817998, + "epoch": 0.7198922662562524, + "flos": 925183669248.0, + "grad_norm": 0.062033255796259436, + "language_loss": 0.79226792, + "learning_rate": 0.00019202743860034454, + "loss": 0.80300719, + "num_input_tokens_seen": 310330208, + "router_z_loss_mlp": 0.25744629, + "step": 3742, + "time_per_iteration": 3.2250611782073975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071976, + "balance_loss_mlp": 1.04692984, + "epoch": 0.7200846479415159, + "flos": 580111289856.0, + "grad_norm": 0.06270566779319728, + "language_loss": 0.83965755, + "learning_rate": 0.00019178206877496873, + "loss": 0.85037732, + "num_input_tokens_seen": 310402960, + "router_z_loss_mlp": 0.25061035, + "step": 3743, + "time_per_iteration": 2.702446222305298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068486, + "balance_loss_mlp": 1.04463267, + "epoch": 0.7202770296267795, + "flos": 557695881216.0, + "grad_norm": 0.05142738510326197, + "language_loss": 0.85388875, + "learning_rate": 0.0001915368186104059, + "loss": 0.8645736, + "num_input_tokens_seen": 310479776, + "router_z_loss_mlp": 0.23840332, + "step": 3744, + "time_per_iteration": 2.737600326538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072746, + "balance_loss_mlp": 1.04818881, + "epoch": 0.7204694113120431, + "flos": 672552129024.0, + "grad_norm": 0.07812429294813375, + "language_loss": 0.80877572, + "learning_rate": 0.0001912916882018706, + "loss": 0.81950319, + "num_input_tokens_seen": 310555952, + "router_z_loss_mlp": 0.2454834, + "step": 3745, + "time_per_iteration": 2.7886669635772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072955, + "balance_loss_mlp": 1.04774189, + "epoch": 0.7206617929973067, + "flos": 799194055680.0, + "grad_norm": 0.10461054453296469, + "language_loss": 0.79336673, + "learning_rate": 0.00019104667764453125, + "loss": 0.80409628, + "num_input_tokens_seen": 310634784, + "router_z_loss_mlp": 0.2520752, + "step": 3746, + "time_per_iteration": 3.01520037651062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068478, + "balance_loss_mlp": 1.04382503, + "epoch": 0.7208541746825702, + "flos": 531898140672.0, + "grad_norm": 0.05271540811251211, + "language_loss": 0.80517203, + "learning_rate": 0.00019080178703350926, + "loss": 0.81585681, + "num_input_tokens_seen": 310703216, + "router_z_loss_mlp": 0.24658203, + "step": 3747, + "time_per_iteration": 2.6013572216033936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067775, + "balance_loss_mlp": 1.04224086, + "epoch": 0.7210465563678338, + "flos": 535139882496.0, + "grad_norm": 0.06037415597287081, + "language_loss": 0.83132112, + "learning_rate": 0.00019055701646387952, + "loss": 0.84199888, + "num_input_tokens_seen": 310776816, + "router_z_loss_mlp": 0.25549316, + "step": 3748, + "time_per_iteration": 2.641214609146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012229, + "balance_loss_mlp": 1.00474262, + "epoch": 0.7212389380530974, + "flos": 1533908606976.0, + "grad_norm": 0.010630398353693617, + "language_loss": 0.80472684, + "learning_rate": 0.00019031236603067042, + "loss": 0.81484914, + "num_input_tokens_seen": 310987056, + "router_z_loss_mlp": 0.07470703, + "step": 3749, + "time_per_iteration": 4.815402507781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066327, + "balance_loss_mlp": 1.04136407, + "epoch": 0.7214313197383609, + "flos": 461511862272.0, + "grad_norm": 0.06404376467324384, + "language_loss": 0.86850023, + "learning_rate": 0.00019006783582886368, + "loss": 0.8791635, + "num_input_tokens_seen": 311051648, + "router_z_loss_mlp": 0.24975586, + "step": 3750, + "time_per_iteration": 2.5772666931152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068921, + "balance_loss_mlp": 1.04362464, + "epoch": 0.7216237014236244, + "flos": 1037134056960.0, + "grad_norm": 0.05743356486239607, + "language_loss": 0.83082181, + "learning_rate": 0.00018982342595339437, + "loss": 0.84151101, + "num_input_tokens_seen": 311146272, + "router_z_loss_mlp": 0.25292969, + "step": 3751, + "time_per_iteration": 3.5001184940338135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074387, + "balance_loss_mlp": 1.04874492, + "epoch": 0.721816083108888, + "flos": 895951466496.0, + "grad_norm": 0.12990726200021083, + "language_loss": 0.82180882, + "learning_rate": 0.00018957913649915076, + "loss": 0.83255273, + "num_input_tokens_seen": 311223760, + "router_z_loss_mlp": 0.25646973, + "step": 3752, + "time_per_iteration": 3.160003900527954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069675, + "balance_loss_mlp": 1.0439254, + "epoch": 0.7220084647941516, + "flos": 523314556416.0, + "grad_norm": 0.06468827882865268, + "language_loss": 0.80619174, + "learning_rate": 0.00018933496756097428, + "loss": 0.81688845, + "num_input_tokens_seen": 311290336, + "router_z_loss_mlp": 0.2578125, + "step": 3753, + "time_per_iteration": 2.5997426509857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066394, + "balance_loss_mlp": 1.04083598, + "epoch": 0.7222008464794152, + "flos": 816099494400.0, + "grad_norm": 0.06037343169471402, + "language_loss": 0.81664622, + "learning_rate": 0.0001890909192336603, + "loss": 0.8273102, + "num_input_tokens_seen": 311366240, + "router_z_loss_mlp": 0.2557373, + "step": 3754, + "time_per_iteration": 3.018083095550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069023, + "balance_loss_mlp": 1.04364371, + "epoch": 0.7223932281646788, + "flos": 749053702656.0, + "grad_norm": 0.056170219084609056, + "language_loss": 0.70541704, + "learning_rate": 0.00018884699161195623, + "loss": 0.71610725, + "num_input_tokens_seen": 311445184, + "router_z_loss_mlp": 0.25390625, + "step": 3755, + "time_per_iteration": 2.947492837905884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068503, + "balance_loss_mlp": 1.04259872, + "epoch": 0.7225856098499422, + "flos": 745502870016.0, + "grad_norm": 0.08930664907788496, + "language_loss": 0.77445567, + "learning_rate": 0.00018860318479056327, + "loss": 0.78514069, + "num_input_tokens_seen": 311527280, + "router_z_loss_mlp": 0.25939941, + "step": 3756, + "time_per_iteration": 3.133481740951538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075638, + "balance_loss_mlp": 1.05084264, + "epoch": 0.7227779915352058, + "flos": 547330825728.0, + "grad_norm": 0.05236327296273719, + "language_loss": 0.83486211, + "learning_rate": 0.00018835949886413555, + "loss": 0.84561849, + "num_input_tokens_seen": 311601552, + "router_z_loss_mlp": 0.24804688, + "step": 3757, + "time_per_iteration": 2.7377569675445557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071399, + "balance_loss_mlp": 1.04592407, + "epoch": 0.7229703732204694, + "flos": 530484857856.0, + "grad_norm": 0.06766060207164688, + "language_loss": 0.79256356, + "learning_rate": 0.0001881159339272806, + "loss": 0.80327755, + "num_input_tokens_seen": 311670736, + "router_z_loss_mlp": 0.25476074, + "step": 3758, + "time_per_iteration": 2.6691012382507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106811, + "balance_loss_mlp": 1.04336238, + "epoch": 0.723162754905733, + "flos": 528355021824.0, + "grad_norm": 0.06062364795368716, + "language_loss": 0.78869492, + "learning_rate": 0.00018787249007455858, + "loss": 0.79937607, + "num_input_tokens_seen": 311736800, + "router_z_loss_mlp": 0.24731445, + "step": 3759, + "time_per_iteration": 2.628452777862549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072784, + "balance_loss_mlp": 1.04866767, + "epoch": 0.7233551365909965, + "flos": 654868468224.0, + "grad_norm": 0.05921726316721053, + "language_loss": 0.71849477, + "learning_rate": 0.00018762916740048302, + "loss": 0.7292226, + "num_input_tokens_seen": 311806064, + "router_z_loss_mlp": 0.24108887, + "step": 3760, + "time_per_iteration": 4.164097547531128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074503, + "balance_loss_mlp": 1.04969609, + "epoch": 0.7235475182762601, + "flos": 522365635584.0, + "grad_norm": 0.05859039427854228, + "language_loss": 0.85892487, + "learning_rate": 0.0001873859659995195, + "loss": 0.86966991, + "num_input_tokens_seen": 311881280, + "router_z_loss_mlp": 0.24816895, + "step": 3761, + "time_per_iteration": 2.7077507972717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076187, + "balance_loss_mlp": 1.05047345, + "epoch": 0.7237398999615237, + "flos": 609170595840.0, + "grad_norm": 0.05612829292688987, + "language_loss": 0.8333689, + "learning_rate": 0.0001871428859660878, + "loss": 0.84413075, + "num_input_tokens_seen": 311953696, + "router_z_loss_mlp": 0.25744629, + "step": 3762, + "time_per_iteration": 2.7349491119384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070635, + "balance_loss_mlp": 1.04679286, + "epoch": 0.7239322816467872, + "flos": 658987176960.0, + "grad_norm": 0.05320593884566549, + "language_loss": 0.82095098, + "learning_rate": 0.00018689992739455975, + "loss": 0.83165729, + "num_input_tokens_seen": 312032752, + "router_z_loss_mlp": 0.23828125, + "step": 3763, + "time_per_iteration": 2.9456627368927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073777, + "balance_loss_mlp": 1.04832602, + "epoch": 0.7241246633320508, + "flos": 969282878976.0, + "grad_norm": 0.05110197345931534, + "language_loss": 0.86203957, + "learning_rate": 0.00018665709037926027, + "loss": 0.87277734, + "num_input_tokens_seen": 312120800, + "router_z_loss_mlp": 0.25476074, + "step": 3764, + "time_per_iteration": 3.318403959274292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068875, + "balance_loss_mlp": 1.04516387, + "epoch": 0.7243170450173143, + "flos": 514995273216.0, + "grad_norm": 0.06311256302273614, + "language_loss": 0.85269356, + "learning_rate": 0.00018641437501446694, + "loss": 0.86338234, + "num_input_tokens_seen": 312188416, + "router_z_loss_mlp": 0.23693848, + "step": 3765, + "time_per_iteration": 2.6275501251220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077113, + "balance_loss_mlp": 1.05141139, + "epoch": 0.7245094267025779, + "flos": 559746796032.0, + "grad_norm": 0.06293710681021243, + "language_loss": 0.82769656, + "learning_rate": 0.0001861717813944104, + "loss": 0.83846772, + "num_input_tokens_seen": 312257792, + "router_z_loss_mlp": 0.25744629, + "step": 3766, + "time_per_iteration": 2.6608469486236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074023, + "balance_loss_mlp": 1.04876232, + "epoch": 0.7247018083878415, + "flos": 612642134016.0, + "grad_norm": 0.06015775700699107, + "language_loss": 0.79908741, + "learning_rate": 0.00018592930961327365, + "loss": 0.80982769, + "num_input_tokens_seen": 312328544, + "router_z_loss_mlp": 0.25280762, + "step": 3767, + "time_per_iteration": 2.7321486473083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107651, + "balance_loss_mlp": 1.05160677, + "epoch": 0.7248941900731051, + "flos": 634676871168.0, + "grad_norm": 0.056564551709211236, + "language_loss": 0.88070989, + "learning_rate": 0.00018568695976519273, + "loss": 0.89147508, + "num_input_tokens_seen": 312405888, + "router_z_loss_mlp": 0.24890137, + "step": 3768, + "time_per_iteration": 2.7732081413269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073175, + "balance_loss_mlp": 1.04744947, + "epoch": 0.7250865717583687, + "flos": 424941230592.0, + "grad_norm": 0.06484399949200302, + "language_loss": 0.80721432, + "learning_rate": 0.00018544473194425593, + "loss": 0.81794608, + "num_input_tokens_seen": 312469552, + "router_z_loss_mlp": 0.25744629, + "step": 3769, + "time_per_iteration": 2.489635467529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069994, + "balance_loss_mlp": 1.04453063, + "epoch": 0.7252789534436321, + "flos": 635114068992.0, + "grad_norm": 0.06360093923079267, + "language_loss": 0.78936434, + "learning_rate": 0.00018520262624450485, + "loss": 0.80006427, + "num_input_tokens_seen": 312548848, + "router_z_loss_mlp": 0.25488281, + "step": 3770, + "time_per_iteration": 2.874816417694092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070988, + "balance_loss_mlp": 1.04658556, + "epoch": 0.7254713351288957, + "flos": 617185930752.0, + "grad_norm": 0.05111495515347452, + "language_loss": 0.87226415, + "learning_rate": 0.00018496064275993324, + "loss": 0.88297403, + "num_input_tokens_seen": 312622016, + "router_z_loss_mlp": 0.24377441, + "step": 3771, + "time_per_iteration": 2.7426414489746094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070976, + "balance_loss_mlp": 1.04448795, + "epoch": 0.7256637168141593, + "flos": 766986983424.0, + "grad_norm": 0.06635315591168078, + "language_loss": 0.82333881, + "learning_rate": 0.00018471878158448686, + "loss": 0.83404857, + "num_input_tokens_seen": 312696960, + "router_z_loss_mlp": 0.26538086, + "step": 3772, + "time_per_iteration": 2.927983283996582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073436, + "balance_loss_mlp": 1.04748392, + "epoch": 0.7258560984994229, + "flos": 495559503360.0, + "grad_norm": 0.0478676363130983, + "language_loss": 0.84174544, + "learning_rate": 0.00018447704281206512, + "loss": 0.85247982, + "num_input_tokens_seen": 312774352, + "router_z_loss_mlp": 0.25964355, + "step": 3773, + "time_per_iteration": 2.863914966583252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068748, + "balance_loss_mlp": 1.04243803, + "epoch": 0.7260484801846864, + "flos": 530069681664.0, + "grad_norm": 0.056210264368279125, + "language_loss": 0.83150065, + "learning_rate": 0.0001842354265365191, + "loss": 0.84218812, + "num_input_tokens_seen": 312849600, + "router_z_loss_mlp": 0.26330566, + "step": 3774, + "time_per_iteration": 2.6950740814208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107373, + "balance_loss_mlp": 1.04815984, + "epoch": 0.72624086186995, + "flos": 624964128768.0, + "grad_norm": 0.08533819626854355, + "language_loss": 0.81115055, + "learning_rate": 0.0001839939328516526, + "loss": 0.82188785, + "num_input_tokens_seen": 312922688, + "router_z_loss_mlp": 0.25598145, + "step": 3775, + "time_per_iteration": 2.7223706245422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075372, + "balance_loss_mlp": 1.04981351, + "epoch": 0.7264332435552135, + "flos": 716522858496.0, + "grad_norm": 0.08605287501334834, + "language_loss": 0.81360769, + "learning_rate": 0.0001837525618512218, + "loss": 0.82436144, + "num_input_tokens_seen": 312997728, + "router_z_loss_mlp": 0.2557373, + "step": 3776, + "time_per_iteration": 2.874652624130249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067677, + "balance_loss_mlp": 1.04284596, + "epoch": 0.7266256252404771, + "flos": 681036968448.0, + "grad_norm": 0.060733174286640615, + "language_loss": 0.83042395, + "learning_rate": 0.00018351131362893519, + "loss": 0.84110069, + "num_input_tokens_seen": 313067168, + "router_z_loss_mlp": 0.24841309, + "step": 3777, + "time_per_iteration": 2.801011323928833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070149, + "balance_loss_mlp": 1.04434013, + "epoch": 0.7268180069257407, + "flos": 518906580480.0, + "grad_norm": 0.06246763883136397, + "language_loss": 0.80644751, + "learning_rate": 0.00018327018827845364, + "loss": 0.81714904, + "num_input_tokens_seen": 313134688, + "router_z_loss_mlp": 0.25842285, + "step": 3778, + "time_per_iteration": 2.5989460945129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107324, + "balance_loss_mlp": 1.04869461, + "epoch": 0.7270103886110042, + "flos": 512662804992.0, + "grad_norm": 0.05228982381822259, + "language_loss": 0.87562966, + "learning_rate": 0.00018302918589339036, + "loss": 0.88636208, + "num_input_tokens_seen": 313204816, + "router_z_loss_mlp": 0.2454834, + "step": 3779, + "time_per_iteration": 2.6237618923187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073848, + "balance_loss_mlp": 1.04871857, + "epoch": 0.7272027702962678, + "flos": 546653919744.0, + "grad_norm": 0.06453049409533262, + "language_loss": 0.90400481, + "learning_rate": 0.00018278830656731054, + "loss": 0.9147433, + "num_input_tokens_seen": 313274288, + "router_z_loss_mlp": 0.25158691, + "step": 3780, + "time_per_iteration": 2.6516594886779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069473, + "balance_loss_mlp": 1.04534531, + "epoch": 0.7273951519815314, + "flos": 593048521728.0, + "grad_norm": 0.050403453356215815, + "language_loss": 0.86580253, + "learning_rate": 0.00018254755039373222, + "loss": 0.87649727, + "num_input_tokens_seen": 313344800, + "router_z_loss_mlp": 0.24121094, + "step": 3781, + "time_per_iteration": 2.7791805267333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078472, + "balance_loss_mlp": 1.05350983, + "epoch": 0.727587533666795, + "flos": 606012917760.0, + "grad_norm": 0.06136859684084447, + "language_loss": 0.83780336, + "learning_rate": 0.0001823069174661252, + "loss": 0.84858811, + "num_input_tokens_seen": 313417840, + "router_z_loss_mlp": 0.24963379, + "step": 3782, + "time_per_iteration": 2.8298797607421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069254, + "balance_loss_mlp": 1.0445894, + "epoch": 0.7277799153520584, + "flos": 513021081600.0, + "grad_norm": 0.05448040343195996, + "language_loss": 0.78343076, + "learning_rate": 0.00018206640787791112, + "loss": 0.79412329, + "num_input_tokens_seen": 313485936, + "router_z_loss_mlp": 0.2467041, + "step": 3783, + "time_per_iteration": 2.609013795852661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071668, + "balance_loss_mlp": 1.0477066, + "epoch": 0.727972297037322, + "flos": 537756475392.0, + "grad_norm": 0.057564515393037245, + "language_loss": 0.85957235, + "learning_rate": 0.00018182602172246416, + "loss": 0.87028909, + "num_input_tokens_seen": 313553136, + "router_z_loss_mlp": 0.23974609, + "step": 3784, + "time_per_iteration": 2.6400623321533203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072013, + "balance_loss_mlp": 1.04888618, + "epoch": 0.7281646787225856, + "flos": 535038566400.0, + "grad_norm": 0.060673398412002894, + "language_loss": 0.76418436, + "learning_rate": 0.00018158575909311075, + "loss": 0.77490449, + "num_input_tokens_seen": 313620128, + "router_z_loss_mlp": 0.23132324, + "step": 3785, + "time_per_iteration": 2.64180850982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079039, + "balance_loss_mlp": 1.05404127, + "epoch": 0.7283570604078492, + "flos": 625055533056.0, + "grad_norm": 0.06019097733888483, + "language_loss": 0.8038618, + "learning_rate": 0.000181345620083129, + "loss": 0.8146522, + "num_input_tokens_seen": 313696432, + "router_z_loss_mlp": 0.24987793, + "step": 3786, + "time_per_iteration": 2.8254077434539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075124, + "balance_loss_mlp": 1.05057859, + "epoch": 0.7285494420931128, + "flos": 534173709312.0, + "grad_norm": 0.056512794901340806, + "language_loss": 0.86981964, + "learning_rate": 0.00018110560478574927, + "loss": 0.88057089, + "num_input_tokens_seen": 313768416, + "router_z_loss_mlp": 0.2454834, + "step": 3787, + "time_per_iteration": 2.6989898681640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107191, + "balance_loss_mlp": 1.04607677, + "epoch": 0.7287418237783763, + "flos": 666548061696.0, + "grad_norm": 0.0653462875447768, + "language_loss": 0.80641389, + "learning_rate": 0.0001808657132941533, + "loss": 0.81713301, + "num_input_tokens_seen": 313839888, + "router_z_loss_mlp": 0.25830078, + "step": 3788, + "time_per_iteration": 2.7848241329193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076986, + "balance_loss_mlp": 1.05143917, + "epoch": 0.7289342054636399, + "flos": 550602302976.0, + "grad_norm": 0.06505823149164586, + "language_loss": 0.8307749, + "learning_rate": 0.00018062594570147572, + "loss": 0.84154475, + "num_input_tokens_seen": 313908832, + "router_z_loss_mlp": 0.25549316, + "step": 3789, + "time_per_iteration": 2.633287191390991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070828, + "balance_loss_mlp": 1.046152, + "epoch": 0.7291265871489034, + "flos": 687923145216.0, + "grad_norm": 0.05002031972924792, + "language_loss": 0.85413891, + "learning_rate": 0.00018038630210080243, + "loss": 0.86484718, + "num_input_tokens_seen": 313982672, + "router_z_loss_mlp": 0.24658203, + "step": 3790, + "time_per_iteration": 2.866363286972046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072918, + "balance_loss_mlp": 1.04853952, + "epoch": 0.729318968834167, + "flos": 572664204288.0, + "grad_norm": 0.05805310793954541, + "language_loss": 0.85253292, + "learning_rate": 0.0001801467825851712, + "loss": 0.86326218, + "num_input_tokens_seen": 314057184, + "router_z_loss_mlp": 0.24401855, + "step": 3791, + "time_per_iteration": 2.728860378265381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071802, + "balance_loss_mlp": 1.04638696, + "epoch": 0.7295113505194305, + "flos": 586061028864.0, + "grad_norm": 0.14519807994310208, + "language_loss": 0.78306311, + "learning_rate": 0.00017990738724757172, + "loss": 0.79378116, + "num_input_tokens_seen": 314137344, + "router_z_loss_mlp": 0.25427246, + "step": 3792, + "time_per_iteration": 2.8468916416168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068198, + "balance_loss_mlp": 1.04319978, + "epoch": 0.7297037322046941, + "flos": 707185645056.0, + "grad_norm": 0.05959978185176886, + "language_loss": 0.8250258, + "learning_rate": 0.00017966811618094598, + "loss": 0.83570778, + "num_input_tokens_seen": 314214464, + "router_z_loss_mlp": 0.24987793, + "step": 3793, + "time_per_iteration": 2.909195899963379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077382, + "balance_loss_mlp": 1.05095315, + "epoch": 0.7298961138899577, + "flos": 487292350464.0, + "grad_norm": 0.06013443658312294, + "language_loss": 0.8499018, + "learning_rate": 0.00017942896947818664, + "loss": 0.86067569, + "num_input_tokens_seen": 314280432, + "router_z_loss_mlp": 0.26452637, + "step": 3794, + "time_per_iteration": 2.555316209793091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008896, + "balance_loss_mlp": 1.00121939, + "epoch": 0.7300884955752213, + "flos": 1365804260352.0, + "grad_norm": 0.014415453052224393, + "language_loss": 0.74825054, + "learning_rate": 0.000179189947232139, + "loss": 0.75833952, + "num_input_tokens_seen": 314497152, + "router_z_loss_mlp": 0.07666016, + "step": 3795, + "time_per_iteration": 4.844003200531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067336, + "balance_loss_mlp": 1.04246938, + "epoch": 0.7302808772604849, + "flos": 531806736384.0, + "grad_norm": 0.07521259733742676, + "language_loss": 0.8533113, + "learning_rate": 0.00017895104953559947, + "loss": 0.86398464, + "num_input_tokens_seen": 314565488, + "router_z_loss_mlp": 0.24865723, + "step": 3796, + "time_per_iteration": 2.5970304012298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074283, + "balance_loss_mlp": 1.04881954, + "epoch": 0.7304732589457483, + "flos": 436171143168.0, + "grad_norm": 0.082255252193866, + "language_loss": 0.8954308, + "learning_rate": 0.00017871227648131672, + "loss": 0.90617365, + "num_input_tokens_seen": 314627392, + "router_z_loss_mlp": 0.25476074, + "step": 3797, + "time_per_iteration": 2.5412604808807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066925, + "balance_loss_mlp": 1.0418191, + "epoch": 0.7306656406310119, + "flos": 451621080576.0, + "grad_norm": 0.050248722250274616, + "language_loss": 0.8297137, + "learning_rate": 0.0001784736281619907, + "loss": 0.84038293, + "num_input_tokens_seen": 314695440, + "router_z_loss_mlp": 0.25134277, + "step": 3798, + "time_per_iteration": 2.5844838619232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068914, + "balance_loss_mlp": 1.04355788, + "epoch": 0.7308580223162755, + "flos": 512010491904.0, + "grad_norm": 0.07691325106249959, + "language_loss": 0.7466501, + "learning_rate": 0.00017823510467027232, + "loss": 0.75733924, + "num_input_tokens_seen": 314772592, + "router_z_loss_mlp": 0.25341797, + "step": 3799, + "time_per_iteration": 2.777209520339966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071842, + "balance_loss_mlp": 1.04620039, + "epoch": 0.7310504040015391, + "flos": 375423455232.0, + "grad_norm": 0.08066489228669042, + "language_loss": 0.7834214, + "learning_rate": 0.00017799670609876516, + "loss": 0.79413986, + "num_input_tokens_seen": 314836192, + "router_z_loss_mlp": 0.25671387, + "step": 3800, + "time_per_iteration": 2.5069777965545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107102, + "balance_loss_mlp": 1.04590285, + "epoch": 0.7312427856868026, + "flos": 549334752768.0, + "grad_norm": 0.05293495483873373, + "language_loss": 0.88974595, + "learning_rate": 0.00017775843254002366, + "loss": 0.90045619, + "num_input_tokens_seen": 314908400, + "router_z_loss_mlp": 0.2512207, + "step": 3801, + "time_per_iteration": 2.725081443786621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077058, + "balance_loss_mlp": 1.0519762, + "epoch": 0.7314351673720662, + "flos": 767238801408.0, + "grad_norm": 0.05473119278948026, + "language_loss": 0.84161508, + "learning_rate": 0.00017752028408655367, + "loss": 0.85238564, + "num_input_tokens_seen": 314995280, + "router_z_loss_mlp": 0.25097656, + "step": 3802, + "time_per_iteration": 3.025043249130249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075279, + "balance_loss_mlp": 1.04994678, + "epoch": 0.7316275490573297, + "flos": 486734012928.0, + "grad_norm": 0.05406841313546952, + "language_loss": 0.85023701, + "learning_rate": 0.00017728226083081272, + "loss": 0.86098975, + "num_input_tokens_seen": 315063056, + "router_z_loss_mlp": 0.25354004, + "step": 3803, + "time_per_iteration": 2.556396245956421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078247, + "balance_loss_mlp": 1.05355895, + "epoch": 0.7318199307425933, + "flos": 473428592640.0, + "grad_norm": 0.06231590720725376, + "language_loss": 0.81697959, + "learning_rate": 0.00017704436286520965, + "loss": 0.82776201, + "num_input_tokens_seen": 315128896, + "router_z_loss_mlp": 0.24682617, + "step": 3804, + "time_per_iteration": 2.5290911197662354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078799, + "balance_loss_mlp": 1.05242968, + "epoch": 0.7320123124278569, + "flos": 549463233024.0, + "grad_norm": 0.06772838198197546, + "language_loss": 0.84615296, + "learning_rate": 0.0001768065902821046, + "loss": 0.85694098, + "num_input_tokens_seen": 315198464, + "router_z_loss_mlp": 0.26379395, + "step": 3805, + "time_per_iteration": 2.6657214164733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072852, + "balance_loss_mlp": 1.04691195, + "epoch": 0.7322046941131204, + "flos": 570781416960.0, + "grad_norm": 0.06439046141851584, + "language_loss": 0.82463551, + "learning_rate": 0.00017656894317380907, + "loss": 0.83536404, + "num_input_tokens_seen": 315270240, + "router_z_loss_mlp": 0.25976562, + "step": 3806, + "time_per_iteration": 2.7381749153137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008677, + "balance_loss_mlp": 1.00085652, + "epoch": 0.732397075798384, + "flos": 1469165548032.0, + "grad_norm": 0.011036115367728498, + "language_loss": 0.76031268, + "learning_rate": 0.00017633142163258565, + "loss": 0.77039945, + "num_input_tokens_seen": 315502448, + "router_z_loss_mlp": 0.078125, + "step": 3807, + "time_per_iteration": 5.021719217300415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074035, + "balance_loss_mlp": 1.04846501, + "epoch": 0.7325894574836476, + "flos": 464862260736.0, + "grad_norm": 0.059101144317775495, + "language_loss": 0.84063375, + "learning_rate": 0.00017609402575064875, + "loss": 0.85137415, + "num_input_tokens_seen": 315569472, + "router_z_loss_mlp": 0.25585938, + "step": 3808, + "time_per_iteration": 2.601905345916748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070564, + "balance_loss_mlp": 1.04569697, + "epoch": 0.7327818391689112, + "flos": 495493065216.0, + "grad_norm": 0.06287307202427123, + "language_loss": 0.81348085, + "learning_rate": 0.00017585675562016367, + "loss": 0.8241865, + "num_input_tokens_seen": 315637632, + "router_z_loss_mlp": 0.2487793, + "step": 3809, + "time_per_iteration": 2.5671656131744385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101117, + "balance_loss_mlp": 1.0035404, + "epoch": 0.7329742208541746, + "flos": 1433489508864.0, + "grad_norm": 0.009961164092808575, + "language_loss": 0.77212846, + "learning_rate": 0.0001756196113332465, + "loss": 0.78224015, + "num_input_tokens_seen": 315863648, + "router_z_loss_mlp": 0.07617188, + "step": 3810, + "time_per_iteration": 4.85601019859314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067566, + "balance_loss_mlp": 1.04141164, + "epoch": 0.7331666025394382, + "flos": 496889095680.0, + "grad_norm": 0.0717218178349286, + "language_loss": 0.85344338, + "learning_rate": 0.00017538259298196474, + "loss": 0.86411905, + "num_input_tokens_seen": 315930752, + "router_z_loss_mlp": 0.26171875, + "step": 3811, + "time_per_iteration": 2.5674660205841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066538, + "balance_loss_mlp": 1.0418613, + "epoch": 0.7333589842247018, + "flos": 538524785664.0, + "grad_norm": 0.06191722538005279, + "language_loss": 0.8221786, + "learning_rate": 0.00017514570065833745, + "loss": 0.83284396, + "num_input_tokens_seen": 316006400, + "router_z_loss_mlp": 0.24658203, + "step": 3812, + "time_per_iteration": 2.7502520084381104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065481, + "balance_loss_mlp": 1.04084063, + "epoch": 0.7335513659099654, + "flos": 491067836928.0, + "grad_norm": 0.09654235990380512, + "language_loss": 0.80427462, + "learning_rate": 0.00017490893445433426, + "loss": 0.81492949, + "num_input_tokens_seen": 316075824, + "router_z_loss_mlp": 0.24633789, + "step": 3813, + "time_per_iteration": 2.644380569458008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067064, + "balance_loss_mlp": 1.04200649, + "epoch": 0.733743747595229, + "flos": 562150844928.0, + "grad_norm": 0.05501039024116298, + "language_loss": 0.81422758, + "learning_rate": 0.00017467229446187587, + "loss": 0.82489812, + "num_input_tokens_seen": 316148336, + "router_z_loss_mlp": 0.25061035, + "step": 3814, + "time_per_iteration": 2.6799376010894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072542, + "balance_loss_mlp": 1.04665017, + "epoch": 0.7339361292804925, + "flos": 538581685248.0, + "grad_norm": 0.054283563918009155, + "language_loss": 0.81726635, + "learning_rate": 0.00017443578077283424, + "loss": 0.82799172, + "num_input_tokens_seen": 316220960, + "router_z_loss_mlp": 0.2590332, + "step": 3815, + "time_per_iteration": 2.6411497592926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072187, + "balance_loss_mlp": 1.04731965, + "epoch": 0.734128510965756, + "flos": 548469895680.0, + "grad_norm": 0.06697852947124575, + "language_loss": 0.85358864, + "learning_rate": 0.0001741993934790319, + "loss": 0.8643105, + "num_input_tokens_seen": 316295824, + "router_z_loss_mlp": 0.24853516, + "step": 3816, + "time_per_iteration": 2.813728094100952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106954, + "balance_loss_mlp": 1.04457784, + "epoch": 0.7343208926510196, + "flos": 540066548736.0, + "grad_norm": 0.07301575323096621, + "language_loss": 0.83966112, + "learning_rate": 0.00017396313267224273, + "loss": 0.85035658, + "num_input_tokens_seen": 316368064, + "router_z_loss_mlp": 0.24963379, + "step": 3817, + "time_per_iteration": 2.7044739723205566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074705, + "balance_loss_mlp": 1.04939699, + "epoch": 0.7345132743362832, + "flos": 571095277056.0, + "grad_norm": 0.05834260982052782, + "language_loss": 0.88725907, + "learning_rate": 0.0001737269984441912, + "loss": 0.89800614, + "num_input_tokens_seen": 316437440, + "router_z_loss_mlp": 0.2532959, + "step": 3818, + "time_per_iteration": 2.6479249000549316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068069, + "balance_loss_mlp": 1.04333293, + "epoch": 0.7347056560215467, + "flos": 545403621888.0, + "grad_norm": 0.04867070462384417, + "language_loss": 0.85300821, + "learning_rate": 0.00017349099088655263, + "loss": 0.86368895, + "num_input_tokens_seen": 316511936, + "router_z_loss_mlp": 0.24743652, + "step": 3819, + "time_per_iteration": 2.687084197998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068808, + "balance_loss_mlp": 1.04391694, + "epoch": 0.7348980377068103, + "flos": 595949239296.0, + "grad_norm": 0.05808726713133537, + "language_loss": 0.81269497, + "learning_rate": 0.00017325511009095375, + "loss": 0.82338297, + "num_input_tokens_seen": 316584304, + "router_z_loss_mlp": 0.24902344, + "step": 3820, + "time_per_iteration": 2.7207248210906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068942, + "balance_loss_mlp": 1.04350281, + "epoch": 0.7350904193920739, + "flos": 538554521088.0, + "grad_norm": 0.05934097649534438, + "language_loss": 0.83911049, + "learning_rate": 0.00017301935614897113, + "loss": 0.84979987, + "num_input_tokens_seen": 316659024, + "router_z_loss_mlp": 0.2545166, + "step": 3821, + "time_per_iteration": 2.6836743354797363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073907, + "balance_loss_mlp": 1.04855156, + "epoch": 0.7352828010773375, + "flos": 512981434368.0, + "grad_norm": 0.0494453398159371, + "language_loss": 0.81605434, + "learning_rate": 0.00017278372915213274, + "loss": 0.82679343, + "num_input_tokens_seen": 316732544, + "router_z_loss_mlp": 0.25378418, + "step": 3822, + "time_per_iteration": 2.651975393295288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008179, + "balance_loss_mlp": 1.00121737, + "epoch": 0.735475182762601, + "flos": 1553820848640.0, + "grad_norm": 0.007266533432635982, + "language_loss": 0.79893845, + "learning_rate": 0.00017254822919191693, + "loss": 0.80902022, + "num_input_tokens_seen": 316967104, + "router_z_loss_mlp": 0.06982422, + "step": 3823, + "time_per_iteration": 4.976882457733154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075947, + "balance_loss_mlp": 1.05096054, + "epoch": 0.7356675644478645, + "flos": 681308610048.0, + "grad_norm": 0.0894957625662193, + "language_loss": 0.80647838, + "learning_rate": 0.00017231285635975314, + "loss": 0.81723785, + "num_input_tokens_seen": 317048304, + "router_z_loss_mlp": 0.25, + "step": 3824, + "time_per_iteration": 2.8835809230804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074173, + "balance_loss_mlp": 1.04871035, + "epoch": 0.7358599461331281, + "flos": 515215157760.0, + "grad_norm": 0.0659132638478438, + "language_loss": 0.83565962, + "learning_rate": 0.00017207761074702115, + "loss": 0.84640133, + "num_input_tokens_seen": 317115968, + "router_z_loss_mlp": 0.25488281, + "step": 3825, + "time_per_iteration": 2.5829551219940186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107598, + "balance_loss_mlp": 1.05089879, + "epoch": 0.7360523278183917, + "flos": 443973934080.0, + "grad_norm": 0.05423674228427361, + "language_loss": 0.83801639, + "learning_rate": 0.0001718424924450514, + "loss": 0.84877622, + "num_input_tokens_seen": 317185680, + "router_z_loss_mlp": 0.25085449, + "step": 3826, + "time_per_iteration": 2.6215810775756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072026, + "balance_loss_mlp": 1.0464201, + "epoch": 0.7362447095036553, + "flos": 603423489024.0, + "grad_norm": 0.047662784770319516, + "language_loss": 0.86247635, + "learning_rate": 0.00017160750154512482, + "loss": 0.8731966, + "num_input_tokens_seen": 317258800, + "router_z_loss_mlp": 0.25610352, + "step": 3827, + "time_per_iteration": 2.7316274642944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072121, + "balance_loss_mlp": 1.04726601, + "epoch": 0.7364370911889189, + "flos": 553095184896.0, + "grad_norm": 0.05425230647323069, + "language_loss": 0.83439684, + "learning_rate": 0.0001713726381384731, + "loss": 0.84511811, + "num_input_tokens_seen": 317334608, + "router_z_loss_mlp": 0.24841309, + "step": 3828, + "time_per_iteration": 2.7767257690429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107353, + "balance_loss_mlp": 1.04826927, + "epoch": 0.7366294728741823, + "flos": 449061387264.0, + "grad_norm": 0.06782192310346803, + "language_loss": 0.81600618, + "learning_rate": 0.00017113790231627812, + "loss": 0.8267414, + "num_input_tokens_seen": 317397504, + "router_z_loss_mlp": 0.25280762, + "step": 3829, + "time_per_iteration": 2.4791929721832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100591, + "balance_loss_mlp": 0.99885303, + "epoch": 0.7368218545594459, + "flos": 1535502500352.0, + "grad_norm": 0.00950707875200575, + "language_loss": 0.79258227, + "learning_rate": 0.0001709032941696726, + "loss": 0.80264139, + "num_input_tokens_seen": 317611472, + "router_z_loss_mlp": 0.07080078, + "step": 3830, + "time_per_iteration": 6.233624696731567 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075447, + "balance_loss_mlp": 1.05000758, + "epoch": 0.7370142362447095, + "flos": 515425130496.0, + "grad_norm": 0.0605697653719091, + "language_loss": 0.82126367, + "learning_rate": 0.00017066881378973936, + "loss": 0.83201814, + "num_input_tokens_seen": 317681328, + "router_z_loss_mlp": 0.25463867, + "step": 3831, + "time_per_iteration": 2.6804988384246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107347, + "balance_loss_mlp": 1.0483644, + "epoch": 0.7372066179299731, + "flos": 500805172224.0, + "grad_norm": 0.051765900336182155, + "language_loss": 0.83060026, + "learning_rate": 0.00017043446126751189, + "loss": 0.84133494, + "num_input_tokens_seen": 317752336, + "router_z_loss_mlp": 0.2512207, + "step": 3832, + "time_per_iteration": 2.677116870880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078507, + "balance_loss_mlp": 1.05299592, + "epoch": 0.7373989996152366, + "flos": 558083893248.0, + "grad_norm": 0.06293756083772555, + "language_loss": 0.76538479, + "learning_rate": 0.00017020023669397376, + "loss": 0.7761699, + "num_input_tokens_seen": 317824112, + "router_z_loss_mlp": 0.25524902, + "step": 3833, + "time_per_iteration": 2.6688897609710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107645, + "balance_loss_mlp": 1.04953265, + "epoch": 0.7375913813005002, + "flos": 506777306112.0, + "grad_norm": 0.06089571273560201, + "language_loss": 0.81964701, + "learning_rate": 0.0001699661401600589, + "loss": 0.83041155, + "num_input_tokens_seen": 317889120, + "router_z_loss_mlp": 0.26953125, + "step": 3834, + "time_per_iteration": 2.6013684272766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072143, + "balance_loss_mlp": 1.04688239, + "epoch": 0.7377837629857638, + "flos": 486183015936.0, + "grad_norm": 0.05707021957695399, + "language_loss": 0.78780484, + "learning_rate": 0.00016973217175665205, + "loss": 0.79852629, + "num_input_tokens_seen": 317953792, + "router_z_loss_mlp": 0.25268555, + "step": 3835, + "time_per_iteration": 2.5545742511749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004747, + "balance_loss_mlp": 0.99759406, + "epoch": 0.7379761446710273, + "flos": 1414693942272.0, + "grad_norm": 0.011573205656029463, + "language_loss": 0.8116616, + "learning_rate": 0.00016949833157458755, + "loss": 0.82170916, + "num_input_tokens_seen": 318184848, + "router_z_loss_mlp": 0.07128906, + "step": 3836, + "time_per_iteration": 4.935137748718262 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073573, + "balance_loss_mlp": 1.04770422, + "epoch": 0.7381685263562909, + "flos": 629737721856.0, + "grad_norm": 0.05911051824592706, + "language_loss": 0.8443321, + "learning_rate": 0.00016926461970465047, + "loss": 0.85506785, + "num_input_tokens_seen": 318259296, + "router_z_loss_mlp": 0.25878906, + "step": 3837, + "time_per_iteration": 2.753530979156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070473, + "balance_loss_mlp": 1.04638028, + "epoch": 0.7383609080415544, + "flos": 739224589824.0, + "grad_norm": 0.055427222427827466, + "language_loss": 0.84596455, + "learning_rate": 0.00016903103623757516, + "loss": 0.85666919, + "num_input_tokens_seen": 318344704, + "router_z_loss_mlp": 0.2409668, + "step": 3838, + "time_per_iteration": 3.0433106422424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070361, + "balance_loss_mlp": 1.04518437, + "epoch": 0.738553289726818, + "flos": 550206950400.0, + "grad_norm": 0.06096616849216926, + "language_loss": 0.80038297, + "learning_rate": 0.00016879758126404738, + "loss": 0.81108665, + "num_input_tokens_seen": 318416128, + "router_z_loss_mlp": 0.25183105, + "step": 3839, + "time_per_iteration": 2.726783037185669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071404, + "balance_loss_mlp": 1.04598832, + "epoch": 0.7387456714120816, + "flos": 910294640640.0, + "grad_norm": 0.0748668456042948, + "language_loss": 0.80022889, + "learning_rate": 0.00016856425487470216, + "loss": 0.81094301, + "num_input_tokens_seen": 318498128, + "router_z_loss_mlp": 0.25439453, + "step": 3840, + "time_per_iteration": 3.0780324935913086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067415, + "balance_loss_mlp": 1.04228592, + "epoch": 0.7389380530973452, + "flos": 852684807168.0, + "grad_norm": 0.06187629511856373, + "language_loss": 0.79238671, + "learning_rate": 0.00016833105716012486, + "loss": 0.80306083, + "num_input_tokens_seen": 318578048, + "router_z_loss_mlp": 0.25146484, + "step": 3841, + "time_per_iteration": 3.1636850833892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070126, + "balance_loss_mlp": 1.04452026, + "epoch": 0.7391304347826086, + "flos": 817026020352.0, + "grad_norm": 0.05887802150454755, + "language_loss": 0.85242188, + "learning_rate": 0.00016809798821085088, + "loss": 0.86312318, + "num_input_tokens_seen": 318654784, + "router_z_loss_mlp": 0.25622559, + "step": 3842, + "time_per_iteration": 2.990478515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069406, + "balance_loss_mlp": 1.04390705, + "epoch": 0.7393228164678722, + "flos": 572819848704.0, + "grad_norm": 0.051928079352218694, + "language_loss": 0.8929773, + "learning_rate": 0.00016786504811736565, + "loss": 0.90367138, + "num_input_tokens_seen": 318727680, + "router_z_loss_mlp": 0.25524902, + "step": 3843, + "time_per_iteration": 2.6872341632843018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066886, + "balance_loss_mlp": 1.04195881, + "epoch": 0.7395151981531358, + "flos": 685237169664.0, + "grad_norm": 0.06625408386492132, + "language_loss": 0.82992953, + "learning_rate": 0.00016763223697010442, + "loss": 0.84059834, + "num_input_tokens_seen": 318807568, + "router_z_loss_mlp": 0.24938965, + "step": 3844, + "time_per_iteration": 2.9391865730285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068457, + "balance_loss_mlp": 1.04412675, + "epoch": 0.7397075798383994, + "flos": 556366662144.0, + "grad_norm": 0.05828893088019289, + "language_loss": 0.84686291, + "learning_rate": 0.00016739955485945256, + "loss": 0.85754752, + "num_input_tokens_seen": 318881792, + "router_z_loss_mlp": 0.24304199, + "step": 3845, + "time_per_iteration": 2.7142622470855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072077, + "balance_loss_mlp": 1.04656637, + "epoch": 0.739899961523663, + "flos": 546782400000.0, + "grad_norm": 0.07100000886785215, + "language_loss": 0.85870165, + "learning_rate": 0.00016716700187574513, + "loss": 0.86942244, + "num_input_tokens_seen": 318951552, + "router_z_loss_mlp": 0.25537109, + "step": 3846, + "time_per_iteration": 2.6977670192718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067392, + "balance_loss_mlp": 1.04300213, + "epoch": 0.7400923432089265, + "flos": 609190419456.0, + "grad_norm": 0.054057188356913304, + "language_loss": 0.84146428, + "learning_rate": 0.0001669345781092675, + "loss": 0.85213816, + "num_input_tokens_seen": 319022304, + "router_z_loss_mlp": 0.24377441, + "step": 3847, + "time_per_iteration": 2.7265117168426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074624, + "balance_loss_mlp": 1.05013824, + "epoch": 0.7402847248941901, + "flos": 591007518720.0, + "grad_norm": 0.06355688718688712, + "language_loss": 0.87326193, + "learning_rate": 0.0001667022836502546, + "loss": 0.88400817, + "num_input_tokens_seen": 319093200, + "router_z_loss_mlp": 0.24499512, + "step": 3848, + "time_per_iteration": 2.7551324367523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073869, + "balance_loss_mlp": 1.04852557, + "epoch": 0.7404771065794536, + "flos": 477369635328.0, + "grad_norm": 0.08017271540920272, + "language_loss": 0.828776, + "learning_rate": 0.00016647011858889077, + "loss": 0.83951473, + "num_input_tokens_seen": 319159712, + "router_z_loss_mlp": 0.25378418, + "step": 3849, + "time_per_iteration": 2.5299232006073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073654, + "balance_loss_mlp": 1.04783297, + "epoch": 0.7406694882647172, + "flos": 496446755328.0, + "grad_norm": 0.06268234066304752, + "language_loss": 0.85992008, + "learning_rate": 0.00016623808301531056, + "loss": 0.87065661, + "num_input_tokens_seen": 319230544, + "router_z_loss_mlp": 0.25842285, + "step": 3850, + "time_per_iteration": 2.6404004096984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077251, + "balance_loss_mlp": 1.05166864, + "epoch": 0.7408618699499807, + "flos": 562205173248.0, + "grad_norm": 0.07684631062218569, + "language_loss": 0.79265726, + "learning_rate": 0.00016600617701959842, + "loss": 0.80342978, + "num_input_tokens_seen": 319305440, + "router_z_loss_mlp": 0.25610352, + "step": 3851, + "time_per_iteration": 2.719182014465332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007219, + "balance_loss_mlp": 1.00025725, + "epoch": 0.7410542516352443, + "flos": 1388228834304.0, + "grad_norm": 0.009023170879128087, + "language_loss": 0.78843814, + "learning_rate": 0.00016577440069178811, + "loss": 0.79851031, + "num_input_tokens_seen": 319534384, + "router_z_loss_mlp": 0.06982422, + "step": 3852, + "time_per_iteration": 4.949675798416138 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073682, + "balance_loss_mlp": 1.04883838, + "epoch": 0.7412466333205079, + "flos": 669999776256.0, + "grad_norm": 0.05701919948873552, + "language_loss": 0.81264549, + "learning_rate": 0.00016554275412186315, + "loss": 0.82338226, + "num_input_tokens_seen": 319610960, + "router_z_loss_mlp": 0.24853516, + "step": 3853, + "time_per_iteration": 2.843740701675415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074744, + "balance_loss_mlp": 1.04884005, + "epoch": 0.7414390150057715, + "flos": 489293706240.0, + "grad_norm": 0.06536701062861092, + "language_loss": 0.80980605, + "learning_rate": 0.0001653112373997568, + "loss": 0.82055348, + "num_input_tokens_seen": 319683872, + "router_z_loss_mlp": 0.25927734, + "step": 3854, + "time_per_iteration": 2.65200138092041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073741, + "balance_loss_mlp": 1.04929078, + "epoch": 0.7416313966910351, + "flos": 599393613312.0, + "grad_norm": 0.06830067718858168, + "language_loss": 0.74823475, + "learning_rate": 0.0001650798506153517, + "loss": 0.75897211, + "num_input_tokens_seen": 319750032, + "router_z_loss_mlp": 0.24450684, + "step": 3855, + "time_per_iteration": 2.687006950378418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070882, + "balance_loss_mlp": 1.04607463, + "epoch": 0.7418237783762985, + "flos": 542539980288.0, + "grad_norm": 0.07905469083436836, + "language_loss": 0.84182036, + "learning_rate": 0.00016484859385848023, + "loss": 0.85252917, + "num_input_tokens_seen": 319818864, + "router_z_loss_mlp": 0.24816895, + "step": 3856, + "time_per_iteration": 2.6188693046569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072107, + "balance_loss_mlp": 1.0480032, + "epoch": 0.7420161600615621, + "flos": 544136071680.0, + "grad_norm": 0.061726371250172385, + "language_loss": 0.77338076, + "learning_rate": 0.0001646174672189243, + "loss": 0.7841019, + "num_input_tokens_seen": 319888816, + "router_z_loss_mlp": 0.24108887, + "step": 3857, + "time_per_iteration": 2.649557590484619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067204, + "balance_loss_mlp": 1.0426352, + "epoch": 0.7422085417468257, + "flos": 527178875904.0, + "grad_norm": 0.0578395137702567, + "language_loss": 0.80607724, + "learning_rate": 0.00016438647078641488, + "loss": 0.81674922, + "num_input_tokens_seen": 319956176, + "router_z_loss_mlp": 0.24572754, + "step": 3858, + "time_per_iteration": 2.619621515274048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072571, + "balance_loss_mlp": 1.04788327, + "epoch": 0.7424009234320893, + "flos": 508674774528.0, + "grad_norm": 0.06183948781118283, + "language_loss": 0.83172727, + "learning_rate": 0.00016415560465063344, + "loss": 0.842453, + "num_input_tokens_seen": 320028560, + "router_z_loss_mlp": 0.24694824, + "step": 3859, + "time_per_iteration": 2.7068328857421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067453, + "balance_loss_mlp": 1.04234803, + "epoch": 0.7425933051173528, + "flos": 512598564864.0, + "grad_norm": 0.07149126280637065, + "language_loss": 0.79273307, + "learning_rate": 0.0001639248689012095, + "loss": 0.80340761, + "num_input_tokens_seen": 320096112, + "router_z_loss_mlp": 0.2512207, + "step": 3860, + "time_per_iteration": 2.559715986251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069308, + "balance_loss_mlp": 1.04384458, + "epoch": 0.7427856868026164, + "flos": 458302053888.0, + "grad_norm": 0.06474025834236737, + "language_loss": 0.87225401, + "learning_rate": 0.00016369426362772271, + "loss": 0.88294709, + "num_input_tokens_seen": 320168992, + "router_z_loss_mlp": 0.25463867, + "step": 3861, + "time_per_iteration": 2.768488883972168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071358, + "balance_loss_mlp": 1.0464673, + "epoch": 0.74297806848788, + "flos": 605019580416.0, + "grad_norm": 0.05729012917412524, + "language_loss": 0.80612242, + "learning_rate": 0.00016346378891970233, + "loss": 0.816836, + "num_input_tokens_seen": 320247264, + "router_z_loss_mlp": 0.24890137, + "step": 3862, + "time_per_iteration": 2.805666923522949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107111, + "balance_loss_mlp": 1.04621959, + "epoch": 0.7431704501731435, + "flos": 891390044160.0, + "grad_norm": 0.054983080042834975, + "language_loss": 0.81883794, + "learning_rate": 0.00016323344486662633, + "loss": 0.82954907, + "num_input_tokens_seen": 320338992, + "router_z_loss_mlp": 0.24902344, + "step": 3863, + "time_per_iteration": 3.301302671432495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072159, + "balance_loss_mlp": 1.04673147, + "epoch": 0.7433628318584071, + "flos": 592163841024.0, + "grad_norm": 0.05456395021125743, + "language_loss": 0.78892124, + "learning_rate": 0.00016300323155792247, + "loss": 0.7996428, + "num_input_tokens_seen": 320422096, + "router_z_loss_mlp": 0.2545166, + "step": 3864, + "time_per_iteration": 2.8931703567504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066338, + "balance_loss_mlp": 1.0422101, + "epoch": 0.7435552135436706, + "flos": 477154520064.0, + "grad_norm": 0.05569760658066131, + "language_loss": 0.88605452, + "learning_rate": 0.00016277314908296687, + "loss": 0.89671785, + "num_input_tokens_seen": 320492640, + "router_z_loss_mlp": 0.24121094, + "step": 3865, + "time_per_iteration": 2.684453248977661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071833, + "balance_loss_mlp": 1.04628646, + "epoch": 0.7437475952289342, + "flos": 673184618496.0, + "grad_norm": 0.09698057624651829, + "language_loss": 0.75883031, + "learning_rate": 0.00016254319753108604, + "loss": 0.76954859, + "num_input_tokens_seen": 320565264, + "router_z_loss_mlp": 0.25561523, + "step": 3866, + "time_per_iteration": 2.8249847888946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072201, + "balance_loss_mlp": 1.04626155, + "epoch": 0.7439399769141978, + "flos": 770428786176.0, + "grad_norm": 0.06903603879321982, + "language_loss": 0.76659936, + "learning_rate": 0.00016231337699155492, + "loss": 0.7773214, + "num_input_tokens_seen": 320647584, + "router_z_loss_mlp": 0.25964355, + "step": 3867, + "time_per_iteration": 2.954054594039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074419, + "balance_loss_mlp": 1.04905081, + "epoch": 0.7441323585994614, + "flos": 647777088000.0, + "grad_norm": 0.052812057289516566, + "language_loss": 0.78941596, + "learning_rate": 0.0001620836875535977, + "loss": 0.80016011, + "num_input_tokens_seen": 320722752, + "router_z_loss_mlp": 0.25378418, + "step": 3868, + "time_per_iteration": 2.868677854537964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065856, + "balance_loss_mlp": 1.04120398, + "epoch": 0.7443247402847248, + "flos": 565372763136.0, + "grad_norm": 0.06361911402361287, + "language_loss": 0.806584, + "learning_rate": 0.00016185412930638766, + "loss": 0.81724262, + "num_input_tokens_seen": 320802496, + "router_z_loss_mlp": 0.24658203, + "step": 3869, + "time_per_iteration": 2.8323211669921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071749, + "balance_loss_mlp": 1.04708433, + "epoch": 0.7445171219699884, + "flos": 578529879552.0, + "grad_norm": 0.05653769935152868, + "language_loss": 0.82733011, + "learning_rate": 0.00016162470233904765, + "loss": 0.83804756, + "num_input_tokens_seen": 320872496, + "router_z_loss_mlp": 0.24658203, + "step": 3870, + "time_per_iteration": 2.7211382389068604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073935, + "balance_loss_mlp": 1.04862642, + "epoch": 0.744709503655252, + "flos": 618875997696.0, + "grad_norm": 0.06316774486708195, + "language_loss": 0.82555729, + "learning_rate": 0.00016139540674064856, + "loss": 0.83629668, + "num_input_tokens_seen": 320944992, + "router_z_loss_mlp": 0.25280762, + "step": 3871, + "time_per_iteration": 2.739121675491333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070986, + "balance_loss_mlp": 1.04655969, + "epoch": 0.7449018853405156, + "flos": 528619322880.0, + "grad_norm": 0.05640449284487911, + "language_loss": 0.78114176, + "learning_rate": 0.00016116624260021113, + "loss": 0.79185158, + "num_input_tokens_seen": 321020208, + "router_z_loss_mlp": 0.24414062, + "step": 3872, + "time_per_iteration": 2.7855870723724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071272, + "balance_loss_mlp": 1.04650021, + "epoch": 0.7450942670257792, + "flos": 433314842112.0, + "grad_norm": 0.05661952400288272, + "language_loss": 0.84321451, + "learning_rate": 0.0001609372100067046, + "loss": 0.85392725, + "num_input_tokens_seen": 321085984, + "router_z_loss_mlp": 0.24768066, + "step": 3873, + "time_per_iteration": 2.5051002502441406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076949, + "balance_loss_mlp": 1.05139041, + "epoch": 0.7452866487110427, + "flos": 696882258432.0, + "grad_norm": 0.07051271048779074, + "language_loss": 0.85103834, + "learning_rate": 0.0001607083090490475, + "loss": 0.86180782, + "num_input_tokens_seen": 321163200, + "router_z_loss_mlp": 0.25585938, + "step": 3874, + "time_per_iteration": 2.865432024002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073052, + "balance_loss_mlp": 1.04762459, + "epoch": 0.7454790303963063, + "flos": 512210552832.0, + "grad_norm": 0.0748811600341369, + "language_loss": 0.80497265, + "learning_rate": 0.00016047953981610714, + "loss": 0.81570315, + "num_input_tokens_seen": 321237328, + "router_z_loss_mlp": 0.25439453, + "step": 3875, + "time_per_iteration": 2.7216734886169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007311, + "balance_loss_mlp": 1.00044441, + "epoch": 0.7456714120815698, + "flos": 1325949668352.0, + "grad_norm": 0.007625795803468779, + "language_loss": 0.7972964, + "learning_rate": 0.00016025090239669916, + "loss": 0.80736953, + "num_input_tokens_seen": 321456192, + "router_z_loss_mlp": 0.06884766, + "step": 3876, + "time_per_iteration": 5.382456064224243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075501, + "balance_loss_mlp": 1.05013371, + "epoch": 0.7458637937668334, + "flos": 721711627776.0, + "grad_norm": 0.05488514319290027, + "language_loss": 0.81060588, + "learning_rate": 0.0001600223968795889, + "loss": 0.82136083, + "num_input_tokens_seen": 321530560, + "router_z_loss_mlp": 0.25378418, + "step": 3877, + "time_per_iteration": 2.9120445251464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100724, + "balance_loss_mlp": 1.0003736, + "epoch": 0.746056175452097, + "flos": 1501580395008.0, + "grad_norm": 0.007629360710496433, + "language_loss": 0.75696075, + "learning_rate": 0.00015979402335349004, + "loss": 0.7670331, + "num_input_tokens_seen": 321760928, + "router_z_loss_mlp": 0.06884766, + "step": 3878, + "time_per_iteration": 4.901887893676758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072632, + "balance_loss_mlp": 1.04855156, + "epoch": 0.7462485571373605, + "flos": 520245711360.0, + "grad_norm": 0.07646083771091663, + "language_loss": 0.82140052, + "learning_rate": 0.00015956578190706483, + "loss": 0.83212686, + "num_input_tokens_seen": 321833248, + "router_z_loss_mlp": 0.24072266, + "step": 3879, + "time_per_iteration": 2.665292978286743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106672, + "balance_loss_mlp": 1.04198372, + "epoch": 0.7464409388226241, + "flos": 481206790656.0, + "grad_norm": 0.05773895513703621, + "language_loss": 0.75869894, + "learning_rate": 0.00015933767262892468, + "loss": 0.76936615, + "num_input_tokens_seen": 321905904, + "router_z_loss_mlp": 0.24743652, + "step": 3880, + "time_per_iteration": 2.7083511352539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068645, + "balance_loss_mlp": 1.04439831, + "epoch": 0.7466333205078877, + "flos": 486761177088.0, + "grad_norm": 0.07814319262934219, + "language_loss": 0.82429087, + "learning_rate": 0.00015910969560762927, + "loss": 0.83497727, + "num_input_tokens_seen": 321971920, + "router_z_loss_mlp": 0.2421875, + "step": 3881, + "time_per_iteration": 2.556643009185791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072001, + "balance_loss_mlp": 1.04824293, + "epoch": 0.7468257021931513, + "flos": 611293091328.0, + "grad_norm": 0.05526796797761112, + "language_loss": 0.8303771, + "learning_rate": 0.00015888185093168727, + "loss": 0.84109712, + "num_input_tokens_seen": 322041904, + "router_z_loss_mlp": 0.2376709, + "step": 3882, + "time_per_iteration": 2.7359204292297363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074755, + "balance_loss_mlp": 1.0493511, + "epoch": 0.7470180838784147, + "flos": 533459727360.0, + "grad_norm": 0.05340233113956033, + "language_loss": 0.81238657, + "learning_rate": 0.00015865413868955581, + "loss": 0.82313412, + "num_input_tokens_seen": 322110816, + "router_z_loss_mlp": 0.25439453, + "step": 3883, + "time_per_iteration": 2.658531665802002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066435, + "balance_loss_mlp": 1.04343939, + "epoch": 0.7472104655636783, + "flos": 739338388992.0, + "grad_norm": 0.051714571371053245, + "language_loss": 0.82935232, + "learning_rate": 0.00015842655896964054, + "loss": 0.8400166, + "num_input_tokens_seen": 322192704, + "router_z_loss_mlp": 0.22973633, + "step": 3884, + "time_per_iteration": 3.018538475036621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077883, + "balance_loss_mlp": 1.05318248, + "epoch": 0.7474028472489419, + "flos": 640305409536.0, + "grad_norm": 0.06900594182420934, + "language_loss": 0.74108642, + "learning_rate": 0.00015819911186029567, + "loss": 0.75186527, + "num_input_tokens_seen": 322263888, + "router_z_loss_mlp": 0.24719238, + "step": 3885, + "time_per_iteration": 2.767460823059082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074459, + "balance_loss_mlp": 1.04935396, + "epoch": 0.7475952289342055, + "flos": 590249120256.0, + "grad_norm": 0.05191869003121536, + "language_loss": 0.8641215, + "learning_rate": 0.00015797179744982443, + "loss": 0.87486613, + "num_input_tokens_seen": 322331936, + "router_z_loss_mlp": 0.25073242, + "step": 3886, + "time_per_iteration": 2.722130060195923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074122, + "balance_loss_mlp": 1.04973185, + "epoch": 0.7477876106194691, + "flos": 488191712256.0, + "grad_norm": 0.0600854170312897, + "language_loss": 0.79131281, + "learning_rate": 0.00015774461582647765, + "loss": 0.80205405, + "num_input_tokens_seen": 322402032, + "router_z_loss_mlp": 0.24389648, + "step": 3887, + "time_per_iteration": 2.6940510272979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072368, + "balance_loss_mlp": 1.04781055, + "epoch": 0.7479799923047326, + "flos": 554733494784.0, + "grad_norm": 0.07732553341953252, + "language_loss": 0.8101362, + "learning_rate": 0.00015751756707845505, + "loss": 0.82085991, + "num_input_tokens_seen": 322472512, + "router_z_loss_mlp": 0.24560547, + "step": 3888, + "time_per_iteration": 2.6013286113739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072242, + "balance_loss_mlp": 1.04748178, + "epoch": 0.7481723739899961, + "flos": 767387105280.0, + "grad_norm": 0.05831839301711609, + "language_loss": 0.88772756, + "learning_rate": 0.00015729065129390502, + "loss": 0.89844996, + "num_input_tokens_seen": 322555104, + "router_z_loss_mlp": 0.24768066, + "step": 3889, + "time_per_iteration": 3.000511884689331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107686, + "balance_loss_mlp": 1.05071712, + "epoch": 0.7483647556752597, + "flos": 496172542464.0, + "grad_norm": 0.09513844178064898, + "language_loss": 0.82148743, + "learning_rate": 0.0001570638685609241, + "loss": 0.83225602, + "num_input_tokens_seen": 322621904, + "router_z_loss_mlp": 0.26159668, + "step": 3890, + "time_per_iteration": 2.5567352771759033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072051, + "balance_loss_mlp": 1.04621816, + "epoch": 0.7485571373605233, + "flos": 472850431488.0, + "grad_norm": 0.06496446825599186, + "language_loss": 0.80583847, + "learning_rate": 0.00015683721896755693, + "loss": 0.81655896, + "num_input_tokens_seen": 322688928, + "router_z_loss_mlp": 0.25866699, + "step": 3891, + "time_per_iteration": 2.5300092697143555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017758, + "balance_loss_mlp": 1.01103473, + "epoch": 0.7487495190457868, + "flos": 1554473161728.0, + "grad_norm": 0.007988812932881569, + "language_loss": 0.82210493, + "learning_rate": 0.00015661070260179682, + "loss": 0.83228242, + "num_input_tokens_seen": 322928464, + "router_z_loss_mlp": 0.06738281, + "step": 3892, + "time_per_iteration": 4.90599799156189 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072454, + "balance_loss_mlp": 1.04705048, + "epoch": 0.7489419007310504, + "flos": 581845773312.0, + "grad_norm": 0.06234068524332242, + "language_loss": 0.85285282, + "learning_rate": 0.00015638431955158528, + "loss": 0.86357737, + "num_input_tokens_seen": 323002672, + "router_z_loss_mlp": 0.25402832, + "step": 3893, + "time_per_iteration": 2.674448251724243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077206, + "balance_loss_mlp": 1.05186236, + "epoch": 0.749134282416314, + "flos": 567576751104.0, + "grad_norm": 0.051873425900431515, + "language_loss": 0.8129698, + "learning_rate": 0.00015615806990481186, + "loss": 0.82374185, + "num_input_tokens_seen": 323076480, + "router_z_loss_mlp": 0.25366211, + "step": 3894, + "time_per_iteration": 2.749011754989624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075566, + "balance_loss_mlp": 1.05075812, + "epoch": 0.7493266641015776, + "flos": 533061803520.0, + "grad_norm": 0.04941596722004592, + "language_loss": 0.84629339, + "learning_rate": 0.00015593195374931452, + "loss": 0.85704899, + "num_input_tokens_seen": 323151840, + "router_z_loss_mlp": 0.24804688, + "step": 3895, + "time_per_iteration": 2.7212753295898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077894, + "balance_loss_mlp": 1.05278873, + "epoch": 0.7495190457868411, + "flos": 523613362176.0, + "grad_norm": 0.06116342211722219, + "language_loss": 0.80278218, + "learning_rate": 0.00015570597117287922, + "loss": 0.8135612, + "num_input_tokens_seen": 323223376, + "router_z_loss_mlp": 0.25109863, + "step": 3896, + "time_per_iteration": 2.7101802825927734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070615, + "balance_loss_mlp": 1.04633236, + "epoch": 0.7497114274721046, + "flos": 514187315712.0, + "grad_norm": 0.069447374717696, + "language_loss": 0.77728438, + "learning_rate": 0.0001554801222632406, + "loss": 0.78799057, + "num_input_tokens_seen": 323290288, + "router_z_loss_mlp": 0.24291992, + "step": 3897, + "time_per_iteration": 2.6742734909057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107305, + "balance_loss_mlp": 1.04788542, + "epoch": 0.7499038091573682, + "flos": 495006308352.0, + "grad_norm": 0.06164453931329584, + "language_loss": 0.85245335, + "learning_rate": 0.00015525440710808052, + "loss": 0.86318392, + "num_input_tokens_seen": 323359568, + "router_z_loss_mlp": 0.25170898, + "step": 3898, + "time_per_iteration": 2.653172016143799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107387, + "balance_loss_mlp": 1.04789472, + "epoch": 0.7500961908426318, + "flos": 737658233856.0, + "grad_norm": 0.06163823743918883, + "language_loss": 0.77877641, + "learning_rate": 0.00015502882579502953, + "loss": 0.78951514, + "num_input_tokens_seen": 323436688, + "router_z_loss_mlp": 0.2598877, + "step": 3899, + "time_per_iteration": 2.949995517730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074982, + "balance_loss_mlp": 1.04994845, + "epoch": 0.7502885725278954, + "flos": 533400256512.0, + "grad_norm": 0.062464860099035104, + "language_loss": 0.85077929, + "learning_rate": 0.00015480337841166592, + "loss": 0.86152911, + "num_input_tokens_seen": 323510032, + "router_z_loss_mlp": 0.25012207, + "step": 3900, + "time_per_iteration": 2.7779133319854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078827, + "balance_loss_mlp": 1.05378067, + "epoch": 0.7504809542131589, + "flos": 589324792320.0, + "grad_norm": 0.06586633865886998, + "language_loss": 0.82996714, + "learning_rate": 0.00015457806504551647, + "loss": 0.8407554, + "num_input_tokens_seen": 323588896, + "router_z_loss_mlp": 0.25061035, + "step": 3901, + "time_per_iteration": 2.8566529750823975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074374, + "balance_loss_mlp": 1.04922056, + "epoch": 0.7506733358984224, + "flos": 511550899200.0, + "grad_norm": 0.053967524095388235, + "language_loss": 0.78072977, + "learning_rate": 0.0001543528857840554, + "loss": 0.79147345, + "num_input_tokens_seen": 323661280, + "router_z_loss_mlp": 0.25158691, + "step": 3902, + "time_per_iteration": 2.6760079860687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107594, + "balance_loss_mlp": 1.05152607, + "epoch": 0.750865717583686, + "flos": 539268503040.0, + "grad_norm": 0.0598852080475998, + "language_loss": 0.80620217, + "learning_rate": 0.000154127840714705, + "loss": 0.81696159, + "num_input_tokens_seen": 323739200, + "router_z_loss_mlp": 0.24401855, + "step": 3903, + "time_per_iteration": 2.788379430770874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072506, + "balance_loss_mlp": 1.04635119, + "epoch": 0.7510580992689496, + "flos": 476578930176.0, + "grad_norm": 0.0690597284577383, + "language_loss": 0.82208622, + "learning_rate": 0.00015390292992483557, + "loss": 0.8328113, + "num_input_tokens_seen": 323802816, + "router_z_loss_mlp": 0.26184082, + "step": 3904, + "time_per_iteration": 2.507995128631592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010781, + "balance_loss_mlp": 1.05372167, + "epoch": 0.7512504809542132, + "flos": 579043800576.0, + "grad_norm": 0.057063892472999186, + "language_loss": 0.8453331, + "learning_rate": 0.00015367815350176523, + "loss": 0.85611403, + "num_input_tokens_seen": 323879488, + "router_z_loss_mlp": 0.24389648, + "step": 3905, + "time_per_iteration": 2.733604907989502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077145, + "balance_loss_mlp": 1.05211139, + "epoch": 0.7514428626394767, + "flos": 418660379136.0, + "grad_norm": 0.056222194479754704, + "language_loss": 0.82852668, + "learning_rate": 0.00015345351153275987, + "loss": 0.83929813, + "num_input_tokens_seen": 323944512, + "router_z_loss_mlp": 0.25048828, + "step": 3906, + "time_per_iteration": 2.5045523643493652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068307, + "balance_loss_mlp": 1.04364252, + "epoch": 0.7516352443247403, + "flos": 641039215104.0, + "grad_norm": 0.0670025336867701, + "language_loss": 0.80755925, + "learning_rate": 0.00015322900410503332, + "loss": 0.81824237, + "num_input_tokens_seen": 324020688, + "router_z_loss_mlp": 0.24645996, + "step": 3907, + "time_per_iteration": 2.7994320392608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072066, + "balance_loss_mlp": 1.04688847, + "epoch": 0.7518276260100039, + "flos": 580998168576.0, + "grad_norm": 0.05833722566179846, + "language_loss": 0.77270997, + "learning_rate": 0.00015300463130574703, + "loss": 0.78343064, + "num_input_tokens_seen": 324098080, + "router_z_loss_mlp": 0.2520752, + "step": 3908, + "time_per_iteration": 2.8524723052978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074857, + "balance_loss_mlp": 1.05007386, + "epoch": 0.7520200076952674, + "flos": 687342412800.0, + "grad_norm": 0.06750112030828431, + "language_loss": 0.8202616, + "learning_rate": 0.00015278039322201033, + "loss": 0.83101016, + "num_input_tokens_seen": 324183968, + "router_z_loss_mlp": 0.24780273, + "step": 3909, + "time_per_iteration": 2.9736523628234863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107108, + "balance_loss_mlp": 1.04605806, + "epoch": 0.7522123893805309, + "flos": 486439976448.0, + "grad_norm": 0.06973049488885559, + "language_loss": 0.79777265, + "learning_rate": 0.00015255628994088004, + "loss": 0.80848348, + "num_input_tokens_seen": 324249568, + "router_z_loss_mlp": 0.25012207, + "step": 3910, + "time_per_iteration": 2.5302295684814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071428, + "balance_loss_mlp": 1.04685879, + "epoch": 0.7524047710657945, + "flos": 818982586368.0, + "grad_norm": 0.06491426565594356, + "language_loss": 0.75382125, + "learning_rate": 0.00015233232154936082, + "loss": 0.76453555, + "num_input_tokens_seen": 324345312, + "router_z_loss_mlp": 0.24572754, + "step": 3911, + "time_per_iteration": 3.251619815826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078269, + "balance_loss_mlp": 1.05206633, + "epoch": 0.7525971527510581, + "flos": 699508763136.0, + "grad_norm": 0.0623404961346465, + "language_loss": 0.76721239, + "learning_rate": 0.0001521084881344048, + "loss": 0.77799511, + "num_input_tokens_seen": 324419056, + "router_z_loss_mlp": 0.26220703, + "step": 3912, + "time_per_iteration": 2.850635051727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075285, + "balance_loss_mlp": 1.05071616, + "epoch": 0.7527895344363217, + "flos": 633787421184.0, + "grad_norm": 0.05187339069994817, + "language_loss": 0.86498892, + "learning_rate": 0.00015188478978291208, + "loss": 0.87574184, + "num_input_tokens_seen": 324490848, + "router_z_loss_mlp": 0.24572754, + "step": 3913, + "time_per_iteration": 2.765442371368408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072293, + "balance_loss_mlp": 1.04759288, + "epoch": 0.7529819161215853, + "flos": 562830322176.0, + "grad_norm": 0.06241775193338078, + "language_loss": 0.86580771, + "learning_rate": 0.00015166122658173014, + "loss": 0.87653065, + "num_input_tokens_seen": 324565648, + "router_z_loss_mlp": 0.24682617, + "step": 3914, + "time_per_iteration": 2.7562687397003174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076326, + "balance_loss_mlp": 1.05007637, + "epoch": 0.7531742978068487, + "flos": 690665647104.0, + "grad_norm": 0.05387803011271429, + "language_loss": 0.88860059, + "learning_rate": 0.00015143779861765332, + "loss": 0.89936382, + "num_input_tokens_seen": 324642832, + "router_z_loss_mlp": 0.26257324, + "step": 3915, + "time_per_iteration": 2.932776927947998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107374, + "balance_loss_mlp": 1.04894459, + "epoch": 0.7533666794921123, + "flos": 681101208576.0, + "grad_norm": 0.057566889010823, + "language_loss": 0.81424505, + "learning_rate": 0.00015121450597742458, + "loss": 0.82498246, + "num_input_tokens_seen": 324718336, + "router_z_loss_mlp": 0.2479248, + "step": 3916, + "time_per_iteration": 2.854919672012329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078465, + "balance_loss_mlp": 1.05316877, + "epoch": 0.7535590611773759, + "flos": 623669414400.0, + "grad_norm": 0.07096809285669192, + "language_loss": 0.7879523, + "learning_rate": 0.00015099134874773369, + "loss": 0.79873693, + "num_input_tokens_seen": 324787744, + "router_z_loss_mlp": 0.25317383, + "step": 3917, + "time_per_iteration": 2.717822313308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072702, + "balance_loss_mlp": 1.04738212, + "epoch": 0.7537514428626395, + "flos": 519427842048.0, + "grad_norm": 0.05614014037376785, + "language_loss": 0.80481035, + "learning_rate": 0.00015076832701521793, + "loss": 0.81553745, + "num_input_tokens_seen": 324863280, + "router_z_loss_mlp": 0.25341797, + "step": 3918, + "time_per_iteration": 2.7440896034240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077078, + "balance_loss_mlp": 1.05145979, + "epoch": 0.753943824547903, + "flos": 723653512704.0, + "grad_norm": 0.07007735924828153, + "language_loss": 0.81983852, + "learning_rate": 0.000150545440866462, + "loss": 0.83060932, + "num_input_tokens_seen": 324949600, + "router_z_loss_mlp": 0.25646973, + "step": 3919, + "time_per_iteration": 3.0307159423828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080089, + "balance_loss_mlp": 1.05534124, + "epoch": 0.7541362062331666, + "flos": 437547350016.0, + "grad_norm": 0.06360208867996311, + "language_loss": 0.78682411, + "learning_rate": 0.000150322690387998, + "loss": 0.79762495, + "num_input_tokens_seen": 325013808, + "router_z_loss_mlp": 0.24755859, + "step": 3920, + "time_per_iteration": 2.4933719635009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075825, + "balance_loss_mlp": 1.05169666, + "epoch": 0.7543285879184302, + "flos": 565274018304.0, + "grad_norm": 0.07326690987324283, + "language_loss": 0.75561839, + "learning_rate": 0.00015010007566630535, + "loss": 0.76637661, + "num_input_tokens_seen": 325084832, + "router_z_loss_mlp": 0.24121094, + "step": 3921, + "time_per_iteration": 2.7614030838012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107956, + "balance_loss_mlp": 1.05487168, + "epoch": 0.7545209696036937, + "flos": 521036416512.0, + "grad_norm": 0.09124669942400691, + "language_loss": 0.81765956, + "learning_rate": 0.00014987759678781077, + "loss": 0.82845515, + "num_input_tokens_seen": 325155120, + "router_z_loss_mlp": 0.24707031, + "step": 3922, + "time_per_iteration": 2.6194660663604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107743, + "balance_loss_mlp": 1.0523603, + "epoch": 0.7547133512889573, + "flos": 616066684416.0, + "grad_norm": 0.07360679346061566, + "language_loss": 0.82340884, + "learning_rate": 0.00014965525383888795, + "loss": 0.83418316, + "num_input_tokens_seen": 325235632, + "router_z_loss_mlp": 0.25085449, + "step": 3923, + "time_per_iteration": 2.8085968494415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073002, + "balance_loss_mlp": 1.04881442, + "epoch": 0.7549057329742208, + "flos": 750845085696.0, + "grad_norm": 0.05494147017339954, + "language_loss": 0.72481954, + "learning_rate": 0.00014943304690585851, + "loss": 0.73554957, + "num_input_tokens_seen": 325309696, + "router_z_loss_mlp": 0.24182129, + "step": 3924, + "time_per_iteration": 2.9154560565948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079076, + "balance_loss_mlp": 1.0540781, + "epoch": 0.7550981146594844, + "flos": 514444276224.0, + "grad_norm": 0.07583618548945481, + "language_loss": 0.79405016, + "learning_rate": 0.0001492109760749908, + "loss": 0.80484092, + "num_input_tokens_seen": 325375744, + "router_z_loss_mlp": 0.25012207, + "step": 3925, + "time_per_iteration": 2.5836076736450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076807, + "balance_loss_mlp": 1.0515945, + "epoch": 0.755290496344748, + "flos": 522009930240.0, + "grad_norm": 0.05355436965428176, + "language_loss": 0.80110025, + "learning_rate": 0.00014898904143250002, + "loss": 0.81186831, + "num_input_tokens_seen": 325448384, + "router_z_loss_mlp": 0.25231934, + "step": 3926, + "time_per_iteration": 2.6505353450775146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024147, + "balance_loss_mlp": 1.01732779, + "epoch": 0.7554828780300116, + "flos": 1414615021056.0, + "grad_norm": 0.0157174231445921, + "language_loss": 0.75755203, + "learning_rate": 0.00014876724306454886, + "loss": 0.76779342, + "num_input_tokens_seen": 325678672, + "router_z_loss_mlp": 0.06835938, + "step": 3927, + "time_per_iteration": 4.953717470169067 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077692, + "balance_loss_mlp": 1.05381441, + "epoch": 0.7556752597152752, + "flos": 556937482752.0, + "grad_norm": 0.05489454471207153, + "language_loss": 0.80429578, + "learning_rate": 0.0001485455810572474, + "loss": 0.81507266, + "num_input_tokens_seen": 325746656, + "router_z_loss_mlp": 0.23864746, + "step": 3928, + "time_per_iteration": 2.637946844100952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077738, + "balance_loss_mlp": 1.05282295, + "epoch": 0.7558676414005386, + "flos": 563638279680.0, + "grad_norm": 0.058181996359435495, + "language_loss": 0.84069693, + "learning_rate": 0.00014832405549665236, + "loss": 0.85147429, + "num_input_tokens_seen": 325820304, + "router_z_loss_mlp": 0.24902344, + "step": 3929, + "time_per_iteration": 2.6932008266448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074105, + "balance_loss_mlp": 1.05033493, + "epoch": 0.7560600230858022, + "flos": 561377392128.0, + "grad_norm": 0.06320192227376603, + "language_loss": 0.78577268, + "learning_rate": 0.00014810266646876746, + "loss": 0.79651374, + "num_input_tokens_seen": 325895584, + "router_z_loss_mlp": 0.2376709, + "step": 3930, + "time_per_iteration": 2.7697536945343018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071439, + "balance_loss_mlp": 1.04683375, + "epoch": 0.7562524047710658, + "flos": 719576649216.0, + "grad_norm": 0.06814480820805115, + "language_loss": 0.77612817, + "learning_rate": 0.00014788141405954364, + "loss": 0.78684253, + "num_input_tokens_seen": 325976752, + "router_z_loss_mlp": 0.24633789, + "step": 3931, + "time_per_iteration": 2.979769468307495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073336, + "balance_loss_mlp": 1.04886281, + "epoch": 0.7564447864563294, + "flos": 543347937792.0, + "grad_norm": 0.059820147392813335, + "language_loss": 0.84867471, + "learning_rate": 0.00014766029835487865, + "loss": 0.85940808, + "num_input_tokens_seen": 326047152, + "router_z_loss_mlp": 0.24475098, + "step": 3932, + "time_per_iteration": 2.7333834171295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076892, + "balance_loss_mlp": 1.05246568, + "epoch": 0.7566371681415929, + "flos": 725805743616.0, + "grad_norm": 0.06432503649028948, + "language_loss": 0.79687858, + "learning_rate": 0.0001474393194406173, + "loss": 0.80764747, + "num_input_tokens_seen": 326119056, + "router_z_loss_mlp": 0.24438477, + "step": 3933, + "time_per_iteration": 2.896916627883911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072761, + "balance_loss_mlp": 1.04866862, + "epoch": 0.7568295498268565, + "flos": 576580280832.0, + "grad_norm": 0.05519381243728572, + "language_loss": 0.79627228, + "learning_rate": 0.00014721847740255112, + "loss": 0.80699992, + "num_input_tokens_seen": 326196736, + "router_z_loss_mlp": 0.24084473, + "step": 3934, + "time_per_iteration": 2.8888845443725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011104, + "balance_loss_mlp": 1.00419021, + "epoch": 0.75702193151212, + "flos": 1520059903488.0, + "grad_norm": 0.009067101269619127, + "language_loss": 0.73911923, + "learning_rate": 0.00014699777232641853, + "loss": 0.74923027, + "num_input_tokens_seen": 326404752, + "router_z_loss_mlp": 0.06933594, + "step": 3935, + "time_per_iteration": 4.645391941070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106868, + "balance_loss_mlp": 1.0436697, + "epoch": 0.7572143131973836, + "flos": 525471556608.0, + "grad_norm": 0.07237572770548766, + "language_loss": 0.78754729, + "learning_rate": 0.00014677720429790526, + "loss": 0.79823411, + "num_input_tokens_seen": 326472832, + "router_z_loss_mlp": 0.25, + "step": 3936, + "time_per_iteration": 2.588223457336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063945, + "balance_loss_mlp": 1.03916097, + "epoch": 0.7574066948826472, + "flos": 550738123776.0, + "grad_norm": 0.047485127857512396, + "language_loss": 0.84842449, + "learning_rate": 0.0001465567734026429, + "loss": 0.85906392, + "num_input_tokens_seen": 326546976, + "router_z_loss_mlp": 0.24804688, + "step": 3937, + "time_per_iteration": 2.733915090560913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066983, + "balance_loss_mlp": 1.04105449, + "epoch": 0.7575990765679107, + "flos": 395899176960.0, + "grad_norm": 0.08009981712231565, + "language_loss": 0.82548285, + "learning_rate": 0.00014633647972621034, + "loss": 0.83615267, + "num_input_tokens_seen": 326609296, + "router_z_loss_mlp": 0.25964355, + "step": 3938, + "time_per_iteration": 2.4831509590148926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066047, + "balance_loss_mlp": 1.04131114, + "epoch": 0.7577914582531743, + "flos": 585030615552.0, + "grad_norm": 0.049323859420558516, + "language_loss": 0.8679713, + "learning_rate": 0.00014611632335413354, + "loss": 0.87863177, + "num_input_tokens_seen": 326687168, + "router_z_loss_mlp": 0.24743652, + "step": 3939, + "time_per_iteration": 2.817972421646118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066166, + "balance_loss_mlp": 1.04209805, + "epoch": 0.7579838399384379, + "flos": 820979172864.0, + "grad_norm": 0.05378533672074644, + "language_loss": 0.82628143, + "learning_rate": 0.00014589630437188456, + "loss": 0.83694315, + "num_input_tokens_seen": 326777760, + "router_z_loss_mlp": 0.24047852, + "step": 3940, + "time_per_iteration": 3.1869349479675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069178, + "balance_loss_mlp": 1.04451323, + "epoch": 0.7581762216237015, + "flos": 443892441600.0, + "grad_norm": 0.0625352255464929, + "language_loss": 0.78564709, + "learning_rate": 0.00014567642286488253, + "loss": 0.79633886, + "num_input_tokens_seen": 326843952, + "router_z_loss_mlp": 0.2467041, + "step": 3941, + "time_per_iteration": 2.5982542037963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067881, + "balance_loss_mlp": 1.04189372, + "epoch": 0.7583686033089649, + "flos": 540886989312.0, + "grad_norm": 0.07448102100024, + "language_loss": 0.79396963, + "learning_rate": 0.00014545667891849258, + "loss": 0.8046484, + "num_input_tokens_seen": 326911296, + "router_z_loss_mlp": 0.26013184, + "step": 3942, + "time_per_iteration": 2.6813278198242188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066053, + "balance_loss_mlp": 1.04107857, + "epoch": 0.7585609849942285, + "flos": 522588091392.0, + "grad_norm": 0.05620268870521042, + "language_loss": 0.82649952, + "learning_rate": 0.00014523707261802733, + "loss": 0.83716011, + "num_input_tokens_seen": 326977776, + "router_z_loss_mlp": 0.24987793, + "step": 3943, + "time_per_iteration": 2.6405162811279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068268, + "balance_loss_mlp": 1.04263783, + "epoch": 0.7587533666794921, + "flos": 541860503040.0, + "grad_norm": 0.05791403818328359, + "language_loss": 0.8163532, + "learning_rate": 0.00014501760404874527, + "loss": 0.8270359, + "num_input_tokens_seen": 327050240, + "router_z_loss_mlp": 0.25634766, + "step": 3944, + "time_per_iteration": 2.722963809967041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106607, + "balance_loss_mlp": 1.04108405, + "epoch": 0.7589457483647557, + "flos": 606408270336.0, + "grad_norm": 0.06238439989518053, + "language_loss": 0.86068374, + "learning_rate": 0.00014479827329585176, + "loss": 0.87134445, + "num_input_tokens_seen": 327119952, + "router_z_loss_mlp": 0.24963379, + "step": 3945, + "time_per_iteration": 2.7014224529266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068448, + "balance_loss_mlp": 1.04362893, + "epoch": 0.7591381300500193, + "flos": 555106452480.0, + "grad_norm": 0.04867252918796388, + "language_loss": 0.8493138, + "learning_rate": 0.00014457908044449846, + "loss": 0.85999829, + "num_input_tokens_seen": 327192640, + "router_z_loss_mlp": 0.24816895, + "step": 3946, + "time_per_iteration": 2.7054529190063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106665, + "balance_loss_mlp": 1.04214025, + "epoch": 0.7593305117352828, + "flos": 529681669632.0, + "grad_norm": 0.06710425705469547, + "language_loss": 0.83130479, + "learning_rate": 0.00014436002557978371, + "loss": 0.84197128, + "num_input_tokens_seen": 327271008, + "router_z_loss_mlp": 0.24511719, + "step": 3947, + "time_per_iteration": 2.788025379180908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004209, + "balance_loss_mlp": 0.99724722, + "epoch": 0.7595228934205464, + "flos": 1502798759424.0, + "grad_norm": 0.014479305235322698, + "language_loss": 0.76643145, + "learning_rate": 0.00014414110878675201, + "loss": 0.77647352, + "num_input_tokens_seen": 327505392, + "router_z_loss_mlp": 0.06982422, + "step": 3948, + "time_per_iteration": 4.901083946228027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067183, + "balance_loss_mlp": 1.0420419, + "epoch": 0.7597152751058099, + "flos": 455525047296.0, + "grad_norm": 0.05243549460506514, + "language_loss": 0.79874659, + "learning_rate": 0.0001439223301503945, + "loss": 0.80941838, + "num_input_tokens_seen": 327569392, + "router_z_loss_mlp": 0.25146484, + "step": 3949, + "time_per_iteration": 2.538907527923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072288, + "balance_loss_mlp": 1.04793382, + "epoch": 0.7599076567910735, + "flos": 685466966016.0, + "grad_norm": 0.10938231230046584, + "language_loss": 0.76564628, + "learning_rate": 0.00014370368975564834, + "loss": 0.77636915, + "num_input_tokens_seen": 327648304, + "router_z_loss_mlp": 0.24353027, + "step": 3950, + "time_per_iteration": 2.9170916080474854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072617, + "balance_loss_mlp": 1.04752314, + "epoch": 0.760100038476337, + "flos": 532372414464.0, + "grad_norm": 0.06203753703543282, + "language_loss": 0.83670735, + "learning_rate": 0.00014348518768739766, + "loss": 0.84743357, + "num_input_tokens_seen": 327725600, + "router_z_loss_mlp": 0.25109863, + "step": 3951, + "time_per_iteration": 2.730717897415161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01002273, + "balance_loss_mlp": 0.99526346, + "epoch": 0.7602924201616006, + "flos": 1471742866944.0, + "grad_norm": 0.013476999765998546, + "language_loss": 0.7672804, + "learning_rate": 0.00014326682403047243, + "loss": 0.7773031, + "num_input_tokens_seen": 327954048, + "router_z_loss_mlp": 0.0703125, + "step": 3952, + "time_per_iteration": 4.813526391983032 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067232, + "balance_loss_mlp": 1.04273486, + "epoch": 0.7604848018468642, + "flos": 774631558656.0, + "grad_norm": 0.09216142296418942, + "language_loss": 0.86414772, + "learning_rate": 0.00014304859886964867, + "loss": 0.87482005, + "num_input_tokens_seen": 328034656, + "router_z_loss_mlp": 0.24487305, + "step": 3953, + "time_per_iteration": 2.9926557540893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068844, + "balance_loss_mlp": 1.0442636, + "epoch": 0.7606771835321278, + "flos": 558185209344.0, + "grad_norm": 0.05133749222292773, + "language_loss": 0.83801866, + "learning_rate": 0.00014283051228964878, + "loss": 0.84870708, + "num_input_tokens_seen": 328107264, + "router_z_loss_mlp": 0.24572754, + "step": 3954, + "time_per_iteration": 2.68623423576355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068293, + "balance_loss_mlp": 1.0438199, + "epoch": 0.7608695652173914, + "flos": 525397404672.0, + "grad_norm": 0.07227710596047977, + "language_loss": 0.82760596, + "learning_rate": 0.00014261256437514197, + "loss": 0.8382889, + "num_input_tokens_seen": 328177168, + "router_z_loss_mlp": 0.24462891, + "step": 3955, + "time_per_iteration": 2.664646625518799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067041, + "balance_loss_mlp": 1.04321122, + "epoch": 0.7610619469026548, + "flos": 615038842368.0, + "grad_norm": 0.06297577079801352, + "language_loss": 0.82699019, + "learning_rate": 0.0001423947552107428, + "loss": 0.83766061, + "num_input_tokens_seen": 328245360, + "router_z_loss_mlp": 0.23815918, + "step": 3956, + "time_per_iteration": 2.705461263656616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106862, + "balance_loss_mlp": 1.04420578, + "epoch": 0.7612543285879184, + "flos": 863356382208.0, + "grad_norm": 0.06155196103872169, + "language_loss": 0.77009457, + "learning_rate": 0.00014217708488101243, + "loss": 0.78078079, + "num_input_tokens_seen": 328326560, + "router_z_loss_mlp": 0.24389648, + "step": 3957, + "time_per_iteration": 3.0419583320617676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069471, + "balance_loss_mlp": 1.04394794, + "epoch": 0.761446710273182, + "flos": 553658664960.0, + "grad_norm": 0.08526954862375616, + "language_loss": 0.77756625, + "learning_rate": 0.0001419595534704579, + "loss": 0.78826094, + "num_input_tokens_seen": 328395760, + "router_z_loss_mlp": 0.25537109, + "step": 3958, + "time_per_iteration": 2.6491963863372803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068304, + "balance_loss_mlp": 1.04386628, + "epoch": 0.7616390919584456, + "flos": 467350373376.0, + "grad_norm": 0.05412065824000071, + "language_loss": 0.81526375, + "learning_rate": 0.00014174216106353237, + "loss": 0.82594681, + "num_input_tokens_seen": 328464560, + "router_z_loss_mlp": 0.24438477, + "step": 3959, + "time_per_iteration": 2.6313867568969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067389, + "balance_loss_mlp": 1.04217625, + "epoch": 0.7618314736437091, + "flos": 498430858752.0, + "grad_norm": 0.06465573226743382, + "language_loss": 0.76379991, + "learning_rate": 0.00014152490774463512, + "loss": 0.77447385, + "num_input_tokens_seen": 328532640, + "router_z_loss_mlp": 0.25231934, + "step": 3960, + "time_per_iteration": 2.5999507904052734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067184, + "balance_loss_mlp": 1.04220998, + "epoch": 0.7620238553289727, + "flos": 434545316352.0, + "grad_norm": 0.07999326850245969, + "language_loss": 0.87070781, + "learning_rate": 0.00014130779359811135, + "loss": 0.8813796, + "num_input_tokens_seen": 328595392, + "router_z_loss_mlp": 0.24963379, + "step": 3961, + "time_per_iteration": 2.470813512802124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067267, + "balance_loss_mlp": 1.04288888, + "epoch": 0.7622162370142362, + "flos": 664277262336.0, + "grad_norm": 0.05710380569291167, + "language_loss": 0.8618769, + "learning_rate": 0.0001410908187082521, + "loss": 0.87254959, + "num_input_tokens_seen": 328676368, + "router_z_loss_mlp": 0.24365234, + "step": 3962, + "time_per_iteration": 2.8664379119873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068922, + "balance_loss_mlp": 1.04251671, + "epoch": 0.7624086186994998, + "flos": 557965324800.0, + "grad_norm": 0.06132823926479849, + "language_loss": 0.83309317, + "learning_rate": 0.0001408739831592949, + "loss": 0.84378237, + "num_input_tokens_seen": 328745136, + "router_z_loss_mlp": 0.26416016, + "step": 3963, + "time_per_iteration": 2.6825127601623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066723, + "balance_loss_mlp": 1.04126, + "epoch": 0.7626010003847634, + "flos": 629132396544.0, + "grad_norm": 0.061542532553496905, + "language_loss": 0.7740978, + "learning_rate": 0.0001406572870354224, + "loss": 0.78476501, + "num_input_tokens_seen": 328820384, + "router_z_loss_mlp": 0.25488281, + "step": 3964, + "time_per_iteration": 2.8066251277923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068698, + "balance_loss_mlp": 1.0439024, + "epoch": 0.7627933820700269, + "flos": 437942702592.0, + "grad_norm": 0.0743228593837952, + "language_loss": 0.8726244, + "learning_rate": 0.00014044073042076337, + "loss": 0.88331133, + "num_input_tokens_seen": 328884976, + "router_z_loss_mlp": 0.24804688, + "step": 3965, + "time_per_iteration": 2.56150484085083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063933, + "balance_loss_mlp": 1.04018617, + "epoch": 0.7629857637552905, + "flos": 532723350528.0, + "grad_norm": 0.05421398369924913, + "language_loss": 0.89094937, + "learning_rate": 0.00014022431339939302, + "loss": 0.90158874, + "num_input_tokens_seen": 328957792, + "router_z_loss_mlp": 0.23730469, + "step": 3966, + "time_per_iteration": 2.655383586883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062991, + "balance_loss_mlp": 1.03745639, + "epoch": 0.7631781454405541, + "flos": 680036290560.0, + "grad_norm": 0.06770239365131947, + "language_loss": 0.78292239, + "learning_rate": 0.00014000803605533163, + "loss": 0.79355228, + "num_input_tokens_seen": 329034960, + "router_z_loss_mlp": 0.25537109, + "step": 3967, + "time_per_iteration": 2.7987117767333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064596, + "balance_loss_mlp": 1.04112363, + "epoch": 0.7633705271258177, + "flos": 507493859328.0, + "grad_norm": 0.07669794307495341, + "language_loss": 0.83868659, + "learning_rate": 0.00013979189847254553, + "loss": 0.84933251, + "num_input_tokens_seen": 329100848, + "router_z_loss_mlp": 0.23474121, + "step": 3968, + "time_per_iteration": 2.5726282596588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068744, + "balance_loss_mlp": 1.04398394, + "epoch": 0.7635629088110811, + "flos": 618866085888.0, + "grad_norm": 0.06422002731628916, + "language_loss": 0.80682731, + "learning_rate": 0.00013957590073494674, + "loss": 0.81751466, + "num_input_tokens_seen": 329181120, + "router_z_loss_mlp": 0.24780273, + "step": 3969, + "time_per_iteration": 2.7832956314086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063668, + "balance_loss_mlp": 1.03938496, + "epoch": 0.7637552904963447, + "flos": 638425193472.0, + "grad_norm": 0.0668838144354532, + "language_loss": 0.79043007, + "learning_rate": 0.0001393600429263931, + "loss": 0.80106676, + "num_input_tokens_seen": 329249888, + "router_z_loss_mlp": 0.24267578, + "step": 3970, + "time_per_iteration": 2.7605960369110107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01025346, + "balance_loss_mlp": 1.01881361, + "epoch": 0.7639476721816083, + "flos": 1563222302208.0, + "grad_norm": 0.01539224673333176, + "language_loss": 0.74744886, + "learning_rate": 0.00013914432513068792, + "loss": 0.75770235, + "num_input_tokens_seen": 329483824, + "router_z_loss_mlp": 0.06542969, + "step": 3971, + "time_per_iteration": 4.931908369064331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062427, + "balance_loss_mlp": 1.03795385, + "epoch": 0.7641400538668719, + "flos": 495987162624.0, + "grad_norm": 0.05820803040195091, + "language_loss": 0.81583369, + "learning_rate": 0.0001389287474315804, + "loss": 0.82645798, + "num_input_tokens_seen": 329553536, + "router_z_loss_mlp": 0.24450684, + "step": 3972, + "time_per_iteration": 2.6463029384613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106303, + "balance_loss_mlp": 1.03905725, + "epoch": 0.7643324355521355, + "flos": 578441046528.0, + "grad_norm": 0.06630733211536807, + "language_loss": 0.8009305, + "learning_rate": 0.00013871330991276505, + "loss": 0.81156087, + "num_input_tokens_seen": 329621856, + "router_z_loss_mlp": 0.23950195, + "step": 3973, + "time_per_iteration": 2.706376791000366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066044, + "balance_loss_mlp": 1.04109335, + "epoch": 0.764524817237399, + "flos": 784823717376.0, + "grad_norm": 0.09714932843218141, + "language_loss": 0.80939794, + "learning_rate": 0.00013849801265788247, + "loss": 0.82005835, + "num_input_tokens_seen": 329708192, + "router_z_loss_mlp": 0.24938965, + "step": 3974, + "time_per_iteration": 2.9903647899627686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068526, + "balance_loss_mlp": 1.04320633, + "epoch": 0.7647171989226625, + "flos": 526279514112.0, + "grad_norm": 0.0619121183931823, + "language_loss": 0.83072186, + "learning_rate": 0.00013828285575051818, + "loss": 0.84140712, + "num_input_tokens_seen": 329774704, + "router_z_loss_mlp": 0.25354004, + "step": 3975, + "time_per_iteration": 2.6031386852264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067956, + "balance_loss_mlp": 1.04345858, + "epoch": 0.7649095806079261, + "flos": 554876656128.0, + "grad_norm": 0.057910960280581285, + "language_loss": 0.84483737, + "learning_rate": 0.0001380678392742035, + "loss": 0.85551691, + "num_input_tokens_seen": 329846432, + "router_z_loss_mlp": 0.24499512, + "step": 3976, + "time_per_iteration": 2.7115118503570557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064756, + "balance_loss_mlp": 1.03948343, + "epoch": 0.7651019622931897, + "flos": 649145954304.0, + "grad_norm": 0.05528375504555246, + "language_loss": 0.8460198, + "learning_rate": 0.00013785296331241526, + "loss": 0.85666734, + "num_input_tokens_seen": 329926336, + "router_z_loss_mlp": 0.25292969, + "step": 3977, + "time_per_iteration": 2.907803535461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065245, + "balance_loss_mlp": 1.04070044, + "epoch": 0.7652943439784533, + "flos": 1046449248768.0, + "grad_norm": 0.06111847190326819, + "language_loss": 0.87681317, + "learning_rate": 0.00013763822794857583, + "loss": 0.8874656, + "num_input_tokens_seen": 330009536, + "router_z_loss_mlp": 0.24560547, + "step": 3978, + "time_per_iteration": 3.321633815765381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068675, + "balance_loss_mlp": 1.04451132, + "epoch": 0.7654867256637168, + "flos": 504350862336.0, + "grad_norm": 0.06052146133272544, + "language_loss": 0.89868426, + "learning_rate": 0.00013742363326605278, + "loss": 0.90937102, + "num_input_tokens_seen": 330083264, + "router_z_loss_mlp": 0.24145508, + "step": 3979, + "time_per_iteration": 2.6986289024353027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069543, + "balance_loss_mlp": 1.04461646, + "epoch": 0.7656791073489804, + "flos": 574709976576.0, + "grad_norm": 0.05317029267567304, + "language_loss": 0.78481579, + "learning_rate": 0.00013720917934815935, + "loss": 0.79551125, + "num_input_tokens_seen": 330157120, + "router_z_loss_mlp": 0.24938965, + "step": 3980, + "time_per_iteration": 2.7501296997070312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067407, + "balance_loss_mlp": 1.0423131, + "epoch": 0.765871489034244, + "flos": 492812232192.0, + "grad_norm": 0.07789477911070539, + "language_loss": 0.83389938, + "learning_rate": 0.00013699486627815344, + "loss": 0.84457338, + "num_input_tokens_seen": 330224560, + "router_z_loss_mlp": 0.25109863, + "step": 3981, + "time_per_iteration": 2.6420035362243652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071024, + "balance_loss_mlp": 1.04684854, + "epoch": 0.7660638707195075, + "flos": 486024800256.0, + "grad_norm": 0.05908161503025523, + "language_loss": 0.82459986, + "learning_rate": 0.00013678069413923928, + "loss": 0.8353101, + "num_input_tokens_seen": 330292000, + "router_z_loss_mlp": 0.24169922, + "step": 3982, + "time_per_iteration": 2.6923530101776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069033, + "balance_loss_mlp": 1.04438031, + "epoch": 0.766256252404771, + "flos": 444295134720.0, + "grad_norm": 0.05955764603352247, + "language_loss": 0.82175618, + "learning_rate": 0.00013656666301456555, + "loss": 0.83244646, + "num_input_tokens_seen": 330357472, + "router_z_loss_mlp": 0.24658203, + "step": 3983, + "time_per_iteration": 2.507713794708252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070412, + "balance_loss_mlp": 1.04577184, + "epoch": 0.7664486340900346, + "flos": 485179766784.0, + "grad_norm": 0.05383529418711491, + "language_loss": 0.84338224, + "learning_rate": 0.0001363527729872267, + "loss": 0.8540864, + "num_input_tokens_seen": 330427792, + "router_z_loss_mlp": 0.24633789, + "step": 3984, + "time_per_iteration": 2.6385269165039062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074302, + "balance_loss_mlp": 1.04956603, + "epoch": 0.7666410157752982, + "flos": 646200820224.0, + "grad_norm": 0.06494738707995135, + "language_loss": 0.76830447, + "learning_rate": 0.00013613902414026207, + "loss": 0.77904749, + "num_input_tokens_seen": 330500320, + "router_z_loss_mlp": 0.24755859, + "step": 3985, + "time_per_iteration": 2.782332420349121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071114, + "balance_loss_mlp": 1.04673553, + "epoch": 0.7668333974605618, + "flos": 774303017472.0, + "grad_norm": 0.062182975481338824, + "language_loss": 0.82354724, + "learning_rate": 0.00013592541655665642, + "loss": 0.83425832, + "num_input_tokens_seen": 330581696, + "router_z_loss_mlp": 0.24389648, + "step": 3986, + "time_per_iteration": 2.9702792167663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072457, + "balance_loss_mlp": 1.04731619, + "epoch": 0.7670257791458254, + "flos": 613462574592.0, + "grad_norm": 0.06854938145673521, + "language_loss": 0.85176432, + "learning_rate": 0.00013571195031933947, + "loss": 0.86248893, + "num_input_tokens_seen": 330648000, + "router_z_loss_mlp": 0.25134277, + "step": 3987, + "time_per_iteration": 2.7414095401763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017121, + "balance_loss_mlp": 1.01035035, + "epoch": 0.7672181608310888, + "flos": 1485357378048.0, + "grad_norm": 0.007346318890350203, + "language_loss": 0.80481339, + "learning_rate": 0.00013549862551118626, + "loss": 0.81498468, + "num_input_tokens_seen": 330873872, + "router_z_loss_mlp": 0.06787109, + "step": 3988, + "time_per_iteration": 4.668884515762329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070415, + "balance_loss_mlp": 1.04586959, + "epoch": 0.7674105425163524, + "flos": 610732182528.0, + "grad_norm": 0.05768495800947346, + "language_loss": 0.85781401, + "learning_rate": 0.00013528544221501655, + "loss": 0.86851817, + "num_input_tokens_seen": 330945760, + "router_z_loss_mlp": 0.2454834, + "step": 3989, + "time_per_iteration": 2.7101733684539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082587, + "balance_loss_mlp": 1.05741, + "epoch": 0.767602924201616, + "flos": 845205788160.0, + "grad_norm": 0.055358014362887016, + "language_loss": 0.81702012, + "learning_rate": 0.00013507240051359586, + "loss": 0.82784599, + "num_input_tokens_seen": 331025584, + "router_z_loss_mlp": 0.25195312, + "step": 3990, + "time_per_iteration": 3.045398235321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076149, + "balance_loss_mlp": 1.05154467, + "epoch": 0.7677953058868796, + "flos": 527114635776.0, + "grad_norm": 0.06163508204720818, + "language_loss": 0.86476028, + "learning_rate": 0.00013485950048963425, + "loss": 0.87552178, + "num_input_tokens_seen": 331093008, + "router_z_loss_mlp": 0.24597168, + "step": 3991, + "time_per_iteration": 2.6160435676574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070646, + "balance_loss_mlp": 1.04651809, + "epoch": 0.7679876875721431, + "flos": 923550501888.0, + "grad_norm": 0.06405846615426129, + "language_loss": 0.83226049, + "learning_rate": 0.00013464674222578643, + "loss": 0.84296697, + "num_input_tokens_seen": 331177120, + "router_z_loss_mlp": 0.24133301, + "step": 3992, + "time_per_iteration": 3.241417407989502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075856, + "balance_loss_mlp": 1.05133498, + "epoch": 0.7681800692574067, + "flos": 458087311872.0, + "grad_norm": 0.0861576715386882, + "language_loss": 0.83366966, + "learning_rate": 0.00013443412580465292, + "loss": 0.84442818, + "num_input_tokens_seen": 331245424, + "router_z_loss_mlp": 0.24523926, + "step": 3993, + "time_per_iteration": 4.034927606582642 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077486, + "balance_loss_mlp": 1.05234468, + "epoch": 0.7683724509426703, + "flos": 658436179968.0, + "grad_norm": 0.06539433729122356, + "language_loss": 0.84409565, + "learning_rate": 0.00013422165130877857, + "loss": 0.8548705, + "num_input_tokens_seen": 331327504, + "router_z_loss_mlp": 0.25146484, + "step": 3994, + "time_per_iteration": 2.899876356124878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076399, + "balance_loss_mlp": 1.05162692, + "epoch": 0.7685648326279338, + "flos": 555284491776.0, + "grad_norm": 0.06486497816335288, + "language_loss": 0.80478024, + "learning_rate": 0.00013400931882065327, + "loss": 0.81554425, + "num_input_tokens_seen": 331398464, + "router_z_loss_mlp": 0.24755859, + "step": 3995, + "time_per_iteration": 2.637848138809204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075008, + "balance_loss_mlp": 1.04965222, + "epoch": 0.7687572143131974, + "flos": 687404081664.0, + "grad_norm": 0.05862406149444633, + "language_loss": 0.80990946, + "learning_rate": 0.0001337971284227118, + "loss": 0.82065952, + "num_input_tokens_seen": 331484592, + "router_z_loss_mlp": 0.25378418, + "step": 3996, + "time_per_iteration": 2.996047258377075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014574, + "balance_loss_mlp": 1.00766027, + "epoch": 0.7689495959984609, + "flos": 1489453691904.0, + "grad_norm": 0.0056434488295698, + "language_loss": 0.76118422, + "learning_rate": 0.00013358508019733388, + "loss": 0.77132994, + "num_input_tokens_seen": 331721360, + "router_z_loss_mlp": 0.06933594, + "step": 3997, + "time_per_iteration": 4.898136854171753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079533, + "balance_loss_mlp": 1.05454707, + "epoch": 0.7691419776837245, + "flos": 570405888000.0, + "grad_norm": 0.05325095394191187, + "language_loss": 0.8044312, + "learning_rate": 0.0001333731742268438, + "loss": 0.81522655, + "num_input_tokens_seen": 331794240, + "router_z_loss_mlp": 0.24963379, + "step": 3998, + "time_per_iteration": 2.682598352432251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074014, + "balance_loss_mlp": 1.0484314, + "epoch": 0.7693343593689881, + "flos": 520087495680.0, + "grad_norm": 0.05950995381117831, + "language_loss": 0.85969436, + "learning_rate": 0.0001331614105935109, + "loss": 0.87043446, + "num_input_tokens_seen": 331866496, + "router_z_loss_mlp": 0.25598145, + "step": 3999, + "time_per_iteration": 2.6892640590667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074883, + "balance_loss_mlp": 1.04993236, + "epoch": 0.7695267410542517, + "flos": 660378438144.0, + "grad_norm": 0.05504712261606648, + "language_loss": 0.84570682, + "learning_rate": 0.00013294978937954883, + "loss": 0.85645556, + "num_input_tokens_seen": 331936592, + "router_z_loss_mlp": 0.24963379, + "step": 4000, + "time_per_iteration": 2.7923429012298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069985, + "balance_loss_mlp": 1.04480815, + "epoch": 0.7697191227395151, + "flos": 546809564160.0, + "grad_norm": 0.06369425747465635, + "language_loss": 0.85450065, + "learning_rate": 0.00013273831066711655, + "loss": 0.86520052, + "num_input_tokens_seen": 332003536, + "router_z_loss_mlp": 0.25195312, + "step": 4001, + "time_per_iteration": 2.604282855987549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078533, + "balance_loss_mlp": 1.05357099, + "epoch": 0.7699115044247787, + "flos": 540610205184.0, + "grad_norm": 0.05227266936279928, + "language_loss": 0.80363703, + "learning_rate": 0.00013252697453831747, + "loss": 0.81442237, + "num_input_tokens_seen": 332075248, + "router_z_loss_mlp": 0.24975586, + "step": 4002, + "time_per_iteration": 2.7329213619232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075879, + "balance_loss_mlp": 1.04984355, + "epoch": 0.7701038861100423, + "flos": 562936407552.0, + "grad_norm": 0.05410166650315036, + "language_loss": 0.82619524, + "learning_rate": 0.00013231578107519916, + "loss": 0.836954, + "num_input_tokens_seen": 332158944, + "router_z_loss_mlp": 0.26037598, + "step": 4003, + "time_per_iteration": 2.8749208450317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070306, + "balance_loss_mlp": 1.04640484, + "epoch": 0.7702962677953059, + "flos": 481737964032.0, + "grad_norm": 0.06212967389396702, + "language_loss": 0.82997644, + "learning_rate": 0.00013210473035975422, + "loss": 0.84067953, + "num_input_tokens_seen": 332226368, + "router_z_loss_mlp": 0.23901367, + "step": 4004, + "time_per_iteration": 2.632235288619995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073797, + "balance_loss_mlp": 1.04878688, + "epoch": 0.7704886494805695, + "flos": 770389138944.0, + "grad_norm": 0.08127476835150631, + "language_loss": 0.85770059, + "learning_rate": 0.0001318938224739201, + "loss": 0.8684386, + "num_input_tokens_seen": 332314784, + "router_z_loss_mlp": 0.25012207, + "step": 4005, + "time_per_iteration": 3.1193981170654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069411, + "balance_loss_mlp": 1.04423416, + "epoch": 0.770681031165833, + "flos": 601192336896.0, + "grad_norm": 0.06450494228742032, + "language_loss": 0.84133303, + "learning_rate": 0.00013168305749957843, + "loss": 0.85202718, + "num_input_tokens_seen": 332387952, + "router_z_loss_mlp": 0.25183105, + "step": 4006, + "time_per_iteration": 2.7679009437561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066176, + "balance_loss_mlp": 1.04140389, + "epoch": 0.7708734128510966, + "flos": 496108302336.0, + "grad_norm": 0.05098549472423512, + "language_loss": 0.82851386, + "learning_rate": 0.00013147243551855532, + "loss": 0.83917558, + "num_input_tokens_seen": 332456352, + "router_z_loss_mlp": 0.24768066, + "step": 4007, + "time_per_iteration": 2.56661057472229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106967, + "balance_loss_mlp": 1.04553032, + "epoch": 0.7710657945363601, + "flos": 567299966976.0, + "grad_norm": 0.048065096858605744, + "language_loss": 0.80720365, + "learning_rate": 0.00013126195661262148, + "loss": 0.81790042, + "num_input_tokens_seen": 332534288, + "router_z_loss_mlp": 0.24133301, + "step": 4008, + "time_per_iteration": 2.748946189880371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073116, + "balance_loss_mlp": 1.04871428, + "epoch": 0.7712581762216237, + "flos": 604550075904.0, + "grad_norm": 0.05225893259169981, + "language_loss": 0.86849338, + "learning_rate": 0.00013105162086349216, + "loss": 0.87922454, + "num_input_tokens_seen": 332615440, + "router_z_loss_mlp": 0.24389648, + "step": 4009, + "time_per_iteration": 2.8441760540008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075731, + "balance_loss_mlp": 1.05036318, + "epoch": 0.7714505579068872, + "flos": 530894891520.0, + "grad_norm": 0.05592364071179798, + "language_loss": 0.86026949, + "learning_rate": 0.00013084142835282687, + "loss": 0.87102675, + "num_input_tokens_seen": 332687360, + "router_z_loss_mlp": 0.25390625, + "step": 4010, + "time_per_iteration": 2.6637930870056152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018937, + "balance_loss_mlp": 1.01207089, + "epoch": 0.7716429395921508, + "flos": 1422205267968.0, + "grad_norm": 0.009139726311940191, + "language_loss": 0.79884362, + "learning_rate": 0.00013063137916222956, + "loss": 0.80903304, + "num_input_tokens_seen": 332919936, + "router_z_loss_mlp": 0.06884766, + "step": 4011, + "time_per_iteration": 4.862056255340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071968, + "balance_loss_mlp": 1.04698229, + "epoch": 0.7718353212774144, + "flos": 578428563456.0, + "grad_norm": 0.059120136224188345, + "language_loss": 0.89225733, + "learning_rate": 0.0001304214733732485, + "loss": 0.90297705, + "num_input_tokens_seen": 332990096, + "router_z_loss_mlp": 0.24987793, + "step": 4012, + "time_per_iteration": 2.759030818939209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073689, + "balance_loss_mlp": 1.04848814, + "epoch": 0.772027702962678, + "flos": 510742941696.0, + "grad_norm": 0.06471041730678638, + "language_loss": 0.82730675, + "learning_rate": 0.00013021171106737672, + "loss": 0.83804369, + "num_input_tokens_seen": 333063616, + "router_z_loss_mlp": 0.25219727, + "step": 4013, + "time_per_iteration": 2.6606547832489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076087, + "balance_loss_mlp": 1.05184019, + "epoch": 0.7722200846479416, + "flos": 525661705728.0, + "grad_norm": 0.04900066226128831, + "language_loss": 0.79908812, + "learning_rate": 0.00013000209232605071, + "loss": 0.80984896, + "num_input_tokens_seen": 333136368, + "router_z_loss_mlp": 0.24230957, + "step": 4014, + "time_per_iteration": 2.665905237197876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078315, + "balance_loss_mlp": 1.05282855, + "epoch": 0.772412466333205, + "flos": 479598216192.0, + "grad_norm": 0.06112285511526787, + "language_loss": 0.79969144, + "learning_rate": 0.0001297926172306519, + "loss": 0.81047451, + "num_input_tokens_seen": 333207136, + "router_z_loss_mlp": 0.25500488, + "step": 4015, + "time_per_iteration": 2.657850503921509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070931, + "balance_loss_mlp": 1.04582596, + "epoch": 0.7726048480184686, + "flos": 905688801792.0, + "grad_norm": 0.05237005996318212, + "language_loss": 0.79076529, + "learning_rate": 0.0001295832858625055, + "loss": 0.80147457, + "num_input_tokens_seen": 333291920, + "router_z_loss_mlp": 0.25097656, + "step": 4016, + "time_per_iteration": 3.2558414936065674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073374, + "balance_loss_mlp": 1.04854274, + "epoch": 0.7727972297037322, + "flos": 631380801024.0, + "grad_norm": 0.05315298112926871, + "language_loss": 0.70125198, + "learning_rate": 0.00012937409830288154, + "loss": 0.71198577, + "num_input_tokens_seen": 333369824, + "router_z_loss_mlp": 0.24841309, + "step": 4017, + "time_per_iteration": 2.8071141242980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075792, + "balance_loss_mlp": 1.05123496, + "epoch": 0.7729896113889958, + "flos": 414786147840.0, + "grad_norm": 0.08174629048590813, + "language_loss": 0.85300404, + "learning_rate": 0.00012916505463299362, + "loss": 0.86376196, + "num_input_tokens_seen": 333434192, + "router_z_loss_mlp": 0.24572754, + "step": 4018, + "time_per_iteration": 2.521777868270874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107567, + "balance_loss_mlp": 1.05085087, + "epoch": 0.7731819930742593, + "flos": 668907694080.0, + "grad_norm": 0.06122283434294953, + "language_loss": 0.78065503, + "learning_rate": 0.00012895615493399972, + "loss": 0.7914117, + "num_input_tokens_seen": 333509696, + "router_z_loss_mlp": 0.24816895, + "step": 4019, + "time_per_iteration": 2.843815326690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077372, + "balance_loss_mlp": 1.05187333, + "epoch": 0.7733743747595229, + "flos": 489854615040.0, + "grad_norm": 0.06855885620407597, + "language_loss": 0.82437384, + "learning_rate": 0.00012874739928700192, + "loss": 0.83514762, + "num_input_tokens_seen": 333575184, + "router_z_loss_mlp": 0.25512695, + "step": 4020, + "time_per_iteration": 2.625136613845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074703, + "balance_loss_mlp": 1.049371, + "epoch": 0.7735667564447865, + "flos": 659612325888.0, + "grad_norm": 0.06838381213825387, + "language_loss": 0.79939824, + "learning_rate": 0.00012853878777304624, + "loss": 0.81014526, + "num_input_tokens_seen": 333651568, + "router_z_loss_mlp": 0.25341797, + "step": 4021, + "time_per_iteration": 2.870577335357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078589, + "balance_loss_mlp": 1.05390024, + "epoch": 0.77375913813005, + "flos": 533383004160.0, + "grad_norm": 0.05110075490537154, + "language_loss": 0.84490621, + "learning_rate": 0.000128330320473123, + "loss": 0.85569209, + "num_input_tokens_seen": 333726400, + "router_z_loss_mlp": 0.24694824, + "step": 4022, + "time_per_iteration": 2.6740787029266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016568, + "balance_loss_mlp": 1.0095588, + "epoch": 0.7739515198153136, + "flos": 1520081925120.0, + "grad_norm": 0.007922822581460296, + "language_loss": 0.783319, + "learning_rate": 0.00012812199746816628, + "loss": 0.79348469, + "num_input_tokens_seen": 333960224, + "router_z_loss_mlp": 0.0703125, + "step": 4023, + "time_per_iteration": 4.92633318901062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079759, + "balance_loss_mlp": 1.05466557, + "epoch": 0.7741439015005771, + "flos": 640105348608.0, + "grad_norm": 0.0679194814270071, + "language_loss": 0.81913227, + "learning_rate": 0.0001279138188390543, + "loss": 0.82992983, + "num_input_tokens_seen": 334033904, + "router_z_loss_mlp": 0.25109863, + "step": 4024, + "time_per_iteration": 2.822410821914673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074667, + "balance_loss_mlp": 1.05036056, + "epoch": 0.7743362831858407, + "flos": 665841420288.0, + "grad_norm": 0.05859090584356498, + "language_loss": 0.86860305, + "learning_rate": 0.00012770578466660915, + "loss": 0.87934977, + "num_input_tokens_seen": 334107904, + "router_z_loss_mlp": 0.24291992, + "step": 4025, + "time_per_iteration": 2.9427406787872314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081782, + "balance_loss_mlp": 1.05593777, + "epoch": 0.7745286648711043, + "flos": 562760939520.0, + "grad_norm": 0.062055121147232294, + "language_loss": 0.82006752, + "learning_rate": 0.0001274978950315968, + "loss": 0.83088535, + "num_input_tokens_seen": 334184048, + "router_z_loss_mlp": 0.25878906, + "step": 4026, + "time_per_iteration": 2.795128583908081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075185, + "balance_loss_mlp": 1.05006814, + "epoch": 0.7747210465563679, + "flos": 516912565248.0, + "grad_norm": 0.08117867948419291, + "language_loss": 0.83182287, + "learning_rate": 0.00012729015001472716, + "loss": 0.84257472, + "num_input_tokens_seen": 334257152, + "router_z_loss_mlp": 0.2512207, + "step": 4027, + "time_per_iteration": 2.6325039863586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074916, + "balance_loss_mlp": 1.04923844, + "epoch": 0.7749134282416313, + "flos": 634209937920.0, + "grad_norm": 0.05921270053212527, + "language_loss": 0.82036096, + "learning_rate": 0.00012708254969665418, + "loss": 0.83111012, + "num_input_tokens_seen": 334331312, + "router_z_loss_mlp": 0.25695801, + "step": 4028, + "time_per_iteration": 2.7775604724884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079508, + "balance_loss_mlp": 1.05439043, + "epoch": 0.7751058099268949, + "flos": 495364584960.0, + "grad_norm": 0.06748938029736128, + "language_loss": 0.83602798, + "learning_rate": 0.00012687509415797526, + "loss": 0.8468231, + "num_input_tokens_seen": 334397344, + "router_z_loss_mlp": 0.2512207, + "step": 4029, + "time_per_iteration": 2.550536632537842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077, + "balance_loss_mlp": 1.05216861, + "epoch": 0.7752981916121585, + "flos": 510310513152.0, + "grad_norm": 0.05440055390110852, + "language_loss": 0.81075221, + "learning_rate": 0.00012666778347923208, + "loss": 0.82152218, + "num_input_tokens_seen": 334467872, + "router_z_loss_mlp": 0.24829102, + "step": 4030, + "time_per_iteration": 2.627509593963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071803, + "balance_loss_mlp": 1.04774618, + "epoch": 0.7754905732974221, + "flos": 497548749312.0, + "grad_norm": 0.05010388479437805, + "language_loss": 0.84007275, + "learning_rate": 0.0001264606177409092, + "loss": 0.85079074, + "num_input_tokens_seen": 334539088, + "router_z_loss_mlp": 0.24047852, + "step": 4031, + "time_per_iteration": 2.6272945404052734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066927, + "balance_loss_mlp": 1.04220271, + "epoch": 0.7756829549826857, + "flos": 480744626688.0, + "grad_norm": 0.05768180331763808, + "language_loss": 0.86024618, + "learning_rate": 0.00012625359702343609, + "loss": 0.87091547, + "num_input_tokens_seen": 334612576, + "router_z_loss_mlp": 0.24707031, + "step": 4032, + "time_per_iteration": 2.7090940475463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066743, + "balance_loss_mlp": 1.04268646, + "epoch": 0.7758753366679492, + "flos": 552630822912.0, + "grad_norm": 0.05938615660994814, + "language_loss": 0.84870774, + "learning_rate": 0.00012604672140718504, + "loss": 0.85937512, + "num_input_tokens_seen": 334677824, + "router_z_loss_mlp": 0.24047852, + "step": 4033, + "time_per_iteration": 2.6182591915130615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069686, + "balance_loss_mlp": 1.04444957, + "epoch": 0.7760677183532128, + "flos": 703835246592.0, + "grad_norm": 0.06535534521633303, + "language_loss": 0.77696943, + "learning_rate": 0.00012583999097247233, + "loss": 0.78766632, + "num_input_tokens_seen": 334751456, + "router_z_loss_mlp": 0.25256348, + "step": 4034, + "time_per_iteration": 2.8029134273529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064158, + "balance_loss_mlp": 1.03980374, + "epoch": 0.7762601000384763, + "flos": 523470200832.0, + "grad_norm": 0.06345201912035602, + "language_loss": 0.79936808, + "learning_rate": 0.0001256334057995578, + "loss": 0.81000972, + "num_input_tokens_seen": 334823008, + "router_z_loss_mlp": 0.24365234, + "step": 4035, + "time_per_iteration": 2.7016162872314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063824, + "balance_loss_mlp": 1.03927922, + "epoch": 0.7764524817237399, + "flos": 557532896256.0, + "grad_norm": 0.056783072410337934, + "language_loss": 0.85302633, + "learning_rate": 0.000125426965968645, + "loss": 0.86366457, + "num_input_tokens_seen": 334896048, + "router_z_loss_mlp": 0.24536133, + "step": 4036, + "time_per_iteration": 2.699063301086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064554, + "balance_loss_mlp": 1.04023552, + "epoch": 0.7766448634090035, + "flos": 579725849088.0, + "grad_norm": 0.07280358929704372, + "language_loss": 0.82468921, + "learning_rate": 0.00012522067155988092, + "loss": 0.83533478, + "num_input_tokens_seen": 334964416, + "router_z_loss_mlp": 0.24304199, + "step": 4037, + "time_per_iteration": 2.6608784198760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065667, + "balance_loss_mlp": 1.04110956, + "epoch": 0.776837245094267, + "flos": 635603397120.0, + "grad_norm": 0.05908438933946337, + "language_loss": 0.75255591, + "learning_rate": 0.00012501452265335617, + "loss": 0.76321256, + "num_input_tokens_seen": 335043360, + "router_z_loss_mlp": 0.24560547, + "step": 4038, + "time_per_iteration": 2.8470029830932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068825, + "balance_loss_mlp": 1.04455364, + "epoch": 0.7770296267795306, + "flos": 614680565760.0, + "grad_norm": 0.059889567588302765, + "language_loss": 0.83042991, + "learning_rate": 0.0001248085193291047, + "loss": 0.84111816, + "num_input_tokens_seen": 335113216, + "router_z_loss_mlp": 0.24255371, + "step": 4039, + "time_per_iteration": 2.730807304382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068961, + "balance_loss_mlp": 1.04434443, + "epoch": 0.7772220084647942, + "flos": 878808890880.0, + "grad_norm": 0.05468138841735705, + "language_loss": 0.82561696, + "learning_rate": 0.00012460266166710443, + "loss": 0.83630657, + "num_input_tokens_seen": 335195824, + "router_z_loss_mlp": 0.24609375, + "step": 4040, + "time_per_iteration": 3.2041711807250977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064376, + "balance_loss_mlp": 1.04067707, + "epoch": 0.7774143901500578, + "flos": 839641489920.0, + "grad_norm": 0.06748219458401046, + "language_loss": 0.77782911, + "learning_rate": 0.00012439694974727633, + "loss": 0.78847289, + "num_input_tokens_seen": 335269712, + "router_z_loss_mlp": 0.23706055, + "step": 4041, + "time_per_iteration": 3.0317485332489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066638, + "balance_loss_mlp": 1.04206872, + "epoch": 0.7776067718353212, + "flos": 568147571712.0, + "grad_norm": 0.05414460584794901, + "language_loss": 0.80244517, + "learning_rate": 0.00012419138364948458, + "loss": 0.81311154, + "num_input_tokens_seen": 335343408, + "router_z_loss_mlp": 0.24560547, + "step": 4042, + "time_per_iteration": 2.7031445503234863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063112, + "balance_loss_mlp": 1.03903186, + "epoch": 0.7777991535205848, + "flos": 745943012352.0, + "grad_norm": 0.05641629712994933, + "language_loss": 0.8246541, + "learning_rate": 0.00012398596345353702, + "loss": 0.83528519, + "num_input_tokens_seen": 335415360, + "router_z_loss_mlp": 0.24072266, + "step": 4043, + "time_per_iteration": 2.9440853595733643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068598, + "balance_loss_mlp": 1.04396939, + "epoch": 0.7779915352058484, + "flos": 538075104768.0, + "grad_norm": 0.06391086688710987, + "language_loss": 0.83438706, + "learning_rate": 0.0001237806892391851, + "loss": 0.84507304, + "num_input_tokens_seen": 335491568, + "router_z_loss_mlp": 0.24621582, + "step": 4044, + "time_per_iteration": 2.699157476425171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070628, + "balance_loss_mlp": 1.04549861, + "epoch": 0.778183916891112, + "flos": 634788099072.0, + "grad_norm": 0.06994716374064003, + "language_loss": 0.81070113, + "learning_rate": 0.0001235755610861233, + "loss": 0.82140744, + "num_input_tokens_seen": 335567200, + "router_z_loss_mlp": 0.25134277, + "step": 4045, + "time_per_iteration": 2.772141933441162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063646, + "balance_loss_mlp": 1.03941059, + "epoch": 0.7783762985763756, + "flos": 588677621760.0, + "grad_norm": 0.05993243352080289, + "language_loss": 0.8561902, + "learning_rate": 0.0001233705790739893, + "loss": 0.86682665, + "num_input_tokens_seen": 335640512, + "router_z_loss_mlp": 0.2421875, + "step": 4046, + "time_per_iteration": 2.715252637863159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063047, + "balance_loss_mlp": 1.03909791, + "epoch": 0.7785686802616391, + "flos": 930656563200.0, + "grad_norm": 0.05972072663503075, + "language_loss": 0.74957597, + "learning_rate": 0.0001231657432823643, + "loss": 0.7602064, + "num_input_tokens_seen": 335726016, + "router_z_loss_mlp": 0.23937988, + "step": 4047, + "time_per_iteration": 3.2537522315979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068015, + "balance_loss_mlp": 1.04351759, + "epoch": 0.7787610619469026, + "flos": 497934190080.0, + "grad_norm": 0.08653070667167902, + "language_loss": 0.79029131, + "learning_rate": 0.0001229610537907725, + "loss": 0.80097145, + "num_input_tokens_seen": 335794864, + "router_z_loss_mlp": 0.24511719, + "step": 4048, + "time_per_iteration": 2.6116526126861572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068205, + "balance_loss_mlp": 1.04393375, + "epoch": 0.7789534436321662, + "flos": 515637674496.0, + "grad_norm": 0.07563630490624115, + "language_loss": 0.9076525, + "learning_rate": 0.00012275651067868143, + "loss": 0.91833448, + "num_input_tokens_seen": 335860928, + "router_z_loss_mlp": 0.24255371, + "step": 4049, + "time_per_iteration": 2.6179893016815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067923, + "balance_loss_mlp": 1.04335427, + "epoch": 0.7791458253174298, + "flos": 988476369408.0, + "grad_norm": 0.05280851265506485, + "language_loss": 0.80811793, + "learning_rate": 0.00012255211402550182, + "loss": 0.81879717, + "num_input_tokens_seen": 335945728, + "router_z_loss_mlp": 0.24572754, + "step": 4050, + "time_per_iteration": 3.24564266204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106679, + "balance_loss_mlp": 1.0426023, + "epoch": 0.7793382070026933, + "flos": 629040992256.0, + "grad_norm": 0.06858136893192632, + "language_loss": 0.76661634, + "learning_rate": 0.00012234786391058727, + "loss": 0.77728426, + "num_input_tokens_seen": 336014848, + "router_z_loss_mlp": 0.24194336, + "step": 4051, + "time_per_iteration": 2.7757604122161865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072241, + "balance_loss_mlp": 1.04733872, + "epoch": 0.7795305886879569, + "flos": 531752408064.0, + "grad_norm": 0.06727738365211818, + "language_loss": 0.85258687, + "learning_rate": 0.0001221437604132352, + "loss": 0.86330926, + "num_input_tokens_seen": 336080096, + "router_z_loss_mlp": 0.24914551, + "step": 4052, + "time_per_iteration": 2.6063547134399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068764, + "balance_loss_mlp": 1.04383731, + "epoch": 0.7797229703732205, + "flos": 611979909120.0, + "grad_norm": 0.07458964549292091, + "language_loss": 0.81427658, + "learning_rate": 0.0001219398036126852, + "loss": 0.82496417, + "num_input_tokens_seen": 336154640, + "router_z_loss_mlp": 0.24926758, + "step": 4053, + "time_per_iteration": 2.7283682823181152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069627, + "balance_loss_mlp": 1.04429483, + "epoch": 0.7799153520584841, + "flos": 872164620288.0, + "grad_norm": 0.05694657807222903, + "language_loss": 0.78109223, + "learning_rate": 0.00012173599358812027, + "loss": 0.79178852, + "num_input_tokens_seen": 336244160, + "router_z_loss_mlp": 0.25341797, + "step": 4054, + "time_per_iteration": 3.234590768814087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071917, + "balance_loss_mlp": 1.04676414, + "epoch": 0.7801077337437476, + "flos": 583627244544.0, + "grad_norm": 0.07419180869397772, + "language_loss": 0.82674354, + "learning_rate": 0.0001215323304186668, + "loss": 0.83746266, + "num_input_tokens_seen": 336317936, + "router_z_loss_mlp": 0.25170898, + "step": 4055, + "time_per_iteration": 2.747619152069092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068312, + "balance_loss_mlp": 1.04483986, + "epoch": 0.7803001154290111, + "flos": 601165172736.0, + "grad_norm": 0.05875387608266639, + "language_loss": 0.88048428, + "learning_rate": 0.00012132881418339364, + "loss": 0.8911674, + "num_input_tokens_seen": 336389504, + "router_z_loss_mlp": 0.23449707, + "step": 4056, + "time_per_iteration": 2.711988687515259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020033, + "balance_loss_mlp": 1.01268935, + "epoch": 0.7804924971142747, + "flos": 1479577591296.0, + "grad_norm": 0.018856968303163506, + "language_loss": 0.77517563, + "learning_rate": 0.00012112544496131306, + "loss": 0.78537595, + "num_input_tokens_seen": 336615536, + "router_z_loss_mlp": 0.07324219, + "step": 4057, + "time_per_iteration": 4.845250129699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068664, + "balance_loss_mlp": 1.04403543, + "epoch": 0.7806848787995383, + "flos": 630362870784.0, + "grad_norm": 0.07123577938940119, + "language_loss": 0.76748145, + "learning_rate": 0.00012092222283137944, + "loss": 0.77816808, + "num_input_tokens_seen": 336686400, + "router_z_loss_mlp": 0.24633789, + "step": 4058, + "time_per_iteration": 2.7359137535095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015158, + "balance_loss_mlp": 1.00781476, + "epoch": 0.7808772604848019, + "flos": 1417587319296.0, + "grad_norm": 0.015685369403867444, + "language_loss": 0.7890631, + "learning_rate": 0.00012071914787249111, + "loss": 0.79921466, + "num_input_tokens_seen": 336912704, + "router_z_loss_mlp": 0.07324219, + "step": 4059, + "time_per_iteration": 4.798109292984009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068881, + "balance_loss_mlp": 1.0447526, + "epoch": 0.7810696421700654, + "flos": 731696011776.0, + "grad_norm": 0.05299878085313822, + "language_loss": 0.83917785, + "learning_rate": 0.00012051622016348856, + "loss": 0.84986663, + "num_input_tokens_seen": 336997040, + "router_z_loss_mlp": 0.24121094, + "step": 4060, + "time_per_iteration": 3.049039125442505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068121, + "balance_loss_mlp": 1.04395747, + "epoch": 0.781262023855329, + "flos": 424941230592.0, + "grad_norm": 0.06624192950559629, + "language_loss": 0.84473646, + "learning_rate": 0.00012031343978315539, + "loss": 0.85541761, + "num_input_tokens_seen": 337059760, + "router_z_loss_mlp": 0.24169922, + "step": 4061, + "time_per_iteration": 2.5507752895355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065012, + "balance_loss_mlp": 1.03988302, + "epoch": 0.7814544055405925, + "flos": 501027628032.0, + "grad_norm": 0.06423319540710895, + "language_loss": 0.82729137, + "learning_rate": 0.00012011080681021774, + "loss": 0.83794153, + "num_input_tokens_seen": 337128528, + "router_z_loss_mlp": 0.2512207, + "step": 4062, + "time_per_iteration": 2.611497640609741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067455, + "balance_loss_mlp": 1.04255247, + "epoch": 0.7816467872258561, + "flos": 462448300032.0, + "grad_norm": 0.05897582780878463, + "language_loss": 0.86540973, + "learning_rate": 0.00011990832132334512, + "loss": 0.87608433, + "num_input_tokens_seen": 337194112, + "router_z_loss_mlp": 0.24902344, + "step": 4063, + "time_per_iteration": 2.538100481033325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069255, + "balance_loss_mlp": 1.04350615, + "epoch": 0.7818391689111197, + "flos": 740818483200.0, + "grad_norm": 0.06886794650581697, + "language_loss": 0.82671422, + "learning_rate": 0.00011970598340114897, + "loss": 0.83740675, + "num_input_tokens_seen": 337270416, + "router_z_loss_mlp": 0.25769043, + "step": 4064, + "time_per_iteration": 2.9499306678771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067844, + "balance_loss_mlp": 1.04288173, + "epoch": 0.7820315505963832, + "flos": 547669278720.0, + "grad_norm": 0.06050124300613184, + "language_loss": 0.83932686, + "learning_rate": 0.00011950379312218396, + "loss": 0.85000533, + "num_input_tokens_seen": 337343024, + "router_z_loss_mlp": 0.24975586, + "step": 4065, + "time_per_iteration": 2.6981561183929443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066746, + "balance_loss_mlp": 1.04218912, + "epoch": 0.7822239322816468, + "flos": 728983245312.0, + "grad_norm": 0.0632552129471463, + "language_loss": 0.86015439, + "learning_rate": 0.00011930175056494719, + "loss": 0.87082189, + "num_input_tokens_seen": 337417232, + "router_z_loss_mlp": 0.24560547, + "step": 4066, + "time_per_iteration": 2.8711161613464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072113, + "balance_loss_mlp": 1.04645956, + "epoch": 0.7824163139669104, + "flos": 452016433152.0, + "grad_norm": 0.04894683237800459, + "language_loss": 0.76267391, + "learning_rate": 0.00011909985580787885, + "loss": 0.77339506, + "num_input_tokens_seen": 337488224, + "router_z_loss_mlp": 0.25683594, + "step": 4067, + "time_per_iteration": 2.623110771179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067479, + "balance_loss_mlp": 1.04259992, + "epoch": 0.782608695652174, + "flos": 540489065472.0, + "grad_norm": 0.051509930807311505, + "language_loss": 0.81230474, + "learning_rate": 0.00011889810892936137, + "loss": 0.82297957, + "num_input_tokens_seen": 337564928, + "router_z_loss_mlp": 0.24865723, + "step": 4068, + "time_per_iteration": 2.7267825603485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072363, + "balance_loss_mlp": 1.04626799, + "epoch": 0.7828010773374374, + "flos": 500308503552.0, + "grad_norm": 0.06989547292093733, + "language_loss": 0.7721833, + "learning_rate": 0.00011869651000771959, + "loss": 0.78290695, + "num_input_tokens_seen": 337641632, + "router_z_loss_mlp": 0.26086426, + "step": 4069, + "time_per_iteration": 2.8386263847351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066766, + "balance_loss_mlp": 1.04239988, + "epoch": 0.782993459022701, + "flos": 600816807936.0, + "grad_norm": 0.05590330027486899, + "language_loss": 0.82998097, + "learning_rate": 0.00011849505912122117, + "loss": 0.84064865, + "num_input_tokens_seen": 337711968, + "router_z_loss_mlp": 0.24353027, + "step": 4070, + "time_per_iteration": 2.7070353031158447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071318, + "balance_loss_mlp": 1.04591501, + "epoch": 0.7831858407079646, + "flos": 810055779840.0, + "grad_norm": 0.07526003762503049, + "language_loss": 0.77768147, + "learning_rate": 0.00011829375634807654, + "loss": 0.78839469, + "num_input_tokens_seen": 337795792, + "router_z_loss_mlp": 0.25415039, + "step": 4071, + "time_per_iteration": 3.020780324935913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070393, + "balance_loss_mlp": 1.04478693, + "epoch": 0.7833782223932282, + "flos": 806594153472.0, + "grad_norm": 0.07034512653521276, + "language_loss": 0.81144375, + "learning_rate": 0.00011809260176643821, + "loss": 0.82214773, + "num_input_tokens_seen": 337875584, + "router_z_loss_mlp": 0.25634766, + "step": 4072, + "time_per_iteration": 3.048811674118042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070982, + "balance_loss_mlp": 1.0463053, + "epoch": 0.7835706040784918, + "flos": 520870860288.0, + "grad_norm": 0.059023675544883115, + "language_loss": 0.83701313, + "learning_rate": 0.00011789159545440131, + "loss": 0.84772301, + "num_input_tokens_seen": 337942304, + "router_z_loss_mlp": 0.2467041, + "step": 4073, + "time_per_iteration": 2.576185703277588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107004, + "balance_loss_mlp": 1.04587591, + "epoch": 0.7837629857637552, + "flos": 505605929472.0, + "grad_norm": 0.05002266794770535, + "language_loss": 0.82456756, + "learning_rate": 0.00011769073749000348, + "loss": 0.83526802, + "num_input_tokens_seen": 338020864, + "router_z_loss_mlp": 0.24169922, + "step": 4074, + "time_per_iteration": 2.8168578147888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079683, + "balance_loss_mlp": 1.05485141, + "epoch": 0.7839553674490188, + "flos": 516124431360.0, + "grad_norm": 0.06200789049799382, + "language_loss": 0.76390898, + "learning_rate": 0.0001174900279512246, + "loss": 0.77470577, + "num_input_tokens_seen": 338089584, + "router_z_loss_mlp": 0.24829102, + "step": 4075, + "time_per_iteration": 2.5813939571380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073387, + "balance_loss_mlp": 1.04935431, + "epoch": 0.7841477491342824, + "flos": 506648825856.0, + "grad_norm": 0.0619484344784815, + "language_loss": 0.81889015, + "learning_rate": 0.00011728946691598707, + "loss": 0.82962406, + "num_input_tokens_seen": 338159568, + "router_z_loss_mlp": 0.2401123, + "step": 4076, + "time_per_iteration": 2.604560136795044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072391, + "balance_loss_mlp": 1.04755974, + "epoch": 0.784340130819546, + "flos": 719636120064.0, + "grad_norm": 0.06799226151214174, + "language_loss": 0.76469254, + "learning_rate": 0.00011708905446215561, + "loss": 0.77541649, + "num_input_tokens_seen": 338233952, + "router_z_loss_mlp": 0.24841309, + "step": 4077, + "time_per_iteration": 2.8604230880737305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076266, + "balance_loss_mlp": 1.05137515, + "epoch": 0.7845325125048095, + "flos": 514441704960.0, + "grad_norm": 0.05961897466734042, + "language_loss": 0.80370939, + "learning_rate": 0.00011688879066753711, + "loss": 0.81447208, + "num_input_tokens_seen": 338309568, + "router_z_loss_mlp": 0.24914551, + "step": 4078, + "time_per_iteration": 2.676522970199585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076486, + "balance_loss_mlp": 1.05258489, + "epoch": 0.7847248941900731, + "flos": 466102646784.0, + "grad_norm": 0.06622191956248384, + "language_loss": 0.87440282, + "learning_rate": 0.00011668867560988122, + "loss": 0.88516766, + "num_input_tokens_seen": 338375920, + "router_z_loss_mlp": 0.23876953, + "step": 4079, + "time_per_iteration": 2.5650012493133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073753, + "balance_loss_mlp": 1.04890943, + "epoch": 0.7849172758753367, + "flos": 503028983808.0, + "grad_norm": 0.05649669086947294, + "language_loss": 0.84200358, + "learning_rate": 0.00011648870936687916, + "loss": 0.85274112, + "num_input_tokens_seen": 338452208, + "router_z_loss_mlp": 0.24853516, + "step": 4080, + "time_per_iteration": 2.746581554412842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077673, + "balance_loss_mlp": 1.05274642, + "epoch": 0.7851096575606002, + "flos": 531999456768.0, + "grad_norm": 0.06352176738045938, + "language_loss": 0.78612041, + "learning_rate": 0.00011628889201616461, + "loss": 0.79689717, + "num_input_tokens_seen": 338522864, + "router_z_loss_mlp": 0.24926758, + "step": 4081, + "time_per_iteration": 2.6527559757232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107817, + "balance_loss_mlp": 1.05413723, + "epoch": 0.7853020392458638, + "flos": 569956207104.0, + "grad_norm": 0.059333709179443264, + "language_loss": 0.81988198, + "learning_rate": 0.00011608922363531393, + "loss": 0.83066362, + "num_input_tokens_seen": 338591024, + "router_z_loss_mlp": 0.24023438, + "step": 4082, + "time_per_iteration": 2.6462624073028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107947, + "balance_loss_mlp": 1.05538988, + "epoch": 0.7854944209311273, + "flos": 832579845120.0, + "grad_norm": 0.07948062508408354, + "language_loss": 0.83596992, + "learning_rate": 0.00011588970430184504, + "loss": 0.84676462, + "num_input_tokens_seen": 338669616, + "router_z_loss_mlp": 0.24072266, + "step": 4083, + "time_per_iteration": 3.0286946296691895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076264, + "balance_loss_mlp": 1.05245781, + "epoch": 0.7856868026163909, + "flos": 559929604608.0, + "grad_norm": 0.051566265858254114, + "language_loss": 0.81763256, + "learning_rate": 0.00011569033409321822, + "loss": 0.82839513, + "num_input_tokens_seen": 338740416, + "router_z_loss_mlp": 0.23803711, + "step": 4084, + "time_per_iteration": 2.677654981613159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074536, + "balance_loss_mlp": 1.051386, + "epoch": 0.7858791843016545, + "flos": 545230725120.0, + "grad_norm": 0.08091390991612231, + "language_loss": 0.73483807, + "learning_rate": 0.00011549111308683591, + "loss": 0.74558342, + "num_input_tokens_seen": 338807664, + "router_z_loss_mlp": 0.23144531, + "step": 4085, + "time_per_iteration": 2.658348560333252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107372, + "balance_loss_mlp": 1.04996181, + "epoch": 0.7860715659869181, + "flos": 380997665280.0, + "grad_norm": 0.08162659995684916, + "language_loss": 0.80996692, + "learning_rate": 0.00011529204136004251, + "loss": 0.8207041, + "num_input_tokens_seen": 338869472, + "router_z_loss_mlp": 0.23754883, + "step": 4086, + "time_per_iteration": 2.4145894050598145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075099, + "balance_loss_mlp": 1.05042303, + "epoch": 0.7862639476721817, + "flos": 567440930304.0, + "grad_norm": 0.05377876358731873, + "language_loss": 0.84456086, + "learning_rate": 0.00011509311899012459, + "loss": 0.85531187, + "num_input_tokens_seen": 338941312, + "router_z_loss_mlp": 0.2467041, + "step": 4087, + "time_per_iteration": 2.6459927558898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107412, + "balance_loss_mlp": 1.05020642, + "epoch": 0.7864563293574451, + "flos": 545238065664.0, + "grad_norm": 0.06320948891726123, + "language_loss": 0.78042746, + "learning_rate": 0.00011489434605431053, + "loss": 0.79116869, + "num_input_tokens_seen": 339010208, + "router_z_loss_mlp": 0.23901367, + "step": 4088, + "time_per_iteration": 2.6297101974487305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071348, + "balance_loss_mlp": 1.04667187, + "epoch": 0.7866487110427087, + "flos": 563536963584.0, + "grad_norm": 0.0677420392553831, + "language_loss": 0.81525648, + "learning_rate": 0.0001146957226297708, + "loss": 0.82596999, + "num_input_tokens_seen": 339081232, + "router_z_loss_mlp": 0.24682617, + "step": 4089, + "time_per_iteration": 2.686326026916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078205, + "balance_loss_mlp": 1.05306411, + "epoch": 0.7868410927279723, + "flos": 728189968896.0, + "grad_norm": 0.0625994839156693, + "language_loss": 0.76622844, + "learning_rate": 0.00011449724879361827, + "loss": 0.77701044, + "num_input_tokens_seen": 339161040, + "router_z_loss_mlp": 0.25146484, + "step": 4090, + "time_per_iteration": 2.944439649581909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072693, + "balance_loss_mlp": 1.04931605, + "epoch": 0.7870334744132359, + "flos": 521355045888.0, + "grad_norm": 0.07495100545355479, + "language_loss": 0.73970717, + "learning_rate": 0.00011429892462290687, + "loss": 0.7504341, + "num_input_tokens_seen": 339233984, + "router_z_loss_mlp": 0.23376465, + "step": 4091, + "time_per_iteration": 2.666902542114258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076674, + "balance_loss_mlp": 1.05143714, + "epoch": 0.7872258560984994, + "flos": 451411107840.0, + "grad_norm": 0.08113379583083238, + "language_loss": 0.83441567, + "learning_rate": 0.00011410075019463295, + "loss": 0.84518242, + "num_input_tokens_seen": 339303168, + "router_z_loss_mlp": 0.25256348, + "step": 4092, + "time_per_iteration": 2.6106560230255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077622, + "balance_loss_mlp": 1.05387568, + "epoch": 0.787418237783763, + "flos": 515195334144.0, + "grad_norm": 0.06484532402040227, + "language_loss": 0.80351543, + "learning_rate": 0.00011390272558573461, + "loss": 0.81429172, + "num_input_tokens_seen": 339374512, + "router_z_loss_mlp": 0.23730469, + "step": 4093, + "time_per_iteration": 2.6585676670074463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068179, + "balance_loss_mlp": 1.04527855, + "epoch": 0.7876106194690266, + "flos": 485081021952.0, + "grad_norm": 0.06429182980195784, + "language_loss": 0.79664457, + "learning_rate": 0.00011370485087309202, + "loss": 0.80732632, + "num_input_tokens_seen": 339442720, + "router_z_loss_mlp": 0.22888184, + "step": 4094, + "time_per_iteration": 2.6336958408355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077617, + "balance_loss_mlp": 1.05357265, + "epoch": 0.7878030011542901, + "flos": 542841357312.0, + "grad_norm": 0.05995704636978539, + "language_loss": 0.79134881, + "learning_rate": 0.00011350712613352688, + "loss": 0.80212498, + "num_input_tokens_seen": 339508800, + "router_z_loss_mlp": 0.24023438, + "step": 4095, + "time_per_iteration": 2.7010250091552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075966, + "balance_loss_mlp": 1.05183816, + "epoch": 0.7879953828395537, + "flos": 516739668480.0, + "grad_norm": 0.08641736652738899, + "language_loss": 0.79282218, + "learning_rate": 0.00011330955144380283, + "loss": 0.80358183, + "num_input_tokens_seen": 339578048, + "router_z_loss_mlp": 0.24121094, + "step": 4096, + "time_per_iteration": 2.608861207962036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075916, + "balance_loss_mlp": 1.0511322, + "epoch": 0.7881877645248172, + "flos": 582278201856.0, + "grad_norm": 0.05952811942716876, + "language_loss": 0.8624779, + "learning_rate": 0.00011311212688062483, + "loss": 0.87323707, + "num_input_tokens_seen": 339650176, + "router_z_loss_mlp": 0.24780273, + "step": 4097, + "time_per_iteration": 2.779981851577759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071321, + "balance_loss_mlp": 1.04694271, + "epoch": 0.7883801462100808, + "flos": 589171719168.0, + "grad_norm": 0.07195937030471744, + "language_loss": 0.77942967, + "learning_rate": 0.0001129148525206402, + "loss": 0.79014289, + "num_input_tokens_seen": 339727312, + "router_z_loss_mlp": 0.24365234, + "step": 4098, + "time_per_iteration": 2.821665048599243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073378, + "balance_loss_mlp": 1.04944086, + "epoch": 0.7885725278953444, + "flos": 481728052224.0, + "grad_norm": 0.06162490932245696, + "language_loss": 0.86554086, + "learning_rate": 0.00011271772844043759, + "loss": 0.87627459, + "num_input_tokens_seen": 339801344, + "router_z_loss_mlp": 0.23913574, + "step": 4099, + "time_per_iteration": 2.6586296558380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071216, + "balance_loss_mlp": 1.04692149, + "epoch": 0.788764909580608, + "flos": 756794824704.0, + "grad_norm": 0.0681580072696929, + "language_loss": 0.75896525, + "learning_rate": 0.00011252075471654727, + "loss": 0.7696774, + "num_input_tokens_seen": 339877840, + "router_z_loss_mlp": 0.24279785, + "step": 4100, + "time_per_iteration": 2.9564926624298096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078491, + "balance_loss_mlp": 1.05411243, + "epoch": 0.7889572912658714, + "flos": 702555213312.0, + "grad_norm": 0.05782591826916926, + "language_loss": 0.78057027, + "learning_rate": 0.00011232393142544133, + "loss": 0.79135513, + "num_input_tokens_seen": 339959568, + "router_z_loss_mlp": 0.24365234, + "step": 4101, + "time_per_iteration": 2.9211971759796143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071535, + "balance_loss_mlp": 1.04766965, + "epoch": 0.789149672951135, + "flos": 736405364736.0, + "grad_norm": 0.05727153385577965, + "language_loss": 0.83000249, + "learning_rate": 0.00011212725864353323, + "loss": 0.84071785, + "num_input_tokens_seen": 340043600, + "router_z_loss_mlp": 0.23864746, + "step": 4102, + "time_per_iteration": 3.0621325969696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010163, + "balance_loss_mlp": 1.00324917, + "epoch": 0.7893420546363986, + "flos": 1481396511744.0, + "grad_norm": 0.011162541851203939, + "language_loss": 0.76335925, + "learning_rate": 0.00011193073644717822, + "loss": 0.77346092, + "num_input_tokens_seen": 340270608, + "router_z_loss_mlp": 0.06933594, + "step": 4103, + "time_per_iteration": 4.843589782714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079314, + "balance_loss_mlp": 1.05412483, + "epoch": 0.7895344363216622, + "flos": 509072698368.0, + "grad_norm": 0.06819602416212077, + "language_loss": 0.7587899, + "learning_rate": 0.00011173436491267291, + "loss": 0.76958299, + "num_input_tokens_seen": 340338784, + "router_z_loss_mlp": 0.25195312, + "step": 4104, + "time_per_iteration": 2.5900800228118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071242, + "balance_loss_mlp": 1.04709017, + "epoch": 0.7897268180069258, + "flos": 541988983296.0, + "grad_norm": 0.058920781794481306, + "language_loss": 0.82051444, + "learning_rate": 0.0001115381441162554, + "loss": 0.83122683, + "num_input_tokens_seen": 340407744, + "router_z_loss_mlp": 0.24145508, + "step": 4105, + "time_per_iteration": 2.615739345550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010138, + "balance_loss_mlp": 1.00322342, + "epoch": 0.7899191996921893, + "flos": 1412687817216.0, + "grad_norm": 0.008527676451490414, + "language_loss": 0.73583722, + "learning_rate": 0.00011134207413410557, + "loss": 0.74593866, + "num_input_tokens_seen": 340635824, + "router_z_loss_mlp": 0.06933594, + "step": 4106, + "time_per_iteration": 4.8757970333099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073097, + "balance_loss_mlp": 1.04834938, + "epoch": 0.7901115813774529, + "flos": 622841633280.0, + "grad_norm": 0.06585069884795663, + "language_loss": 0.85103577, + "learning_rate": 0.00011114615504234465, + "loss": 0.8617667, + "num_input_tokens_seen": 340710928, + "router_z_loss_mlp": 0.24743652, + "step": 4107, + "time_per_iteration": 2.7732784748077393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069208, + "balance_loss_mlp": 1.04528236, + "epoch": 0.7903039630627164, + "flos": 645545935872.0, + "grad_norm": 0.058284414227118955, + "language_loss": 0.80976272, + "learning_rate": 0.00011095038691703468, + "loss": 0.82045484, + "num_input_tokens_seen": 340786128, + "router_z_loss_mlp": 0.23913574, + "step": 4108, + "time_per_iteration": 2.8352532386779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068205, + "balance_loss_mlp": 1.04486418, + "epoch": 0.79049634474798, + "flos": 594365257728.0, + "grad_norm": 0.05943698489540686, + "language_loss": 0.82732534, + "learning_rate": 0.00011075476983417998, + "loss": 0.83800745, + "num_input_tokens_seen": 340861616, + "router_z_loss_mlp": 0.2331543, + "step": 4109, + "time_per_iteration": 2.8430795669555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074219, + "balance_loss_mlp": 1.04974484, + "epoch": 0.7906887264332435, + "flos": 716093001216.0, + "grad_norm": 0.060761374813596516, + "language_loss": 0.77827907, + "learning_rate": 0.00011055930386972579, + "loss": 0.78902125, + "num_input_tokens_seen": 340934480, + "router_z_loss_mlp": 0.24487305, + "step": 4110, + "time_per_iteration": 2.8313000202178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075057, + "balance_loss_mlp": 1.04976118, + "epoch": 0.7908811081185071, + "flos": 789893918208.0, + "grad_norm": 0.06263812917256945, + "language_loss": 0.78423452, + "learning_rate": 0.00011036398909955863, + "loss": 0.79498512, + "num_input_tokens_seen": 341014912, + "router_z_loss_mlp": 0.2532959, + "step": 4111, + "time_per_iteration": 2.952772378921509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070116, + "balance_loss_mlp": 1.04626179, + "epoch": 0.7910734898037707, + "flos": 641904072192.0, + "grad_norm": 0.053341385482647614, + "language_loss": 0.8184247, + "learning_rate": 0.00011016882559950648, + "loss": 0.82912588, + "num_input_tokens_seen": 341090608, + "router_z_loss_mlp": 0.23852539, + "step": 4112, + "time_per_iteration": 2.8120028972625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071944, + "balance_loss_mlp": 1.0478282, + "epoch": 0.7912658714890343, + "flos": 669357374976.0, + "grad_norm": 0.05974230031891692, + "language_loss": 0.81143081, + "learning_rate": 0.00010997381344533853, + "loss": 0.82215035, + "num_input_tokens_seen": 341160992, + "router_z_loss_mlp": 0.2409668, + "step": 4113, + "time_per_iteration": 2.7792022228240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072154, + "balance_loss_mlp": 1.04729867, + "epoch": 0.7914582531742979, + "flos": 557779944960.0, + "grad_norm": 0.1069423386655018, + "language_loss": 0.80604601, + "learning_rate": 0.00010977895271276517, + "loss": 0.81676757, + "num_input_tokens_seen": 341232032, + "router_z_loss_mlp": 0.24853516, + "step": 4114, + "time_per_iteration": 2.663350820541382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076688, + "balance_loss_mlp": 1.05166614, + "epoch": 0.7916506348595613, + "flos": 570064863744.0, + "grad_norm": 0.054772292847761084, + "language_loss": 0.80259216, + "learning_rate": 0.00010958424347743807, + "loss": 0.81335902, + "num_input_tokens_seen": 341303888, + "router_z_loss_mlp": 0.25036621, + "step": 4115, + "time_per_iteration": 2.7451772689819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068946, + "balance_loss_mlp": 1.04523504, + "epoch": 0.7918430165448249, + "flos": 718301758464.0, + "grad_norm": 0.06136237031197372, + "language_loss": 0.80247879, + "learning_rate": 0.00010938968581494991, + "loss": 0.81316829, + "num_input_tokens_seen": 341385616, + "router_z_loss_mlp": 0.23718262, + "step": 4116, + "time_per_iteration": 2.953747034072876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076632, + "balance_loss_mlp": 1.05221832, + "epoch": 0.7920353982300885, + "flos": 553648753152.0, + "grad_norm": 0.35095947289915363, + "language_loss": 0.79212964, + "learning_rate": 0.000109195279800835, + "loss": 0.80289596, + "num_input_tokens_seen": 341460976, + "router_z_loss_mlp": 0.24414062, + "step": 4117, + "time_per_iteration": 2.715400218963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107219, + "balance_loss_mlp": 1.04746628, + "epoch": 0.7922277799153521, + "flos": 810120019968.0, + "grad_norm": 0.07732725032472638, + "language_loss": 0.76730645, + "learning_rate": 0.00010900102551056834, + "loss": 0.77802831, + "num_input_tokens_seen": 341537328, + "router_z_loss_mlp": 0.24719238, + "step": 4118, + "time_per_iteration": 3.0449063777923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074277, + "balance_loss_mlp": 1.05074465, + "epoch": 0.7924201616006156, + "flos": 421351123968.0, + "grad_norm": 0.056769240767155345, + "language_loss": 0.84711397, + "learning_rate": 0.00010880692301956601, + "loss": 0.85785675, + "num_input_tokens_seen": 341600272, + "router_z_loss_mlp": 0.23535156, + "step": 4119, + "time_per_iteration": 2.4458513259887695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072784, + "balance_loss_mlp": 1.0488348, + "epoch": 0.7926125432858792, + "flos": 617852924928.0, + "grad_norm": 0.050731815554774906, + "language_loss": 0.86393607, + "learning_rate": 0.00010861297240318518, + "loss": 0.87466389, + "num_input_tokens_seen": 341682096, + "router_z_loss_mlp": 0.23937988, + "step": 4120, + "time_per_iteration": 2.8899548053741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071017, + "balance_loss_mlp": 1.04761648, + "epoch": 0.7928049249711427, + "flos": 602487051264.0, + "grad_norm": 0.05101998246826083, + "language_loss": 0.8699168, + "learning_rate": 0.00010841917373672444, + "loss": 0.88062692, + "num_input_tokens_seen": 341754912, + "router_z_loss_mlp": 0.23388672, + "step": 4121, + "time_per_iteration": 2.735093593597412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072991, + "balance_loss_mlp": 1.04925656, + "epoch": 0.7929973066564063, + "flos": 656024790528.0, + "grad_norm": 0.06441744003390583, + "language_loss": 0.78287446, + "learning_rate": 0.00010822552709542293, + "loss": 0.79360437, + "num_input_tokens_seen": 341831152, + "router_z_loss_mlp": 0.23730469, + "step": 4122, + "time_per_iteration": 2.807694911956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107165, + "balance_loss_mlp": 1.04821384, + "epoch": 0.7931896883416699, + "flos": 536397520896.0, + "grad_norm": 0.05111544046549841, + "language_loss": 0.86263895, + "learning_rate": 0.0001080320325544612, + "loss": 0.87335551, + "num_input_tokens_seen": 341903552, + "router_z_loss_mlp": 0.23425293, + "step": 4123, + "time_per_iteration": 2.7574799060821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073909, + "balance_loss_mlp": 1.04917264, + "epoch": 0.7933820700269334, + "flos": 498082493952.0, + "grad_norm": 0.061782912255848775, + "language_loss": 0.82952595, + "learning_rate": 0.00010783869018895997, + "loss": 0.84026504, + "num_input_tokens_seen": 341972256, + "router_z_loss_mlp": 0.24731445, + "step": 4124, + "time_per_iteration": 2.5689857006073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070849, + "balance_loss_mlp": 1.04611349, + "epoch": 0.793574451712197, + "flos": 537472350720.0, + "grad_norm": 0.05779110209300624, + "language_loss": 0.84302074, + "learning_rate": 0.00010764550007398189, + "loss": 0.85372925, + "num_input_tokens_seen": 342040496, + "router_z_loss_mlp": 0.24755859, + "step": 4125, + "time_per_iteration": 2.624270439147949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072775, + "balance_loss_mlp": 1.04818177, + "epoch": 0.7937668333974606, + "flos": 488285687808.0, + "grad_norm": 0.06779162634736255, + "language_loss": 0.81683314, + "learning_rate": 0.00010745246228452982, + "loss": 0.8275609, + "num_input_tokens_seen": 342108512, + "router_z_loss_mlp": 0.24597168, + "step": 4126, + "time_per_iteration": 2.6135480403900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073416, + "balance_loss_mlp": 1.04958653, + "epoch": 0.7939592150827242, + "flos": 527425924608.0, + "grad_norm": 0.08048379704700387, + "language_loss": 0.81824982, + "learning_rate": 0.00010725957689554771, + "loss": 0.82898396, + "num_input_tokens_seen": 342183568, + "router_z_loss_mlp": 0.23791504, + "step": 4127, + "time_per_iteration": 2.7506372928619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072246, + "balance_loss_mlp": 1.0476892, + "epoch": 0.7941515967679876, + "flos": 541702287360.0, + "grad_norm": 0.05152467214863494, + "language_loss": 0.84890383, + "learning_rate": 0.00010706684398192013, + "loss": 0.85962629, + "num_input_tokens_seen": 342259920, + "router_z_loss_mlp": 0.2454834, + "step": 4128, + "time_per_iteration": 2.707517385482788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070232, + "balance_loss_mlp": 1.04580581, + "epoch": 0.7943439784532512, + "flos": 518387516928.0, + "grad_norm": 0.06480144593438951, + "language_loss": 0.82284296, + "learning_rate": 0.00010687426361847313, + "loss": 0.83354527, + "num_input_tokens_seen": 342330192, + "router_z_loss_mlp": 0.24414062, + "step": 4129, + "time_per_iteration": 2.7257397174835205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076314, + "balance_loss_mlp": 1.05154264, + "epoch": 0.7945363601385148, + "flos": 509025710592.0, + "grad_norm": 0.058525612459034176, + "language_loss": 0.85989046, + "learning_rate": 0.00010668183587997254, + "loss": 0.87065363, + "num_input_tokens_seen": 342398944, + "router_z_loss_mlp": 0.24780273, + "step": 4130, + "time_per_iteration": 2.6166372299194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107138, + "balance_loss_mlp": 1.04734755, + "epoch": 0.7947287418237784, + "flos": 651214121472.0, + "grad_norm": 0.05131686661427292, + "language_loss": 0.77755392, + "learning_rate": 0.0001064895608411256, + "loss": 0.78826773, + "num_input_tokens_seen": 342474000, + "router_z_loss_mlp": 0.24047852, + "step": 4131, + "time_per_iteration": 2.806696653366089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074078, + "balance_loss_mlp": 1.04922318, + "epoch": 0.794921123509042, + "flos": 696054477312.0, + "grad_norm": 0.06416883380663327, + "language_loss": 0.80383301, + "learning_rate": 0.00010629743857657998, + "loss": 0.81457376, + "num_input_tokens_seen": 342549184, + "router_z_loss_mlp": 0.24853516, + "step": 4132, + "time_per_iteration": 2.9960997104644775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033011, + "balance_loss_mlp": 1.02619219, + "epoch": 0.7951135051943055, + "flos": 1402942768128.0, + "grad_norm": 0.021894290253507743, + "language_loss": 0.70598668, + "learning_rate": 0.0001061054691609244, + "loss": 0.71631676, + "num_input_tokens_seen": 342767376, + "router_z_loss_mlp": 0.06835938, + "step": 4133, + "time_per_iteration": 4.623692512512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076148, + "balance_loss_mlp": 1.05289078, + "epoch": 0.795305886879569, + "flos": 810085515264.0, + "grad_norm": 0.06117584542977267, + "language_loss": 0.82268703, + "learning_rate": 0.00010591365266868802, + "loss": 0.83344853, + "num_input_tokens_seen": 342845024, + "router_z_loss_mlp": 0.23254395, + "step": 4134, + "time_per_iteration": 2.9868671894073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01030988, + "balance_loss_mlp": 1.02421665, + "epoch": 0.7954982685648326, + "flos": 1426005347328.0, + "grad_norm": 0.02172018156045361, + "language_loss": 0.75511783, + "learning_rate": 0.00010572198917434018, + "loss": 0.76542771, + "num_input_tokens_seen": 343072496, + "router_z_loss_mlp": 0.06787109, + "step": 4135, + "time_per_iteration": 4.976134300231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071076, + "balance_loss_mlp": 1.04654241, + "epoch": 0.7956906502500962, + "flos": 389885197824.0, + "grad_norm": 0.06221307729096171, + "language_loss": 0.79485571, + "learning_rate": 0.00010553047875229166, + "loss": 0.80556649, + "num_input_tokens_seen": 343136928, + "router_z_loss_mlp": 0.24536133, + "step": 4136, + "time_per_iteration": 2.5057613849639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077488, + "balance_loss_mlp": 1.05284762, + "epoch": 0.7958830319353598, + "flos": 515573434368.0, + "grad_norm": 0.058263200481796444, + "language_loss": 0.83615577, + "learning_rate": 0.00010533912147689328, + "loss": 0.84693062, + "num_input_tokens_seen": 343207440, + "router_z_loss_mlp": 0.24645996, + "step": 4137, + "time_per_iteration": 2.601483106613159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073106, + "balance_loss_mlp": 1.04894197, + "epoch": 0.7960754136206233, + "flos": 493941390336.0, + "grad_norm": 0.05046290164389679, + "language_loss": 0.82613522, + "learning_rate": 0.00010514791742243656, + "loss": 0.83686626, + "num_input_tokens_seen": 343273744, + "router_z_loss_mlp": 0.24157715, + "step": 4138, + "time_per_iteration": 2.5787172317504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069273, + "balance_loss_mlp": 1.04491878, + "epoch": 0.7962677953058869, + "flos": 655728182784.0, + "grad_norm": 0.06370293603956936, + "language_loss": 0.82636452, + "learning_rate": 0.00010495686666315341, + "loss": 0.83705723, + "num_input_tokens_seen": 343357648, + "router_z_loss_mlp": 0.2434082, + "step": 4139, + "time_per_iteration": 2.9137964248657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074776, + "balance_loss_mlp": 1.05144691, + "epoch": 0.7964601769911505, + "flos": 542384335872.0, + "grad_norm": 0.06334360078787715, + "language_loss": 0.77300745, + "learning_rate": 0.00010476596927321635, + "loss": 0.78375518, + "num_input_tokens_seen": 343425344, + "router_z_loss_mlp": 0.23327637, + "step": 4140, + "time_per_iteration": 2.635559558868408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071011, + "balance_loss_mlp": 1.04807556, + "epoch": 0.796652558676414, + "flos": 537650016768.0, + "grad_norm": 0.04689989949815107, + "language_loss": 0.80298364, + "learning_rate": 0.00010457522532673835, + "loss": 0.81369376, + "num_input_tokens_seen": 343504960, + "router_z_loss_mlp": 0.22924805, + "step": 4141, + "time_per_iteration": 2.804485321044922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074886, + "balance_loss_mlp": 1.05092525, + "epoch": 0.7968449403616775, + "flos": 475091495424.0, + "grad_norm": 0.0598189384756839, + "language_loss": 0.83671814, + "learning_rate": 0.00010438463489777272, + "loss": 0.84746695, + "num_input_tokens_seen": 343570832, + "router_z_loss_mlp": 0.23937988, + "step": 4142, + "time_per_iteration": 2.593764305114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071767, + "balance_loss_mlp": 1.047472, + "epoch": 0.7970373220469411, + "flos": 567613827072.0, + "grad_norm": 0.06370121218595463, + "language_loss": 0.77954531, + "learning_rate": 0.00010419419806031316, + "loss": 0.79026294, + "num_input_tokens_seen": 343639808, + "router_z_loss_mlp": 0.24291992, + "step": 4143, + "time_per_iteration": 2.6709976196289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073716, + "balance_loss_mlp": 1.05002928, + "epoch": 0.7972297037322047, + "flos": 556208446464.0, + "grad_norm": 0.05491242983846696, + "language_loss": 0.84102464, + "learning_rate": 0.00010400391488829403, + "loss": 0.85176182, + "num_input_tokens_seen": 343715232, + "router_z_loss_mlp": 0.23681641, + "step": 4144, + "time_per_iteration": 2.767470121383667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074277, + "balance_loss_mlp": 1.04962492, + "epoch": 0.7974220854174683, + "flos": 576180158976.0, + "grad_norm": 0.0554387027549828, + "language_loss": 0.86743295, + "learning_rate": 0.00010381378545558984, + "loss": 0.87817574, + "num_input_tokens_seen": 343787168, + "router_z_loss_mlp": 0.24658203, + "step": 4145, + "time_per_iteration": 2.7449891567230225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067536, + "balance_loss_mlp": 1.04415917, + "epoch": 0.7976144671027319, + "flos": 483069754368.0, + "grad_norm": 0.05491436381940487, + "language_loss": 0.8494069, + "learning_rate": 0.00010362380983601505, + "loss": 0.86008221, + "num_input_tokens_seen": 343853600, + "router_z_loss_mlp": 0.23352051, + "step": 4146, + "time_per_iteration": 2.540022134780884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071068, + "balance_loss_mlp": 1.04748869, + "epoch": 0.7978068487879953, + "flos": 1077865615872.0, + "grad_norm": 0.05412435738245621, + "language_loss": 0.79004198, + "learning_rate": 0.00010343398810332477, + "loss": 0.80075264, + "num_input_tokens_seen": 343942816, + "router_z_loss_mlp": 0.2355957, + "step": 4147, + "time_per_iteration": 3.457382917404175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064363, + "balance_loss_mlp": 1.04003215, + "epoch": 0.7979992304732589, + "flos": 733739586048.0, + "grad_norm": 0.057902148991934105, + "language_loss": 0.84552854, + "learning_rate": 0.00010324432033121467, + "loss": 0.8561722, + "num_input_tokens_seen": 344021232, + "router_z_loss_mlp": 0.2434082, + "step": 4148, + "time_per_iteration": 2.923807382583618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068154, + "balance_loss_mlp": 1.04443097, + "epoch": 0.7981916121585225, + "flos": 415774342656.0, + "grad_norm": 0.05631253152540408, + "language_loss": 0.83762592, + "learning_rate": 0.00010305480659332005, + "loss": 0.84830743, + "num_input_tokens_seen": 344089616, + "router_z_loss_mlp": 0.23718262, + "step": 4149, + "time_per_iteration": 2.58620285987854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069961, + "balance_loss_mlp": 1.04592896, + "epoch": 0.7983839938437861, + "flos": 465257613312.0, + "grad_norm": 0.0573271525329392, + "language_loss": 0.83780408, + "learning_rate": 0.00010286544696321682, + "loss": 0.84850371, + "num_input_tokens_seen": 344154992, + "router_z_loss_mlp": 0.24023438, + "step": 4150, + "time_per_iteration": 2.5883052349090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066534, + "balance_loss_mlp": 1.04267991, + "epoch": 0.7985763755290496, + "flos": 510567473664.0, + "grad_norm": 0.06896512478110718, + "language_loss": 0.79666734, + "learning_rate": 0.00010267624151442073, + "loss": 0.80733263, + "num_input_tokens_seen": 344225232, + "router_z_loss_mlp": 0.23864746, + "step": 4151, + "time_per_iteration": 2.660498857498169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069408, + "balance_loss_mlp": 1.04581642, + "epoch": 0.7987687572143132, + "flos": 1010649498624.0, + "grad_norm": 0.08734927072671815, + "language_loss": 0.81357807, + "learning_rate": 0.000102487190320388, + "loss": 0.82427216, + "num_input_tokens_seen": 344309120, + "router_z_loss_mlp": 0.23596191, + "step": 4152, + "time_per_iteration": 3.378927230834961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066042, + "balance_loss_mlp": 1.04099584, + "epoch": 0.7989611388995768, + "flos": 1021078794240.0, + "grad_norm": 0.06453133942638287, + "language_loss": 0.79819167, + "learning_rate": 0.00010229829345451475, + "loss": 0.80885208, + "num_input_tokens_seen": 344394112, + "router_z_loss_mlp": 0.25061035, + "step": 4153, + "time_per_iteration": 3.319824457168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068485, + "balance_loss_mlp": 1.04403472, + "epoch": 0.7991535205848403, + "flos": 1101338601984.0, + "grad_norm": 0.06331967067065977, + "language_loss": 0.79648089, + "learning_rate": 0.00010210955099013724, + "loss": 0.8071658, + "num_input_tokens_seen": 344476512, + "router_z_loss_mlp": 0.24438477, + "step": 4154, + "time_per_iteration": 3.4123902320861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066775, + "balance_loss_mlp": 1.04323101, + "epoch": 0.7993459022701039, + "flos": 834818337792.0, + "grad_norm": 0.07636085006530072, + "language_loss": 0.76853955, + "learning_rate": 0.00010192096300053167, + "loss": 0.77920735, + "num_input_tokens_seen": 344561088, + "router_z_loss_mlp": 0.23547363, + "step": 4155, + "time_per_iteration": 3.0885937213897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062142, + "balance_loss_mlp": 1.03831244, + "epoch": 0.7995382839553674, + "flos": 522686836224.0, + "grad_norm": 0.054659422757200274, + "language_loss": 0.85324514, + "learning_rate": 0.00010173252955891477, + "loss": 0.86386657, + "num_input_tokens_seen": 344639424, + "router_z_loss_mlp": 0.23828125, + "step": 4156, + "time_per_iteration": 2.7677974700927734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106614, + "balance_loss_mlp": 1.04228568, + "epoch": 0.799730665640631, + "flos": 537820715520.0, + "grad_norm": 0.07109273842515242, + "language_loss": 0.7358321, + "learning_rate": 0.00010154425073844253, + "loss": 0.74649352, + "num_input_tokens_seen": 344710048, + "router_z_loss_mlp": 0.23840332, + "step": 4157, + "time_per_iteration": 2.6927075386047363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068264, + "balance_loss_mlp": 1.0444932, + "epoch": 0.7999230473258946, + "flos": 505060075008.0, + "grad_norm": 0.045745633219446115, + "language_loss": 0.82380056, + "learning_rate": 0.00010135612661221138, + "loss": 0.83448321, + "num_input_tokens_seen": 344776832, + "router_z_loss_mlp": 0.23742676, + "step": 4158, + "time_per_iteration": 2.5854756832122803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069104, + "balance_loss_mlp": 1.04447556, + "epoch": 0.8001154290111582, + "flos": 1027342393344.0, + "grad_norm": 0.11567647192687848, + "language_loss": 0.82128304, + "learning_rate": 0.00010116815725325751, + "loss": 0.83197409, + "num_input_tokens_seen": 344864928, + "router_z_loss_mlp": 0.24633789, + "step": 4159, + "time_per_iteration": 3.2739415168762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069034, + "balance_loss_mlp": 1.04451275, + "epoch": 0.8003078106964217, + "flos": 750906754560.0, + "grad_norm": 0.051597263691987395, + "language_loss": 0.8072108, + "learning_rate": 0.00010098034273455725, + "loss": 0.81790113, + "num_input_tokens_seen": 344944048, + "router_z_loss_mlp": 0.24523926, + "step": 4160, + "time_per_iteration": 2.9544758796691895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062344, + "balance_loss_mlp": 1.03870511, + "epoch": 0.8005001923816852, + "flos": 488465925120.0, + "grad_norm": 0.05783262033731221, + "language_loss": 0.8016758, + "learning_rate": 0.00010079268312902662, + "loss": 0.81229925, + "num_input_tokens_seen": 345015392, + "router_z_loss_mlp": 0.23632812, + "step": 4161, + "time_per_iteration": 2.6659867763519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068128, + "balance_loss_mlp": 1.0439285, + "epoch": 0.8006925740669488, + "flos": 513248306688.0, + "grad_norm": 0.061865386123552114, + "language_loss": 0.81856108, + "learning_rate": 0.0001006051785095215, + "loss": 0.82924247, + "num_input_tokens_seen": 345086640, + "router_z_loss_mlp": 0.24206543, + "step": 4162, + "time_per_iteration": 2.6431145668029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064608, + "balance_loss_mlp": 1.03975272, + "epoch": 0.8008849557522124, + "flos": 578529879552.0, + "grad_norm": 0.07843686612614051, + "language_loss": 0.79364276, + "learning_rate": 0.0001004178289488376, + "loss": 0.8042888, + "num_input_tokens_seen": 345159616, + "router_z_loss_mlp": 0.24841309, + "step": 4163, + "time_per_iteration": 2.7118594646453857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065979, + "balance_loss_mlp": 1.04216099, + "epoch": 0.801077337437476, + "flos": 478708766208.0, + "grad_norm": 0.053856509771467526, + "language_loss": 0.83958352, + "learning_rate": 0.0001002306345197106, + "loss": 0.85024333, + "num_input_tokens_seen": 345225536, + "router_z_loss_mlp": 0.23815918, + "step": 4164, + "time_per_iteration": 2.578308343887329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068826, + "balance_loss_mlp": 1.04498386, + "epoch": 0.8012697191227395, + "flos": 676700573184.0, + "grad_norm": 0.06071324675869247, + "language_loss": 0.80072832, + "learning_rate": 0.00010004359529481571, + "loss": 0.81141657, + "num_input_tokens_seen": 345302960, + "router_z_loss_mlp": 0.23828125, + "step": 4165, + "time_per_iteration": 2.905269145965576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069548, + "balance_loss_mlp": 1.04625463, + "epoch": 0.8014621008080031, + "flos": 1295132405760.0, + "grad_norm": 0.06586004492951407, + "language_loss": 0.82562852, + "learning_rate": 9.985671134676804e-05, + "loss": 0.83632404, + "num_input_tokens_seen": 345397792, + "router_z_loss_mlp": 0.23303223, + "step": 4166, + "time_per_iteration": 3.7128326892852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076456, + "balance_loss_mlp": 1.0523994, + "epoch": 0.8016544824932667, + "flos": 511827683328.0, + "grad_norm": 0.07654195541387675, + "language_loss": 0.83752513, + "learning_rate": 9.966998274812234e-05, + "loss": 0.84828973, + "num_input_tokens_seen": 345465440, + "router_z_loss_mlp": 0.24035645, + "step": 4167, + "time_per_iteration": 2.6038944721221924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073049, + "balance_loss_mlp": 1.04911208, + "epoch": 0.8018468641785302, + "flos": 535690879488.0, + "grad_norm": 0.07186822354063922, + "language_loss": 0.81419587, + "learning_rate": 9.948340957137308e-05, + "loss": 0.82492638, + "num_input_tokens_seen": 345533072, + "router_z_loss_mlp": 0.23925781, + "step": 4168, + "time_per_iteration": 2.6602823734283447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065096, + "balance_loss_mlp": 1.04145718, + "epoch": 0.8020392458637937, + "flos": 1023431086080.0, + "grad_norm": 0.07423668989948455, + "language_loss": 0.79874551, + "learning_rate": 9.929699188895447e-05, + "loss": 0.80939651, + "num_input_tokens_seen": 345622208, + "router_z_loss_mlp": 0.23620605, + "step": 4169, + "time_per_iteration": 3.2493438720703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018878, + "balance_loss_mlp": 1.01201177, + "epoch": 0.8022316275490573, + "flos": 1561806821376.0, + "grad_norm": 0.012589088151617522, + "language_loss": 0.78054404, + "learning_rate": 9.911072977324009e-05, + "loss": 0.7907328, + "num_input_tokens_seen": 345852544, + "router_z_loss_mlp": 0.06884766, + "step": 4170, + "time_per_iteration": 4.9680397510528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074777, + "balance_loss_mlp": 1.05058968, + "epoch": 0.8024240092343209, + "flos": 420698810880.0, + "grad_norm": 0.059456208257663117, + "language_loss": 0.83677548, + "learning_rate": 9.89246232965435e-05, + "loss": 0.84752321, + "num_input_tokens_seen": 345917328, + "router_z_loss_mlp": 0.24194336, + "step": 4171, + "time_per_iteration": 2.5254433155059814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072144, + "balance_loss_mlp": 1.04793262, + "epoch": 0.8026163909195845, + "flos": 763836645888.0, + "grad_norm": 0.06548527724702696, + "language_loss": 0.79033399, + "learning_rate": 9.873867253111762e-05, + "loss": 0.80105543, + "num_input_tokens_seen": 345995936, + "router_z_loss_mlp": 0.24206543, + "step": 4172, + "time_per_iteration": 3.027867317199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016172, + "balance_loss_mlp": 1.00930583, + "epoch": 0.8028087726048481, + "flos": 1518861362688.0, + "grad_norm": 0.010389425904671548, + "language_loss": 0.80264562, + "learning_rate": 9.855287754915503e-05, + "loss": 0.81280732, + "num_input_tokens_seen": 346232720, + "router_z_loss_mlp": 0.06884766, + "step": 4173, + "time_per_iteration": 4.953596353530884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068112, + "balance_loss_mlp": 1.04440093, + "epoch": 0.8030011542901115, + "flos": 517861486080.0, + "grad_norm": 0.05784474933725161, + "language_loss": 0.88619745, + "learning_rate": 9.836723842278733e-05, + "loss": 0.8968786, + "num_input_tokens_seen": 346298208, + "router_z_loss_mlp": 0.23706055, + "step": 4174, + "time_per_iteration": 2.5819122791290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073726, + "balance_loss_mlp": 1.0496335, + "epoch": 0.8031935359753751, + "flos": 545616165888.0, + "grad_norm": 0.06458070365487185, + "language_loss": 0.77987558, + "learning_rate": 9.818175522408646e-05, + "loss": 0.79061282, + "num_input_tokens_seen": 346370080, + "router_z_loss_mlp": 0.2409668, + "step": 4175, + "time_per_iteration": 2.6588666439056396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066808, + "balance_loss_mlp": 1.04332376, + "epoch": 0.8033859176606387, + "flos": 603559309824.0, + "grad_norm": 0.05329489111459666, + "language_loss": 0.84854198, + "learning_rate": 9.79964280250632e-05, + "loss": 0.85921007, + "num_input_tokens_seen": 346442432, + "router_z_loss_mlp": 0.23474121, + "step": 4176, + "time_per_iteration": 2.793970823287964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073083, + "balance_loss_mlp": 1.04912198, + "epoch": 0.8035782993459023, + "flos": 565859520000.0, + "grad_norm": 0.06573423775320317, + "language_loss": 0.81603885, + "learning_rate": 9.781125689766795e-05, + "loss": 0.82676965, + "num_input_tokens_seen": 346513088, + "router_z_loss_mlp": 0.23937988, + "step": 4177, + "time_per_iteration": 2.6889266967773438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069602, + "balance_loss_mlp": 1.04500914, + "epoch": 0.8037706810311658, + "flos": 538435952640.0, + "grad_norm": 0.06472425874294328, + "language_loss": 0.84585536, + "learning_rate": 9.762624191379054e-05, + "loss": 0.85655141, + "num_input_tokens_seen": 346581376, + "router_z_loss_mlp": 0.24584961, + "step": 4178, + "time_per_iteration": 2.6785759925842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069008, + "balance_loss_mlp": 1.04635787, + "epoch": 0.8039630627164294, + "flos": 515187993600.0, + "grad_norm": 0.06080511762192285, + "language_loss": 0.79866344, + "learning_rate": 9.744138314526014e-05, + "loss": 0.80935353, + "num_input_tokens_seen": 346653328, + "router_z_loss_mlp": 0.22644043, + "step": 4179, + "time_per_iteration": 2.638092517852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006529, + "balance_loss_mlp": 0.99971008, + "epoch": 0.804155444401693, + "flos": 1478834247168.0, + "grad_norm": 0.007721922864740662, + "language_loss": 0.74733561, + "learning_rate": 9.725668066384535e-05, + "loss": 0.75740093, + "num_input_tokens_seen": 346873264, + "router_z_loss_mlp": 0.06835938, + "step": 4180, + "time_per_iteration": 4.915950775146484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070552, + "balance_loss_mlp": 1.04663897, + "epoch": 0.8043478260869565, + "flos": 521164896768.0, + "grad_norm": 0.07383497316313108, + "language_loss": 0.77627772, + "learning_rate": 9.707213454125396e-05, + "loss": 0.78698325, + "num_input_tokens_seen": 346946272, + "router_z_loss_mlp": 0.23901367, + "step": 4181, + "time_per_iteration": 2.6206631660461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068366, + "balance_loss_mlp": 1.04459548, + "epoch": 0.8045402077722201, + "flos": 545448038400.0, + "grad_norm": 0.05588102000641447, + "language_loss": 0.80511087, + "learning_rate": 9.688774484913298e-05, + "loss": 0.81579459, + "num_input_tokens_seen": 347024048, + "router_z_loss_mlp": 0.23754883, + "step": 4182, + "time_per_iteration": 2.762315511703491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071285, + "balance_loss_mlp": 1.0475862, + "epoch": 0.8047325894574836, + "flos": 678388068864.0, + "grad_norm": 0.06106500726873185, + "language_loss": 0.73959017, + "learning_rate": 9.670351165906921e-05, + "loss": 0.75030297, + "num_input_tokens_seen": 347108736, + "router_z_loss_mlp": 0.23681641, + "step": 4183, + "time_per_iteration": 2.914637327194214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067779, + "balance_loss_mlp": 1.04349661, + "epoch": 0.8049249711427472, + "flos": 587227262976.0, + "grad_norm": 0.06176389383154452, + "language_loss": 0.79018617, + "learning_rate": 9.65194350425882e-05, + "loss": 0.80086398, + "num_input_tokens_seen": 347184192, + "router_z_loss_mlp": 0.24291992, + "step": 4184, + "time_per_iteration": 2.7324373722076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067661, + "balance_loss_mlp": 1.04418921, + "epoch": 0.8051173528280108, + "flos": 814194312192.0, + "grad_norm": 0.0704849570966166, + "language_loss": 0.78184354, + "learning_rate": 9.633551507115452e-05, + "loss": 0.79252017, + "num_input_tokens_seen": 347282336, + "router_z_loss_mlp": 0.23486328, + "step": 4185, + "time_per_iteration": 3.108370542526245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072015, + "balance_loss_mlp": 1.04811406, + "epoch": 0.8053097345132744, + "flos": 725687175168.0, + "grad_norm": 0.06261111839901821, + "language_loss": 0.77779377, + "learning_rate": 9.615175181617259e-05, + "loss": 0.78851396, + "num_input_tokens_seen": 347364800, + "router_z_loss_mlp": 0.23876953, + "step": 4186, + "time_per_iteration": 2.9714622497558594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071694, + "balance_loss_mlp": 1.04800677, + "epoch": 0.805502116198538, + "flos": 748050453504.0, + "grad_norm": 0.0639304252406746, + "language_loss": 0.81475556, + "learning_rate": 9.596814534898552e-05, + "loss": 0.82547259, + "num_input_tokens_seen": 347443328, + "router_z_loss_mlp": 0.23669434, + "step": 4187, + "time_per_iteration": 2.9930331707000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073306, + "balance_loss_mlp": 1.04930937, + "epoch": 0.8056944978838014, + "flos": 640258421760.0, + "grad_norm": 0.051419524714537625, + "language_loss": 0.87329555, + "learning_rate": 9.578469574087561e-05, + "loss": 0.88402855, + "num_input_tokens_seen": 347522064, + "router_z_loss_mlp": 0.23999023, + "step": 4188, + "time_per_iteration": 2.9604146480560303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069277, + "balance_loss_mlp": 1.04494643, + "epoch": 0.805886879569065, + "flos": 644631892992.0, + "grad_norm": 0.08161833889252136, + "language_loss": 0.78437293, + "learning_rate": 9.560140306306436e-05, + "loss": 0.7950657, + "num_input_tokens_seen": 347597200, + "router_z_loss_mlp": 0.2434082, + "step": 4189, + "time_per_iteration": 2.7467589378356934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067948, + "balance_loss_mlp": 1.04483366, + "epoch": 0.8060792612543286, + "flos": 661230812160.0, + "grad_norm": 0.06429171281300161, + "language_loss": 0.81920123, + "learning_rate": 9.541826738671233e-05, + "loss": 0.82988071, + "num_input_tokens_seen": 347676928, + "router_z_loss_mlp": 0.2310791, + "step": 4190, + "time_per_iteration": 2.8135294914245605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072966, + "balance_loss_mlp": 1.04954159, + "epoch": 0.8062716429395922, + "flos": 455075366400.0, + "grad_norm": 0.06490382146104284, + "language_loss": 0.8240993, + "learning_rate": 9.523528878291904e-05, + "loss": 0.83482891, + "num_input_tokens_seen": 347741552, + "router_z_loss_mlp": 0.23425293, + "step": 4191, + "time_per_iteration": 2.5631208419799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106835, + "balance_loss_mlp": 1.04356611, + "epoch": 0.8064640246248557, + "flos": 526407994368.0, + "grad_norm": 0.07814550212543987, + "language_loss": 0.85738069, + "learning_rate": 9.50524673227231e-05, + "loss": 0.86806417, + "num_input_tokens_seen": 347807008, + "router_z_loss_mlp": 0.2479248, + "step": 4192, + "time_per_iteration": 2.6771442890167236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075525, + "balance_loss_mlp": 1.05173111, + "epoch": 0.8066564063101193, + "flos": 865115458560.0, + "grad_norm": 0.05122743172221322, + "language_loss": 0.82006085, + "learning_rate": 9.486980307710208e-05, + "loss": 0.83081615, + "num_input_tokens_seen": 347895728, + "router_z_loss_mlp": 0.23803711, + "step": 4193, + "time_per_iteration": 3.1859543323516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106691, + "balance_loss_mlp": 1.04287791, + "epoch": 0.8068487879953828, + "flos": 530536614912.0, + "grad_norm": 0.05914433880976634, + "language_loss": 0.8218236, + "learning_rate": 9.468729611697246e-05, + "loss": 0.83249271, + "num_input_tokens_seen": 347970368, + "router_z_loss_mlp": 0.24035645, + "step": 4194, + "time_per_iteration": 2.6743924617767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069484, + "balance_loss_mlp": 1.04611874, + "epoch": 0.8070411696806464, + "flos": 566183291904.0, + "grad_norm": 0.053953829426493176, + "language_loss": 0.82204551, + "learning_rate": 9.450494651319003e-05, + "loss": 0.83274031, + "num_input_tokens_seen": 348039040, + "router_z_loss_mlp": 0.23364258, + "step": 4195, + "time_per_iteration": 2.651991605758667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068847, + "balance_loss_mlp": 1.04413462, + "epoch": 0.80723355136591, + "flos": 986591010816.0, + "grad_norm": 0.059924308887848395, + "language_loss": 0.7934891, + "learning_rate": 9.432275433654885e-05, + "loss": 0.80417752, + "num_input_tokens_seen": 348126064, + "router_z_loss_mlp": 0.24719238, + "step": 4196, + "time_per_iteration": 3.319042921066284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106911, + "balance_loss_mlp": 1.04500604, + "epoch": 0.8074259330511735, + "flos": 566961513984.0, + "grad_norm": 0.05558320714369874, + "language_loss": 0.83282971, + "learning_rate": 9.414071965778221e-05, + "loss": 0.84352082, + "num_input_tokens_seen": 348205888, + "router_z_loss_mlp": 0.24108887, + "step": 4197, + "time_per_iteration": 2.787029504776001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069448, + "balance_loss_mlp": 1.04553497, + "epoch": 0.8076183147364371, + "flos": 494662712832.0, + "grad_norm": 0.06138750227857287, + "language_loss": 0.79915768, + "learning_rate": 9.395884254756242e-05, + "loss": 0.80985212, + "num_input_tokens_seen": 348278608, + "router_z_loss_mlp": 0.23901367, + "step": 4198, + "time_per_iteration": 2.716717481613159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069564, + "balance_loss_mlp": 1.04573464, + "epoch": 0.8078106964217007, + "flos": 420011993088.0, + "grad_norm": 0.0692709388607304, + "language_loss": 0.80226934, + "learning_rate": 9.377712307650044e-05, + "loss": 0.81296504, + "num_input_tokens_seen": 348341312, + "router_z_loss_mlp": 0.23840332, + "step": 4199, + "time_per_iteration": 2.4748783111572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062956, + "balance_loss_mlp": 1.03986561, + "epoch": 0.8080030781069643, + "flos": 527537152512.0, + "grad_norm": 0.06697499154438591, + "language_loss": 0.83209741, + "learning_rate": 9.359556131514602e-05, + "loss": 0.84272695, + "num_input_tokens_seen": 348409184, + "router_z_loss_mlp": 0.23083496, + "step": 4200, + "time_per_iteration": 2.65059757232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069346, + "balance_loss_mlp": 1.04487228, + "epoch": 0.8081954597922277, + "flos": 544148554752.0, + "grad_norm": 0.07319165127998006, + "language_loss": 0.81825453, + "learning_rate": 9.341415733398733e-05, + "loss": 0.82894802, + "num_input_tokens_seen": 348480832, + "router_z_loss_mlp": 0.24462891, + "step": 4201, + "time_per_iteration": 2.6146020889282227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074978, + "balance_loss_mlp": 1.04963422, + "epoch": 0.8083878414774913, + "flos": 640900823040.0, + "grad_norm": 0.06857691771567244, + "language_loss": 0.75549376, + "learning_rate": 9.323291120345207e-05, + "loss": 0.76624352, + "num_input_tokens_seen": 348559232, + "router_z_loss_mlp": 0.25378418, + "step": 4202, + "time_per_iteration": 2.8163068294525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068191, + "balance_loss_mlp": 1.04393232, + "epoch": 0.8085802231627549, + "flos": 705614146560.0, + "grad_norm": 0.05917117219948211, + "language_loss": 0.72779202, + "learning_rate": 9.305182299390614e-05, + "loss": 0.73847389, + "num_input_tokens_seen": 348638960, + "router_z_loss_mlp": 0.24243164, + "step": 4203, + "time_per_iteration": 2.926943063735962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062675, + "balance_loss_mlp": 1.03952456, + "epoch": 0.8087726048480185, + "flos": 419762373120.0, + "grad_norm": 0.06288185583259268, + "language_loss": 0.88665909, + "learning_rate": 9.287089277565409e-05, + "loss": 0.89728582, + "num_input_tokens_seen": 348704816, + "router_z_loss_mlp": 0.23156738, + "step": 4204, + "time_per_iteration": 2.533940076828003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107244, + "balance_loss_mlp": 1.04898, + "epoch": 0.8089649865332821, + "flos": 508766178816.0, + "grad_norm": 0.04485916890924309, + "language_loss": 0.8704623, + "learning_rate": 9.269012061893922e-05, + "loss": 0.88118672, + "num_input_tokens_seen": 348783504, + "router_z_loss_mlp": 0.23461914, + "step": 4205, + "time_per_iteration": 2.8577754497528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074426, + "balance_loss_mlp": 1.05071497, + "epoch": 0.8091573682185456, + "flos": 457219883520.0, + "grad_norm": 0.06131410050585944, + "language_loss": 0.85401237, + "learning_rate": 9.250950659394386e-05, + "loss": 0.86475658, + "num_input_tokens_seen": 348858272, + "router_z_loss_mlp": 0.23706055, + "step": 4206, + "time_per_iteration": 2.739069700241089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067889, + "balance_loss_mlp": 1.04442883, + "epoch": 0.8093497499038091, + "flos": 525256441344.0, + "grad_norm": 0.058670433467628735, + "language_loss": 0.77132225, + "learning_rate": 9.232905077078824e-05, + "loss": 0.78200108, + "num_input_tokens_seen": 348934432, + "router_z_loss_mlp": 0.23449707, + "step": 4207, + "time_per_iteration": 2.7943315505981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107533, + "balance_loss_mlp": 1.05148828, + "epoch": 0.8095421315890727, + "flos": 489617478144.0, + "grad_norm": 0.060722658828799125, + "language_loss": 0.76975334, + "learning_rate": 9.214875321953164e-05, + "loss": 0.78050661, + "num_input_tokens_seen": 349003856, + "router_z_loss_mlp": 0.23840332, + "step": 4208, + "time_per_iteration": 2.628981351852417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072763, + "balance_loss_mlp": 1.04902852, + "epoch": 0.8097345132743363, + "flos": 625109861376.0, + "grad_norm": 0.06220580039777883, + "language_loss": 0.80972207, + "learning_rate": 9.196861401017164e-05, + "loss": 0.82044971, + "num_input_tokens_seen": 349080544, + "router_z_loss_mlp": 0.23706055, + "step": 4209, + "time_per_iteration": 2.7958450317382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074238, + "balance_loss_mlp": 1.05026519, + "epoch": 0.8099268949595998, + "flos": 615688584192.0, + "grad_norm": 0.06273370335614084, + "language_loss": 0.79599589, + "learning_rate": 9.178863321264475e-05, + "loss": 0.80673826, + "num_input_tokens_seen": 349159072, + "router_z_loss_mlp": 0.23962402, + "step": 4210, + "time_per_iteration": 2.7682175636291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077533, + "balance_loss_mlp": 1.0541079, + "epoch": 0.8101192766448634, + "flos": 479642632704.0, + "grad_norm": 0.06627219906647694, + "language_loss": 0.80003035, + "learning_rate": 9.160881089682566e-05, + "loss": 0.81080568, + "num_input_tokens_seen": 349230176, + "router_z_loss_mlp": 0.23400879, + "step": 4211, + "time_per_iteration": 2.631140947341919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074787, + "balance_loss_mlp": 1.05109978, + "epoch": 0.810311658330127, + "flos": 517327741440.0, + "grad_norm": 0.05849303544720674, + "language_loss": 0.86804986, + "learning_rate": 9.142914713252725e-05, + "loss": 0.87879777, + "num_input_tokens_seen": 349299760, + "router_z_loss_mlp": 0.23681641, + "step": 4212, + "time_per_iteration": 2.5948104858398438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071765, + "balance_loss_mlp": 1.04823303, + "epoch": 0.8105040400153906, + "flos": 575782235136.0, + "grad_norm": 0.06296476398611156, + "language_loss": 0.841757, + "learning_rate": 9.124964198950159e-05, + "loss": 0.85247463, + "num_input_tokens_seen": 349379712, + "router_z_loss_mlp": 0.23547363, + "step": 4213, + "time_per_iteration": 2.7866246700286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072973, + "balance_loss_mlp": 1.04913127, + "epoch": 0.8106964217006541, + "flos": 638963707392.0, + "grad_norm": 0.057390560702911333, + "language_loss": 0.85144347, + "learning_rate": 9.107029553743862e-05, + "loss": 0.8621732, + "num_input_tokens_seen": 349460320, + "router_z_loss_mlp": 0.23840332, + "step": 4214, + "time_per_iteration": 2.842522144317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079487, + "balance_loss_mlp": 1.05565715, + "epoch": 0.8108888033859176, + "flos": 579505964544.0, + "grad_norm": 0.06211235803289476, + "language_loss": 0.81593323, + "learning_rate": 9.089110784596672e-05, + "loss": 0.82672811, + "num_input_tokens_seen": 349527648, + "router_z_loss_mlp": 0.23815918, + "step": 4215, + "time_per_iteration": 2.700958251953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077635, + "balance_loss_mlp": 1.05348349, + "epoch": 0.8110811850711812, + "flos": 559907209728.0, + "grad_norm": 0.05316807102824332, + "language_loss": 0.8362149, + "learning_rate": 9.071207898465284e-05, + "loss": 0.8469913, + "num_input_tokens_seen": 349606912, + "router_z_loss_mlp": 0.24157715, + "step": 4216, + "time_per_iteration": 2.8000106811523438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003326, + "balance_loss_mlp": 0.996364, + "epoch": 0.8112735667564448, + "flos": 1517939979264.0, + "grad_norm": 0.009729060219912019, + "language_loss": 0.77260417, + "learning_rate": 9.053320902300205e-05, + "loss": 0.78263742, + "num_input_tokens_seen": 349827040, + "router_z_loss_mlp": 0.06982422, + "step": 4217, + "time_per_iteration": 5.381849765777588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078499, + "balance_loss_mlp": 1.05508685, + "epoch": 0.8114659484417084, + "flos": 616340897280.0, + "grad_norm": 0.07154443867922712, + "language_loss": 0.85639107, + "learning_rate": 9.035449803045792e-05, + "loss": 0.86717606, + "num_input_tokens_seen": 349900080, + "router_z_loss_mlp": 0.23400879, + "step": 4218, + "time_per_iteration": 2.835637092590332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070157, + "balance_loss_mlp": 1.04660106, + "epoch": 0.8116583301269719, + "flos": 649951340544.0, + "grad_norm": 0.056455139203309426, + "language_loss": 0.79380965, + "learning_rate": 9.017594607640211e-05, + "loss": 0.80451119, + "num_input_tokens_seen": 349983568, + "router_z_loss_mlp": 0.23535156, + "step": 4219, + "time_per_iteration": 2.942309617996216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074419, + "balance_loss_mlp": 1.05086303, + "epoch": 0.8118507118122354, + "flos": 553087844352.0, + "grad_norm": 0.06424342270235255, + "language_loss": 0.80477631, + "learning_rate": 8.999755323015463e-05, + "loss": 0.81552052, + "num_input_tokens_seen": 350054928, + "router_z_loss_mlp": 0.23535156, + "step": 4220, + "time_per_iteration": 2.708371877670288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069972, + "balance_loss_mlp": 1.04653561, + "epoch": 0.812043093497499, + "flos": 544118819328.0, + "grad_norm": 0.07378899764991645, + "language_loss": 0.87550426, + "learning_rate": 8.981931956097384e-05, + "loss": 0.88620389, + "num_input_tokens_seen": 350127872, + "router_z_loss_mlp": 0.234375, + "step": 4221, + "time_per_iteration": 2.6693153381347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069628, + "balance_loss_mlp": 1.04652536, + "epoch": 0.8122354751827626, + "flos": 583404788736.0, + "grad_norm": 0.054351261228181624, + "language_loss": 0.83540028, + "learning_rate": 8.964124513805628e-05, + "loss": 0.84609658, + "num_input_tokens_seen": 350206592, + "router_z_loss_mlp": 0.23095703, + "step": 4222, + "time_per_iteration": 2.7870144844055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003984, + "balance_loss_mlp": 0.99706995, + "epoch": 0.8124278568680262, + "flos": 1530568120320.0, + "grad_norm": 0.009675250582902717, + "language_loss": 0.78250074, + "learning_rate": 8.94633300305363e-05, + "loss": 0.79254061, + "num_input_tokens_seen": 350436048, + "router_z_loss_mlp": 0.06933594, + "step": 4223, + "time_per_iteration": 4.929150581359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073586, + "balance_loss_mlp": 1.04953003, + "epoch": 0.8126202385532897, + "flos": 432865161216.0, + "grad_norm": 0.06048023651851461, + "language_loss": 0.80246794, + "learning_rate": 8.928557430748668e-05, + "loss": 0.81320387, + "num_input_tokens_seen": 350501376, + "router_z_loss_mlp": 0.24047852, + "step": 4224, + "time_per_iteration": 2.558927059173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005286, + "balance_loss_mlp": 0.99841946, + "epoch": 0.8128126202385533, + "flos": 1547905987584.0, + "grad_norm": 0.007942256322590725, + "language_loss": 0.76495624, + "learning_rate": 8.910797803791854e-05, + "loss": 0.77500916, + "num_input_tokens_seen": 350735232, + "router_z_loss_mlp": 0.06884766, + "step": 4225, + "time_per_iteration": 4.829640865325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107197, + "balance_loss_mlp": 1.04875958, + "epoch": 0.8130050019238169, + "flos": 528317945856.0, + "grad_norm": 0.06071065403013227, + "language_loss": 0.89078009, + "learning_rate": 8.893054129078077e-05, + "loss": 0.90149975, + "num_input_tokens_seen": 350805088, + "router_z_loss_mlp": 0.23217773, + "step": 4226, + "time_per_iteration": 2.6663520336151123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074451, + "balance_loss_mlp": 1.04990602, + "epoch": 0.8131973836090804, + "flos": 543125481984.0, + "grad_norm": 0.05854730036453429, + "language_loss": 0.80283505, + "learning_rate": 8.875326413496037e-05, + "loss": 0.81357956, + "num_input_tokens_seen": 350876896, + "router_z_loss_mlp": 0.24523926, + "step": 4227, + "time_per_iteration": 2.737178087234497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010742, + "balance_loss_mlp": 1.0496552, + "epoch": 0.8133897652943439, + "flos": 576494019072.0, + "grad_norm": 0.07250995282732464, + "language_loss": 0.82311916, + "learning_rate": 8.857614663928249e-05, + "loss": 0.83386123, + "num_input_tokens_seen": 350948400, + "router_z_loss_mlp": 0.2454834, + "step": 4228, + "time_per_iteration": 2.7085983753204346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072354, + "balance_loss_mlp": 1.0487864, + "epoch": 0.8135821469796075, + "flos": 579219268608.0, + "grad_norm": 0.06181457109601203, + "language_loss": 0.79171389, + "learning_rate": 8.839918887251025e-05, + "loss": 0.80243742, + "num_input_tokens_seen": 351023328, + "router_z_loss_mlp": 0.23571777, + "step": 4229, + "time_per_iteration": 2.752716541290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107323, + "balance_loss_mlp": 1.04960322, + "epoch": 0.8137745286648711, + "flos": 650346693120.0, + "grad_norm": 0.05392017949015088, + "language_loss": 0.84132361, + "learning_rate": 8.822239090334472e-05, + "loss": 0.85205585, + "num_input_tokens_seen": 351108672, + "router_z_loss_mlp": 0.23620605, + "step": 4230, + "time_per_iteration": 2.934464931488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107431, + "balance_loss_mlp": 1.04952633, + "epoch": 0.8139669103501347, + "flos": 701888219136.0, + "grad_norm": 0.059650707054353234, + "language_loss": 0.75657654, + "learning_rate": 8.804575280042493e-05, + "loss": 0.76731968, + "num_input_tokens_seen": 351185056, + "router_z_loss_mlp": 0.2479248, + "step": 4231, + "time_per_iteration": 2.8997764587402344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107496, + "balance_loss_mlp": 1.05128467, + "epoch": 0.8141592920353983, + "flos": 650223355392.0, + "grad_norm": 0.08031341432177187, + "language_loss": 0.83293855, + "learning_rate": 8.786927463232774e-05, + "loss": 0.84368813, + "num_input_tokens_seen": 351255856, + "router_z_loss_mlp": 0.23657227, + "step": 4232, + "time_per_iteration": 2.823725938796997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071255, + "balance_loss_mlp": 1.04657876, + "epoch": 0.8143516737206618, + "flos": 536829949440.0, + "grad_norm": 0.06617341429663869, + "language_loss": 0.81372333, + "learning_rate": 8.769295646756853e-05, + "loss": 0.82443595, + "num_input_tokens_seen": 351322336, + "router_z_loss_mlp": 0.24694824, + "step": 4233, + "time_per_iteration": 2.596062660217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076841, + "balance_loss_mlp": 1.05249834, + "epoch": 0.8145440554059253, + "flos": 508366056960.0, + "grad_norm": 0.06250257700047851, + "language_loss": 0.82593143, + "learning_rate": 8.751679837459963e-05, + "loss": 0.83669984, + "num_input_tokens_seen": 351387440, + "router_z_loss_mlp": 0.2434082, + "step": 4234, + "time_per_iteration": 2.573758602142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072995, + "balance_loss_mlp": 1.04948664, + "epoch": 0.8147364370911889, + "flos": 635032576512.0, + "grad_norm": 0.05248303737167748, + "language_loss": 0.86687446, + "learning_rate": 8.734080042181181e-05, + "loss": 0.87760437, + "num_input_tokens_seen": 351464192, + "router_z_loss_mlp": 0.23498535, + "step": 4235, + "time_per_iteration": 2.8768227100372314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071616, + "balance_loss_mlp": 1.04670143, + "epoch": 0.8149288187764525, + "flos": 422801482752.0, + "grad_norm": 0.0632046895747122, + "language_loss": 0.78667194, + "learning_rate": 8.716496267753343e-05, + "loss": 0.79738808, + "num_input_tokens_seen": 351528016, + "router_z_loss_mlp": 0.24926758, + "step": 4236, + "time_per_iteration": 2.479666233062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072748, + "balance_loss_mlp": 1.05016923, + "epoch": 0.8151212004617161, + "flos": 597444014592.0, + "grad_norm": 0.05718912918333018, + "language_loss": 0.81758833, + "learning_rate": 8.698928521003097e-05, + "loss": 0.82831585, + "num_input_tokens_seen": 351601648, + "router_z_loss_mlp": 0.22583008, + "step": 4237, + "time_per_iteration": 2.7750775814056396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010102, + "balance_loss_mlp": 1.00333071, + "epoch": 0.8153135821469796, + "flos": 1479330915840.0, + "grad_norm": 0.007872761056194484, + "language_loss": 0.77852845, + "learning_rate": 8.681376808750835e-05, + "loss": 0.78862947, + "num_input_tokens_seen": 351826720, + "router_z_loss_mlp": 0.06787109, + "step": 4238, + "time_per_iteration": 4.9892895221710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070513, + "balance_loss_mlp": 1.04645634, + "epoch": 0.8155059638322432, + "flos": 437097669120.0, + "grad_norm": 0.058923617839845364, + "language_loss": 0.82525653, + "learning_rate": 8.663841137810741e-05, + "loss": 0.83596164, + "num_input_tokens_seen": 351891760, + "router_z_loss_mlp": 0.24035645, + "step": 4239, + "time_per_iteration": 2.5066046714782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070695, + "balance_loss_mlp": 1.04635239, + "epoch": 0.8156983455175068, + "flos": 794390727168.0, + "grad_norm": 0.05678086613450876, + "language_loss": 0.85605669, + "learning_rate": 8.646321514990763e-05, + "loss": 0.86676371, + "num_input_tokens_seen": 351977504, + "router_z_loss_mlp": 0.24328613, + "step": 4240, + "time_per_iteration": 3.0692131519317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073498, + "balance_loss_mlp": 1.0495373, + "epoch": 0.8158907272027703, + "flos": 685986029568.0, + "grad_norm": 0.059688242669239784, + "language_loss": 0.81930083, + "learning_rate": 8.628817947092616e-05, + "loss": 0.83003575, + "num_input_tokens_seen": 352050176, + "router_z_loss_mlp": 0.23974609, + "step": 4241, + "time_per_iteration": 2.845006227493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074554, + "balance_loss_mlp": 1.05058062, + "epoch": 0.8160831088880338, + "flos": 487055213568.0, + "grad_norm": 0.07205757206043806, + "language_loss": 0.8466413, + "learning_rate": 8.611330440911797e-05, + "loss": 0.85738689, + "num_input_tokens_seen": 352116848, + "router_z_loss_mlp": 0.23974609, + "step": 4242, + "time_per_iteration": 2.540684461593628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076416, + "balance_loss_mlp": 1.0526818, + "epoch": 0.8162754905732974, + "flos": 464872172544.0, + "grad_norm": 0.060485918814619816, + "language_loss": 0.80289745, + "learning_rate": 8.593859003237558e-05, + "loss": 0.81366158, + "num_input_tokens_seen": 352185056, + "router_z_loss_mlp": 0.23730469, + "step": 4243, + "time_per_iteration": 2.5800933837890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010569, + "balance_loss_mlp": 1.00384557, + "epoch": 0.816467872258561, + "flos": 1239530522112.0, + "grad_norm": 0.005783108588514147, + "language_loss": 0.75285125, + "learning_rate": 8.576403640852904e-05, + "loss": 0.76295686, + "num_input_tokens_seen": 352397648, + "router_z_loss_mlp": 0.06738281, + "step": 4244, + "time_per_iteration": 4.735422611236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073017, + "balance_loss_mlp": 1.04923451, + "epoch": 0.8166602539438246, + "flos": 687169516032.0, + "grad_norm": 0.058029924379859654, + "language_loss": 0.86948967, + "learning_rate": 8.558964360534615e-05, + "loss": 0.88021982, + "num_input_tokens_seen": 352478272, + "router_z_loss_mlp": 0.23779297, + "step": 4245, + "time_per_iteration": 2.920170783996582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011667, + "balance_loss_mlp": 1.00494385, + "epoch": 0.8168526356290882, + "flos": 1490520807936.0, + "grad_norm": 0.004946966844072032, + "language_loss": 0.72974741, + "learning_rate": 8.541541169053219e-05, + "loss": 0.73986411, + "num_input_tokens_seen": 352707104, + "router_z_loss_mlp": 0.06738281, + "step": 4246, + "time_per_iteration": 4.9647252559661865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073957, + "balance_loss_mlp": 1.0499959, + "epoch": 0.8170450173143516, + "flos": 578201338368.0, + "grad_norm": 0.05414239297152735, + "language_loss": 0.84796524, + "learning_rate": 8.524134073172984e-05, + "loss": 0.85870481, + "num_input_tokens_seen": 352779248, + "router_z_loss_mlp": 0.23950195, + "step": 4247, + "time_per_iteration": 2.730175256729126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073063, + "balance_loss_mlp": 1.04956651, + "epoch": 0.8172373989996152, + "flos": 571275514368.0, + "grad_norm": 0.0616901984083448, + "language_loss": 0.84620667, + "learning_rate": 8.506743079651974e-05, + "loss": 0.85693735, + "num_input_tokens_seen": 352856784, + "router_z_loss_mlp": 0.23486328, + "step": 4248, + "time_per_iteration": 2.741957664489746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076046, + "balance_loss_mlp": 1.05173934, + "epoch": 0.8174297806848788, + "flos": 528831866880.0, + "grad_norm": 0.06067464626133788, + "language_loss": 0.81253791, + "learning_rate": 8.489368195241948e-05, + "loss": 0.82329834, + "num_input_tokens_seen": 352926496, + "router_z_loss_mlp": 0.24304199, + "step": 4249, + "time_per_iteration": 2.7099785804748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069837, + "balance_loss_mlp": 1.04637706, + "epoch": 0.8176221623701424, + "flos": 569108602368.0, + "grad_norm": 0.057171818885021125, + "language_loss": 0.7918399, + "learning_rate": 8.47200942668846e-05, + "loss": 0.80253828, + "num_input_tokens_seen": 353005312, + "router_z_loss_mlp": 0.23461914, + "step": 4250, + "time_per_iteration": 2.7882819175720215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074681, + "balance_loss_mlp": 1.05128062, + "epoch": 0.8178145440554059, + "flos": 656521459200.0, + "grad_norm": 0.07586143278279464, + "language_loss": 0.80699354, + "learning_rate": 8.454666780730735e-05, + "loss": 0.81774032, + "num_input_tokens_seen": 353085120, + "router_z_loss_mlp": 0.23388672, + "step": 4251, + "time_per_iteration": 2.8482766151428223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069339, + "balance_loss_mlp": 1.04569983, + "epoch": 0.8180069257406695, + "flos": 545924883456.0, + "grad_norm": 0.0891506562399526, + "language_loss": 0.87943554, + "learning_rate": 8.437340264101828e-05, + "loss": 0.89012891, + "num_input_tokens_seen": 353160992, + "router_z_loss_mlp": 0.23657227, + "step": 4252, + "time_per_iteration": 2.690966844558716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070632, + "balance_loss_mlp": 1.04671824, + "epoch": 0.818199307425933, + "flos": 619271350272.0, + "grad_norm": 0.06457353202356947, + "language_loss": 0.84974635, + "learning_rate": 8.420029883528474e-05, + "loss": 0.86045271, + "num_input_tokens_seen": 353233328, + "router_z_loss_mlp": 0.23876953, + "step": 4253, + "time_per_iteration": 2.7346584796905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107695, + "balance_loss_mlp": 1.05236959, + "epoch": 0.8183916891111966, + "flos": 647618872320.0, + "grad_norm": 0.058517110393729366, + "language_loss": 0.7747196, + "learning_rate": 8.402735645731157e-05, + "loss": 0.78548908, + "num_input_tokens_seen": 353310592, + "router_z_loss_mlp": 0.24560547, + "step": 4254, + "time_per_iteration": 2.902817964553833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073698, + "balance_loss_mlp": 1.05016649, + "epoch": 0.8185840707964602, + "flos": 499120247808.0, + "grad_norm": 0.0672149584819886, + "language_loss": 0.78553545, + "learning_rate": 8.385457557424098e-05, + "loss": 0.79627252, + "num_input_tokens_seen": 353376544, + "router_z_loss_mlp": 0.23510742, + "step": 4255, + "time_per_iteration": 2.5662145614624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068342, + "balance_loss_mlp": 1.04424965, + "epoch": 0.8187764524817237, + "flos": 786229659648.0, + "grad_norm": 0.04856216619330027, + "language_loss": 0.79861438, + "learning_rate": 8.368195625315251e-05, + "loss": 0.8092978, + "num_input_tokens_seen": 353461200, + "router_z_loss_mlp": 0.24084473, + "step": 4256, + "time_per_iteration": 3.066568374633789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072675, + "balance_loss_mlp": 1.04816532, + "epoch": 0.8189688341669873, + "flos": 550710959616.0, + "grad_norm": 0.0482875627424869, + "language_loss": 0.8077091, + "learning_rate": 8.350949856106283e-05, + "loss": 0.81843579, + "num_input_tokens_seen": 353538608, + "router_z_loss_mlp": 0.24487305, + "step": 4257, + "time_per_iteration": 2.8208279609680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008421, + "balance_loss_mlp": 1.00174499, + "epoch": 0.8191612158522509, + "flos": 1351972435968.0, + "grad_norm": 0.0038578999028146157, + "language_loss": 0.71149343, + "learning_rate": 8.333720256492599e-05, + "loss": 0.7215777, + "num_input_tokens_seen": 353766960, + "router_z_loss_mlp": 0.06689453, + "step": 4258, + "time_per_iteration": 4.855503082275391 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069037, + "balance_loss_mlp": 1.04530239, + "epoch": 0.8193535975375145, + "flos": 544257211392.0, + "grad_norm": 0.06050178824041954, + "language_loss": 0.84065247, + "learning_rate": 8.316506833163318e-05, + "loss": 0.85134286, + "num_input_tokens_seen": 353833552, + "router_z_loss_mlp": 0.23718262, + "step": 4259, + "time_per_iteration": 2.6480743885040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106949, + "balance_loss_mlp": 1.04551733, + "epoch": 0.8195459792227779, + "flos": 865733266944.0, + "grad_norm": 0.053169941526036414, + "language_loss": 0.85682011, + "learning_rate": 8.299309592801297e-05, + "loss": 0.86751509, + "num_input_tokens_seen": 353915520, + "router_z_loss_mlp": 0.23986816, + "step": 4260, + "time_per_iteration": 3.0866482257843018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073976, + "balance_loss_mlp": 1.05007422, + "epoch": 0.8197383609080415, + "flos": 569293982208.0, + "grad_norm": 0.061667842904939214, + "language_loss": 0.81912506, + "learning_rate": 8.282128542083101e-05, + "loss": 0.8298648, + "num_input_tokens_seen": 353992048, + "router_z_loss_mlp": 0.2388916, + "step": 4261, + "time_per_iteration": 2.675989866256714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106688, + "balance_loss_mlp": 1.04430175, + "epoch": 0.8199307425933051, + "flos": 530813399040.0, + "grad_norm": 0.053936985731850594, + "language_loss": 0.84986472, + "learning_rate": 8.264963687678978e-05, + "loss": 0.86053348, + "num_input_tokens_seen": 354064848, + "router_z_loss_mlp": 0.22583008, + "step": 4262, + "time_per_iteration": 2.6208925247192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073879, + "balance_loss_mlp": 1.04916656, + "epoch": 0.8201231242785687, + "flos": 567070170624.0, + "grad_norm": 0.053552526024574415, + "language_loss": 0.85210896, + "learning_rate": 8.247815036252921e-05, + "loss": 0.86284775, + "num_input_tokens_seen": 354138848, + "router_z_loss_mlp": 0.24719238, + "step": 4263, + "time_per_iteration": 2.716259717941284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070202, + "balance_loss_mlp": 1.04670596, + "epoch": 0.8203155059638323, + "flos": 1230505717248.0, + "grad_norm": 0.061528756897057495, + "language_loss": 0.82971454, + "learning_rate": 8.230682594462652e-05, + "loss": 0.84041655, + "num_input_tokens_seen": 354227696, + "router_z_loss_mlp": 0.23474121, + "step": 4264, + "time_per_iteration": 3.53371000289917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070087, + "balance_loss_mlp": 1.04562545, + "epoch": 0.8205078876490958, + "flos": 574198626816.0, + "grad_norm": 0.05437009195264598, + "language_loss": 0.80134302, + "learning_rate": 8.213566368959558e-05, + "loss": 0.81204391, + "num_input_tokens_seen": 354298400, + "router_z_loss_mlp": 0.24462891, + "step": 4265, + "time_per_iteration": 2.678452730178833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068039, + "balance_loss_mlp": 1.04403043, + "epoch": 0.8207002693343594, + "flos": 931400280576.0, + "grad_norm": 0.05700879545907034, + "language_loss": 0.78525734, + "learning_rate": 8.196466366388744e-05, + "loss": 0.79593778, + "num_input_tokens_seen": 354385024, + "router_z_loss_mlp": 0.2401123, + "step": 4266, + "time_per_iteration": 3.185957670211792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106677, + "balance_loss_mlp": 1.04407215, + "epoch": 0.8208926510196229, + "flos": 549571889664.0, + "grad_norm": 0.05502076151782196, + "language_loss": 0.80635643, + "learning_rate": 8.179382593389029e-05, + "loss": 0.81702411, + "num_input_tokens_seen": 354456384, + "router_z_loss_mlp": 0.22692871, + "step": 4267, + "time_per_iteration": 2.6516170501708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070354, + "balance_loss_mlp": 1.04677415, + "epoch": 0.8210850327048865, + "flos": 648182352384.0, + "grad_norm": 0.05163863039989425, + "language_loss": 0.82111454, + "learning_rate": 8.162315056592918e-05, + "loss": 0.8318181, + "num_input_tokens_seen": 354531296, + "router_z_loss_mlp": 0.23596191, + "step": 4268, + "time_per_iteration": 2.8418874740600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068732, + "balance_loss_mlp": 1.04558122, + "epoch": 0.82127741439015, + "flos": 601520878080.0, + "grad_norm": 0.0584034837454372, + "language_loss": 0.81278813, + "learning_rate": 8.145263762626615e-05, + "loss": 0.82347548, + "num_input_tokens_seen": 354605680, + "router_z_loss_mlp": 0.23144531, + "step": 4269, + "time_per_iteration": 2.7613987922668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070732, + "balance_loss_mlp": 1.04777241, + "epoch": 0.8214697960754136, + "flos": 474831963648.0, + "grad_norm": 0.09670213264130083, + "language_loss": 0.8378197, + "learning_rate": 8.128228718110015e-05, + "loss": 0.84852707, + "num_input_tokens_seen": 354678160, + "router_z_loss_mlp": 0.22973633, + "step": 4270, + "time_per_iteration": 2.718817949295044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066128, + "balance_loss_mlp": 1.04242873, + "epoch": 0.8216621777606772, + "flos": 903648172032.0, + "grad_norm": 0.06875355685078723, + "language_loss": 0.85044003, + "learning_rate": 8.11120992965671e-05, + "loss": 0.86110133, + "num_input_tokens_seen": 354751024, + "router_z_loss_mlp": 0.23693848, + "step": 4271, + "time_per_iteration": 3.1280128955841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068814, + "balance_loss_mlp": 1.04395938, + "epoch": 0.8218545594459408, + "flos": 514461528576.0, + "grad_norm": 0.08013013312339903, + "language_loss": 0.82012117, + "learning_rate": 8.094207403873998e-05, + "loss": 0.83080924, + "num_input_tokens_seen": 354819408, + "router_z_loss_mlp": 0.2487793, + "step": 4272, + "time_per_iteration": 2.6690683364868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070725, + "balance_loss_mlp": 1.04672813, + "epoch": 0.8220469411312044, + "flos": 494536803840.0, + "grad_norm": 0.05213060875832122, + "language_loss": 0.863258, + "learning_rate": 8.077221147362829e-05, + "loss": 0.87396526, + "num_input_tokens_seen": 354887376, + "router_z_loss_mlp": 0.23986816, + "step": 4273, + "time_per_iteration": 2.6432783603668213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107137, + "balance_loss_mlp": 1.04725409, + "epoch": 0.8222393228164678, + "flos": 386433483264.0, + "grad_norm": 0.07702362963996895, + "language_loss": 0.89937019, + "learning_rate": 8.060251166717835e-05, + "loss": 0.91008389, + "num_input_tokens_seen": 354948288, + "router_z_loss_mlp": 0.24108887, + "step": 4274, + "time_per_iteration": 2.4411094188690186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075339, + "balance_loss_mlp": 1.0512464, + "epoch": 0.8224317045017314, + "flos": 536590241280.0, + "grad_norm": 0.05073133205853468, + "language_loss": 0.87501049, + "learning_rate": 8.043297468527383e-05, + "loss": 0.88576388, + "num_input_tokens_seen": 355016912, + "router_z_loss_mlp": 0.2409668, + "step": 4275, + "time_per_iteration": 2.6580936908721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072046, + "balance_loss_mlp": 1.04795384, + "epoch": 0.822624086186995, + "flos": 554899051008.0, + "grad_norm": 0.061816323283698855, + "language_loss": 0.82731915, + "learning_rate": 8.02636005937346e-05, + "loss": 0.83803964, + "num_input_tokens_seen": 355085936, + "router_z_loss_mlp": 0.24084473, + "step": 4276, + "time_per_iteration": 2.639394760131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071877, + "balance_loss_mlp": 1.04751015, + "epoch": 0.8228164678722586, + "flos": 539579791872.0, + "grad_norm": 0.05838521013995571, + "language_loss": 0.80136567, + "learning_rate": 8.009438945831771e-05, + "loss": 0.81208444, + "num_input_tokens_seen": 355161984, + "router_z_loss_mlp": 0.24377441, + "step": 4277, + "time_per_iteration": 2.7059576511383057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074492, + "balance_loss_mlp": 1.0514127, + "epoch": 0.8230088495575221, + "flos": 473253124608.0, + "grad_norm": 0.060736883959351555, + "language_loss": 0.79328179, + "learning_rate": 7.992534134471641e-05, + "loss": 0.80402672, + "num_input_tokens_seen": 355234544, + "router_z_loss_mlp": 0.23083496, + "step": 4278, + "time_per_iteration": 2.6723952293395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074832, + "balance_loss_mlp": 1.05057299, + "epoch": 0.8232012312427857, + "flos": 591672314880.0, + "grad_norm": 0.07690493497404575, + "language_loss": 0.8312307, + "learning_rate": 7.975645631856127e-05, + "loss": 0.84197903, + "num_input_tokens_seen": 355302896, + "router_z_loss_mlp": 0.24255371, + "step": 4279, + "time_per_iteration": 2.6807098388671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070086, + "balance_loss_mlp": 1.04629207, + "epoch": 0.8233936129280492, + "flos": 572644380672.0, + "grad_norm": 0.06508062719684189, + "language_loss": 0.74714917, + "learning_rate": 7.958773444541916e-05, + "loss": 0.75785005, + "num_input_tokens_seen": 355377040, + "router_z_loss_mlp": 0.23791504, + "step": 4280, + "time_per_iteration": 2.741769790649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074321, + "balance_loss_mlp": 1.05055034, + "epoch": 0.8235859946133128, + "flos": 731337735168.0, + "grad_norm": 0.0503633953001768, + "language_loss": 0.78454876, + "learning_rate": 7.941917579079383e-05, + "loss": 0.7952919, + "num_input_tokens_seen": 355461616, + "router_z_loss_mlp": 0.23779297, + "step": 4281, + "time_per_iteration": 3.003262758255005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074412, + "balance_loss_mlp": 1.05114222, + "epoch": 0.8237783762985764, + "flos": 570314483712.0, + "grad_norm": 0.06906172818982871, + "language_loss": 0.81415063, + "learning_rate": 7.92507804201253e-05, + "loss": 0.82489479, + "num_input_tokens_seen": 355532480, + "router_z_loss_mlp": 0.23278809, + "step": 4282, + "time_per_iteration": 2.717515230178833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006714, + "balance_loss_mlp": 0.99979985, + "epoch": 0.8239707579838399, + "flos": 1466232897024.0, + "grad_norm": 0.008081480434335962, + "language_loss": 0.75297678, + "learning_rate": 7.908254839879092e-05, + "loss": 0.76304388, + "num_input_tokens_seen": 355768752, + "router_z_loss_mlp": 0.06933594, + "step": 4283, + "time_per_iteration": 5.022932767868042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070622, + "balance_loss_mlp": 1.046947, + "epoch": 0.8241631396691035, + "flos": 467313297408.0, + "grad_norm": 0.07325194572667328, + "language_loss": 0.80727637, + "learning_rate": 7.89144797921037e-05, + "loss": 0.81798261, + "num_input_tokens_seen": 355838800, + "router_z_loss_mlp": 0.23657227, + "step": 4284, + "time_per_iteration": 2.6633455753326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007071, + "balance_loss_mlp": 1.0002048, + "epoch": 0.8243555213543671, + "flos": 1539426290688.0, + "grad_norm": 0.007371028264575339, + "language_loss": 0.77934271, + "learning_rate": 7.874657466531388e-05, + "loss": 0.78941345, + "num_input_tokens_seen": 356069280, + "router_z_loss_mlp": 0.06884766, + "step": 4285, + "time_per_iteration": 4.923422336578369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070494, + "balance_loss_mlp": 1.04681873, + "epoch": 0.8245479030396307, + "flos": 797429836800.0, + "grad_norm": 0.052917023765971946, + "language_loss": 0.82609844, + "learning_rate": 7.85788330836078e-05, + "loss": 0.83680344, + "num_input_tokens_seen": 356164528, + "router_z_loss_mlp": 0.23657227, + "step": 4286, + "time_per_iteration": 3.1522793769836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070568, + "balance_loss_mlp": 1.0472033, + "epoch": 0.8247402847248941, + "flos": 646114185216.0, + "grad_norm": 0.05594284535828706, + "language_loss": 0.76698542, + "learning_rate": 7.841125511210878e-05, + "loss": 0.77769113, + "num_input_tokens_seen": 356243600, + "router_z_loss_mlp": 0.23352051, + "step": 4287, + "time_per_iteration": 2.8717780113220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070808, + "balance_loss_mlp": 1.04741931, + "epoch": 0.8249326664101577, + "flos": 604421595648.0, + "grad_norm": 0.056506252147876544, + "language_loss": 0.79746181, + "learning_rate": 7.824384081587637e-05, + "loss": 0.80816984, + "num_input_tokens_seen": 356320320, + "router_z_loss_mlp": 0.23388672, + "step": 4288, + "time_per_iteration": 2.810127019882202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073495, + "balance_loss_mlp": 1.05003452, + "epoch": 0.8251250480954213, + "flos": 824369218560.0, + "grad_norm": 0.12017691240559532, + "language_loss": 0.86732876, + "learning_rate": 7.807659025990637e-05, + "loss": 0.87806374, + "num_input_tokens_seen": 356406928, + "router_z_loss_mlp": 0.23449707, + "step": 4289, + "time_per_iteration": 3.084991693496704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069032, + "balance_loss_mlp": 1.04540503, + "epoch": 0.8253174297806849, + "flos": 757382897664.0, + "grad_norm": 0.06647761450900506, + "language_loss": 0.78061903, + "learning_rate": 7.790950350913112e-05, + "loss": 0.79130936, + "num_input_tokens_seen": 356481456, + "router_z_loss_mlp": 0.23620605, + "step": 4290, + "time_per_iteration": 2.902520179748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070801, + "balance_loss_mlp": 1.047961, + "epoch": 0.8255098114659485, + "flos": 794469648384.0, + "grad_norm": 0.05336857596764246, + "language_loss": 0.87808669, + "learning_rate": 7.774258062841971e-05, + "loss": 0.88879466, + "num_input_tokens_seen": 356568736, + "router_z_loss_mlp": 0.22851562, + "step": 4291, + "time_per_iteration": 3.148972749710083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069085, + "balance_loss_mlp": 1.04600561, + "epoch": 0.825702193151212, + "flos": 710417475072.0, + "grad_norm": 0.062415193374422175, + "language_loss": 0.77379918, + "learning_rate": 7.757582168257731e-05, + "loss": 0.78448999, + "num_input_tokens_seen": 356643328, + "router_z_loss_mlp": 0.23071289, + "step": 4292, + "time_per_iteration": 2.851947784423828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069176, + "balance_loss_mlp": 1.04618037, + "epoch": 0.8258945748364755, + "flos": 683394029568.0, + "grad_norm": 0.06038634003775471, + "language_loss": 0.80922878, + "learning_rate": 7.740922673634537e-05, + "loss": 0.81992054, + "num_input_tokens_seen": 356723824, + "router_z_loss_mlp": 0.2298584, + "step": 4293, + "time_per_iteration": 2.892784357070923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073009, + "balance_loss_mlp": 1.04934597, + "epoch": 0.8260869565217391, + "flos": 594563120640.0, + "grad_norm": 0.05548696520695019, + "language_loss": 0.78991163, + "learning_rate": 7.724279585440186e-05, + "loss": 0.80064166, + "num_input_tokens_seen": 356796512, + "router_z_loss_mlp": 0.23669434, + "step": 4294, + "time_per_iteration": 2.72245192527771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072099, + "balance_loss_mlp": 1.04767334, + "epoch": 0.8262793382070027, + "flos": 651480993792.0, + "grad_norm": 0.06062374979520041, + "language_loss": 0.8531217, + "learning_rate": 7.707652910136098e-05, + "loss": 0.86384273, + "num_input_tokens_seen": 356868624, + "router_z_loss_mlp": 0.2442627, + "step": 4295, + "time_per_iteration": 2.771437406539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068167, + "balance_loss_mlp": 1.04420578, + "epoch": 0.8264717198922663, + "flos": 538922709504.0, + "grad_norm": 0.08624043107385752, + "language_loss": 0.84494382, + "learning_rate": 7.691042654177315e-05, + "loss": 0.85562551, + "num_input_tokens_seen": 356934368, + "router_z_loss_mlp": 0.23937988, + "step": 4296, + "time_per_iteration": 2.6370511054992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070653, + "balance_loss_mlp": 1.04687035, + "epoch": 0.8266641015775298, + "flos": 538949873664.0, + "grad_norm": 0.06654656411960891, + "language_loss": 0.75516403, + "learning_rate": 7.674448824012514e-05, + "loss": 0.76587057, + "num_input_tokens_seen": 357005536, + "router_z_loss_mlp": 0.23742676, + "step": 4297, + "time_per_iteration": 2.6292786598205566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074873, + "balance_loss_mlp": 1.05018449, + "epoch": 0.8268564832627934, + "flos": 585361728000.0, + "grad_norm": 0.053180571919213125, + "language_loss": 0.84044874, + "learning_rate": 7.657871426083979e-05, + "loss": 0.85119742, + "num_input_tokens_seen": 357082160, + "router_z_loss_mlp": 0.24682617, + "step": 4298, + "time_per_iteration": 2.7611520290374756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069074, + "balance_loss_mlp": 1.04500568, + "epoch": 0.827048864948057, + "flos": 430661173248.0, + "grad_norm": 0.0881519445405178, + "language_loss": 0.84719664, + "learning_rate": 7.641310466827667e-05, + "loss": 0.85788739, + "num_input_tokens_seen": 357146928, + "router_z_loss_mlp": 0.24035645, + "step": 4299, + "time_per_iteration": 2.4467363357543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106492, + "balance_loss_mlp": 1.04194832, + "epoch": 0.8272412466333205, + "flos": 1388430761472.0, + "grad_norm": 0.07161846854718491, + "language_loss": 0.85220098, + "learning_rate": 7.624765952673069e-05, + "loss": 0.86285019, + "num_input_tokens_seen": 357236768, + "router_z_loss_mlp": 0.22961426, + "step": 4300, + "time_per_iteration": 3.737535238265991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070983, + "balance_loss_mlp": 1.04735565, + "epoch": 0.827433628318584, + "flos": 538230749184.0, + "grad_norm": 0.057611680805270904, + "language_loss": 0.82816952, + "learning_rate": 7.608237890043335e-05, + "loss": 0.83887935, + "num_input_tokens_seen": 357307568, + "router_z_loss_mlp": 0.23608398, + "step": 4301, + "time_per_iteration": 2.67743182182312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064109, + "balance_loss_mlp": 1.03986228, + "epoch": 0.8276260100038476, + "flos": 730734981120.0, + "grad_norm": 0.06820448267452858, + "language_loss": 0.77710879, + "learning_rate": 7.59172628535526e-05, + "loss": 0.78774989, + "num_input_tokens_seen": 357387712, + "router_z_loss_mlp": 0.24230957, + "step": 4302, + "time_per_iteration": 2.923125743865967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071553, + "balance_loss_mlp": 1.04758024, + "epoch": 0.8278183916891112, + "flos": 871102273536.0, + "grad_norm": 0.050152131715166706, + "language_loss": 0.82855344, + "learning_rate": 7.575231145019196e-05, + "loss": 0.83926898, + "num_input_tokens_seen": 357473360, + "router_z_loss_mlp": 0.23962402, + "step": 4303, + "time_per_iteration": 3.1750757694244385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068661, + "balance_loss_mlp": 1.04548657, + "epoch": 0.8280107733743748, + "flos": 594543297024.0, + "grad_norm": 0.054286253624127895, + "language_loss": 0.78169298, + "learning_rate": 7.558752475439134e-05, + "loss": 0.79237956, + "num_input_tokens_seen": 357548432, + "router_z_loss_mlp": 0.23168945, + "step": 4304, + "time_per_iteration": 2.7362046241760254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066893, + "balance_loss_mlp": 1.04376626, + "epoch": 0.8282031550596384, + "flos": 768607667712.0, + "grad_norm": 0.057065663401577975, + "language_loss": 0.84445703, + "learning_rate": 7.542290283012653e-05, + "loss": 0.85512602, + "num_input_tokens_seen": 357625968, + "router_z_loss_mlp": 0.23120117, + "step": 4305, + "time_per_iteration": 3.0129735469818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064903, + "balance_loss_mlp": 1.04113245, + "epoch": 0.8283955367449019, + "flos": 696108805632.0, + "grad_norm": 0.05641540030356317, + "language_loss": 0.78055018, + "learning_rate": 7.525844574130947e-05, + "loss": 0.79119921, + "num_input_tokens_seen": 357705824, + "router_z_loss_mlp": 0.2376709, + "step": 4306, + "time_per_iteration": 2.9275615215301514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064861, + "balance_loss_mlp": 1.04099536, + "epoch": 0.8285879184301654, + "flos": 660630256128.0, + "grad_norm": 0.060113634614107715, + "language_loss": 0.83076233, + "learning_rate": 7.509415355178806e-05, + "loss": 0.84141093, + "num_input_tokens_seen": 357787040, + "router_z_loss_mlp": 0.23864746, + "step": 4307, + "time_per_iteration": 2.9101569652557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067435, + "balance_loss_mlp": 1.04316437, + "epoch": 0.828780300115429, + "flos": 558709042176.0, + "grad_norm": 0.06849376731379381, + "language_loss": 0.78221494, + "learning_rate": 7.493002632534618e-05, + "loss": 0.79288924, + "num_input_tokens_seen": 357856960, + "router_z_loss_mlp": 0.24255371, + "step": 4308, + "time_per_iteration": 2.653721570968628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066743, + "balance_loss_mlp": 1.04313982, + "epoch": 0.8289726818006926, + "flos": 830963930112.0, + "grad_norm": 0.06152111373530928, + "language_loss": 0.81996018, + "learning_rate": 7.476606412570352e-05, + "loss": 0.83062756, + "num_input_tokens_seen": 357937760, + "router_z_loss_mlp": 0.23596191, + "step": 4309, + "time_per_iteration": 3.0518198013305664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066895, + "balance_loss_mlp": 1.04339886, + "epoch": 0.8291650634859561, + "flos": 732289227264.0, + "grad_norm": 0.0622755107347832, + "language_loss": 0.80474293, + "learning_rate": 7.460226701651624e-05, + "loss": 0.81541193, + "num_input_tokens_seen": 358012480, + "router_z_loss_mlp": 0.23486328, + "step": 4310, + "time_per_iteration": 2.912027597427368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076228, + "balance_loss_mlp": 1.05124211, + "epoch": 0.8293574451712197, + "flos": 860910114816.0, + "grad_norm": 0.09220252331582286, + "language_loss": 0.81365955, + "learning_rate": 7.443863506137566e-05, + "loss": 0.82442182, + "num_input_tokens_seen": 358100720, + "router_z_loss_mlp": 0.24975586, + "step": 4311, + "time_per_iteration": 3.195178270339966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072175, + "balance_loss_mlp": 1.04859567, + "epoch": 0.8295498268564833, + "flos": 495156810240.0, + "grad_norm": 0.05370166855349878, + "language_loss": 0.82026577, + "learning_rate": 7.427516832380948e-05, + "loss": 0.83098745, + "num_input_tokens_seen": 358180496, + "router_z_loss_mlp": 0.2355957, + "step": 4312, + "time_per_iteration": 2.8094701766967773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071559, + "balance_loss_mlp": 1.04825366, + "epoch": 0.8297422085417469, + "flos": 554471391744.0, + "grad_norm": 0.056720068302814244, + "language_loss": 0.77900606, + "learning_rate": 7.4111866867281e-05, + "loss": 0.78972161, + "num_input_tokens_seen": 358261104, + "router_z_loss_mlp": 0.23266602, + "step": 4313, + "time_per_iteration": 2.8235886096954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070784, + "balance_loss_mlp": 1.04769325, + "epoch": 0.8299345902270104, + "flos": 1247497417728.0, + "grad_norm": 0.07777163044205856, + "language_loss": 0.77210194, + "learning_rate": 7.39487307551896e-05, + "loss": 0.78280979, + "num_input_tokens_seen": 358356368, + "router_z_loss_mlp": 0.23095703, + "step": 4314, + "time_per_iteration": 3.678664207458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073602, + "balance_loss_mlp": 1.04967725, + "epoch": 0.8301269719122739, + "flos": 585260411904.0, + "grad_norm": 0.058448053996127784, + "language_loss": 0.83041668, + "learning_rate": 7.378576005087034e-05, + "loss": 0.84115267, + "num_input_tokens_seen": 358429104, + "router_z_loss_mlp": 0.23925781, + "step": 4315, + "time_per_iteration": 2.7324447631835938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070451, + "balance_loss_mlp": 1.04641795, + "epoch": 0.8303193535975375, + "flos": 509732352000.0, + "grad_norm": 0.05225421465036167, + "language_loss": 0.85065174, + "learning_rate": 7.362295481759412e-05, + "loss": 0.86135626, + "num_input_tokens_seen": 358501344, + "router_z_loss_mlp": 0.24035645, + "step": 4316, + "time_per_iteration": 2.6747384071350098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069014, + "balance_loss_mlp": 1.04514837, + "epoch": 0.8305117352828011, + "flos": 580652375040.0, + "grad_norm": 0.06799760077646158, + "language_loss": 0.839338, + "learning_rate": 7.346031511856722e-05, + "loss": 0.85002816, + "num_input_tokens_seen": 358575584, + "router_z_loss_mlp": 0.23840332, + "step": 4317, + "time_per_iteration": 2.6837451457977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065832, + "balance_loss_mlp": 1.04169214, + "epoch": 0.8307041169680647, + "flos": 481626736128.0, + "grad_norm": 0.06243921298560797, + "language_loss": 0.79024559, + "learning_rate": 7.329784101693232e-05, + "loss": 0.80090392, + "num_input_tokens_seen": 358644304, + "router_z_loss_mlp": 0.24108887, + "step": 4318, + "time_per_iteration": 2.647980213165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069382, + "balance_loss_mlp": 1.04579091, + "epoch": 0.8308964986533282, + "flos": 624605852160.0, + "grad_norm": 0.06930026355697931, + "language_loss": 0.83169067, + "learning_rate": 7.313553257576727e-05, + "loss": 0.84238452, + "num_input_tokens_seen": 358712384, + "router_z_loss_mlp": 0.23596191, + "step": 4319, + "time_per_iteration": 2.7219338417053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066419, + "balance_loss_mlp": 1.04255295, + "epoch": 0.8310888803385917, + "flos": 827319495168.0, + "grad_norm": 0.062344981143790404, + "language_loss": 0.79345471, + "learning_rate": 7.297338985808589e-05, + "loss": 0.80411887, + "num_input_tokens_seen": 358789264, + "router_z_loss_mlp": 0.23864746, + "step": 4320, + "time_per_iteration": 3.0479512214660645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072012, + "balance_loss_mlp": 1.04771757, + "epoch": 0.8312812620238553, + "flos": 583743241728.0, + "grad_norm": 0.0524694898745761, + "language_loss": 0.82171959, + "learning_rate": 7.281141292683746e-05, + "loss": 0.83243972, + "num_input_tokens_seen": 358868976, + "router_z_loss_mlp": 0.24304199, + "step": 4321, + "time_per_iteration": 2.7896528244018555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064765, + "balance_loss_mlp": 1.04058981, + "epoch": 0.8314736437091189, + "flos": 1115605052928.0, + "grad_norm": 0.07871576581536682, + "language_loss": 0.74626267, + "learning_rate": 7.26496018449071e-05, + "loss": 0.75691032, + "num_input_tokens_seen": 358953600, + "router_z_loss_mlp": 0.24169922, + "step": 4322, + "time_per_iteration": 3.396692991256714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069483, + "balance_loss_mlp": 1.04443729, + "epoch": 0.8316660253943825, + "flos": 517547625984.0, + "grad_norm": 0.07516139933736624, + "language_loss": 0.8218689, + "learning_rate": 7.248795667511543e-05, + "loss": 0.83256376, + "num_input_tokens_seen": 359028768, + "router_z_loss_mlp": 0.25061035, + "step": 4323, + "time_per_iteration": 2.781719207763672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106835, + "balance_loss_mlp": 1.04479408, + "epoch": 0.831858407079646, + "flos": 795329736192.0, + "grad_norm": 0.060463118214837686, + "language_loss": 0.78359312, + "learning_rate": 7.232647748021864e-05, + "loss": 0.79427665, + "num_input_tokens_seen": 359116208, + "router_z_loss_mlp": 0.23547363, + "step": 4324, + "time_per_iteration": 2.991947650909424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064888, + "balance_loss_mlp": 1.04118955, + "epoch": 0.8320507887649096, + "flos": 549967242240.0, + "grad_norm": 0.06082533886363455, + "language_loss": 0.83160555, + "learning_rate": 7.216516432290843e-05, + "loss": 0.8422544, + "num_input_tokens_seen": 359189552, + "router_z_loss_mlp": 0.23693848, + "step": 4325, + "time_per_iteration": 2.6645991802215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066046, + "balance_loss_mlp": 1.04284835, + "epoch": 0.8322431704501732, + "flos": 479398155264.0, + "grad_norm": 0.060833888285275674, + "language_loss": 0.82192004, + "learning_rate": 7.20040172658123e-05, + "loss": 0.83258057, + "num_input_tokens_seen": 359253008, + "router_z_loss_mlp": 0.23181152, + "step": 4326, + "time_per_iteration": 2.5123751163482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064937, + "balance_loss_mlp": 1.04145336, + "epoch": 0.8324355521354367, + "flos": 572434407936.0, + "grad_norm": 0.06156259329877496, + "language_loss": 0.85313761, + "learning_rate": 7.184303637149308e-05, + "loss": 0.86378694, + "num_input_tokens_seen": 359326368, + "router_z_loss_mlp": 0.23486328, + "step": 4327, + "time_per_iteration": 2.6744894981384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069478, + "balance_loss_mlp": 1.04586303, + "epoch": 0.8326279338207002, + "flos": 503454071808.0, + "grad_norm": 0.05527860499037242, + "language_loss": 0.82345808, + "learning_rate": 7.168222170244888e-05, + "loss": 0.83415288, + "num_input_tokens_seen": 359394192, + "router_z_loss_mlp": 0.23596191, + "step": 4328, + "time_per_iteration": 2.5938544273376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064462, + "balance_loss_mlp": 1.04112041, + "epoch": 0.8328203155059638, + "flos": 605743474176.0, + "grad_norm": 0.054755319996718364, + "language_loss": 0.81181324, + "learning_rate": 7.152157332111364e-05, + "loss": 0.82245785, + "num_input_tokens_seen": 359476016, + "router_z_loss_mlp": 0.23339844, + "step": 4329, + "time_per_iteration": 2.899338960647583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066654, + "balance_loss_mlp": 1.04346812, + "epoch": 0.8330126971912274, + "flos": 697798872576.0, + "grad_norm": 0.05583188678549314, + "language_loss": 0.85963029, + "learning_rate": 7.136109128985663e-05, + "loss": 0.87029678, + "num_input_tokens_seen": 359554048, + "router_z_loss_mlp": 0.23168945, + "step": 4330, + "time_per_iteration": 2.9158973693847656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069345, + "balance_loss_mlp": 1.04619408, + "epoch": 0.833205078876491, + "flos": 494042706432.0, + "grad_norm": 0.058167926322513455, + "language_loss": 0.86570764, + "learning_rate": 7.120077567098249e-05, + "loss": 0.87640113, + "num_input_tokens_seen": 359621440, + "router_z_loss_mlp": 0.23168945, + "step": 4331, + "time_per_iteration": 2.552724838256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067021, + "balance_loss_mlp": 1.04359591, + "epoch": 0.8333974605617546, + "flos": 482812793856.0, + "grad_norm": 0.05466728002891162, + "language_loss": 0.82944643, + "learning_rate": 7.104062652673115e-05, + "loss": 0.84011662, + "num_input_tokens_seen": 359690320, + "router_z_loss_mlp": 0.23425293, + "step": 4332, + "time_per_iteration": 2.590282440185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060788, + "balance_loss_mlp": 1.03719616, + "epoch": 0.833589842247018, + "flos": 686821151232.0, + "grad_norm": 0.07782119871046231, + "language_loss": 0.83271265, + "learning_rate": 7.088064391927818e-05, + "loss": 0.84332061, + "num_input_tokens_seen": 359759888, + "router_z_loss_mlp": 0.23571777, + "step": 4333, + "time_per_iteration": 2.815814733505249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106529, + "balance_loss_mlp": 1.04137695, + "epoch": 0.8337822239322816, + "flos": 881739343872.0, + "grad_norm": 0.054030606180818834, + "language_loss": 0.82675385, + "learning_rate": 7.072082791073419e-05, + "loss": 0.83740675, + "num_input_tokens_seen": 359836544, + "router_z_loss_mlp": 0.23901367, + "step": 4334, + "time_per_iteration": 3.115084171295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068989, + "balance_loss_mlp": 1.04486084, + "epoch": 0.8339746056175452, + "flos": 497183132160.0, + "grad_norm": 0.059930468970718645, + "language_loss": 0.82674849, + "learning_rate": 7.056117856314531e-05, + "loss": 0.83743834, + "num_input_tokens_seen": 359903024, + "router_z_loss_mlp": 0.2409668, + "step": 4335, + "time_per_iteration": 2.587022066116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064499, + "balance_loss_mlp": 1.04075205, + "epoch": 0.8341669873028088, + "flos": 510495892992.0, + "grad_norm": 0.061364026256354405, + "language_loss": 0.86592519, + "learning_rate": 7.040169593849289e-05, + "loss": 0.87657017, + "num_input_tokens_seen": 359971200, + "router_z_loss_mlp": 0.23730469, + "step": 4336, + "time_per_iteration": 2.6579511165618896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065906, + "balance_loss_mlp": 1.04231429, + "epoch": 0.8343593689880723, + "flos": 692321209344.0, + "grad_norm": 0.058708202155102585, + "language_loss": 0.84223795, + "learning_rate": 7.024238009869366e-05, + "loss": 0.85289705, + "num_input_tokens_seen": 360042560, + "router_z_loss_mlp": 0.23583984, + "step": 4337, + "time_per_iteration": 2.7891974449157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064349, + "balance_loss_mlp": 1.04047179, + "epoch": 0.8345517506733359, + "flos": 552408367104.0, + "grad_norm": 0.06224088295195056, + "language_loss": 0.7808466, + "learning_rate": 7.008323110559956e-05, + "loss": 0.79149008, + "num_input_tokens_seen": 360118048, + "router_z_loss_mlp": 0.23864746, + "step": 4338, + "time_per_iteration": 2.7306392192840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065096, + "balance_loss_mlp": 1.04151607, + "epoch": 0.8347441323585995, + "flos": 592052613120.0, + "grad_norm": 0.06265232087954678, + "language_loss": 0.76197761, + "learning_rate": 6.992424902099754e-05, + "loss": 0.77262855, + "num_input_tokens_seen": 360192528, + "router_z_loss_mlp": 0.23608398, + "step": 4339, + "time_per_iteration": 2.866647243499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066981, + "balance_loss_mlp": 1.04350924, + "epoch": 0.834936514043863, + "flos": 614917702656.0, + "grad_norm": 0.05669391690274369, + "language_loss": 0.84682417, + "learning_rate": 6.976543390660983e-05, + "loss": 0.85749394, + "num_input_tokens_seen": 360266880, + "router_z_loss_mlp": 0.234375, + "step": 4340, + "time_per_iteration": 2.7687973976135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059648, + "balance_loss_mlp": 1.03600907, + "epoch": 0.8351288957291266, + "flos": 467844470784.0, + "grad_norm": 0.057030478490448615, + "language_loss": 0.79855502, + "learning_rate": 6.960678582409424e-05, + "loss": 0.80915147, + "num_input_tokens_seen": 360336336, + "router_z_loss_mlp": 0.23608398, + "step": 4341, + "time_per_iteration": 2.600409984588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064852, + "balance_loss_mlp": 1.04139209, + "epoch": 0.8353212774143901, + "flos": 509319747072.0, + "grad_norm": 0.05124905327439443, + "language_loss": 0.79103041, + "learning_rate": 6.944830483504328e-05, + "loss": 0.8016789, + "num_input_tokens_seen": 360409776, + "router_z_loss_mlp": 0.23474121, + "step": 4342, + "time_per_iteration": 2.6188149452209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059803, + "balance_loss_mlp": 1.03578269, + "epoch": 0.8355136590996537, + "flos": 687784753152.0, + "grad_norm": 0.0559253848941325, + "language_loss": 0.80927384, + "learning_rate": 6.928999100098483e-05, + "loss": 0.8198719, + "num_input_tokens_seen": 360486800, + "router_z_loss_mlp": 0.2401123, + "step": 4343, + "time_per_iteration": 2.90815806388855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061481, + "balance_loss_mlp": 1.0387243, + "epoch": 0.8357060407849173, + "flos": 984409417728.0, + "grad_norm": 0.07183527883163628, + "language_loss": 0.8404789, + "learning_rate": 6.913184438338138e-05, + "loss": 0.85109377, + "num_input_tokens_seen": 360568624, + "router_z_loss_mlp": 0.22753906, + "step": 4344, + "time_per_iteration": 3.2577321529388428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063929, + "balance_loss_mlp": 1.0402298, + "epoch": 0.8358984224701809, + "flos": 843026393088.0, + "grad_norm": 0.05241203479209445, + "language_loss": 0.85357249, + "learning_rate": 6.89738650436313e-05, + "loss": 0.86421174, + "num_input_tokens_seen": 360652384, + "router_z_loss_mlp": 0.23669434, + "step": 4345, + "time_per_iteration": 3.163863182067871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061855, + "balance_loss_mlp": 1.03823972, + "epoch": 0.8360908041554445, + "flos": 626239019520.0, + "grad_norm": 0.06325665273743743, + "language_loss": 0.82319045, + "learning_rate": 6.881605304306748e-05, + "loss": 0.83380902, + "num_input_tokens_seen": 360723200, + "router_z_loss_mlp": 0.23620605, + "step": 4346, + "time_per_iteration": 2.764380693435669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061388, + "balance_loss_mlp": 1.03774893, + "epoch": 0.8362831858407079, + "flos": 576068931072.0, + "grad_norm": 0.05825131988616556, + "language_loss": 0.84927475, + "learning_rate": 6.865840844295796e-05, + "loss": 0.85988867, + "num_input_tokens_seen": 360798240, + "router_z_loss_mlp": 0.2364502, + "step": 4347, + "time_per_iteration": 2.7342264652252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061671, + "balance_loss_mlp": 1.03713763, + "epoch": 0.8364755675259715, + "flos": 833783155200.0, + "grad_norm": 0.0627395482486057, + "language_loss": 0.80987006, + "learning_rate": 6.850093130450569e-05, + "loss": 0.82048672, + "num_input_tokens_seen": 360873552, + "router_z_loss_mlp": 0.24523926, + "step": 4348, + "time_per_iteration": 3.092926263809204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063399, + "balance_loss_mlp": 1.04023659, + "epoch": 0.8366679492112351, + "flos": 582480834048.0, + "grad_norm": 0.07064717826743147, + "language_loss": 0.8640992, + "learning_rate": 6.834362168884912e-05, + "loss": 0.87473315, + "num_input_tokens_seen": 360940800, + "router_z_loss_mlp": 0.23168945, + "step": 4349, + "time_per_iteration": 2.7598204612731934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065652, + "balance_loss_mlp": 1.04170275, + "epoch": 0.8368603308964987, + "flos": 611722948608.0, + "grad_norm": 0.07054756370503083, + "language_loss": 0.87762499, + "learning_rate": 6.818647965706076e-05, + "loss": 0.88828146, + "num_input_tokens_seen": 361014368, + "router_z_loss_mlp": 0.23950195, + "step": 4350, + "time_per_iteration": 2.7956182956695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106363, + "balance_loss_mlp": 1.04061055, + "epoch": 0.8370527125817622, + "flos": 507264062976.0, + "grad_norm": 0.05272263799194958, + "language_loss": 0.85641217, + "learning_rate": 6.802950527014884e-05, + "loss": 0.8670485, + "num_input_tokens_seen": 361087184, + "router_z_loss_mlp": 0.22998047, + "step": 4351, + "time_per_iteration": 2.759132146835327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057966, + "balance_loss_mlp": 1.03429115, + "epoch": 0.8372450942670258, + "flos": 770952619008.0, + "grad_norm": 0.06709118366747789, + "language_loss": 0.82653809, + "learning_rate": 6.787269858905603e-05, + "loss": 0.83711779, + "num_input_tokens_seen": 361160720, + "router_z_loss_mlp": 0.23657227, + "step": 4352, + "time_per_iteration": 2.9604341983795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069422, + "balance_loss_mlp": 1.04628313, + "epoch": 0.8374374759522893, + "flos": 579276168192.0, + "grad_norm": 0.062462890278505795, + "language_loss": 0.84993398, + "learning_rate": 6.771605967466033e-05, + "loss": 0.86062813, + "num_input_tokens_seen": 361234432, + "router_z_loss_mlp": 0.23144531, + "step": 4353, + "time_per_iteration": 2.7336621284484863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060703, + "balance_loss_mlp": 1.03720701, + "epoch": 0.8376298576375529, + "flos": 788129699328.0, + "grad_norm": 0.06847497017559315, + "language_loss": 0.82433045, + "learning_rate": 6.755958858777434e-05, + "loss": 0.83493751, + "num_input_tokens_seen": 361309376, + "router_z_loss_mlp": 0.23498535, + "step": 4354, + "time_per_iteration": 3.0050246715545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062824, + "balance_loss_mlp": 1.03801692, + "epoch": 0.8378222393228165, + "flos": 577613265408.0, + "grad_norm": 0.058961273709874806, + "language_loss": 0.80724108, + "learning_rate": 6.74032853891452e-05, + "loss": 0.81786942, + "num_input_tokens_seen": 361386768, + "router_z_loss_mlp": 0.24804688, + "step": 4355, + "time_per_iteration": 2.796504497528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063872, + "balance_loss_mlp": 1.03998232, + "epoch": 0.83801462100808, + "flos": 480865766400.0, + "grad_norm": 0.06959860060167573, + "language_loss": 0.82262826, + "learning_rate": 6.724715013945548e-05, + "loss": 0.83326697, + "num_input_tokens_seen": 361456704, + "router_z_loss_mlp": 0.23864746, + "step": 4356, + "time_per_iteration": 2.6413910388946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063736, + "balance_loss_mlp": 1.04075241, + "epoch": 0.8382070026933436, + "flos": 550817044992.0, + "grad_norm": 0.05613301024813433, + "language_loss": 0.89176285, + "learning_rate": 6.709118289932226e-05, + "loss": 0.9024002, + "num_input_tokens_seen": 361533648, + "router_z_loss_mlp": 0.2298584, + "step": 4357, + "time_per_iteration": 2.7601664066314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065704, + "balance_loss_mlp": 1.04187393, + "epoch": 0.8383993843786072, + "flos": 624968898048.0, + "grad_norm": 0.06393233563848542, + "language_loss": 0.82275569, + "learning_rate": 6.693538372929725e-05, + "loss": 0.83341277, + "num_input_tokens_seen": 361614256, + "router_z_loss_mlp": 0.23815918, + "step": 4358, + "time_per_iteration": 2.8808021545410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067548, + "balance_loss_mlp": 1.04333663, + "epoch": 0.8385917660638708, + "flos": 491169153024.0, + "grad_norm": 0.06064509036579976, + "language_loss": 0.8677128, + "learning_rate": 6.677975268986719e-05, + "loss": 0.87838829, + "num_input_tokens_seen": 361679008, + "router_z_loss_mlp": 0.24206543, + "step": 4359, + "time_per_iteration": 2.5335707664489746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066755, + "balance_loss_mlp": 1.04286551, + "epoch": 0.8387841477491342, + "flos": 466900692480.0, + "grad_norm": 0.06112190056703723, + "language_loss": 0.86871219, + "learning_rate": 6.662428984145336e-05, + "loss": 0.87937975, + "num_input_tokens_seen": 361747600, + "router_z_loss_mlp": 0.23913574, + "step": 4360, + "time_per_iteration": 2.5734517574310303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01002992, + "balance_loss_mlp": 0.99631584, + "epoch": 0.8389765294343978, + "flos": 1564188475392.0, + "grad_norm": 0.006766850503245408, + "language_loss": 0.71780187, + "learning_rate": 6.646899524441175e-05, + "loss": 0.72783178, + "num_input_tokens_seen": 361983104, + "router_z_loss_mlp": 0.06689453, + "step": 4361, + "time_per_iteration": 5.083199977874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068212, + "balance_loss_mlp": 1.04532373, + "epoch": 0.8391689111196614, + "flos": 602160708096.0, + "grad_norm": 0.050669789123315594, + "language_loss": 0.83081377, + "learning_rate": 6.631386895903308e-05, + "loss": 0.84149587, + "num_input_tokens_seen": 362065824, + "router_z_loss_mlp": 0.22888184, + "step": 4362, + "time_per_iteration": 2.8357574939727783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066938, + "balance_loss_mlp": 1.04351294, + "epoch": 0.839361292804925, + "flos": 443047408128.0, + "grad_norm": 0.06770273603538836, + "language_loss": 0.80559838, + "learning_rate": 6.615891104554261e-05, + "loss": 0.81626773, + "num_input_tokens_seen": 362128240, + "router_z_loss_mlp": 0.23400879, + "step": 4363, + "time_per_iteration": 2.536062479019165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065386, + "balance_loss_mlp": 1.04129338, + "epoch": 0.8395536744901886, + "flos": 594167768064.0, + "grad_norm": 0.0638325900212512, + "language_loss": 0.83115453, + "learning_rate": 6.600412156410057e-05, + "loss": 0.84180838, + "num_input_tokens_seen": 362198256, + "router_z_loss_mlp": 0.24060059, + "step": 4364, + "time_per_iteration": 2.71862530708313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063359, + "balance_loss_mlp": 1.03946948, + "epoch": 0.8397460561754521, + "flos": 889836171264.0, + "grad_norm": 0.05544062619942032, + "language_loss": 0.85130864, + "learning_rate": 6.58495005748016e-05, + "loss": 0.86194223, + "num_input_tokens_seen": 362279792, + "router_z_loss_mlp": 0.23864746, + "step": 4365, + "time_per_iteration": 3.1578779220581055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068008, + "balance_loss_mlp": 1.04511952, + "epoch": 0.8399384378607156, + "flos": 553503020544.0, + "grad_norm": 0.05406052024104549, + "language_loss": 0.89236188, + "learning_rate": 6.569504813767463e-05, + "loss": 0.90304196, + "num_input_tokens_seen": 362351712, + "router_z_loss_mlp": 0.22900391, + "step": 4366, + "time_per_iteration": 2.6701974868774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064854, + "balance_loss_mlp": 1.0414772, + "epoch": 0.8401308195459792, + "flos": 518923832832.0, + "grad_norm": 0.048370646551648334, + "language_loss": 0.83713633, + "learning_rate": 6.554076431268341e-05, + "loss": 0.84778494, + "num_input_tokens_seen": 362423424, + "router_z_loss_mlp": 0.23364258, + "step": 4367, + "time_per_iteration": 2.6388163566589355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069308, + "balance_loss_mlp": 1.04606247, + "epoch": 0.8403232012312428, + "flos": 684933221376.0, + "grad_norm": 0.07156350657988877, + "language_loss": 0.80791634, + "learning_rate": 6.538664915972648e-05, + "loss": 0.81860942, + "num_input_tokens_seen": 362514704, + "router_z_loss_mlp": 0.23242188, + "step": 4368, + "time_per_iteration": 3.0422544479370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066173, + "balance_loss_mlp": 1.04270053, + "epoch": 0.8405155829165063, + "flos": 577672736256.0, + "grad_norm": 0.09577954267651939, + "language_loss": 0.77953506, + "learning_rate": 6.523270273863652e-05, + "loss": 0.79019678, + "num_input_tokens_seen": 362581296, + "router_z_loss_mlp": 0.23449707, + "step": 4369, + "time_per_iteration": 2.7166330814361572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066047, + "balance_loss_mlp": 1.04277718, + "epoch": 0.8407079646017699, + "flos": 456627041280.0, + "grad_norm": 0.06624048319732295, + "language_loss": 0.88075888, + "learning_rate": 6.507892510918079e-05, + "loss": 0.89141929, + "num_input_tokens_seen": 362648304, + "router_z_loss_mlp": 0.23278809, + "step": 4370, + "time_per_iteration": 2.5422723293304443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070974, + "balance_loss_mlp": 1.04669082, + "epoch": 0.8409003462870335, + "flos": 534917426688.0, + "grad_norm": 0.07352168072762383, + "language_loss": 0.81707442, + "learning_rate": 6.492531633106114e-05, + "loss": 0.82778412, + "num_input_tokens_seen": 362721264, + "router_z_loss_mlp": 0.24291992, + "step": 4371, + "time_per_iteration": 2.7493014335632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067886, + "balance_loss_mlp": 1.04424715, + "epoch": 0.8410927279722971, + "flos": 556759443456.0, + "grad_norm": 0.07206783249977043, + "language_loss": 0.77901572, + "learning_rate": 6.477187646391374e-05, + "loss": 0.78969461, + "num_input_tokens_seen": 362795312, + "router_z_loss_mlp": 0.2364502, + "step": 4372, + "time_per_iteration": 2.724076986312866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009208, + "balance_loss_mlp": 1.00253248, + "epoch": 0.8412851096575606, + "flos": 1549754270208.0, + "grad_norm": 0.007754945157954517, + "language_loss": 0.77679121, + "learning_rate": 6.461860556730925e-05, + "loss": 0.78688329, + "num_input_tokens_seen": 363026272, + "router_z_loss_mlp": 0.06689453, + "step": 4373, + "time_per_iteration": 4.920220851898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069157, + "balance_loss_mlp": 1.04606605, + "epoch": 0.8414774913428241, + "flos": 552042749952.0, + "grad_norm": 0.05517644936985899, + "language_loss": 0.79005659, + "learning_rate": 6.446550370075271e-05, + "loss": 0.80074823, + "num_input_tokens_seen": 363098384, + "router_z_loss_mlp": 0.2310791, + "step": 4374, + "time_per_iteration": 2.721017599105835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070602, + "balance_loss_mlp": 1.04738045, + "epoch": 0.8416698730280877, + "flos": 573015140352.0, + "grad_norm": 0.06478850683255671, + "language_loss": 0.77759355, + "learning_rate": 6.431257092368336e-05, + "loss": 0.78829956, + "num_input_tokens_seen": 363170960, + "router_z_loss_mlp": 0.2322998, + "step": 4375, + "time_per_iteration": 2.6687657833099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069077, + "balance_loss_mlp": 1.04429305, + "epoch": 0.8418622547133513, + "flos": 758731940352.0, + "grad_norm": 0.06415120451411167, + "language_loss": 0.8027761, + "learning_rate": 6.415980729547543e-05, + "loss": 0.81346691, + "num_input_tokens_seen": 363242000, + "router_z_loss_mlp": 0.24768066, + "step": 4376, + "time_per_iteration": 2.8901031017303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066046, + "balance_loss_mlp": 1.04222751, + "epoch": 0.8420546363986149, + "flos": 1074156940800.0, + "grad_norm": 0.0640476481214236, + "language_loss": 0.72992682, + "learning_rate": 6.40072128754366e-05, + "loss": 0.74058723, + "num_input_tokens_seen": 363340288, + "router_z_loss_mlp": 0.23803711, + "step": 4377, + "time_per_iteration": 3.387580156326294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066874, + "balance_loss_mlp": 1.04272175, + "epoch": 0.8422470180838784, + "flos": 525908754432.0, + "grad_norm": 0.05787479724748053, + "language_loss": 0.82898366, + "learning_rate": 6.385478772280933e-05, + "loss": 0.8396523, + "num_input_tokens_seen": 363416208, + "router_z_loss_mlp": 0.24133301, + "step": 4378, + "time_per_iteration": 2.709947109222412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067691, + "balance_loss_mlp": 1.04393244, + "epoch": 0.842439399769142, + "flos": 600834060288.0, + "grad_norm": 0.0566504887729659, + "language_loss": 0.82355988, + "learning_rate": 6.370253189677038e-05, + "loss": 0.83423686, + "num_input_tokens_seen": 363492864, + "router_z_loss_mlp": 0.23754883, + "step": 4379, + "time_per_iteration": 2.740873098373413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066145, + "balance_loss_mlp": 1.04244626, + "epoch": 0.8426317814544055, + "flos": 552222987264.0, + "grad_norm": 0.07635289805941703, + "language_loss": 0.86640501, + "learning_rate": 6.355044545643073e-05, + "loss": 0.87706643, + "num_input_tokens_seen": 363572000, + "router_z_loss_mlp": 0.23669434, + "step": 4380, + "time_per_iteration": 2.8072140216827393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067646, + "balance_loss_mlp": 1.04449534, + "epoch": 0.8428241631396691, + "flos": 678832980480.0, + "grad_norm": 0.05882632867978376, + "language_loss": 0.78088343, + "learning_rate": 6.33985284608356e-05, + "loss": 0.79155988, + "num_input_tokens_seen": 363646480, + "router_z_loss_mlp": 0.23120117, + "step": 4381, + "time_per_iteration": 2.8028671741485596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068573, + "balance_loss_mlp": 1.04549408, + "epoch": 0.8430165448249327, + "flos": 753730748928.0, + "grad_norm": 0.0434369089484086, + "language_loss": 0.79954326, + "learning_rate": 6.324678096896435e-05, + "loss": 0.81022894, + "num_input_tokens_seen": 363737552, + "router_z_loss_mlp": 0.23046875, + "step": 4382, + "time_per_iteration": 3.0995900630950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070257, + "balance_loss_mlp": 1.0466063, + "epoch": 0.8432089265101962, + "flos": 699140574720.0, + "grad_norm": 0.05667250688060797, + "language_loss": 0.81071454, + "learning_rate": 6.30952030397306e-05, + "loss": 0.82141709, + "num_input_tokens_seen": 363816016, + "router_z_loss_mlp": 0.23657227, + "step": 4383, + "time_per_iteration": 2.8948311805725098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062746, + "balance_loss_mlp": 1.0390358, + "epoch": 0.8434013081954598, + "flos": 485767839744.0, + "grad_norm": 0.05498684151789612, + "language_loss": 0.84662998, + "learning_rate": 6.294379473198208e-05, + "loss": 0.85725743, + "num_input_tokens_seen": 363888192, + "router_z_loss_mlp": 0.23706055, + "step": 4384, + "time_per_iteration": 2.6783289909362793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069983, + "balance_loss_mlp": 1.04623687, + "epoch": 0.8435936898807234, + "flos": 520623811584.0, + "grad_norm": 0.060123898153637105, + "language_loss": 0.85518533, + "learning_rate": 6.279255610450068e-05, + "loss": 0.86588514, + "num_input_tokens_seen": 363953904, + "router_z_loss_mlp": 0.23718262, + "step": 4385, + "time_per_iteration": 2.6172537803649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070212, + "balance_loss_mlp": 1.04614377, + "epoch": 0.843786071565987, + "flos": 785945534976.0, + "grad_norm": 0.07135730466560408, + "language_loss": 0.80699778, + "learning_rate": 6.264148721600254e-05, + "loss": 0.81769991, + "num_input_tokens_seen": 364031552, + "router_z_loss_mlp": 0.24060059, + "step": 4386, + "time_per_iteration": 2.981780529022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011968, + "balance_loss_mlp": 1.00543571, + "epoch": 0.8439784532512504, + "flos": 1446278436864.0, + "grad_norm": 0.00922581639131475, + "language_loss": 0.75836509, + "learning_rate": 6.24905881251378e-05, + "loss": 0.76848477, + "num_input_tokens_seen": 364256480, + "router_z_loss_mlp": 0.06542969, + "step": 4387, + "time_per_iteration": 4.87928318977356 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067407, + "balance_loss_mlp": 1.04326701, + "epoch": 0.844170834936514, + "flos": 708700243968.0, + "grad_norm": 0.061285809051873204, + "language_loss": 0.82608801, + "learning_rate": 6.23398588904906e-05, + "loss": 0.83676207, + "num_input_tokens_seen": 364329696, + "router_z_loss_mlp": 0.24133301, + "step": 4388, + "time_per_iteration": 2.879666566848755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067269, + "balance_loss_mlp": 1.0429976, + "epoch": 0.8443632166217776, + "flos": 483428030976.0, + "grad_norm": 0.06596849430486998, + "language_loss": 0.79634511, + "learning_rate": 6.218929957057922e-05, + "loss": 0.8070178, + "num_input_tokens_seen": 364400944, + "router_z_loss_mlp": 0.24255371, + "step": 4389, + "time_per_iteration": 2.6863036155700684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070398, + "balance_loss_mlp": 1.04712808, + "epoch": 0.8445555983070412, + "flos": 678694588416.0, + "grad_norm": 0.06649132591646567, + "language_loss": 0.80325556, + "learning_rate": 6.2038910223856e-05, + "loss": 0.81395948, + "num_input_tokens_seen": 364475744, + "router_z_loss_mlp": 0.23266602, + "step": 4390, + "time_per_iteration": 2.848747968673706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068419, + "balance_loss_mlp": 1.04500639, + "epoch": 0.8447479799923048, + "flos": 741485477376.0, + "grad_norm": 0.05601705691022872, + "language_loss": 0.74468088, + "learning_rate": 6.18886909087073e-05, + "loss": 0.75536507, + "num_input_tokens_seen": 364557248, + "router_z_loss_mlp": 0.23388672, + "step": 4391, + "time_per_iteration": 2.977398633956909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064584, + "balance_loss_mlp": 1.04123068, + "epoch": 0.8449403616775683, + "flos": 953306537472.0, + "grad_norm": 0.05451644109037178, + "language_loss": 0.8061679, + "learning_rate": 6.173864168345344e-05, + "loss": 0.81681371, + "num_input_tokens_seen": 364647856, + "router_z_loss_mlp": 0.23339844, + "step": 4392, + "time_per_iteration": 3.253659725189209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068446, + "balance_loss_mlp": 1.04412758, + "epoch": 0.8451327433628318, + "flos": 657363921408.0, + "grad_norm": 0.10297975661606834, + "language_loss": 0.72206116, + "learning_rate": 6.158876260634871e-05, + "loss": 0.73274559, + "num_input_tokens_seen": 364728848, + "router_z_loss_mlp": 0.24328613, + "step": 4393, + "time_per_iteration": 2.8959717750549316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106765, + "balance_loss_mlp": 1.04457116, + "epoch": 0.8453251250480954, + "flos": 446113681920.0, + "grad_norm": 0.07045905469561214, + "language_loss": 0.83664286, + "learning_rate": 6.143905373558112e-05, + "loss": 0.84731936, + "num_input_tokens_seen": 364794032, + "router_z_loss_mlp": 0.23083496, + "step": 4394, + "time_per_iteration": 2.5246009826660156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068234, + "balance_loss_mlp": 1.04410589, + "epoch": 0.845517506733359, + "flos": 542767205376.0, + "grad_norm": 0.07052005068233351, + "language_loss": 0.71200818, + "learning_rate": 6.128951512927305e-05, + "loss": 0.72269052, + "num_input_tokens_seen": 364868624, + "router_z_loss_mlp": 0.24108887, + "step": 4395, + "time_per_iteration": 2.672424077987671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066143, + "balance_loss_mlp": 1.04233706, + "epoch": 0.8457098884186226, + "flos": 502440910848.0, + "grad_norm": 0.05507620442760256, + "language_loss": 0.84375072, + "learning_rate": 6.114014684548046e-05, + "loss": 0.85441208, + "num_input_tokens_seen": 364938208, + "router_z_loss_mlp": 0.23791504, + "step": 4396, + "time_per_iteration": 2.6126549243927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062805, + "balance_loss_mlp": 1.04011917, + "epoch": 0.8459022701038861, + "flos": 448893259776.0, + "grad_norm": 0.0721067921466401, + "language_loss": 0.79833293, + "learning_rate": 6.099094894219326e-05, + "loss": 0.80896091, + "num_input_tokens_seen": 365009440, + "router_z_loss_mlp": 0.22692871, + "step": 4397, + "time_per_iteration": 2.6915853023529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066096, + "balance_loss_mlp": 1.04310095, + "epoch": 0.8460946517891497, + "flos": 743178115584.0, + "grad_norm": 0.06092470430813476, + "language_loss": 0.75112307, + "learning_rate": 6.0841921477335194e-05, + "loss": 0.76178408, + "num_input_tokens_seen": 365085904, + "router_z_loss_mlp": 0.23010254, + "step": 4398, + "time_per_iteration": 2.9150443077087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066373, + "balance_loss_mlp": 1.0431869, + "epoch": 0.8462870334744133, + "flos": 553216324608.0, + "grad_norm": 0.05338597842317466, + "language_loss": 0.8020795, + "learning_rate": 6.069306450876389e-05, + "loss": 0.81274319, + "num_input_tokens_seen": 365163600, + "router_z_loss_mlp": 0.23193359, + "step": 4399, + "time_per_iteration": 2.750760316848755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007946, + "balance_loss_mlp": 1.00146127, + "epoch": 0.8464794151596768, + "flos": 1564877864448.0, + "grad_norm": 0.006476140912531384, + "language_loss": 0.81708568, + "learning_rate": 6.054437809427071e-05, + "loss": 0.82716513, + "num_input_tokens_seen": 365384528, + "router_z_loss_mlp": 0.06494141, + "step": 4400, + "time_per_iteration": 4.870280742645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064721, + "balance_loss_mlp": 1.04194021, + "epoch": 0.8466717968449403, + "flos": 550197038592.0, + "grad_norm": 0.05700520466186635, + "language_loss": 0.8022759, + "learning_rate": 6.039586229158084e-05, + "loss": 0.81292307, + "num_input_tokens_seen": 365453760, + "router_z_loss_mlp": 0.2277832, + "step": 4401, + "time_per_iteration": 2.6463844776153564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069345, + "balance_loss_mlp": 1.04520464, + "epoch": 0.8468641785302039, + "flos": 551919038976.0, + "grad_norm": 0.07274796217082238, + "language_loss": 0.85117292, + "learning_rate": 6.024751715835314e-05, + "loss": 0.86186635, + "num_input_tokens_seen": 365532416, + "router_z_loss_mlp": 0.24145508, + "step": 4402, + "time_per_iteration": 2.7529311180114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066722, + "balance_loss_mlp": 1.04289234, + "epoch": 0.8470565602154675, + "flos": 572671544832.0, + "grad_norm": 0.056307872604652406, + "language_loss": 0.87157613, + "learning_rate": 6.009934275218049e-05, + "loss": 0.88224334, + "num_input_tokens_seen": 365603776, + "router_z_loss_mlp": 0.23803711, + "step": 4403, + "time_per_iteration": 2.751302480697632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066248, + "balance_loss_mlp": 1.04265642, + "epoch": 0.8472489419007311, + "flos": 472833179136.0, + "grad_norm": 0.06396477003256777, + "language_loss": 0.83970821, + "learning_rate": 5.995133913058936e-05, + "loss": 0.85037065, + "num_input_tokens_seen": 365670432, + "router_z_loss_mlp": 0.23571777, + "step": 4404, + "time_per_iteration": 2.5374879837036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065348, + "balance_loss_mlp": 1.04250705, + "epoch": 0.8474413235859947, + "flos": 798020481024.0, + "grad_norm": 0.05899694759203786, + "language_loss": 0.80021083, + "learning_rate": 5.980350635103954e-05, + "loss": 0.81086433, + "num_input_tokens_seen": 365741584, + "router_z_loss_mlp": 0.22839355, + "step": 4405, + "time_per_iteration": 2.9600727558135986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106951, + "balance_loss_mlp": 1.04639554, + "epoch": 0.8476337052712581, + "flos": 502379241984.0, + "grad_norm": 0.06477364721529628, + "language_loss": 0.80659878, + "learning_rate": 5.9655844470924866e-05, + "loss": 0.81729388, + "num_input_tokens_seen": 365805344, + "router_z_loss_mlp": 0.23095703, + "step": 4406, + "time_per_iteration": 2.5580904483795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065117, + "balance_loss_mlp": 1.04226446, + "epoch": 0.8478260869565217, + "flos": 931971101184.0, + "grad_norm": 0.045408059542421414, + "language_loss": 0.83343709, + "learning_rate": 5.9508353547573e-05, + "loss": 0.84408826, + "num_input_tokens_seen": 365890976, + "router_z_loss_mlp": 0.22839355, + "step": 4407, + "time_per_iteration": 3.1972086429595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066424, + "balance_loss_mlp": 1.04317832, + "epoch": 0.8480184686417853, + "flos": 708811471872.0, + "grad_norm": 0.06839169803815855, + "language_loss": 0.80941093, + "learning_rate": 5.9361033638244855e-05, + "loss": 0.82007527, + "num_input_tokens_seen": 365968912, + "router_z_loss_mlp": 0.23242188, + "step": 4408, + "time_per_iteration": 2.857830762863159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063267, + "balance_loss_mlp": 1.04056954, + "epoch": 0.8482108503270489, + "flos": 614440857600.0, + "grad_norm": 0.048590366986801296, + "language_loss": 0.82484585, + "learning_rate": 5.9213884800135066e-05, + "loss": 0.83547854, + "num_input_tokens_seen": 366047680, + "router_z_loss_mlp": 0.22680664, + "step": 4409, + "time_per_iteration": 2.814972162246704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065801, + "balance_loss_mlp": 1.04220927, + "epoch": 0.8484032320123124, + "flos": 531016031232.0, + "grad_norm": 0.07117279722087601, + "language_loss": 0.82206416, + "learning_rate": 5.906690709037194e-05, + "loss": 0.83272225, + "num_input_tokens_seen": 366118720, + "router_z_loss_mlp": 0.23583984, + "step": 4410, + "time_per_iteration": 2.67344069480896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100989, + "balance_loss_mlp": 1.00340486, + "epoch": 0.848595613697576, + "flos": 1542776315904.0, + "grad_norm": 0.006407685796369742, + "language_loss": 0.76296914, + "learning_rate": 5.892010056601726e-05, + "loss": 0.77306801, + "num_input_tokens_seen": 366346928, + "router_z_loss_mlp": 0.06494141, + "step": 4411, + "time_per_iteration": 4.883302927017212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067859, + "balance_loss_mlp": 1.0438621, + "epoch": 0.8487879953828396, + "flos": 677342974464.0, + "grad_norm": 0.05787110114111616, + "language_loss": 0.74100566, + "learning_rate": 5.877346528406635e-05, + "loss": 0.75168431, + "num_input_tokens_seen": 366422848, + "router_z_loss_mlp": 0.23986816, + "step": 4412, + "time_per_iteration": 2.839329481124878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066525, + "balance_loss_mlp": 1.04281461, + "epoch": 0.8489803770681031, + "flos": 503673956352.0, + "grad_norm": 0.0786964917797223, + "language_loss": 0.79841238, + "learning_rate": 5.8627001301448105e-05, + "loss": 0.80907762, + "num_input_tokens_seen": 366492016, + "router_z_loss_mlp": 0.23693848, + "step": 4413, + "time_per_iteration": 2.590726137161255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106631, + "balance_loss_mlp": 1.04268241, + "epoch": 0.8491727587533667, + "flos": 563186027520.0, + "grad_norm": 0.06106365604230901, + "language_loss": 0.77250433, + "learning_rate": 5.84807086750247e-05, + "loss": 0.78316742, + "num_input_tokens_seen": 366566400, + "router_z_loss_mlp": 0.23620605, + "step": 4414, + "time_per_iteration": 2.7339437007904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062739, + "balance_loss_mlp": 1.03908801, + "epoch": 0.8493651404386302, + "flos": 459784719360.0, + "grad_norm": 0.061516297138754804, + "language_loss": 0.78073561, + "learning_rate": 5.833458746159243e-05, + "loss": 0.791363, + "num_input_tokens_seen": 366634016, + "router_z_loss_mlp": 0.2364502, + "step": 4415, + "time_per_iteration": 2.587347984313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066023, + "balance_loss_mlp": 1.04259825, + "epoch": 0.8495575221238938, + "flos": 461170838016.0, + "grad_norm": 0.06256364296979572, + "language_loss": 0.8204776, + "learning_rate": 5.818863771788013e-05, + "loss": 0.8311379, + "num_input_tokens_seen": 366704384, + "router_z_loss_mlp": 0.23413086, + "step": 4416, + "time_per_iteration": 2.6552000045776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065844, + "balance_loss_mlp": 1.04268169, + "epoch": 0.8497499038091574, + "flos": 870712063488.0, + "grad_norm": 0.05735024034400219, + "language_loss": 0.81449145, + "learning_rate": 5.8042859500550604e-05, + "loss": 0.82514989, + "num_input_tokens_seen": 366785456, + "router_z_loss_mlp": 0.23156738, + "step": 4417, + "time_per_iteration": 3.103299617767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106067, + "balance_loss_mlp": 1.03838944, + "epoch": 0.849942285494421, + "flos": 779600443392.0, + "grad_norm": 0.06271004199707587, + "language_loss": 0.78202587, + "learning_rate": 5.789725286620018e-05, + "loss": 0.79263258, + "num_input_tokens_seen": 366862848, + "router_z_loss_mlp": 0.22290039, + "step": 4418, + "time_per_iteration": 2.988527536392212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065221, + "balance_loss_mlp": 1.04155779, + "epoch": 0.8501346671796844, + "flos": 513816556032.0, + "grad_norm": 0.06478557665826518, + "language_loss": 0.84965581, + "learning_rate": 5.775181787135819e-05, + "loss": 0.86030805, + "num_input_tokens_seen": 366934800, + "router_z_loss_mlp": 0.2364502, + "step": 4419, + "time_per_iteration": 2.660832643508911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067372, + "balance_loss_mlp": 1.04471087, + "epoch": 0.850327048864948, + "flos": 621445602816.0, + "grad_norm": 0.05805875265489693, + "language_loss": 0.84014624, + "learning_rate": 5.76065545724877e-05, + "loss": 0.85081995, + "num_input_tokens_seen": 367015152, + "router_z_loss_mlp": 0.2265625, + "step": 4420, + "time_per_iteration": 2.8057973384857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062456, + "balance_loss_mlp": 1.03862631, + "epoch": 0.8505194305502116, + "flos": 774221524992.0, + "grad_norm": 0.07039622216102548, + "language_loss": 0.80085033, + "learning_rate": 5.746146302598454e-05, + "loss": 0.81147492, + "num_input_tokens_seen": 367092192, + "router_z_loss_mlp": 0.23815918, + "step": 4421, + "time_per_iteration": 2.996438980102539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065422, + "balance_loss_mlp": 1.04161644, + "epoch": 0.8507118122354752, + "flos": 465257613312.0, + "grad_norm": 0.05957352392179271, + "language_loss": 0.86582476, + "learning_rate": 5.731654328817859e-05, + "loss": 0.87647903, + "num_input_tokens_seen": 367159744, + "router_z_loss_mlp": 0.23803711, + "step": 4422, + "time_per_iteration": 2.5918169021606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067094, + "balance_loss_mlp": 1.04385972, + "epoch": 0.8509041939207388, + "flos": 534413417472.0, + "grad_norm": 0.06352427691563187, + "language_loss": 0.84973729, + "learning_rate": 5.717179541533257e-05, + "loss": 0.86040825, + "num_input_tokens_seen": 367226384, + "router_z_loss_mlp": 0.23217773, + "step": 4423, + "time_per_iteration": 2.619405508041382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106657, + "balance_loss_mlp": 1.04306161, + "epoch": 0.8510965756060023, + "flos": 583738472448.0, + "grad_norm": 0.06302067631207116, + "language_loss": 0.84753847, + "learning_rate": 5.702721946364264e-05, + "loss": 0.85820413, + "num_input_tokens_seen": 367294768, + "router_z_loss_mlp": 0.23474121, + "step": 4424, + "time_per_iteration": 2.722527027130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066143, + "balance_loss_mlp": 1.04253995, + "epoch": 0.8512889572912659, + "flos": 600841400832.0, + "grad_norm": 0.06546706844239303, + "language_loss": 0.77509212, + "learning_rate": 5.688281548923796e-05, + "loss": 0.78575361, + "num_input_tokens_seen": 367372368, + "router_z_loss_mlp": 0.23608398, + "step": 4425, + "time_per_iteration": 2.7466671466827393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064936, + "balance_loss_mlp": 1.04149961, + "epoch": 0.8514813389765294, + "flos": 654791745024.0, + "grad_norm": 0.06580778450289504, + "language_loss": 0.78853273, + "learning_rate": 5.673858354818151e-05, + "loss": 0.79918212, + "num_input_tokens_seen": 367452656, + "router_z_loss_mlp": 0.23461914, + "step": 4426, + "time_per_iteration": 2.895609140396118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061073, + "balance_loss_mlp": 1.03810143, + "epoch": 0.851673720661793, + "flos": 429761811456.0, + "grad_norm": 0.07272347775054663, + "language_loss": 0.7861743, + "learning_rate": 5.6594523696468726e-05, + "loss": 0.796785, + "num_input_tokens_seen": 367517808, + "router_z_loss_mlp": 0.22973633, + "step": 4427, + "time_per_iteration": 2.5801033973693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068835, + "balance_loss_mlp": 1.04494596, + "epoch": 0.8518661023470565, + "flos": 641572959744.0, + "grad_norm": 0.07439328255951466, + "language_loss": 0.79870641, + "learning_rate": 5.645063599002875e-05, + "loss": 0.80939472, + "num_input_tokens_seen": 367591728, + "router_z_loss_mlp": 0.2388916, + "step": 4428, + "time_per_iteration": 2.8004066944122314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067225, + "balance_loss_mlp": 1.04265594, + "epoch": 0.8520584840323201, + "flos": 562143504384.0, + "grad_norm": 0.06621520947267241, + "language_loss": 0.80070293, + "learning_rate": 5.630692048472363e-05, + "loss": 0.81137514, + "num_input_tokens_seen": 367664496, + "router_z_loss_mlp": 0.24572754, + "step": 4429, + "time_per_iteration": 2.6723525524139404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065488, + "balance_loss_mlp": 1.04238546, + "epoch": 0.8522508657175837, + "flos": 527050395648.0, + "grad_norm": 0.0643114416219169, + "language_loss": 0.78664088, + "learning_rate": 5.61633772363489e-05, + "loss": 0.79729569, + "num_input_tokens_seen": 367735584, + "router_z_loss_mlp": 0.23095703, + "step": 4430, + "time_per_iteration": 2.5892906188964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067303, + "balance_loss_mlp": 1.04400945, + "epoch": 0.8524432474028473, + "flos": 499120247808.0, + "grad_norm": 0.052370046731540595, + "language_loss": 0.80731219, + "learning_rate": 5.602000630063298e-05, + "loss": 0.81798524, + "num_input_tokens_seen": 367801136, + "router_z_loss_mlp": 0.23303223, + "step": 4431, + "time_per_iteration": 2.5757946968078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067125, + "balance_loss_mlp": 1.04282999, + "epoch": 0.8526356290881109, + "flos": 421314048000.0, + "grad_norm": 0.075289368546831, + "language_loss": 0.79962564, + "learning_rate": 5.587680773323706e-05, + "loss": 0.81029695, + "num_input_tokens_seen": 367865312, + "router_z_loss_mlp": 0.24291992, + "step": 4432, + "time_per_iteration": 2.5692286491394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065059, + "balance_loss_mlp": 1.04178977, + "epoch": 0.8528280107733743, + "flos": 507328303104.0, + "grad_norm": 0.06440459040466175, + "language_loss": 0.80769801, + "learning_rate": 5.5733781589756115e-05, + "loss": 0.81834859, + "num_input_tokens_seen": 367931104, + "router_z_loss_mlp": 0.23278809, + "step": 4433, + "time_per_iteration": 2.5663363933563232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068841, + "balance_loss_mlp": 1.04665649, + "epoch": 0.8530203924586379, + "flos": 445893797376.0, + "grad_norm": 0.07593628310052838, + "language_loss": 0.83613151, + "learning_rate": 5.5590927925717684e-05, + "loss": 0.84682, + "num_input_tokens_seen": 367995520, + "router_z_loss_mlp": 0.22167969, + "step": 4434, + "time_per_iteration": 2.5368459224700928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066621, + "balance_loss_mlp": 1.04304099, + "epoch": 0.8532127741439015, + "flos": 657759273984.0, + "grad_norm": 0.06368291451038398, + "language_loss": 0.83650082, + "learning_rate": 5.54482467965825e-05, + "loss": 0.84716707, + "num_input_tokens_seen": 368073664, + "router_z_loss_mlp": 0.23571777, + "step": 4435, + "time_per_iteration": 2.8207342624664307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065551, + "balance_loss_mlp": 1.04325891, + "epoch": 0.8534051558291651, + "flos": 536019420672.0, + "grad_norm": 0.059557208389798, + "language_loss": 0.83486569, + "learning_rate": 5.5305738257744264e-05, + "loss": 0.84552121, + "num_input_tokens_seen": 368147536, + "router_z_loss_mlp": 0.22302246, + "step": 4436, + "time_per_iteration": 2.703518867492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070809, + "balance_loss_mlp": 1.0480994, + "epoch": 0.8535975375144286, + "flos": 533000134656.0, + "grad_norm": 0.07770512635670898, + "language_loss": 0.79491788, + "learning_rate": 5.5163402364529655e-05, + "loss": 0.80562592, + "num_input_tokens_seen": 368218672, + "router_z_loss_mlp": 0.22705078, + "step": 4437, + "time_per_iteration": 2.640202283859253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066415, + "balance_loss_mlp": 1.04366946, + "epoch": 0.8537899191996922, + "flos": 574141727232.0, + "grad_norm": 0.07771991144134452, + "language_loss": 0.82565296, + "learning_rate": 5.502123917219848e-05, + "loss": 0.83631706, + "num_input_tokens_seen": 368287056, + "router_z_loss_mlp": 0.22729492, + "step": 4438, + "time_per_iteration": 2.6925954818725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067393, + "balance_loss_mlp": 1.0444448, + "epoch": 0.8539823008849557, + "flos": 465007993344.0, + "grad_norm": 0.05861374859344016, + "language_loss": 0.8332969, + "learning_rate": 5.48792487359433e-05, + "loss": 0.84397078, + "num_input_tokens_seen": 368358400, + "router_z_loss_mlp": 0.22924805, + "step": 4439, + "time_per_iteration": 2.65185809135437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067106, + "balance_loss_mlp": 1.04427707, + "epoch": 0.8541746825702193, + "flos": 554713671168.0, + "grad_norm": 0.05951750277824149, + "language_loss": 0.81889856, + "learning_rate": 5.4737431110889745e-05, + "loss": 0.8295697, + "num_input_tokens_seen": 368427168, + "router_z_loss_mlp": 0.22839355, + "step": 4440, + "time_per_iteration": 2.674394130706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067771, + "balance_loss_mlp": 1.04465663, + "epoch": 0.8543670642554829, + "flos": 546391816704.0, + "grad_norm": 0.06592019340949207, + "language_loss": 0.77447701, + "learning_rate": 5.4595786352096165e-05, + "loss": 0.7851547, + "num_input_tokens_seen": 368503584, + "router_z_loss_mlp": 0.2310791, + "step": 4441, + "time_per_iteration": 2.7579543590545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066142, + "balance_loss_mlp": 1.04333735, + "epoch": 0.8545594459407464, + "flos": 512027744256.0, + "grad_norm": 0.055550756440976644, + "language_loss": 0.82326615, + "learning_rate": 5.4454314514554236e-05, + "loss": 0.83392757, + "num_input_tokens_seen": 368576976, + "router_z_loss_mlp": 0.22802734, + "step": 4442, + "time_per_iteration": 2.6374173164367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069636, + "balance_loss_mlp": 1.04568636, + "epoch": 0.85475182762601, + "flos": 421185567744.0, + "grad_norm": 0.062356608880068685, + "language_loss": 0.81928778, + "learning_rate": 5.431301565318786e-05, + "loss": 0.82998419, + "num_input_tokens_seen": 368641664, + "router_z_loss_mlp": 0.23950195, + "step": 4443, + "time_per_iteration": 2.493833065032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072435, + "balance_loss_mlp": 1.04957128, + "epoch": 0.8549442093112736, + "flos": 389435516928.0, + "grad_norm": 0.08995749678251017, + "language_loss": 0.7806946, + "learning_rate": 5.41718898228542e-05, + "loss": 0.79141891, + "num_input_tokens_seen": 368705616, + "router_z_loss_mlp": 0.22851562, + "step": 4444, + "time_per_iteration": 2.4674642086029053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064962, + "balance_loss_mlp": 1.04145384, + "epoch": 0.8551365909965372, + "flos": 605926282752.0, + "grad_norm": 0.06096774879214106, + "language_loss": 0.79480731, + "learning_rate": 5.403093707834334e-05, + "loss": 0.805457, + "num_input_tokens_seen": 368779664, + "router_z_loss_mlp": 0.23510742, + "step": 4445, + "time_per_iteration": 2.797175884246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066081, + "balance_loss_mlp": 1.0426091, + "epoch": 0.8553289726818007, + "flos": 504160713216.0, + "grad_norm": 0.054681566669178006, + "language_loss": 0.79001647, + "learning_rate": 5.3890157474377865e-05, + "loss": 0.8006773, + "num_input_tokens_seen": 368846656, + "router_z_loss_mlp": 0.23449707, + "step": 4446, + "time_per_iteration": 2.5630698204040527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070243, + "balance_loss_mlp": 1.04688966, + "epoch": 0.8555213543670642, + "flos": 557009063424.0, + "grad_norm": 0.05883770467526881, + "language_loss": 0.76065564, + "learning_rate": 5.374955106561324e-05, + "loss": 0.77135801, + "num_input_tokens_seen": 368923712, + "router_z_loss_mlp": 0.23339844, + "step": 4447, + "time_per_iteration": 2.7396435737609863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072768, + "balance_loss_mlp": 1.04790044, + "epoch": 0.8557137360523278, + "flos": 548104278528.0, + "grad_norm": 0.05625744975285459, + "language_loss": 0.74883693, + "learning_rate": 5.360911790663775e-05, + "loss": 0.75956464, + "num_input_tokens_seen": 368994496, + "router_z_loss_mlp": 0.24865723, + "step": 4448, + "time_per_iteration": 2.6279850006103516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069234, + "balance_loss_mlp": 1.0455358, + "epoch": 0.8559061177375914, + "flos": 728182628352.0, + "grad_norm": 0.06704266405669905, + "language_loss": 0.78576261, + "learning_rate": 5.346885805197238e-05, + "loss": 0.79645491, + "num_input_tokens_seen": 369077088, + "router_z_loss_mlp": 0.23693848, + "step": 4449, + "time_per_iteration": 2.98538875579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067263, + "balance_loss_mlp": 1.04454207, + "epoch": 0.856098499422855, + "flos": 535881028608.0, + "grad_norm": 0.07982199666480248, + "language_loss": 0.83324075, + "learning_rate": 5.332877155607085e-05, + "loss": 0.84391338, + "num_input_tokens_seen": 369147680, + "router_z_loss_mlp": 0.22705078, + "step": 4450, + "time_per_iteration": 2.7068963050842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072598, + "balance_loss_mlp": 1.0485177, + "epoch": 0.8562908811081185, + "flos": 573664882176.0, + "grad_norm": 0.05521375375553453, + "language_loss": 0.83632553, + "learning_rate": 5.3188858473319504e-05, + "loss": 0.8470515, + "num_input_tokens_seen": 369224320, + "router_z_loss_mlp": 0.24072266, + "step": 4451, + "time_per_iteration": 2.6884765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066185, + "balance_loss_mlp": 1.0432018, + "epoch": 0.856483262793382, + "flos": 781754872320.0, + "grad_norm": 0.06327105945437797, + "language_loss": 0.80799401, + "learning_rate": 5.3049118858037426e-05, + "loss": 0.81865585, + "num_input_tokens_seen": 369315744, + "router_z_loss_mlp": 0.22973633, + "step": 4452, + "time_per_iteration": 3.093397617340088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065123, + "balance_loss_mlp": 1.04212737, + "epoch": 0.8566756444786456, + "flos": 455819083776.0, + "grad_norm": 0.04933064645771812, + "language_loss": 0.8479045, + "learning_rate": 5.290955276447651e-05, + "loss": 0.85855579, + "num_input_tokens_seen": 369382800, + "router_z_loss_mlp": 0.22998047, + "step": 4453, + "time_per_iteration": 2.57102370262146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064523, + "balance_loss_mlp": 1.04082465, + "epoch": 0.8568680261639092, + "flos": 449382587904.0, + "grad_norm": 0.06346100260124406, + "language_loss": 0.84574735, + "learning_rate": 5.277016024682091e-05, + "loss": 0.85639256, + "num_input_tokens_seen": 369447312, + "router_z_loss_mlp": 0.23681641, + "step": 4454, + "time_per_iteration": 2.5162353515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066613, + "balance_loss_mlp": 1.04390395, + "epoch": 0.8570604078491728, + "flos": 479976316416.0, + "grad_norm": 0.065712336297566, + "language_loss": 0.8286593, + "learning_rate": 5.2630941359187665e-05, + "loss": 0.83932549, + "num_input_tokens_seen": 369512800, + "router_z_loss_mlp": 0.22705078, + "step": 4455, + "time_per_iteration": 2.533682346343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069011, + "balance_loss_mlp": 1.04520488, + "epoch": 0.8572527895344363, + "flos": 505942184448.0, + "grad_norm": 0.06083249202811438, + "language_loss": 0.85139161, + "learning_rate": 5.249189615562627e-05, + "loss": 0.86208177, + "num_input_tokens_seen": 369580720, + "router_z_loss_mlp": 0.23815918, + "step": 4456, + "time_per_iteration": 2.5975639820098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067778, + "balance_loss_mlp": 1.04509306, + "epoch": 0.8574451712196999, + "flos": 787044957696.0, + "grad_norm": 0.05429388196694402, + "language_loss": 0.83300918, + "learning_rate": 5.235302469011905e-05, + "loss": 0.84368694, + "num_input_tokens_seen": 369672544, + "router_z_loss_mlp": 0.22692871, + "step": 4457, + "time_per_iteration": 3.022726535797119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061605, + "balance_loss_mlp": 1.03793049, + "epoch": 0.8576375529049635, + "flos": 509252935680.0, + "grad_norm": 0.05416309124653123, + "language_loss": 0.7547797, + "learning_rate": 5.2214327016580575e-05, + "loss": 0.76539564, + "num_input_tokens_seen": 369745776, + "router_z_loss_mlp": 0.23657227, + "step": 4458, + "time_per_iteration": 2.6720643043518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101799, + "balance_loss_mlp": 1.01150465, + "epoch": 0.857829934590227, + "flos": 1460772486144.0, + "grad_norm": 0.012361752759178701, + "language_loss": 0.84767288, + "learning_rate": 5.207580318885802e-05, + "loss": 0.85785276, + "num_input_tokens_seen": 369975200, + "router_z_loss_mlp": 0.06494141, + "step": 4459, + "time_per_iteration": 4.948476314544678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067215, + "balance_loss_mlp": 1.04449415, + "epoch": 0.8580223162754905, + "flos": 479296839168.0, + "grad_norm": 0.06039783526010156, + "language_loss": 0.89551371, + "learning_rate": 5.193745326073118e-05, + "loss": 0.90618587, + "num_input_tokens_seen": 370043296, + "router_z_loss_mlp": 0.22741699, + "step": 4460, + "time_per_iteration": 2.6287126541137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068846, + "balance_loss_mlp": 1.04555237, + "epoch": 0.8582146979607541, + "flos": 706231954944.0, + "grad_norm": 0.06717737486032459, + "language_loss": 0.7925657, + "learning_rate": 5.179927728591227e-05, + "loss": 0.80325413, + "num_input_tokens_seen": 370111152, + "router_z_loss_mlp": 0.23291016, + "step": 4461, + "time_per_iteration": 2.8171298503875732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106575, + "balance_loss_mlp": 1.04228938, + "epoch": 0.8584070796460177, + "flos": 765158524416.0, + "grad_norm": 0.05415728496526175, + "language_loss": 0.8288449, + "learning_rate": 5.1661275318045874e-05, + "loss": 0.83950245, + "num_input_tokens_seen": 370190272, + "router_z_loss_mlp": 0.23449707, + "step": 4462, + "time_per_iteration": 2.9669747352600098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072466, + "balance_loss_mlp": 1.04925561, + "epoch": 0.8585994613312813, + "flos": 586829339136.0, + "grad_norm": 0.05497147450516782, + "language_loss": 0.85761333, + "learning_rate": 5.152344741070919e-05, + "loss": 0.86833793, + "num_input_tokens_seen": 370267056, + "router_z_loss_mlp": 0.23193359, + "step": 4463, + "time_per_iteration": 2.7663209438323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064609, + "balance_loss_mlp": 1.04201901, + "epoch": 0.8587918430165449, + "flos": 608295826944.0, + "grad_norm": 0.05208267140887148, + "language_loss": 0.78707975, + "learning_rate": 5.138579361741169e-05, + "loss": 0.79772592, + "num_input_tokens_seen": 370344176, + "router_z_loss_mlp": 0.22583008, + "step": 4464, + "time_per_iteration": 2.7761847972869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065329, + "balance_loss_mlp": 1.0422976, + "epoch": 0.8589842247018084, + "flos": 588981570048.0, + "grad_norm": 0.06400063819482826, + "language_loss": 0.81218016, + "learning_rate": 5.124831399159535e-05, + "loss": 0.82283336, + "num_input_tokens_seen": 370414224, + "router_z_loss_mlp": 0.23034668, + "step": 4465, + "time_per_iteration": 2.7178432941436768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069409, + "balance_loss_mlp": 1.04617512, + "epoch": 0.8591766063870719, + "flos": 543879111168.0, + "grad_norm": 0.07725520700778, + "language_loss": 0.78933436, + "learning_rate": 5.1111008586634475e-05, + "loss": 0.80002844, + "num_input_tokens_seen": 370484736, + "router_z_loss_mlp": 0.2322998, + "step": 4466, + "time_per_iteration": 2.6696112155914307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070192, + "balance_loss_mlp": 1.04685104, + "epoch": 0.8593689880723355, + "flos": 493756010496.0, + "grad_norm": 0.059887817655499394, + "language_loss": 0.80918819, + "learning_rate": 5.0973877455835816e-05, + "loss": 0.81989014, + "num_input_tokens_seen": 370556512, + "router_z_loss_mlp": 0.2331543, + "step": 4467, + "time_per_iteration": 2.6542410850524902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067848, + "balance_loss_mlp": 1.04422045, + "epoch": 0.8595613697575991, + "flos": 533909408256.0, + "grad_norm": 0.058922665395114725, + "language_loss": 0.83588117, + "learning_rate": 5.083692065243822e-05, + "loss": 0.84655964, + "num_input_tokens_seen": 370622880, + "router_z_loss_mlp": 0.23608398, + "step": 4468, + "time_per_iteration": 2.603156805038452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070297, + "balance_loss_mlp": 1.04666996, + "epoch": 0.8597537514428626, + "flos": 617628271104.0, + "grad_norm": 0.06453385462922813, + "language_loss": 0.75768328, + "learning_rate": 5.070013822961328e-05, + "loss": 0.76838624, + "num_input_tokens_seen": 370691632, + "router_z_loss_mlp": 0.23620605, + "step": 4469, + "time_per_iteration": 2.7231431007385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065548, + "balance_loss_mlp": 1.04199243, + "epoch": 0.8599461331281262, + "flos": 608730826752.0, + "grad_norm": 0.07204075811726149, + "language_loss": 0.83518052, + "learning_rate": 5.056353024046462e-05, + "loss": 0.84583598, + "num_input_tokens_seen": 370764848, + "router_z_loss_mlp": 0.23547363, + "step": 4470, + "time_per_iteration": 2.717693328857422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068389, + "balance_loss_mlp": 1.04436803, + "epoch": 0.8601385148133898, + "flos": 551252044800.0, + "grad_norm": 0.06516194685738681, + "language_loss": 0.83412385, + "learning_rate": 5.042709673802786e-05, + "loss": 0.84480774, + "num_input_tokens_seen": 370832496, + "router_z_loss_mlp": 0.2401123, + "step": 4471, + "time_per_iteration": 2.649050712585449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065346, + "balance_loss_mlp": 1.04213572, + "epoch": 0.8603308964986534, + "flos": 581200800768.0, + "grad_norm": 0.045536307661976354, + "language_loss": 0.81605756, + "learning_rate": 5.0290837775271494e-05, + "loss": 0.82671106, + "num_input_tokens_seen": 370917104, + "router_z_loss_mlp": 0.23205566, + "step": 4472, + "time_per_iteration": 2.82643985748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106475, + "balance_loss_mlp": 1.04182613, + "epoch": 0.8605232781839169, + "flos": 629013828096.0, + "grad_norm": 0.06591962589053263, + "language_loss": 0.75219733, + "learning_rate": 5.0154753405095846e-05, + "loss": 0.7628448, + "num_input_tokens_seen": 370984512, + "router_z_loss_mlp": 0.22912598, + "step": 4473, + "time_per_iteration": 2.810159206390381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064778, + "balance_loss_mlp": 1.04146099, + "epoch": 0.8607156598691804, + "flos": 468141078528.0, + "grad_norm": 0.06408761909970127, + "language_loss": 0.77287406, + "learning_rate": 5.0018843680333604e-05, + "loss": 0.78352183, + "num_input_tokens_seen": 371049664, + "router_z_loss_mlp": 0.23291016, + "step": 4474, + "time_per_iteration": 2.5086567401885986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065868, + "balance_loss_mlp": 1.04340887, + "epoch": 0.860908041554444, + "flos": 488394344448.0, + "grad_norm": 0.05383413396861342, + "language_loss": 0.82797289, + "learning_rate": 4.988310865374945e-05, + "loss": 0.83863151, + "num_input_tokens_seen": 371120704, + "router_z_loss_mlp": 0.22436523, + "step": 4475, + "time_per_iteration": 2.624570608139038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107092, + "balance_loss_mlp": 1.04825842, + "epoch": 0.8611004232397076, + "flos": 592094831616.0, + "grad_norm": 0.06546574950256999, + "language_loss": 0.80683541, + "learning_rate": 4.974754837804057e-05, + "loss": 0.81754458, + "num_input_tokens_seen": 371189376, + "router_z_loss_mlp": 0.2265625, + "step": 4476, + "time_per_iteration": 2.663764476776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070396, + "balance_loss_mlp": 1.04612505, + "epoch": 0.8612928049249712, + "flos": 774209041920.0, + "grad_norm": 0.07263779863645746, + "language_loss": 0.86455739, + "learning_rate": 4.9612162905836036e-05, + "loss": 0.87526137, + "num_input_tokens_seen": 371275184, + "router_z_loss_mlp": 0.24255371, + "step": 4477, + "time_per_iteration": 3.027892589569092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073125, + "balance_loss_mlp": 1.04973626, + "epoch": 0.8614851866102347, + "flos": 537553843200.0, + "grad_norm": 0.061884066626690507, + "language_loss": 0.82510734, + "learning_rate": 4.947695228969718e-05, + "loss": 0.83583856, + "num_input_tokens_seen": 371347920, + "router_z_loss_mlp": 0.23388672, + "step": 4478, + "time_per_iteration": 2.6411685943603516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066901, + "balance_loss_mlp": 1.04391742, + "epoch": 0.8616775682954982, + "flos": 565916419584.0, + "grad_norm": 0.06082112959712949, + "language_loss": 0.79257238, + "learning_rate": 4.934191658211729e-05, + "loss": 0.80324137, + "num_input_tokens_seen": 371419728, + "router_z_loss_mlp": 0.2298584, + "step": 4479, + "time_per_iteration": 2.668093681335449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070465, + "balance_loss_mlp": 1.04654002, + "epoch": 0.8618699499807618, + "flos": 481592231424.0, + "grad_norm": 0.06361737630648884, + "language_loss": 0.81400502, + "learning_rate": 4.92070558355221e-05, + "loss": 0.82470965, + "num_input_tokens_seen": 371488768, + "router_z_loss_mlp": 0.23937988, + "step": 4480, + "time_per_iteration": 2.5984365940093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069896, + "balance_loss_mlp": 1.04482687, + "epoch": 0.8620623316660254, + "flos": 649506802176.0, + "grad_norm": 0.07470953092140498, + "language_loss": 0.74230057, + "learning_rate": 4.9072370102269226e-05, + "loss": 0.75299954, + "num_input_tokens_seen": 371560144, + "router_z_loss_mlp": 0.25073242, + "step": 4481, + "time_per_iteration": 2.8063738346099854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067634, + "balance_loss_mlp": 1.0446502, + "epoch": 0.862254713351289, + "flos": 751781523456.0, + "grad_norm": 0.07440108509254313, + "language_loss": 0.85886705, + "learning_rate": 4.893785943464801e-05, + "loss": 0.86954337, + "num_input_tokens_seen": 371635920, + "router_z_loss_mlp": 0.2298584, + "step": 4482, + "time_per_iteration": 2.9675238132476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069198, + "balance_loss_mlp": 1.04586911, + "epoch": 0.8624470950365525, + "flos": 841543727616.0, + "grad_norm": 0.0531227577741629, + "language_loss": 0.78093559, + "learning_rate": 4.880352388488024e-05, + "loss": 0.79162753, + "num_input_tokens_seen": 371727664, + "router_z_loss_mlp": 0.23303223, + "step": 4483, + "time_per_iteration": 3.2197213172912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070235, + "balance_loss_mlp": 1.04753709, + "epoch": 0.8626394767218161, + "flos": 754793468928.0, + "grad_norm": 0.06843329806115567, + "language_loss": 0.83035022, + "learning_rate": 4.866936350511969e-05, + "loss": 0.84105253, + "num_input_tokens_seen": 371800832, + "router_z_loss_mlp": 0.22705078, + "step": 4484, + "time_per_iteration": 2.9174022674560547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065694, + "balance_loss_mlp": 1.04180419, + "epoch": 0.8628318584070797, + "flos": 703585626624.0, + "grad_norm": 0.05765697174756403, + "language_loss": 0.82464355, + "learning_rate": 4.853537834745203e-05, + "loss": 0.83530051, + "num_input_tokens_seen": 371871472, + "router_z_loss_mlp": 0.23876953, + "step": 4485, + "time_per_iteration": 2.8672118186950684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072261, + "balance_loss_mlp": 1.04927754, + "epoch": 0.8630242400923432, + "flos": 471244428288.0, + "grad_norm": 0.061061631924157964, + "language_loss": 0.78096372, + "learning_rate": 4.840156846389487e-05, + "loss": 0.7916863, + "num_input_tokens_seen": 371936512, + "router_z_loss_mlp": 0.22973633, + "step": 4486, + "time_per_iteration": 2.559305429458618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068405, + "balance_loss_mlp": 1.04474235, + "epoch": 0.8632166217776067, + "flos": 964363553280.0, + "grad_norm": 0.06474745761042122, + "language_loss": 0.77492851, + "learning_rate": 4.826793390639783e-05, + "loss": 0.78561258, + "num_input_tokens_seen": 372018032, + "router_z_loss_mlp": 0.23620605, + "step": 4487, + "time_per_iteration": 3.1866891384124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067313, + "balance_loss_mlp": 1.04441333, + "epoch": 0.8634090034628703, + "flos": 767913509376.0, + "grad_norm": 0.07806786642802178, + "language_loss": 0.7887038, + "learning_rate": 4.813447472684246e-05, + "loss": 0.79937685, + "num_input_tokens_seen": 372092176, + "router_z_loss_mlp": 0.22900391, + "step": 4488, + "time_per_iteration": 2.9155101776123047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070862, + "balance_loss_mlp": 1.04682934, + "epoch": 0.8636013851481339, + "flos": 520591504896.0, + "grad_norm": 0.06399781254677851, + "language_loss": 0.8339777, + "learning_rate": 4.800119097704214e-05, + "loss": 0.84468627, + "num_input_tokens_seen": 372166880, + "router_z_loss_mlp": 0.24035645, + "step": 4489, + "time_per_iteration": 2.7343907356262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073021, + "balance_loss_mlp": 1.04947686, + "epoch": 0.8637937668333975, + "flos": 632144342016.0, + "grad_norm": 0.06306322296515987, + "language_loss": 0.80910051, + "learning_rate": 4.7868082708742324e-05, + "loss": 0.81983078, + "num_input_tokens_seen": 372234608, + "router_z_loss_mlp": 0.23547363, + "step": 4490, + "time_per_iteration": 2.7213432788848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063802, + "balance_loss_mlp": 1.04212999, + "epoch": 0.8639861485186611, + "flos": 856094676480.0, + "grad_norm": 0.0662889748017123, + "language_loss": 0.76684427, + "learning_rate": 4.773514997362e-05, + "loss": 0.77748227, + "num_input_tokens_seen": 372314704, + "router_z_loss_mlp": 0.21679688, + "step": 4491, + "time_per_iteration": 3.1134567260742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107113, + "balance_loss_mlp": 1.04876614, + "epoch": 0.8641785302039245, + "flos": 481261118976.0, + "grad_norm": 0.061534938742705755, + "language_loss": 0.77915752, + "learning_rate": 4.7602392823284605e-05, + "loss": 0.78986883, + "num_input_tokens_seen": 372374848, + "router_z_loss_mlp": 0.22375488, + "step": 4492, + "time_per_iteration": 2.5515682697296143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076479, + "balance_loss_mlp": 1.05334055, + "epoch": 0.8643709118891881, + "flos": 504637558272.0, + "grad_norm": 0.056633924458127455, + "language_loss": 0.80623692, + "learning_rate": 4.746981130927675e-05, + "loss": 0.8170017, + "num_input_tokens_seen": 372442432, + "router_z_loss_mlp": 0.23120117, + "step": 4493, + "time_per_iteration": 2.587090015411377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066718, + "balance_loss_mlp": 1.04368663, + "epoch": 0.8645632935744517, + "flos": 552368719872.0, + "grad_norm": 0.061248751366218074, + "language_loss": 0.82094586, + "learning_rate": 4.733740548306908e-05, + "loss": 0.831613, + "num_input_tokens_seen": 372520048, + "router_z_loss_mlp": 0.23034668, + "step": 4494, + "time_per_iteration": 2.8737690448760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070355, + "balance_loss_mlp": 1.04671621, + "epoch": 0.8647556752597153, + "flos": 524737751040.0, + "grad_norm": 0.05950385437466058, + "language_loss": 0.84496897, + "learning_rate": 4.7205175396066336e-05, + "loss": 0.85567254, + "num_input_tokens_seen": 372587968, + "router_z_loss_mlp": 0.23632812, + "step": 4495, + "time_per_iteration": 2.56300687789917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067913, + "balance_loss_mlp": 1.04403555, + "epoch": 0.8649480569449788, + "flos": 787768851456.0, + "grad_norm": 0.05732436588564149, + "language_loss": 0.82358348, + "learning_rate": 4.707312109960471e-05, + "loss": 0.83426261, + "num_input_tokens_seen": 372672544, + "router_z_loss_mlp": 0.23876953, + "step": 4496, + "time_per_iteration": 3.0807862281799316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067, + "balance_loss_mlp": 1.04376626, + "epoch": 0.8651404386302424, + "flos": 763863810048.0, + "grad_norm": 0.05411692865877634, + "language_loss": 0.76783019, + "learning_rate": 4.694124264495225e-05, + "loss": 0.7785002, + "num_input_tokens_seen": 372751296, + "router_z_loss_mlp": 0.23254395, + "step": 4497, + "time_per_iteration": 3.0178675651550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069027, + "balance_loss_mlp": 1.04600811, + "epoch": 0.865332820315506, + "flos": 539893651968.0, + "grad_norm": 0.06388786819462051, + "language_loss": 0.82005155, + "learning_rate": 4.680954008330851e-05, + "loss": 0.83074188, + "num_input_tokens_seen": 372825264, + "router_z_loss_mlp": 0.23010254, + "step": 4498, + "time_per_iteration": 2.7146639823913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018384, + "balance_loss_mlp": 1.01185119, + "epoch": 0.8655252020007695, + "flos": 1476632830464.0, + "grad_norm": 0.008099303478645539, + "language_loss": 0.79174447, + "learning_rate": 4.667801346580519e-05, + "loss": 0.80192828, + "num_input_tokens_seen": 373052000, + "router_z_loss_mlp": 0.06542969, + "step": 4499, + "time_per_iteration": 4.753086090087891 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107094, + "balance_loss_mlp": 1.04775345, + "epoch": 0.8657175836860331, + "flos": 517369586688.0, + "grad_norm": 0.053545746789140966, + "language_loss": 0.82837707, + "learning_rate": 4.6546662843505396e-05, + "loss": 0.83908641, + "num_input_tokens_seen": 373124128, + "router_z_loss_mlp": 0.23168945, + "step": 4500, + "time_per_iteration": 2.6927261352539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070408, + "balance_loss_mlp": 1.04632783, + "epoch": 0.8659099653712966, + "flos": 590523333120.0, + "grad_norm": 0.0655677470147481, + "language_loss": 0.80042905, + "learning_rate": 4.641548826740394e-05, + "loss": 0.81113315, + "num_input_tokens_seen": 373195472, + "router_z_loss_mlp": 0.24072266, + "step": 4501, + "time_per_iteration": 2.672914505004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071603, + "balance_loss_mlp": 1.04906011, + "epoch": 0.8661023470565602, + "flos": 590449181184.0, + "grad_norm": 0.06647391146116884, + "language_loss": 0.88215441, + "learning_rate": 4.628448978842731e-05, + "loss": 0.89287043, + "num_input_tokens_seen": 373273504, + "router_z_loss_mlp": 0.22546387, + "step": 4502, + "time_per_iteration": 2.8460209369659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068468, + "balance_loss_mlp": 1.04535317, + "epoch": 0.8662947287418238, + "flos": 567670726656.0, + "grad_norm": 0.06408588375541317, + "language_loss": 0.79417884, + "learning_rate": 4.61536674574336e-05, + "loss": 0.80486345, + "num_input_tokens_seen": 373346032, + "router_z_loss_mlp": 0.2310791, + "step": 4503, + "time_per_iteration": 2.7376766204833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066927, + "balance_loss_mlp": 1.04433703, + "epoch": 0.8664871104270874, + "flos": 515929139712.0, + "grad_norm": 0.04956343701508282, + "language_loss": 0.82059669, + "learning_rate": 4.6023021325212636e-05, + "loss": 0.83126605, + "num_input_tokens_seen": 373419968, + "router_z_loss_mlp": 0.22595215, + "step": 4504, + "time_per_iteration": 2.7517826557159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074981, + "balance_loss_mlp": 1.05140173, + "epoch": 0.866679492112351, + "flos": 557263452672.0, + "grad_norm": 0.0593924195885579, + "language_loss": 0.78475362, + "learning_rate": 4.589255144248561e-05, + "loss": 0.79550344, + "num_input_tokens_seen": 373502448, + "router_z_loss_mlp": 0.2355957, + "step": 4505, + "time_per_iteration": 2.7980902194976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068297, + "balance_loss_mlp": 1.04519379, + "epoch": 0.8668718737976144, + "flos": 722448004608.0, + "grad_norm": 0.07401296676865261, + "language_loss": 0.82072198, + "learning_rate": 4.57622578599054e-05, + "loss": 0.83140492, + "num_input_tokens_seen": 373581184, + "router_z_loss_mlp": 0.2310791, + "step": 4506, + "time_per_iteration": 2.901148796081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070825, + "balance_loss_mlp": 1.046983, + "epoch": 0.867064255482878, + "flos": 600705580032.0, + "grad_norm": 0.06824743782303751, + "language_loss": 0.84235609, + "learning_rate": 4.5632140628056705e-05, + "loss": 0.8530643, + "num_input_tokens_seen": 373652272, + "router_z_loss_mlp": 0.23840332, + "step": 4507, + "time_per_iteration": 2.712239980697632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067376, + "balance_loss_mlp": 1.04275918, + "epoch": 0.8672566371681416, + "flos": 803527879680.0, + "grad_norm": 0.05761881771366499, + "language_loss": 0.76407146, + "learning_rate": 4.550219979745529e-05, + "loss": 0.77474517, + "num_input_tokens_seen": 373734896, + "router_z_loss_mlp": 0.24621582, + "step": 4508, + "time_per_iteration": 3.0358455181121826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070674, + "balance_loss_mlp": 1.04858494, + "epoch": 0.8674490188534052, + "flos": 627368177664.0, + "grad_norm": 0.05601064601352948, + "language_loss": 0.83846825, + "learning_rate": 4.5372435418548905e-05, + "loss": 0.84917504, + "num_input_tokens_seen": 373806960, + "router_z_loss_mlp": 0.22094727, + "step": 4509, + "time_per_iteration": 2.733057737350464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073461, + "balance_loss_mlp": 1.05034614, + "epoch": 0.8676414005386687, + "flos": 727831692288.0, + "grad_norm": 0.05491630490316898, + "language_loss": 0.86686462, + "learning_rate": 4.524284754171615e-05, + "loss": 0.87759924, + "num_input_tokens_seen": 373888352, + "router_z_loss_mlp": 0.2310791, + "step": 4510, + "time_per_iteration": 2.9853105545043945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067246, + "balance_loss_mlp": 1.0446918, + "epoch": 0.8678337822239323, + "flos": 539972573184.0, + "grad_norm": 0.06374912494206549, + "language_loss": 0.80658293, + "learning_rate": 4.5113436217267765e-05, + "loss": 0.81725538, + "num_input_tokens_seen": 373962112, + "router_z_loss_mlp": 0.2253418, + "step": 4511, + "time_per_iteration": 2.75742244720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073846, + "balance_loss_mlp": 1.05014682, + "epoch": 0.8680261639091958, + "flos": 507521023488.0, + "grad_norm": 0.07370272998017972, + "language_loss": 0.79475594, + "learning_rate": 4.4984201495445744e-05, + "loss": 0.80549443, + "num_input_tokens_seen": 374028256, + "router_z_loss_mlp": 0.23706055, + "step": 4512, + "time_per_iteration": 2.5551512241363525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073024, + "balance_loss_mlp": 1.05068421, + "epoch": 0.8682185455944594, + "flos": 487126794240.0, + "grad_norm": 0.05959366195168138, + "language_loss": 0.81097651, + "learning_rate": 4.4855143426423275e-05, + "loss": 0.82170677, + "num_input_tokens_seen": 374100080, + "router_z_loss_mlp": 0.22338867, + "step": 4513, + "time_per_iteration": 2.6309256553649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064992, + "balance_loss_mlp": 1.0413053, + "epoch": 0.868410927279723, + "flos": 603690361344.0, + "grad_norm": 0.08611526780858492, + "language_loss": 0.81241572, + "learning_rate": 4.472626206030528e-05, + "loss": 0.82306564, + "num_input_tokens_seen": 374174368, + "router_z_loss_mlp": 0.23669434, + "step": 4514, + "time_per_iteration": 2.753772258758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065128, + "balance_loss_mlp": 1.04249024, + "epoch": 0.8686033089649865, + "flos": 1118985186816.0, + "grad_norm": 0.06244049051454504, + "language_loss": 0.85057306, + "learning_rate": 4.4597557447127846e-05, + "loss": 0.86122435, + "num_input_tokens_seen": 374257328, + "router_z_loss_mlp": 0.22631836, + "step": 4515, + "time_per_iteration": 3.37809157371521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071275, + "balance_loss_mlp": 1.04789805, + "epoch": 0.8687956906502501, + "flos": 568019091456.0, + "grad_norm": 0.07169275987782167, + "language_loss": 0.84147042, + "learning_rate": 4.446902963685862e-05, + "loss": 0.85218316, + "num_input_tokens_seen": 374327936, + "router_z_loss_mlp": 0.23364258, + "step": 4516, + "time_per_iteration": 2.7013230323791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072826, + "balance_loss_mlp": 1.04969954, + "epoch": 0.8689880723355137, + "flos": 544338703872.0, + "grad_norm": 0.05707773988768033, + "language_loss": 0.84542006, + "learning_rate": 4.4340678679396454e-05, + "loss": 0.8561483, + "num_input_tokens_seen": 374400496, + "router_z_loss_mlp": 0.23132324, + "step": 4517, + "time_per_iteration": 2.680288553237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064511, + "balance_loss_mlp": 1.04248095, + "epoch": 0.8691804540207773, + "flos": 457425086976.0, + "grad_norm": 0.054403419993017434, + "language_loss": 0.86638057, + "learning_rate": 4.4212504624571495e-05, + "loss": 0.8770256, + "num_input_tokens_seen": 374470528, + "router_z_loss_mlp": 0.22045898, + "step": 4518, + "time_per_iteration": 2.600592851638794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072363, + "balance_loss_mlp": 1.04929602, + "epoch": 0.8693728357060407, + "flos": 591872375808.0, + "grad_norm": 0.10977029070372525, + "language_loss": 0.80449891, + "learning_rate": 4.40845075221456e-05, + "loss": 0.81522256, + "num_input_tokens_seen": 374542656, + "router_z_loss_mlp": 0.23046875, + "step": 4519, + "time_per_iteration": 2.6959917545318604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068235, + "balance_loss_mlp": 1.04550183, + "epoch": 0.8695652173913043, + "flos": 680263515648.0, + "grad_norm": 0.07063102668534314, + "language_loss": 0.79929483, + "learning_rate": 4.395668742181164e-05, + "loss": 0.80997729, + "num_input_tokens_seen": 374617232, + "router_z_loss_mlp": 0.22741699, + "step": 4520, + "time_per_iteration": 2.875837564468384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072228, + "balance_loss_mlp": 1.04933965, + "epoch": 0.8697575990765679, + "flos": 492362551296.0, + "grad_norm": 0.05846160693861355, + "language_loss": 0.78477466, + "learning_rate": 4.38290443731934e-05, + "loss": 0.79549694, + "num_input_tokens_seen": 374681888, + "router_z_loss_mlp": 0.22875977, + "step": 4521, + "time_per_iteration": 2.58884334564209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071876, + "balance_loss_mlp": 1.04863048, + "epoch": 0.8699499807618315, + "flos": 526949079552.0, + "grad_norm": 0.06228511759625967, + "language_loss": 0.81880158, + "learning_rate": 4.370157842584671e-05, + "loss": 0.82952034, + "num_input_tokens_seen": 374750464, + "router_z_loss_mlp": 0.2322998, + "step": 4522, + "time_per_iteration": 2.6632633209228516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070463, + "balance_loss_mlp": 1.04691923, + "epoch": 0.8701423624470951, + "flos": 814342616064.0, + "grad_norm": 0.061067995662179124, + "language_loss": 0.80669498, + "learning_rate": 4.357428962925808e-05, + "loss": 0.81739956, + "num_input_tokens_seen": 374836064, + "router_z_loss_mlp": 0.23547363, + "step": 4523, + "time_per_iteration": 3.109477996826172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069602, + "balance_loss_mlp": 1.04596305, + "epoch": 0.8703347441323586, + "flos": 556789178880.0, + "grad_norm": 0.05389586568616872, + "language_loss": 0.88408816, + "learning_rate": 4.344717803284542e-05, + "loss": 0.89478421, + "num_input_tokens_seen": 374903392, + "router_z_loss_mlp": 0.2364502, + "step": 4524, + "time_per_iteration": 2.649951934814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067785, + "balance_loss_mlp": 1.04521906, + "epoch": 0.8705271258176221, + "flos": 585443220480.0, + "grad_norm": 0.050832171139039956, + "language_loss": 0.84775817, + "learning_rate": 4.3320243685957825e-05, + "loss": 0.85843599, + "num_input_tokens_seen": 374985904, + "router_z_loss_mlp": 0.22558594, + "step": 4525, + "time_per_iteration": 2.793938398361206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074171, + "balance_loss_mlp": 1.05107999, + "epoch": 0.8707195075028857, + "flos": 669216411648.0, + "grad_norm": 0.05409510643219343, + "language_loss": 0.85498273, + "learning_rate": 4.3193486637875536e-05, + "loss": 0.86572444, + "num_input_tokens_seen": 375062992, + "router_z_loss_mlp": 0.23071289, + "step": 4526, + "time_per_iteration": 2.8929967880249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072832, + "balance_loss_mlp": 1.04955089, + "epoch": 0.8709118891881493, + "flos": 520391443968.0, + "grad_norm": 0.05925337750259882, + "language_loss": 0.83831525, + "learning_rate": 4.306690693781007e-05, + "loss": 0.84904361, + "num_input_tokens_seen": 375139296, + "router_z_loss_mlp": 0.23242188, + "step": 4527, + "time_per_iteration": 2.7890868186950684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073321, + "balance_loss_mlp": 1.04992008, + "epoch": 0.8711042708734128, + "flos": 553208984064.0, + "grad_norm": 0.0613716870727963, + "language_loss": 0.81508851, + "learning_rate": 4.294050463490401e-05, + "loss": 0.8258217, + "num_input_tokens_seen": 375206576, + "router_z_loss_mlp": 0.23376465, + "step": 4528, + "time_per_iteration": 2.6644904613494873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071211, + "balance_loss_mlp": 1.04809642, + "epoch": 0.8712966525586764, + "flos": 502193862144.0, + "grad_norm": 0.06354318579498781, + "language_loss": 0.82647032, + "learning_rate": 4.281427977823094e-05, + "loss": 0.8371824, + "num_input_tokens_seen": 375279008, + "router_z_loss_mlp": 0.23095703, + "step": 4529, + "time_per_iteration": 2.7259349822998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072084, + "balance_loss_mlp": 1.04900551, + "epoch": 0.87148903424394, + "flos": 804096129024.0, + "grad_norm": 0.07176062030792234, + "language_loss": 0.73956883, + "learning_rate": 4.268823241679593e-05, + "loss": 0.75028968, + "num_input_tokens_seen": 375368512, + "router_z_loss_mlp": 0.23071289, + "step": 4530, + "time_per_iteration": 3.0482122898101807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066396, + "balance_loss_mlp": 1.04405594, + "epoch": 0.8716814159292036, + "flos": 773438160384.0, + "grad_norm": 0.05324576047171836, + "language_loss": 0.86157966, + "learning_rate": 4.256236259953489e-05, + "loss": 0.87224358, + "num_input_tokens_seen": 375450528, + "router_z_loss_mlp": 0.2232666, + "step": 4531, + "time_per_iteration": 3.0000054836273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073767, + "balance_loss_mlp": 1.05127263, + "epoch": 0.8718737976144671, + "flos": 486835329024.0, + "grad_norm": 0.0693772936852786, + "language_loss": 0.85297459, + "learning_rate": 4.243667037531468e-05, + "loss": 0.86371231, + "num_input_tokens_seen": 375518256, + "router_z_loss_mlp": 0.22509766, + "step": 4532, + "time_per_iteration": 2.5742595195770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066232, + "balance_loss_mlp": 1.04420245, + "epoch": 0.8720661792997306, + "flos": 584123913216.0, + "grad_norm": 0.05401834066976159, + "language_loss": 0.78774154, + "learning_rate": 4.2311155792933264e-05, + "loss": 0.79840392, + "num_input_tokens_seen": 375588112, + "router_z_loss_mlp": 0.22021484, + "step": 4533, + "time_per_iteration": 2.7111940383911133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012669, + "balance_loss_mlp": 1.00599337, + "epoch": 0.8722585609849942, + "flos": 1495942318080.0, + "grad_norm": 0.007267099236894943, + "language_loss": 0.80966806, + "learning_rate": 4.2185818901119946e-05, + "loss": 0.81979477, + "num_input_tokens_seen": 375814496, + "router_z_loss_mlp": 0.06689453, + "step": 4534, + "time_per_iteration": 4.824821472167969 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066979, + "balance_loss_mlp": 1.04374468, + "epoch": 0.8724509426702578, + "flos": 596169123840.0, + "grad_norm": 0.08694763070174955, + "language_loss": 0.87074769, + "learning_rate": 4.206065974853479e-05, + "loss": 0.88141751, + "num_input_tokens_seen": 375885440, + "router_z_loss_mlp": 0.23242188, + "step": 4535, + "time_per_iteration": 2.7852017879486084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073329, + "balance_loss_mlp": 1.04880726, + "epoch": 0.8726433243555214, + "flos": 443635481088.0, + "grad_norm": 0.05794421174501006, + "language_loss": 0.81177545, + "learning_rate": 4.193567838376888e-05, + "loss": 0.82250875, + "num_input_tokens_seen": 375952640, + "router_z_loss_mlp": 0.24499512, + "step": 4536, + "time_per_iteration": 2.5551156997680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065938, + "balance_loss_mlp": 1.04312158, + "epoch": 0.8728357060407849, + "flos": 553181819904.0, + "grad_norm": 0.06646899623951459, + "language_loss": 0.82300961, + "learning_rate": 4.181087485534402e-05, + "loss": 0.83366895, + "num_input_tokens_seen": 376021648, + "router_z_loss_mlp": 0.22827148, + "step": 4537, + "time_per_iteration": 2.639230728149414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066126, + "balance_loss_mlp": 1.04342866, + "epoch": 0.8730280877260485, + "flos": 627807946752.0, + "grad_norm": 0.06707141042951141, + "language_loss": 0.79122996, + "learning_rate": 4.16862492117136e-05, + "loss": 0.80189127, + "num_input_tokens_seen": 376102304, + "router_z_loss_mlp": 0.22692871, + "step": 4538, + "time_per_iteration": 2.806157350540161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064927, + "balance_loss_mlp": 1.04149008, + "epoch": 0.873220469411312, + "flos": 535384359936.0, + "grad_norm": 0.06758172406923339, + "language_loss": 0.80222809, + "learning_rate": 4.156180150126143e-05, + "loss": 0.81287742, + "num_input_tokens_seen": 376177072, + "router_z_loss_mlp": 0.23425293, + "step": 4539, + "time_per_iteration": 2.721412420272827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069845, + "balance_loss_mlp": 1.04654002, + "epoch": 0.8734128510965756, + "flos": 561883972608.0, + "grad_norm": 0.057519719081089375, + "language_loss": 0.84056413, + "learning_rate": 4.143753177230242e-05, + "loss": 0.85126257, + "num_input_tokens_seen": 376251376, + "router_z_loss_mlp": 0.2331543, + "step": 4540, + "time_per_iteration": 2.706616163253784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067834, + "balance_loss_mlp": 1.04486275, + "epoch": 0.8736052327818392, + "flos": 686467643904.0, + "grad_norm": 0.06653511461765922, + "language_loss": 0.79649061, + "learning_rate": 4.131344007308224e-05, + "loss": 0.80716896, + "num_input_tokens_seen": 376337104, + "router_z_loss_mlp": 0.22973633, + "step": 4541, + "time_per_iteration": 2.9499471187591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066457, + "balance_loss_mlp": 1.04360437, + "epoch": 0.8737976144671027, + "flos": 531673113600.0, + "grad_norm": 0.06068280649961363, + "language_loss": 0.81656998, + "learning_rate": 4.1189526451777816e-05, + "loss": 0.82723451, + "num_input_tokens_seen": 376415456, + "router_z_loss_mlp": 0.22875977, + "step": 4542, + "time_per_iteration": 2.791944980621338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070959, + "balance_loss_mlp": 1.04743886, + "epoch": 0.8739899961523663, + "flos": 575592086016.0, + "grad_norm": 0.05207946464598878, + "language_loss": 0.82126415, + "learning_rate": 4.106579095649649e-05, + "loss": 0.83197367, + "num_input_tokens_seen": 376494880, + "router_z_loss_mlp": 0.23510742, + "step": 4543, + "time_per_iteration": 2.881683826446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069601, + "balance_loss_mlp": 1.0456636, + "epoch": 0.8741823778376299, + "flos": 731332965888.0, + "grad_norm": 0.06114890089008731, + "language_loss": 0.76864976, + "learning_rate": 4.094223363527666e-05, + "loss": 0.77934569, + "num_input_tokens_seen": 376571760, + "router_z_loss_mlp": 0.23937988, + "step": 4544, + "time_per_iteration": 2.8996529579162598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067897, + "balance_loss_mlp": 1.04386425, + "epoch": 0.8743747595228935, + "flos": 567080082432.0, + "grad_norm": 0.05969360232616191, + "language_loss": 0.8386988, + "learning_rate": 4.081885453608747e-05, + "loss": 0.84937775, + "num_input_tokens_seen": 376644464, + "router_z_loss_mlp": 0.2401123, + "step": 4545, + "time_per_iteration": 2.7387166023254395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067094, + "balance_loss_mlp": 1.04331172, + "epoch": 0.8745671412081569, + "flos": 493370569728.0, + "grad_norm": 0.056432657681556246, + "language_loss": 0.82131648, + "learning_rate": 4.0695653706829095e-05, + "loss": 0.83198744, + "num_input_tokens_seen": 376709584, + "router_z_loss_mlp": 0.23779297, + "step": 4546, + "time_per_iteration": 2.575007915496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106455, + "balance_loss_mlp": 1.04134023, + "epoch": 0.8747595228934205, + "flos": 524139766272.0, + "grad_norm": 0.05353066559861675, + "language_loss": 0.83669919, + "learning_rate": 4.057263119533233e-05, + "loss": 0.8473447, + "num_input_tokens_seen": 376779472, + "router_z_loss_mlp": 0.23193359, + "step": 4547, + "time_per_iteration": 2.627749443054199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065846, + "balance_loss_mlp": 1.04327965, + "epoch": 0.8749519045786841, + "flos": 744349118976.0, + "grad_norm": 0.0643040936183351, + "language_loss": 0.80280411, + "learning_rate": 4.044978704935853e-05, + "loss": 0.81346262, + "num_input_tokens_seen": 376863408, + "router_z_loss_mlp": 0.22570801, + "step": 4548, + "time_per_iteration": 3.0364036560058594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075075, + "balance_loss_mlp": 1.05203199, + "epoch": 0.8751442862639477, + "flos": 594278995968.0, + "grad_norm": 0.06542874520616514, + "language_loss": 0.80029333, + "learning_rate": 4.032712131660027e-05, + "loss": 0.81104398, + "num_input_tokens_seen": 376942080, + "router_z_loss_mlp": 0.23034668, + "step": 4549, + "time_per_iteration": 2.8610854148864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067604, + "balance_loss_mlp": 1.04379821, + "epoch": 0.8753366679492113, + "flos": 496530819072.0, + "grad_norm": 0.0491879081167887, + "language_loss": 0.78780919, + "learning_rate": 4.020463404468055e-05, + "loss": 0.79848522, + "num_input_tokens_seen": 377015696, + "router_z_loss_mlp": 0.23791504, + "step": 4550, + "time_per_iteration": 2.724942684173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067747, + "balance_loss_mlp": 1.04468012, + "epoch": 0.8755290496344748, + "flos": 489864526848.0, + "grad_norm": 0.07218852967777867, + "language_loss": 0.82172048, + "learning_rate": 4.0082325281153074e-05, + "loss": 0.832398, + "num_input_tokens_seen": 377081424, + "router_z_loss_mlp": 0.23059082, + "step": 4551, + "time_per_iteration": 2.6174588203430176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069001, + "balance_loss_mlp": 1.04489684, + "epoch": 0.8757214313197383, + "flos": 591859892736.0, + "grad_norm": 0.06050960233679645, + "language_loss": 0.81907594, + "learning_rate": 3.9960195073502345e-05, + "loss": 0.82976592, + "num_input_tokens_seen": 377159360, + "router_z_loss_mlp": 0.24108887, + "step": 4552, + "time_per_iteration": 2.8340346813201904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065165, + "balance_loss_mlp": 1.04184747, + "epoch": 0.8759138130050019, + "flos": 976843763712.0, + "grad_norm": 0.06903868945856485, + "language_loss": 0.78443825, + "learning_rate": 3.9838243469143555e-05, + "loss": 0.7950899, + "num_input_tokens_seen": 377240704, + "router_z_loss_mlp": 0.2331543, + "step": 4553, + "time_per_iteration": 3.2752091884613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066678, + "balance_loss_mlp": 1.04353917, + "epoch": 0.8761061946902655, + "flos": 802764338688.0, + "grad_norm": 0.05543328693118016, + "language_loss": 0.78072476, + "learning_rate": 3.971647051542243e-05, + "loss": 0.79139149, + "num_input_tokens_seen": 377324176, + "router_z_loss_mlp": 0.23132324, + "step": 4554, + "time_per_iteration": 3.1207494735717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106636, + "balance_loss_mlp": 1.04254174, + "epoch": 0.8762985763755291, + "flos": 698495602176.0, + "grad_norm": 0.07396612314810898, + "language_loss": 0.74972981, + "learning_rate": 3.95948762596155e-05, + "loss": 0.76039344, + "num_input_tokens_seen": 377403440, + "router_z_loss_mlp": 0.23815918, + "step": 4555, + "time_per_iteration": 2.987738847732544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066573, + "balance_loss_mlp": 1.04325533, + "epoch": 0.8764909580607926, + "flos": 629717898240.0, + "grad_norm": 0.06305038969435892, + "language_loss": 0.80471361, + "learning_rate": 3.9473460748929765e-05, + "loss": 0.81537932, + "num_input_tokens_seen": 377483440, + "router_z_loss_mlp": 0.2331543, + "step": 4556, + "time_per_iteration": 2.846748113632202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071271, + "balance_loss_mlp": 1.04776287, + "epoch": 0.8766833397460562, + "flos": 481545243648.0, + "grad_norm": 0.054215636219797275, + "language_loss": 0.80461884, + "learning_rate": 3.935222403050304e-05, + "loss": 0.81533158, + "num_input_tokens_seen": 377554688, + "router_z_loss_mlp": 0.23498535, + "step": 4557, + "time_per_iteration": 2.6457924842834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070243, + "balance_loss_mlp": 1.04678226, + "epoch": 0.8768757214313198, + "flos": 407734414848.0, + "grad_norm": 0.0647505040176177, + "language_loss": 0.78280514, + "learning_rate": 3.923116615140354e-05, + "loss": 0.79350758, + "num_input_tokens_seen": 377617616, + "router_z_loss_mlp": 0.23461914, + "step": 4558, + "time_per_iteration": 2.488900899887085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069964, + "balance_loss_mlp": 1.04682493, + "epoch": 0.8770681031165833, + "flos": 582582150144.0, + "grad_norm": 0.09809770329517786, + "language_loss": 0.82181263, + "learning_rate": 3.9110287158630076e-05, + "loss": 0.8325122, + "num_input_tokens_seen": 377685888, + "router_z_loss_mlp": 0.23132324, + "step": 4559, + "time_per_iteration": 2.706878185272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069373, + "balance_loss_mlp": 1.04493523, + "epoch": 0.8772604848018468, + "flos": 508687257600.0, + "grad_norm": 0.06124524384081237, + "language_loss": 0.80961919, + "learning_rate": 3.8989587099111875e-05, + "loss": 0.82031298, + "num_input_tokens_seen": 377755744, + "router_z_loss_mlp": 0.2442627, + "step": 4560, + "time_per_iteration": 2.6287667751312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010685, + "balance_loss_mlp": 1.04520679, + "epoch": 0.8774528664871104, + "flos": 408836408832.0, + "grad_norm": 0.06993098532938485, + "language_loss": 0.85462463, + "learning_rate": 3.886906601970913e-05, + "loss": 0.8653096, + "num_input_tokens_seen": 377818880, + "router_z_loss_mlp": 0.23254395, + "step": 4561, + "time_per_iteration": 2.4868052005767822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064436, + "balance_loss_mlp": 1.04136872, + "epoch": 0.877645248172374, + "flos": 500844819456.0, + "grad_norm": 0.047628659589740864, + "language_loss": 0.8361448, + "learning_rate": 3.8748723967212184e-05, + "loss": 0.84678912, + "num_input_tokens_seen": 377893280, + "router_z_loss_mlp": 0.23046875, + "step": 4562, + "time_per_iteration": 2.6149418354034424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068204, + "balance_loss_mlp": 1.04525661, + "epoch": 0.8778376298576376, + "flos": 633145019904.0, + "grad_norm": 0.05889169926682073, + "language_loss": 0.78304517, + "learning_rate": 3.862856098834189e-05, + "loss": 0.79372722, + "num_input_tokens_seen": 377972912, + "router_z_loss_mlp": 0.22912598, + "step": 4563, + "time_per_iteration": 2.857564687728882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068144, + "balance_loss_mlp": 1.04482722, + "epoch": 0.8780300115429012, + "flos": 533988329472.0, + "grad_norm": 0.062171034291002715, + "language_loss": 0.8044073, + "learning_rate": 3.850857712974976e-05, + "loss": 0.81508875, + "num_input_tokens_seen": 378054000, + "router_z_loss_mlp": 0.2331543, + "step": 4564, + "time_per_iteration": 2.798351526260376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064203, + "balance_loss_mlp": 1.04216146, + "epoch": 0.8782223932281646, + "flos": 511662127104.0, + "grad_norm": 0.052130218938398365, + "language_loss": 0.77531612, + "learning_rate": 3.838877243801758e-05, + "loss": 0.78595817, + "num_input_tokens_seen": 378120336, + "router_z_loss_mlp": 0.22058105, + "step": 4565, + "time_per_iteration": 2.5758793354034424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064441, + "balance_loss_mlp": 1.04098022, + "epoch": 0.8784147749134282, + "flos": 780714547200.0, + "grad_norm": 0.06308413249676079, + "language_loss": 0.70176268, + "learning_rate": 3.826914695965766e-05, + "loss": 0.71240711, + "num_input_tokens_seen": 378216672, + "router_z_loss_mlp": 0.23449707, + "step": 4566, + "time_per_iteration": 3.148693084716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072907, + "balance_loss_mlp": 1.04900551, + "epoch": 0.8786071565986918, + "flos": 561004434432.0, + "grad_norm": 0.06612384532340969, + "language_loss": 0.75962007, + "learning_rate": 3.814970074111279e-05, + "loss": 0.77034914, + "num_input_tokens_seen": 378287536, + "router_z_loss_mlp": 0.2388916, + "step": 4567, + "time_per_iteration": 2.6695103645324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066088, + "balance_loss_mlp": 1.0432117, + "epoch": 0.8787995382839554, + "flos": 603448081920.0, + "grad_norm": 0.05080752532897636, + "language_loss": 0.77430034, + "learning_rate": 3.8030433828755926e-05, + "loss": 0.78496122, + "num_input_tokens_seen": 378362128, + "router_z_loss_mlp": 0.2286377, + "step": 4568, + "time_per_iteration": 2.8336360454559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010713, + "balance_loss_mlp": 1.04786336, + "epoch": 0.8789919199692189, + "flos": 560233552896.0, + "grad_norm": 0.04760794267274833, + "language_loss": 0.85342216, + "learning_rate": 3.7911346268890924e-05, + "loss": 0.86413515, + "num_input_tokens_seen": 378435696, + "router_z_loss_mlp": 0.23425293, + "step": 4569, + "time_per_iteration": 2.661607027053833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068074, + "balance_loss_mlp": 1.04515028, + "epoch": 0.8791843016544825, + "flos": 539115429888.0, + "grad_norm": 0.07652247848761115, + "language_loss": 0.82203466, + "learning_rate": 3.7792438107751405e-05, + "loss": 0.83271539, + "num_input_tokens_seen": 378505664, + "router_z_loss_mlp": 0.22912598, + "step": 4570, + "time_per_iteration": 2.638333797454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066012, + "balance_loss_mlp": 1.04247975, + "epoch": 0.8793766833397461, + "flos": 1008699899904.0, + "grad_norm": 0.060157796139736874, + "language_loss": 0.79238844, + "learning_rate": 3.767370939150167e-05, + "loss": 0.80304855, + "num_input_tokens_seen": 378598016, + "router_z_loss_mlp": 0.23522949, + "step": 4571, + "time_per_iteration": 3.3326447010040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068902, + "balance_loss_mlp": 1.04619253, + "epoch": 0.8795690650250096, + "flos": 678637688832.0, + "grad_norm": 0.06297064041998576, + "language_loss": 0.81085575, + "learning_rate": 3.755516016623628e-05, + "loss": 0.82154483, + "num_input_tokens_seen": 378676176, + "router_z_loss_mlp": 0.22705078, + "step": 4572, + "time_per_iteration": 2.864213466644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064824, + "balance_loss_mlp": 1.04248405, + "epoch": 0.8797614467102732, + "flos": 453432287232.0, + "grad_norm": 0.07526696415959916, + "language_loss": 0.88917291, + "learning_rate": 3.7436790477980157e-05, + "loss": 0.8998211, + "num_input_tokens_seen": 378737952, + "router_z_loss_mlp": 0.22338867, + "step": 4573, + "time_per_iteration": 2.5274643898010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061759, + "balance_loss_mlp": 1.03906155, + "epoch": 0.8799538283955367, + "flos": 550913591808.0, + "grad_norm": 0.05173001406565122, + "language_loss": 0.84684122, + "learning_rate": 3.7318600372688526e-05, + "loss": 0.85745883, + "num_input_tokens_seen": 378806704, + "router_z_loss_mlp": 0.22692871, + "step": 4574, + "time_per_iteration": 2.658022880554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067289, + "balance_loss_mlp": 1.04462719, + "epoch": 0.8801462100808003, + "flos": 807429275136.0, + "grad_norm": 0.06273590895888136, + "language_loss": 0.84730029, + "learning_rate": 3.720058989624681e-05, + "loss": 0.85797316, + "num_input_tokens_seen": 378887616, + "router_z_loss_mlp": 0.2265625, + "step": 4575, + "time_per_iteration": 3.0476410388946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066559, + "balance_loss_mlp": 1.04321766, + "epoch": 0.8803385917660639, + "flos": 768694302720.0, + "grad_norm": 0.06194945709094393, + "language_loss": 0.84745121, + "learning_rate": 3.708275909447079e-05, + "loss": 0.85811675, + "num_input_tokens_seen": 378964656, + "router_z_loss_mlp": 0.23327637, + "step": 4576, + "time_per_iteration": 2.9632747173309326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106577, + "balance_loss_mlp": 1.04261923, + "epoch": 0.8805309734513275, + "flos": 567339614208.0, + "grad_norm": 0.05151265510881933, + "language_loss": 0.81348932, + "learning_rate": 3.696510801310632e-05, + "loss": 0.82414699, + "num_input_tokens_seen": 379036752, + "router_z_loss_mlp": 0.23156738, + "step": 4577, + "time_per_iteration": 2.8133304119110107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063951, + "balance_loss_mlp": 1.04054976, + "epoch": 0.880723355136591, + "flos": 679779330048.0, + "grad_norm": 0.05836612347560367, + "language_loss": 0.81520814, + "learning_rate": 3.6847636697829755e-05, + "loss": 0.82584763, + "num_input_tokens_seen": 379106480, + "router_z_loss_mlp": 0.23388672, + "step": 4578, + "time_per_iteration": 2.8322298526763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065318, + "balance_loss_mlp": 1.04195356, + "epoch": 0.8809157368218545, + "flos": 565629723648.0, + "grad_norm": 0.05785232001619173, + "language_loss": 0.79189932, + "learning_rate": 3.673034519424734e-05, + "loss": 0.80255246, + "num_input_tokens_seen": 379182544, + "router_z_loss_mlp": 0.23339844, + "step": 4579, + "time_per_iteration": 2.752981424331665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065349, + "balance_loss_mlp": 1.04259169, + "epoch": 0.8811081185071181, + "flos": 515407878144.0, + "grad_norm": 0.05306189241878955, + "language_loss": 0.7616868, + "learning_rate": 3.661323354789586e-05, + "loss": 0.7723403, + "num_input_tokens_seen": 379255856, + "router_z_loss_mlp": 0.22741699, + "step": 4580, + "time_per_iteration": 2.6905887126922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071852, + "balance_loss_mlp": 1.04929709, + "epoch": 0.8813005001923817, + "flos": 594343236096.0, + "grad_norm": 0.08666583481538859, + "language_loss": 0.81318676, + "learning_rate": 3.649630180424191e-05, + "loss": 0.82390535, + "num_input_tokens_seen": 379322704, + "router_z_loss_mlp": 0.22558594, + "step": 4581, + "time_per_iteration": 2.717012405395508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063192, + "balance_loss_mlp": 1.04038763, + "epoch": 0.8814928818776453, + "flos": 666940843008.0, + "grad_norm": 0.065573570347403, + "language_loss": 0.79248452, + "learning_rate": 3.637955000868254e-05, + "loss": 0.80311644, + "num_input_tokens_seen": 379395008, + "router_z_loss_mlp": 0.22802734, + "step": 4582, + "time_per_iteration": 2.8327713012695312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061504, + "balance_loss_mlp": 1.03867507, + "epoch": 0.8816852635629088, + "flos": 609153343488.0, + "grad_norm": 0.06392391603853627, + "language_loss": 0.85894233, + "learning_rate": 3.626297820654467e-05, + "loss": 0.86955738, + "num_input_tokens_seen": 379465824, + "router_z_loss_mlp": 0.22814941, + "step": 4583, + "time_per_iteration": 2.738196611404419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062285, + "balance_loss_mlp": 1.03962302, + "epoch": 0.8818776452481724, + "flos": 480379009536.0, + "grad_norm": 0.05949601361971884, + "language_loss": 0.82285428, + "learning_rate": 3.614658644308572e-05, + "loss": 0.83347714, + "num_input_tokens_seen": 379534960, + "router_z_loss_mlp": 0.2265625, + "step": 4584, + "time_per_iteration": 2.578489303588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072275, + "balance_loss_mlp": 1.04852891, + "epoch": 0.882070026933436, + "flos": 1045394242560.0, + "grad_norm": 0.062197672953105604, + "language_loss": 0.73814642, + "learning_rate": 3.60303747634928e-05, + "loss": 0.74886918, + "num_input_tokens_seen": 379617456, + "router_z_loss_mlp": 0.23742676, + "step": 4585, + "time_per_iteration": 3.293839931488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063064, + "balance_loss_mlp": 1.03912687, + "epoch": 0.8822624086186995, + "flos": 474409446912.0, + "grad_norm": 0.05599443647615175, + "language_loss": 0.79877216, + "learning_rate": 3.591434321288345e-05, + "loss": 0.80940282, + "num_input_tokens_seen": 379687792, + "router_z_loss_mlp": 0.23925781, + "step": 4586, + "time_per_iteration": 2.6354963779449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066824, + "balance_loss_mlp": 1.04409122, + "epoch": 0.882454790303963, + "flos": 654023434752.0, + "grad_norm": 0.187385514178466, + "language_loss": 0.82061517, + "learning_rate": 3.579849183630485e-05, + "loss": 0.83128339, + "num_input_tokens_seen": 379761120, + "router_z_loss_mlp": 0.22717285, + "step": 4587, + "time_per_iteration": 2.8057334423065186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065861, + "balance_loss_mlp": 1.04243684, + "epoch": 0.8826471719892266, + "flos": 470325242880.0, + "grad_norm": 0.0652059168593454, + "language_loss": 0.7890746, + "learning_rate": 3.568282067873468e-05, + "loss": 0.79973322, + "num_input_tokens_seen": 379829008, + "router_z_loss_mlp": 0.23425293, + "step": 4588, + "time_per_iteration": 2.6349856853485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106342, + "balance_loss_mlp": 1.04065084, + "epoch": 0.8828395536744902, + "flos": 468753744384.0, + "grad_norm": 0.05333811526852745, + "language_loss": 0.83772522, + "learning_rate": 3.556732978508048e-05, + "loss": 0.84835941, + "num_input_tokens_seen": 379899584, + "router_z_loss_mlp": 0.22766113, + "step": 4589, + "time_per_iteration": 2.6871464252471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060113, + "balance_loss_mlp": 1.03739214, + "epoch": 0.8830319353597538, + "flos": 721377944064.0, + "grad_norm": 0.05489948946864513, + "language_loss": 0.81401742, + "learning_rate": 3.545201920017971e-05, + "loss": 0.82461852, + "num_input_tokens_seen": 379979440, + "router_z_loss_mlp": 0.22705078, + "step": 4590, + "time_per_iteration": 2.9603042602539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064404, + "balance_loss_mlp": 1.04144478, + "epoch": 0.8832243170450174, + "flos": 443277204480.0, + "grad_norm": 0.06729262169769755, + "language_loss": 0.81461012, + "learning_rate": 3.5336888968799996e-05, + "loss": 0.8252542, + "num_input_tokens_seen": 380046944, + "router_z_loss_mlp": 0.22949219, + "step": 4591, + "time_per_iteration": 2.5702083110809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106648, + "balance_loss_mlp": 1.04310322, + "epoch": 0.8834166987302808, + "flos": 566583413760.0, + "grad_norm": 0.07104609696133038, + "language_loss": 0.82291472, + "learning_rate": 3.5221939135638756e-05, + "loss": 0.83357948, + "num_input_tokens_seen": 380118048, + "router_z_loss_mlp": 0.23364258, + "step": 4592, + "time_per_iteration": 2.7355880737304688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064561, + "balance_loss_mlp": 1.04203045, + "epoch": 0.8836090804155444, + "flos": 609316328448.0, + "grad_norm": 0.07360243173205684, + "language_loss": 0.82330287, + "learning_rate": 3.510716974532352e-05, + "loss": 0.83394849, + "num_input_tokens_seen": 380192416, + "router_z_loss_mlp": 0.2253418, + "step": 4593, + "time_per_iteration": 2.7873008251190186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068103, + "balance_loss_mlp": 1.04457128, + "epoch": 0.883801462100808, + "flos": 557065963008.0, + "grad_norm": 0.06221990511497487, + "language_loss": 0.80560136, + "learning_rate": 3.4992580842411745e-05, + "loss": 0.81628239, + "num_input_tokens_seen": 380264432, + "router_z_loss_mlp": 0.23510742, + "step": 4594, + "time_per_iteration": 2.729846477508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067102, + "balance_loss_mlp": 1.04311657, + "epoch": 0.8839938437860716, + "flos": 516188671488.0, + "grad_norm": 0.07560096457235571, + "language_loss": 0.77495778, + "learning_rate": 3.487817247139064e-05, + "loss": 0.7856288, + "num_input_tokens_seen": 380334192, + "router_z_loss_mlp": 0.23974609, + "step": 4595, + "time_per_iteration": 2.6014037132263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065125, + "balance_loss_mlp": 1.04190314, + "epoch": 0.8841862254713351, + "flos": 713696292864.0, + "grad_norm": 0.08393319436175375, + "language_loss": 0.79193687, + "learning_rate": 3.47639446766777e-05, + "loss": 0.80258811, + "num_input_tokens_seen": 380407504, + "router_z_loss_mlp": 0.23193359, + "step": 4596, + "time_per_iteration": 2.880234479904175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062087, + "balance_loss_mlp": 1.03944921, + "epoch": 0.8843786071565987, + "flos": 833975875584.0, + "grad_norm": 0.05899737308739052, + "language_loss": 0.82822627, + "learning_rate": 3.4649897502620095e-05, + "loss": 0.8388471, + "num_input_tokens_seen": 380486272, + "router_z_loss_mlp": 0.22607422, + "step": 4597, + "time_per_iteration": 3.0193350315093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065501, + "balance_loss_mlp": 1.04293513, + "epoch": 0.8845709888418622, + "flos": 656884505088.0, + "grad_norm": 0.051134227194142116, + "language_loss": 0.83159703, + "learning_rate": 3.453603099349462e-05, + "loss": 0.84225208, + "num_input_tokens_seen": 380568480, + "router_z_loss_mlp": 0.22570801, + "step": 4598, + "time_per_iteration": 2.904973030090332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065085, + "balance_loss_mlp": 1.04247046, + "epoch": 0.8847633705271258, + "flos": 523326666240.0, + "grad_norm": 0.0641863299399463, + "language_loss": 0.81212854, + "learning_rate": 3.442234519350823e-05, + "loss": 0.82277942, + "num_input_tokens_seen": 380643088, + "router_z_loss_mlp": 0.22607422, + "step": 4599, + "time_per_iteration": 2.7764077186584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065009, + "balance_loss_mlp": 1.0417037, + "epoch": 0.8849557522123894, + "flos": 548591035392.0, + "grad_norm": 0.05892360485500338, + "language_loss": 0.84439909, + "learning_rate": 3.430884014679786e-05, + "loss": 0.85504919, + "num_input_tokens_seen": 380714512, + "router_z_loss_mlp": 0.23303223, + "step": 4600, + "time_per_iteration": 2.655726671218872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069456, + "balance_loss_mlp": 1.04689002, + "epoch": 0.8851481338976529, + "flos": 622372128768.0, + "grad_norm": 0.05693067043210366, + "language_loss": 0.839571, + "learning_rate": 3.4195515897429974e-05, + "loss": 0.85026556, + "num_input_tokens_seen": 380789168, + "router_z_loss_mlp": 0.22558594, + "step": 4601, + "time_per_iteration": 2.778089761734009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067376, + "balance_loss_mlp": 1.04427338, + "epoch": 0.8853405155829165, + "flos": 444359374848.0, + "grad_norm": 0.07032793668671747, + "language_loss": 0.80884451, + "learning_rate": 3.408237248940088e-05, + "loss": 0.81951827, + "num_input_tokens_seen": 380856992, + "router_z_loss_mlp": 0.23083496, + "step": 4602, + "time_per_iteration": 2.5838189125061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061588, + "balance_loss_mlp": 1.0376389, + "epoch": 0.8855328972681801, + "flos": 730470680064.0, + "grad_norm": 0.05264310047515772, + "language_loss": 0.78328097, + "learning_rate": 3.396940996663683e-05, + "loss": 0.79389679, + "num_input_tokens_seen": 380930480, + "router_z_loss_mlp": 0.23950195, + "step": 4603, + "time_per_iteration": 2.917466163635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064302, + "balance_loss_mlp": 1.04041266, + "epoch": 0.8857252789534437, + "flos": 487376414208.0, + "grad_norm": 0.06259539503163569, + "language_loss": 0.79090303, + "learning_rate": 3.385662837299375e-05, + "loss": 0.8015461, + "num_input_tokens_seen": 380994192, + "router_z_loss_mlp": 0.23901367, + "step": 4604, + "time_per_iteration": 2.5593533515930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066302, + "balance_loss_mlp": 1.04194784, + "epoch": 0.8859176606387072, + "flos": 508556206080.0, + "grad_norm": 0.06681364763275595, + "language_loss": 0.817626, + "learning_rate": 3.374402775225727e-05, + "loss": 0.82828903, + "num_input_tokens_seen": 381066848, + "router_z_loss_mlp": 0.2434082, + "step": 4605, + "time_per_iteration": 2.7382400035858154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068054, + "balance_loss_mlp": 1.04427195, + "epoch": 0.8861100423239707, + "flos": 516628440576.0, + "grad_norm": 0.1197682640175093, + "language_loss": 0.85860205, + "learning_rate": 3.3631608148142925e-05, + "loss": 0.86928248, + "num_input_tokens_seen": 381138816, + "router_z_loss_mlp": 0.23791504, + "step": 4606, + "time_per_iteration": 2.738442897796631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066001, + "balance_loss_mlp": 1.04325604, + "epoch": 0.8863024240092343, + "flos": 626975396352.0, + "grad_norm": 0.0522441462901882, + "language_loss": 0.79691124, + "learning_rate": 3.3519369604295746e-05, + "loss": 0.80757129, + "num_input_tokens_seen": 381208448, + "router_z_loss_mlp": 0.22753906, + "step": 4607, + "time_per_iteration": 2.7272424697875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063318, + "balance_loss_mlp": 1.03960764, + "epoch": 0.8864948056944979, + "flos": 766910260224.0, + "grad_norm": 0.055573824499614524, + "language_loss": 0.83738887, + "learning_rate": 3.340731216429083e-05, + "loss": 0.8480221, + "num_input_tokens_seen": 381289712, + "router_z_loss_mlp": 0.23681641, + "step": 4608, + "time_per_iteration": 2.9728434085845947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01002821, + "balance_loss_mlp": 0.99604982, + "epoch": 0.8866871873797615, + "flos": 1502331452928.0, + "grad_norm": 0.0055616018334323225, + "language_loss": 0.78830957, + "learning_rate": 3.329543587163253e-05, + "loss": 0.79833776, + "num_input_tokens_seen": 381520848, + "router_z_loss_mlp": 0.06787109, + "step": 4609, + "time_per_iteration": 4.811368942260742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063655, + "balance_loss_mlp": 1.04073143, + "epoch": 0.886879569065025, + "flos": 811516050432.0, + "grad_norm": 0.05668171590793676, + "language_loss": 0.81769073, + "learning_rate": 3.3183740769755e-05, + "loss": 0.8283273, + "num_input_tokens_seen": 381603008, + "router_z_loss_mlp": 0.22937012, + "step": 4610, + "time_per_iteration": 3.0735855102539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01002743, + "balance_loss_mlp": 0.99597168, + "epoch": 0.8870719507502886, + "flos": 1582838309376.0, + "grad_norm": 0.005567731821103193, + "language_loss": 0.7691083, + "learning_rate": 3.307222690202238e-05, + "loss": 0.7791357, + "num_input_tokens_seen": 381844336, + "router_z_loss_mlp": 0.06787109, + "step": 4611, + "time_per_iteration": 4.918694734573364 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064613, + "balance_loss_mlp": 1.0412122, + "epoch": 0.8872643324355521, + "flos": 634027129344.0, + "grad_norm": 0.06376085793205072, + "language_loss": 0.75125146, + "learning_rate": 3.296089431172811e-05, + "loss": 0.76189762, + "num_input_tokens_seen": 381918576, + "router_z_loss_mlp": 0.23400879, + "step": 4612, + "time_per_iteration": 2.755192995071411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067352, + "balance_loss_mlp": 1.04434419, + "epoch": 0.8874567141208157, + "flos": 535755119616.0, + "grad_norm": 0.06524181347271532, + "language_loss": 0.82796997, + "learning_rate": 3.284974304209532e-05, + "loss": 0.83864343, + "num_input_tokens_seen": 381987296, + "router_z_loss_mlp": 0.2298584, + "step": 4613, + "time_per_iteration": 2.6614327430725098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107072, + "balance_loss_mlp": 1.04793918, + "epoch": 0.8876490958060793, + "flos": 1566302552064.0, + "grad_norm": 0.05648931496157644, + "language_loss": 0.79739761, + "learning_rate": 3.27387731362766e-05, + "loss": 0.80810487, + "num_input_tokens_seen": 382091744, + "router_z_loss_mlp": 0.22766113, + "step": 4614, + "time_per_iteration": 3.8716471195220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106887, + "balance_loss_mlp": 1.04496849, + "epoch": 0.8878414774913428, + "flos": 636633810432.0, + "grad_norm": 0.057359413666851676, + "language_loss": 0.85207993, + "learning_rate": 3.2627984637354444e-05, + "loss": 0.86276865, + "num_input_tokens_seen": 382169600, + "router_z_loss_mlp": 0.2388916, + "step": 4615, + "time_per_iteration": 2.82585072517395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071116, + "balance_loss_mlp": 1.04854965, + "epoch": 0.8880338591766064, + "flos": 496429502976.0, + "grad_norm": 0.06508067995275697, + "language_loss": 0.81528175, + "learning_rate": 3.251737758834084e-05, + "loss": 0.82599294, + "num_input_tokens_seen": 382238336, + "router_z_loss_mlp": 0.22558594, + "step": 4616, + "time_per_iteration": 2.6278040409088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010645, + "balance_loss_mlp": 1.04114652, + "epoch": 0.88822624086187, + "flos": 542861180928.0, + "grad_norm": 0.05861112431241466, + "language_loss": 0.7983259, + "learning_rate": 3.2406952032177086e-05, + "loss": 0.80897093, + "num_input_tokens_seen": 382308560, + "router_z_loss_mlp": 0.23352051, + "step": 4617, + "time_per_iteration": 2.6568470001220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067803, + "balance_loss_mlp": 1.0444026, + "epoch": 0.8884186225471336, + "flos": 551822865408.0, + "grad_norm": 0.07539491877952685, + "language_loss": 0.84058613, + "learning_rate": 3.229670801173418e-05, + "loss": 0.85126418, + "num_input_tokens_seen": 382377504, + "router_z_loss_mlp": 0.23413086, + "step": 4618, + "time_per_iteration": 2.6135919094085693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003042, + "balance_loss_mlp": 0.99631864, + "epoch": 0.888611004232397, + "flos": 1565263305216.0, + "grad_norm": 0.004685969787491016, + "language_loss": 0.78512192, + "learning_rate": 3.218664556981288e-05, + "loss": 0.79515243, + "num_input_tokens_seen": 382615728, + "router_z_loss_mlp": 0.06738281, + "step": 4619, + "time_per_iteration": 4.9886579513549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070198, + "balance_loss_mlp": 1.04795337, + "epoch": 0.8888033859176606, + "flos": 767028828672.0, + "grad_norm": 0.052062483272785565, + "language_loss": 0.82744682, + "learning_rate": 3.207676474914301e-05, + "loss": 0.83814883, + "num_input_tokens_seen": 382695552, + "router_z_loss_mlp": 0.22241211, + "step": 4620, + "time_per_iteration": 2.994854211807251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062487, + "balance_loss_mlp": 1.03959835, + "epoch": 0.8889957676029242, + "flos": 934110849024.0, + "grad_norm": 0.05371563368015874, + "language_loss": 0.84353495, + "learning_rate": 3.1967065592384105e-05, + "loss": 0.85415977, + "num_input_tokens_seen": 382775824, + "router_z_loss_mlp": 0.22875977, + "step": 4621, + "time_per_iteration": 3.1302571296691895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066087, + "balance_loss_mlp": 1.04323435, + "epoch": 0.8891881492881878, + "flos": 589611488256.0, + "grad_norm": 0.059931764934333366, + "language_loss": 0.81823874, + "learning_rate": 3.1857548142125104e-05, + "loss": 0.82889962, + "num_input_tokens_seen": 382854464, + "router_z_loss_mlp": 0.22851562, + "step": 4622, + "time_per_iteration": 2.8172430992126465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065673, + "balance_loss_mlp": 1.04206991, + "epoch": 0.8893805309734514, + "flos": 540718861824.0, + "grad_norm": 0.06080004343800978, + "language_loss": 0.82491469, + "learning_rate": 3.174821244088466e-05, + "loss": 0.83557141, + "num_input_tokens_seen": 382925088, + "router_z_loss_mlp": 0.23608398, + "step": 4623, + "time_per_iteration": 2.7272207736968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066977, + "balance_loss_mlp": 1.04382658, + "epoch": 0.8895729126587149, + "flos": 560095160832.0, + "grad_norm": 0.0915201090359006, + "language_loss": 0.81910133, + "learning_rate": 3.163905853111054e-05, + "loss": 0.82977104, + "num_input_tokens_seen": 382998640, + "router_z_loss_mlp": 0.23144531, + "step": 4624, + "time_per_iteration": 2.680405855178833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065926, + "balance_loss_mlp": 1.042799, + "epoch": 0.8897652943439784, + "flos": 610154021376.0, + "grad_norm": 0.05519265472563439, + "language_loss": 0.81301922, + "learning_rate": 3.153008645517996e-05, + "loss": 0.82367849, + "num_input_tokens_seen": 383076000, + "router_z_loss_mlp": 0.23132324, + "step": 4625, + "time_per_iteration": 2.7328474521636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061825, + "balance_loss_mlp": 1.0387218, + "epoch": 0.889957676029242, + "flos": 917847811584.0, + "grad_norm": 0.06343567074213599, + "language_loss": 0.77644289, + "learning_rate": 3.142129625539969e-05, + "loss": 0.7870611, + "num_input_tokens_seen": 383166640, + "router_z_loss_mlp": 0.23095703, + "step": 4626, + "time_per_iteration": 3.2025320529937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061503, + "balance_loss_mlp": 1.0387342, + "epoch": 0.8901500577145056, + "flos": 488698292736.0, + "grad_norm": 0.05768505937454957, + "language_loss": 0.8051129, + "learning_rate": 3.131268797400588e-05, + "loss": 0.81572795, + "num_input_tokens_seen": 383232928, + "router_z_loss_mlp": 0.22753906, + "step": 4627, + "time_per_iteration": 2.544037342071533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066103, + "balance_loss_mlp": 1.04235685, + "epoch": 0.8903424393997691, + "flos": 733648181760.0, + "grad_norm": 0.058375027413353715, + "language_loss": 0.81123602, + "learning_rate": 3.120426165316398e-05, + "loss": 0.82189703, + "num_input_tokens_seen": 383314352, + "router_z_loss_mlp": 0.23754883, + "step": 4628, + "time_per_iteration": 2.983144760131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063743, + "balance_loss_mlp": 1.04085517, + "epoch": 0.8905348210850327, + "flos": 519813282816.0, + "grad_norm": 0.052566264576619524, + "language_loss": 0.82051194, + "learning_rate": 3.109601733496881e-05, + "loss": 0.83114934, + "num_input_tokens_seen": 383384848, + "router_z_loss_mlp": 0.22875977, + "step": 4629, + "time_per_iteration": 2.6456775665283203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064735, + "balance_loss_mlp": 1.04144096, + "epoch": 0.8907272027702963, + "flos": 578976989184.0, + "grad_norm": 0.05590754047584808, + "language_loss": 0.7958827, + "learning_rate": 3.098795506144458e-05, + "loss": 0.80653006, + "num_input_tokens_seen": 383463360, + "router_z_loss_mlp": 0.23278809, + "step": 4630, + "time_per_iteration": 2.833200454711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106493, + "balance_loss_mlp": 1.04224443, + "epoch": 0.8909195844555599, + "flos": 893628910080.0, + "grad_norm": 0.07302437045654908, + "language_loss": 0.79246753, + "learning_rate": 3.088007487454475e-05, + "loss": 0.80311686, + "num_input_tokens_seen": 383542080, + "router_z_loss_mlp": 0.22668457, + "step": 4631, + "time_per_iteration": 3.1019132137298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065408, + "balance_loss_mlp": 1.0416975, + "epoch": 0.8911119661408234, + "flos": 549865926144.0, + "grad_norm": 0.05634010359130886, + "language_loss": 0.84410584, + "learning_rate": 3.077237681615208e-05, + "loss": 0.85475999, + "num_input_tokens_seen": 383613056, + "router_z_loss_mlp": 0.23681641, + "step": 4632, + "time_per_iteration": 2.674358367919922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069142, + "balance_loss_mlp": 1.04496682, + "epoch": 0.8913043478260869, + "flos": 481139979264.0, + "grad_norm": 0.06730907628269114, + "language_loss": 0.83857995, + "learning_rate": 3.066486092807874e-05, + "loss": 0.84927142, + "num_input_tokens_seen": 383683280, + "router_z_loss_mlp": 0.24182129, + "step": 4633, + "time_per_iteration": 2.632319927215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106984, + "balance_loss_mlp": 1.04641593, + "epoch": 0.8914967295113505, + "flos": 484581782016.0, + "grad_norm": 0.052289564951872355, + "language_loss": 0.85386705, + "learning_rate": 3.055752725206601e-05, + "loss": 0.86456549, + "num_input_tokens_seen": 383754624, + "router_z_loss_mlp": 0.23425293, + "step": 4634, + "time_per_iteration": 2.6371240615844727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065464, + "balance_loss_mlp": 1.04192066, + "epoch": 0.8916891111966141, + "flos": 445664001024.0, + "grad_norm": 0.0695716528121872, + "language_loss": 0.8163662, + "learning_rate": 3.0450375829784714e-05, + "loss": 0.82702088, + "num_input_tokens_seen": 383821984, + "router_z_loss_mlp": 0.23535156, + "step": 4635, + "time_per_iteration": 2.5570900440216064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062296, + "balance_loss_mlp": 1.03908598, + "epoch": 0.8918814928818777, + "flos": 564016379904.0, + "grad_norm": 0.05391232226326787, + "language_loss": 0.78488344, + "learning_rate": 3.034340670283453e-05, + "loss": 0.79550636, + "num_input_tokens_seen": 383890880, + "router_z_loss_mlp": 0.23217773, + "step": 4636, + "time_per_iteration": 2.738752603530884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069592, + "balance_loss_mlp": 1.04635811, + "epoch": 0.8920738745671412, + "flos": 575943022080.0, + "grad_norm": 0.06251637060913222, + "language_loss": 0.81325352, + "learning_rate": 3.0236619912744513e-05, + "loss": 0.82394946, + "num_input_tokens_seen": 383962480, + "router_z_loss_mlp": 0.23242188, + "step": 4637, + "time_per_iteration": 2.644911050796509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066751, + "balance_loss_mlp": 1.04454279, + "epoch": 0.8922662562524047, + "flos": 620180623872.0, + "grad_norm": 0.0534185422238087, + "language_loss": 0.83658934, + "learning_rate": 3.0130015500973163e-05, + "loss": 0.8472569, + "num_input_tokens_seen": 384033616, + "router_z_loss_mlp": 0.2220459, + "step": 4638, + "time_per_iteration": 2.768505573272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107185, + "balance_loss_mlp": 1.04829395, + "epoch": 0.8924586379376683, + "flos": 583624673280.0, + "grad_norm": 0.05706830171658043, + "language_loss": 0.79804647, + "learning_rate": 3.0023593508907877e-05, + "loss": 0.80876493, + "num_input_tokens_seen": 384108848, + "router_z_loss_mlp": 0.23522949, + "step": 4639, + "time_per_iteration": 2.731898546218872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067488, + "balance_loss_mlp": 1.04464698, + "epoch": 0.8926510196229319, + "flos": 525177520128.0, + "grad_norm": 0.05117641357239232, + "language_loss": 0.81824827, + "learning_rate": 2.991735397786538e-05, + "loss": 0.82892317, + "num_input_tokens_seen": 384185728, + "router_z_loss_mlp": 0.22839355, + "step": 4640, + "time_per_iteration": 2.7508621215820312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067855, + "balance_loss_mlp": 1.04493129, + "epoch": 0.8928434013081955, + "flos": 486669772800.0, + "grad_norm": 0.063972320379488, + "language_loss": 0.80840433, + "learning_rate": 2.981129694909146e-05, + "loss": 0.81908286, + "num_input_tokens_seen": 384251552, + "router_z_loss_mlp": 0.22912598, + "step": 4641, + "time_per_iteration": 2.5344278812408447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005658, + "balance_loss_mlp": 0.99893486, + "epoch": 0.893035782993459, + "flos": 1448302560768.0, + "grad_norm": 0.003483753852343544, + "language_loss": 0.80330861, + "learning_rate": 2.970542246376118e-05, + "loss": 0.8133651, + "num_input_tokens_seen": 384472176, + "router_z_loss_mlp": 0.06738281, + "step": 4642, + "time_per_iteration": 4.7115797996521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066278, + "balance_loss_mlp": 1.04297304, + "epoch": 0.8932281646787226, + "flos": 611320255488.0, + "grad_norm": 0.06490125129473394, + "language_loss": 0.80942553, + "learning_rate": 2.95997305629786e-05, + "loss": 0.82008833, + "num_input_tokens_seen": 384544224, + "router_z_loss_mlp": 0.23278809, + "step": 4643, + "time_per_iteration": 2.776724100112915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064375, + "balance_loss_mlp": 1.04103351, + "epoch": 0.8934205463639862, + "flos": 565760775168.0, + "grad_norm": 0.057617911154696326, + "language_loss": 0.84784567, + "learning_rate": 2.9494221287776957e-05, + "loss": 0.85848939, + "num_input_tokens_seen": 384611728, + "router_z_loss_mlp": 0.23352051, + "step": 4644, + "time_per_iteration": 2.630427122116089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068617, + "balance_loss_mlp": 1.04492998, + "epoch": 0.8936129280492497, + "flos": 488431420416.0, + "grad_norm": 0.06553563989756882, + "language_loss": 0.78306258, + "learning_rate": 2.9388894679118484e-05, + "loss": 0.79374874, + "num_input_tokens_seen": 384678048, + "router_z_loss_mlp": 0.23681641, + "step": 4645, + "time_per_iteration": 2.6144680976867676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066216, + "balance_loss_mlp": 1.04387641, + "epoch": 0.8938053097345132, + "flos": 886490542080.0, + "grad_norm": 0.0528405024741521, + "language_loss": 0.80833423, + "learning_rate": 2.9283750777894912e-05, + "loss": 0.81899637, + "num_input_tokens_seen": 384766768, + "router_z_loss_mlp": 0.22338867, + "step": 4646, + "time_per_iteration": 3.2429933547973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068666, + "balance_loss_mlp": 1.04599285, + "epoch": 0.8939976914197768, + "flos": 593285658624.0, + "grad_norm": 0.05949361617339142, + "language_loss": 0.84359968, + "learning_rate": 2.9178789624926427e-05, + "loss": 0.85428637, + "num_input_tokens_seen": 384842352, + "router_z_loss_mlp": 0.22680664, + "step": 4647, + "time_per_iteration": 2.710167407989502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068246, + "balance_loss_mlp": 1.04401064, + "epoch": 0.8941900731050404, + "flos": 523247745024.0, + "grad_norm": 0.0659464848891027, + "language_loss": 0.81186658, + "learning_rate": 2.9074011260962706e-05, + "loss": 0.82254899, + "num_input_tokens_seen": 384912048, + "router_z_loss_mlp": 0.2421875, + "step": 4648, + "time_per_iteration": 2.651339292526245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064898, + "balance_loss_mlp": 1.04204559, + "epoch": 0.894382454790304, + "flos": 800582745600.0, + "grad_norm": 0.05875794324616096, + "language_loss": 0.81119418, + "learning_rate": 2.8969415726682158e-05, + "loss": 0.82184315, + "num_input_tokens_seen": 384986560, + "router_z_loss_mlp": 0.2286377, + "step": 4649, + "time_per_iteration": 3.044119358062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069765, + "balance_loss_mlp": 1.04760432, + "epoch": 0.8945748364755676, + "flos": 479037307392.0, + "grad_norm": 0.05520315549667398, + "language_loss": 0.85046721, + "learning_rate": 2.8865003062692517e-05, + "loss": 0.86116481, + "num_input_tokens_seen": 385057376, + "router_z_loss_mlp": 0.22180176, + "step": 4650, + "time_per_iteration": 2.6633942127227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070963, + "balance_loss_mlp": 1.04739583, + "epoch": 0.894767218160831, + "flos": 508776090624.0, + "grad_norm": 0.06916128271442132, + "language_loss": 0.83380258, + "learning_rate": 2.876077330953042e-05, + "loss": 0.84451222, + "num_input_tokens_seen": 385130880, + "router_z_loss_mlp": 0.23547363, + "step": 4651, + "time_per_iteration": 2.6715307235717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064391, + "balance_loss_mlp": 1.04138315, + "epoch": 0.8949595998460946, + "flos": 685857549312.0, + "grad_norm": 0.0591722650352258, + "language_loss": 0.82184136, + "learning_rate": 2.8656726507661378e-05, + "loss": 0.8324852, + "num_input_tokens_seen": 385205808, + "router_z_loss_mlp": 0.23010254, + "step": 4652, + "time_per_iteration": 2.8870623111724854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064747, + "balance_loss_mlp": 1.04148972, + "epoch": 0.8951519815313582, + "flos": 799920520704.0, + "grad_norm": 0.05600884450087048, + "language_loss": 0.77608907, + "learning_rate": 2.855286269747981e-05, + "loss": 0.78673655, + "num_input_tokens_seen": 385283616, + "router_z_loss_mlp": 0.23242188, + "step": 4653, + "time_per_iteration": 2.9963629245758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066501, + "balance_loss_mlp": 1.04324377, + "epoch": 0.8953443632166218, + "flos": 666740782080.0, + "grad_norm": 0.06357991469427139, + "language_loss": 0.86607301, + "learning_rate": 2.8449181919309398e-05, + "loss": 0.87673807, + "num_input_tokens_seen": 385357488, + "router_z_loss_mlp": 0.23254395, + "step": 4654, + "time_per_iteration": 2.7694337368011475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065408, + "balance_loss_mlp": 1.04244781, + "epoch": 0.8955367449018854, + "flos": 644977686528.0, + "grad_norm": 0.055922367337583156, + "language_loss": 0.83340573, + "learning_rate": 2.8345684213402556e-05, + "loss": 0.84405977, + "num_input_tokens_seen": 385431280, + "router_z_loss_mlp": 0.22961426, + "step": 4655, + "time_per_iteration": 2.8329148292541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065185, + "balance_loss_mlp": 1.04099762, + "epoch": 0.8957291265871489, + "flos": 808714077696.0, + "grad_norm": 0.05938034328868387, + "language_loss": 0.78094238, + "learning_rate": 2.8242369619940644e-05, + "loss": 0.79159427, + "num_input_tokens_seen": 385509840, + "router_z_loss_mlp": 0.24194336, + "step": 4656, + "time_per_iteration": 3.0359058380126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070668, + "balance_loss_mlp": 1.04611111, + "epoch": 0.8959215082724125, + "flos": 518923832832.0, + "grad_norm": 0.05192477429276855, + "language_loss": 0.77056348, + "learning_rate": 2.813923817903391e-05, + "loss": 0.78127015, + "num_input_tokens_seen": 385580384, + "router_z_loss_mlp": 0.24572754, + "step": 4657, + "time_per_iteration": 2.618851661682129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068155, + "balance_loss_mlp": 1.04416966, + "epoch": 0.896113889957676, + "flos": 476917383168.0, + "grad_norm": 0.058088302733293115, + "language_loss": 0.77451009, + "learning_rate": 2.8036289930721603e-05, + "loss": 0.78519166, + "num_input_tokens_seen": 385649184, + "router_z_loss_mlp": 0.23950195, + "step": 4658, + "time_per_iteration": 2.6347408294677734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065334, + "balance_loss_mlp": 1.04195762, + "epoch": 0.8963062716429396, + "flos": 518162863104.0, + "grad_norm": 0.05167614714836421, + "language_loss": 0.83363634, + "learning_rate": 2.7933524914971697e-05, + "loss": 0.84428966, + "num_input_tokens_seen": 385717072, + "router_z_loss_mlp": 0.23352051, + "step": 4659, + "time_per_iteration": 2.6101901531219482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069072, + "balance_loss_mlp": 1.04588568, + "epoch": 0.8964986533282031, + "flos": 508484625408.0, + "grad_norm": 0.061239712200733556, + "language_loss": 0.81603789, + "learning_rate": 2.7830943171681113e-05, + "loss": 0.82672858, + "num_input_tokens_seen": 385788880, + "router_z_loss_mlp": 0.23181152, + "step": 4660, + "time_per_iteration": 2.6643130779266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067848, + "balance_loss_mlp": 1.0449475, + "epoch": 0.8966910350134667, + "flos": 536076320256.0, + "grad_norm": 0.07910275074584235, + "language_loss": 0.81423545, + "learning_rate": 2.77285447406756e-05, + "loss": 0.82491398, + "num_input_tokens_seen": 385854240, + "router_z_loss_mlp": 0.22900391, + "step": 4661, + "time_per_iteration": 2.621864080429077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106623, + "balance_loss_mlp": 1.04335344, + "epoch": 0.8968834166987303, + "flos": 723226226688.0, + "grad_norm": 0.058143333452195634, + "language_loss": 0.84462941, + "learning_rate": 2.7626329661709914e-05, + "loss": 0.85529172, + "num_input_tokens_seen": 385926080, + "router_z_loss_mlp": 0.2286377, + "step": 4662, + "time_per_iteration": 2.8383162021636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106546, + "balance_loss_mlp": 1.04339433, + "epoch": 0.8970757983839939, + "flos": 681686710272.0, + "grad_norm": 0.05513580414518432, + "language_loss": 0.84041762, + "learning_rate": 2.7524297974467372e-05, + "loss": 0.85107225, + "num_input_tokens_seen": 386005696, + "router_z_loss_mlp": 0.22070312, + "step": 4663, + "time_per_iteration": 2.9471535682678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067793, + "balance_loss_mlp": 1.04407024, + "epoch": 0.8972681800692575, + "flos": 613037486592.0, + "grad_norm": 0.08773189302167883, + "language_loss": 0.76343608, + "learning_rate": 2.742244971856006e-05, + "loss": 0.77411401, + "num_input_tokens_seen": 386073248, + "router_z_loss_mlp": 0.23706055, + "step": 4664, + "time_per_iteration": 2.7563586235046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064091, + "balance_loss_mlp": 1.04091692, + "epoch": 0.8974605617545209, + "flos": 572350344192.0, + "grad_norm": 0.05238524787206957, + "language_loss": 0.83254212, + "learning_rate": 2.732078493352913e-05, + "loss": 0.84318304, + "num_input_tokens_seen": 386148528, + "router_z_loss_mlp": 0.23168945, + "step": 4665, + "time_per_iteration": 2.716057062149048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066457, + "balance_loss_mlp": 1.04294896, + "epoch": 0.8976529434397845, + "flos": 520418608128.0, + "grad_norm": 0.06037902088521838, + "language_loss": 0.8723467, + "learning_rate": 2.721930365884434e-05, + "loss": 0.88301122, + "num_input_tokens_seen": 386218528, + "router_z_loss_mlp": 0.23486328, + "step": 4666, + "time_per_iteration": 2.645430564880371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068457, + "balance_loss_mlp": 1.04494941, + "epoch": 0.8978453251250481, + "flos": 471355656192.0, + "grad_norm": 0.05147791161429302, + "language_loss": 0.82619303, + "learning_rate": 2.7118005933904176e-05, + "loss": 0.83687758, + "num_input_tokens_seen": 386284704, + "router_z_loss_mlp": 0.23498535, + "step": 4667, + "time_per_iteration": 2.6113970279693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068602, + "balance_loss_mlp": 1.04610741, + "epoch": 0.8980377068103117, + "flos": 591659831808.0, + "grad_norm": 0.06407487894288708, + "language_loss": 0.82130563, + "learning_rate": 2.7016891798035904e-05, + "loss": 0.83199167, + "num_input_tokens_seen": 386356128, + "router_z_loss_mlp": 0.22509766, + "step": 4668, + "time_per_iteration": 2.775261640548706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010737, + "balance_loss_mlp": 1.05051422, + "epoch": 0.8982300884955752, + "flos": 767619472896.0, + "grad_norm": 0.06672049349716365, + "language_loss": 0.82899266, + "learning_rate": 2.691596129049556e-05, + "loss": 0.83972967, + "num_input_tokens_seen": 386434048, + "router_z_loss_mlp": 0.23181152, + "step": 4669, + "time_per_iteration": 2.9366719722747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065808, + "balance_loss_mlp": 1.04299164, + "epoch": 0.8984224701808388, + "flos": 844575496704.0, + "grad_norm": 0.05547032389255281, + "language_loss": 0.77663457, + "learning_rate": 2.681521445046775e-05, + "loss": 0.78729266, + "num_input_tokens_seen": 386532384, + "router_z_loss_mlp": 0.22814941, + "step": 4670, + "time_per_iteration": 3.2214982509613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070125, + "balance_loss_mlp": 1.04733181, + "epoch": 0.8986148518661023, + "flos": 757661879808.0, + "grad_norm": 0.08084451627969469, + "language_loss": 0.76220548, + "learning_rate": 2.6714651317065963e-05, + "loss": 0.77290672, + "num_input_tokens_seen": 386627120, + "router_z_loss_mlp": 0.2277832, + "step": 4671, + "time_per_iteration": 3.152304172515869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066997, + "balance_loss_mlp": 1.04413223, + "epoch": 0.8988072335513659, + "flos": 563070030336.0, + "grad_norm": 0.08463920204686945, + "language_loss": 0.77184796, + "learning_rate": 2.6614271929332133e-05, + "loss": 0.78251791, + "num_input_tokens_seen": 386700192, + "router_z_loss_mlp": 0.2286377, + "step": 4672, + "time_per_iteration": 2.7046709060668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067256, + "balance_loss_mlp": 1.04339075, + "epoch": 0.8989996152366295, + "flos": 492683751936.0, + "grad_norm": 0.06216654678509643, + "language_loss": 0.86926317, + "learning_rate": 2.6514076326237147e-05, + "loss": 0.87993574, + "num_input_tokens_seen": 386764256, + "router_z_loss_mlp": 0.23852539, + "step": 4673, + "time_per_iteration": 2.5472865104675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067016, + "balance_loss_mlp": 1.04331684, + "epoch": 0.899191996921893, + "flos": 542567144448.0, + "grad_norm": 0.07568061598411091, + "language_loss": 0.76223564, + "learning_rate": 2.6414064546680438e-05, + "loss": 0.77290577, + "num_input_tokens_seen": 386835792, + "router_z_loss_mlp": 0.23681641, + "step": 4674, + "time_per_iteration": 2.648810863494873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066639, + "balance_loss_mlp": 1.04321444, + "epoch": 0.8993843786071566, + "flos": 471325920768.0, + "grad_norm": 0.07030334543630777, + "language_loss": 0.80331755, + "learning_rate": 2.631423662948984e-05, + "loss": 0.81398392, + "num_input_tokens_seen": 386904368, + "router_z_loss_mlp": 0.23425293, + "step": 4675, + "time_per_iteration": 2.601430892944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107006, + "balance_loss_mlp": 1.04677892, + "epoch": 0.8995767602924202, + "flos": 526726623744.0, + "grad_norm": 0.0574995640777522, + "language_loss": 0.82629097, + "learning_rate": 2.621459261342196e-05, + "loss": 0.83699161, + "num_input_tokens_seen": 386977872, + "router_z_loss_mlp": 0.23278809, + "step": 4676, + "time_per_iteration": 2.780759811401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066328, + "balance_loss_mlp": 1.04313004, + "epoch": 0.8997691419776838, + "flos": 557634212352.0, + "grad_norm": 0.0657953043961064, + "language_loss": 0.84578764, + "learning_rate": 2.6115132537162245e-05, + "loss": 0.85645092, + "num_input_tokens_seen": 387052080, + "router_z_loss_mlp": 0.23181152, + "step": 4677, + "time_per_iteration": 2.6718125343322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072385, + "balance_loss_mlp": 1.04937708, + "epoch": 0.8999615236629472, + "flos": 639027947520.0, + "grad_norm": 0.05825009519251717, + "language_loss": 0.80745816, + "learning_rate": 2.601585643932436e-05, + "loss": 0.81818199, + "num_input_tokens_seen": 387129712, + "router_z_loss_mlp": 0.22998047, + "step": 4678, + "time_per_iteration": 2.8580267429351807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003846, + "balance_loss_mlp": 0.99717021, + "epoch": 0.9001539053482108, + "flos": 1431510547968.0, + "grad_norm": 0.007707112448869433, + "language_loss": 0.85784018, + "learning_rate": 2.5916764358450862e-05, + "loss": 0.86787868, + "num_input_tokens_seen": 387356560, + "router_z_loss_mlp": 0.06689453, + "step": 4679, + "time_per_iteration": 4.804095029830933 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106824, + "balance_loss_mlp": 1.0444932, + "epoch": 0.9003462870334744, + "flos": 566877450240.0, + "grad_norm": 0.06250432124299872, + "language_loss": 0.80040658, + "learning_rate": 2.5817856333012425e-05, + "loss": 0.81108892, + "num_input_tokens_seen": 387438640, + "router_z_loss_mlp": 0.23730469, + "step": 4680, + "time_per_iteration": 2.858875274658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065394, + "balance_loss_mlp": 1.04134905, + "epoch": 0.900538668718738, + "flos": 538655837184.0, + "grad_norm": 0.05470600652394018, + "language_loss": 0.78754449, + "learning_rate": 2.5719132401408883e-05, + "loss": 0.79819846, + "num_input_tokens_seen": 387507088, + "router_z_loss_mlp": 0.2401123, + "step": 4681, + "time_per_iteration": 2.6454641819000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066228, + "balance_loss_mlp": 1.04298246, + "epoch": 0.9007310504040016, + "flos": 488387003904.0, + "grad_norm": 0.0857407737002147, + "language_loss": 0.86202192, + "learning_rate": 2.5620592601968028e-05, + "loss": 0.87268418, + "num_input_tokens_seen": 387574160, + "router_z_loss_mlp": 0.2322998, + "step": 4682, + "time_per_iteration": 2.5687954425811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064287, + "balance_loss_mlp": 1.04095745, + "epoch": 0.9009234320892651, + "flos": 652901617152.0, + "grad_norm": 0.06514084212539079, + "language_loss": 0.78763062, + "learning_rate": 2.5522236972946532e-05, + "loss": 0.79827344, + "num_input_tokens_seen": 387652528, + "router_z_loss_mlp": 0.2331543, + "step": 4683, + "time_per_iteration": 2.872545003890991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064518, + "balance_loss_mlp": 1.04105759, + "epoch": 0.9011158137745287, + "flos": 545569178112.0, + "grad_norm": 0.13335470747137296, + "language_loss": 0.85694379, + "learning_rate": 2.5424065552529295e-05, + "loss": 0.867589, + "num_input_tokens_seen": 387723520, + "router_z_loss_mlp": 0.234375, + "step": 4684, + "time_per_iteration": 2.694607734680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062379, + "balance_loss_mlp": 1.03976488, + "epoch": 0.9013081954597922, + "flos": 559699808256.0, + "grad_norm": 0.061629195170929164, + "language_loss": 0.82466996, + "learning_rate": 2.532607837883011e-05, + "loss": 0.83529371, + "num_input_tokens_seen": 387793664, + "router_z_loss_mlp": 0.22607422, + "step": 4685, + "time_per_iteration": 2.664644956588745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106963, + "balance_loss_mlp": 1.04603863, + "epoch": 0.9015005771450558, + "flos": 728652132864.0, + "grad_norm": 0.05255805412732331, + "language_loss": 0.80887723, + "learning_rate": 2.5228275489890706e-05, + "loss": 0.81957352, + "num_input_tokens_seen": 387871008, + "router_z_loss_mlp": 0.23583984, + "step": 4686, + "time_per_iteration": 2.9304230213165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067801, + "balance_loss_mlp": 1.04424536, + "epoch": 0.9016929588303193, + "flos": 517416574464.0, + "grad_norm": 0.061119646274193154, + "language_loss": 0.81590062, + "learning_rate": 2.5130656923681605e-05, + "loss": 0.82657862, + "num_input_tokens_seen": 387950832, + "router_z_loss_mlp": 0.23535156, + "step": 4687, + "time_per_iteration": 2.7988228797912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065289, + "balance_loss_mlp": 1.04235315, + "epoch": 0.9018853405155829, + "flos": 622335052800.0, + "grad_norm": 0.05484090421753951, + "language_loss": 0.86004657, + "learning_rate": 2.503322271810171e-05, + "loss": 0.87069947, + "num_input_tokens_seen": 388029792, + "router_z_loss_mlp": 0.22937012, + "step": 4688, + "time_per_iteration": 2.86004638671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066048, + "balance_loss_mlp": 1.04178929, + "epoch": 0.9020777222008465, + "flos": 523284820992.0, + "grad_norm": 0.06490737682713338, + "language_loss": 0.78058898, + "learning_rate": 2.4935972910978378e-05, + "loss": 0.79124951, + "num_input_tokens_seen": 388095872, + "router_z_loss_mlp": 0.24255371, + "step": 4689, + "time_per_iteration": 2.6526527404785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061292, + "balance_loss_mlp": 1.03895235, + "epoch": 0.9022701038861101, + "flos": 633713269248.0, + "grad_norm": 0.049701215658093066, + "language_loss": 0.82043481, + "learning_rate": 2.4838907540067346e-05, + "loss": 0.83104765, + "num_input_tokens_seen": 388171632, + "router_z_loss_mlp": 0.22351074, + "step": 4690, + "time_per_iteration": 2.844237804412842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061716, + "balance_loss_mlp": 1.03930461, + "epoch": 0.9024624855713737, + "flos": 513295294464.0, + "grad_norm": 0.056724167517837494, + "language_loss": 0.84453869, + "learning_rate": 2.474202664305253e-05, + "loss": 0.85515589, + "num_input_tokens_seen": 388242240, + "router_z_loss_mlp": 0.22412109, + "step": 4691, + "time_per_iteration": 2.643986940383911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061795, + "balance_loss_mlp": 1.03883481, + "epoch": 0.9026548672566371, + "flos": 477411480576.0, + "grad_norm": 0.05847332764386979, + "language_loss": 0.86467588, + "learning_rate": 2.464533025754673e-05, + "loss": 0.87529379, + "num_input_tokens_seen": 388310960, + "router_z_loss_mlp": 0.22961426, + "step": 4692, + "time_per_iteration": 2.6611626148223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064053, + "balance_loss_mlp": 1.04028225, + "epoch": 0.9028472489419007, + "flos": 661994353152.0, + "grad_norm": 0.05749458079736531, + "language_loss": 0.73733985, + "learning_rate": 2.454881842109058e-05, + "loss": 0.74798036, + "num_input_tokens_seen": 388387280, + "router_z_loss_mlp": 0.23779297, + "step": 4693, + "time_per_iteration": 2.8610541820526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062773, + "balance_loss_mlp": 1.03946733, + "epoch": 0.9030396306271643, + "flos": 534588885504.0, + "grad_norm": 0.05856701308084892, + "language_loss": 0.82619125, + "learning_rate": 2.4452491171153445e-05, + "loss": 0.83681893, + "num_input_tokens_seen": 388456992, + "router_z_loss_mlp": 0.23291016, + "step": 4694, + "time_per_iteration": 2.6105546951293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063992, + "balance_loss_mlp": 1.04143786, + "epoch": 0.9032320123124279, + "flos": 801032426496.0, + "grad_norm": 0.054990789226307006, + "language_loss": 0.82393968, + "learning_rate": 2.43563485451328e-05, + "loss": 0.83457965, + "num_input_tokens_seen": 388534896, + "router_z_loss_mlp": 0.22546387, + "step": 4695, + "time_per_iteration": 2.9821743965148926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064335, + "balance_loss_mlp": 1.04110062, + "epoch": 0.9034243939976914, + "flos": 553942789632.0, + "grad_norm": 0.07759884648218378, + "language_loss": 0.76665241, + "learning_rate": 2.426039058035451e-05, + "loss": 0.77729577, + "num_input_tokens_seen": 388606640, + "router_z_loss_mlp": 0.23205566, + "step": 4696, + "time_per_iteration": 2.6665611267089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061692, + "balance_loss_mlp": 1.03874457, + "epoch": 0.903616775682955, + "flos": 503903752704.0, + "grad_norm": 0.10162314375080486, + "language_loss": 0.82691509, + "learning_rate": 2.4164617314072823e-05, + "loss": 0.83753198, + "num_input_tokens_seen": 388675920, + "router_z_loss_mlp": 0.22949219, + "step": 4697, + "time_per_iteration": 2.584268093109131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063613, + "balance_loss_mlp": 1.04124892, + "epoch": 0.9038091573682185, + "flos": 436297052160.0, + "grad_norm": 0.0664097832888091, + "language_loss": 0.78661013, + "learning_rate": 2.406902878347017e-05, + "loss": 0.79724634, + "num_input_tokens_seen": 388743968, + "router_z_loss_mlp": 0.22387695, + "step": 4698, + "time_per_iteration": 2.604700803756714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063555, + "balance_loss_mlp": 1.03962958, + "epoch": 0.9040015390534821, + "flos": 532916070912.0, + "grad_norm": 0.07742814353956143, + "language_loss": 0.81602848, + "learning_rate": 2.3973625025657253e-05, + "loss": 0.82666409, + "num_input_tokens_seen": 388810784, + "router_z_loss_mlp": 0.23901367, + "step": 4699, + "time_per_iteration": 2.691525936126709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064709, + "balance_loss_mlp": 1.04179668, + "epoch": 0.9041939207387457, + "flos": 564307845120.0, + "grad_norm": 0.0614656758033835, + "language_loss": 0.80404103, + "learning_rate": 2.3878406077673275e-05, + "loss": 0.81468809, + "num_input_tokens_seen": 388885072, + "router_z_loss_mlp": 0.22900391, + "step": 4700, + "time_per_iteration": 2.757578134536743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061234, + "balance_loss_mlp": 1.03814268, + "epoch": 0.9043863024240092, + "flos": 515509194240.0, + "grad_norm": 0.07278260404802209, + "language_loss": 0.77661544, + "learning_rate": 2.3783371976485447e-05, + "loss": 0.78722775, + "num_input_tokens_seen": 388951184, + "router_z_loss_mlp": 0.23071289, + "step": 4701, + "time_per_iteration": 2.5736379623413086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01002038, + "balance_loss_mlp": 0.99531454, + "epoch": 0.9045786841092728, + "flos": 1277949063168.0, + "grad_norm": 0.011886728368769089, + "language_loss": 0.72929788, + "learning_rate": 2.368852275898914e-05, + "loss": 0.73931825, + "num_input_tokens_seen": 389170752, + "router_z_loss_mlp": 0.06738281, + "step": 4702, + "time_per_iteration": 4.96384072303772 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066552, + "balance_loss_mlp": 1.04281723, + "epoch": 0.9047710657945364, + "flos": 585841144320.0, + "grad_norm": 0.05895236346667512, + "language_loss": 0.83045083, + "learning_rate": 2.3593858462008178e-05, + "loss": 0.84111637, + "num_input_tokens_seen": 389239600, + "router_z_loss_mlp": 0.23742676, + "step": 4703, + "time_per_iteration": 2.681264877319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061673, + "balance_loss_mlp": 1.0387013, + "epoch": 0.9049634474798, + "flos": 571937739264.0, + "grad_norm": 0.07313912581124954, + "language_loss": 0.79802144, + "learning_rate": 2.3499379122294495e-05, + "loss": 0.80863822, + "num_input_tokens_seen": 389316032, + "router_z_loss_mlp": 0.22961426, + "step": 4704, + "time_per_iteration": 2.7477691173553467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064397, + "balance_loss_mlp": 1.0415324, + "epoch": 0.9051558291650635, + "flos": 572619787776.0, + "grad_norm": 0.08577028625924395, + "language_loss": 0.7446878, + "learning_rate": 2.3405084776528307e-05, + "loss": 0.75533175, + "num_input_tokens_seen": 389383504, + "router_z_loss_mlp": 0.22851562, + "step": 4705, + "time_per_iteration": 2.7316272258758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063321, + "balance_loss_mlp": 1.03981328, + "epoch": 0.905348210850327, + "flos": 540538624512.0, + "grad_norm": 0.07345375588943581, + "language_loss": 0.79496109, + "learning_rate": 2.331097546131783e-05, + "loss": 0.80559433, + "num_input_tokens_seen": 389454592, + "router_z_loss_mlp": 0.23486328, + "step": 4706, + "time_per_iteration": 2.670381784439087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061785, + "balance_loss_mlp": 1.03881335, + "epoch": 0.9055405925355906, + "flos": 516381391872.0, + "grad_norm": 0.07600321724106415, + "language_loss": 0.81767797, + "learning_rate": 2.321705121319956e-05, + "loss": 0.82829583, + "num_input_tokens_seen": 389519696, + "router_z_loss_mlp": 0.22961426, + "step": 4707, + "time_per_iteration": 2.613689661026001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059398, + "balance_loss_mlp": 1.03645039, + "epoch": 0.9057329742208542, + "flos": 914643145728.0, + "grad_norm": 0.05847502897319672, + "language_loss": 0.850618, + "learning_rate": 2.3123312068638104e-05, + "loss": 0.86121196, + "num_input_tokens_seen": 389603568, + "router_z_loss_mlp": 0.22949219, + "step": 4708, + "time_per_iteration": 3.24080228805542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062169, + "balance_loss_mlp": 1.03837466, + "epoch": 0.9059253559061178, + "flos": 905261515776.0, + "grad_norm": 0.08783492473540015, + "language_loss": 0.83353186, + "learning_rate": 2.3029758064026295e-05, + "loss": 0.84415358, + "num_input_tokens_seen": 389687504, + "router_z_loss_mlp": 0.23791504, + "step": 4709, + "time_per_iteration": 3.145754337310791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060875, + "balance_loss_mlp": 1.03717613, + "epoch": 0.9061177375913813, + "flos": 664534222848.0, + "grad_norm": 0.06837810531041678, + "language_loss": 0.77734303, + "learning_rate": 2.2936389235684918e-05, + "loss": 0.78795183, + "num_input_tokens_seen": 389764880, + "router_z_loss_mlp": 0.23706055, + "step": 4710, + "time_per_iteration": 2.8780710697174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065307, + "balance_loss_mlp": 1.04175103, + "epoch": 0.9063101192766448, + "flos": 565609900032.0, + "grad_norm": 0.07543882900367092, + "language_loss": 0.82841074, + "learning_rate": 2.2843205619862972e-05, + "loss": 0.83906382, + "num_input_tokens_seen": 389838304, + "router_z_loss_mlp": 0.23547363, + "step": 4711, + "time_per_iteration": 2.748779773712158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062112, + "balance_loss_mlp": 1.03939068, + "epoch": 0.9065025009619084, + "flos": 727377242112.0, + "grad_norm": 0.0645612601116662, + "language_loss": 0.79195869, + "learning_rate": 2.2750207252737742e-05, + "loss": 0.80257982, + "num_input_tokens_seen": 389908592, + "router_z_loss_mlp": 0.22729492, + "step": 4712, + "time_per_iteration": 2.8739047050476074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060893, + "balance_loss_mlp": 1.03833866, + "epoch": 0.906694882647172, + "flos": 531512699904.0, + "grad_norm": 0.06746017534167525, + "language_loss": 0.79950291, + "learning_rate": 2.265739417041418e-05, + "loss": 0.81011188, + "num_input_tokens_seen": 389979040, + "router_z_loss_mlp": 0.22558594, + "step": 4713, + "time_per_iteration": 2.659532308578491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060379, + "balance_loss_mlp": 1.03682303, + "epoch": 0.9068872643324356, + "flos": 429788975616.0, + "grad_norm": 0.08111291405332487, + "language_loss": 0.85047996, + "learning_rate": 2.2564766408925574e-05, + "loss": 0.86108375, + "num_input_tokens_seen": 390046080, + "router_z_loss_mlp": 0.23535156, + "step": 4714, + "time_per_iteration": 2.607717990875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066044, + "balance_loss_mlp": 1.04185677, + "epoch": 0.9070796460176991, + "flos": 588366332928.0, + "grad_norm": 0.06399071140262953, + "language_loss": 0.79739857, + "learning_rate": 2.2472324004233214e-05, + "loss": 0.80805904, + "num_input_tokens_seen": 390122176, + "router_z_loss_mlp": 0.24182129, + "step": 4715, + "time_per_iteration": 2.844972610473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062855, + "balance_loss_mlp": 1.03901279, + "epoch": 0.9072720277029627, + "flos": 571582033920.0, + "grad_norm": 0.06112565203304271, + "language_loss": 0.75674605, + "learning_rate": 2.2380066992226446e-05, + "loss": 0.76737463, + "num_input_tokens_seen": 390195216, + "router_z_loss_mlp": 0.23828125, + "step": 4716, + "time_per_iteration": 2.7184197902679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062655, + "balance_loss_mlp": 1.03995764, + "epoch": 0.9074644093882263, + "flos": 555798412800.0, + "grad_norm": 0.05611379446254042, + "language_loss": 0.8870452, + "learning_rate": 2.2287995408722617e-05, + "loss": 0.89767182, + "num_input_tokens_seen": 390263216, + "router_z_loss_mlp": 0.22705078, + "step": 4717, + "time_per_iteration": 2.645702838897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064312, + "balance_loss_mlp": 1.04135203, + "epoch": 0.9076567910734898, + "flos": 640994798592.0, + "grad_norm": 0.06604974299876817, + "language_loss": 0.82842344, + "learning_rate": 2.2196109289467083e-05, + "loss": 0.83906651, + "num_input_tokens_seen": 390337360, + "router_z_loss_mlp": 0.22961426, + "step": 4718, + "time_per_iteration": 2.7762739658355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062525, + "balance_loss_mlp": 1.03921926, + "epoch": 0.9078491727587533, + "flos": 733998744576.0, + "grad_norm": 0.05329973155355116, + "language_loss": 0.81630248, + "learning_rate": 2.2104408670133193e-05, + "loss": 0.82692772, + "num_input_tokens_seen": 390427728, + "router_z_loss_mlp": 0.23303223, + "step": 4719, + "time_per_iteration": 3.11417555809021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106129, + "balance_loss_mlp": 1.03837776, + "epoch": 0.9080415544440169, + "flos": 654774492672.0, + "grad_norm": 0.054605753235731906, + "language_loss": 0.86558378, + "learning_rate": 2.2012893586322245e-05, + "loss": 0.87619674, + "num_input_tokens_seen": 390504736, + "router_z_loss_mlp": 0.22924805, + "step": 4720, + "time_per_iteration": 2.8445792198181152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060559, + "balance_loss_mlp": 1.03724146, + "epoch": 0.9082339361292805, + "flos": 597463838208.0, + "grad_norm": 0.055554833776014695, + "language_loss": 0.79743761, + "learning_rate": 2.1921564073563604e-05, + "loss": 0.80804324, + "num_input_tokens_seen": 390582048, + "router_z_loss_mlp": 0.23303223, + "step": 4721, + "time_per_iteration": 2.743527412414551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106695, + "balance_loss_mlp": 1.04333496, + "epoch": 0.9084263178145441, + "flos": 504407761920.0, + "grad_norm": 0.05842313627094751, + "language_loss": 0.84558177, + "learning_rate": 2.183042016731457e-05, + "loss": 0.85625124, + "num_input_tokens_seen": 390652976, + "router_z_loss_mlp": 0.23608398, + "step": 4722, + "time_per_iteration": 2.6498191356658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065647, + "balance_loss_mlp": 1.04142332, + "epoch": 0.9086186994998077, + "flos": 550031482368.0, + "grad_norm": 0.0722517857317367, + "language_loss": 0.80876398, + "learning_rate": 2.1739461902960223e-05, + "loss": 0.81942052, + "num_input_tokens_seen": 390726832, + "router_z_loss_mlp": 0.2421875, + "step": 4723, + "time_per_iteration": 2.6922495365142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061199, + "balance_loss_mlp": 1.03808427, + "epoch": 0.9088110811850711, + "flos": 1134076847616.0, + "grad_norm": 0.07453846323964182, + "language_loss": 0.75279224, + "learning_rate": 2.1648689315813763e-05, + "loss": 0.76340425, + "num_input_tokens_seen": 390824480, + "router_z_loss_mlp": 0.23120117, + "step": 4724, + "time_per_iteration": 3.5442843437194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068171, + "balance_loss_mlp": 1.04486537, + "epoch": 0.9090034628703347, + "flos": 556991811072.0, + "grad_norm": 0.052486424573562215, + "language_loss": 0.7726813, + "learning_rate": 2.155810244111628e-05, + "loss": 0.78336298, + "num_input_tokens_seen": 390897552, + "router_z_loss_mlp": 0.23291016, + "step": 4725, + "time_per_iteration": 2.712711811065674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064643, + "balance_loss_mlp": 1.04276848, + "epoch": 0.9091958445555983, + "flos": 543970515456.0, + "grad_norm": 0.05499523521606461, + "language_loss": 0.84340453, + "learning_rate": 2.146770131403658e-05, + "loss": 0.85405099, + "num_input_tokens_seen": 390969008, + "router_z_loss_mlp": 0.21887207, + "step": 4726, + "time_per_iteration": 2.6923320293426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061768, + "balance_loss_mlp": 1.03845048, + "epoch": 0.9093882262408619, + "flos": 526113957888.0, + "grad_norm": 0.06073661706231448, + "language_loss": 0.81389105, + "learning_rate": 2.1377485969671594e-05, + "loss": 0.82450879, + "num_input_tokens_seen": 391038880, + "router_z_loss_mlp": 0.23291016, + "step": 4727, + "time_per_iteration": 2.64753794670105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105825, + "balance_loss_mlp": 1.03569603, + "epoch": 0.9095806079261254, + "flos": 548526795264.0, + "grad_norm": 0.06249947312490014, + "language_loss": 0.82006919, + "learning_rate": 2.1287456443046084e-05, + "loss": 0.83065176, + "num_input_tokens_seen": 391106720, + "router_z_loss_mlp": 0.22558594, + "step": 4728, + "time_per_iteration": 2.6654014587402344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061076, + "balance_loss_mlp": 1.03806877, + "epoch": 0.909772989611389, + "flos": 572535724032.0, + "grad_norm": 0.06570062173363597, + "language_loss": 0.84565747, + "learning_rate": 2.1197612769112528e-05, + "loss": 0.85626823, + "num_input_tokens_seen": 391178128, + "router_z_loss_mlp": 0.23010254, + "step": 4729, + "time_per_iteration": 2.7289013862609863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063857, + "balance_loss_mlp": 1.04019403, + "epoch": 0.9099653712966526, + "flos": 561812391936.0, + "grad_norm": 0.09449980732831745, + "language_loss": 0.79957283, + "learning_rate": 2.1107954982751254e-05, + "loss": 0.81021142, + "num_input_tokens_seen": 391248848, + "router_z_loss_mlp": 0.23657227, + "step": 4730, + "time_per_iteration": 2.6817986965179443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059413, + "balance_loss_mlp": 1.03622675, + "epoch": 0.9101577529819161, + "flos": 1093800112128.0, + "grad_norm": 0.06465270600581106, + "language_loss": 0.80294889, + "learning_rate": 2.101848311877069e-05, + "loss": 0.81354308, + "num_input_tokens_seen": 391328000, + "router_z_loss_mlp": 0.23181152, + "step": 4731, + "time_per_iteration": 3.3533620834350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062594, + "balance_loss_mlp": 1.03854942, + "epoch": 0.9103501346671797, + "flos": 445444116480.0, + "grad_norm": 0.07092257188337395, + "language_loss": 0.82299185, + "learning_rate": 2.092919721190678e-05, + "loss": 0.83361781, + "num_input_tokens_seen": 391391616, + "router_z_loss_mlp": 0.24047852, + "step": 4732, + "time_per_iteration": 2.5148427486419678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062948, + "balance_loss_mlp": 1.0397495, + "epoch": 0.9105425163524432, + "flos": 500770667520.0, + "grad_norm": 0.06939969815016095, + "language_loss": 0.77697742, + "learning_rate": 2.0840097296823346e-05, + "loss": 0.78760689, + "num_input_tokens_seen": 391461312, + "router_z_loss_mlp": 0.23205566, + "step": 4733, + "time_per_iteration": 2.642012596130371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065924, + "balance_loss_mlp": 1.04153371, + "epoch": 0.9107348980377068, + "flos": 657519565824.0, + "grad_norm": 0.05133792278830391, + "language_loss": 0.84146416, + "learning_rate": 2.0751183408112162e-05, + "loss": 0.85212338, + "num_input_tokens_seen": 391542192, + "router_z_loss_mlp": 0.24377441, + "step": 4734, + "time_per_iteration": 2.835914134979248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106081, + "balance_loss_mlp": 1.03760004, + "epoch": 0.9109272797229704, + "flos": 553668576768.0, + "grad_norm": 0.06017445615271462, + "language_loss": 0.84846371, + "learning_rate": 2.066245558029256e-05, + "loss": 0.85907179, + "num_input_tokens_seen": 391609968, + "router_z_loss_mlp": 0.23205566, + "step": 4735, + "time_per_iteration": 2.6190714836120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066064, + "balance_loss_mlp": 1.04261589, + "epoch": 0.911119661408234, + "flos": 519007896576.0, + "grad_norm": 0.06548686029452257, + "language_loss": 0.83979481, + "learning_rate": 2.057391384781182e-05, + "loss": 0.85045546, + "num_input_tokens_seen": 391681264, + "router_z_loss_mlp": 0.23449707, + "step": 4736, + "time_per_iteration": 2.619947910308838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064319, + "balance_loss_mlp": 1.04119277, + "epoch": 0.9113120430934974, + "flos": 554375218176.0, + "grad_norm": 0.06468079710264198, + "language_loss": 0.8364414, + "learning_rate": 2.0485558245044834e-05, + "loss": 0.84708458, + "num_input_tokens_seen": 391751392, + "router_z_loss_mlp": 0.23144531, + "step": 4737, + "time_per_iteration": 2.673065662384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106403, + "balance_loss_mlp": 1.04109359, + "epoch": 0.911504424778761, + "flos": 501889913856.0, + "grad_norm": 0.06603092013772342, + "language_loss": 0.81557083, + "learning_rate": 2.0397388806294216e-05, + "loss": 0.82621109, + "num_input_tokens_seen": 391823952, + "router_z_loss_mlp": 0.22937012, + "step": 4738, + "time_per_iteration": 2.6410467624664307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068098, + "balance_loss_mlp": 1.04519773, + "epoch": 0.9116968064640246, + "flos": 611100370944.0, + "grad_norm": 0.0574544512840337, + "language_loss": 0.82125366, + "learning_rate": 2.0309405565790527e-05, + "loss": 0.83193469, + "num_input_tokens_seen": 391895264, + "router_z_loss_mlp": 0.22888184, + "step": 4739, + "time_per_iteration": 2.7088229656219482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061118, + "balance_loss_mlp": 1.03840876, + "epoch": 0.9118891881492882, + "flos": 572918593536.0, + "grad_norm": 0.06015748588292996, + "language_loss": 0.82474881, + "learning_rate": 2.0221608557691895e-05, + "loss": 0.83536005, + "num_input_tokens_seen": 391973040, + "router_z_loss_mlp": 0.22717285, + "step": 4740, + "time_per_iteration": 2.7565977573394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106216, + "balance_loss_mlp": 1.03874695, + "epoch": 0.9120815698345518, + "flos": 635961673728.0, + "grad_norm": 0.0683916463523955, + "language_loss": 0.78226697, + "learning_rate": 2.0133997816083992e-05, + "loss": 0.79288852, + "num_input_tokens_seen": 392048160, + "router_z_loss_mlp": 0.23400879, + "step": 4741, + "time_per_iteration": 2.827404499053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059165, + "balance_loss_mlp": 1.0367775, + "epoch": 0.9122739515198153, + "flos": 702300824064.0, + "grad_norm": 0.07740625754984397, + "language_loss": 0.86271739, + "learning_rate": 2.0046573374980447e-05, + "loss": 0.87330902, + "num_input_tokens_seen": 392128960, + "router_z_loss_mlp": 0.22387695, + "step": 4742, + "time_per_iteration": 2.874316692352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067284, + "balance_loss_mlp": 1.04440832, + "epoch": 0.9124663332050789, + "flos": 524690763264.0, + "grad_norm": 0.07197983695129517, + "language_loss": 0.87550807, + "learning_rate": 1.995933526832239e-05, + "loss": 0.88618088, + "num_input_tokens_seen": 392195008, + "router_z_loss_mlp": 0.2286377, + "step": 4743, + "time_per_iteration": 2.7014012336730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064508, + "balance_loss_mlp": 1.04181051, + "epoch": 0.9126587148903424, + "flos": 563299826688.0, + "grad_norm": 0.056151781575863396, + "language_loss": 0.82793915, + "learning_rate": 1.9872283529978662e-05, + "loss": 0.83858418, + "num_input_tokens_seen": 392265168, + "router_z_loss_mlp": 0.22705078, + "step": 4744, + "time_per_iteration": 2.688795566558838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062409, + "balance_loss_mlp": 1.03963971, + "epoch": 0.912851096575606, + "flos": 505942184448.0, + "grad_norm": 0.06468850239318244, + "language_loss": 0.79841912, + "learning_rate": 1.978541819374574e-05, + "loss": 0.80904323, + "num_input_tokens_seen": 392329456, + "router_z_loss_mlp": 0.22766113, + "step": 4745, + "time_per_iteration": 2.5829083919525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066216, + "balance_loss_mlp": 1.04246938, + "epoch": 0.9130434782608695, + "flos": 550730783232.0, + "grad_norm": 0.06526804508187888, + "language_loss": 0.82502508, + "learning_rate": 1.9698739293347755e-05, + "loss": 0.83568728, + "num_input_tokens_seen": 392397792, + "router_z_loss_mlp": 0.23718262, + "step": 4746, + "time_per_iteration": 2.6205508708953857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106124, + "balance_loss_mlp": 1.03875685, + "epoch": 0.9132358599461331, + "flos": 468976200192.0, + "grad_norm": 0.072585776540836, + "language_loss": 0.83583778, + "learning_rate": 1.9612246862436456e-05, + "loss": 0.84645015, + "num_input_tokens_seen": 392462928, + "router_z_loss_mlp": 0.22473145, + "step": 4747, + "time_per_iteration": 2.5356056690216064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062146, + "balance_loss_mlp": 1.03998518, + "epoch": 0.9134282416313967, + "flos": 506097828864.0, + "grad_norm": 0.06194777952555315, + "language_loss": 0.80193496, + "learning_rate": 1.9525940934591148e-05, + "loss": 0.81255639, + "num_input_tokens_seen": 392531840, + "router_z_loss_mlp": 0.22167969, + "step": 4748, + "time_per_iteration": 2.6417837142944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062825, + "balance_loss_mlp": 1.03936434, + "epoch": 0.9136206233166603, + "flos": 604819519488.0, + "grad_norm": 0.06274709516347185, + "language_loss": 0.84114301, + "learning_rate": 1.9439821543318748e-05, + "loss": 0.85177124, + "num_input_tokens_seen": 392602464, + "router_z_loss_mlp": 0.23461914, + "step": 4749, + "time_per_iteration": 2.7290050983428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065104, + "balance_loss_mlp": 1.04191816, + "epoch": 0.9138130050019239, + "flos": 561738240000.0, + "grad_norm": 0.05949272956621823, + "language_loss": 0.83238935, + "learning_rate": 1.9353888722053793e-05, + "loss": 0.84304041, + "num_input_tokens_seen": 392669872, + "router_z_loss_mlp": 0.23181152, + "step": 4750, + "time_per_iteration": 2.6649882793426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065722, + "balance_loss_mlp": 1.04198802, + "epoch": 0.9140053866871873, + "flos": 690117221376.0, + "grad_norm": 0.051828523329950985, + "language_loss": 0.90372276, + "learning_rate": 1.9268142504158426e-05, + "loss": 0.91437995, + "num_input_tokens_seen": 392744256, + "router_z_loss_mlp": 0.23742676, + "step": 4751, + "time_per_iteration": 2.8122756481170654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063116, + "balance_loss_mlp": 1.04025221, + "epoch": 0.9141977683724509, + "flos": 551012336640.0, + "grad_norm": 0.05513697084655646, + "language_loss": 0.84049332, + "learning_rate": 1.9182582922922186e-05, + "loss": 0.85112453, + "num_input_tokens_seen": 392816832, + "router_z_loss_mlp": 0.22851562, + "step": 4752, + "time_per_iteration": 2.690352439880371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065089, + "balance_loss_mlp": 1.04072297, + "epoch": 0.9143901500577145, + "flos": 540088943616.0, + "grad_norm": 0.059079736183717625, + "language_loss": 0.75681716, + "learning_rate": 1.9097210011562228e-05, + "loss": 0.76746798, + "num_input_tokens_seen": 392886304, + "router_z_loss_mlp": 0.24353027, + "step": 4753, + "time_per_iteration": 2.6606411933898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062336, + "balance_loss_mlp": 1.03876805, + "epoch": 0.9145825317429781, + "flos": 528767626752.0, + "grad_norm": 0.06000540619598026, + "language_loss": 0.81170249, + "learning_rate": 1.9012023803223366e-05, + "loss": 0.82232583, + "num_input_tokens_seen": 392955872, + "router_z_loss_mlp": 0.23547363, + "step": 4754, + "time_per_iteration": 2.6054999828338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069817, + "balance_loss_mlp": 1.04665422, + "epoch": 0.9147749134282416, + "flos": 514792641024.0, + "grad_norm": 0.060337107580878784, + "language_loss": 0.79316139, + "learning_rate": 1.892702433097776e-05, + "loss": 0.80385947, + "num_input_tokens_seen": 393025776, + "router_z_loss_mlp": 0.23156738, + "step": 4755, + "time_per_iteration": 2.69866943359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106264, + "balance_loss_mlp": 1.04012132, + "epoch": 0.9149672951135052, + "flos": 514441704960.0, + "grad_norm": 0.057247927056212296, + "language_loss": 0.85906756, + "learning_rate": 1.8842211627825233e-05, + "loss": 0.86969399, + "num_input_tokens_seen": 393095936, + "router_z_loss_mlp": 0.22521973, + "step": 4756, + "time_per_iteration": 2.657858371734619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062369, + "balance_loss_mlp": 1.03919435, + "epoch": 0.9151596767987688, + "flos": 577069608960.0, + "grad_norm": 0.05578816097399521, + "language_loss": 0.82000184, + "learning_rate": 1.8757585726692727e-05, + "loss": 0.83062547, + "num_input_tokens_seen": 393166816, + "router_z_loss_mlp": 0.23144531, + "step": 4757, + "time_per_iteration": 2.7571098804473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106101, + "balance_loss_mlp": 1.03912318, + "epoch": 0.9153520584840323, + "flos": 619335590400.0, + "grad_norm": 0.055386707706292025, + "language_loss": 0.82802898, + "learning_rate": 1.8673146660435182e-05, + "loss": 0.83863914, + "num_input_tokens_seen": 393242176, + "router_z_loss_mlp": 0.21911621, + "step": 4758, + "time_per_iteration": 2.746701240539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066355, + "balance_loss_mlp": 1.04191732, + "epoch": 0.9155444401692959, + "flos": 468921871872.0, + "grad_norm": 0.055003918587598934, + "language_loss": 0.82849371, + "learning_rate": 1.8588894461834704e-05, + "loss": 0.83915728, + "num_input_tokens_seen": 393311792, + "router_z_loss_mlp": 0.2442627, + "step": 4759, + "time_per_iteration": 2.599597454071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01000937, + "balance_loss_mlp": 0.99421328, + "epoch": 0.9157368218545594, + "flos": 1410711054336.0, + "grad_norm": 0.006985610958737596, + "language_loss": 0.7481907, + "learning_rate": 1.8504829163600855e-05, + "loss": 0.75820005, + "num_input_tokens_seen": 393535648, + "router_z_loss_mlp": 0.06738281, + "step": 4760, + "time_per_iteration": 4.837120294570923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01000943, + "balance_loss_mlp": 0.99421966, + "epoch": 0.915929203539823, + "flos": 1522019040768.0, + "grad_norm": 0.006987645393502265, + "language_loss": 0.79576051, + "learning_rate": 1.8420950798370584e-05, + "loss": 0.80576992, + "num_input_tokens_seen": 393767040, + "router_z_loss_mlp": 0.06738281, + "step": 4761, + "time_per_iteration": 4.932307720184326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062464, + "balance_loss_mlp": 1.03865767, + "epoch": 0.9161215852250866, + "flos": 535752548352.0, + "grad_norm": 0.06103904821656309, + "language_loss": 0.80316961, + "learning_rate": 1.8337259398708616e-05, + "loss": 0.81379426, + "num_input_tokens_seen": 393841232, + "router_z_loss_mlp": 0.23803711, + "step": 4762, + "time_per_iteration": 2.7238500118255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066667, + "balance_loss_mlp": 1.04348063, + "epoch": 0.9163139669103502, + "flos": 590624649216.0, + "grad_norm": 0.057935792953032535, + "language_loss": 0.80923164, + "learning_rate": 1.8253754997106632e-05, + "loss": 0.81989825, + "num_input_tokens_seen": 393910512, + "router_z_loss_mlp": 0.23181152, + "step": 4763, + "time_per_iteration": 2.7679901123046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063834, + "balance_loss_mlp": 1.04057622, + "epoch": 0.9165063485956138, + "flos": 821975081472.0, + "grad_norm": 0.051758643461764085, + "language_loss": 0.84645653, + "learning_rate": 1.817043762598397e-05, + "loss": 0.85709482, + "num_input_tokens_seen": 393988624, + "router_z_loss_mlp": 0.23254395, + "step": 4764, + "time_per_iteration": 3.0768322944641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105787, + "balance_loss_mlp": 1.03519642, + "epoch": 0.9166987302808772, + "flos": 525194772480.0, + "grad_norm": 0.05556449671285437, + "language_loss": 0.8234725, + "learning_rate": 1.8087307317687264e-05, + "loss": 0.83405113, + "num_input_tokens_seen": 394059184, + "router_z_loss_mlp": 0.22668457, + "step": 4765, + "time_per_iteration": 2.660769462585449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062896, + "balance_loss_mlp": 1.03945971, + "epoch": 0.9168911119661408, + "flos": 655095693312.0, + "grad_norm": 0.049564714523440044, + "language_loss": 0.84340852, + "learning_rate": 1.800436410449058e-05, + "loss": 0.85403752, + "num_input_tokens_seen": 394142160, + "router_z_loss_mlp": 0.23425293, + "step": 4766, + "time_per_iteration": 2.9556195735931396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063242, + "balance_loss_mlp": 1.03998423, + "epoch": 0.9170834936514044, + "flos": 491747314176.0, + "grad_norm": 0.07059908264734456, + "language_loss": 0.852781, + "learning_rate": 1.7921608018595436e-05, + "loss": 0.86341345, + "num_input_tokens_seen": 394207056, + "router_z_loss_mlp": 0.2322998, + "step": 4767, + "time_per_iteration": 2.568305730819702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065741, + "balance_loss_mlp": 1.04228079, + "epoch": 0.917275875336668, + "flos": 628040314368.0, + "grad_norm": 0.0724753532377933, + "language_loss": 0.80676091, + "learning_rate": 1.7839039092130415e-05, + "loss": 0.81741834, + "num_input_tokens_seen": 394275456, + "router_z_loss_mlp": 0.234375, + "step": 4768, + "time_per_iteration": 2.8312792778015137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01001118, + "balance_loss_mlp": 0.99439502, + "epoch": 0.9174682570219315, + "flos": 1517981824512.0, + "grad_norm": 0.00699799346191008, + "language_loss": 0.78180236, + "learning_rate": 1.7756657357151762e-05, + "loss": 0.79181355, + "num_input_tokens_seen": 394503808, + "router_z_loss_mlp": 0.06738281, + "step": 4769, + "time_per_iteration": 4.91847825050354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106499, + "balance_loss_mlp": 1.04138637, + "epoch": 0.917660638707195, + "flos": 560021008896.0, + "grad_norm": 0.06313513734114481, + "language_loss": 0.85138547, + "learning_rate": 1.7674462845642835e-05, + "loss": 0.86203539, + "num_input_tokens_seen": 394573776, + "router_z_loss_mlp": 0.23608398, + "step": 4770, + "time_per_iteration": 2.701441764831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065065, + "balance_loss_mlp": 1.04179573, + "epoch": 0.9178530203924586, + "flos": 447252751872.0, + "grad_norm": 0.05958541802971449, + "language_loss": 0.83878446, + "learning_rate": 1.7592455589514387e-05, + "loss": 0.84943509, + "num_input_tokens_seen": 394637600, + "router_z_loss_mlp": 0.23242188, + "step": 4771, + "time_per_iteration": 2.461113691329956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061371, + "balance_loss_mlp": 1.03884006, + "epoch": 0.9180454020777222, + "flos": 465981507072.0, + "grad_norm": 0.055111772688767, + "language_loss": 0.8086372, + "learning_rate": 1.7510635620604453e-05, + "loss": 0.81925088, + "num_input_tokens_seen": 394707344, + "router_z_loss_mlp": 0.2253418, + "step": 4772, + "time_per_iteration": 2.603609561920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063308, + "balance_loss_mlp": 1.04161239, + "epoch": 0.9182377837629858, + "flos": 596314856448.0, + "grad_norm": 0.05471349144669564, + "language_loss": 0.87369776, + "learning_rate": 1.74290029706784e-05, + "loss": 0.88433087, + "num_input_tokens_seen": 394786368, + "router_z_loss_mlp": 0.21728516, + "step": 4773, + "time_per_iteration": 2.780123233795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065111, + "balance_loss_mlp": 1.04175746, + "epoch": 0.9184301654482493, + "flos": 996671941632.0, + "grad_norm": 0.04652666616854213, + "language_loss": 0.82479548, + "learning_rate": 1.734755767142876e-05, + "loss": 0.8354466, + "num_input_tokens_seen": 394876976, + "router_z_loss_mlp": 0.23352051, + "step": 4774, + "time_per_iteration": 3.391968011856079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065291, + "balance_loss_mlp": 1.04208064, + "epoch": 0.9186225471335129, + "flos": 508860154368.0, + "grad_norm": 0.04816843723354433, + "language_loss": 0.8497929, + "learning_rate": 1.7266299754475467e-05, + "loss": 0.8604458, + "num_input_tokens_seen": 394949024, + "router_z_loss_mlp": 0.23242188, + "step": 4775, + "time_per_iteration": 2.6323952674865723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065856, + "balance_loss_mlp": 1.04232407, + "epoch": 0.9188149288187765, + "flos": 940423633920.0, + "grad_norm": 0.05644006303487493, + "language_loss": 0.79136419, + "learning_rate": 1.718522925136551e-05, + "loss": 0.8020227, + "num_input_tokens_seen": 395044352, + "router_z_loss_mlp": 0.23535156, + "step": 4776, + "time_per_iteration": 3.336750030517578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057658, + "balance_loss_mlp": 1.03536582, + "epoch": 0.91900731050404, + "flos": 583674232320.0, + "grad_norm": 0.05731866001862462, + "language_loss": 0.84241879, + "learning_rate": 1.7104346193573484e-05, + "loss": 0.8529954, + "num_input_tokens_seen": 395113824, + "router_z_loss_mlp": 0.22302246, + "step": 4777, + "time_per_iteration": 2.6611621379852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063108, + "balance_loss_mlp": 1.03984988, + "epoch": 0.9191996921893035, + "flos": 581213283840.0, + "grad_norm": 0.07522007256153451, + "language_loss": 0.7975679, + "learning_rate": 1.7023650612500828e-05, + "loss": 0.80819893, + "num_input_tokens_seen": 395184496, + "router_z_loss_mlp": 0.23242188, + "step": 4778, + "time_per_iteration": 2.712597370147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063497, + "balance_loss_mlp": 1.04007208, + "epoch": 0.9193920738745671, + "flos": 908935686144.0, + "grad_norm": 0.08774646197038768, + "language_loss": 0.80187458, + "learning_rate": 1.6943142539476374e-05, + "loss": 0.81250954, + "num_input_tokens_seen": 395263760, + "router_z_loss_mlp": 0.23413086, + "step": 4779, + "time_per_iteration": 3.1365673542022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01000653, + "balance_loss_mlp": 0.99392956, + "epoch": 0.9195844555598307, + "flos": 1558372359168.0, + "grad_norm": 0.007681036007533729, + "language_loss": 0.79795396, + "learning_rate": 1.686282200575606e-05, + "loss": 0.80796051, + "num_input_tokens_seen": 395482384, + "router_z_loss_mlp": 0.06738281, + "step": 4780, + "time_per_iteration": 4.7075746059417725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064217, + "balance_loss_mlp": 1.04043531, + "epoch": 0.9197768372450943, + "flos": 474053741568.0, + "grad_norm": 0.06999993636245827, + "language_loss": 0.78756964, + "learning_rate": 1.678268904252317e-05, + "loss": 0.79821181, + "num_input_tokens_seen": 395550384, + "router_z_loss_mlp": 0.23791504, + "step": 4781, + "time_per_iteration": 2.5254733562469482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065513, + "balance_loss_mlp": 1.0416832, + "epoch": 0.9199692189303579, + "flos": 857016059904.0, + "grad_norm": 0.06498458366260695, + "language_loss": 0.84505671, + "learning_rate": 1.6702743680888088e-05, + "loss": 0.85571182, + "num_input_tokens_seen": 395632320, + "router_z_loss_mlp": 0.23840332, + "step": 4782, + "time_per_iteration": 3.2329697608947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064654, + "balance_loss_mlp": 1.04155171, + "epoch": 0.9201616006156214, + "flos": 504390509568.0, + "grad_norm": 0.060298368895520246, + "language_loss": 0.77588093, + "learning_rate": 1.6622985951888327e-05, + "loss": 0.78652751, + "num_input_tokens_seen": 395703856, + "router_z_loss_mlp": 0.23071289, + "step": 4783, + "time_per_iteration": 2.6278817653656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106489, + "balance_loss_mlp": 1.04122746, + "epoch": 0.9203539823008849, + "flos": 548781184512.0, + "grad_norm": 0.12392133111659505, + "language_loss": 0.84999621, + "learning_rate": 1.6543415886488554e-05, + "loss": 0.86064506, + "num_input_tokens_seen": 395779456, + "router_z_loss_mlp": 0.2364502, + "step": 4784, + "time_per_iteration": 2.7395739555358887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065218, + "balance_loss_mlp": 1.04228246, + "epoch": 0.9205463639861485, + "flos": 540004879872.0, + "grad_norm": 0.07902513425092665, + "language_loss": 0.82629323, + "learning_rate": 1.6464033515580624e-05, + "loss": 0.83694541, + "num_input_tokens_seen": 395849584, + "router_z_loss_mlp": 0.22912598, + "step": 4785, + "time_per_iteration": 2.6370961666107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064652, + "balance_loss_mlp": 1.04096556, + "epoch": 0.9207387456714121, + "flos": 799725229056.0, + "grad_norm": 0.05792536374091078, + "language_loss": 0.78340071, + "learning_rate": 1.6384838869983488e-05, + "loss": 0.79404724, + "num_input_tokens_seen": 395943712, + "router_z_loss_mlp": 0.23681641, + "step": 4786, + "time_per_iteration": 3.0584778785705566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065903, + "balance_loss_mlp": 1.04258537, + "epoch": 0.9209311273566756, + "flos": 502848746496.0, + "grad_norm": 0.058744664162000707, + "language_loss": 0.78801596, + "learning_rate": 1.630583198044333e-05, + "loss": 0.798675, + "num_input_tokens_seen": 396013168, + "router_z_loss_mlp": 0.23303223, + "step": 4787, + "time_per_iteration": 2.7021265029907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063265, + "balance_loss_mlp": 1.04010296, + "epoch": 0.9211235090419392, + "flos": 569323717632.0, + "grad_norm": 0.056492166982787126, + "language_loss": 0.82674456, + "learning_rate": 1.6227012877633173e-05, + "loss": 0.83737719, + "num_input_tokens_seen": 396082032, + "router_z_loss_mlp": 0.23168945, + "step": 4788, + "time_per_iteration": 2.7052977085113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064131, + "balance_loss_mlp": 1.04143345, + "epoch": 0.9213158907272028, + "flos": 806549736960.0, + "grad_norm": 0.07226120806501657, + "language_loss": 0.82702368, + "learning_rate": 1.6148381592153538e-05, + "loss": 0.83766496, + "num_input_tokens_seen": 396157984, + "router_z_loss_mlp": 0.22717285, + "step": 4789, + "time_per_iteration": 2.9990592002868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065389, + "balance_loss_mlp": 1.04177368, + "epoch": 0.9215082724124664, + "flos": 490682396160.0, + "grad_norm": 0.05541559131448945, + "language_loss": 0.76416653, + "learning_rate": 1.6069938154531618e-05, + "loss": 0.77482045, + "num_input_tokens_seen": 396223840, + "router_z_loss_mlp": 0.23632812, + "step": 4790, + "time_per_iteration": 2.5554823875427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01002062, + "balance_loss_mlp": 0.99533838, + "epoch": 0.9217006540977299, + "flos": 1514495232000.0, + "grad_norm": 0.006305010355530082, + "language_loss": 0.77070266, + "learning_rate": 1.599168259522188e-05, + "loss": 0.78072327, + "num_input_tokens_seen": 396458288, + "router_z_loss_mlp": 0.06738281, + "step": 4791, + "time_per_iteration": 5.046639442443848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106511, + "balance_loss_mlp": 1.04191208, + "epoch": 0.9218930357829934, + "flos": 743793352704.0, + "grad_norm": 0.05641676981244117, + "language_loss": 0.76365674, + "learning_rate": 1.5913614944605804e-05, + "loss": 0.77430785, + "num_input_tokens_seen": 396536208, + "router_z_loss_mlp": 0.23181152, + "step": 4792, + "time_per_iteration": 2.9390647411346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061659, + "balance_loss_mlp": 1.03788924, + "epoch": 0.922085417468257, + "flos": 453036934656.0, + "grad_norm": 0.06341496172003151, + "language_loss": 0.80764413, + "learning_rate": 1.5835735232992032e-05, + "loss": 0.81826079, + "num_input_tokens_seen": 396599984, + "router_z_loss_mlp": 0.23742676, + "step": 4793, + "time_per_iteration": 2.519322633743286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059745, + "balance_loss_mlp": 1.03674984, + "epoch": 0.9222777991535206, + "flos": 500249405952.0, + "grad_norm": 0.062344579720582986, + "language_loss": 0.85091174, + "learning_rate": 1.575804349061616e-05, + "loss": 0.8615092, + "num_input_tokens_seen": 396664592, + "router_z_loss_mlp": 0.2298584, + "step": 4794, + "time_per_iteration": 2.595057249069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064993, + "balance_loss_mlp": 1.04115093, + "epoch": 0.9224701808387842, + "flos": 527959669248.0, + "grad_norm": 0.06403867116354088, + "language_loss": 0.78786629, + "learning_rate": 1.5680539747640722e-05, + "loss": 0.79851627, + "num_input_tokens_seen": 396729472, + "router_z_loss_mlp": 0.23864746, + "step": 4795, + "time_per_iteration": 2.566840887069702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066799, + "balance_loss_mlp": 1.04436338, + "epoch": 0.9226625625240477, + "flos": 874640623104.0, + "grad_norm": 0.06236913833267484, + "language_loss": 0.75503278, + "learning_rate": 1.5603224034155315e-05, + "loss": 0.76570076, + "num_input_tokens_seen": 396810384, + "router_z_loss_mlp": 0.2244873, + "step": 4796, + "time_per_iteration": 3.1768314838409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068962, + "balance_loss_mlp": 1.04589522, + "epoch": 0.9228549442093112, + "flos": 502774594560.0, + "grad_norm": 0.09209245774607971, + "language_loss": 0.88015127, + "learning_rate": 1.5526096380176657e-05, + "loss": 0.89084095, + "num_input_tokens_seen": 396875472, + "router_z_loss_mlp": 0.23059082, + "step": 4797, + "time_per_iteration": 2.5515801906585693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067427, + "balance_loss_mlp": 1.04427648, + "epoch": 0.9230473258945748, + "flos": 599989026816.0, + "grad_norm": 0.062149405212805944, + "language_loss": 0.85267746, + "learning_rate": 1.544915681564829e-05, + "loss": 0.8633517, + "num_input_tokens_seen": 396949888, + "router_z_loss_mlp": 0.23156738, + "step": 4798, + "time_per_iteration": 2.7861833572387695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060939, + "balance_loss_mlp": 1.03778911, + "epoch": 0.9232397075798384, + "flos": 822508826112.0, + "grad_norm": 0.06277805502935233, + "language_loss": 0.79576015, + "learning_rate": 1.5372405370440822e-05, + "loss": 0.80636954, + "num_input_tokens_seen": 397027504, + "router_z_loss_mlp": 0.23132324, + "step": 4799, + "time_per_iteration": 3.07875394821167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067703, + "balance_loss_mlp": 1.04437423, + "epoch": 0.923432089265102, + "flos": 707030000640.0, + "grad_norm": 0.06720277789755563, + "language_loss": 0.84717023, + "learning_rate": 1.5295842074351805e-05, + "loss": 0.85784721, + "num_input_tokens_seen": 397101600, + "router_z_loss_mlp": 0.2331543, + "step": 4800, + "time_per_iteration": 2.8867857456207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062759, + "balance_loss_mlp": 1.03977597, + "epoch": 0.9236244709503655, + "flos": 701861054976.0, + "grad_norm": 0.06232849747601707, + "language_loss": 0.76892114, + "learning_rate": 1.5219466957105798e-05, + "loss": 0.77954876, + "num_input_tokens_seen": 397170880, + "router_z_loss_mlp": 0.22973633, + "step": 4801, + "time_per_iteration": 2.8227624893188477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106451, + "balance_loss_mlp": 1.0407877, + "epoch": 0.9238168526356291, + "flos": 515039689728.0, + "grad_norm": 0.052686499610644186, + "language_loss": 0.84125519, + "learning_rate": 1.5143280048354136e-05, + "loss": 0.85190028, + "num_input_tokens_seen": 397242272, + "router_z_loss_mlp": 0.23706055, + "step": 4802, + "time_per_iteration": 2.60945725440979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066059, + "balance_loss_mlp": 1.04199064, + "epoch": 0.9240092343208927, + "flos": 492024098304.0, + "grad_norm": 0.07585539209620575, + "language_loss": 0.81734359, + "learning_rate": 1.5067281377675213e-05, + "loss": 0.82800424, + "num_input_tokens_seen": 397308032, + "router_z_loss_mlp": 0.24072266, + "step": 4803, + "time_per_iteration": 2.5775294303894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064809, + "balance_loss_mlp": 1.04138434, + "epoch": 0.9242016160061562, + "flos": 647218750464.0, + "grad_norm": 0.05855132746903116, + "language_loss": 0.74013615, + "learning_rate": 1.4991470974574484e-05, + "loss": 0.75078428, + "num_input_tokens_seen": 397390944, + "router_z_loss_mlp": 0.234375, + "step": 4804, + "time_per_iteration": 2.8631527423858643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064495, + "balance_loss_mlp": 1.0423224, + "epoch": 0.9243939976914197, + "flos": 729430354944.0, + "grad_norm": 0.0571546365091257, + "language_loss": 0.79579568, + "learning_rate": 1.4915848868484016e-05, + "loss": 0.80644059, + "num_input_tokens_seen": 397468128, + "router_z_loss_mlp": 0.22167969, + "step": 4805, + "time_per_iteration": 2.956089496612549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064454, + "balance_loss_mlp": 1.04110074, + "epoch": 0.9245863793766833, + "flos": 452246229504.0, + "grad_norm": 0.060184319008054675, + "language_loss": 0.90989661, + "learning_rate": 1.4840415088763048e-05, + "loss": 0.92054117, + "num_input_tokens_seen": 397538976, + "router_z_loss_mlp": 0.23339844, + "step": 4806, + "time_per_iteration": 2.59251070022583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066465, + "balance_loss_mlp": 1.04270649, + "epoch": 0.9247787610619469, + "flos": 755030605824.0, + "grad_norm": 0.056769763464111264, + "language_loss": 0.77370489, + "learning_rate": 1.476516966469732e-05, + "loss": 0.78436947, + "num_input_tokens_seen": 397612944, + "router_z_loss_mlp": 0.23754883, + "step": 4807, + "time_per_iteration": 2.9537925720214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067303, + "balance_loss_mlp": 1.04301965, + "epoch": 0.9249711427472105, + "flos": 561928389120.0, + "grad_norm": 0.058387380742388154, + "language_loss": 0.85072255, + "learning_rate": 1.4690112625499908e-05, + "loss": 0.8613956, + "num_input_tokens_seen": 397690848, + "router_z_loss_mlp": 0.24267578, + "step": 4808, + "time_per_iteration": 2.76755952835083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063513, + "balance_loss_mlp": 1.03921795, + "epoch": 0.9251635244324741, + "flos": 526699459584.0, + "grad_norm": 0.061698884078940947, + "language_loss": 0.85564888, + "learning_rate": 1.4615244000310501e-05, + "loss": 0.86628401, + "num_input_tokens_seen": 397761008, + "router_z_loss_mlp": 0.24291992, + "step": 4809, + "time_per_iteration": 2.674180507659912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061235, + "balance_loss_mlp": 1.03777432, + "epoch": 0.9253559061177375, + "flos": 611280608256.0, + "grad_norm": 0.06228045172593029, + "language_loss": 0.79177982, + "learning_rate": 1.4540563818195685e-05, + "loss": 0.80239218, + "num_input_tokens_seen": 397840640, + "router_z_loss_mlp": 0.234375, + "step": 4810, + "time_per_iteration": 2.8329577445983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003033, + "balance_loss_mlp": 0.9963572, + "epoch": 0.9255482878030011, + "flos": 1551258957312.0, + "grad_norm": 0.005180533105187962, + "language_loss": 0.76925391, + "learning_rate": 1.446607210814882e-05, + "loss": 0.77928424, + "num_input_tokens_seen": 398060096, + "router_z_loss_mlp": 0.06689453, + "step": 4811, + "time_per_iteration": 4.778438329696655 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070987, + "balance_loss_mlp": 1.04688334, + "epoch": 0.9257406694882647, + "flos": 766366603776.0, + "grad_norm": 0.06467551917975638, + "language_loss": 0.81310445, + "learning_rate": 1.4391768899090219e-05, + "loss": 0.82381433, + "num_input_tokens_seen": 398143680, + "router_z_loss_mlp": 0.24084473, + "step": 4812, + "time_per_iteration": 3.0328586101531982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065422, + "balance_loss_mlp": 1.04172337, + "epoch": 0.9259330511735283, + "flos": 497991089664.0, + "grad_norm": 0.06532975975063249, + "language_loss": 0.83375204, + "learning_rate": 1.431765421986686e-05, + "loss": 0.84440625, + "num_input_tokens_seen": 398207056, + "router_z_loss_mlp": 0.23669434, + "step": 4813, + "time_per_iteration": 2.560148239135742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063752, + "balance_loss_mlp": 1.04049397, + "epoch": 0.9261254328587919, + "flos": 626874080256.0, + "grad_norm": 0.07028520883418425, + "language_loss": 0.79285073, + "learning_rate": 1.424372809925273e-05, + "loss": 0.80348825, + "num_input_tokens_seen": 398277472, + "router_z_loss_mlp": 0.23254395, + "step": 4814, + "time_per_iteration": 2.7584128379821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063691, + "balance_loss_mlp": 1.04060006, + "epoch": 0.9263178145440554, + "flos": 597382345728.0, + "grad_norm": 0.055695870259143146, + "language_loss": 0.85310483, + "learning_rate": 1.416999056594831e-05, + "loss": 0.8637417, + "num_input_tokens_seen": 398346544, + "router_z_loss_mlp": 0.23095703, + "step": 4815, + "time_per_iteration": 2.713562250137329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065101, + "balance_loss_mlp": 1.04192686, + "epoch": 0.926510196229319, + "flos": 388563319296.0, + "grad_norm": 0.060924009285229154, + "language_loss": 0.83667755, + "learning_rate": 1.4096441648581259e-05, + "loss": 0.84732854, + "num_input_tokens_seen": 398409344, + "router_z_loss_mlp": 0.23168945, + "step": 4816, + "time_per_iteration": 2.450631856918335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067046, + "balance_loss_mlp": 1.04412222, + "epoch": 0.9267025779145825, + "flos": 545798974464.0, + "grad_norm": 0.06638467341927559, + "language_loss": 0.84540778, + "learning_rate": 1.4023081375705737e-05, + "loss": 0.85607827, + "num_input_tokens_seen": 398478816, + "router_z_loss_mlp": 0.22900391, + "step": 4817, + "time_per_iteration": 2.6362295150756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065467, + "balance_loss_mlp": 1.04217339, + "epoch": 0.9268949595998461, + "flos": 499789813248.0, + "grad_norm": 0.06886658822302974, + "language_loss": 0.81961125, + "learning_rate": 1.3949909775802682e-05, + "loss": 0.83026588, + "num_input_tokens_seen": 398550384, + "router_z_loss_mlp": 0.23266602, + "step": 4818, + "time_per_iteration": 2.6441164016723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062045, + "balance_loss_mlp": 1.03836977, + "epoch": 0.9270873412851096, + "flos": 432828085248.0, + "grad_norm": 0.05488858492503732, + "language_loss": 0.82954144, + "learning_rate": 1.3876926877279817e-05, + "loss": 0.84016186, + "num_input_tokens_seen": 398620832, + "router_z_loss_mlp": 0.23669434, + "step": 4819, + "time_per_iteration": 2.62613844871521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062926, + "balance_loss_mlp": 1.03994298, + "epoch": 0.9272797229703732, + "flos": 466769640960.0, + "grad_norm": 0.05813716940805213, + "language_loss": 0.86372411, + "learning_rate": 1.380413270847164e-05, + "loss": 0.87435341, + "num_input_tokens_seen": 398689776, + "router_z_loss_mlp": 0.22973633, + "step": 4820, + "time_per_iteration": 2.588118553161621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062313, + "balance_loss_mlp": 1.03865016, + "epoch": 0.9274721046556368, + "flos": 704838122496.0, + "grad_norm": 0.08851698631196843, + "language_loss": 0.78835869, + "learning_rate": 1.373152729763938e-05, + "loss": 0.79898179, + "num_input_tokens_seen": 398775072, + "router_z_loss_mlp": 0.23632812, + "step": 4821, + "time_per_iteration": 3.013256788253784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003926, + "balance_loss_mlp": 0.99725056, + "epoch": 0.9276644863409004, + "flos": 1402255950336.0, + "grad_norm": 0.0043274702842664575, + "language_loss": 0.82380462, + "learning_rate": 1.3659110672970931e-05, + "loss": 0.83384389, + "num_input_tokens_seen": 399002016, + "router_z_loss_mlp": 0.06689453, + "step": 4822, + "time_per_iteration": 4.84527587890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063845, + "balance_loss_mlp": 1.04090953, + "epoch": 0.927856868026164, + "flos": 741722614272.0, + "grad_norm": 0.06082082219392593, + "language_loss": 0.80463547, + "learning_rate": 1.3586882862580917e-05, + "loss": 0.81527394, + "num_input_tokens_seen": 399085808, + "router_z_loss_mlp": 0.22937012, + "step": 4823, + "time_per_iteration": 3.0070438385009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067648, + "balance_loss_mlp": 1.04371095, + "epoch": 0.9280492497114274, + "flos": 412223883264.0, + "grad_norm": 0.07327384984258292, + "language_loss": 0.74385369, + "learning_rate": 1.3514843894510686e-05, + "loss": 0.75453013, + "num_input_tokens_seen": 399146768, + "router_z_loss_mlp": 0.23925781, + "step": 4824, + "time_per_iteration": 2.4685919284820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065774, + "balance_loss_mlp": 1.04250407, + "epoch": 0.928241631396691, + "flos": 646504768512.0, + "grad_norm": 0.0635930076131742, + "language_loss": 0.84229624, + "learning_rate": 1.3442993796728254e-05, + "loss": 0.85295397, + "num_input_tokens_seen": 399220192, + "router_z_loss_mlp": 0.23254395, + "step": 4825, + "time_per_iteration": 2.7666497230529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065352, + "balance_loss_mlp": 1.04148602, + "epoch": 0.9284340130819546, + "flos": 696855094272.0, + "grad_norm": 0.06956360391060491, + "language_loss": 0.80905682, + "learning_rate": 1.3371332597128249e-05, + "loss": 0.81971031, + "num_input_tokens_seen": 399300064, + "router_z_loss_mlp": 0.23852539, + "step": 4826, + "time_per_iteration": 2.9280033111572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067687, + "balance_loss_mlp": 1.0441432, + "epoch": 0.9286263947672182, + "flos": 759132062208.0, + "grad_norm": 0.0534141212700452, + "language_loss": 0.83933532, + "learning_rate": 1.3299860323532032e-05, + "loss": 0.85001218, + "num_input_tokens_seen": 399383200, + "router_z_loss_mlp": 0.23535156, + "step": 4827, + "time_per_iteration": 3.05427622795105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065524, + "balance_loss_mlp": 1.04190862, + "epoch": 0.9288187764524817, + "flos": 672823770624.0, + "grad_norm": 0.0517820493277977, + "language_loss": 0.8033545, + "learning_rate": 1.3228577003687681e-05, + "loss": 0.81400979, + "num_input_tokens_seen": 399466400, + "router_z_loss_mlp": 0.23632812, + "step": 4828, + "time_per_iteration": 2.95302152633667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065387, + "balance_loss_mlp": 1.0422368, + "epoch": 0.9290111581377453, + "flos": 500469290496.0, + "grad_norm": 0.06212228001767646, + "language_loss": 0.8410219, + "learning_rate": 1.3157482665269727e-05, + "loss": 0.85167575, + "num_input_tokens_seen": 399533504, + "router_z_loss_mlp": 0.23156738, + "step": 4829, + "time_per_iteration": 2.6186985969543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003756, + "balance_loss_mlp": 0.99707985, + "epoch": 0.9292035398230089, + "flos": 1563627566592.0, + "grad_norm": 0.004318231630907556, + "language_loss": 0.72122061, + "learning_rate": 1.3086577335879424e-05, + "loss": 0.73125815, + "num_input_tokens_seen": 399769872, + "router_z_loss_mlp": 0.06689453, + "step": 4830, + "time_per_iteration": 4.89445424079895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004264, + "balance_loss_mlp": 0.99758852, + "epoch": 0.9293959215082724, + "flos": 1518673411584.0, + "grad_norm": 0.00373636872499248, + "language_loss": 0.79511833, + "learning_rate": 1.3015861043044753e-05, + "loss": 0.805161, + "num_input_tokens_seen": 399997760, + "router_z_loss_mlp": 0.06689453, + "step": 4831, + "time_per_iteration": 4.8943281173706055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063576, + "balance_loss_mlp": 1.0404253, + "epoch": 0.929588303193536, + "flos": 557836844544.0, + "grad_norm": 0.07212229346213533, + "language_loss": 0.84529984, + "learning_rate": 1.2945333814220195e-05, + "loss": 0.85593563, + "num_input_tokens_seen": 400063872, + "router_z_loss_mlp": 0.23120117, + "step": 4832, + "time_per_iteration": 2.670252561569214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071068, + "balance_loss_mlp": 1.0462724, + "epoch": 0.9297806848787995, + "flos": 478580285952.0, + "grad_norm": 0.07862143445369829, + "language_loss": 0.80253655, + "learning_rate": 1.2874995676786905e-05, + "loss": 0.8132472, + "num_input_tokens_seen": 400126064, + "router_z_loss_mlp": 0.2479248, + "step": 4833, + "time_per_iteration": 2.5253655910491943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066312, + "balance_loss_mlp": 1.04317367, + "epoch": 0.9299730665640631, + "flos": 564537641472.0, + "grad_norm": 0.06499018101587725, + "language_loss": 0.80202889, + "learning_rate": 1.2804846658052372e-05, + "loss": 0.81269199, + "num_input_tokens_seen": 400201776, + "router_z_loss_mlp": 0.23144531, + "step": 4834, + "time_per_iteration": 2.754465341567993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067737, + "balance_loss_mlp": 1.04458618, + "epoch": 0.9301654482493267, + "flos": 560174082048.0, + "grad_norm": 0.059674157785584915, + "language_loss": 0.82694227, + "learning_rate": 1.2734886785251032e-05, + "loss": 0.83761966, + "num_input_tokens_seen": 400279504, + "router_z_loss_mlp": 0.23132324, + "step": 4835, + "time_per_iteration": 2.80916690826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003151, + "balance_loss_mlp": 0.9964751, + "epoch": 0.9303578299345903, + "flos": 1520096606208.0, + "grad_norm": 0.004416175426765907, + "language_loss": 0.76852441, + "learning_rate": 1.2665116085543715e-05, + "loss": 0.77855599, + "num_input_tokens_seen": 400514800, + "router_z_loss_mlp": 0.06689453, + "step": 4836, + "time_per_iteration": 5.02410364151001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063476, + "balance_loss_mlp": 1.04054022, + "epoch": 0.9305502116198537, + "flos": 530843134464.0, + "grad_norm": 0.0782967296120336, + "language_loss": 0.83038974, + "learning_rate": 1.2595534586017698e-05, + "loss": 0.84102452, + "num_input_tokens_seen": 400582640, + "router_z_loss_mlp": 0.22937012, + "step": 4837, + "time_per_iteration": 2.6249191761016846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062018, + "balance_loss_mlp": 1.03849769, + "epoch": 0.9307425933051173, + "flos": 474898775040.0, + "grad_norm": 0.06822826243671351, + "language_loss": 0.81786454, + "learning_rate": 1.2526142313686983e-05, + "loss": 0.82848471, + "num_input_tokens_seen": 400646912, + "router_z_loss_mlp": 0.23522949, + "step": 4838, + "time_per_iteration": 2.540736675262451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068071, + "balance_loss_mlp": 1.04443145, + "epoch": 0.9309349749903809, + "flos": 584892223488.0, + "grad_norm": 0.05350792740286597, + "language_loss": 0.86948222, + "learning_rate": 1.245693929549213e-05, + "loss": 0.88016295, + "num_input_tokens_seen": 400722128, + "router_z_loss_mlp": 0.23620605, + "step": 4839, + "time_per_iteration": 2.7612762451171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063218, + "balance_loss_mlp": 1.04080653, + "epoch": 0.9311273566756445, + "flos": 861666315264.0, + "grad_norm": 0.0537936621956177, + "language_loss": 0.76869768, + "learning_rate": 1.2387925558299984e-05, + "loss": 0.77932984, + "num_input_tokens_seen": 400801440, + "router_z_loss_mlp": 0.22412109, + "step": 4840, + "time_per_iteration": 3.152477502822876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070778, + "balance_loss_mlp": 1.0476867, + "epoch": 0.9313197383609081, + "flos": 548094366720.0, + "grad_norm": 0.06050694450577352, + "language_loss": 0.82516444, + "learning_rate": 1.231910112890411e-05, + "loss": 0.83587223, + "num_input_tokens_seen": 400873008, + "router_z_loss_mlp": 0.23083496, + "step": 4841, + "time_per_iteration": 2.6213271617889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069132, + "balance_loss_mlp": 1.04567182, + "epoch": 0.9315121200461716, + "flos": 468756315648.0, + "grad_norm": 0.07421356756847869, + "language_loss": 0.8145088, + "learning_rate": 1.2250466034024522e-05, + "loss": 0.82520008, + "num_input_tokens_seen": 400935328, + "router_z_loss_mlp": 0.234375, + "step": 4842, + "time_per_iteration": 2.519787549972534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066338, + "balance_loss_mlp": 1.04315162, + "epoch": 0.9317045017314352, + "flos": 417659701248.0, + "grad_norm": 0.06920394282735456, + "language_loss": 0.77784127, + "learning_rate": 1.2182020300307684e-05, + "loss": 0.78850466, + "num_input_tokens_seen": 401000720, + "router_z_loss_mlp": 0.23181152, + "step": 4843, + "time_per_iteration": 2.499638080596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065399, + "balance_loss_mlp": 1.04216504, + "epoch": 0.9318968834166987, + "flos": 540489065472.0, + "grad_norm": 0.05744852961978849, + "language_loss": 0.77309132, + "learning_rate": 1.2113763954326729e-05, + "loss": 0.78374529, + "num_input_tokens_seen": 401079664, + "router_z_loss_mlp": 0.23217773, + "step": 4844, + "time_per_iteration": 2.7709178924560547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066931, + "balance_loss_mlp": 1.04338753, + "epoch": 0.9320892651019623, + "flos": 521330452992.0, + "grad_norm": 0.05796617525488239, + "language_loss": 0.80614036, + "learning_rate": 1.2045697022581015e-05, + "loss": 0.81680971, + "num_input_tokens_seen": 401146160, + "router_z_loss_mlp": 0.23522949, + "step": 4845, + "time_per_iteration": 2.6631603240966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060552, + "balance_loss_mlp": 1.03811717, + "epoch": 0.9322816467872258, + "flos": 582072998400.0, + "grad_norm": 0.05607447986709382, + "language_loss": 0.80583751, + "learning_rate": 1.1977819531496348e-05, + "loss": 0.81644303, + "num_input_tokens_seen": 401223264, + "router_z_loss_mlp": 0.22424316, + "step": 4846, + "time_per_iteration": 2.787560224533081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062074, + "balance_loss_mlp": 1.03873289, + "epoch": 0.9324740284724894, + "flos": 484747338240.0, + "grad_norm": 0.06368063166693694, + "language_loss": 0.82036614, + "learning_rate": 1.191013150742537e-05, + "loss": 0.83098686, + "num_input_tokens_seen": 401296368, + "router_z_loss_mlp": 0.2331543, + "step": 4847, + "time_per_iteration": 2.7033135890960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067437, + "balance_loss_mlp": 1.04408431, + "epoch": 0.932666410157753, + "flos": 732585461760.0, + "grad_norm": 0.061109195979706835, + "language_loss": 0.82817626, + "learning_rate": 1.1842632976646672e-05, + "loss": 0.83885062, + "num_input_tokens_seen": 401383936, + "router_z_loss_mlp": 0.23352051, + "step": 4848, + "time_per_iteration": 3.0654428005218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065268, + "balance_loss_mlp": 1.04224896, + "epoch": 0.9328587918430166, + "flos": 965537127936.0, + "grad_norm": 0.05439499186899724, + "language_loss": 0.78829134, + "learning_rate": 1.1775323965365681e-05, + "loss": 0.798944, + "num_input_tokens_seen": 401468784, + "router_z_loss_mlp": 0.23010254, + "step": 4849, + "time_per_iteration": 3.2687015533447266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068837, + "balance_loss_mlp": 1.04509044, + "epoch": 0.9330511735282802, + "flos": 614552085504.0, + "grad_norm": 0.06187219780448243, + "language_loss": 0.80620849, + "learning_rate": 1.1708204499713936e-05, + "loss": 0.81689686, + "num_input_tokens_seen": 401539712, + "router_z_loss_mlp": 0.23706055, + "step": 4850, + "time_per_iteration": 2.7616846561431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064216, + "balance_loss_mlp": 1.04126775, + "epoch": 0.9332435552135436, + "flos": 559101823488.0, + "grad_norm": 0.057342153109796054, + "language_loss": 0.8581, + "learning_rate": 1.1641274605749653e-05, + "loss": 0.86874211, + "num_input_tokens_seen": 401610432, + "router_z_loss_mlp": 0.22937012, + "step": 4851, + "time_per_iteration": 2.715174913406372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069652, + "balance_loss_mlp": 1.0457387, + "epoch": 0.9334359368988072, + "flos": 515536358400.0, + "grad_norm": 0.057032353569238206, + "language_loss": 0.8192578, + "learning_rate": 1.1574534309457208e-05, + "loss": 0.82995433, + "num_input_tokens_seen": 401677344, + "router_z_loss_mlp": 0.23901367, + "step": 4852, + "time_per_iteration": 2.608262538909912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064317, + "balance_loss_mlp": 1.0412972, + "epoch": 0.9336283185840708, + "flos": 539809588224.0, + "grad_norm": 0.07695737048357706, + "language_loss": 0.82685083, + "learning_rate": 1.1507983636747488e-05, + "loss": 0.83749396, + "num_input_tokens_seen": 401756864, + "router_z_loss_mlp": 0.23022461, + "step": 4853, + "time_per_iteration": 2.755443811416626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01002947, + "balance_loss_mlp": 0.99627131, + "epoch": 0.9338207002693344, + "flos": 1562824751616.0, + "grad_norm": 0.004836287294760912, + "language_loss": 0.78455019, + "learning_rate": 1.1441622613457824e-05, + "loss": 0.79457963, + "num_input_tokens_seen": 401983664, + "router_z_loss_mlp": 0.06689453, + "step": 4854, + "time_per_iteration": 4.906665325164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064754, + "balance_loss_mlp": 1.04140043, + "epoch": 0.9340130819545979, + "flos": 645261811200.0, + "grad_norm": 0.05327327951901491, + "language_loss": 0.81519377, + "learning_rate": 1.1375451265351833e-05, + "loss": 0.82584137, + "num_input_tokens_seen": 402065744, + "router_z_loss_mlp": 0.23352051, + "step": 4855, + "time_per_iteration": 2.9034512042999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064262, + "balance_loss_mlp": 1.04152846, + "epoch": 0.9342054636398615, + "flos": 503441588736.0, + "grad_norm": 0.059211036123640204, + "language_loss": 0.77279824, + "learning_rate": 1.1309469618119516e-05, + "loss": 0.78344083, + "num_input_tokens_seen": 402137728, + "router_z_loss_mlp": 0.22729492, + "step": 4856, + "time_per_iteration": 2.6446688175201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065745, + "balance_loss_mlp": 1.04301167, + "epoch": 0.934397845325125, + "flos": 593026126848.0, + "grad_norm": 0.0688579272968578, + "language_loss": 0.84582496, + "learning_rate": 1.1243677697377109e-05, + "loss": 0.85648245, + "num_input_tokens_seen": 402220160, + "router_z_loss_mlp": 0.22741699, + "step": 4857, + "time_per_iteration": 2.8205456733703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063182, + "balance_loss_mlp": 1.04081821, + "epoch": 0.9345902270103886, + "flos": 499891129344.0, + "grad_norm": 0.05512577442694981, + "language_loss": 0.80538815, + "learning_rate": 1.1178075528667453e-05, + "loss": 0.81601995, + "num_input_tokens_seen": 402285168, + "router_z_loss_mlp": 0.22363281, + "step": 4858, + "time_per_iteration": 2.559678554534912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100331, + "balance_loss_mlp": 0.99663389, + "epoch": 0.9347826086956522, + "flos": 1520329347072.0, + "grad_norm": 0.003912108369513215, + "language_loss": 0.7598772, + "learning_rate": 1.1112663137459566e-05, + "loss": 0.76991028, + "num_input_tokens_seen": 402504912, + "router_z_loss_mlp": 0.06689453, + "step": 4859, + "time_per_iteration": 4.678871393203735 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064544, + "balance_loss_mlp": 1.04120278, + "epoch": 0.9349749903809157, + "flos": 504550923264.0, + "grad_norm": 0.07717239642081329, + "language_loss": 0.81449348, + "learning_rate": 1.1047440549148636e-05, + "loss": 0.82513893, + "num_input_tokens_seen": 402582032, + "router_z_loss_mlp": 0.23327637, + "step": 4860, + "time_per_iteration": 2.7969114780426025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106448, + "balance_loss_mlp": 1.04100811, + "epoch": 0.9351673720661793, + "flos": 568901200896.0, + "grad_norm": 0.07866007536371793, + "language_loss": 0.78877968, + "learning_rate": 1.0982407789056514e-05, + "loss": 0.79942441, + "num_input_tokens_seen": 402650144, + "router_z_loss_mlp": 0.23474121, + "step": 4861, + "time_per_iteration": 2.647122383117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061655, + "balance_loss_mlp": 1.03809905, + "epoch": 0.9353597537514429, + "flos": 544605576192.0, + "grad_norm": 0.06625680401164971, + "language_loss": 0.86626673, + "learning_rate": 1.0917564882430952e-05, + "loss": 0.87688333, + "num_input_tokens_seen": 402720368, + "router_z_loss_mlp": 0.23547363, + "step": 4862, + "time_per_iteration": 2.6402182579040527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061832, + "balance_loss_mlp": 1.0390749, + "epoch": 0.9355521354367065, + "flos": 518997984768.0, + "grad_norm": 0.045603108717518104, + "language_loss": 0.84642792, + "learning_rate": 1.0852911854446368e-05, + "loss": 0.85704625, + "num_input_tokens_seen": 402795568, + "router_z_loss_mlp": 0.22753906, + "step": 4863, + "time_per_iteration": 2.7369048595428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062849, + "balance_loss_mlp": 1.04003191, + "epoch": 0.93574451712197, + "flos": 446316314112.0, + "grad_norm": 0.06738020369816806, + "language_loss": 0.78858817, + "learning_rate": 1.0788448730203237e-05, + "loss": 0.79921663, + "num_input_tokens_seen": 402858784, + "router_z_loss_mlp": 0.22790527, + "step": 4864, + "time_per_iteration": 2.5434770584106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062216, + "balance_loss_mlp": 1.03907764, + "epoch": 0.9359368988072335, + "flos": 480517401600.0, + "grad_norm": 0.06755819858546042, + "language_loss": 0.77495611, + "learning_rate": 1.072417553472832e-05, + "loss": 0.78557825, + "num_input_tokens_seen": 402924144, + "router_z_loss_mlp": 0.23132324, + "step": 4865, + "time_per_iteration": 2.51901912689209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063392, + "balance_loss_mlp": 1.04114723, + "epoch": 0.9361292804924971, + "flos": 497118892032.0, + "grad_norm": 0.06722920807040486, + "language_loss": 0.85406494, + "learning_rate": 1.0660092292974766e-05, + "loss": 0.86469889, + "num_input_tokens_seen": 402987488, + "router_z_loss_mlp": 0.22265625, + "step": 4866, + "time_per_iteration": 2.626563787460327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062707, + "balance_loss_mlp": 1.04009354, + "epoch": 0.9363216621777607, + "flos": 618122368512.0, + "grad_norm": 0.058043061809792344, + "language_loss": 0.84187984, + "learning_rate": 1.059619902982184e-05, + "loss": 0.852507, + "num_input_tokens_seen": 403058224, + "router_z_loss_mlp": 0.22595215, + "step": 4867, + "time_per_iteration": 2.7498879432678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01002771, + "balance_loss_mlp": 0.99609506, + "epoch": 0.9365140438630243, + "flos": 1415929559040.0, + "grad_norm": 0.0036492832338999074, + "language_loss": 0.79203337, + "learning_rate": 1.053249577007509e-05, + "loss": 0.80206108, + "num_input_tokens_seen": 403289072, + "router_z_loss_mlp": 0.06689453, + "step": 4868, + "time_per_iteration": 4.8916120529174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064081, + "balance_loss_mlp": 1.04079998, + "epoch": 0.9367064255482878, + "flos": 590503509504.0, + "grad_norm": 0.05992901808998059, + "language_loss": 0.81489742, + "learning_rate": 1.0468982538466287e-05, + "loss": 0.82553828, + "num_input_tokens_seen": 403361728, + "router_z_loss_mlp": 0.23291016, + "step": 4869, + "time_per_iteration": 2.6934800148010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063, + "balance_loss_mlp": 1.04038572, + "epoch": 0.9368988072335513, + "flos": 526637790720.0, + "grad_norm": 0.05806012703031958, + "language_loss": 0.82367408, + "learning_rate": 1.0405659359653597e-05, + "loss": 0.83430409, + "num_input_tokens_seen": 403431536, + "router_z_loss_mlp": 0.22631836, + "step": 4870, + "time_per_iteration": 2.6466493606567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069124, + "balance_loss_mlp": 1.04546094, + "epoch": 0.9370911889188149, + "flos": 743205279744.0, + "grad_norm": 0.05796983188837931, + "language_loss": 0.79065406, + "learning_rate": 1.034252625822113e-05, + "loss": 0.80134535, + "num_input_tokens_seen": 403504768, + "router_z_loss_mlp": 0.2364502, + "step": 4871, + "time_per_iteration": 2.895653009414673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010612, + "balance_loss_mlp": 1.03949165, + "epoch": 0.9372835706040785, + "flos": 546038682624.0, + "grad_norm": 0.05945674857621565, + "language_loss": 0.79093826, + "learning_rate": 1.0279583258679448e-05, + "loss": 0.80155027, + "num_input_tokens_seen": 403575584, + "router_z_loss_mlp": 0.21716309, + "step": 4872, + "time_per_iteration": 2.643869638442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065754, + "balance_loss_mlp": 1.04218626, + "epoch": 0.9374759522893421, + "flos": 491633515008.0, + "grad_norm": 0.05500205513131922, + "language_loss": 0.81993419, + "learning_rate": 1.0216830385465003e-05, + "loss": 0.83059168, + "num_input_tokens_seen": 403648720, + "router_z_loss_mlp": 0.23571777, + "step": 4873, + "time_per_iteration": 2.6483852863311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068462, + "balance_loss_mlp": 1.04451275, + "epoch": 0.9376683339746056, + "flos": 578421222912.0, + "grad_norm": 0.058395935150287376, + "language_loss": 0.82688534, + "learning_rate": 1.0154267662940809e-05, + "loss": 0.83756995, + "num_input_tokens_seen": 403721392, + "router_z_loss_mlp": 0.23925781, + "step": 4874, + "time_per_iteration": 2.6801791191101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065584, + "balance_loss_mlp": 1.04139686, + "epoch": 0.9378607156598692, + "flos": 506290549248.0, + "grad_norm": 0.07310615221307289, + "language_loss": 0.80254924, + "learning_rate": 1.0091895115395766e-05, + "loss": 0.81320512, + "num_input_tokens_seen": 403792112, + "router_z_loss_mlp": 0.24182129, + "step": 4875, + "time_per_iteration": 2.646641492843628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062329, + "balance_loss_mlp": 1.03869009, + "epoch": 0.9380530973451328, + "flos": 520015915008.0, + "grad_norm": 0.07308635184773467, + "language_loss": 0.77842414, + "learning_rate": 1.0029712767045062e-05, + "loss": 0.78904748, + "num_input_tokens_seen": 403860928, + "router_z_loss_mlp": 0.23632812, + "step": 4876, + "time_per_iteration": 2.6062300205230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064255, + "balance_loss_mlp": 1.04104531, + "epoch": 0.9382454790303963, + "flos": 557799768576.0, + "grad_norm": 0.06375694667062341, + "language_loss": 0.84655243, + "learning_rate": 9.967720642029999e-06, + "loss": 0.85719502, + "num_input_tokens_seen": 403928240, + "router_z_loss_mlp": 0.23193359, + "step": 4877, + "time_per_iteration": 2.645962953567505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064251, + "balance_loss_mlp": 1.04075491, + "epoch": 0.9384378607156598, + "flos": 695476316160.0, + "grad_norm": 0.06248438585170171, + "language_loss": 0.81895542, + "learning_rate": 9.905918764418153e-06, + "loss": 0.82959789, + "num_input_tokens_seen": 404004320, + "router_z_loss_mlp": 0.23486328, + "step": 4878, + "time_per_iteration": 2.89534330368042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065421, + "balance_loss_mlp": 1.04216325, + "epoch": 0.9386302424009234, + "flos": 554750747136.0, + "grad_norm": 0.06499784582552266, + "language_loss": 0.81068319, + "learning_rate": 9.844307158203058e-06, + "loss": 0.82133734, + "num_input_tokens_seen": 404077040, + "router_z_loss_mlp": 0.23254395, + "step": 4879, + "time_per_iteration": 2.637808084487915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106776, + "balance_loss_mlp": 1.04457378, + "epoch": 0.938822624086187, + "flos": 566981337600.0, + "grad_norm": 0.06495588202273726, + "language_loss": 0.79970872, + "learning_rate": 9.782885847304469e-06, + "loss": 0.8103863, + "num_input_tokens_seen": 404145248, + "router_z_loss_mlp": 0.23181152, + "step": 4880, + "time_per_iteration": 2.6463756561279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068424, + "balance_loss_mlp": 1.04586959, + "epoch": 0.9390150057714506, + "flos": 417602801664.0, + "grad_norm": 0.055703733749820204, + "language_loss": 0.80662251, + "learning_rate": 9.721654855568196e-06, + "loss": 0.81730676, + "num_input_tokens_seen": 404212000, + "router_z_loss_mlp": 0.22546387, + "step": 4881, + "time_per_iteration": 2.5764780044555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064129, + "balance_loss_mlp": 1.04130054, + "epoch": 0.9392073874567142, + "flos": 1553839967232.0, + "grad_norm": 0.06423619472618826, + "language_loss": 0.76624274, + "learning_rate": 9.660614206766394e-06, + "loss": 0.77688408, + "num_input_tokens_seen": 404305408, + "router_z_loss_mlp": 0.22802734, + "step": 4882, + "time_per_iteration": 3.687774181365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066846, + "balance_loss_mlp": 1.0433979, + "epoch": 0.9393997691419776, + "flos": 652536000000.0, + "grad_norm": 0.061865375214900514, + "language_loss": 0.78620249, + "learning_rate": 9.59976392459705e-06, + "loss": 0.79687095, + "num_input_tokens_seen": 404383248, + "router_z_loss_mlp": 0.23449707, + "step": 4883, + "time_per_iteration": 2.762404441833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01002747, + "balance_loss_mlp": 0.9960711, + "epoch": 0.9395921508272412, + "flos": 1553294817792.0, + "grad_norm": 0.003644027064254583, + "language_loss": 0.78170681, + "learning_rate": 9.539104032684209e-06, + "loss": 0.79173422, + "num_input_tokens_seen": 404615264, + "router_z_loss_mlp": 0.06689453, + "step": 4884, + "time_per_iteration": 4.8244874477386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072097, + "balance_loss_mlp": 1.04874396, + "epoch": 0.9397845325125048, + "flos": 498144162816.0, + "grad_norm": 0.05308961569497824, + "language_loss": 0.78810966, + "learning_rate": 9.478634554578314e-06, + "loss": 0.79883063, + "num_input_tokens_seen": 404684656, + "router_z_loss_mlp": 0.23339844, + "step": 4885, + "time_per_iteration": 2.616199254989624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064493, + "balance_loss_mlp": 1.04230857, + "epoch": 0.9399769141977684, + "flos": 498596414976.0, + "grad_norm": 0.06818307389884938, + "language_loss": 0.83659059, + "learning_rate": 9.418355513755638e-06, + "loss": 0.8472355, + "num_input_tokens_seen": 404752096, + "router_z_loss_mlp": 0.22180176, + "step": 4886, + "time_per_iteration": 2.601780891418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003192, + "balance_loss_mlp": 0.99651599, + "epoch": 0.9401692958830319, + "flos": 1402500427776.0, + "grad_norm": 0.0034647396608713126, + "language_loss": 0.79332191, + "learning_rate": 9.358266933618575e-06, + "loss": 0.80335385, + "num_input_tokens_seen": 404980944, + "router_z_loss_mlp": 0.06689453, + "step": 4887, + "time_per_iteration": 4.828904151916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063627, + "balance_loss_mlp": 1.04023838, + "epoch": 0.9403616775682955, + "flos": 540123448320.0, + "grad_norm": 0.04725299945235895, + "language_loss": 0.85205865, + "learning_rate": 9.298368837495575e-06, + "loss": 0.86269492, + "num_input_tokens_seen": 405056688, + "router_z_loss_mlp": 0.23400879, + "step": 4888, + "time_per_iteration": 2.7369189262390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003192, + "balance_loss_mlp": 0.99651647, + "epoch": 0.9405540592535591, + "flos": 1322058184704.0, + "grad_norm": 0.0034657140386961023, + "language_loss": 0.75169432, + "learning_rate": 9.238661248641089e-06, + "loss": 0.76172626, + "num_input_tokens_seen": 405284656, + "router_z_loss_mlp": 0.06689453, + "step": 4889, + "time_per_iteration": 4.934547662734985 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062139, + "balance_loss_mlp": 1.03936982, + "epoch": 0.9407464409388226, + "flos": 572362827264.0, + "grad_norm": 0.08003864930040928, + "language_loss": 0.8301568, + "learning_rate": 9.179144190235799e-06, + "loss": 0.84077823, + "num_input_tokens_seen": 405351584, + "router_z_loss_mlp": 0.22766113, + "step": 4890, + "time_per_iteration": 2.6717724800109863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065219, + "balance_loss_mlp": 1.04135287, + "epoch": 0.9409388226240862, + "flos": 511264203264.0, + "grad_norm": 0.06453478244721644, + "language_loss": 0.77116787, + "learning_rate": 9.119817685386112e-06, + "loss": 0.78182006, + "num_input_tokens_seen": 405425712, + "router_z_loss_mlp": 0.23840332, + "step": 4891, + "time_per_iteration": 2.7476096153259277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003182, + "balance_loss_mlp": 0.99650663, + "epoch": 0.9411312043093497, + "flos": 1569901077504.0, + "grad_norm": 0.003464956988218373, + "language_loss": 0.80241883, + "learning_rate": 9.06068175712471e-06, + "loss": 0.81245065, + "num_input_tokens_seen": 405655760, + "router_z_loss_mlp": 0.06689453, + "step": 4892, + "time_per_iteration": 4.850857496261597 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067071, + "balance_loss_mlp": 1.04444468, + "epoch": 0.9413235859946133, + "flos": 569469450240.0, + "grad_norm": 0.06650955489841599, + "language_loss": 0.78249395, + "learning_rate": 9.001736428410234e-06, + "loss": 0.79316461, + "num_input_tokens_seen": 405731664, + "router_z_loss_mlp": 0.22619629, + "step": 4893, + "time_per_iteration": 2.7343316078186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065223, + "balance_loss_mlp": 1.04241848, + "epoch": 0.9415159676798769, + "flos": 781905747456.0, + "grad_norm": 0.06610671863402162, + "language_loss": 0.80428064, + "learning_rate": 8.942981722127263e-06, + "loss": 0.81493282, + "num_input_tokens_seen": 405808128, + "router_z_loss_mlp": 0.22814941, + "step": 4894, + "time_per_iteration": 2.9975225925445557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063059, + "balance_loss_mlp": 1.03932428, + "epoch": 0.9417083493651405, + "flos": 849341749248.0, + "grad_norm": 0.05474998370074054, + "language_loss": 0.80078602, + "learning_rate": 8.884417661086331e-06, + "loss": 0.81141663, + "num_input_tokens_seen": 405892448, + "router_z_loss_mlp": 0.23730469, + "step": 4895, + "time_per_iteration": 3.1650521755218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065606, + "balance_loss_mlp": 1.04243147, + "epoch": 0.941900731050404, + "flos": 529333304832.0, + "grad_norm": 0.06924558330939024, + "language_loss": 0.85654449, + "learning_rate": 8.826044268024025e-06, + "loss": 0.86720055, + "num_input_tokens_seen": 405966736, + "router_z_loss_mlp": 0.23144531, + "step": 4896, + "time_per_iteration": 2.689373731613159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062226, + "balance_loss_mlp": 1.03861082, + "epoch": 0.9420931127356675, + "flos": 557073303552.0, + "grad_norm": 0.05304188510469451, + "language_loss": 0.79942775, + "learning_rate": 8.767861565602997e-06, + "loss": 0.81005001, + "num_input_tokens_seen": 406043264, + "router_z_loss_mlp": 0.23583984, + "step": 4897, + "time_per_iteration": 2.7777161598205566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064715, + "balance_loss_mlp": 1.04198134, + "epoch": 0.9422854944209311, + "flos": 652543340544.0, + "grad_norm": 0.06756755796170955, + "language_loss": 0.86650455, + "learning_rate": 8.709869576411733e-06, + "loss": 0.87715167, + "num_input_tokens_seen": 406119552, + "router_z_loss_mlp": 0.22717285, + "step": 4898, + "time_per_iteration": 2.8182201385498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065783, + "balance_loss_mlp": 1.04313302, + "epoch": 0.9424778761061947, + "flos": 553685829120.0, + "grad_norm": 0.057617902456089394, + "language_loss": 0.84157532, + "learning_rate": 8.65206832296478e-06, + "loss": 0.85223317, + "num_input_tokens_seen": 406192464, + "router_z_loss_mlp": 0.22644043, + "step": 4899, + "time_per_iteration": 2.6730167865753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061325, + "balance_loss_mlp": 1.03837681, + "epoch": 0.9426702577914583, + "flos": 588559053312.0, + "grad_norm": 0.06833282478625986, + "language_loss": 0.79837596, + "learning_rate": 8.594457827702406e-06, + "loss": 0.80898917, + "num_input_tokens_seen": 406262640, + "router_z_loss_mlp": 0.22937012, + "step": 4900, + "time_per_iteration": 2.716416358947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106177, + "balance_loss_mlp": 1.03855944, + "epoch": 0.9428626394767218, + "flos": 616625021952.0, + "grad_norm": 0.07344461443720736, + "language_loss": 0.78565979, + "learning_rate": 8.537038112991114e-06, + "loss": 0.79627746, + "num_input_tokens_seen": 406341328, + "router_z_loss_mlp": 0.2322998, + "step": 4901, + "time_per_iteration": 2.8687591552734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106181, + "balance_loss_mlp": 1.03870749, + "epoch": 0.9430550211619854, + "flos": 610410981888.0, + "grad_norm": 0.06554635995861575, + "language_loss": 0.81850672, + "learning_rate": 8.479809201123178e-06, + "loss": 0.82912481, + "num_input_tokens_seen": 406418864, + "router_z_loss_mlp": 0.2310791, + "step": 4902, + "time_per_iteration": 2.7098758220672607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062796, + "balance_loss_mlp": 1.04032564, + "epoch": 0.943247402847249, + "flos": 565990571520.0, + "grad_norm": 0.05937812811143957, + "language_loss": 0.78285533, + "learning_rate": 8.422771114316885e-06, + "loss": 0.79348326, + "num_input_tokens_seen": 406492320, + "router_z_loss_mlp": 0.22460938, + "step": 4903, + "time_per_iteration": 2.7645294666290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067307, + "balance_loss_mlp": 1.04396605, + "epoch": 0.9434397845325125, + "flos": 527040483840.0, + "grad_norm": 0.06344384420879408, + "language_loss": 0.81625634, + "learning_rate": 8.365923874716297e-06, + "loss": 0.82692945, + "num_input_tokens_seen": 406560448, + "router_z_loss_mlp": 0.23339844, + "step": 4904, + "time_per_iteration": 2.635781764984131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066045, + "balance_loss_mlp": 1.0433718, + "epoch": 0.943632166217776, + "flos": 593451214848.0, + "grad_norm": 0.05889111669093531, + "language_loss": 0.83035827, + "learning_rate": 8.309267504391593e-06, + "loss": 0.84101868, + "num_input_tokens_seen": 406631376, + "router_z_loss_mlp": 0.2265625, + "step": 4905, + "time_per_iteration": 2.713594436645508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063603, + "balance_loss_mlp": 1.04072678, + "epoch": 0.9438245479030396, + "flos": 572770289664.0, + "grad_norm": 0.050528675559877, + "language_loss": 0.85556394, + "learning_rate": 8.252802025338623e-06, + "loss": 0.86620003, + "num_input_tokens_seen": 406713728, + "router_z_loss_mlp": 0.22875977, + "step": 4906, + "time_per_iteration": 2.828354597091675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066076, + "balance_loss_mlp": 1.04285407, + "epoch": 0.9440169295883032, + "flos": 488258523648.0, + "grad_norm": 0.05995138028136088, + "language_loss": 0.81837797, + "learning_rate": 8.196527459479242e-06, + "loss": 0.82903874, + "num_input_tokens_seen": 406779168, + "router_z_loss_mlp": 0.23217773, + "step": 4907, + "time_per_iteration": 2.5686655044555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063058, + "balance_loss_mlp": 1.03981233, + "epoch": 0.9442093112735668, + "flos": 731742999552.0, + "grad_norm": 0.0676773058435964, + "language_loss": 0.73885441, + "learning_rate": 8.140443828661137e-06, + "loss": 0.74948502, + "num_input_tokens_seen": 406860816, + "router_z_loss_mlp": 0.23242188, + "step": 4908, + "time_per_iteration": 2.980459451675415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060915, + "balance_loss_mlp": 1.03826475, + "epoch": 0.9444016929588304, + "flos": 571031036928.0, + "grad_norm": 0.06922149829853487, + "language_loss": 0.81962436, + "learning_rate": 8.084551154658004e-06, + "loss": 0.83023351, + "num_input_tokens_seen": 406929888, + "router_z_loss_mlp": 0.22668457, + "step": 4909, + "time_per_iteration": 2.657578229904175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063607, + "balance_loss_mlp": 1.0400275, + "epoch": 0.9445940746440938, + "flos": 509292582912.0, + "grad_norm": 0.06711231181162121, + "language_loss": 0.86318266, + "learning_rate": 8.028849459169318e-06, + "loss": 0.87381876, + "num_input_tokens_seen": 406998224, + "router_z_loss_mlp": 0.23583984, + "step": 4910, + "time_per_iteration": 2.610914707183838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067641, + "balance_loss_mlp": 1.04381144, + "epoch": 0.9447864563293574, + "flos": 624556293120.0, + "grad_norm": 0.07115824086322278, + "language_loss": 0.80972213, + "learning_rate": 7.97333876382028e-06, + "loss": 0.82039851, + "num_input_tokens_seen": 407075088, + "router_z_loss_mlp": 0.23828125, + "step": 4911, + "time_per_iteration": 2.835008144378662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064397, + "balance_loss_mlp": 1.04086518, + "epoch": 0.944978838014621, + "flos": 505270047744.0, + "grad_norm": 0.06363073940641663, + "language_loss": 0.80821174, + "learning_rate": 7.918019090162098e-06, + "loss": 0.81885576, + "num_input_tokens_seen": 407147792, + "router_z_loss_mlp": 0.23498535, + "step": 4912, + "time_per_iteration": 2.706355571746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004085, + "balance_loss_mlp": 0.99740899, + "epoch": 0.9451712196998846, + "flos": 1484205451776.0, + "grad_norm": 0.0037181691886226226, + "language_loss": 0.78287339, + "learning_rate": 7.862890459671812e-06, + "loss": 0.79291421, + "num_input_tokens_seen": 407387216, + "router_z_loss_mlp": 0.06689453, + "step": 4913, + "time_per_iteration": 4.950761318206787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064844, + "balance_loss_mlp": 1.04196763, + "epoch": 0.9453636013851482, + "flos": 521137732608.0, + "grad_norm": 0.06260745945333845, + "language_loss": 0.90274841, + "learning_rate": 7.80795289375219e-06, + "loss": 0.91339684, + "num_input_tokens_seen": 407457664, + "router_z_loss_mlp": 0.2286377, + "step": 4914, + "time_per_iteration": 2.673654079437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004093, + "balance_loss_mlp": 0.99741739, + "epoch": 0.9455559830704117, + "flos": 1496902975488.0, + "grad_norm": 0.003715128651478327, + "language_loss": 0.8356235, + "learning_rate": 7.75320641373195e-06, + "loss": 0.84566444, + "num_input_tokens_seen": 407700256, + "router_z_loss_mlp": 0.06689453, + "step": 4915, + "time_per_iteration": 4.9783148765563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066243, + "balance_loss_mlp": 1.04273486, + "epoch": 0.9457483647556753, + "flos": 498126910464.0, + "grad_norm": 0.056832232509337464, + "language_loss": 0.81964576, + "learning_rate": 7.698651040865534e-06, + "loss": 0.8303082, + "num_input_tokens_seen": 407770080, + "router_z_loss_mlp": 0.23486328, + "step": 4916, + "time_per_iteration": 2.621641159057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066257, + "balance_loss_mlp": 1.04305935, + "epoch": 0.9459407464409388, + "flos": 1019405979648.0, + "grad_norm": 0.05378819222671923, + "language_loss": 0.82485378, + "learning_rate": 7.644286796333222e-06, + "loss": 0.83551633, + "num_input_tokens_seen": 407854640, + "router_z_loss_mlp": 0.23168945, + "step": 4917, + "time_per_iteration": 3.4095513820648193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066193, + "balance_loss_mlp": 1.04319763, + "epoch": 0.9461331281262024, + "flos": 513589330944.0, + "grad_norm": 0.060476805644650536, + "language_loss": 0.81080627, + "learning_rate": 7.590113701241075e-06, + "loss": 0.82146823, + "num_input_tokens_seen": 407922704, + "router_z_loss_mlp": 0.22998047, + "step": 4918, + "time_per_iteration": 2.597259283065796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067245, + "balance_loss_mlp": 1.04371297, + "epoch": 0.9463255098114659, + "flos": 528023909376.0, + "grad_norm": 0.06478433442615823, + "language_loss": 0.78112018, + "learning_rate": 7.536131776620936e-06, + "loss": 0.79179263, + "num_input_tokens_seen": 407991136, + "router_z_loss_mlp": 0.23522949, + "step": 4919, + "time_per_iteration": 2.6052768230438232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106469, + "balance_loss_mlp": 1.04148018, + "epoch": 0.9465178914967295, + "flos": 506043500544.0, + "grad_norm": 0.08274002373229264, + "language_loss": 0.83619392, + "learning_rate": 7.482341043430485e-06, + "loss": 0.84684086, + "num_input_tokens_seen": 408056576, + "router_z_loss_mlp": 0.23193359, + "step": 4920, + "time_per_iteration": 2.5524473190307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010621, + "balance_loss_mlp": 1.03874683, + "epoch": 0.9467102731819931, + "flos": 660254727168.0, + "grad_norm": 0.07960983863422562, + "language_loss": 0.85914946, + "learning_rate": 7.428741522553184e-06, + "loss": 0.86977041, + "num_input_tokens_seen": 408136960, + "router_z_loss_mlp": 0.23339844, + "step": 4921, + "time_per_iteration": 2.9114954471588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063495, + "balance_loss_mlp": 1.04008269, + "epoch": 0.9469026548672567, + "flos": 675183403008.0, + "grad_norm": 0.06504331603068483, + "language_loss": 0.89519143, + "learning_rate": 7.375333234798054e-06, + "loss": 0.90582639, + "num_input_tokens_seen": 408218304, + "router_z_loss_mlp": 0.23400879, + "step": 4922, + "time_per_iteration": 2.917511463165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064556, + "balance_loss_mlp": 1.04024911, + "epoch": 0.9470950365525203, + "flos": 513964859904.0, + "grad_norm": 0.06142738243292726, + "language_loss": 0.79989028, + "learning_rate": 7.32211620090012e-06, + "loss": 0.81053579, + "num_input_tokens_seen": 408287936, + "router_z_loss_mlp": 0.24291992, + "step": 4923, + "time_per_iteration": 2.594297170639038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065563, + "balance_loss_mlp": 1.04267466, + "epoch": 0.9472874182377837, + "flos": 550103063040.0, + "grad_norm": 0.11094317680560754, + "language_loss": 0.81383359, + "learning_rate": 7.269090441520132e-06, + "loss": 0.82448924, + "num_input_tokens_seen": 408365568, + "router_z_loss_mlp": 0.22888184, + "step": 4924, + "time_per_iteration": 2.7379791736602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062378, + "balance_loss_mlp": 1.03937054, + "epoch": 0.9474797999230473, + "flos": 542769776640.0, + "grad_norm": 0.0655870162966716, + "language_loss": 0.80459619, + "learning_rate": 7.216255977244457e-06, + "loss": 0.81522, + "num_input_tokens_seen": 408431248, + "router_z_loss_mlp": 0.22998047, + "step": 4925, + "time_per_iteration": 2.6647040843963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063317, + "balance_loss_mlp": 1.03977299, + "epoch": 0.9476721816083109, + "flos": 844644879360.0, + "grad_norm": 0.057403972523752636, + "language_loss": 0.85956061, + "learning_rate": 7.163612828585242e-06, + "loss": 0.87019372, + "num_input_tokens_seen": 408514112, + "router_z_loss_mlp": 0.23522949, + "step": 4926, + "time_per_iteration": 3.1154632568359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065094, + "balance_loss_mlp": 1.04253972, + "epoch": 0.9478645632935745, + "flos": 638002676736.0, + "grad_norm": 0.05945675651641381, + "language_loss": 0.79295301, + "learning_rate": 7.1111610159803605e-06, + "loss": 0.80360401, + "num_input_tokens_seen": 408585968, + "router_z_loss_mlp": 0.22546387, + "step": 4927, + "time_per_iteration": 2.7876851558685303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062569, + "balance_loss_mlp": 1.04003882, + "epoch": 0.948056944978838, + "flos": 656832748032.0, + "grad_norm": 0.06197261766309, + "language_loss": 0.76143181, + "learning_rate": 7.058900559793469e-06, + "loss": 0.77205747, + "num_input_tokens_seen": 408665456, + "router_z_loss_mlp": 0.22521973, + "step": 4928, + "time_per_iteration": 2.8076236248016357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061132, + "balance_loss_mlp": 1.03792191, + "epoch": 0.9482493266641016, + "flos": 440907660288.0, + "grad_norm": 0.06807992051239511, + "language_loss": 0.83396912, + "learning_rate": 7.00683148031378e-06, + "loss": 0.84458041, + "num_input_tokens_seen": 408730192, + "router_z_loss_mlp": 0.23193359, + "step": 4929, + "time_per_iteration": 3.9939382076263428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067537, + "balance_loss_mlp": 1.04376614, + "epoch": 0.9484417083493651, + "flos": 545989123584.0, + "grad_norm": 0.0624157967268652, + "language_loss": 0.78332889, + "learning_rate": 6.9549537977564024e-06, + "loss": 0.7940042, + "num_input_tokens_seen": 408807616, + "router_z_loss_mlp": 0.23742676, + "step": 4930, + "time_per_iteration": 2.833548069000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066671, + "balance_loss_mlp": 1.04350913, + "epoch": 0.9486340900346287, + "flos": 538598937600.0, + "grad_norm": 0.054248039463331794, + "language_loss": 0.79658103, + "learning_rate": 6.903267532262003e-06, + "loss": 0.80724776, + "num_input_tokens_seen": 408883552, + "router_z_loss_mlp": 0.23144531, + "step": 4931, + "time_per_iteration": 2.6925623416900635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065473, + "balance_loss_mlp": 1.04097533, + "epoch": 0.9488264717198923, + "flos": 681669457920.0, + "grad_norm": 0.05662742875275373, + "language_loss": 0.85853779, + "learning_rate": 6.851772703896975e-06, + "loss": 0.86919254, + "num_input_tokens_seen": 408956400, + "router_z_loss_mlp": 0.24487305, + "step": 4932, + "time_per_iteration": 2.824794054031372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059786, + "balance_loss_mlp": 1.03684974, + "epoch": 0.9490188534051558, + "flos": 462603944448.0, + "grad_norm": 0.06760036759869348, + "language_loss": 0.87858427, + "learning_rate": 6.8004693326533805e-06, + "loss": 0.88918209, + "num_input_tokens_seen": 409019904, + "router_z_loss_mlp": 0.22949219, + "step": 4933, + "time_per_iteration": 2.523742914199829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064105, + "balance_loss_mlp": 1.04112101, + "epoch": 0.9492112350904194, + "flos": 543135393792.0, + "grad_norm": 0.05780879872940495, + "language_loss": 0.82862514, + "learning_rate": 6.7493574384489e-06, + "loss": 0.83926618, + "num_input_tokens_seen": 409094288, + "router_z_loss_mlp": 0.22973633, + "step": 4934, + "time_per_iteration": 2.7232470512390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058669, + "balance_loss_mlp": 1.03604269, + "epoch": 0.949403616775683, + "flos": 550322947584.0, + "grad_norm": 0.05314252301196156, + "language_loss": 0.84149778, + "learning_rate": 6.698437041126992e-06, + "loss": 0.85208452, + "num_input_tokens_seen": 409169120, + "router_z_loss_mlp": 0.22607422, + "step": 4935, + "time_per_iteration": 2.6983773708343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065017, + "balance_loss_mlp": 1.04250991, + "epoch": 0.9495959984609466, + "flos": 598383023616.0, + "grad_norm": 0.05800504294431128, + "language_loss": 0.83177608, + "learning_rate": 6.647708160456678e-06, + "loss": 0.84242618, + "num_input_tokens_seen": 409243200, + "router_z_loss_mlp": 0.22485352, + "step": 4936, + "time_per_iteration": 2.698988437652588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063299, + "balance_loss_mlp": 1.04014874, + "epoch": 0.94978838014621, + "flos": 608409626112.0, + "grad_norm": 0.05755130237878741, + "language_loss": 0.82729852, + "learning_rate": 6.597170816132702e-06, + "loss": 0.83793151, + "num_input_tokens_seen": 409319264, + "router_z_loss_mlp": 0.23132324, + "step": 4937, + "time_per_iteration": 2.7943813800811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066129, + "balance_loss_mlp": 1.04247832, + "epoch": 0.9499807618314736, + "flos": 540832660992.0, + "grad_norm": 0.05836751006762688, + "language_loss": 0.86831605, + "learning_rate": 6.546825027775427e-06, + "loss": 0.8789773, + "num_input_tokens_seen": 409389840, + "router_z_loss_mlp": 0.23620605, + "step": 4938, + "time_per_iteration": 2.627950668334961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106465, + "balance_loss_mlp": 1.0412848, + "epoch": 0.9501731435167372, + "flos": 594600196608.0, + "grad_norm": 0.05400011970264735, + "language_loss": 0.82876295, + "learning_rate": 6.496670814930717e-06, + "loss": 0.83940947, + "num_input_tokens_seen": 409458752, + "router_z_loss_mlp": 0.23364258, + "step": 4939, + "time_per_iteration": 2.6900975704193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061908, + "balance_loss_mlp": 1.03847146, + "epoch": 0.9503655252020008, + "flos": 454138928640.0, + "grad_norm": 0.06191054950237094, + "language_loss": 0.80292475, + "learning_rate": 6.446708197070161e-06, + "loss": 0.8135438, + "num_input_tokens_seen": 409525008, + "router_z_loss_mlp": 0.234375, + "step": 4940, + "time_per_iteration": 2.5250589847564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062795, + "balance_loss_mlp": 1.03952527, + "epoch": 0.9505579068872644, + "flos": 667944092160.0, + "grad_norm": 0.05771055255851234, + "language_loss": 0.84632671, + "learning_rate": 6.396937193591079e-06, + "loss": 0.85695469, + "num_input_tokens_seen": 409603376, + "router_z_loss_mlp": 0.23266602, + "step": 4941, + "time_per_iteration": 2.778857946395874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066363, + "balance_loss_mlp": 1.04283106, + "epoch": 0.9507502885725279, + "flos": 402207192576.0, + "grad_norm": 0.062273521229545936, + "language_loss": 0.818012, + "learning_rate": 6.347357823816235e-06, + "loss": 0.82867563, + "num_input_tokens_seen": 409667168, + "router_z_loss_mlp": 0.23535156, + "step": 4942, + "time_per_iteration": 2.4828665256500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063047, + "balance_loss_mlp": 1.04011106, + "epoch": 0.9509426702577914, + "flos": 700358565888.0, + "grad_norm": 0.058771865769073964, + "language_loss": 0.79593182, + "learning_rate": 6.297970106994011e-06, + "loss": 0.8065623, + "num_input_tokens_seen": 409746832, + "router_z_loss_mlp": 0.22912598, + "step": 4943, + "time_per_iteration": 2.98268723487854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063645, + "balance_loss_mlp": 1.04156792, + "epoch": 0.951135051943055, + "flos": 501415640064.0, + "grad_norm": 0.062322209370641694, + "language_loss": 0.82714278, + "learning_rate": 6.2487740622985126e-06, + "loss": 0.83777922, + "num_input_tokens_seen": 409813792, + "router_z_loss_mlp": 0.22070312, + "step": 4944, + "time_per_iteration": 2.57051157951355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062558, + "balance_loss_mlp": 1.03986096, + "epoch": 0.9513274336283186, + "flos": 614621094912.0, + "grad_norm": 0.0547837860585051, + "language_loss": 0.81620574, + "learning_rate": 6.1997697088292395e-06, + "loss": 0.82683134, + "num_input_tokens_seen": 409898848, + "router_z_loss_mlp": 0.22692871, + "step": 4945, + "time_per_iteration": 2.9212303161621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062052, + "balance_loss_mlp": 1.03908038, + "epoch": 0.9515198153135821, + "flos": 519586057728.0, + "grad_norm": 0.06872755929487602, + "language_loss": 0.81921458, + "learning_rate": 6.150957065611363e-06, + "loss": 0.82983512, + "num_input_tokens_seen": 409966368, + "router_z_loss_mlp": 0.22949219, + "step": 4946, + "time_per_iteration": 2.5664422512054443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065752, + "balance_loss_mlp": 1.04230392, + "epoch": 0.9517121969988457, + "flos": 664954168320.0, + "grad_norm": 0.05309985428265593, + "language_loss": 0.7667439, + "learning_rate": 6.102336151595667e-06, + "loss": 0.77740133, + "num_input_tokens_seen": 410048496, + "router_z_loss_mlp": 0.23449707, + "step": 4947, + "time_per_iteration": 2.97707200050354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062851, + "balance_loss_mlp": 1.03965259, + "epoch": 0.9519045786841093, + "flos": 676409107968.0, + "grad_norm": 0.06883652339076757, + "language_loss": 0.76289392, + "learning_rate": 6.053906985658553e-06, + "loss": 0.7735225, + "num_input_tokens_seen": 410121840, + "router_z_loss_mlp": 0.23193359, + "step": 4948, + "time_per_iteration": 2.8374738693237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067243, + "balance_loss_mlp": 1.04439044, + "epoch": 0.9520969603693729, + "flos": 652901617152.0, + "grad_norm": 0.058068098639605055, + "language_loss": 0.8045215, + "learning_rate": 6.005669586601814e-06, + "loss": 0.81519395, + "num_input_tokens_seen": 410199152, + "router_z_loss_mlp": 0.22851562, + "step": 4949, + "time_per_iteration": 2.82055401802063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063696, + "balance_loss_mlp": 1.04118955, + "epoch": 0.9522893420546364, + "flos": 743284200960.0, + "grad_norm": 0.050157087967027066, + "language_loss": 0.83311605, + "learning_rate": 5.957623973152748e-06, + "loss": 0.84375304, + "num_input_tokens_seen": 410285392, + "router_z_loss_mlp": 0.22521973, + "step": 4950, + "time_per_iteration": 3.023888111114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063533, + "balance_loss_mlp": 1.03994191, + "epoch": 0.9524817237398999, + "flos": 761696898048.0, + "grad_norm": 0.0679983686289968, + "language_loss": 0.81021792, + "learning_rate": 5.909770163964545e-06, + "loss": 0.82085323, + "num_input_tokens_seen": 410359872, + "router_z_loss_mlp": 0.23571777, + "step": 4951, + "time_per_iteration": 2.947537660598755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059285, + "balance_loss_mlp": 1.03614664, + "epoch": 0.9526741054251635, + "flos": 529125903360.0, + "grad_norm": 0.0629324357168766, + "language_loss": 0.81972486, + "learning_rate": 5.8621081776155105e-06, + "loss": 0.83031774, + "num_input_tokens_seen": 410425728, + "router_z_loss_mlp": 0.23120117, + "step": 4952, + "time_per_iteration": 2.5630245208740234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064616, + "balance_loss_mlp": 1.04153693, + "epoch": 0.9528664871104271, + "flos": 488441332224.0, + "grad_norm": 0.06485983174912871, + "language_loss": 0.80928588, + "learning_rate": 5.814638032609787e-06, + "loss": 0.81993204, + "num_input_tokens_seen": 410496080, + "router_z_loss_mlp": 0.23083496, + "step": 4953, + "time_per_iteration": 2.5676066875457764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061965, + "balance_loss_mlp": 1.03951788, + "epoch": 0.9530588687956907, + "flos": 517745115648.0, + "grad_norm": 0.05287711517893565, + "language_loss": 0.85460746, + "learning_rate": 5.76735974737691e-06, + "loss": 0.86522716, + "num_input_tokens_seen": 410576448, + "router_z_loss_mlp": 0.2244873, + "step": 4954, + "time_per_iteration": 2.8287205696105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066092, + "balance_loss_mlp": 1.04302478, + "epoch": 0.9532512504809542, + "flos": 675148898304.0, + "grad_norm": 0.06332484587058154, + "language_loss": 0.8102749, + "learning_rate": 5.720273340271864e-06, + "loss": 0.82093585, + "num_input_tokens_seen": 410655792, + "router_z_loss_mlp": 0.23059082, + "step": 4955, + "time_per_iteration": 2.8266055583953857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064085, + "balance_loss_mlp": 1.03993309, + "epoch": 0.9534436321662177, + "flos": 489523502592.0, + "grad_norm": 0.06865327401481261, + "language_loss": 0.84251958, + "learning_rate": 5.673378829575249e-06, + "loss": 0.8531605, + "num_input_tokens_seen": 410725440, + "router_z_loss_mlp": 0.24121094, + "step": 4956, + "time_per_iteration": 2.5656938552856445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065867, + "balance_loss_mlp": 1.04219234, + "epoch": 0.9536360138514813, + "flos": 496585147392.0, + "grad_norm": 0.0615272303032046, + "language_loss": 0.8198781, + "learning_rate": 5.626676233493167e-06, + "loss": 0.83053672, + "num_input_tokens_seen": 410797552, + "router_z_loss_mlp": 0.23657227, + "step": 4957, + "time_per_iteration": 2.621752977371216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067276, + "balance_loss_mlp": 1.04463863, + "epoch": 0.9538283955367449, + "flos": 801462283776.0, + "grad_norm": 0.05741952715263904, + "language_loss": 0.84171546, + "learning_rate": 5.580165570157114e-06, + "loss": 0.8523882, + "num_input_tokens_seen": 410876736, + "router_z_loss_mlp": 0.22644043, + "step": 4958, + "time_per_iteration": 3.0494062900543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010663, + "balance_loss_mlp": 1.04333985, + "epoch": 0.9540207772220085, + "flos": 556668039168.0, + "grad_norm": 0.048223393326200514, + "language_loss": 0.80173922, + "learning_rate": 5.533846857624203e-06, + "loss": 0.81240225, + "num_input_tokens_seen": 410955632, + "router_z_loss_mlp": 0.22949219, + "step": 4959, + "time_per_iteration": 2.7545664310455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061401, + "balance_loss_mlp": 1.03826261, + "epoch": 0.954213158907272, + "flos": 684505935360.0, + "grad_norm": 0.055758007049536804, + "language_loss": 0.81805837, + "learning_rate": 5.487720113876882e-06, + "loss": 0.82867241, + "num_input_tokens_seen": 411038480, + "router_z_loss_mlp": 0.23156738, + "step": 4960, + "time_per_iteration": 2.9386799335479736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064507, + "balance_loss_mlp": 1.04121327, + "epoch": 0.9544055405925356, + "flos": 535752548352.0, + "grad_norm": 0.08819296696550104, + "language_loss": 0.8276701, + "learning_rate": 5.441785356823214e-06, + "loss": 0.83831513, + "num_input_tokens_seen": 411109744, + "router_z_loss_mlp": 0.23303223, + "step": 4961, + "time_per_iteration": 2.7551167011260986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066455, + "balance_loss_mlp": 1.04275656, + "epoch": 0.9545979222777992, + "flos": 825404401152.0, + "grad_norm": 0.06311984743506972, + "language_loss": 0.80238789, + "learning_rate": 5.3960426042965476e-06, + "loss": 0.81305242, + "num_input_tokens_seen": 411202192, + "router_z_loss_mlp": 0.23693848, + "step": 4962, + "time_per_iteration": 3.1598803997039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064254, + "balance_loss_mlp": 1.04111564, + "epoch": 0.9547903039630627, + "flos": 761691755520.0, + "grad_norm": 0.05994282205738248, + "language_loss": 0.77416027, + "learning_rate": 5.3504918740558405e-06, + "loss": 0.78480279, + "num_input_tokens_seen": 411289248, + "router_z_loss_mlp": 0.23144531, + "step": 4963, + "time_per_iteration": 3.1220405101776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067536, + "balance_loss_mlp": 1.04460001, + "epoch": 0.9549826856483262, + "flos": 515306562048.0, + "grad_norm": 0.060039013851122376, + "language_loss": 0.83022189, + "learning_rate": 5.3051331837855045e-06, + "loss": 0.84089726, + "num_input_tokens_seen": 411355232, + "router_z_loss_mlp": 0.22961426, + "step": 4964, + "time_per_iteration": 2.5976028442382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065198, + "balance_loss_mlp": 1.04267979, + "epoch": 0.9551750673335898, + "flos": 643107382272.0, + "grad_norm": 0.05890053188259814, + "language_loss": 0.82927918, + "learning_rate": 5.259966551095341e-06, + "loss": 0.83993113, + "num_input_tokens_seen": 411432288, + "router_z_loss_mlp": 0.22509766, + "step": 4965, + "time_per_iteration": 2.8856465816497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063869, + "balance_loss_mlp": 1.04065895, + "epoch": 0.9553674490188534, + "flos": 472208030208.0, + "grad_norm": 0.05765142290447974, + "language_loss": 0.83205676, + "learning_rate": 5.214991993520546e-06, + "loss": 0.84269542, + "num_input_tokens_seen": 411499376, + "router_z_loss_mlp": 0.23205566, + "step": 4966, + "time_per_iteration": 2.5833020210266113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068625, + "balance_loss_mlp": 1.04520023, + "epoch": 0.955559830704117, + "flos": 528317945856.0, + "grad_norm": 0.06557271954059918, + "language_loss": 0.82097799, + "learning_rate": 5.170209528521763e-06, + "loss": 0.8316642, + "num_input_tokens_seen": 411564976, + "router_z_loss_mlp": 0.23400879, + "step": 4967, + "time_per_iteration": 2.5960209369659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062797, + "balance_loss_mlp": 1.03951526, + "epoch": 0.9557522123893806, + "flos": 548168518656.0, + "grad_norm": 0.058067854328256556, + "language_loss": 0.84217876, + "learning_rate": 5.125619173485196e-06, + "loss": 0.85280675, + "num_input_tokens_seen": 411636464, + "router_z_loss_mlp": 0.23266602, + "step": 4968, + "time_per_iteration": 2.653374195098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062828, + "balance_loss_mlp": 1.03951049, + "epoch": 0.955944594074644, + "flos": 509465479680.0, + "grad_norm": 0.05965226251084018, + "language_loss": 0.82160389, + "learning_rate": 5.08122094572222e-06, + "loss": 0.83223224, + "num_input_tokens_seen": 411710672, + "router_z_loss_mlp": 0.2331543, + "step": 4969, + "time_per_iteration": 2.675502061843872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065215, + "balance_loss_mlp": 1.04239869, + "epoch": 0.9561369757599076, + "flos": 527578997760.0, + "grad_norm": 0.05789220147713751, + "language_loss": 0.79880643, + "learning_rate": 5.037014862469824e-06, + "loss": 0.80945861, + "num_input_tokens_seen": 411785616, + "router_z_loss_mlp": 0.22827148, + "step": 4970, + "time_per_iteration": 2.7603402137756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066302, + "balance_loss_mlp": 1.04406953, + "epoch": 0.9563293574451712, + "flos": 498201062400.0, + "grad_norm": 0.07045752519217358, + "language_loss": 0.80387723, + "learning_rate": 4.993000940890391e-06, + "loss": 0.81454021, + "num_input_tokens_seen": 411854832, + "router_z_loss_mlp": 0.22241211, + "step": 4971, + "time_per_iteration": 2.6467912197113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003461, + "balance_loss_mlp": 0.99678528, + "epoch": 0.9565217391304348, + "flos": 1408875628032.0, + "grad_norm": 0.0039371398806634095, + "language_loss": 0.81773561, + "learning_rate": 4.949179198071585e-06, + "loss": 0.82777023, + "num_input_tokens_seen": 412081856, + "router_z_loss_mlp": 0.06689453, + "step": 4972, + "time_per_iteration": 4.884695291519165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063136, + "balance_loss_mlp": 1.04022372, + "epoch": 0.9567141208156984, + "flos": 503846853120.0, + "grad_norm": 0.058137821309089344, + "language_loss": 0.78417355, + "learning_rate": 4.905549651026464e-06, + "loss": 0.79480487, + "num_input_tokens_seen": 412155600, + "router_z_loss_mlp": 0.22912598, + "step": 4973, + "time_per_iteration": 2.7980639934539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063517, + "balance_loss_mlp": 1.04048562, + "epoch": 0.9569065025009619, + "flos": 433213526016.0, + "grad_norm": 0.06846975795352252, + "language_loss": 0.79742157, + "learning_rate": 4.86211231669359e-06, + "loss": 0.80805671, + "num_input_tokens_seen": 412219584, + "router_z_loss_mlp": 0.23022461, + "step": 4974, + "time_per_iteration": 2.478752613067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064289, + "balance_loss_mlp": 1.04140127, + "epoch": 0.9570988841862255, + "flos": 589959853056.0, + "grad_norm": 0.06934685034456788, + "language_loss": 0.78280979, + "learning_rate": 4.818867211936806e-06, + "loss": 0.79345274, + "num_input_tokens_seen": 412295088, + "router_z_loss_mlp": 0.2286377, + "step": 4975, + "time_per_iteration": 2.7857437133789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064696, + "balance_loss_mlp": 1.04105639, + "epoch": 0.957291265871489, + "flos": 767278448640.0, + "grad_norm": 0.0946486602746724, + "language_loss": 0.78849113, + "learning_rate": 4.7758143535454045e-06, + "loss": 0.79913813, + "num_input_tokens_seen": 412376992, + "router_z_loss_mlp": 0.23632812, + "step": 4976, + "time_per_iteration": 2.942918062210083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063015, + "balance_loss_mlp": 1.04046106, + "epoch": 0.9574836475567526, + "flos": 639104670720.0, + "grad_norm": 0.07529166601163789, + "language_loss": 0.84990984, + "learning_rate": 4.732953758233849e-06, + "loss": 0.86054003, + "num_input_tokens_seen": 412450064, + "router_z_loss_mlp": 0.22546387, + "step": 4977, + "time_per_iteration": 2.7789368629455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003403, + "balance_loss_mlp": 0.99677485, + "epoch": 0.9576760292420161, + "flos": 1575939649536.0, + "grad_norm": 0.003929886062785851, + "language_loss": 0.78607261, + "learning_rate": 4.690285442642272e-06, + "loss": 0.79610658, + "num_input_tokens_seen": 412676896, + "router_z_loss_mlp": 0.06640625, + "step": 4978, + "time_per_iteration": 4.893427848815918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065644, + "balance_loss_mlp": 1.04185009, + "epoch": 0.9578684109272797, + "flos": 496345439232.0, + "grad_norm": 0.05080656218249429, + "language_loss": 0.87270832, + "learning_rate": 4.6478094233358695e-06, + "loss": 0.8833648, + "num_input_tokens_seen": 412746848, + "router_z_loss_mlp": 0.23779297, + "step": 4979, + "time_per_iteration": 2.6311700344085693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066355, + "balance_loss_mlp": 1.04242969, + "epoch": 0.9580607926125433, + "flos": 429954531840.0, + "grad_norm": 0.08390852397863828, + "language_loss": 0.85619473, + "learning_rate": 4.605525716805337e-06, + "loss": 0.8668583, + "num_input_tokens_seen": 412810144, + "router_z_loss_mlp": 0.23913574, + "step": 4980, + "time_per_iteration": 2.4578068256378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065714, + "balance_loss_mlp": 1.04258728, + "epoch": 0.9582531742978069, + "flos": 1127262251520.0, + "grad_norm": 0.05376140040349082, + "language_loss": 0.80332768, + "learning_rate": 4.563434339466599e-06, + "loss": 0.81398481, + "num_input_tokens_seen": 412904768, + "router_z_loss_mlp": 0.23144531, + "step": 4981, + "time_per_iteration": 3.5303521156311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066494, + "balance_loss_mlp": 1.04290295, + "epoch": 0.9584455559830705, + "flos": 524458395648.0, + "grad_norm": 0.06188839718179842, + "language_loss": 0.79525679, + "learning_rate": 4.521535307661085e-06, + "loss": 0.80592173, + "num_input_tokens_seen": 412974592, + "router_z_loss_mlp": 0.23596191, + "step": 4982, + "time_per_iteration": 2.6728785037994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066093, + "balance_loss_mlp": 1.04308546, + "epoch": 0.9586379376683339, + "flos": 634187543040.0, + "grad_norm": 0.05647162716885676, + "language_loss": 0.80907547, + "learning_rate": 4.479828637655392e-06, + "loss": 0.81973636, + "num_input_tokens_seen": 413052848, + "router_z_loss_mlp": 0.23010254, + "step": 4983, + "time_per_iteration": 2.8294382095336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067857, + "balance_loss_mlp": 1.04449201, + "epoch": 0.9588303193535975, + "flos": 416061038592.0, + "grad_norm": 0.06368713584012144, + "language_loss": 0.83991754, + "learning_rate": 4.438314345641459e-06, + "loss": 0.85059619, + "num_input_tokens_seen": 413118000, + "router_z_loss_mlp": 0.23352051, + "step": 4984, + "time_per_iteration": 2.4864554405212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058115, + "balance_loss_mlp": 1.03466618, + "epoch": 0.9590227010388611, + "flos": 481683635712.0, + "grad_norm": 0.06774888803703895, + "language_loss": 0.78104466, + "learning_rate": 4.3969924477365585e-06, + "loss": 0.79162586, + "num_input_tokens_seen": 413185616, + "router_z_loss_mlp": 0.23425293, + "step": 4985, + "time_per_iteration": 2.59116530418396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059271, + "balance_loss_mlp": 1.03722906, + "epoch": 0.9592150827241247, + "flos": 684540440064.0, + "grad_norm": 0.06301853068999348, + "language_loss": 0.80390298, + "learning_rate": 4.355862959983359e-06, + "loss": 0.81449568, + "num_input_tokens_seen": 413265616, + "router_z_loss_mlp": 0.22045898, + "step": 4986, + "time_per_iteration": 2.954206705093384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063557, + "balance_loss_mlp": 1.04100311, + "epoch": 0.9594074644093882, + "flos": 574490092032.0, + "grad_norm": 0.053195859640902336, + "language_loss": 0.70766115, + "learning_rate": 4.314925898349642e-06, + "loss": 0.71829671, + "num_input_tokens_seen": 413341248, + "router_z_loss_mlp": 0.2253418, + "step": 4987, + "time_per_iteration": 2.7368128299713135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067667, + "balance_loss_mlp": 1.04468322, + "epoch": 0.9595998460946518, + "flos": 546871233024.0, + "grad_norm": 0.06464800221271895, + "language_loss": 0.78132504, + "learning_rate": 4.2741812787286395e-06, + "loss": 0.79200172, + "num_input_tokens_seen": 413416080, + "router_z_loss_mlp": 0.22973633, + "step": 4988, + "time_per_iteration": 2.771296977996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106073, + "balance_loss_mlp": 1.03774643, + "epoch": 0.9597922277799154, + "flos": 474043829760.0, + "grad_norm": 0.0631980320315198, + "language_loss": 0.78296041, + "learning_rate": 4.233629116938809e-06, + "loss": 0.79356772, + "num_input_tokens_seen": 413482336, + "router_z_loss_mlp": 0.22998047, + "step": 4989, + "time_per_iteration": 2.5766162872314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064136, + "balance_loss_mlp": 1.04084253, + "epoch": 0.9599846094651789, + "flos": 514691324928.0, + "grad_norm": 0.05683022845710803, + "language_loss": 0.8579731, + "learning_rate": 4.193269428723889e-06, + "loss": 0.86861444, + "num_input_tokens_seen": 413553248, + "router_z_loss_mlp": 0.23278809, + "step": 4990, + "time_per_iteration": 2.5965628623962402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062236, + "balance_loss_mlp": 1.03919303, + "epoch": 0.9601769911504425, + "flos": 594983066112.0, + "grad_norm": 0.06110034995600321, + "language_loss": 0.78472888, + "learning_rate": 4.1531022297529035e-06, + "loss": 0.79535127, + "num_input_tokens_seen": 413625776, + "router_z_loss_mlp": 0.23034668, + "step": 4991, + "time_per_iteration": 2.760037660598755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066688, + "balance_loss_mlp": 1.04364491, + "epoch": 0.960369372835706, + "flos": 493012293120.0, + "grad_norm": 0.05123755509819416, + "language_loss": 0.79453242, + "learning_rate": 4.1131275356201536e-06, + "loss": 0.80519927, + "num_input_tokens_seen": 413693056, + "router_z_loss_mlp": 0.23022461, + "step": 4992, + "time_per_iteration": 2.6341288089752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065116, + "balance_loss_mlp": 1.04189432, + "epoch": 0.9605617545209696, + "flos": 579293420544.0, + "grad_norm": 0.04990578397455196, + "language_loss": 0.82886499, + "learning_rate": 4.073345361845171e-06, + "loss": 0.83951616, + "num_input_tokens_seen": 413765616, + "router_z_loss_mlp": 0.23193359, + "step": 4993, + "time_per_iteration": 2.673229217529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062918, + "balance_loss_mlp": 1.04065001, + "epoch": 0.9607541362062332, + "flos": 927708857856.0, + "grad_norm": 0.054118063874610094, + "language_loss": 0.86487305, + "learning_rate": 4.033755723872767e-06, + "loss": 0.87550223, + "num_input_tokens_seen": 413850976, + "router_z_loss_mlp": 0.22265625, + "step": 4994, + "time_per_iteration": 3.2317514419555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106405, + "balance_loss_mlp": 1.04008889, + "epoch": 0.9609465178914968, + "flos": 573121225728.0, + "grad_norm": 0.059331665161215484, + "language_loss": 0.75806749, + "learning_rate": 3.994358637073036e-06, + "loss": 0.76870805, + "num_input_tokens_seen": 413931648, + "router_z_loss_mlp": 0.23950195, + "step": 4995, + "time_per_iteration": 2.8493144512176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062782, + "balance_loss_mlp": 1.03969097, + "epoch": 0.9611388995767602, + "flos": 530850475008.0, + "grad_norm": 0.05160071442281887, + "language_loss": 0.85746717, + "learning_rate": 3.955154116741244e-06, + "loss": 0.86809498, + "num_input_tokens_seen": 414003216, + "router_z_loss_mlp": 0.23083496, + "step": 4996, + "time_per_iteration": 2.6498258113861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059755, + "balance_loss_mlp": 1.03623509, + "epoch": 0.9613312812620238, + "flos": 646247808000.0, + "grad_norm": 0.05978415787629307, + "language_loss": 0.82224667, + "learning_rate": 3.916142178097881e-06, + "loss": 0.8328442, + "num_input_tokens_seen": 414077072, + "router_z_loss_mlp": 0.23522949, + "step": 4997, + "time_per_iteration": 2.825035333633423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065595, + "balance_loss_mlp": 1.04313636, + "epoch": 0.9615236629472874, + "flos": 496152718848.0, + "grad_norm": 0.05181929898036546, + "language_loss": 0.78011382, + "learning_rate": 3.877322836288888e-06, + "loss": 0.79076982, + "num_input_tokens_seen": 414157600, + "router_z_loss_mlp": 0.2244873, + "step": 4998, + "time_per_iteration": 2.841170310974121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062329, + "balance_loss_mlp": 1.03926229, + "epoch": 0.961716044632551, + "flos": 512974093824.0, + "grad_norm": 0.05566555595708184, + "language_loss": 0.75968587, + "learning_rate": 3.838696106385153e-06, + "loss": 0.77030915, + "num_input_tokens_seen": 414224880, + "router_z_loss_mlp": 0.23059082, + "step": 4999, + "time_per_iteration": 2.6079280376434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068691, + "balance_loss_mlp": 1.04517114, + "epoch": 0.9619084263178146, + "flos": 501084527616.0, + "grad_norm": 0.0634969920505646, + "language_loss": 0.80885196, + "learning_rate": 3.800262003382904e-06, + "loss": 0.81953883, + "num_input_tokens_seen": 414291728, + "router_z_loss_mlp": 0.23498535, + "step": 5000, + "time_per_iteration": 2.5831449031829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106539, + "balance_loss_mlp": 1.04257321, + "epoch": 0.9621008080030781, + "flos": 595635379200.0, + "grad_norm": 0.0788330829504222, + "language_loss": 0.74819994, + "learning_rate": 3.7620205422035923e-06, + "loss": 0.75885379, + "num_input_tokens_seen": 414369568, + "router_z_loss_mlp": 0.22790527, + "step": 5001, + "time_per_iteration": 2.8088111877441406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066486, + "balance_loss_mlp": 1.0436573, + "epoch": 0.9622931896883417, + "flos": 502250761728.0, + "grad_norm": 0.06376625274094212, + "language_loss": 0.82308555, + "learning_rate": 3.723971737693899e-06, + "loss": 0.83375037, + "num_input_tokens_seen": 414441424, + "router_z_loss_mlp": 0.22827148, + "step": 5002, + "time_per_iteration": 2.645930767059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063524, + "balance_loss_mlp": 1.03970599, + "epoch": 0.9624855713736052, + "flos": 607287808512.0, + "grad_norm": 0.05456162093483777, + "language_loss": 0.80836141, + "learning_rate": 3.6861156046256728e-06, + "loss": 0.81899667, + "num_input_tokens_seen": 414512960, + "router_z_loss_mlp": 0.23815918, + "step": 5003, + "time_per_iteration": 2.7728164196014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065524, + "balance_loss_mlp": 1.0426712, + "epoch": 0.9626779530588688, + "flos": 510715777536.0, + "grad_norm": 0.07008234047327347, + "language_loss": 0.84970057, + "learning_rate": 3.648452157695936e-06, + "loss": 0.86035579, + "num_input_tokens_seen": 414577392, + "router_z_loss_mlp": 0.22827148, + "step": 5004, + "time_per_iteration": 2.5669493675231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066163, + "balance_loss_mlp": 1.04288161, + "epoch": 0.9628703347441323, + "flos": 627294025728.0, + "grad_norm": 0.06552748478093777, + "language_loss": 0.82791591, + "learning_rate": 3.610981411526937e-06, + "loss": 0.83857757, + "num_input_tokens_seen": 414655152, + "router_z_loss_mlp": 0.23266602, + "step": 5005, + "time_per_iteration": 2.8153672218322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068034, + "balance_loss_mlp": 1.04431152, + "epoch": 0.9630627164293959, + "flos": 630758223360.0, + "grad_norm": 0.05992229311372926, + "language_loss": 0.77596062, + "learning_rate": 3.573703380666149e-06, + "loss": 0.786641, + "num_input_tokens_seen": 414730432, + "router_z_loss_mlp": 0.23693848, + "step": 5006, + "time_per_iteration": 2.7764079570770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063766, + "balance_loss_mlp": 1.0408895, + "epoch": 0.9632550981146595, + "flos": 570558961152.0, + "grad_norm": 0.05008951632274407, + "language_loss": 0.78372812, + "learning_rate": 3.5366180795861622e-06, + "loss": 0.79436582, + "num_input_tokens_seen": 414810688, + "router_z_loss_mlp": 0.2286377, + "step": 5007, + "time_per_iteration": 2.8054816722869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062955, + "balance_loss_mlp": 1.03951883, + "epoch": 0.9634474797999231, + "flos": 466117327872.0, + "grad_norm": 0.06832318603954099, + "language_loss": 0.80910051, + "learning_rate": 3.4997255226847937e-06, + "loss": 0.81973004, + "num_input_tokens_seen": 414880544, + "router_z_loss_mlp": 0.234375, + "step": 5008, + "time_per_iteration": 2.6352286338806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064861, + "balance_loss_mlp": 1.04144847, + "epoch": 0.9636398614851867, + "flos": 526600714752.0, + "grad_norm": 0.07994370035866022, + "language_loss": 0.85493284, + "learning_rate": 3.463025724284974e-06, + "loss": 0.86558145, + "num_input_tokens_seen": 414949920, + "router_z_loss_mlp": 0.23413086, + "step": 5009, + "time_per_iteration": 2.603555917739868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060923, + "balance_loss_mlp": 1.03749835, + "epoch": 0.9638322431704501, + "flos": 564831677952.0, + "grad_norm": 0.058787878064300095, + "language_loss": 0.752451, + "learning_rate": 3.4265186986348618e-06, + "loss": 0.76306027, + "num_input_tokens_seen": 415024288, + "router_z_loss_mlp": 0.234375, + "step": 5010, + "time_per_iteration": 2.7728347778320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067099, + "balance_loss_mlp": 1.04449689, + "epoch": 0.9640246248557137, + "flos": 477772328448.0, + "grad_norm": 0.08181968777797018, + "language_loss": 0.84829634, + "learning_rate": 3.3902044599076754e-06, + "loss": 0.85896736, + "num_input_tokens_seen": 415092032, + "router_z_loss_mlp": 0.22619629, + "step": 5011, + "time_per_iteration": 2.6068520545959473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060574, + "balance_loss_mlp": 1.03792429, + "epoch": 0.9642170065409773, + "flos": 539318062080.0, + "grad_norm": 0.0552728686967885, + "language_loss": 0.88548124, + "learning_rate": 3.354083022201859e-06, + "loss": 0.89608705, + "num_input_tokens_seen": 415158544, + "router_z_loss_mlp": 0.22644043, + "step": 5012, + "time_per_iteration": 2.6157679557800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062074, + "balance_loss_mlp": 1.03952003, + "epoch": 0.9644093882262409, + "flos": 523754325504.0, + "grad_norm": 0.05861545033938659, + "language_loss": 0.83724725, + "learning_rate": 3.3181543995410843e-06, + "loss": 0.84786797, + "num_input_tokens_seen": 415225088, + "router_z_loss_mlp": 0.22546387, + "step": 5013, + "time_per_iteration": 2.5874183177948 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063855, + "balance_loss_mlp": 1.04178977, + "epoch": 0.9646017699115044, + "flos": 574290031104.0, + "grad_norm": 0.06290318681113366, + "language_loss": 0.79001546, + "learning_rate": 3.2824186058740268e-06, + "loss": 0.80065405, + "num_input_tokens_seen": 415300224, + "router_z_loss_mlp": 0.22070312, + "step": 5014, + "time_per_iteration": 2.697143077850342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067718, + "balance_loss_mlp": 1.04372108, + "epoch": 0.964794151596768, + "flos": 636799366656.0, + "grad_norm": 0.07022535288197708, + "language_loss": 0.84282309, + "learning_rate": 3.246875655074588e-06, + "loss": 0.85350025, + "num_input_tokens_seen": 415368784, + "router_z_loss_mlp": 0.23986816, + "step": 5015, + "time_per_iteration": 2.721156358718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067023, + "balance_loss_mlp": 1.04393208, + "epoch": 0.9649865332820315, + "flos": 617435550720.0, + "grad_norm": 0.06772054018077114, + "language_loss": 0.86324394, + "learning_rate": 3.211525560941675e-06, + "loss": 0.87391412, + "num_input_tokens_seen": 415440752, + "router_z_loss_mlp": 0.23095703, + "step": 5016, + "time_per_iteration": 2.7218070030212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066045, + "balance_loss_mlp": 1.04256094, + "epoch": 0.9651789149672951, + "flos": 516183528960.0, + "grad_norm": 0.054732773145454494, + "language_loss": 0.81263906, + "learning_rate": 3.1763683371994754e-06, + "loss": 0.82329947, + "num_input_tokens_seen": 415516128, + "router_z_loss_mlp": 0.23461914, + "step": 5017, + "time_per_iteration": 2.763035535812378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062971, + "balance_loss_mlp": 1.03928423, + "epoch": 0.9653712966525587, + "flos": 492940712448.0, + "grad_norm": 0.06149986612463951, + "language_loss": 0.80204356, + "learning_rate": 3.1414039974972385e-06, + "loss": 0.81267327, + "num_input_tokens_seen": 415583744, + "router_z_loss_mlp": 0.23693848, + "step": 5018, + "time_per_iteration": 2.6125636100769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064154, + "balance_loss_mlp": 1.04086018, + "epoch": 0.9655636783378222, + "flos": 536560505856.0, + "grad_norm": 0.06357882552992955, + "language_loss": 0.82340455, + "learning_rate": 3.106632555409328e-06, + "loss": 0.83404613, + "num_input_tokens_seen": 415659856, + "router_z_loss_mlp": 0.23278809, + "step": 5019, + "time_per_iteration": 2.7467122077941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064987, + "balance_loss_mlp": 1.04167008, + "epoch": 0.9657560600230858, + "flos": 459023749632.0, + "grad_norm": 0.05685331710974359, + "language_loss": 0.82253885, + "learning_rate": 3.072054024435167e-06, + "loss": 0.83318865, + "num_input_tokens_seen": 415731792, + "router_z_loss_mlp": 0.2331543, + "step": 5020, + "time_per_iteration": 2.670370101928711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066255, + "balance_loss_mlp": 1.04366493, + "epoch": 0.9659484417083494, + "flos": 686178749952.0, + "grad_norm": 0.06470708717020164, + "language_loss": 0.83449835, + "learning_rate": 3.0376684179994064e-06, + "loss": 0.84516084, + "num_input_tokens_seen": 415809536, + "router_z_loss_mlp": 0.22583008, + "step": 5021, + "time_per_iteration": 2.8630871772766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003402, + "balance_loss_mlp": 0.99677426, + "epoch": 0.966140823393613, + "flos": 1502292178944.0, + "grad_norm": 0.003927469064220584, + "language_loss": 0.80694246, + "learning_rate": 3.0034757494516453e-06, + "loss": 0.81697649, + "num_input_tokens_seen": 416027600, + "router_z_loss_mlp": 0.06640625, + "step": 5022, + "time_per_iteration": 4.691370487213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064655, + "balance_loss_mlp": 1.04190993, + "epoch": 0.9663332050788765, + "flos": 464899336704.0, + "grad_norm": 0.07461109320403748, + "language_loss": 0.81199974, + "learning_rate": 2.9694760320667093e-06, + "loss": 0.82264626, + "num_input_tokens_seen": 416096128, + "router_z_loss_mlp": 0.22741699, + "step": 5023, + "time_per_iteration": 2.5815436840057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067341, + "balance_loss_mlp": 1.04451227, + "epoch": 0.96652558676414, + "flos": 500834907648.0, + "grad_norm": 0.054429682823401944, + "language_loss": 0.85471153, + "learning_rate": 2.9356692790444283e-06, + "loss": 0.86538494, + "num_input_tokens_seen": 416164256, + "router_z_loss_mlp": 0.22827148, + "step": 5024, + "time_per_iteration": 2.696014165878296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065276, + "balance_loss_mlp": 1.04140997, + "epoch": 0.9667179684494036, + "flos": 424839914496.0, + "grad_norm": 0.07944496939889083, + "language_loss": 0.82808036, + "learning_rate": 2.9020555035097484e-06, + "loss": 0.83873314, + "num_input_tokens_seen": 416227296, + "router_z_loss_mlp": 0.23864746, + "step": 5025, + "time_per_iteration": 2.438119888305664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067473, + "balance_loss_mlp": 1.04450107, + "epoch": 0.9669103501346672, + "flos": 516996628992.0, + "grad_norm": 0.062276776313522096, + "language_loss": 0.86128414, + "learning_rate": 2.8686347185127305e-06, + "loss": 0.87195885, + "num_input_tokens_seen": 416297184, + "router_z_loss_mlp": 0.22961426, + "step": 5026, + "time_per_iteration": 2.644537925720215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063932, + "balance_loss_mlp": 1.04109097, + "epoch": 0.9671027318199308, + "flos": 456241600512.0, + "grad_norm": 0.0854616119175948, + "language_loss": 0.75941432, + "learning_rate": 2.8354069370284396e-06, + "loss": 0.77005363, + "num_input_tokens_seen": 416363056, + "router_z_loss_mlp": 0.22827148, + "step": 5027, + "time_per_iteration": 2.5741615295410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064978, + "balance_loss_mlp": 1.04238749, + "epoch": 0.9672951135051943, + "flos": 525058951680.0, + "grad_norm": 0.05741509378155617, + "language_loss": 0.80092812, + "learning_rate": 2.802372171957057e-06, + "loss": 0.81157786, + "num_input_tokens_seen": 416430688, + "router_z_loss_mlp": 0.22595215, + "step": 5028, + "time_per_iteration": 2.603698492050171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060972, + "balance_loss_mlp": 1.03829837, + "epoch": 0.9674874951904578, + "flos": 573986082816.0, + "grad_norm": 0.06194459113870459, + "language_loss": 0.80152857, + "learning_rate": 2.7695304361237682e-06, + "loss": 0.81213832, + "num_input_tokens_seen": 416505248, + "router_z_loss_mlp": 0.22680664, + "step": 5029, + "time_per_iteration": 2.7249526977539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064927, + "balance_loss_mlp": 1.04158545, + "epoch": 0.9676798768757214, + "flos": 629184153600.0, + "grad_norm": 0.04555589373001173, + "language_loss": 0.80289042, + "learning_rate": 2.7368817422789848e-06, + "loss": 0.81353974, + "num_input_tokens_seen": 416592640, + "router_z_loss_mlp": 0.23352051, + "step": 5030, + "time_per_iteration": 2.90915846824646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003379, + "balance_loss_mlp": 0.99675071, + "epoch": 0.967872258560985, + "flos": 1463880605184.0, + "grad_norm": 0.003928487667441442, + "language_loss": 0.75563359, + "learning_rate": 2.7044261030979566e-06, + "loss": 0.76566738, + "num_input_tokens_seen": 416808560, + "router_z_loss_mlp": 0.06640625, + "step": 5031, + "time_per_iteration": 4.642369747161865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065621, + "balance_loss_mlp": 1.04282784, + "epoch": 0.9680646402462486, + "flos": 565503814656.0, + "grad_norm": 0.06195217532413699, + "language_loss": 0.79083759, + "learning_rate": 2.672163531181049e-06, + "loss": 0.80149376, + "num_input_tokens_seen": 416878208, + "router_z_loss_mlp": 0.22790527, + "step": 5032, + "time_per_iteration": 2.6946024894714355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003375, + "balance_loss_mlp": 0.99674749, + "epoch": 0.9682570219315121, + "flos": 1434463022592.0, + "grad_norm": 0.0039263261842775645, + "language_loss": 0.78074801, + "learning_rate": 2.6400940390537976e-06, + "loss": 0.79078174, + "num_input_tokens_seen": 417105968, + "router_z_loss_mlp": 0.06640625, + "step": 5033, + "time_per_iteration": 4.833622455596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064933, + "balance_loss_mlp": 1.04119873, + "epoch": 0.9684494036167757, + "flos": 584610670080.0, + "grad_norm": 0.07322568294021568, + "language_loss": 0.81921077, + "learning_rate": 2.608217639166688e-06, + "loss": 0.82986003, + "num_input_tokens_seen": 417175168, + "router_z_loss_mlp": 0.23693848, + "step": 5034, + "time_per_iteration": 2.7054622173309326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064729, + "balance_loss_mlp": 1.0416739, + "epoch": 0.9686417853020393, + "flos": 559064747520.0, + "grad_norm": 0.05265422766774479, + "language_loss": 0.84284925, + "learning_rate": 2.5765343438950982e-06, + "loss": 0.85349661, + "num_input_tokens_seen": 417247760, + "router_z_loss_mlp": 0.23071289, + "step": 5035, + "time_per_iteration": 2.6912314891815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064116, + "balance_loss_mlp": 1.04046464, + "epoch": 0.9688341669873028, + "flos": 784927604736.0, + "grad_norm": 0.05870206875682227, + "language_loss": 0.83489573, + "learning_rate": 2.545044165539745e-06, + "loss": 0.84553695, + "num_input_tokens_seen": 417324080, + "router_z_loss_mlp": 0.23632812, + "step": 5036, + "time_per_iteration": 2.947582721710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062286, + "balance_loss_mlp": 1.03920674, + "epoch": 0.9690265486725663, + "flos": 395899176960.0, + "grad_norm": 0.06292583451816228, + "language_loss": 0.79677629, + "learning_rate": 2.513747116326126e-06, + "loss": 0.80739915, + "num_input_tokens_seen": 417386416, + "router_z_loss_mlp": 0.23071289, + "step": 5037, + "time_per_iteration": 2.4658944606781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065139, + "balance_loss_mlp": 1.04173779, + "epoch": 0.9692189303578299, + "flos": 476373726720.0, + "grad_norm": 0.07250551131476936, + "language_loss": 0.77617192, + "learning_rate": 2.4826432084048002e-06, + "loss": 0.78682327, + "num_input_tokens_seen": 417459648, + "router_z_loss_mlp": 0.23400879, + "step": 5038, + "time_per_iteration": 2.7228803634643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106111, + "balance_loss_mlp": 1.03956842, + "epoch": 0.9694113120430935, + "flos": 597575066112.0, + "grad_norm": 0.06788850088092563, + "language_loss": 0.79366446, + "learning_rate": 2.451732453851385e-06, + "loss": 0.80427551, + "num_input_tokens_seen": 417530512, + "router_z_loss_mlp": 0.21557617, + "step": 5039, + "time_per_iteration": 2.7196829319000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061205, + "balance_loss_mlp": 1.03825676, + "epoch": 0.9696036937283571, + "flos": 500881895424.0, + "grad_norm": 0.053804028005827634, + "language_loss": 0.82859206, + "learning_rate": 2.4210148646665598e-06, + "loss": 0.83920407, + "num_input_tokens_seen": 417597600, + "router_z_loss_mlp": 0.22949219, + "step": 5040, + "time_per_iteration": 2.603468179702759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061822, + "balance_loss_mlp": 1.03809977, + "epoch": 0.9697960754136207, + "flos": 432277088256.0, + "grad_norm": 0.06156976605988288, + "language_loss": 0.87425447, + "learning_rate": 2.3904904527758952e-06, + "loss": 0.88487267, + "num_input_tokens_seen": 417659616, + "router_z_loss_mlp": 0.23718262, + "step": 5041, + "time_per_iteration": 2.4655745029449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106621, + "balance_loss_mlp": 1.04332173, + "epoch": 0.9699884570988841, + "flos": 568540353024.0, + "grad_norm": 0.054950180555539796, + "language_loss": 0.85486805, + "learning_rate": 2.3601592300300235e-06, + "loss": 0.86553025, + "num_input_tokens_seen": 417730896, + "router_z_loss_mlp": 0.22900391, + "step": 5042, + "time_per_iteration": 2.7471725940704346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069068, + "balance_loss_mlp": 1.04596519, + "epoch": 0.9701808387841477, + "flos": 516215835648.0, + "grad_norm": 0.05677739964495203, + "language_loss": 0.81870991, + "learning_rate": 2.33002120820458e-06, + "loss": 0.82940054, + "num_input_tokens_seen": 417803296, + "router_z_loss_mlp": 0.2310791, + "step": 5043, + "time_per_iteration": 2.7095541954040527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060887, + "balance_loss_mlp": 1.03853512, + "epoch": 0.9703732204694113, + "flos": 491517517824.0, + "grad_norm": 0.06567489938132548, + "language_loss": 0.76418149, + "learning_rate": 2.300076399000206e-06, + "loss": 0.77479041, + "num_input_tokens_seen": 417870208, + "router_z_loss_mlp": 0.22351074, + "step": 5044, + "time_per_iteration": 2.5679051876068115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063055, + "balance_loss_mlp": 1.03968978, + "epoch": 0.9705656021546749, + "flos": 626120451072.0, + "grad_norm": 0.06280101868126062, + "language_loss": 0.80511069, + "learning_rate": 2.2703248140424348e-06, + "loss": 0.8157413, + "num_input_tokens_seen": 417944464, + "router_z_loss_mlp": 0.23352051, + "step": 5045, + "time_per_iteration": 2.754497766494751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066316, + "balance_loss_mlp": 1.04334414, + "epoch": 0.9707579838399384, + "flos": 471437148672.0, + "grad_norm": 0.0575037191710409, + "language_loss": 0.82947087, + "learning_rate": 2.2407664648819715e-06, + "loss": 0.84013402, + "num_input_tokens_seen": 418010480, + "router_z_loss_mlp": 0.22961426, + "step": 5046, + "time_per_iteration": 2.6000595092773438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066433, + "balance_loss_mlp": 1.04273403, + "epoch": 0.970950365525202, + "flos": 492103019520.0, + "grad_norm": 0.06395406690389444, + "language_loss": 0.80750513, + "learning_rate": 2.2114013629942475e-06, + "loss": 0.81816947, + "num_input_tokens_seen": 418083952, + "router_z_loss_mlp": 0.23718262, + "step": 5047, + "time_per_iteration": 2.6433827877044678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058641, + "balance_loss_mlp": 1.03622973, + "epoch": 0.9711427472104656, + "flos": 557322923520.0, + "grad_norm": 0.06536793289339991, + "language_loss": 0.80885148, + "learning_rate": 2.1822295197799213e-06, + "loss": 0.81943792, + "num_input_tokens_seen": 418156672, + "router_z_loss_mlp": 0.22424316, + "step": 5048, + "time_per_iteration": 2.7203571796417236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106497, + "balance_loss_mlp": 1.04177177, + "epoch": 0.9713351288957291, + "flos": 625841095680.0, + "grad_norm": 0.05247386429416099, + "language_loss": 0.83924532, + "learning_rate": 2.153250946564489e-06, + "loss": 0.849895, + "num_input_tokens_seen": 418242160, + "router_z_loss_mlp": 0.23193359, + "step": 5049, + "time_per_iteration": 2.915627956390381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068661, + "balance_loss_mlp": 1.04564154, + "epoch": 0.9715275105809927, + "flos": 499073260032.0, + "grad_norm": 0.06425293622373854, + "language_loss": 0.81309581, + "learning_rate": 2.1244656545983397e-06, + "loss": 0.82378244, + "num_input_tokens_seen": 418316960, + "router_z_loss_mlp": 0.23022461, + "step": 5050, + "time_per_iteration": 2.720860242843628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065169, + "balance_loss_mlp": 1.04197037, + "epoch": 0.9717198922662562, + "flos": 477515367936.0, + "grad_norm": 0.0640839658378964, + "language_loss": 0.77984011, + "learning_rate": 2.0958736550570345e-06, + "loss": 0.79049182, + "num_input_tokens_seen": 418383888, + "router_z_loss_mlp": 0.23205566, + "step": 5051, + "time_per_iteration": 2.544619560241699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059378, + "balance_loss_mlp": 1.03681207, + "epoch": 0.9719122739515198, + "flos": 553446120960.0, + "grad_norm": 0.09980134854797204, + "language_loss": 0.78635657, + "learning_rate": 2.067474959040916e-06, + "loss": 0.79695034, + "num_input_tokens_seen": 418453776, + "router_z_loss_mlp": 0.22546387, + "step": 5052, + "time_per_iteration": 2.6725540161132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069486, + "balance_loss_mlp": 1.0463829, + "epoch": 0.9721046556367834, + "flos": 565852179456.0, + "grad_norm": 0.07128218141456762, + "language_loss": 0.80016816, + "learning_rate": 2.0392695775753312e-06, + "loss": 0.81086302, + "num_input_tokens_seen": 418521984, + "router_z_loss_mlp": 0.23083496, + "step": 5053, + "time_per_iteration": 2.664026975631714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061141, + "balance_loss_mlp": 1.03906298, + "epoch": 0.972297037322047, + "flos": 560315045376.0, + "grad_norm": 0.059142247701586874, + "language_loss": 0.78148782, + "learning_rate": 2.0112575216105766e-06, + "loss": 0.79209924, + "num_input_tokens_seen": 418598768, + "router_z_loss_mlp": 0.22094727, + "step": 5054, + "time_per_iteration": 2.7956132888793945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064724, + "balance_loss_mlp": 1.04113269, + "epoch": 0.9724894190073105, + "flos": 512440349184.0, + "grad_norm": 0.06082537810675151, + "language_loss": 0.79474902, + "learning_rate": 1.9834388020218974e-06, + "loss": 0.8053962, + "num_input_tokens_seen": 418670064, + "router_z_loss_mlp": 0.2355957, + "step": 5055, + "time_per_iteration": 2.671006202697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067024, + "balance_loss_mlp": 1.04326606, + "epoch": 0.972681800692574, + "flos": 613832961024.0, + "grad_norm": 0.07513583056263037, + "language_loss": 0.80732191, + "learning_rate": 1.9558134296094875e-06, + "loss": 0.81799209, + "num_input_tokens_seen": 418745216, + "router_z_loss_mlp": 0.23754883, + "step": 5056, + "time_per_iteration": 2.779228925704956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065639, + "balance_loss_mlp": 1.04301322, + "epoch": 0.9728741823778376, + "flos": 833911635456.0, + "grad_norm": 0.06408364503125263, + "language_loss": 0.83977121, + "learning_rate": 1.92838141509849e-06, + "loss": 0.85042763, + "num_input_tokens_seen": 418824224, + "router_z_loss_mlp": 0.22619629, + "step": 5057, + "time_per_iteration": 3.0838735103607178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066592, + "balance_loss_mlp": 1.04308391, + "epoch": 0.9730665640631012, + "flos": 571450982400.0, + "grad_norm": 0.059458566299808, + "language_loss": 0.8458215, + "learning_rate": 1.9011427691389415e-06, + "loss": 0.85648739, + "num_input_tokens_seen": 418899712, + "router_z_loss_mlp": 0.23522949, + "step": 5058, + "time_per_iteration": 2.7313530445098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064891, + "balance_loss_mlp": 1.0414784, + "epoch": 0.9732589457483648, + "flos": 506520345600.0, + "grad_norm": 0.058993279317627906, + "language_loss": 0.77250987, + "learning_rate": 1.8740975023057715e-06, + "loss": 0.78315884, + "num_input_tokens_seen": 418964912, + "router_z_loss_mlp": 0.23388672, + "step": 5059, + "time_per_iteration": 2.568094491958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060711, + "balance_loss_mlp": 1.0382638, + "epoch": 0.9734513274336283, + "flos": 926977623552.0, + "grad_norm": 0.061554025846400975, + "language_loss": 0.81109071, + "learning_rate": 1.84724562509897e-06, + "loss": 0.82169777, + "num_input_tokens_seen": 419040032, + "router_z_loss_mlp": 0.22473145, + "step": 5060, + "time_per_iteration": 3.157397747039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062998, + "balance_loss_mlp": 1.03989542, + "epoch": 0.9736437091188919, + "flos": 491930122752.0, + "grad_norm": 0.058836820412195824, + "language_loss": 0.78267944, + "learning_rate": 1.8205871479433089e-06, + "loss": 0.79330945, + "num_input_tokens_seen": 419112672, + "router_z_loss_mlp": 0.23071289, + "step": 5061, + "time_per_iteration": 2.7482900619506836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068334, + "balance_loss_mlp": 1.04470706, + "epoch": 0.9738360908041555, + "flos": 613321611264.0, + "grad_norm": 0.0694756230310086, + "language_loss": 0.83708924, + "learning_rate": 1.7941220811885096e-06, + "loss": 0.8477726, + "num_input_tokens_seen": 419183408, + "router_z_loss_mlp": 0.23596191, + "step": 5062, + "time_per_iteration": 2.7663509845733643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003347, + "balance_loss_mlp": 0.99671924, + "epoch": 0.974028472489419, + "flos": 1549561549824.0, + "grad_norm": 0.003919403605797286, + "language_loss": 0.75992095, + "learning_rate": 1.7678504351092972e-06, + "loss": 0.76995444, + "num_input_tokens_seen": 419415472, + "router_z_loss_mlp": 0.06640625, + "step": 5063, + "time_per_iteration": 4.96152138710022 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100335, + "balance_loss_mlp": 0.99672168, + "epoch": 0.9742208541746825, + "flos": 1411155965952.0, + "grad_norm": 0.0039186810532365335, + "language_loss": 0.79677713, + "learning_rate": 1.7417722199051245e-06, + "loss": 0.80681062, + "num_input_tokens_seen": 419651840, + "router_z_loss_mlp": 0.06640625, + "step": 5064, + "time_per_iteration": 5.010188817977905 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067385, + "balance_loss_mlp": 1.04452014, + "epoch": 0.9744132358599461, + "flos": 674884597248.0, + "grad_norm": 0.06180371506572385, + "language_loss": 0.7679944, + "learning_rate": 1.7158874457005592e-06, + "loss": 0.77866822, + "num_input_tokens_seen": 419729424, + "router_z_loss_mlp": 0.2286377, + "step": 5065, + "time_per_iteration": 2.936842203140259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062455, + "balance_loss_mlp": 1.03975797, + "epoch": 0.9746056175452097, + "flos": 598407616512.0, + "grad_norm": 0.06002895188447741, + "language_loss": 0.78050154, + "learning_rate": 1.690196122544896e-06, + "loss": 0.79112613, + "num_input_tokens_seen": 419803616, + "router_z_loss_mlp": 0.22717285, + "step": 5066, + "time_per_iteration": 2.780294895172119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064534, + "balance_loss_mlp": 1.04207551, + "epoch": 0.9747979992304733, + "flos": 732175428096.0, + "grad_norm": 0.0638520746537111, + "language_loss": 0.82705855, + "learning_rate": 1.6646982604123784e-06, + "loss": 0.83770382, + "num_input_tokens_seen": 419883536, + "router_z_loss_mlp": 0.22485352, + "step": 5067, + "time_per_iteration": 4.443971395492554 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068424, + "balance_loss_mlp": 1.04508257, + "epoch": 0.9749903809157369, + "flos": 616499112960.0, + "grad_norm": 0.08421424574447112, + "language_loss": 0.76827443, + "learning_rate": 1.6393938692022548e-06, + "loss": 0.77895868, + "num_input_tokens_seen": 419956816, + "router_z_loss_mlp": 0.23327637, + "step": 5068, + "time_per_iteration": 2.6909236907958984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058974, + "balance_loss_mlp": 1.03682458, + "epoch": 0.9751827626010003, + "flos": 468398039040.0, + "grad_norm": 0.05243523587913791, + "language_loss": 0.83662784, + "learning_rate": 1.6142829587384443e-06, + "loss": 0.84721756, + "num_input_tokens_seen": 420022096, + "router_z_loss_mlp": 0.22167969, + "step": 5069, + "time_per_iteration": 2.554863929748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064762, + "balance_loss_mlp": 1.04154027, + "epoch": 0.9753751442862639, + "flos": 599215574016.0, + "grad_norm": 0.08363220915033362, + "language_loss": 0.84941483, + "learning_rate": 1.5893655387698713e-06, + "loss": 0.86006248, + "num_input_tokens_seen": 420097008, + "router_z_loss_mlp": 0.23217773, + "step": 5070, + "time_per_iteration": 2.772955894470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060273, + "balance_loss_mlp": 1.03805256, + "epoch": 0.9755675259715275, + "flos": 650806285824.0, + "grad_norm": 0.0635669703618464, + "language_loss": 0.82277942, + "learning_rate": 1.5646416189704637e-06, + "loss": 0.83338213, + "num_input_tokens_seen": 420174960, + "router_z_loss_mlp": 0.22216797, + "step": 5071, + "time_per_iteration": 2.922133445739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065735, + "balance_loss_mlp": 1.0425607, + "epoch": 0.9757599076567911, + "flos": 563658103296.0, + "grad_norm": 0.06500360890677431, + "language_loss": 0.79198158, + "learning_rate": 1.5401112089387659e-06, + "loss": 0.80263901, + "num_input_tokens_seen": 420245248, + "router_z_loss_mlp": 0.23181152, + "step": 5072, + "time_per_iteration": 2.725067615509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060385, + "balance_loss_mlp": 1.03761649, + "epoch": 0.9759522893420547, + "flos": 504637558272.0, + "grad_norm": 0.061072535254532934, + "language_loss": 0.8039192, + "learning_rate": 1.5157743181983819e-06, + "loss": 0.81452304, + "num_input_tokens_seen": 420310688, + "router_z_loss_mlp": 0.22753906, + "step": 5073, + "time_per_iteration": 2.5948638916015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061976, + "balance_loss_mlp": 1.03927875, + "epoch": 0.9761446710273182, + "flos": 583728560640.0, + "grad_norm": 0.07099648020053952, + "language_loss": 0.82122862, + "learning_rate": 1.4916309561976982e-06, + "loss": 0.83184838, + "num_input_tokens_seen": 420379008, + "router_z_loss_mlp": 0.22680664, + "step": 5074, + "time_per_iteration": 2.7220845222473145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106256, + "balance_loss_mlp": 1.03939795, + "epoch": 0.9763370527125818, + "flos": 482207468544.0, + "grad_norm": 0.07455408744394738, + "language_loss": 0.81895626, + "learning_rate": 1.4676811323099947e-06, + "loss": 0.82958186, + "num_input_tokens_seen": 420445504, + "router_z_loss_mlp": 0.23168945, + "step": 5075, + "time_per_iteration": 2.591758966445923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065565, + "balance_loss_mlp": 1.04274833, + "epoch": 0.9765294343978453, + "flos": 618987225600.0, + "grad_norm": 0.05817197872146041, + "language_loss": 0.78506422, + "learning_rate": 1.4439248558335561e-06, + "loss": 0.79571986, + "num_input_tokens_seen": 420520528, + "router_z_loss_mlp": 0.22814941, + "step": 5076, + "time_per_iteration": 2.7101328372955322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060528, + "balance_loss_mlp": 1.03771114, + "epoch": 0.9767218160831089, + "flos": 526573550592.0, + "grad_norm": 0.06386112095671416, + "language_loss": 0.85028458, + "learning_rate": 1.4203621359911712e-06, + "loss": 0.86088979, + "num_input_tokens_seen": 420586224, + "router_z_loss_mlp": 0.22814941, + "step": 5077, + "time_per_iteration": 2.6009128093719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062594, + "balance_loss_mlp": 1.04019439, + "epoch": 0.9769141977683724, + "flos": 525194772480.0, + "grad_norm": 0.058942301821321326, + "language_loss": 0.83781874, + "learning_rate": 1.3969929819308557e-06, + "loss": 0.8484447, + "num_input_tokens_seen": 420655456, + "router_z_loss_mlp": 0.22399902, + "step": 5078, + "time_per_iteration": 2.737980604171753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106952, + "balance_loss_mlp": 1.0459764, + "epoch": 0.977106579453636, + "flos": 457615236096.0, + "grad_norm": 0.06395353271069072, + "language_loss": 0.80623591, + "learning_rate": 1.3738174027252416e-06, + "loss": 0.81693113, + "num_input_tokens_seen": 420733216, + "router_z_loss_mlp": 0.23522949, + "step": 5079, + "time_per_iteration": 2.783198833465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063803, + "balance_loss_mlp": 1.04111767, + "epoch": 0.9772989611388996, + "flos": 532090861056.0, + "grad_norm": 0.06963280842478477, + "language_loss": 0.81529284, + "learning_rate": 1.3508354073719642e-06, + "loss": 0.82593083, + "num_input_tokens_seen": 420803376, + "router_z_loss_mlp": 0.22680664, + "step": 5080, + "time_per_iteration": 2.615165948867798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065016, + "balance_loss_mlp": 1.04231882, + "epoch": 0.9774913428241632, + "flos": 755349235200.0, + "grad_norm": 0.05783942863824456, + "language_loss": 0.85814047, + "learning_rate": 1.3280470047933313e-06, + "loss": 0.86879063, + "num_input_tokens_seen": 420886256, + "router_z_loss_mlp": 0.22680664, + "step": 5081, + "time_per_iteration": 3.005134344100952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003318, + "balance_loss_mlp": 0.99668992, + "epoch": 0.9776837245094268, + "flos": 1554320088576.0, + "grad_norm": 0.003915824492337212, + "language_loss": 0.78895497, + "learning_rate": 1.3054522038366544e-06, + "loss": 0.7989881, + "num_input_tokens_seen": 421123728, + "router_z_loss_mlp": 0.06640625, + "step": 5082, + "time_per_iteration": 5.032810211181641 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060089, + "balance_loss_mlp": 1.03662872, + "epoch": 0.9778761061946902, + "flos": 592534600704.0, + "grad_norm": 0.06772639363056118, + "language_loss": 0.84042907, + "learning_rate": 1.2830510132739725e-06, + "loss": 0.85102993, + "num_input_tokens_seen": 421192576, + "router_z_loss_mlp": 0.234375, + "step": 5083, + "time_per_iteration": 2.683605194091797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068234, + "balance_loss_mlp": 1.04470205, + "epoch": 0.9780684878799538, + "flos": 414951704064.0, + "grad_norm": 0.11848839079428937, + "language_loss": 0.81910384, + "learning_rate": 1.2608434418022175e-06, + "loss": 0.82978618, + "num_input_tokens_seen": 421256272, + "router_z_loss_mlp": 0.23535156, + "step": 5084, + "time_per_iteration": 2.510514974594116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106606, + "balance_loss_mlp": 1.04306436, + "epoch": 0.9782608695652174, + "flos": 568411872768.0, + "grad_norm": 0.06042021971117926, + "language_loss": 0.84641409, + "learning_rate": 1.2388294980431036e-06, + "loss": 0.85707462, + "num_input_tokens_seen": 421332880, + "router_z_loss_mlp": 0.2298584, + "step": 5085, + "time_per_iteration": 2.7490227222442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062587, + "balance_loss_mlp": 1.03903139, + "epoch": 0.978453251250481, + "flos": 690472926720.0, + "grad_norm": 0.06323661108562777, + "language_loss": 0.83238792, + "learning_rate": 1.217009190543239e-06, + "loss": 0.84301388, + "num_input_tokens_seen": 421406160, + "router_z_loss_mlp": 0.2355957, + "step": 5086, + "time_per_iteration": 2.8859827518463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064613, + "balance_loss_mlp": 1.04137921, + "epoch": 0.9786456329357445, + "flos": 502505150976.0, + "grad_norm": 0.06308015457512217, + "language_loss": 0.77418578, + "learning_rate": 1.1953825277740694e-06, + "loss": 0.78483194, + "num_input_tokens_seen": 421476208, + "router_z_loss_mlp": 0.23217773, + "step": 5087, + "time_per_iteration": 2.640166997909546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064813, + "balance_loss_mlp": 1.04157913, + "epoch": 0.9788380146210081, + "flos": 863183485440.0, + "grad_norm": 0.06605726461697956, + "language_loss": 0.80655509, + "learning_rate": 1.1739495181317117e-06, + "loss": 0.81720316, + "num_input_tokens_seen": 421549232, + "router_z_loss_mlp": 0.23254395, + "step": 5088, + "time_per_iteration": 3.0655131340026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062744, + "balance_loss_mlp": 1.04057157, + "epoch": 0.9790303963062716, + "flos": 512717133312.0, + "grad_norm": 0.06638727169087975, + "language_loss": 0.84402472, + "learning_rate": 1.1527101699371767e-06, + "loss": 0.85465217, + "num_input_tokens_seen": 421617056, + "router_z_loss_mlp": 0.22192383, + "step": 5089, + "time_per_iteration": 2.63153338432312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065094, + "balance_loss_mlp": 1.04174054, + "epoch": 0.9792227779915352, + "flos": 494428147200.0, + "grad_norm": 0.06762738713186066, + "language_loss": 0.86859965, + "learning_rate": 1.1316644914364237e-06, + "loss": 0.87925059, + "num_input_tokens_seen": 421683424, + "router_z_loss_mlp": 0.23352051, + "step": 5090, + "time_per_iteration": 2.627124309539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063588, + "balance_loss_mlp": 1.04065216, + "epoch": 0.9794151596767988, + "flos": 608325562368.0, + "grad_norm": 0.06322157277550289, + "language_loss": 0.81654972, + "learning_rate": 1.1108124908000838e-06, + "loss": 0.82718563, + "num_input_tokens_seen": 421761200, + "router_z_loss_mlp": 0.22924805, + "step": 5091, + "time_per_iteration": 2.776026964187622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068334, + "balance_loss_mlp": 1.04480207, + "epoch": 0.9796075413620623, + "flos": 478222009344.0, + "grad_norm": 0.10119893374448796, + "language_loss": 0.86482507, + "learning_rate": 1.09015417612357e-06, + "loss": 0.87550843, + "num_input_tokens_seen": 421829600, + "router_z_loss_mlp": 0.23510742, + "step": 5092, + "time_per_iteration": 2.5913615226745605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063991, + "balance_loss_mlp": 1.04055452, + "epoch": 0.9797999230473259, + "flos": 592220740608.0, + "grad_norm": 0.06249893375532654, + "language_loss": 0.84391701, + "learning_rate": 1.0696895554271335e-06, + "loss": 0.85455692, + "num_input_tokens_seen": 421904928, + "router_z_loss_mlp": 0.23425293, + "step": 5093, + "time_per_iteration": 2.7546114921569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064835, + "balance_loss_mlp": 1.04194725, + "epoch": 0.9799923047325895, + "flos": 556381343232.0, + "grad_norm": 0.06263423673512068, + "language_loss": 0.81759882, + "learning_rate": 1.049418636655919e-06, + "loss": 0.82824719, + "num_input_tokens_seen": 421989616, + "router_z_loss_mlp": 0.22875977, + "step": 5094, + "time_per_iteration": 2.9061594009399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064423, + "balance_loss_mlp": 1.04176164, + "epoch": 0.9801846864178531, + "flos": 579456405504.0, + "grad_norm": 0.049311129804513056, + "language_loss": 0.8460865, + "learning_rate": 1.0293414276797974e-06, + "loss": 0.8567307, + "num_input_tokens_seen": 422067088, + "router_z_loss_mlp": 0.2265625, + "step": 5095, + "time_per_iteration": 2.7628769874572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067247, + "balance_loss_mlp": 1.0450387, + "epoch": 0.9803770681031165, + "flos": 515101358592.0, + "grad_norm": 0.05430804653354444, + "language_loss": 0.79867494, + "learning_rate": 1.0094579362933677e-06, + "loss": 0.80934739, + "num_input_tokens_seen": 422141136, + "router_z_loss_mlp": 0.22216797, + "step": 5096, + "time_per_iteration": 2.704636335372925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063536, + "balance_loss_mlp": 1.04074311, + "epoch": 0.9805694497883801, + "flos": 566988678144.0, + "grad_norm": 0.061103858425466714, + "language_loss": 0.78479362, + "learning_rate": 9.897681702160654e-07, + "loss": 0.79542893, + "num_input_tokens_seen": 422216400, + "router_z_loss_mlp": 0.22790527, + "step": 5097, + "time_per_iteration": 2.761188507080078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061653, + "balance_loss_mlp": 1.03853846, + "epoch": 0.9807618314736437, + "flos": 479351167488.0, + "grad_norm": 0.057577727000740944, + "language_loss": 0.73809493, + "learning_rate": 9.702721370922208e-07, + "loss": 0.74871147, + "num_input_tokens_seen": 422287664, + "router_z_loss_mlp": 0.23132324, + "step": 5098, + "time_per_iteration": 2.6452760696411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066484, + "balance_loss_mlp": 1.04276156, + "epoch": 0.9809542131589073, + "flos": 545285053440.0, + "grad_norm": 0.10492983204691474, + "language_loss": 0.80005634, + "learning_rate": 9.509698444908344e-07, + "loss": 0.81072116, + "num_input_tokens_seen": 422357552, + "router_z_loss_mlp": 0.23718262, + "step": 5099, + "time_per_iteration": 2.62776255607605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063774, + "balance_loss_mlp": 1.04059947, + "epoch": 0.9811465948441709, + "flos": 520843696128.0, + "grad_norm": 0.06439125090315591, + "language_loss": 0.79966128, + "learning_rate": 9.318612999057452e-07, + "loss": 0.81029904, + "num_input_tokens_seen": 422425872, + "router_z_loss_mlp": 0.23144531, + "step": 5100, + "time_per_iteration": 2.590998649597168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063439, + "balance_loss_mlp": 1.04044354, + "epoch": 0.9813389765294344, + "flos": 541282341888.0, + "grad_norm": 0.06775361472467173, + "language_loss": 0.80269879, + "learning_rate": 9.129465107554635e-07, + "loss": 0.81333315, + "num_input_tokens_seen": 422495760, + "router_z_loss_mlp": 0.23010254, + "step": 5101, + "time_per_iteration": 2.677475690841675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062592, + "balance_loss_mlp": 1.03938198, + "epoch": 0.981531358214698, + "flos": 567356866560.0, + "grad_norm": 0.055868673578204395, + "language_loss": 0.8439014, + "learning_rate": 8.942254843834485e-07, + "loss": 0.85452735, + "num_input_tokens_seen": 422568112, + "router_z_loss_mlp": 0.23217773, + "step": 5102, + "time_per_iteration": 2.8073782920837402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062341, + "balance_loss_mlp": 1.03984594, + "epoch": 0.9817237398999615, + "flos": 577272241152.0, + "grad_norm": 0.0521173779260352, + "language_loss": 0.81328356, + "learning_rate": 8.756982280578307e-07, + "loss": 0.82390702, + "num_input_tokens_seen": 422641280, + "router_z_loss_mlp": 0.22497559, + "step": 5103, + "time_per_iteration": 2.7432682514190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106656, + "balance_loss_mlp": 1.04287314, + "epoch": 0.9819161215852251, + "flos": 701507547648.0, + "grad_norm": 0.05634160003679975, + "language_loss": 0.81942677, + "learning_rate": 8.573647489714676e-07, + "loss": 0.83009243, + "num_input_tokens_seen": 422720416, + "router_z_loss_mlp": 0.23657227, + "step": 5104, + "time_per_iteration": 2.944218873977661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064768, + "balance_loss_mlp": 1.04151022, + "epoch": 0.9821085032704886, + "flos": 624188104704.0, + "grad_norm": 0.0740075344049142, + "language_loss": 0.84234631, + "learning_rate": 8.392250542421653e-07, + "loss": 0.85299402, + "num_input_tokens_seen": 422800384, + "router_z_loss_mlp": 0.23266602, + "step": 5105, + "time_per_iteration": 2.960850238800049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105814, + "balance_loss_mlp": 1.03584814, + "epoch": 0.9823008849557522, + "flos": 499505688576.0, + "grad_norm": 0.059286633961085584, + "language_loss": 0.81240541, + "learning_rate": 8.212791509122353e-07, + "loss": 0.82298684, + "num_input_tokens_seen": 422870768, + "router_z_loss_mlp": 0.22302246, + "step": 5106, + "time_per_iteration": 2.668607473373413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106239, + "balance_loss_mlp": 1.03929949, + "epoch": 0.9824932666410158, + "flos": 523815994368.0, + "grad_norm": 0.06438064273190143, + "language_loss": 0.72763407, + "learning_rate": 8.035270459489929e-07, + "loss": 0.73825794, + "num_input_tokens_seen": 422942864, + "router_z_loss_mlp": 0.23095703, + "step": 5107, + "time_per_iteration": 2.6748061180114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064285, + "balance_loss_mlp": 1.04153991, + "epoch": 0.9826856483262794, + "flos": 502663366656.0, + "grad_norm": 0.058904571353806466, + "language_loss": 0.8261292, + "learning_rate": 7.859687462443698e-07, + "loss": 0.83677202, + "num_input_tokens_seen": 423013600, + "router_z_loss_mlp": 0.22753906, + "step": 5108, + "time_per_iteration": 2.630704402923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068042, + "balance_loss_mlp": 1.04453397, + "epoch": 0.982878030011543, + "flos": 562056869376.0, + "grad_norm": 0.054352316826208384, + "language_loss": 0.84295356, + "learning_rate": 7.686042586151354e-07, + "loss": 0.853634, + "num_input_tokens_seen": 423093680, + "router_z_loss_mlp": 0.23510742, + "step": 5109, + "time_per_iteration": 2.8963773250579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064593, + "balance_loss_mlp": 1.04148996, + "epoch": 0.9830704116968064, + "flos": 537101591040.0, + "grad_norm": 0.05702599379327277, + "language_loss": 0.83052921, + "learning_rate": 7.514335898027857e-07, + "loss": 0.84117514, + "num_input_tokens_seen": 423168608, + "router_z_loss_mlp": 0.23095703, + "step": 5110, + "time_per_iteration": 2.7798235416412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063272, + "balance_loss_mlp": 1.04049134, + "epoch": 0.98326279338207, + "flos": 458949597696.0, + "grad_norm": 0.07220377225871695, + "language_loss": 0.84532428, + "learning_rate": 7.344567464735441e-07, + "loss": 0.85595697, + "num_input_tokens_seen": 423233552, + "router_z_loss_mlp": 0.22766113, + "step": 5111, + "time_per_iteration": 2.510430335998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061353, + "balance_loss_mlp": 1.03872716, + "epoch": 0.9834551750673336, + "flos": 640974974976.0, + "grad_norm": 0.06064500141416257, + "language_loss": 0.79539931, + "learning_rate": 7.17673735218416e-07, + "loss": 0.80601287, + "num_input_tokens_seen": 423307440, + "router_z_loss_mlp": 0.22607422, + "step": 5112, + "time_per_iteration": 2.809610366821289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106535, + "balance_loss_mlp": 1.04302168, + "epoch": 0.9836475567525972, + "flos": 1071807220224.0, + "grad_norm": 0.05639844002899201, + "language_loss": 0.79515636, + "learning_rate": 7.010845625530782e-07, + "loss": 0.80580986, + "num_input_tokens_seen": 423394880, + "router_z_loss_mlp": 0.22338867, + "step": 5113, + "time_per_iteration": 3.4230575561523438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064426, + "balance_loss_mlp": 1.04124022, + "epoch": 0.9838399384378607, + "flos": 565209778176.0, + "grad_norm": 0.07537003994735685, + "language_loss": 0.75971985, + "learning_rate": 6.846892349181566e-07, + "loss": 0.77036417, + "num_input_tokens_seen": 423461792, + "router_z_loss_mlp": 0.23181152, + "step": 5114, + "time_per_iteration": 2.6420531272888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066558, + "balance_loss_mlp": 1.04363441, + "epoch": 0.9840323201231242, + "flos": 772805670912.0, + "grad_norm": 0.060824497499080046, + "language_loss": 0.79589713, + "learning_rate": 6.684877586787819e-07, + "loss": 0.80656278, + "num_input_tokens_seen": 423539952, + "router_z_loss_mlp": 0.22912598, + "step": 5115, + "time_per_iteration": 2.9657704830169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063418, + "balance_loss_mlp": 1.04027963, + "epoch": 0.9842247018083878, + "flos": 472262358528.0, + "grad_norm": 0.057744957305318596, + "language_loss": 0.85976076, + "learning_rate": 6.524801401249225e-07, + "loss": 0.87039495, + "num_input_tokens_seen": 423607184, + "router_z_loss_mlp": 0.23132324, + "step": 5116, + "time_per_iteration": 2.558858633041382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062689, + "balance_loss_mlp": 1.04007459, + "epoch": 0.9844170834936514, + "flos": 525259012608.0, + "grad_norm": 0.0686306369668635, + "language_loss": 0.85068059, + "learning_rate": 6.366663854713295e-07, + "loss": 0.8613075, + "num_input_tokens_seen": 423676528, + "router_z_loss_mlp": 0.22607422, + "step": 5117, + "time_per_iteration": 2.60532546043396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004337, + "balance_loss_mlp": 0.99770856, + "epoch": 0.984609465178915, + "flos": 1567247408640.0, + "grad_norm": 0.0029769817335575824, + "language_loss": 0.77162516, + "learning_rate": 6.210465008574251e-07, + "loss": 0.78166854, + "num_input_tokens_seen": 423905856, + "router_z_loss_mlp": 0.06640625, + "step": 5118, + "time_per_iteration": 4.922377586364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066915, + "balance_loss_mlp": 1.04326415, + "epoch": 0.9848018468641785, + "flos": 519548981760.0, + "grad_norm": 0.07012559788215562, + "language_loss": 0.82266271, + "learning_rate": 6.056204923473584e-07, + "loss": 0.83333188, + "num_input_tokens_seen": 423972496, + "router_z_loss_mlp": 0.23632812, + "step": 5119, + "time_per_iteration": 2.6348206996917725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063866, + "balance_loss_mlp": 1.04067993, + "epoch": 0.9849942285494421, + "flos": 493004952576.0, + "grad_norm": 0.05674795107066025, + "language_loss": 0.83400589, + "learning_rate": 5.903883659301167e-07, + "loss": 0.84464455, + "num_input_tokens_seen": 424039968, + "router_z_loss_mlp": 0.23168945, + "step": 5120, + "time_per_iteration": 2.563413619995117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067769, + "balance_loss_mlp": 1.04472589, + "epoch": 0.9851866102347057, + "flos": 546001606656.0, + "grad_norm": 0.06041502302174469, + "language_loss": 0.80996847, + "learning_rate": 5.753501275193029e-07, + "loss": 0.82064617, + "num_input_tokens_seen": 424108096, + "router_z_loss_mlp": 0.23046875, + "step": 5121, + "time_per_iteration": 2.650181293487549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061842, + "balance_loss_mlp": 1.03858471, + "epoch": 0.9853789919199692, + "flos": 476257729536.0, + "grad_norm": 0.06754220961608856, + "language_loss": 0.80218607, + "learning_rate": 5.605057829531912e-07, + "loss": 0.81280452, + "num_input_tokens_seen": 424172256, + "router_z_loss_mlp": 0.23254395, + "step": 5122, + "time_per_iteration": 2.593015193939209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066, + "balance_loss_mlp": 1.04334998, + "epoch": 0.9855713736052328, + "flos": 1032619995648.0, + "grad_norm": 0.06282848940738642, + "language_loss": 0.76010883, + "learning_rate": 5.458553379950049e-07, + "loss": 0.77076876, + "num_input_tokens_seen": 424261088, + "router_z_loss_mlp": 0.22644043, + "step": 5123, + "time_per_iteration": 3.397169828414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067794, + "balance_loss_mlp": 1.04490566, + "epoch": 0.9857637552904963, + "flos": 495050724864.0, + "grad_norm": 0.053030998912210484, + "language_loss": 0.82679278, + "learning_rate": 5.31398798332472e-07, + "loss": 0.83747071, + "num_input_tokens_seen": 424329168, + "router_z_loss_mlp": 0.22900391, + "step": 5124, + "time_per_iteration": 2.599963665008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064031, + "balance_loss_mlp": 1.04034376, + "epoch": 0.9859561369757599, + "flos": 592267728384.0, + "grad_norm": 0.06500711995807343, + "language_loss": 0.8369258, + "learning_rate": 5.17136169578103e-07, + "loss": 0.84756613, + "num_input_tokens_seen": 424399392, + "router_z_loss_mlp": 0.23657227, + "step": 5125, + "time_per_iteration": 2.771230936050415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061253, + "balance_loss_mlp": 1.03853178, + "epoch": 0.9861485186610235, + "flos": 486971149824.0, + "grad_norm": 0.06307068299560632, + "language_loss": 0.78874338, + "learning_rate": 5.030674572691907e-07, + "loss": 0.79935598, + "num_input_tokens_seen": 424470080, + "router_z_loss_mlp": 0.22705078, + "step": 5126, + "time_per_iteration": 2.6188127994537354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062656, + "balance_loss_mlp": 1.03972054, + "epoch": 0.9863409003462871, + "flos": 518795352576.0, + "grad_norm": 0.04863083160531793, + "language_loss": 0.82588637, + "learning_rate": 4.891926668676994e-07, + "loss": 0.83651292, + "num_input_tokens_seen": 424541824, + "router_z_loss_mlp": 0.22924805, + "step": 5127, + "time_per_iteration": 2.6533150672912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004341, + "balance_loss_mlp": 0.99771261, + "epoch": 0.9865332820315506, + "flos": 1486026570240.0, + "grad_norm": 0.0029781002545871183, + "language_loss": 0.79182732, + "learning_rate": 4.755118037602646e-07, + "loss": 0.8018707, + "num_input_tokens_seen": 424773408, + "router_z_loss_mlp": 0.06640625, + "step": 5128, + "time_per_iteration": 4.873580694198608 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065539, + "balance_loss_mlp": 1.0421623, + "epoch": 0.9867256637168141, + "flos": 582112645632.0, + "grad_norm": 0.06379657534480902, + "language_loss": 0.79116714, + "learning_rate": 4.620248732582488e-07, + "loss": 0.80182254, + "num_input_tokens_seen": 424840608, + "router_z_loss_mlp": 0.23376465, + "step": 5129, + "time_per_iteration": 2.6797971725463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063082, + "balance_loss_mlp": 1.0404557, + "epoch": 0.9869180454020777, + "flos": 959303264256.0, + "grad_norm": 0.053165441114429356, + "language_loss": 0.86395758, + "learning_rate": 4.487318805977969e-07, + "loss": 0.87458837, + "num_input_tokens_seen": 424926128, + "router_z_loss_mlp": 0.22631836, + "step": 5130, + "time_per_iteration": 3.2278151512145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065979, + "balance_loss_mlp": 1.04216099, + "epoch": 0.9871104270873413, + "flos": 770730163200.0, + "grad_norm": 0.08455721104882859, + "language_loss": 0.82508123, + "learning_rate": 4.3563283093966954e-07, + "loss": 0.83574104, + "num_input_tokens_seen": 425005744, + "router_z_loss_mlp": 0.23803711, + "step": 5131, + "time_per_iteration": 3.013922691345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106287, + "balance_loss_mlp": 1.03873003, + "epoch": 0.9873028087726049, + "flos": 446444794368.0, + "grad_norm": 0.06717519377101111, + "language_loss": 0.78595114, + "learning_rate": 4.2272772936940986e-07, + "loss": 0.79657984, + "num_input_tokens_seen": 425068112, + "router_z_loss_mlp": 0.24121094, + "step": 5132, + "time_per_iteration": 2.5394835472106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106368, + "balance_loss_mlp": 1.04094672, + "epoch": 0.9874951904578684, + "flos": 507612427776.0, + "grad_norm": 0.0838918478695267, + "language_loss": 0.86701912, + "learning_rate": 4.1001658089717676e-07, + "loss": 0.87765592, + "num_input_tokens_seen": 425137408, + "router_z_loss_mlp": 0.22741699, + "step": 5133, + "time_per_iteration": 2.5988612174987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067946, + "balance_loss_mlp": 1.04530835, + "epoch": 0.987687572143132, + "flos": 716742743040.0, + "grad_norm": 0.05777251050052068, + "language_loss": 0.8243646, + "learning_rate": 3.9749939045791164e-07, + "loss": 0.83504403, + "num_input_tokens_seen": 425213504, + "router_z_loss_mlp": 0.2265625, + "step": 5134, + "time_per_iteration": 2.9052164554595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004341, + "balance_loss_mlp": 0.99771327, + "epoch": 0.9878799538283956, + "flos": 1538647695360.0, + "grad_norm": 0.0029781392554615676, + "language_loss": 0.79817951, + "learning_rate": 3.851761629111716e-07, + "loss": 0.80822289, + "num_input_tokens_seen": 425451296, + "router_z_loss_mlp": 0.06640625, + "step": 5135, + "time_per_iteration": 4.864543676376343 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058561, + "balance_loss_mlp": 1.03648376, + "epoch": 0.9880723355136591, + "flos": 721424931840.0, + "grad_norm": 0.05497798109654085, + "language_loss": 0.81718028, + "learning_rate": 3.730469030412964e-07, + "loss": 0.82776594, + "num_input_tokens_seen": 425527536, + "router_z_loss_mlp": 0.22070312, + "step": 5136, + "time_per_iteration": 2.9499542713165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106326, + "balance_loss_mlp": 1.0411582, + "epoch": 0.9882647171989226, + "flos": 557350087680.0, + "grad_norm": 0.05211029783189689, + "language_loss": 0.84319884, + "learning_rate": 3.611116155572969e-07, + "loss": 0.85383141, + "num_input_tokens_seen": 425596608, + "router_z_loss_mlp": 0.22106934, + "step": 5137, + "time_per_iteration": 2.7629799842834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064001, + "balance_loss_mlp": 1.04021907, + "epoch": 0.9884570988841862, + "flos": 562820410368.0, + "grad_norm": 0.05978192539046426, + "language_loss": 0.80376399, + "learning_rate": 3.493703050927999e-07, + "loss": 0.81440401, + "num_input_tokens_seen": 425667280, + "router_z_loss_mlp": 0.2376709, + "step": 5138, + "time_per_iteration": 2.698518753051758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061053, + "balance_loss_mlp": 1.03812945, + "epoch": 0.9886494805694498, + "flos": 431763167232.0, + "grad_norm": 0.06509914319715783, + "language_loss": 0.8613711, + "learning_rate": 3.378229762062146e-07, + "loss": 0.87198162, + "num_input_tokens_seen": 425730736, + "router_z_loss_mlp": 0.22912598, + "step": 5139, + "time_per_iteration": 2.496880531311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064714, + "balance_loss_mlp": 1.04178977, + "epoch": 0.9888418622547134, + "flos": 592082348544.0, + "grad_norm": 0.05021855236929169, + "language_loss": 0.90418476, + "learning_rate": 3.264696333806771e-07, + "loss": 0.91483188, + "num_input_tokens_seen": 425807616, + "router_z_loss_mlp": 0.22912598, + "step": 5140, + "time_per_iteration": 2.810107469558716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064111, + "balance_loss_mlp": 1.04127049, + "epoch": 0.989034243939977, + "flos": 1134993461760.0, + "grad_norm": 0.054738913900339296, + "language_loss": 0.80376035, + "learning_rate": 3.1531028102388394e-07, + "loss": 0.81440145, + "num_input_tokens_seen": 425900880, + "router_z_loss_mlp": 0.22827148, + "step": 5141, + "time_per_iteration": 3.5171725749969482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061083, + "balance_loss_mlp": 1.03775382, + "epoch": 0.9892266256252404, + "flos": 566670048768.0, + "grad_norm": 0.05685524099004798, + "language_loss": 0.81969726, + "learning_rate": 3.0434492346825824e-07, + "loss": 0.83030808, + "num_input_tokens_seen": 425973632, + "router_z_loss_mlp": 0.2331543, + "step": 5142, + "time_per_iteration": 2.671959638595581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064239, + "balance_loss_mlp": 1.04062331, + "epoch": 0.989419007310504, + "flos": 640577051136.0, + "grad_norm": 0.0861584296867121, + "language_loss": 0.83990133, + "learning_rate": 2.9357356497095033e-07, + "loss": 0.85054374, + "num_input_tokens_seen": 426057088, + "router_z_loss_mlp": 0.23620605, + "step": 5143, + "time_per_iteration": 2.88116192817688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065538, + "balance_loss_mlp": 1.04206514, + "epoch": 0.9896113889957676, + "flos": 455478059520.0, + "grad_norm": 0.056260924480767305, + "language_loss": 0.81494832, + "learning_rate": 2.829962097138372e-07, + "loss": 0.82560366, + "num_input_tokens_seen": 426124336, + "router_z_loss_mlp": 0.23461914, + "step": 5144, + "time_per_iteration": 2.6181671619415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010632, + "balance_loss_mlp": 1.04035997, + "epoch": 0.9898037706810312, + "flos": 567339614208.0, + "grad_norm": 0.06359073729727067, + "language_loss": 0.80580842, + "learning_rate": 2.726128618033008e-07, + "loss": 0.81644046, + "num_input_tokens_seen": 426191888, + "router_z_loss_mlp": 0.22839355, + "step": 5145, + "time_per_iteration": 2.6478822231292725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004339, + "balance_loss_mlp": 0.99771106, + "epoch": 0.9899961523662947, + "flos": 1550268191232.0, + "grad_norm": 0.0029775104009917776, + "language_loss": 0.78146422, + "learning_rate": 2.624235252706164e-07, + "loss": 0.7915076, + "num_input_tokens_seen": 426425840, + "router_z_loss_mlp": 0.06640625, + "step": 5146, + "time_per_iteration": 4.94190239906311 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063906, + "balance_loss_mlp": 1.04033875, + "epoch": 0.9901885340515583, + "flos": 610709787648.0, + "grad_norm": 0.06714350569578352, + "language_loss": 0.85334808, + "learning_rate": 2.524282040715642e-07, + "loss": 0.86398715, + "num_input_tokens_seen": 426506080, + "router_z_loss_mlp": 0.23571777, + "step": 5147, + "time_per_iteration": 2.8776228427886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062575, + "balance_loss_mlp": 1.03987753, + "epoch": 0.9903809157368219, + "flos": 517483385856.0, + "grad_norm": 0.05973429515052892, + "language_loss": 0.83089972, + "learning_rate": 2.426269020866512e-07, + "loss": 0.8415255, + "num_input_tokens_seen": 426573936, + "router_z_loss_mlp": 0.22692871, + "step": 5148, + "time_per_iteration": 2.5988693237304688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065813, + "balance_loss_mlp": 1.04279327, + "epoch": 0.9905732974220854, + "flos": 1100426757120.0, + "grad_norm": 0.05728172246327606, + "language_loss": 0.80693364, + "learning_rate": 2.3301962312122226e-07, + "loss": 0.81759173, + "num_input_tokens_seen": 426657472, + "router_z_loss_mlp": 0.22998047, + "step": 5149, + "time_per_iteration": 3.4677181243896484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065933, + "balance_loss_mlp": 1.0427587, + "epoch": 0.990765679107349, + "flos": 858002056704.0, + "grad_norm": 0.07032896745482858, + "language_loss": 0.84725714, + "learning_rate": 2.2360637090496073e-07, + "loss": 0.85791647, + "num_input_tokens_seen": 426740560, + "router_z_loss_mlp": 0.23181152, + "step": 5150, + "time_per_iteration": 3.101963520050049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065032, + "balance_loss_mlp": 1.04247713, + "epoch": 0.9909580607926125, + "flos": 491287721472.0, + "grad_norm": 0.05971113737071383, + "language_loss": 0.80380929, + "learning_rate": 2.143871490925542e-07, + "loss": 0.81445956, + "num_input_tokens_seen": 426809296, + "router_z_loss_mlp": 0.22558594, + "step": 5151, + "time_per_iteration": 2.5920066833496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064633, + "balance_loss_mlp": 1.041852, + "epoch": 0.9911504424778761, + "flos": 585060350976.0, + "grad_norm": 0.06220607613020044, + "language_loss": 0.79609364, + "learning_rate": 2.0536196126319519e-07, + "loss": 0.80673993, + "num_input_tokens_seen": 426881056, + "router_z_loss_mlp": 0.2277832, + "step": 5152, + "time_per_iteration": 2.7163023948669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061577, + "balance_loss_mlp": 1.03867722, + "epoch": 0.9913428241631397, + "flos": 570030359040.0, + "grad_norm": 0.05987133143138268, + "language_loss": 0.81748044, + "learning_rate": 1.9653081092074753e-07, + "loss": 0.82809621, + "num_input_tokens_seen": 426949664, + "router_z_loss_mlp": 0.22900391, + "step": 5153, + "time_per_iteration": 2.6854746341705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066119, + "balance_loss_mlp": 1.04346955, + "epoch": 0.9915352058484033, + "flos": 489745958400.0, + "grad_norm": 0.061045684154491624, + "language_loss": 0.86315995, + "learning_rate": 1.8789370149374652e-07, + "loss": 0.87382114, + "num_input_tokens_seen": 427018816, + "router_z_loss_mlp": 0.22644043, + "step": 5154, + "time_per_iteration": 2.676959991455078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063029, + "balance_loss_mlp": 1.03990245, + "epoch": 0.9917275875336667, + "flos": 744047741952.0, + "grad_norm": 0.05686586567139158, + "language_loss": 0.82752365, + "learning_rate": 1.7945063633545423e-07, + "loss": 0.83815396, + "num_input_tokens_seen": 427097984, + "router_z_loss_mlp": 0.23132324, + "step": 5155, + "time_per_iteration": 2.9827871322631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065793, + "balance_loss_mlp": 1.04195142, + "epoch": 0.9919199692189303, + "flos": 508272081408.0, + "grad_norm": 0.05312703789662428, + "language_loss": 0.80161297, + "learning_rate": 1.7120161872380412e-07, + "loss": 0.81227094, + "num_input_tokens_seen": 427169280, + "router_z_loss_mlp": 0.23840332, + "step": 5156, + "time_per_iteration": 2.6331825256347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062015, + "balance_loss_mlp": 1.03901994, + "epoch": 0.9921123509041939, + "flos": 543963174912.0, + "grad_norm": 0.05853430222577831, + "language_loss": 0.84317166, + "learning_rate": 1.6314665186123457e-07, + "loss": 0.85379183, + "num_input_tokens_seen": 427237312, + "router_z_loss_mlp": 0.2298584, + "step": 5157, + "time_per_iteration": 2.6789844036102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064673, + "balance_loss_mlp": 1.04108155, + "epoch": 0.9923047325894575, + "flos": 671561362944.0, + "grad_norm": 0.05962975100622896, + "language_loss": 0.77451545, + "learning_rate": 1.5528573887507724e-07, + "loss": 0.78516221, + "num_input_tokens_seen": 427305008, + "router_z_loss_mlp": 0.23571777, + "step": 5158, + "time_per_iteration": 2.7843782901763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064845, + "balance_loss_mlp": 1.04091978, + "epoch": 0.9924971142747211, + "flos": 466557096960.0, + "grad_norm": 0.06079228956039329, + "language_loss": 0.80781394, + "learning_rate": 1.4761888281711322e-07, + "loss": 0.81846237, + "num_input_tokens_seen": 427377008, + "router_z_loss_mlp": 0.23913574, + "step": 5159, + "time_per_iteration": 2.699115753173828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010682, + "balance_loss_mlp": 1.04509735, + "epoch": 0.9926894959599846, + "flos": 491581757952.0, + "grad_norm": 0.058577196677344574, + "language_loss": 0.82705361, + "learning_rate": 1.4014608666390594e-07, + "loss": 0.83773553, + "num_input_tokens_seen": 427444528, + "router_z_loss_mlp": 0.2310791, + "step": 5160, + "time_per_iteration": 2.573960304260254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070258, + "balance_loss_mlp": 1.04675007, + "epoch": 0.9928818776452482, + "flos": 492389715456.0, + "grad_norm": 0.10155127334981055, + "language_loss": 0.81777894, + "learning_rate": 1.328673533166902e-07, + "loss": 0.82848155, + "num_input_tokens_seen": 427509808, + "router_z_loss_mlp": 0.23510742, + "step": 5161, + "time_per_iteration": 2.6028358936309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061966, + "balance_loss_mlp": 1.03905427, + "epoch": 0.9930742593305117, + "flos": 546357312000.0, + "grad_norm": 0.05598226563819152, + "language_loss": 0.84204501, + "learning_rate": 1.2578268560131666e-07, + "loss": 0.85266471, + "num_input_tokens_seen": 427587936, + "router_z_loss_mlp": 0.22900391, + "step": 5162, + "time_per_iteration": 2.7396605014801025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062986, + "balance_loss_mlp": 1.03965724, + "epoch": 0.9932666410157753, + "flos": 585510031872.0, + "grad_norm": 0.06298013807205607, + "language_loss": 0.86198676, + "learning_rate": 1.1889208626825188e-07, + "loss": 0.87261665, + "num_input_tokens_seen": 427662224, + "router_z_loss_mlp": 0.2331543, + "step": 5163, + "time_per_iteration": 2.754847288131714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064074, + "balance_loss_mlp": 1.04148424, + "epoch": 0.9934590227010388, + "flos": 537086909952.0, + "grad_norm": 0.1690844779753495, + "language_loss": 0.84142578, + "learning_rate": 1.1219555799268921e-07, + "loss": 0.85206652, + "num_input_tokens_seen": 427730544, + "router_z_loss_mlp": 0.22583008, + "step": 5164, + "time_per_iteration": 2.668947219848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060593, + "balance_loss_mlp": 1.03828871, + "epoch": 0.9936514043863024, + "flos": 518014559232.0, + "grad_norm": 0.06013738975249062, + "language_loss": 0.87179309, + "learning_rate": 1.0569310337443794e-07, + "loss": 0.88239902, + "num_input_tokens_seen": 427799760, + "router_z_loss_mlp": 0.22314453, + "step": 5165, + "time_per_iteration": 2.603496789932251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062895, + "balance_loss_mlp": 1.03958964, + "epoch": 0.993843786071566, + "flos": 744625903104.0, + "grad_norm": 0.059796641809083874, + "language_loss": 0.8027035, + "learning_rate": 9.938472493803419e-08, + "loss": 0.8133325, + "num_input_tokens_seen": 427881936, + "router_z_loss_mlp": 0.23303223, + "step": 5166, + "time_per_iteration": 3.0740067958831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064775, + "balance_loss_mlp": 1.04208994, + "epoch": 0.9940361677568296, + "flos": 525918666240.0, + "grad_norm": 0.08114562381518384, + "language_loss": 0.82492352, + "learning_rate": 9.327042513251893e-08, + "loss": 0.83557123, + "num_input_tokens_seen": 427951648, + "router_z_loss_mlp": 0.22680664, + "step": 5167, + "time_per_iteration": 2.67103910446167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061467, + "balance_loss_mlp": 1.03862691, + "epoch": 0.9942285494420932, + "flos": 555650108928.0, + "grad_norm": 0.06282919009589391, + "language_loss": 0.80018061, + "learning_rate": 8.735020633177104e-08, + "loss": 0.81079531, + "num_input_tokens_seen": 428031184, + "router_z_loss_mlp": 0.22851562, + "step": 5168, + "time_per_iteration": 2.746816873550415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063198, + "balance_loss_mlp": 1.04025054, + "epoch": 0.9944209311273566, + "flos": 585996788736.0, + "grad_norm": 0.0553184423711719, + "language_loss": 0.81770915, + "learning_rate": 8.162407083411872e-08, + "loss": 0.82834113, + "num_input_tokens_seen": 428107296, + "router_z_loss_mlp": 0.22937012, + "step": 5169, + "time_per_iteration": 2.698438882827759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065941, + "balance_loss_mlp": 1.04285061, + "epoch": 0.9946133128126202, + "flos": 735518486016.0, + "grad_norm": 0.05668738134113141, + "language_loss": 0.82296896, + "learning_rate": 7.609202086272804e-08, + "loss": 0.8336283, + "num_input_tokens_seen": 428187904, + "router_z_loss_mlp": 0.23071289, + "step": 5170, + "time_per_iteration": 3.0203161239624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010653, + "balance_loss_mlp": 1.04139829, + "epoch": 0.9948056944978838, + "flos": 646018011648.0, + "grad_norm": 0.0586239910556347, + "language_loss": 0.82445252, + "learning_rate": 7.075405856526995e-08, + "loss": 0.83510554, + "num_input_tokens_seen": 428255856, + "router_z_loss_mlp": 0.23876953, + "step": 5171, + "time_per_iteration": 2.733257293701172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062523, + "balance_loss_mlp": 1.03965902, + "epoch": 0.9949980761831474, + "flos": 445846809600.0, + "grad_norm": 0.073524028673894, + "language_loss": 0.86205852, + "learning_rate": 6.561018601414226e-08, + "loss": 0.87268376, + "num_input_tokens_seen": 428321872, + "router_z_loss_mlp": 0.2286377, + "step": 5172, + "time_per_iteration": 2.5351874828338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068489, + "balance_loss_mlp": 1.04420602, + "epoch": 0.995190457868411, + "flos": 435637398528.0, + "grad_norm": 0.05471779387138947, + "language_loss": 0.85440135, + "learning_rate": 6.066040520641414e-08, + "loss": 0.8650862, + "num_input_tokens_seen": 428389232, + "router_z_loss_mlp": 0.24279785, + "step": 5173, + "time_per_iteration": 2.545313596725464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063421, + "balance_loss_mlp": 1.04110503, + "epoch": 0.9953828395536745, + "flos": 514187315712.0, + "grad_norm": 0.06435296438429121, + "language_loss": 0.82080287, + "learning_rate": 5.590471806377062e-08, + "loss": 0.83143711, + "num_input_tokens_seen": 428456128, + "router_z_loss_mlp": 0.22302246, + "step": 5174, + "time_per_iteration": 2.5876967906951904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066975, + "balance_loss_mlp": 1.0443728, + "epoch": 0.995575221238938, + "flos": 479847836160.0, + "grad_norm": 0.0684169578482112, + "language_loss": 0.82180631, + "learning_rate": 5.134312643245709e-08, + "loss": 0.83247602, + "num_input_tokens_seen": 428523504, + "router_z_loss_mlp": 0.22595215, + "step": 5175, + "time_per_iteration": 2.56477689743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061988, + "balance_loss_mlp": 1.03824103, + "epoch": 0.9957676029242016, + "flos": 587785600512.0, + "grad_norm": 0.0622556493926411, + "language_loss": 0.76661915, + "learning_rate": 4.6975632083445793e-08, + "loss": 0.77723902, + "num_input_tokens_seen": 428596880, + "router_z_loss_mlp": 0.23730469, + "step": 5176, + "time_per_iteration": 2.727720260620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106837, + "balance_loss_mlp": 1.04471922, + "epoch": 0.9959599846094652, + "flos": 426465741312.0, + "grad_norm": 0.06993506205261471, + "language_loss": 0.80097485, + "learning_rate": 4.280223671243588e-08, + "loss": 0.81165856, + "num_input_tokens_seen": 428659472, + "router_z_loss_mlp": 0.2364502, + "step": 5177, + "time_per_iteration": 2.4686057567596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064385, + "balance_loss_mlp": 1.04079378, + "epoch": 0.9961523662947287, + "flos": 611619061248.0, + "grad_norm": 0.058518209299634485, + "language_loss": 0.80758059, + "learning_rate": 3.8822941939575804e-08, + "loss": 0.81822443, + "num_input_tokens_seen": 428736704, + "router_z_loss_mlp": 0.23608398, + "step": 5178, + "time_per_iteration": 2.807483434677124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064873, + "balance_loss_mlp": 1.04153216, + "epoch": 0.9963447479799923, + "flos": 550785111552.0, + "grad_norm": 0.07135517223335183, + "language_loss": 0.74197721, + "learning_rate": 3.5037749309851927e-08, + "loss": 0.75262594, + "num_input_tokens_seen": 428808560, + "router_z_loss_mlp": 0.2331543, + "step": 5179, + "time_per_iteration": 2.6507856845855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066091, + "balance_loss_mlp": 1.04279709, + "epoch": 0.9965371296652559, + "flos": 625873402368.0, + "grad_norm": 0.06151129189851597, + "language_loss": 0.8901037, + "learning_rate": 3.1446660292755446e-08, + "loss": 0.90076458, + "num_input_tokens_seen": 428880688, + "router_z_loss_mlp": 0.23291016, + "step": 5180, + "time_per_iteration": 2.7257490158081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067144, + "balance_loss_mlp": 1.04370713, + "epoch": 0.9967295113505195, + "flos": 639522044928.0, + "grad_norm": 0.05801908419016484, + "language_loss": 0.82134813, + "learning_rate": 2.8049676282504433e-08, + "loss": 0.83201957, + "num_input_tokens_seen": 428960096, + "router_z_loss_mlp": 0.23425293, + "step": 5181, + "time_per_iteration": 2.9021897315979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061511, + "balance_loss_mlp": 1.03774095, + "epoch": 0.996921893035783, + "flos": 607389124608.0, + "grad_norm": 0.06523374059991581, + "language_loss": 0.76779681, + "learning_rate": 2.484679859793282e-08, + "loss": 0.77841198, + "num_input_tokens_seen": 429031296, + "router_z_loss_mlp": 0.23754883, + "step": 5182, + "time_per_iteration": 2.7176480293273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065298, + "balance_loss_mlp": 1.04209948, + "epoch": 0.9971142747210465, + "flos": 644162388480.0, + "grad_norm": 0.06922055405359702, + "language_loss": 0.81937838, + "learning_rate": 2.183802848243488e-08, + "loss": 0.83003139, + "num_input_tokens_seen": 429103312, + "router_z_loss_mlp": 0.23168945, + "step": 5183, + "time_per_iteration": 2.7730648517608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061879, + "balance_loss_mlp": 1.03976595, + "epoch": 0.9973066564063101, + "flos": 1040773722624.0, + "grad_norm": 0.05364949272503117, + "language_loss": 0.81309438, + "learning_rate": 1.9023367104187285e-08, + "loss": 0.82371318, + "num_input_tokens_seen": 429194896, + "router_z_loss_mlp": 0.22094727, + "step": 5184, + "time_per_iteration": 3.3700714111328125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068176, + "balance_loss_mlp": 1.04495406, + "epoch": 0.9974990380915737, + "flos": 665095131648.0, + "grad_norm": 0.0564934876502354, + "language_loss": 0.83195555, + "learning_rate": 1.640281555587153e-08, + "loss": 0.84263736, + "num_input_tokens_seen": 429267664, + "router_z_loss_mlp": 0.23205566, + "step": 5185, + "time_per_iteration": 2.8419084548950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106209, + "balance_loss_mlp": 1.03943992, + "epoch": 0.9976914197768373, + "flos": 718121521152.0, + "grad_norm": 0.061013510739593706, + "language_loss": 0.77592587, + "learning_rate": 1.3976374855007024e-08, + "loss": 0.78654671, + "num_input_tokens_seen": 429343472, + "router_z_loss_mlp": 0.22644043, + "step": 5186, + "time_per_iteration": 2.900660991668701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067293, + "balance_loss_mlp": 1.04322433, + "epoch": 0.9978838014621008, + "flos": 518328419328.0, + "grad_norm": 0.07107533241149569, + "language_loss": 0.79306746, + "learning_rate": 1.1744045943451464e-08, + "loss": 0.80374032, + "num_input_tokens_seen": 429411472, + "router_z_loss_mlp": 0.24060059, + "step": 5187, + "time_per_iteration": 2.6121342182159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060009, + "balance_loss_mlp": 1.03775311, + "epoch": 0.9980761831473643, + "flos": 603430829568.0, + "grad_norm": 0.05826890310585048, + "language_loss": 0.84421951, + "learning_rate": 9.70582968801148e-09, + "loss": 0.8548196, + "num_input_tokens_seen": 429486704, + "router_z_loss_mlp": 0.22241211, + "step": 5188, + "time_per_iteration": 2.809272050857544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064719, + "balance_loss_mlp": 1.04069877, + "epoch": 0.9982685648326279, + "flos": 453523691520.0, + "grad_norm": 0.056344907455401605, + "language_loss": 0.89689124, + "learning_rate": 7.861726879943021e-09, + "loss": 0.90753841, + "num_input_tokens_seen": 429554736, + "router_z_loss_mlp": 0.24023438, + "step": 5189, + "time_per_iteration": 2.566086769104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061619, + "balance_loss_mlp": 1.03783727, + "epoch": 0.9984609465178915, + "flos": 481424103936.0, + "grad_norm": 0.06655173686317807, + "language_loss": 0.78688771, + "learning_rate": 6.211738235173403e-09, + "loss": 0.79750389, + "num_input_tokens_seen": 429623216, + "router_z_loss_mlp": 0.2376709, + "step": 5190, + "time_per_iteration": 2.6371493339538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067596, + "balance_loss_mlp": 1.04430294, + "epoch": 0.9986533282031551, + "flos": 476941976064.0, + "grad_norm": 0.06114669924659269, + "language_loss": 0.84300482, + "learning_rate": 4.755864394301312e-09, + "loss": 0.85368079, + "num_input_tokens_seen": 429695808, + "router_z_loss_mlp": 0.23266602, + "step": 5191, + "time_per_iteration": 2.6488306522369385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067181, + "balance_loss_mlp": 1.04337525, + "epoch": 0.9988457098884186, + "flos": 641948488704.0, + "grad_norm": 0.05899170995522276, + "language_loss": 0.86477023, + "learning_rate": 3.494105922541291e-09, + "loss": 0.87544203, + "num_input_tokens_seen": 429774464, + "router_z_loss_mlp": 0.23791504, + "step": 5192, + "time_per_iteration": 2.828174114227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064616, + "balance_loss_mlp": 1.04143012, + "epoch": 0.9990380915736822, + "flos": 396321693696.0, + "grad_norm": 0.07115874499342345, + "language_loss": 0.8789829, + "learning_rate": 2.4264633097237365e-09, + "loss": 0.88962901, + "num_input_tokens_seen": 429835872, + "router_z_loss_mlp": 0.23181152, + "step": 5193, + "time_per_iteration": 2.4448673725128174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106445, + "balance_loss_mlp": 1.04135919, + "epoch": 0.9992304732589458, + "flos": 576123259392.0, + "grad_norm": 0.06564969851429614, + "language_loss": 0.84961963, + "learning_rate": 1.552936970405927e-09, + "loss": 0.86026412, + "num_input_tokens_seen": 429911440, + "router_z_loss_mlp": 0.23083496, + "step": 5194, + "time_per_iteration": 2.733811855316162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069185, + "balance_loss_mlp": 1.046404, + "epoch": 0.9994228549442093, + "flos": 544291716096.0, + "grad_norm": 0.07182096607307784, + "language_loss": 0.75418997, + "learning_rate": 8.735272437054853e-10, + "loss": 0.76488185, + "num_input_tokens_seen": 429982512, + "router_z_loss_mlp": 0.2277832, + "step": 5195, + "time_per_iteration": 2.661844253540039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106477, + "balance_loss_mlp": 1.04167914, + "epoch": 0.9996152366294728, + "flos": 1471314502656.0, + "grad_norm": 0.06834659341862818, + "language_loss": 0.81183863, + "learning_rate": 3.882343933003796e-10, + "loss": 0.82248634, + "num_input_tokens_seen": 430070944, + "router_z_loss_mlp": 0.23071289, + "step": 5196, + "time_per_iteration": 3.754668951034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048308, + "balance_loss_mlp": 1.02844799, + "epoch": 0.9998076183147364, + "flos": 618950149632.0, + "grad_norm": 0.1142694827050757, + "language_loss": 0.69779372, + "learning_rate": 9.70586077619906e-11, + "loss": 0.70827675, + "num_input_tokens_seen": 430164864, + "router_z_loss_mlp": 0.19836426, + "step": 5197, + "time_per_iteration": 4.012174844741821 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026206, + "balance_loss_mlp": 1.01353955, + "epoch": 1.0, + "flos": 1290737617920.0, + "grad_norm": 0.028700024398426234, + "language_loss": 0.84229785, + "learning_rate": 0.0, + "loss": 0.85255992, + "num_input_tokens_seen": 430340944, + "router_z_loss_mlp": 0.12670898, + "step": 5198, + "time_per_iteration": 5.750073432922363 + }, + { + "epoch": 1.0, + "num_input_tokens_seen": 430340944, + "step": 5198, + "total_flos": 1.171926856433664e+16, + "train_loss": 0.8696983777453873, + "train_runtime": 15459.0512, + "train_samples_per_second": 43.036, + "train_steps_per_second": 0.336 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 430340944, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.171926856433664e+16, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_smoe_sigmoidgating/training_args.bin b/sft_pretrain/Full_smoe_sigmoidgating/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3b28f0633932ff84d8e0fde7beb2f9c59f0d04be --- /dev/null +++ b/sft_pretrain/Full_smoe_sigmoidgating/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b54b92ce31f27a60f5f91da41c22febbdc5fe6a9ac82c4d361c2b9dbc9096639 +size 7992